/* * llc_sap.c - driver routines for SAP component. * * Copyright (c) 1997 by Procom Technology, Inc. * 2001-2003 by Arnaldo Carvalho de Melo * * This program can be redistributed or modified under the terms of the * GNU General Public License as published by the Free Software Foundation. * This program is distributed without any warranty or implied warranty * of merchantability or fitness for a particular purpose. * * See the GNU General Public License for more details. */ #include #include #include #include #include #include #include #include #include #include #include #include static int llc_mac_header_len(unsigned short devtype) { switch (devtype) { case ARPHRD_ETHER: case ARPHRD_LOOPBACK: return sizeof(struct ethhdr); } return 0; } /** * llc_alloc_frame - allocates sk_buff for frame * @dev: network device this skb will be sent over * @type: pdu type to allocate * @data_size: data size to allocate * * Allocates an sk_buff for frame and initializes sk_buff fields. * Returns allocated skb or %NULL when out of memory. */ struct sk_buff *llc_alloc_frame(struct sock *sk, struct net_device *dev, u8 type, u32 data_size) { int hlen = type == LLC_PDU_TYPE_U ? 3 : 4; struct sk_buff *skb; hlen += llc_mac_header_len(dev->type); skb = alloc_skb(hlen + data_size, GFP_ATOMIC); if (skb) { skb_reset_mac_header(skb); skb_reserve(skb, hlen); skb_reset_network_header(skb); skb_reset_transport_header(skb); skb->protocol = htons(ETH_P_802_2); skb->dev = dev; if (sk != NULL) skb_set_owner_w(skb, sk); } return skb; } void llc_save_primitive(struct sock *sk, struct sk_buff *skb, u8 prim) { struct sockaddr_llc *addr; /* save primitive for use by the user. */ addr = llc_ui_skb_cb(skb); memset(addr, 0, sizeof(*addr)); addr->sllc_family = sk->sk_family; addr->sllc_arphrd = skb->dev->type; addr->sllc_test = prim == LLC_TEST_PRIM; addr->sllc_xid = prim == LLC_XID_PRIM; addr->sllc_ua = prim == LLC_DATAUNIT_PRIM; llc_pdu_decode_sa(skb, addr->sllc_mac); llc_pdu_decode_ssap(skb, &addr->sllc_sap); } /** * llc_sap_rtn_pdu - Informs upper layer on rx of an UI, XID or TEST pdu. * @sap: pointer to SAP * @skb: received pdu */ void llc_sap_rtn_pdu(struct llc_sap *sap, struct sk_buff *skb) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); struct llc_pdu_un *pdu = llc_pdu_un_hdr(skb); switch (LLC_U_PDU_RSP(pdu)) { case LLC_1_PDU_CMD_TEST: ev->prim = LLC_TEST_PRIM; break; case LLC_1_PDU_CMD_XID: ev->prim = LLC_XID_PRIM; break; case LLC_1_PDU_CMD_UI: ev->prim = LLC_DATAUNIT_PRIM; break; } ev->ind_cfm_flag = LLC_IND; } /** * llc_find_sap_trans - finds transition for event * @sap: pointer to SAP * @skb: happened event * * This function finds transition that matches with happened event. * Returns the pointer to found transition on success or %NULL for * failure. */ static struct llc_sap_state_trans *llc_find_sap_trans(struct llc_sap *sap, struct sk_buff *skb) { int i = 0; struct llc_sap_state_trans *rc = NULL; struct llc_sap_state_trans **next_trans; struct llc_sap_state *curr_state = &llc_sap_state_table[sap->state - 1]; /* * Search thru events for this state until list exhausted or until * its obvious the event is not valid for the current state */ for (next_trans = curr_state->transitions; next_trans[i]->ev; i++) if (!next_trans[i]->ev(sap, skb)) { rc = next_trans[i]; /* got event match; return it */ break; } return rc; } /** * llc_exec_sap_trans_actions - execute actions related to event * @sap: pointer to SAP * @trans: pointer to transition that it's actions must be performed * @skb: happened event. * * This function executes actions that is related to happened event. * Returns 0 for success and 1 for failure of at least one action. */ static int llc_exec_sap_trans_actions(struct llc_sap *sap, struct llc_sap_state_trans *trans, struct sk_buff *skb) { int rc = 0; const llc_sap_action_t *next_action = trans->ev_actions; for (; next_action && *next_action; next_action++) if ((*next_action)(sap, skb)) rc = 1; return rc; } /** * llc_sap_next_state - finds transition, execs actions & change SAP state * @sap: pointer to SAP * @skb: happened event * * This function finds transition that matches with happened event, then * executes related actions and finally changes state of SAP. It returns * 0 on success and 1 for failure. */ static int llc_sap_next_state(struct llc_sap *sap, struct sk_buff *skb) { int rc = 1; struct llc_sap_state_trans *trans; if (sap->state > LLC_NR_SAP_STATES) goto out; trans = llc_find_sap_trans(sap, skb); if (!trans) goto out; /* * Got the state to which we next transition; perform the actions * associated with this transition before actually transitioning to the * next state */ rc = llc_exec_sap_trans_actions(sap, trans, skb); if (rc) goto out; /* * Transition SAP to next state if all actions execute successfully */ sap->state = trans->next_state; out: return rc; } /** * llc_sap_state_process - sends event to SAP state machine * @sap: sap to use * @skb: pointer to occurred event * * After executing actions of the event, upper layer will be indicated * if needed(on receiving an UI frame). sk can be null for the * datalink_proto case. */ static void llc_sap_state_process(struct llc_sap *sap, struct sk_buff *skb) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); /* * We have to hold the skb, because llc_sap_next_state * will kfree it in the sending path and we need to * look at the skb->cb, where we encode llc_sap_state_ev. */ skb_get(skb); ev->ind_cfm_flag = 0; llc_sap_next_state(sap, skb); if (ev->ind_cfm_flag == LLC_IND) { if (skb->sk->sk_state == TCP_LISTEN) kfree_skb(skb); else { llc_save_primitive(skb->sk, skb, ev->prim); /* queue skb to the user. */ if (sock_queue_rcv_skb(skb->sk, skb)) kfree_skb(skb); } } kfree_skb(skb); } /** * llc_build_and_send_test_pkt - TEST interface for upper layers. * @sap: sap to use * @skb: packet to send * @dmac: destination mac address * @dsap: destination sap * * This function is called when upper layer wants to send a TEST pdu. * Returns 0 for success, 1 otherwise. */ void llc_build_and_send_test_pkt(struct llc_sap *sap, struct sk_buff *skb, u8 *dmac, u8 dsap) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); ev->saddr.lsap = sap->laddr.lsap; ev->daddr.lsap = dsap; memcpy(ev->saddr.mac, skb->dev->dev_addr, IFHWADDRLEN); memcpy(ev->daddr.mac, dmac, IFHWADDRLEN); ev->type = LLC_SAP_EV_TYPE_PRIM; ev->prim = LLC_TEST_PRIM; ev->prim_type = LLC_PRIM_TYPE_REQ; llc_sap_state_process(sap, skb); } /** * llc_build_and_send_xid_pkt - XID interface for upper layers * @sap: sap to use * @skb: packet to send * @dmac: destination mac address * @dsap: destination sap * * This function is called when upper layer wants to send a XID pdu. * Returns 0 for success, 1 otherwise. */ void llc_build_and_send_xid_pkt(struct llc_sap *sap, struct sk_buff *skb, u8 *dmac, u8 dsap) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); ev->saddr.lsap = sap->laddr.lsap; ev->daddr.lsap = dsap; memcpy(ev->saddr.mac, skb->dev->dev_addr, IFHWADDRLEN); memcpy(ev->daddr.mac, dmac, IFHWADDRLEN); ev->type = LLC_SAP_EV_TYPE_PRIM; ev->prim = LLC_XID_PRIM; ev->prim_type = LLC_PRIM_TYPE_REQ; llc_sap_state_process(sap, skb); } /** * llc_sap_rcv - sends received pdus to the sap state machine * @sap: current sap component structure. * @skb: received frame. * * Sends received pdus to the sap state machine. */ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb, struct sock *sk) { struct llc_sap_state_ev *ev = llc_sap_ev(skb); ev->type = LLC_SAP_EV_TYPE_PDU; ev->reason = 0; skb->sk = sk; llc_sap_state_process(sap, skb); } static inline bool llc_dgram_match(const struct llc_sap *sap, const struct llc_addr *laddr, const struct sock *sk) { struct llc_sock *llc = llc_sk(sk); return sk->sk_type == SOCK_DGRAM && llc->laddr.lsap == laddr->lsap && ether_addr_equal(llc->laddr.mac, laddr->mac); } /** * llc_lookup_dgram - Finds dgram socket for the local sap/mac * @sap: SAP * @laddr: address of local LLC (MAC + SAP) * * Search socket list of the SAP and finds connection using the local * mac, and local sap. Returns pointer for socket found, %NULL otherwise. */ static struct sock *llc_lookup_dgram(struct llc_sap *sap, const struct llc_addr *laddr) { struct sock *rc; struct hlist_nulls_node *node; int slot = llc_sk_laddr_hashfn(sap, laddr); struct hlist_nulls_head *laddr_hb = &sap->sk_laddr_hash[slot]; rcu_read_lock_bh(); again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { if (llc_dgram_match(sap, laddr, rc)) { /* Extra checks required by SLAB_DESTROY_BY_RCU */ if (unlikely(!atomic_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || !llc_dgram_match(sap, laddr, rc))) { sock_put(rc); continue; } goto found; } } rc = NULL; /* * if the nulls value we got at the end of this lookup is * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ if (unlikely(get_nulls_value(node) != slot)) goto again; found: rcu_read_unlock_bh(); return rc; } static inline bool llc_mcast_match(const struct llc_sap *sap, const struct llc_addr *laddr, const struct sk_buff *skb, const struct sock *sk) { struct llc_sock *llc = llc_sk(sk); return sk->sk_type == SOCK_DGRAM && llc->laddr.lsap == laddr->lsap && llc->dev == skb->dev; } static void llc_do_mcast(struct llc_sap *sap, struct sk_buff *skb, struct sock **stack, int count) { struct sk_buff *skb1; int i; for (i = 0; i < count; i++) { skb1 = skb_clone(skb, GFP_ATOMIC); if (!skb1) { sock_put(stack[i]); continue; } llc_sap_rcv(sap, skb1, stack[i]); sock_put(stack[i]); } } /** * llc_sap_mcast - Deliver multicast PDU's to all matching datagram sockets. * @sap: SAP * @laddr: address of local LLC (MAC + SAP) * * Search socket list of the SAP and finds connections with same sap. * Deliver clone to each. */ static void llc_sap_mcast(struct llc_sap *sap, const struct llc_addr *laddr, struct sk_buff *skb) { int i = 0, count = 256 / sizeof(struct sock *); struct sock *sk, *stack[count]; struct llc_sock *llc; struct hlist_head *dev_hb = llc_sk_dev_hash(sap, skb->dev->ifindex); spin_lock_bh(&sap->sk_lock); hlist_for_each_entry(llc, dev_hb, dev_hash_node) { sk = &llc->sk; if (!llc_mcast_match(sap, laddr, skb, sk)) continue; sock_hold(sk); if (i < count) stack[i++] = sk; else { llc_do_mcast(sap, skb, stack, i); i = 0; } } spin_unlock_bh(&sap->sk_lock); llc_do_mcast(sap, skb, stack, i); } void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb) { struct llc_addr laddr; llc_pdu_decode_da(skb, laddr.mac); llc_pdu_decode_dsap(skb, &laddr.lsap); if (is_multicast_ether_addr(laddr.mac)) { llc_sap_mcast(sap, &laddr, skb); kfree_skb(skb); } else { struct sock *sk = llc_lookup_dgram(sap, &laddr); if (sk) { llc_sap_rcv(sap, skb, sk); sock_put(sk); } else kfree_skb(skb); } } utes to be deleted using just the prefix and length. For example: $ ip ro ls vrf red unreachable default metric 8192 1.1.1.0/24 nexthop via 10.100.1.254 dev eth1 weight 1 nexthop via 10.11.200.2 dev eth11.200 weight 1 10.11.200.0/24 dev eth11.200 proto kernel scope link src 10.11.200.3 10.100.1.0/24 dev eth1 proto kernel scope link src 10.100.1.3 $ ip ro del 1.1.1.0/24 vrf red $ ip ro ls vrf red unreachable default metric 8192 10.11.200.0/24 dev eth11.200 proto kernel scope link src 10.11.200.3 10.100.1.0/24 dev eth1 proto kernel scope link src 10.100.1.3 The same notation does not work with IPv6 because of how multipath routes are implemented for IPv6. For IPv6 only the first nexthop of a multipath route is deleted if the request contains only a prefix and length. This leads to unnecessary complexity in userspace dealing with IPv6 multipath routes. This patch allows all nexthops to be deleted without specifying each one in the delete request. Internally, this is done by walking the sibling list of the route matching the specifications given (prefix, length, metric, protocol, etc). $ ip -6 ro ls vrf red 2001:db8:1::/120 dev eth1 proto kernel metric 256 pref medium 2001:db8:2::/120 dev eth2 proto kernel metric 256 pref medium 2001:db8:200::/120 via 2001:db8:1::2 dev eth1 metric 1024 pref medium 2001:db8:200::/120 via 2001:db8:2::2 dev eth2 metric 1024 pref medium ... $ ip -6 ro del vrf red 2001:db8:200::/120 $ ip -6 ro ls vrf red 2001:db8:1::/120 dev eth1 proto kernel metric 256 pref medium 2001:db8:2::/120 dev eth2 proto kernel metric 256 pref medium ... Because IPv6 allows individual nexthops to be deleted without deleting the entire route, the ip6_route_multipath_del and non-multipath code path (ip6_route_del) have to be discriminated so that all nexthops are only deleted for the latter case. This is done by making the existing fc_type in fib6_config a u16 and then adding a new u16 field with fc_delete_all_nh as the first bit. Suggested-by: Dinesh Dutt <ddutt@cumulusnetworks.com> Signed-off-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03net: skb_needs_check() accepts CHECKSUM_NONE for txEric Dumazet1-3/+4 My recent change missed fact that UFO would perform a complete UDP checksum before segmenting in frags. In this case skb->ip_summed is set to CHECKSUM_NONE. We need to add this valid case to skb_needs_check() Fixes: b2504a5dbef3 ("net: reduce skb_warn_bad_offload() noise") Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Willem de Bruijn <willemb@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03net: remove support for per driver ndo_busy_poll()Eric Dumazet2-16/+0 We added generic support for busy polling in NAPI layer in linux-4.5 No network driver uses ndo_busy_poll() anymore, we can get rid of the pointer in struct net_device_ops, and its use in sk_busy_loop() Saves NETIF_F_BUSY_POLL features bit. Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-nextDavid S. Miller53-640/+576 Pablo Neira Ayuso says: ==================== Netfilter updates for net-next The following patchset contains Netfilter updates for your net-next tree, they are: 1) Stash ctinfo 3-bit field into pointer to nf_conntrack object from sk_buff so we only access one single cacheline in the conntrack hotpath. Patchset from Florian Westphal. 2) Don't leak pointer to internal structures when exporting x_tables ruleset back to userspace, from Willem DeBruijn. This includes new helper functions to copy data to userspace such as xt_data_to_user() as well as conversions of our ip_tables, ip6_tables and arp_tables clients to use it. Not surprinsingly, ebtables requires an ad-hoc update. There is also a new field in x_tables extensions to indicate the amount of bytes that we copy to userspace. 3) Add nf_log_all_netns sysctl: This new knob allows you to enable logging via nf_log infrastructure for all existing netnamespaces. Given the effort to provide pernet syslog has been discontinued, let's provide a way to restore logging using netfilter kernel logging facilities in trusted environments. Patch from Michal Kubecek. 4) Validate SCTP checksum from conntrack helper, from Davide Caratti. 5) Merge UDPlite conntrack and NAT helpers into UDP, this was mostly a copy&paste from the original helper, from Florian Westphal. 6) Reset netfilter state when duplicating packets, also from Florian. 7) Remove unnecessary check for broadcast in IPv6 in pkttype match and nft_meta, from Liping Zhang. 8) Add missing code to deal with loopback packets from nft_meta when used by the netdev family, also from Liping. 9) Several cleanups on nf_tables, one to remove unnecessary check from the netlink control plane path to add table, set and stateful objects and code consolidation when unregister chain hooks, from Gao Feng. 10) Fix harmless reference counter underflow in IPVS that, however, results in problems with the introduction of the new refcount_t type, from David Windsor. 11) Enable LIBCRC32C from nf_ct_sctp instead of nf_nat_sctp, from Davide Caratti. 12) Missing documentation on nf_tables uapi header, from Liping Zhang. 13) Use rb_entry() helper in xt_connlimit, from Geliang Tang. ==================== Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03sched: cls_flower: expose priority to offloading netdeviceJiri Pirko1-0/+3 The driver that offloads flower rules needs to know with which priority user inserted the rules. So add this information into offload struct. Signed-off-by: Jiri Pirko <jiri@mellanox.com> Acked-by: Ido Schimmel <idosch@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03tcp: clear pfmemalloc on outgoing skbEric Dumazet1-0/+7 Josef Bacik diagnosed following problem : I was seeing random disconnects while testing NBD over loopback. This turned out to be because NBD sets pfmemalloc on it's socket, however the receiving side is a user space application so does not have pfmemalloc set on its socket. This means that sk_filter_trim_cap will simply drop this packet, under the assumption that the other side will simply retransmit. Well we do retransmit, and then the packet is just dropped again for the same reason. It seems the better way to address this problem is to clear pfmemalloc in the TCP transmit path. pfmemalloc strict control really makes sense on the receive path. Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Josef Bacik <jbacik@fb.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03net: ipv6: Set protocol to kernel for local routesDavid Ahern1-0/+1 IPv6 stack does not set the protocol for local routes, so those routes show up with proto "none": $ ip -6 ro ls table local local ::1 dev lo proto none metric 0 pref medium local 2100:3:: dev lo proto none metric 0 pref medium local 2100:3::4 dev lo proto none metric 0 pref medium local fe80:: dev lo proto none metric 0 pref medium ... Set rt6i_protocol to RTPROT_KERNEL for consistency with IPv4. Now routes show up with proto "kernel": $ ip -6 ro ls table local local ::1 dev lo proto kernel metric 0 pref medium local 2100:3:: dev lo proto kernel metric 0 pref medium local 2100:3::4 dev lo proto kernel metric 0 pref medium local fe80:: dev lo proto kernel metric 0 pref medium ... Signed-off-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03bridge: vlan dst_metadata hooks in ingress and egress pathsRoopa Prabhu6-2/+82 - ingress hook: - if port is a tunnel port, use tunnel info in attached dst_metadata to map it to a local vlan - egress hook: - if port is a tunnel port, use tunnel info attached to vlan to set dst_metadata on the skb CC: Nikolay Aleksandrov <nikolay@cumulusnetworks.com> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03bridge: per vlan dst_metadata netlink supportRoopa Prabhu7-48/+641 This patch adds support to attach per vlan tunnel info dst metadata. This enables bridge driver to map vlan to tunnel_info at ingress and egress. It uses the kernel dst_metadata infrastructure. The initial use case is vlan to vni bridging, but the api is generic to extend to any tunnel_info in the future: - Uapi to configure/unconfigure/dump per vlan tunnel data - netlink functions to configure vlan and tunnel_info mapping - Introduces bridge port flag BR_LWT_VLAN to enable attach/detach dst_metadata to bridged packets on ports. off by default. - changes to existing code is mainly refactor some existing vlan handling netlink code + hooks for new vlan tunnel code - I have kept the vlan tunnel code isolated in separate files. - most of the netlink vlan tunnel code is handling of vlan-tunid ranges (follows the vlan range handling code). To conserve space vlan-tunid by default are always dumped in ranges if applicable. Use case: example use for this is a vxlan bridging gateway or vtep which maps vlans to vn-segments (or vnis). iproute2 example (patched and pruned iproute2 output to just show relevant fdb entries): example shows same host mac learnt on two vni's and vlan 100 maps to vni 1000, vlan 101 maps to vni 1001 before (netdev per vni): $bridge fdb show | grep "00:02:00:00:00:03" 00:02:00:00:00:03 dev vxlan1001 vlan 101 master bridge 00:02:00:00:00:03 dev vxlan1001 dst 12.0.0.8 self 00:02:00:00:00:03 dev vxlan1000 vlan 100 master bridge 00:02:00:00:00:03 dev vxlan1000 dst 12.0.0.8 self after this patch with collect metdata in bridged mode (single netdev): $bridge fdb show | grep "00:02:00:00:00:03" 00:02:00:00:00:03 dev vxlan0 vlan 101 master bridge 00:02:00:00:00:03 dev vxlan0 src_vni 1001 dst 12.0.0.8 self 00:02:00:00:00:03 dev vxlan0 vlan 100 master bridge 00:02:00:00:00:03 dev vxlan0 src_vni 1000 dst 12.0.0.8 self CC: Nikolay Aleksandrov <nikolay@cumulusnetworks.com> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03net/sched: act_ife: Change to use ife moduleYotam Gigi2-78/+33 Use the encode/decode functionality from the ife module instead of using implementation inside the act_ife. Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: Yotam Gigi <yotamg@mellanox.com> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> Signed-off-by: Roman Mashak <mrv@mojatatu.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03net: Introduce ife encapsulation moduleYotam Gigi5-0/+165 This module is responsible for the ife encapsulation protocol encode/decode logics. That module can: - ife_encode: encode skb and reserve space for the ife meta header - ife_decode: decode skb and extract the meta header size - ife_tlv_meta_encode - encodes one tlv entry into the reserved ife header space. - ife_tlv_meta_decode - decodes one tlv entry from the packet - ife_tlv_meta_next - advance to the next tlv Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: Yotam Gigi <yotamg@mellanox.com> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> Signed-off-by: Roman Mashak <mrv@mojatatu.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03net/sched: act_ife: Unexport ife_tlv_meta_encodeYotam Gigi1-2/+2 As the function ife_tlv_meta_encode is not used by any other module, unexport it and make it static for the act_ife module. Signed-off-by: Yotam Gigi <yotamg@mellanox.com> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> Signed-off-by: Roman Mashak <mrv@mojatatu.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03tcp: add tcp_mss_clamp() helperEric Dumazet