/* * net/tipc/name_distr.c: TIPC name distribution code * * Copyright (c) 2000-2006, 2014, Ericsson AB * Copyright (c) 2005, 2010-2011, Wind River Systems * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions are met: * * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * 3. Neither the names of the copyright holders nor the names of its * contributors may be used to endorse or promote products derived from * this software without specific prior written permission. * * Alternatively, this software may be distributed under the terms of the * GNU General Public License ("GPL") version 2 as published by the Free * Software Foundation. * * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE * POSSIBILITY OF SUCH DAMAGE. */ #include "core.h" #include "link.h" #include "name_distr.h" int sysctl_tipc_named_timeout __read_mostly = 2000; struct distr_queue_item { struct distr_item i; u32 dtype; u32 node; unsigned long expires; struct list_head next; }; /** * publ_to_item - add publication info to a publication message */ static void publ_to_item(struct distr_item *i, struct publication *p) { i->type = htonl(p->type); i->lower = htonl(p->lower); i->upper = htonl(p->upper); i->ref = htonl(p->ref); i->key = htonl(p->key); } /** * named_prepare_buf - allocate & initialize a publication message * * The buffer returned is of size INT_H_SIZE + payload size */ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size, u32 dest) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct sk_buff *buf = tipc_buf_acquire(INT_H_SIZE + size, GFP_ATOMIC); struct tipc_msg *msg; if (buf != NULL) { msg = buf_msg(buf); tipc_msg_init(tn->own_addr, msg, NAME_DISTRIBUTOR, type, INT_H_SIZE, dest); msg_set_size(msg, INT_H_SIZE + size); } return buf; } /** * tipc_named_publish - tell other nodes about a new publication by this node */ struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct sk_buff *buf; struct distr_item *item; list_add_tail_rcu(&publ->local_list, &tn->nametbl->publ_list[publ->scope]); if (publ->scope == TIPC_NODE_SCOPE) return NULL; buf = named_prepare_buf(net, PUBLICATION, ITEM_SIZE, 0); if (!buf) { pr_warn("Publication distribution failure\n"); return NULL; } item = (struct distr_item *)msg_data(buf_msg(buf)); publ_to_item(item, publ); return buf; } /** * tipc_named_withdraw - tell other nodes about a withdrawn publication by this node */ struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ) { struct sk_buff *buf; struct distr_item *item; list_del(&publ->local_list); if (publ->scope == TIPC_NODE_SCOPE) return NULL; buf = named_prepare_buf(net, WITHDRAWAL, ITEM_SIZE, 0); if (!buf) { pr_warn("Withdrawal distribution failure\n"); return NULL; } item = (struct distr_item *)msg_data(buf_msg(buf)); publ_to_item(item, publ); return buf; } /** * named_distribute - prepare name info for bulk distribution to another node * @list: list of messages (buffers) to be returned from this function * @dnode: node to be updated * @pls: linked list of publication items to be packed into buffer chain */ static void named_distribute(struct net *net, struct sk_buff_head *list, u32 dnode, struct list_head *pls) { struct publication *publ; struct sk_buff *skb = NULL; struct distr_item *item = NULL; u32 msg_dsz = ((tipc_node_get_mtu(net, dnode, 0) - INT_H_SIZE) / ITEM_SIZE) * ITEM_SIZE; u32 msg_rem = msg_dsz; list_for_each_entry(publ, pls, local_list) { /* Prepare next buffer: */ if (!skb) { skb = named_prepare_buf(net, PUBLICATION, msg_rem, dnode); if (!skb) { pr_warn("Bulk publication failure\n"); return; } msg_set_bc_ack_invalid(buf_msg(skb), true); item = (struct distr_item *)msg_data(buf_msg(skb)); } /* Pack publication into message: */ publ_to_item(item, publ); item++; msg_rem -= ITEM_SIZE; /* Append full buffer to list: */ if (!msg_rem) { __skb_queue_tail(list, skb); skb = NULL; msg_rem = msg_dsz; } } if (skb) { msg_set_size(buf_msg(skb), INT_H_SIZE + (msg_dsz - msg_rem)); skb_trim(skb, INT_H_SIZE + (msg_dsz - msg_rem)); __skb_queue_tail(list, skb); } } /** * tipc_named_node_up - tell specified node about all publications by this node */ void tipc_named_node_up(struct net *net, u32 dnode) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct sk_buff_head head; __skb_queue_head_init(&head); rcu_read_lock(); named_distribute(net, &head, dnode, &tn->nametbl->publ_list[TIPC_CLUSTER_SCOPE]); named_distribute(net, &head, dnode, &tn->nametbl->publ_list[TIPC_ZONE_SCOPE]); rcu_read_unlock(); tipc_node_xmit(net, &head, dnode, 0); } /** * tipc_publ_purge - remove publication associated with a failed node * * Invoked for each publication issued by a newly failed node. * Removes publication structure from name table & deletes it. */ static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct publication *p; spin_lock_bh(&tn->nametbl_lock); p = tipc_nametbl_remove_publ(net, publ->type, publ->lower, publ->node, publ->ref, publ->key); if (p) tipc_node_unsubscribe(net, &p->nodesub_list, addr); spin_unlock_bh(&tn->nametbl_lock); if (p != publ) { pr_err("Unable to remove publication from failed node\n" " (type=%u, lower=%u, node=0x%x, ref=%u, key=%u)\n", publ->type, publ->lower, publ->node, publ->ref, publ->key); } kfree_rcu(p, rcu); } /** * tipc_dist_queue_purge - remove deferred updates from a node that went down */ static void tipc_dist_queue_purge(struct net *net, u32 addr) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct distr_queue_item *e, *tmp; spin_lock_bh(&tn->nametbl_lock); list_for_each_entry_safe(e, tmp, &tn->dist_queue, next) { if (e->node != addr) continue; list_del(&e->next); kfree(e); } spin_unlock_bh(&tn->nametbl_lock); } void tipc_publ_notify(struct net *net, struct list_head *nsub_list, u32 addr) { struct publication *publ, *tmp; list_for_each_entry_safe(publ, tmp, nsub_list, nodesub_list) tipc_publ_purge(net, publ, addr); tipc_dist_queue_purge(net, addr); } /** * tipc_update_nametbl - try to process a nametable update and notify * subscribers * * tipc_nametbl_lock must be held. * Returns the publication item if successful, otherwise NULL. */ static bool tipc_update_nametbl(struct net *net, struct distr_item *i, u32 node, u32 dtype) { struct publication *publ = NULL; if (dtype == PUBLICATION) { publ = tipc_nametbl_insert_publ(net, ntohl(i->type), ntohl(i->lower), ntohl(i->upper), TIPC_CLUSTER_SCOPE, node, ntohl(i->ref), ntohl(i->key)); if (publ) { tipc_node_subscribe(net, &publ->nodesub_list, node); return true; } } else if (dtype == WITHDRAWAL) { publ = tipc_nametbl_remove_publ(net, ntohl(i->type), ntohl(i->lower), node, ntohl(i->ref), ntohl(i->key)); if (publ) { tipc_node_unsubscribe(net, &publ->nodesub_list, node); kfree_rcu(publ, rcu); return true; } } else { pr_warn("Unrecognized name table message received\n"); } return false; } /** * tipc_named_add_backlog - add a failed name table update to the backlog * */ static void tipc_named_add_backlog(struct net *net, struct distr_item *i, u32 type, u32 node) { struct distr_queue_item *e; struct tipc_net *tn = net_generic(net, tipc_net_id); unsigned long now = get_jiffies_64(); e = kzalloc(sizeof(*e), GFP_ATOMIC); if (!e) return; e->dtype = type; e->node = node; e->expires = now + msecs_to_jiffies(sysctl_tipc_named_timeout); memcpy(e, i, sizeof(*i)); list_add_tail(&e->next, &tn->dist_queue); } /** * tipc_named_process_backlog - try to process any pending name table updates * from the network. */ void tipc_named_process_backlog(struct net *net) { struct distr_queue_item *e, *tmp; struct tipc_net *tn = net_generic(net, tipc_net_id); char addr[16]; unsigned long now = get_jiffies_64(); list_for_each_entry_safe(e, tmp, &tn->dist_queue, next) { if (time_after(e->expires, now)) { if (!tipc_update_nametbl(net, &e->i, e->node, e->dtype)) continue; } else { tipc_addr_string_fill(addr, e->node); pr_warn_ratelimited("Dropping name table update (%d) of {%u, %u, %u} from %s key=%u\n", e->dtype, ntohl(e->i.type), ntohl(e->i.lower), ntohl(e->i.upper), addr, ntohl(e->i.key)); } list_del(&e->next); kfree(e); } } /** * tipc_named_rcv - process name table update messages sent by another node */ void tipc_named_rcv(struct net *net, struct sk_buff_head *inputq) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct tipc_msg *msg; struct distr_item *item; uint count; u32 node; struct sk_buff *skb; int mtype; spin_lock_bh(&tn->nametbl_lock); for (skb = skb_dequeue(inputq); skb; skb = skb_dequeue(inputq)) { skb_linearize(skb); msg = buf_msg(skb); mtype = msg_type(msg); item = (struct distr_item *)msg_data(msg); count = msg_data_sz(msg) / ITEM_SIZE; node = msg_orignode(msg); while (count--) { if (!tipc_update_nametbl(net, item, node, mtype)) tipc_named_add_backlog(net, item, mtype, node); item++; } kfree_skb(skb); tipc_named_process_backlog(net); } spin_unlock_bh(&tn->nametbl_lock); } /** * tipc_named_reinit - re-initialize local publications * * This routine is called whenever TIPC networking is enabled. * All name table entries published by this node are updated to reflect * the node's new network address. */ void tipc_named_reinit(struct net *net) { struct tipc_net *tn = net_generic(net, tipc_net_id); struct publication *publ; int scope; spin_lock_bh(&tn->nametbl_lock); for (scope = TIPC_ZONE_SCOPE; scope <= TIPC_NODE_SCOPE; scope++) list_for_each_entry_rcu(publ, &tn->nametbl->publ_list[scope], local_list) publ->node = tn->own_addr; spin_unlock_bh(&tn->nametbl_lock); } 2::/120 dev eth2 proto kernel metric 256 pref medium ... Because IPv6 allows individual nexthops to be deleted without deleting the entire route, the ip6_route_multipath_del and non-multipath code path (ip6_route_del) have to be discriminated so that all nexthops are only deleted for the latter case. This is done by making the existing fc_type in fib6_config a u16 and then adding a new u16 field with fc_delete_all_nh as the first bit. Suggested-by: Dinesh Dutt <ddutt@cumulusnetworks.com> Signed-off-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03net: skb_needs_check() accepts CHECKSUM_NONE for txEric Dumazet1-3/+4 My recent change missed fact that UFO would perform a complete UDP checksum before segmenting in frags. In this case skb->ip_summed is set to CHECKSUM_NONE. We need to add this valid case to skb_needs_check() Fixes: b2504a5dbef3 ("net: reduce skb_warn_bad_offload() noise") Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Willem de Bruijn <willemb@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03net: remove support for per driver ndo_busy_poll()Eric Dumazet2-16/+0 We added generic support for busy polling in NAPI layer in linux-4.5 No network driver uses ndo_busy_poll() anymore, we can get rid of the pointer in struct net_device_ops, and its use in sk_busy_loop() Saves NETIF_F_BUSY_POLL features bit. Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-nextDavid S. Miller53-640/+576 Pablo Neira Ayuso says: ==================== Netfilter updates for net-next The following patchset contains Netfilter updates for your net-next tree, they are: 1) Stash ctinfo 3-bit field into pointer to nf_conntrack object from sk_buff so we only access one single cacheline in the conntrack hotpath. Patchset from Florian Westphal. 2) Don't leak pointer to internal structures when exporting x_tables ruleset back to userspace, from Willem DeBruijn. This includes new helper functions to copy data to userspace such as xt_data_to_user() as well as conversions of our ip_tables, ip6_tables and arp_tables clients to use it. Not surprinsingly, ebtables requires an ad-hoc update. There is also a new field in x_tables extensions to indicate the amount of bytes that we copy to userspace. 3) Add nf_log_all_netns sysctl: This new knob allows you to enable logging via nf_log infrastructure for all existing netnamespaces. Given the effort to provide pernet syslog has been discontinued, let's provide a way to restore logging using netfilter kernel logging facilities in trusted environments. Patch from Michal Kubecek. 4) Validate SCTP checksum from conntrack helper, from Davide Caratti. 5) Merge UDPlite conntrack and NAT helpers into UDP, this was mostly a copy&paste from the original helper, from Florian Westphal. 6) Reset netfilter state when duplicating packets, also from Florian. 7) Remove unnecessary check for broadcast in IPv6 in pkttype match and nft_meta, from Liping Zhang. 8) Add missing code to deal with loopback packets from nft_meta when used by the netdev family, also from Liping. 9) Several cleanups on nf_tables, one to remove unnecessary check from the netlink control plane path to add table, set and stateful objects and code consolidation when unregister chain hooks, from Gao Feng. 10) Fix harmless reference counter underflow in IPVS that, however, results in problems with the introduction of the new refcount_t type, from David Windsor. 11) Enable LIBCRC32C from nf_ct_sctp instead of nf_nat_sctp, from Davide Caratti. 12) Missing documentation on nf_tables uapi header, from Liping Zhang. 13) Use rb_entry() helper in xt_connlimit, from Geliang Tang. ==================== Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03sched: cls_flower: expose priority to offloading netdeviceJiri Pirko1-0/+3 The driver that offloads flower rules needs to know with which priority user inserted the rules. So add this information into offload struct. Signed-off-by: Jiri Pirko <jiri@mellanox.com> Acked-by: Ido Schimmel <idosch@mellanox.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03tcp: clear pfmemalloc on outgoing skbEric Dumazet1-0/+7 Josef Bacik diagnosed following problem : I was seeing random disconnects while testing NBD over loopback. This turned out to be because NBD sets pfmemalloc on it's socket, however the receiving side is a user space application so does not have pfmemalloc set on its socket. This means that sk_filter_trim_cap will simply drop this packet, under the assumption that the other side will simply retransmit. Well we do retransmit, and then the packet is just dropped again for the same reason. It seems the better way to address this problem is to clear pfmemalloc in the TCP transmit path. pfmemalloc strict control really makes sense on the receive path. Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Josef Bacik <jbacik@fb.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03net: ipv6: Set protocol to kernel for local routesDavid Ahern1-0/+1 IPv6 stack does not set the protocol for local routes, so those routes show up with proto "none": $ ip -6 ro ls table local local ::1 dev lo proto none metric 0 pref medium local 2100:3:: dev lo proto none metric 0 pref medium local 2100:3::4 dev lo proto none metric 0 pref medium local fe80:: dev lo proto none metric 0 pref medium ... Set rt6i_protocol to RTPROT_KERNEL for consistency with IPv4. Now routes show up with proto "kernel": $ ip -6 ro ls table local local ::1 dev lo proto kernel metric 0 pref medium local 2100:3:: dev lo proto kernel metric 0 pref medium local 2100:3::4 dev lo proto kernel metric 0 pref medium local fe80:: dev lo proto kernel metric 0 pref medium ... Signed-off-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03bridge: vlan dst_metadata hooks in ingress and egress pathsRoopa Prabhu6-2/+82 - ingress hook: - if port is a tunnel port, use tunnel info in attached dst_metadata to map it to a local vlan - egress hook: - if port is a tunnel port, use tunnel info attached to vlan to set dst_metadata on the skb CC: Nikolay Aleksandrov <nikolay@cumulusnetworks.com> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03bridge: per vlan dst_metadata netlink supportRoopa Prabhu7-48/+641 This patch adds support to attach per vlan tunnel info dst metadata. This enables bridge driver to map vlan to tunnel_info at ingress and egress. It uses the kernel dst_metadata infrastructure. The initial use case is vlan to vni bridging, but the api is generic to extend to any tunnel_info in the future: - Uapi to configure/unconfigure/dump per vlan tunnel data - netlink functions to configure vlan and tunnel_info mapping - Introduces bridge port flag BR_LWT_VLAN to enable attach/detach dst_metadata to bridged packets on ports. off by default. - changes to existing code is mainly refactor some existing vlan handling netlink code + hooks for new vlan tunnel code - I have kept the vlan tunnel code isolated in separate files. - most of the netlink vlan tunnel code is handling of vlan-tunid ranges (follows the vlan range handling code). To conserve space vlan-tunid by default are always dumped in ranges if applicable. Use case: example use for this is a vxlan bridging gateway or vtep which maps vlans to vn-segments (or vnis). iproute2 example (patched and pruned iproute2 output to just show relevant fdb entries): example shows same host mac learnt on two vni's and vlan 100 maps to vni 1000, vlan 101 maps to vni 1001 before (netdev per vni): $bridge fdb show | grep "00:02:00:00:00:03" 00:02:00:00:00:03 dev vxlan1001 vlan 101 master bridge 00:02:00:00:00:03 dev vxlan1001 dst 12.0.0.8 self 00:02:00:00:00:03 dev vxlan1000 vlan 100 master bridge 00:02:00:00:00:03 dev vxlan1000 dst 12.0.0.8 self after this patch with collect metdata in bridged mode (single netdev): $bridge fdb show | grep "00:02:00:00:00:03" 00:02:00:00:00:03 dev vxlan0 vlan 101 master bridge 00:02:00:00:00:03 dev vxlan0 src_vni 1001 dst 12.0.0.8 self 00:02:00:00:00:03 dev vxlan0 vlan 100 master bridge 00:02:00:00:00:03 dev vxlan0 src_vni 1000 dst 12.0.0.8 self CC: Nikolay Aleksandrov <nikolay@cumulusnetworks.com> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03net/sched: act_ife: Change to use ife moduleYotam Gigi2-78/+33 Use the encode/decode functionality from the ife module instead of using implementation inside the act_ife. Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: Yotam Gigi <yotamg@mellanox.com> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> Signed-off-by: Roman Mashak <mrv@mojatatu.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03net: Introduce ife encapsulation moduleYotam Gigi5-0/+165 This module is responsible for the ife encapsulation protocol encode/decode logics. That module can: - ife_encode: encode skb and reserve space for the ife meta header - ife_decode: decode skb and extract the meta header size - ife_tlv_meta_encode - encodes one tlv entry into the reserved ife header space. - ife_tlv_meta_decode - decodes one tlv entry from the packet - ife_tlv_meta_next - advance to the next tlv Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: Yotam Gigi <yotamg@mellanox.com> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> Signed-off-by: Roman Mashak <mrv@mojatatu.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03net/sched: act_ife: Unexport ife_tlv_meta_encodeYotam Gigi1-2/+2 As the function ife_tlv_meta_encode is not used by any other module, unexport it and make it static for the act_ife module. Signed-off-by: Yotam Gigi <yotamg@mellanox.com> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> Signed-off-by: Roman Mashak <mrv@mojatatu.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03tcp: add tcp_mss_clamp() helperEric Dumazet