/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/percpu.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/export.h>

#include "rds.h"

/*
 * This file implements a getsockopt() call which copies a set of fixed
 * sized structs into a user-specified buffer as a means of providing
 * read-only information about RDS.
 *
 * For a given information source there are a given number of fixed sized
 * structs at a given time.  The structs are only copied if the user-specified
 * buffer is big enough.  The destination pages that make up the buffer
 * are pinned for the duration of the copy.
 *
 * This gives us the following benefits:
 *
 * - simple implementation, no copy "position" across multiple calls
 * - consistent snapshot of an info source
 * - atomic copy works well with whatever locking info source has
 * - one portable tool to get rds info across implementations
 * - long-lived tool can get info without allocating
 *
 * at the following costs:
 *
 * - info source copy must be pinned, may be "large"
 */

struct rds_info_iterator {
        struct page **pages;
        void *addr;
        unsigned long offset;
};

static DEFINE_SPINLOCK(rds_info_lock);
static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1];

void rds_info_register_func(int optname, rds_info_func func)
{
        int offset = optname - RDS_INFO_FIRST;

        BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);

        spin_lock(&rds_info_lock);
        BUG_ON(rds_info_funcs[offset]);
        rds_info_funcs[offset] = func;
        spin_unlock(&rds_info_lock);
}
EXPORT_SYMBOL_GPL(rds_info_register_func);

void rds_info_deregister_func(int optname, rds_info_func func)
{
        int offset = optname - RDS_INFO_FIRST;

        BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);

        spin_lock(&rds_info_lock);
        BUG_ON(rds_info_funcs[offset] != func);
        rds_info_funcs[offset] = NULL;
        spin_unlock(&rds_info_lock);
}
EXPORT_SYMBOL_GPL(rds_info_deregister_func);

/*
 * Typically we hold an atomic kmap across multiple rds_info_copy() calls
 * because the kmap is so expensive.  This must be called before using blocking
 * operations while holding the mapping and as the iterator is torn down.
 */
void rds_info_iter_unmap(struct rds_info_iterator *iter)
{
        if (iter->addr) {
                kunmap_atomic(iter->addr);
                iter->addr = NULL;
        }
}
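/*
 * Illustrative sketch of an info source (kept inside a comment so it is
 * not compiled).  RDS_INFO_FOO, struct rds_info_foo and the foo helpers
 * below are hypothetical names; real callbacks of this shape live in the
 * connection and statistics code.  A callback must always fill in
 * lens->nr and lens->each, and should only emit structs while the
 * caller's buffer (len) still has room for them:
 *
 *      static void rds_foo_info(struct socket *sock, unsigned int len,
 *                               struct rds_info_iterator *iter,
 *                               struct rds_info_lengths *lens)
 *      {
 *              struct rds_info_foo foo;
 *              unsigned int nr = 0;
 *
 *              for_each_foo(...) {     walk under the source's own locking
 *                      if (len >= (nr + 1) * sizeof(foo)) {
 *                              fill_foo(&foo);
 *                              rds_info_copy(iter, &foo, sizeof(foo));
 *                      }
 *                      nr++;
 *              }
 *
 *              lens->nr = nr;
 *              lens->each = sizeof(foo);
 *      }
 *
 * and is hooked up with:
 *
 *      rds_info_register_func(RDS_INFO_FOO, rds_foo_info);    at init
 *      rds_info_deregister_func(RDS_INFO_FOO, rds_foo_info);  at teardown
 */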
/*
 * get_user_pages() called flush_dcache_page() on the pages for us.
 */
void rds_info_copy(struct rds_info_iterator *iter, void *data,
                   unsigned long bytes)
{
        unsigned long this;

        while (bytes) {
                if (!iter->addr)
                        iter->addr = kmap_atomic(*iter->pages);

                this = min(bytes, PAGE_SIZE - iter->offset);

                rdsdebug("page %p addr %p offset %lu this %lu data %p "
                         "bytes %lu\n", *iter->pages, iter->addr,
                         iter->offset, this, data, bytes);

                memcpy(iter->addr + iter->offset, data, this);

                data += this;
                bytes -= this;
                iter->offset += this;

                if (iter->offset == PAGE_SIZE) {
                        kunmap_atomic(iter->addr);
                        iter->addr = NULL;
                        iter->offset = 0;
                        iter->pages++;
                }
        }
}
EXPORT_SYMBOL_GPL(rds_info_copy);

/*
 * @optval points to the userspace buffer that the information snapshot
 * will be copied into.
 *
 * @optlen on input is the size of the buffer in userspace.  @optlen
 * on output is the size of the requested snapshot in bytes.
 *
 * This function returns -errno if there is a failure, particularly -ENOSPC
 * if the given userspace buffer was not large enough to fit the snapshot.
 * On success it returns the positive number of bytes of each array element
 * in the snapshot.
 */
int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
                        int __user *optlen)
{
        struct rds_info_iterator iter;
        struct rds_info_lengths lens;
        unsigned long nr_pages = 0;
        unsigned long start;
        unsigned long i;
        rds_info_func func;
        struct page **pages = NULL;
        int ret;
        int len;
        int total;

        if (get_user(len, optlen)) {
                ret = -EFAULT;
                goto out;
        }

        /* check for all kinds of wrapping and the like */
        start = (unsigned long)optval;
        if (len < 0 || len > INT_MAX - PAGE_SIZE + 1 || start + len < start) {
                ret = -EINVAL;
                goto out;
        }

        /* a 0 len call is just trying to probe its length */
        if (len == 0)
                goto call_func;

        nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
                        >> PAGE_SHIFT;

        pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
        if (!pages) {
                ret = -ENOMEM;
                goto out;
        }
        ret = get_user_pages_fast(start, nr_pages, 1, pages);
        if (ret != nr_pages) {
                if (ret > 0)
                        nr_pages = ret;
                else
                        nr_pages = 0;
                ret = -EAGAIN; /* XXX ? */
                goto out;
        }

        rdsdebug("len %d nr_pages %lu\n", len, nr_pages);

call_func:
        func = rds_info_funcs[optname - RDS_INFO_FIRST];
        if (!func) {
                ret = -ENOPROTOOPT;
                goto out;
        }

        iter.pages = pages;
        iter.addr = NULL;
        iter.offset = start & (PAGE_SIZE - 1);

        func(sock, len, &iter, &lens);
        BUG_ON(lens.each == 0);

        total = lens.nr * lens.each;

        rds_info_iter_unmap(&iter);

        if (total > len) {
                len = total;
                ret = -ENOSPC;
        } else {
                len = total;
                ret = lens.each;
        }

        if (put_user(len, optlen))
                ret = -EFAULT;

out:
        for (i = 0; pages && i < nr_pages; i++)
                put_page(pages[i]);
        kfree(pages);

        return ret;
}
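As a usage illustration of the protocol above (not part of the kernel file): a
userspace tool can retrieve a snapshot roughly as in the sketch below. It
assumes an RDS-capable kernel and the RDS_INFO_COUNTERS optname from
<linux/rds.h>; the PF_RDS and SOL_RDS fallback defines are only for older
header sets and use the values from the kernel's socket headers (AF_RDS = 21,
SOL_RDS = 276). On success getsockopt() returns the per-element size, while a
failure with ENOSPC means the required length was written back through optlen,
so the buffer is grown and the call retried.

/* rds_info_probe.c - sketch of the length-probe / ENOSPC retry loop */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <linux/rds.h>

#ifndef PF_RDS
#define PF_RDS  21      /* AF_RDS/PF_RDS value from the kernel's socket.h */
#endif
#ifndef SOL_RDS
#define SOL_RDS 276
#endif

int main(void)
{
        int fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
        socklen_t len = 0;      /* a zero length just probes the snapshot size */
        void *buf = NULL;
        int each;

        if (fd < 0) {
                perror("socket(PF_RDS)");
                return 1;
        }

        for (;;) {
                each = getsockopt(fd, SOL_RDS, RDS_INFO_COUNTERS, buf, &len);
                if (each >= 0)
                        break;                  /* 'each' is the size of one element */
                if (errno != ENOSPC) {
                        perror("getsockopt(RDS_INFO_COUNTERS)");
                        return 1;
                }
                buf = realloc(buf, len);        /* kernel wrote the required size into len */
                if (!buf)
                        return 1;
        }

        printf("snapshot: %u bytes total, %d bytes per element\n",
               (unsigned int)len, each);
        /* buf now holds len bytes of struct rds_info_counter, packed back to back */
        free(buf);
        return 0;
}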
2017-02-06  bridge: modify bridge and port to have often accessed fields in one cache line
            Nikolay Aleksandrov  (1 file changed, -23/+20)

Move around net_bridge so the vlan fields are in the beginning since
they're checked on every packet even if vlan filtering is disabled. For
the port, move flags & vlan group to the beginning so they're in the same
cache line with the port's state (both flags and state are checked on
each packet).

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

2017-02-06  net: remove ndo_neigh_{construct, destroy} from stacked devices
            Ido Schimmel  (1 file changed, -2/+0)

In commit 18bfb924f000 ("net: introduce default neigh_construct/destroy
ndo calls for L2 upper devices") we added these ndos to stacked devices
such as team and bond, so that calls would be propagated to mlxsw.
However, the previous commit removed the reliance on these ndos, and no
new users have appeared since the above-mentioned commit. We can
therefore safely remove this dead code.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

2017-02-03  Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next
            David S. Miller  (3 files changed, -32/+49)

Pablo Neira Ayuso says:

====================
Netfilter updates for net-next

The following patchset contains Netfilter updates for your net-next
tree, they are:

1) Stash the ctinfo 3-bit field into the pointer to the nf_conntrack
   object in sk_buff so we only access one single cacheline in the
   conntrack hotpath. Patchset from Florian Westphal.

2) Don't leak pointers to internal structures when exporting the
   x_tables ruleset back to userspace, from Willem DeBruijn. This
   includes new helper functions to copy data to userspace such as
   xt_data_to_user() as well as conversions of our ip_tables,
   ip6_tables and arp_tables clients to use it. Not surprisingly,
   ebtables requires an ad-hoc update. There is also a new field in
   x_tables extensions to indicate the amount of bytes that we copy
   to userspace.

3) Add the nf_log_all_netns sysctl: this new knob allows you to enable
   logging via the nf_log infrastructure for all existing net
   namespaces. Given the effort to provide pernet syslog has been
   discontinued, let's provide a way to restore logging using
   netfilter kernel logging facilities in trusted environments.
   Patch from Michal Kubecek.

4) Validate the SCTP checksum from the conntrack helper, from Davide
   Caratti.

5) Merge the UDPlite conntrack and NAT helpers into UDP; this was
   mostly a copy&paste from the original helper, from Florian Westphal.

6) Reset netfilter state when duplicating packets, also from Florian.

7) Remove an unnecessary check for broadcast in IPv6 in the pkttype
   match and nft_meta, from Liping Zhang.

8) Add missing code to deal with loopback packets from nft_meta when
   used by the netdev family, also from Liping.

9) Several cleanups on nf_tables: one to remove an unnecessary check
   from the netlink control plane path to add table, set and stateful
   objects, and code consolidation when unregistering chain hooks,
   from Gao Feng.

10) Fix a harmless reference counter underflow in IPVS that, however,
    results in problems with the introduction of the new refcount_t
    type, from David Windsor.

11) Enable LIBCRC32C from nf_ct_sctp instead of nf_nat_sctp, from
    Davide Caratti.

12) Missing documentation on the nf_tables uapi header, from Liping
    Zhang.

13) Use the rb_entry() helper in xt_connlimit, from Geliang Tang.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

2017-02-03  bridge: vlan dst_metadata hooks in ingress and egress paths
            Roopa Prabhu  (6 files changed, -2/+82)

- ingress hook: if the port is a tunnel port, use the tunnel info in the
  attached dst_metadata to map it to a local vlan
- egress hook: if the port is a tunnel port, use the tunnel info attached
  to the vlan to set dst_metadata on the skb

CC: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

2017-02-03  bridge: per vlan dst_metadata netlink support
            Roopa Prabhu  (7 files changed, -48/+641)

This patch adds support to attach per vlan tunnel info dst metadata. This
enables the bridge driver to map a vlan to tunnel_info at ingress and
egress. It uses the kernel dst_metadata infrastructure.

The initial use case is vlan to vni bridging, but the api is generic to
extend to any tunnel_info in the future:
- uapi to configure/unconfigure/dump per vlan tunnel data
- netlink functions to configure vlan and tunnel_info mapping
- introduces the bridge port flag BR_LWT_VLAN to enable attach/detach of
  dst_metadata to bridged packets on ports; off by default
- changes to existing code are mainly refactoring of some existing vlan
  handling netlink code + hooks for the new vlan tunnel code
- I have kept the vlan tunnel code isolated in separate files
- most of the netlink vlan tunnel code is handling of vlan-tunid ranges
  (it follows the vlan range handling code); to conserve space, vlan-tunid
  mappings are by default always dumped in ranges if applicable

Use case: an example use for this is a vxlan bridging gateway or vtep
which maps vlans to vn-segments (or vnis).

iproute2 example (patched and pruned iproute2 output to just show relevant
fdb entries): the example shows the same host mac learnt on two vnis, with
vlan 100 mapped to vni 1000 and vlan 101 mapped to vni 1001.

before (netdev per vni):
$bridge fdb show | grep "00:02:00:00:00:03"
00:02:00:00:00:03 dev vxlan1001 vlan 101 master bridge
00:02:00:00:00:03 dev vxlan1001 dst 12.0.0.8 self
00:02:00:00:00:03 dev vxlan1000 vlan 100 master bridge
00:02:00:00:00:03 dev vxlan1000 dst 12.0.0.8 self

after this patch with collect metadata in bridged mode (single netdev):
$bridge fdb show | grep "00:02:00:00:00:03"
00:02:00:00:00:03 dev vxlan0 vlan 101 master bridge
00:02:00:00:00:03 dev vxlan0 src_vni 1001 dst 12.0.0.8 self
00:02:00:00:00:03 dev vxlan0 vlan 100 master bridge
00:02:00:00:00:03 dev vxlan0 src_vni 1000 dst 12.0.0.8 self

CC: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

2017-02-02  netfilter: allow logging from non-init namespaces
            Michal Kubeček  (1 file changed, -1/+1)

Commit 69b34fb996b2 ("netfilter: xt_LOG: add net namespace support for
xt_LOG") disabled logging packets using the LOG target from non-init
namespaces. The motivation was to prevent containers from flooding the
kernel log of the host. The plan was to keep it that way until the syslog
namespace implementation allowed containers to log in a safe way.
However, the work on syslog namespaces seems to have hit a dead end
somewhere in 2013, and there are users who want to use xt_LOG in all
network namespaces. This patch allows doing so by setting
/proc/sys/net/netfilter/nf_log_all_netns to a nonzero value. This sysctl
is only accessible from init_net so that one cannot switch the behaviour
from inside a container.

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>