/*
 * Copyright (c) 2006 Oracle.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */
#include <linux/percpu.h>
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/proc_fs.h>
#include <linux/export.h>

#include "rds.h"

/*
 * This file implements a getsockopt() call which copies a set of fixed
 * sized structs into a user-specified buffer as a means of providing
 * read-only information about RDS.
 *
 * For a given information source there are a given number of fixed sized
 * structs at a given time.  The structs are only copied if the user-specified
 * buffer is big enough.  The destination pages that make up the buffer
 * are pinned for the duration of the copy.
 *
 * This gives us the following benefits:
 *
 * - simple implementation, no copy "position" across multiple calls
 * - consistent snapshot of an info source
 * - atomic copy works well with whatever locking info source has
 * - one portable tool to get rds info across implementations
 * - long-lived tool can get info without allocating
 *
 * at the following costs:
 *
 * - info source copy must be pinned, may be "large"
 */

struct rds_info_iterator {
        struct page **pages;
        void *addr;
        unsigned long offset;
};

static DEFINE_SPINLOCK(rds_info_lock);
static rds_info_func rds_info_funcs[RDS_INFO_LAST - RDS_INFO_FIRST + 1];

void rds_info_register_func(int optname, rds_info_func func)
{
        int offset = optname - RDS_INFO_FIRST;

        BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);

        spin_lock(&rds_info_lock);
        BUG_ON(rds_info_funcs[offset]);
        rds_info_funcs[offset] = func;
        spin_unlock(&rds_info_lock);
}
EXPORT_SYMBOL_GPL(rds_info_register_func);

void rds_info_deregister_func(int optname, rds_info_func func)
{
        int offset = optname - RDS_INFO_FIRST;

        BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST);

        spin_lock(&rds_info_lock);
        BUG_ON(rds_info_funcs[offset] != func);
        rds_info_funcs[offset] = NULL;
        spin_unlock(&rds_info_lock);
}
EXPORT_SYMBOL_GPL(rds_info_deregister_func);

/*
 * Typically we hold an atomic kmap across multiple rds_info_copy() calls
 * because the kmap is so expensive.  This must be called before using blocking
 * operations while holding the mapping and as the iterator is torn down.
 */
void rds_info_iter_unmap(struct rds_info_iterator *iter)
{
        if (iter->addr) {
                kunmap_atomic(iter->addr);
                iter->addr = NULL;
        }
}
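/*
 * Illustrative sketch of an info source (kept inside a comment so it is
 * not compiled).  RDS_INFO_FOO, struct rds_info_foo and the foo helpers
 * below are hypothetical names; real callbacks of this shape live in the
 * connection and statistics code.  A callback must always fill in
 * lens->nr and lens->each, and should only emit structs while the
 * caller's buffer (len) still has room for them:
 *
 *      static void rds_foo_info(struct socket *sock, unsigned int len,
 *                               struct rds_info_iterator *iter,
 *                               struct rds_info_lengths *lens)
 *      {
 *              struct rds_info_foo foo;
 *              unsigned int nr = 0;
 *
 *              for_each_foo(...) {     walk under the source's own locking
 *                      if (len >= (nr + 1) * sizeof(foo)) {
 *                              fill_foo(&foo);
 *                              rds_info_copy(iter, &foo, sizeof(foo));
 *                      }
 *                      nr++;
 *              }
 *
 *              lens->nr = nr;
 *              lens->each = sizeof(foo);
 *      }
 *
 * and is hooked up with:
 *
 *      rds_info_register_func(RDS_INFO_FOO, rds_foo_info);    at init
 *      rds_info_deregister_func(RDS_INFO_FOO, rds_foo_info);  at teardown
 */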
/*
 * get_user_pages() called flush_dcache_page() on the pages for us.
 */
void rds_info_copy(struct rds_info_iterator *iter, void *data,
                   unsigned long bytes)
{
        unsigned long this;

        while (bytes) {
                if (!iter->addr)
                        iter->addr = kmap_atomic(*iter->pages);

                this = min(bytes, PAGE_SIZE - iter->offset);

                rdsdebug("page %p addr %p offset %lu this %lu data %p "
                         "bytes %lu\n", *iter->pages, iter->addr,
                         iter->offset, this, data, bytes);

                memcpy(iter->addr + iter->offset, data, this);

                data += this;
                bytes -= this;
                iter->offset += this;

                if (iter->offset == PAGE_SIZE) {
                        kunmap_atomic(iter->addr);
                        iter->addr = NULL;
                        iter->offset = 0;
                        iter->pages++;
                }
        }
}
EXPORT_SYMBOL_GPL(rds_info_copy);

/*
 * @optval points to the userspace buffer that the information snapshot
 * will be copied into.
 *
 * @optlen on input is the size of the buffer in userspace.  @optlen
 * on output is the size of the requested snapshot in bytes.
 *
 * This function returns -errno if there is a failure, particularly -ENOSPC
 * if the given userspace buffer was not large enough to fit the snapshot.
 * On success it returns the positive number of bytes of each array element
 * in the snapshot.
 */
int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval,
                        int __user *optlen)
{
        struct rds_info_iterator iter;
        struct rds_info_lengths lens;
        unsigned long nr_pages = 0;
        unsigned long start;
        unsigned long i;
        rds_info_func func;
        struct page **pages = NULL;
        int ret;
        int len;
        int total;

        if (get_user(len, optlen)) {
                ret = -EFAULT;
                goto out;
        }

        /* check for all kinds of wrapping and the like */
        start = (unsigned long)optval;
        if (len < 0 || len > INT_MAX - PAGE_SIZE + 1 || start + len < start) {
                ret = -EINVAL;
                goto out;
        }

        /* a 0 len call is just trying to probe its length */
        if (len == 0)
                goto call_func;

        nr_pages = (PAGE_ALIGN(start + len) - (start & PAGE_MASK))
                        >> PAGE_SHIFT;

        pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL);
        if (!pages) {
                ret = -ENOMEM;
                goto out;
        }
        ret = get_user_pages_fast(start, nr_pages, 1, pages);
        if (ret != nr_pages) {
                if (ret > 0)
                        nr_pages = ret;
                else
                        nr_pages = 0;
                ret = -EAGAIN; /* XXX ? */
                goto out;
        }

        rdsdebug("len %d nr_pages %lu\n", len, nr_pages);

call_func:
        func = rds_info_funcs[optname - RDS_INFO_FIRST];
        if (!func) {
                ret = -ENOPROTOOPT;
                goto out;
        }

        iter.pages = pages;
        iter.addr = NULL;
        iter.offset = start & (PAGE_SIZE - 1);

        func(sock, len, &iter, &lens);
        BUG_ON(lens.each == 0);

        total = lens.nr * lens.each;

        rds_info_iter_unmap(&iter);

        if (total > len) {
                len = total;
                ret = -ENOSPC;
        } else {
                len = total;
                ret = lens.each;
        }

        if (put_user(len, optlen))
                ret = -EFAULT;

out:
        for (i = 0; pages && i < nr_pages; i++)
                put_page(pages[i]);
        kfree(pages);

        return ret;
}
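As a usage illustration of the protocol above (not part of the kernel file): a
userspace tool can retrieve a snapshot roughly as in the sketch below. It
assumes an RDS-capable kernel and the RDS_INFO_COUNTERS optname from
<linux/rds.h>; the PF_RDS and SOL_RDS fallback defines are only for older
header sets and use the values from the kernel's socket headers (AF_RDS = 21,
SOL_RDS = 276). On success getsockopt() returns the per-element size, while a
failure with ENOSPC means the required length was written back through optlen,
so the buffer is grown and the call retried.

/* rds_info_probe.c - sketch of the length-probe / ENOSPC retry loop */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <linux/rds.h>

#ifndef PF_RDS
#define PF_RDS  21      /* AF_RDS/PF_RDS value from the kernel's socket.h */
#endif
#ifndef SOL_RDS
#define SOL_RDS 276
#endif

int main(void)
{
        int fd = socket(PF_RDS, SOCK_SEQPACKET, 0);
        socklen_t len = 0;      /* a zero length just probes the snapshot size */
        void *buf = NULL;
        int each;

        if (fd < 0) {
                perror("socket(PF_RDS)");
                return 1;
        }

        for (;;) {
                each = getsockopt(fd, SOL_RDS, RDS_INFO_COUNTERS, buf, &len);
                if (each >= 0)
                        break;                  /* 'each' is the size of one element */
                if (errno != ENOSPC) {
                        perror("getsockopt(RDS_INFO_COUNTERS)");
                        return 1;
                }
                buf = realloc(buf, len);        /* kernel wrote the required size into len */
                if (!buf)
                        return 1;
        }

        printf("snapshot: %u bytes total, %d bytes per element\n",
               (unsigned int)len, each);
        /* buf now holds len bytes of struct rds_info_counter, packed back to back */
        free(buf);
        return 0;
}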
2017-02-06  bridge: modify bridge and port to have often accessed fields in one cache line
            Nikolay Aleksandrov  (1 file changed, -23/+20)

Move around net_bridge so the vlan fields are in the beginning since
they're checked on every packet even if vlan filtering is disabled. For
the port, move flags & vlan group to the beginning so they're in the same
cache line with the port's state (both flags and state are checked on
each packet).

Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

2017-02-06  net: remove ndo_neigh_{construct, destroy} from stacked devices
            Ido Schimmel  (1 file changed, -2/+0)

In commit 18bfb924f000 ("net: introduce default neigh_construct/destroy
ndo calls for L2 upper devices") we added these ndos to stacked devices
such as team and bond, so that calls would be propagated to mlxsw.
However, the previous commit removed the reliance on these ndos, and no
new users have appeared since the above-mentioned commit. We can
therefore safely remove this dead code.

Signed-off-by: Ido Schimmel <idosch@mellanox.com>
Signed-off-by: Jiri Pirko <jiri@mellanox.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

2017-02-03  Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next
            David S. Miller  (3 files changed, -32/+49)

Pablo Neira Ayuso says:

====================
Netfilter updates for net-next

The following patchset contains Netfilter updates for your net-next
tree, they are:

1) Stash the ctinfo 3-bit field into the pointer to the nf_conntrack
   object in sk_buff so we only access one single cacheline in the
   conntrack hotpath. Patchset from Florian Westphal.

2) Don't leak pointers to internal structures when exporting the
   x_tables ruleset back to userspace, from Willem DeBruijn. This
   includes new helper functions to copy data to userspace such as
   xt_data_to_user() as well as conversions of our ip_tables,
   ip6_tables and arp_tables clients to use it. Not surprisingly,
   ebtables requires an ad-hoc update. There is also a new field in
   x_tables extensions to indicate the amount of bytes that we copy
   to userspace.

3) Add the nf_log_all_netns sysctl: this new knob allows you to enable
   logging via the nf_log infrastructure for all existing net
   namespaces. Given the effort to provide pernet syslog has been
   discontinued, let's provide a way to restore logging using
   netfilter kernel logging facilities in trusted environments.
   Patch from Michal Kubecek.

4) Validate the SCTP checksum from the conntrack helper, from Davide
   Caratti.

5) Merge the UDPlite conntrack and NAT helpers into UDP; this was
   mostly a copy&paste from the original helper, from Florian Westphal.

6) Reset netfilter state when duplicating packets, also from Florian.

7) Remove an unnecessary check for broadcast in IPv6 in the pkttype
   match and nft_meta, from Liping Zhang.

8) Add missing code to deal with loopback packets from nft_meta when
   used by the netdev family, also from Liping.

9) Several cleanups on nf_tables: one to remove an unnecessary check
   from the netlink control plane path to add table, set and stateful
   objects, and code consolidation when unregistering chain hooks,
   from Gao Feng.

10) Fix a harmless reference counter underflow in IPVS that, however,
    results in problems with the introduction of the new refcount_t
    type, from David Windsor.

11) Enable LIBCRC32C from nf_ct_sctp instead of nf_nat_sctp, from
    Davide Caratti.

12) Missing documentation on the nf_tables uapi header, from Liping
    Zhang.

13) Use the rb_entry() helper in xt_connlimit, from Geliang Tang.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>

2017-02-03  bridge: vlan dst_metadata hooks in ingress and egress paths
            Roopa Prabhu  (6 files changed, -2/+82)

- ingress hook: if the port is a tunnel port, use the tunnel info in the
  attached dst_metadata to map it to a local vlan
- egress hook: if the port is a tunnel port, use the tunnel info attached
  to the vlan to set dst_metadata on the skb

CC: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

2017-02-03  bridge: per vlan dst_metadata netlink support
            Roopa Prabhu  (7 files changed, -48/+641)

This patch adds support to attach per vlan tunnel info dst metadata. This
enables the bridge driver to map a vlan to tunnel_info at ingress and
egress. It uses the kernel dst_metadata infrastructure.

The initial use case is vlan to vni bridging, but the api is generic to
extend to any tunnel_info in the future:
- uapi to configure/unconfigure/dump per vlan tunnel data
- netlink functions to configure vlan and tunnel_info mapping
- introduces the bridge port flag BR_LWT_VLAN to enable attach/detach of
  dst_metadata to bridged packets on ports; off by default
- changes to existing code are mainly refactoring of some existing vlan
  handling netlink code + hooks for the new vlan tunnel code
- I have kept the vlan tunnel code isolated in separate files
- most of the netlink vlan tunnel code is handling of vlan-tunid ranges
  (it follows the vlan range handling code); to conserve space, vlan-tunid
  mappings are by default always dumped in ranges if applicable

Use case: an example use for this is a vxlan bridging gateway or vtep
which maps vlans to vn-segments (or vnis).

iproute2 example (patched and pruned iproute2 output to just show relevant
fdb entries): the example shows the same host mac learnt on two vnis, with
vlan 100 mapped to vni 1000 and vlan 101 mapped to vni 1001.

before (netdev per vni):
$bridge fdb show | grep "00:02:00:00:00:03"
00:02:00:00:00:03 dev vxlan1001 vlan 101 master bridge
00:02:00:00:00:03 dev vxlan1001 dst 12.0.0.8 self
00:02:00:00:00:03 dev vxlan1000 vlan 100 master bridge
00:02:00:00:00:03 dev vxlan1000 dst 12.0.0.8 self

after this patch with collect metadata in bridged mode (single netdev):
$bridge fdb show | grep "00:02:00:00:00:03"
00:02:00:00:00:03 dev vxlan0 vlan 101 master bridge
00:02:00:00:00:03 dev vxlan0 src_vni 1001 dst 12.0.0.8 self
00:02:00:00:00:03 dev vxlan0 vlan 100 master bridge
00:02:00:00:00:03 dev vxlan0 src_vni 1000 dst 12.0.0.8 self

CC: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
Signed-off-by: David S. Miller <davem@davemloft.net>

2017-02-02  netfilter: allow logging from non-init namespaces
            Michal Kubeček  (1 file changed, -1/+1)

Commit 69b34fb996b2 ("netfilter: xt_LOG: add net namespace support for
xt_LOG") disabled logging packets using the LOG target from non-init
namespaces. The motivation was to prevent containers from flooding the
kernel log of the host. The plan was to keep it that way until the syslog
namespace implementation allowed containers to log in a safe way.
However, the work on syslog namespaces seems to have hit a dead end
somewhere in 2013, and there are users who want to use xt_LOG in all
network namespaces. This patch allows doing so by setting
/proc/sys/net/netfilter/nf_log_all_netns to a nonzero value. This sysctl
is only accessible from init_net so that one cannot switch the behaviour
from inside a container.

Signed-off-by: Michal Kubecek <mkubecek@suse.cz>
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>