/* Copyright (C) 2010: YOSHIFUJI Hideaki <yoshfuji@linux-ipv6.org>
 * Copyright (C) 2015: Linus Lüssing <linus.luessing@c0d3.blue>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 *
 * Based on the MLD support added to br_multicast.c by YOSHIFUJI Hideaki.
 */

#include <linux/skbuff.h>
#include <net/ipv6.h>
#include <net/mld.h>
#include <net/addrconf.h>
#include <net/ip6_checksum.h>

static int ipv6_mc_check_ip6hdr(struct sk_buff *skb)
{
	const struct ipv6hdr *ip6h;
	unsigned int len;
	unsigned int offset = skb_network_offset(skb) + sizeof(*ip6h);

	if (!pskb_may_pull(skb, offset))
		return -EINVAL;

	ip6h = ipv6_hdr(skb);

	if (ip6h->version != 6)
		return -EINVAL;

	len = offset + ntohs(ip6h->payload_len);
	if (skb->len < len || len <= offset)
		return -EINVAL;

	return 0;
}

static int ipv6_mc_check_exthdrs(struct sk_buff *skb)
{
	const struct ipv6hdr *ip6h;
	int offset;
	u8 nexthdr;
	__be16 frag_off;

	ip6h = ipv6_hdr(skb);

	if (ip6h->nexthdr != IPPROTO_HOPOPTS)
		return -ENOMSG;

	nexthdr = ip6h->nexthdr;
	offset = skb_network_offset(skb) + sizeof(*ip6h);
	offset = ipv6_skip_exthdr(skb, offset, &nexthdr, &frag_off);

	if (offset < 0)
		return -EINVAL;

	if (nexthdr != IPPROTO_ICMPV6)
		return -ENOMSG;

	skb_set_transport_header(skb, offset);

	return 0;
}

static int ipv6_mc_check_mld_reportv2(struct sk_buff *skb)
{
	unsigned int len = skb_transport_offset(skb);

	len += sizeof(struct mld2_report);

	return pskb_may_pull(skb, len) ? 0 : -EINVAL;
}

static int ipv6_mc_check_mld_query(struct sk_buff *skb)
{
	struct mld_msg *mld;
	unsigned int len = skb_transport_offset(skb);

	/* RFC2710+RFC3810 (MLDv1+MLDv2) require link-local source addresses */
	if (!(ipv6_addr_type(&ipv6_hdr(skb)->saddr) & IPV6_ADDR_LINKLOCAL))
		return -EINVAL;

	len += sizeof(struct mld_msg);
	if (skb->len < len)
		return -EINVAL;

	/* MLDv1? */
	if (skb->len != len) {
		/* or MLDv2? */
		len += sizeof(struct mld2_query) - sizeof(struct mld_msg);
		if (skb->len < len || !pskb_may_pull(skb, len))
			return -EINVAL;
	}

	mld = (struct mld_msg *)skb_transport_header(skb);

	/* RFC2710+RFC3810 (MLDv1+MLDv2) require the multicast link layer
	 * all-nodes destination address (ff02::1) for general queries
	 */
	if (ipv6_addr_any(&mld->mld_mca) &&
	    !ipv6_addr_is_ll_all_nodes(&ipv6_hdr(skb)->daddr))
		return -EINVAL;

	return 0;
}

static int ipv6_mc_check_mld_msg(struct sk_buff *skb)
{
	struct mld_msg *mld = (struct mld_msg *)skb_transport_header(skb);

	switch (mld->mld_type) {
	case ICMPV6_MGM_REDUCTION:
	case ICMPV6_MGM_REPORT:
		/* fall through */
		return 0;
	case ICMPV6_MLD2_REPORT:
		return ipv6_mc_check_mld_reportv2(skb);
	case ICMPV6_MGM_QUERY:
		return ipv6_mc_check_mld_query(skb);
	default:
		return -ENOMSG;
	}
}

static inline __sum16 ipv6_mc_validate_checksum(struct sk_buff *skb)
{
	return skb_checksum_validate(skb, IPPROTO_ICMPV6, ip6_compute_pseudo);
}

static int __ipv6_mc_check_mld(struct sk_buff *skb,
			       struct sk_buff **skb_trimmed)

{
	struct sk_buff *skb_chk = NULL;
	unsigned int transport_len;
	unsigned int len = skb_transport_offset(skb) + sizeof(struct mld_msg);
	int ret = -EINVAL;

	transport_len = ntohs(ipv6_hdr(skb)->payload_len);
	transport_len -= skb_transport_offset(skb) - sizeof(struct ipv6hdr);

	skb_chk = skb_checksum_trimmed(skb, transport_len,
				       ipv6_mc_validate_checksum);
	if (!skb_chk)
		goto err;

	if (!pskb_may_pull(skb_chk, len))
		goto err;

	ret = ipv6_mc_check_mld_msg(skb_chk);
	if (ret)
		goto err;

	if (skb_trimmed)
		*skb_trimmed = skb_chk;
	/* free now unneeded clone */
	else if (skb_chk != skb)
		kfree_skb(skb_chk);

	ret = 0;

err:
	if (ret && skb_chk && skb_chk != skb)
		kfree_skb(skb_chk);

	return ret;
}

/**
 * ipv6_mc_check_mld - checks whether this is a sane MLD packet
 * @skb: the skb to validate
 * @skb_trimmed: to store an skb pointer trimmed to IPv6 packet tail (optional)
 *
 * Checks whether an IPv6 packet is a valid MLD packet. If so sets
 * skb transport header accordingly and returns zero.
 *
 * -EINVAL: A broken packet was detected, i.e. it violates some internet
 *  standard
 * -ENOMSG: IP header validation succeeded but it is not an MLD packet.
 * -ENOMEM: A memory allocation failure happened.
 *
 * Optionally, an skb pointer might be provided via skb_trimmed (or set it
 * to NULL): After parsing an MLD packet successfully it will point to
 * an skb which has its tail aligned to the IP packet end. This might
 * either be the originally provided skb or a trimmed, cloned version if
 * the skb frame had data beyond the IP packet. A cloned skb allows us
 * to leave the original skb and its full frame unchanged (which might be
 * desirable for layer 2 frame jugglers).
 *
 * Caller needs to set the skb network header and free any returned skb if it
 * differs from the provided skb.
 */
int ipv6_mc_check_mld(struct sk_buff *skb, struct sk_buff **skb_trimmed)
{
	int ret;

	ret = ipv6_mc_check_ip6hdr(skb);
	if (ret < 0)
		return ret;

	ret = ipv6_mc_check_exthdrs(skb);
	if (ret < 0)
		return ret;

	return __ipv6_mc_check_mld(skb, skb_trimmed);
}
EXPORT_SYMBOL(ipv6_mc_check_mld);
b/usbip?id=b91e1302ad9b80c174a4855533f7e3aa2873355e&amp;id2=2d706e790f0508dff4fb72eca9b4892b79757feb'>diff</a>)</td></tr></table>
<div class='commit-subject'>mm: optimize PageWaiters bit use for unlock_page()</div><div class='commit-msg'>In commit 62906027091f ("mm: add PageWaiters indicating tasks are
waiting for a page bit") Nick Piggin made our page locking no longer
unconditionally touch the hashed page waitqueue, which not only helps
performance in general, but is particularly helpful on NUMA machines
where the hashed wait queues can bounce around a lot.

However, the "clear lock bit atomically and then test the waiters bit"
sequence turns out to be much more expensive than it needs to be,
because you get a nasty stall when trying to access the same word that
just got updated atomically.

On architectures where locking is done with LL/SC, this would be trivial
to fix with a new primitive that clears one bit and tests another
atomically, but that ends up not working on x86, where the only atomic
operations that return the result end up being cmpxchg and xadd.  The
atomic bit operations return the old value of the same bit we changed,
not the value of an unrelated bit.

On x86, we could put the lock bit in the high bit of the byte, and use
"xadd" with that bit (where the overflow ends up not touching other
bits), and look at the other bits of the result.  However, an even
simpler model is to just use a regular atomic "and" to clear the lock
bit, and then the sign bit in eflags will indicate the resulting state
of the unrelated bit #7.

So by moving the PageWaiters bit up to bit #7, we can atomically clear
the lock bit and test the waiters bit on x86 too.  And architectures
with LL/SC (which is all the usual RISC suspects), the particular bit
doesn't matter, so they are fine with this approach too.

This avoids the extra access to the same atomic word, and thus avoids
the costly stall at page unlock time.

The only downside is that the interface ends up being a bit odd and
specialized: clear a bit in a byte, and test the sign bit.  Nick doesn't
love the resulting name of the new primitive, but I'd rather make the
name be descriptive and very clear about the limitation imposed by
trying to work across all relevant architectures than make it be some
generic thing that doesn't make the odd semantics explicit.

So this introduces the new architecture primitive

    clear_bit_unlock_is_negative_byte();

and adds the trivial implementation for x86.  We have a generic
non-optimized fallback (that just does a "clear_bit()"+"test_bit(7)"
combination) which can be overridden by any architecture that can do
better.  According to Nick, Power has the same hickup x86 has, for
example, but some other architectures may not even care.

All these optimizations mean that my page locking stress-test (which is
just executing a lot of small short-lived shell scripts: "make test" in
the git source tree) no longer makes our page locking look horribly bad.
Before all these optimizations, just the unlock_page() costs were just
over 3% of all CPU overhead on "make test".  After this, it's down to
0.66%, so just a quarter of the cost it used to be.

(The difference on NUMA is bigger, but there this micro-optimization is
likely less noticeable, since the big issue on NUMA was not the accesses
to 'struct page', but the waitqueue accesses that were already removed
by Nick's earlier commit).

Acked-by: Nick Piggin &lt;npiggin@gmail.com&gt;
Cc: Dave Hansen &lt;dave.hansen@linux.intel.com&gt;
Cc: Bob Peterson &lt;rpeterso@redhat.com&gt;
Cc: Steven Whitehouse &lt;swhiteho@redhat.com&gt;
Cc: Andrew Lutomirski &lt;luto@kernel.org&gt;
Cc: Andreas Gruenbacher &lt;agruenba@redhat.com&gt;
Cc: Peter Zijlstra &lt;peterz@infradead.org&gt;
Cc: Mel Gorman &lt;mgorman@techsingularity.net&gt;
Signed-off-by: Linus Torvalds &lt;torvalds@linux-foundation.org&gt;
</div><div class='diffstat-header'><a href='/cgit.cgi/linux/net-next.git/diff/?id=b91e1302ad9b80c174a4855533f7e3aa2873355e'>Diffstat</a> (limited to 'drivers/usb/usbip')</div><table summary='diffstat' class='diffstat'>