summaryrefslogtreecommitdiff
path: root/net/mpls/mpls_iptunnel.c
blob: 1d281c1ff7c10b3ae6e0245e2b95cc404dd791c4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
/*
 * mpls tunnels	An implementation mpls tunnels using the light weight tunnel
 *		infrastructure
 *
 * Authors:	Roopa Prabhu, <roopa@cumulusnetworks.com>
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 */
#include <linux/types.h>
#include <linux/skbuff.h>
#include <linux/net.h>
#include <linux/module.h>
#include <linux/mpls.h>
#include <linux/vmalloc.h>
#include <net/ip.h>
#include <net/dst.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/netns/generic.h>
#include <net/ip6_fib.h>
#include <net/route.h>
#include <net/mpls_iptunnel.h>
#include <linux/mpls_iptunnel.h>
#include "internal.h"

static const struct nla_policy mpls_iptunnel_policy[MPLS_IPTUNNEL_MAX + 1] = {
	[MPLS_IPTUNNEL_DST]	= { .type = NLA_U32 },
};

static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en)
{
	/* The size of the layer 2.5 labels to be added for this route */
	return en->labels * sizeof(struct mpls_shim_hdr);
}

static int mpls_xmit(struct sk_buff *skb)
{
	struct mpls_iptunnel_encap *tun_encap_info;
	struct mpls_shim_hdr *hdr;
	struct net_device *out_dev;
	unsigned int hh_len;
	unsigned int new_header_size;
	unsigned int mtu;
	struct dst_entry *dst = skb_dst(skb);
	struct rtable *rt = NULL;
	struct rt6_info *rt6 = NULL;
	int err = 0;
	bool bos;
	int i;
	unsigned int ttl;

	/* Obtain the ttl */
	if (dst->ops->family == AF_INET) {
		ttl = ip_hdr(skb)->ttl;
		rt = (struct rtable *)dst;
	} else if (dst->ops->family == AF_INET6) {
		ttl = ipv6_hdr(skb)->hop_limit;
		rt6 = (struct rt6_info *)dst;
	} else {
		goto drop;
	}

	skb_orphan(skb);

	/* Find the output device */
	out_dev = dst->dev;
	if (!mpls_output_possible(out_dev) ||
	    !dst->lwtstate || skb_warn_if_lro(skb))
		goto drop;

	skb_forward_csum(skb);

	tun_encap_info = mpls_lwtunnel_encap(dst->lwtstate);

	/* Verify the destination can hold the packet */
	new_header_size = mpls_encap_size(tun_encap_info);
	mtu = mpls_dev_mtu(out_dev);
	if (mpls_pkt_too_big(skb, mtu - new_header_size))
		goto drop;

	hh_len = LL_RESERVED_SPACE(out_dev);
	if (!out_dev->header_ops)
		hh_len = 0;

	/* Ensure there is enough space for the headers in the skb */
	if (skb_cow(skb, hh_len + new_header_size))
		goto drop;

	skb_set_inner_protocol(skb, skb->protocol);
	skb_reset_inner_network_header(skb);

	skb_push(skb, new_header_size);

	skb_reset_network_header(skb);

	skb->dev = out_dev;
	skb->protocol = htons(ETH_P_MPLS_UC);

	/* Push the new labels */
	hdr = mpls_hdr(skb);
	bos = true;
	for (i = tun_encap_info->labels - 1; i >= 0; i--) {
		hdr[i] = mpls_entry_encode(tun_encap_info->label[i],
					   ttl, 0, bos);
		bos = false;
	}

	if (rt)
		err = neigh_xmit(NEIGH_ARP_TABLE, out_dev, &rt->rt_gateway,
				 skb);
	else if (rt6)
		err = neigh_xmit(NEIGH_ND_TABLE, out_dev, &rt6->rt6i_gateway,
				 skb);
	if (err)
		net_dbg_ratelimited("%s: packet transmission failed: %d\n",
				    __func__, err);

	return LWTUNNEL_XMIT_DONE;

drop:
	kfree_skb(skb);
	return -EINVAL;
}

static int mpls_build_state(struct net_device *dev, struct nlattr *nla,
			    unsigned int family, const void *cfg,
			    struct lwtunnel_state **ts)
{
	struct mpls_iptunnel_encap *tun_encap_info;
	struct nlattr *tb[MPLS_IPTUNNEL_MAX + 1];
	struct lwtunnel_state *newts;
	int ret;

	ret = nla_parse_nested(tb, MPLS_IPTUNNEL_MAX, nla,
			       mpls_iptunnel_policy);
	if (ret < 0)
		return ret;

	if (!tb[MPLS_IPTUNNEL_DST])
		return -EINVAL;


	newts = lwtunnel_state_alloc(sizeof(*tun_encap_info));
	if (!newts)
		return -ENOMEM;

	tun_encap_info = mpls_lwtunnel_encap(newts);
	ret = nla_get_labels(tb[MPLS_IPTUNNEL_DST], MAX_NEW_LABELS,
			     &tun_encap_info->labels, tun_encap_info->label);
	if (ret)
		goto errout;
	newts->type = LWTUNNEL_ENCAP_MPLS;
	newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
	newts->headroom = mpls_encap_size(tun_encap_info);

	*ts = newts;

	return 0;

errout:
	kfree(newts);
	*ts = NULL;

	return ret;
}

static int mpls_fill_encap_info(struct sk_buff *skb,
				struct lwtunnel_state *lwtstate)
{
	struct mpls_iptunnel_encap *tun_encap_info;
	
	tun_encap_info = mpls_lwtunnel_encap(lwtstate);

	if (nla_put_labels(skb, MPLS_IPTUNNEL_DST, tun_encap_info->labels,
			   tun_encap_info->label))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

static int mpls_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	struct mpls_iptunnel_encap *tun_encap_info;

	tun_encap_info = mpls_lwtunnel_encap(lwtstate);

	return nla_total_size(tun_encap_info->labels * 4);
}

static int mpls_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct mpls_iptunnel_encap *a_hdr = mpls_lwtunnel_encap(a);
	struct mpls_iptunnel_encap *b_hdr = mpls_lwtunnel_encap(b);
	int l;

	if (a_hdr->labels != b_hdr->labels)
		return 1;

	for (l = 0; l < MAX_NEW_LABELS; l++)
		if (a_hdr->label[l] != b_hdr->label[l])
			return 1;
	return 0;
}

static const struct lwtunnel_encap_ops mpls_iptun_ops = {
	.build_state = mpls_build_state,
	.xmit = mpls_xmit,
	.fill_encap = mpls_fill_encap_info,
	.get_encap_size = mpls_encap_nlsize,
	.cmp_encap = mpls_encap_cmp,
	.owner = THIS_MODULE,
};

static int __init mpls_iptunnel_init(void)
{
	return lwtunnel_encap_add_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS);
}
module_init(mpls_iptunnel_init);

static void __exit mpls_iptunnel_exit(void)
{
	lwtunnel_encap_del_ops(&mpls_iptun_ops, LWTUNNEL_ENCAP_MPLS);
}
module_exit(mpls_iptunnel_exit);

MODULE_ALIAS_RTNL_LWT(MPLS);
MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels");
MODULE_LICENSE("GPL v2");
CONFIG_RELOCATABLE=y use absolute ELF symbol references as before" [0] http://marc.info/?l=linux-arch&m=148493613415294&w=2 * emailed patches from Ard Biesheuvel: module: unify absolute krctab definitions for 32-bit and 64-bit modversions: treat symbol CRCs as 32 bit quantities kbuild: modversions: add infrastructure for emitting relative CRCs 2017-02-03log2: make order_base_2() behave correctly on const input value zeroArd Biesheuvel1-1/+12 The function order_base_2() is defined (according to the comment block) as returning zero on input zero, but subsequently passes the input into roundup_pow_of_two(), which is explicitly undefined for input zero. This has gone unnoticed until now, but optimization passes in GCC 7 may produce constant folded function instances where a constant value of zero is passed into order_base_2(), resulting in link errors against the deliberately undefined '____ilog2_NaN'. So update order_base_2() to adhere to its own documented interface. [ See http://marc.info/?l=linux-kernel&m=147672952517795&w=2 and follow-up discussion for more background. The gcc "optimization pass" is really just broken, but now the GCC trunk problem seems to have escaped out of just specially built daily images, so we need to work around it in mainline. - Linus ] Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2017-02-03module: unify absolute krctab definitions for 32-bit and 64-bitArd Biesheuvel1-7/+0 The previous patch introduced a separate inline asm version of the krcrctab declaration template for use with 64-bit architectures, which cannot refer to ELF symbols using 32-bit quantities. This declaration should be equivalent to the C one for 32-bit architectures, but just in case - unify them in a separate patch, which can simply be dropped if it turns out to break anything. Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2017-02-03modversions: treat symbol CRCs as 32 bit quantitiesArd Biesheuvel3-12/+27 The modversion symbol CRCs are emitted as ELF symbols, which allows us to easily populate the kcrctab sections by relying on the linker to associate each kcrctab slot with the correct value. This has a couple of downsides: - Given that the CRCs are treated as memory addresses, we waste 4 bytes for each CRC on 64 bit architectures, - On architectures that support runtime relocation, a R_<arch>_RELATIVE relocation entry is emitted for each CRC value, which identifies it as a quantity that requires fixing up based on the actual runtime load offset of the kernel. This results in corrupted CRCs unless we explicitly undo the fixup (and this is currently being handled in the core module code) - Such runtime relocation entries take up 24 bytes of __init space each, resulting in a x8 overhead in [uncompressed] kernel size for CRCs. Switching to explicit 32 bit values on 64 bit architectures fixes most of these issues, given that 32 bit values are not treated as quantities that require fixing up based on the actual runtime load offset. Note that on some ELF64 architectures [such as PPC64], these 32-bit values are still emitted as [absolute] runtime relocatable quantities, even if the value resolves to a build time constant. Since relative relocations are always resolved at build time, this patch enables MODULE_REL_CRCS on powerpc when CONFIG_RELOCATABLE=y, which turns the absolute CRC references into relative references into .rodata where the actual CRC value is stored. So redefine all CRC fields and variables as u32, and redefine the __CRC_SYMBOL() macro for 64 bit builds to emit the CRC reference using inline assembler (which is necessary since 64-bit C code cannot use 32-bit types to hold memory addresses, even if they are ultimately resolved using values that do not exceed 0xffffffff). To avoid potential problems with legacy 32-bit architectures using legacy toolchains, the equivalent C definition of the kcrctab entry is retained for 32-bit architectures. Note that this mostly reverts commit d4703aefdbc8 ("module: handle ppc64 relocating kcrctabs when CONFIG_RELOCATABLE=y") Acked-by: Rusty Russell <rusty@rustcorp.com.au> Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> 2017-02-03ipv6: sr: remove cleanup flag and fix HMAC computationDavid Lebrun1-6/+3 In the latest version of the IPv6 Segment Routing IETF draft [1] the cleanup flag is removed and the flags field length is shrunk from 16 bits to 8 bits. As a consequence, the input of the HMAC computation is modified in a non-backward compatible way by covering the whole octet of flags instead of only the cleanup bit. As such, if an implementation compatible with the latest draft computes the HMAC of an SRH who has other flags set to 1, then the HMAC result would differ from the current implementation. This patch carries those modifications to prevent conflict with other implementations of IPv6 SR. [1] https://tools.ietf.org/html/draft-ietf-6man-segment-routing-header-05 Signed-off-by: David Lebrun <david.lebrun@uclouvain.be> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-02Merge branch 'perf-urgent-for-linus' of ↵Linus Torvalds1-3/+0 git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip Pull perf fixes from Ingo Molnar: "Five kernel fixes: - an mmap tracing ABI fix for certain mappings - a use-after-free fix, found via KASAN - three CPU hotplug related x86 PMU driver fixes" * 'perf-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: perf/x86/intel/uncore: Make package handling more robust perf/x86/intel/uncore: Clean up hotplug conversion fallout perf/x86/intel/rapl: Make package handling more robust perf/core: Fix PERF_RECORD_MMAP2 prot/flags for anonymous memory perf/core: Fix use-after-free bug 2017-02-01Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/netLinus Torvalds4-19/+26 Pull networking fixes from David Miller: 1) Fix handling of interrupt status in stmmac driver. Just because we have masked the event from generating interrupts, doesn't mean the bit won't still be set in the interrupt status register. From Alexey Brodkin. 2) Fix DMA API debugging splats in gianfar driver, from Arseny Solokha. 3) Fix off-by-one error in __ip6_append_data(), from Vlad Yasevich. 4) cls_flow does not match on icmpv6 codes properly, from Simon Horman. 5) Initial MAC address can be set incorrectly in some scenerios, from Ivan Vecera. 6) Packet header pointer arithmetic fix in ip6_tnl_parse_tlv_end_lim(), from Dan Carpenter. 7) Fix divide by zero in __tcp_select_window(), from Eric Dumazet. 8) Fix crash in iwlwifi when unregistering thermal zone, from Jens Axboe. 9) Check for DMA mapping errors in starfire driver, from Alexey Khoroshilov. * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net: (31 commits) tcp: fix 0 divide in __tcp_select_window() ipv6: pointer math error in ip6_tnl_parse_tlv_enc_lim() net: fix ndo_features_check/ndo_fix_features comment ordering net/sched: matchall: Fix configuration race be2net: fix initial MAC setting ipv6: fix flow labels when the traffic class is non-0 net: thunderx: avoid dereferencing xcv when NULL net/sched: cls_flower: Correct matching on ICMPv6 code ipv6: Paritially checksum full MTU frames net/mlx4_core: Avoid command timeouts during VF driver device shutdown gianfar: synchronize DMA API usage by free_skb_rx_queue w/ gfar_new_page net: ethtool: add support for 2500BaseT and 5000BaseT link modes can: bcm: fix hrtimer/tasklet termination in bcm op removal net: adaptec: starfire: add checks for dma mapping errors net: phy: micrel: KSZ8795 do not set SUPPORTED_[Asym_]Pause can: Fix kernel panic at security_sock_rcv_skb net: macb: Fix 64 bit addressing support for GEM stmmac: Discard masked flags in interrupt status register net/mlx5e: Check ets capability before ets query FW command net/mlx5e: Fix update of hash function/key via ethtool ... 2017-02-01Merge branch 'for-linus' of ↵Linus Torvalds1-0/+1 git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs Pull fscache fixes from Al Viro. * 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: fscache: Fix dead object requeue fscache: Clear outstanding writes when disabling a cookie FS-Cache: Initialise stores_lock in netfs cookie 2017-02-01net: fix ndo_features_check/ndo_fix_features comment orderingDimitris Michailidis1-14/+15 Commit cdba756f5803a2 ("net: move ndo_features_check() close to ndo_start_xmit()") inadvertently moved the doc comment for .ndo_fix_features instead of .ndo_features_check. Fix the comment ordering. Fixes: cdba756f5803a2 ("net: move ndo_features_check() close to ndo_start_xmit()") Signed-off-by: Dimitris Michailidis <dmichail@google.com> Acked-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-01perf/x86/intel/uncore: Make package handling more robustThomas Gleixner1-2/+0 The package management code in uncore relies on package mapping being available before a CPU is started. This changed with: 9d85eb9119f4 ("x86/smpboot: Make logical package management more robust") because the ACPI/BIOS information turned out to be unreliable, but that left uncore in broken state. This was not noticed because on a regular boot all CPUs are online before uncore is initialized. Move the allocation to the CPU online callback and simplify the hotplug handling. At this point the package mapping is established and correct. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Jiri Olsa <jolsa@redhat.com> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Sebastian Siewior <bigeasy@linutronix.de> Cc: Stephane Eranian <eranian@google.com> Cc: Vince Weaver <vincent.weaver@maine.edu> Cc: Yasuaki Ishimatsu <yasu.isimatu@gmail.com> Fixes: 9d85eb9119f4 ("x86/smpboot: Make logical package management more robust") Link: http://lkml.kernel.org/r/20170131230141.377156255@linutronix.de Signed-off-by: Ingo Molnar <mingo@kernel.org>