summaryrefslogtreecommitdiff
path: root/net/packet/diag.c
blob: 7ef1c881ae7417e500630df398a900dddd598092 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
#include <linux/module.h>
#include <linux/sock_diag.h>
#include <linux/net.h>
#include <linux/netdevice.h>
#include <linux/packet_diag.h>
#include <linux/percpu.h>
#include <net/net_namespace.h>
#include <net/sock.h>

#include "internal.h"

static int pdiag_put_info(const struct packet_sock *po, struct sk_buff *nlskb)
{
	struct packet_diag_info pinfo;

	pinfo.pdi_index = po->ifindex;
	pinfo.pdi_version = po->tp_version;
	pinfo.pdi_reserve = po->tp_reserve;
	pinfo.pdi_copy_thresh = po->copy_thresh;
	pinfo.pdi_tstamp = po->tp_tstamp;

	pinfo.pdi_flags = 0;
	if (po->running)
		pinfo.pdi_flags |= PDI_RUNNING;
	if (po->auxdata)
		pinfo.pdi_flags |= PDI_AUXDATA;
	if (po->origdev)
		pinfo.pdi_flags |= PDI_ORIGDEV;
	if (po->has_vnet_hdr)
		pinfo.pdi_flags |= PDI_VNETHDR;
	if (po->tp_loss)
		pinfo.pdi_flags |= PDI_LOSS;

	return nla_put(nlskb, PACKET_DIAG_INFO, sizeof(pinfo), &pinfo);
}

static int pdiag_put_mclist(const struct packet_sock *po, struct sk_buff *nlskb)
{
	struct nlattr *mca;
	struct packet_mclist *ml;

	mca = nla_nest_start(nlskb, PACKET_DIAG_MCLIST);
	if (!mca)
		return -EMSGSIZE;

	rtnl_lock();
	for (ml = po->mclist; ml; ml = ml->next) {
		struct packet_diag_mclist *dml;

		dml = nla_reserve_nohdr(nlskb, sizeof(*dml));
		if (!dml) {
			rtnl_unlock();
			nla_nest_cancel(nlskb, mca);
			return -EMSGSIZE;
		}

		dml->pdmc_index = ml->ifindex;
		dml->pdmc_type = ml->type;
		dml->pdmc_alen = ml->alen;
		dml->pdmc_count = ml->count;
		BUILD_BUG_ON(sizeof(dml->pdmc_addr) != sizeof(ml->addr));
		memcpy(dml->pdmc_addr, ml->addr, sizeof(ml->addr));
	}

	rtnl_unlock();
	nla_nest_end(nlskb, mca);

	return 0;
}

static int pdiag_put_ring(struct packet_ring_buffer *ring, int ver, int nl_type,
		struct sk_buff *nlskb)
{
	struct packet_diag_ring pdr;

	if (!ring->pg_vec)
		return 0;

	pdr.pdr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
	pdr.pdr_block_nr = ring->pg_vec_len;
	pdr.pdr_frame_size = ring->frame_size;
	pdr.pdr_frame_nr = ring->frame_max + 1;

	if (ver > TPACKET_V2) {
		pdr.pdr_retire_tmo = ring->prb_bdqc.retire_blk_tov;
		pdr.pdr_sizeof_priv = ring->prb_bdqc.blk_sizeof_priv;
		pdr.pdr_features = ring->prb_bdqc.feature_req_word;
	} else {
		pdr.pdr_retire_tmo = 0;
		pdr.pdr_sizeof_priv = 0;
		pdr.pdr_features = 0;
	}

	return nla_put(nlskb, nl_type, sizeof(pdr), &pdr);
}

static int pdiag_put_rings_cfg(struct packet_sock *po, struct sk_buff *skb)
{
	int ret;

	mutex_lock(&po->pg_vec_lock);
	ret = pdiag_put_ring(&po->rx_ring, po->tp_version,
			PACKET_DIAG_RX_RING, skb);
	if (!ret)
		ret = pdiag_put_ring(&po->tx_ring, po->tp_version,
				PACKET_DIAG_TX_RING, skb);
	mutex_unlock(&po->pg_vec_lock);

	return ret;
}

static int pdiag_put_fanout(struct packet_sock *po, struct sk_buff *nlskb)
{
	int ret = 0;

	mutex_lock(&fanout_mutex);
	if (po->fanout) {
		u32 val;

		val = (u32)po->fanout->id | ((u32)po->fanout->type << 16);
		ret = nla_put_u32(nlskb, PACKET_DIAG_FANOUT, val);
	}
	mutex_unlock(&fanout_mutex);

	return ret;
}

static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
			struct packet_diag_req *req,
			bool may_report_filterinfo,
			struct user_namespace *user_ns,
			u32 portid, u32 seq, u32 flags, int sk_ino)
{
	struct nlmsghdr *nlh;
	struct packet_diag_msg *rp;
	struct packet_sock *po = pkt_sk(sk);

	nlh = nlmsg_put(skb, portid, seq, SOCK_DIAG_BY_FAMILY, sizeof(*rp), flags);
	if (!nlh)
		return -EMSGSIZE;

	rp = nlmsg_data(nlh);
	rp->pdiag_family = AF_PACKET;
	rp->pdiag_type = sk->sk_type;
	rp->pdiag_num = ntohs(po->num);
	rp->pdiag_ino = sk_ino;
	sock_diag_save_cookie(sk, rp->pdiag_cookie);

	if ((req->pdiag_show & PACKET_SHOW_INFO) &&
			pdiag_put_info(po, skb))
		goto out_nlmsg_trim;

	if ((req->pdiag_show & PACKET_SHOW_INFO) &&
	    nla_put_u32(skb, PACKET_DIAG_UID,
			from_kuid_munged(user_ns, sock_i_uid(sk))))
		goto out_nlmsg_trim;

	if ((req->pdiag_show & PACKET_SHOW_MCLIST) &&
			pdiag_put_mclist(po, skb))
		goto out_nlmsg_trim;

	if ((req->pdiag_show & PACKET_SHOW_RING_CFG) &&
			pdiag_put_rings_cfg(po, skb))
		goto out_nlmsg_trim;

	if ((req->pdiag_show & PACKET_SHOW_FANOUT) &&
			pdiag_put_fanout(po, skb))
		goto out_nlmsg_trim;

	if ((req->pdiag_show & PACKET_SHOW_MEMINFO) &&
	    sock_diag_put_meminfo(sk, skb, PACKET_DIAG_MEMINFO))
		goto out_nlmsg_trim;

	if ((req->pdiag_show & PACKET_SHOW_FILTER) &&
	    sock_diag_put_filterinfo(may_report_filterinfo, sk, skb,
				     PACKET_DIAG_FILTER))
		goto out_nlmsg_trim;

	nlmsg_end(skb, nlh);
	return 0;

out_nlmsg_trim:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

static int packet_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
	int num = 0, s_num = cb->args[0];
	struct packet_diag_req *req;
	struct net *net;
	struct sock *sk;
	bool may_report_filterinfo;

	net = sock_net(skb->sk);
	req = nlmsg_data(cb->nlh);
	may_report_filterinfo = netlink_net_capable(cb->skb, CAP_NET_ADMIN);

	mutex_lock(&net->packet.sklist_lock);
	sk_for_each(sk, &net->packet.sklist) {
		if (!net_eq(sock_net(sk), net))
			continue;
		if (num < s_num)
			goto next;

		if (sk_diag_fill(sk, skb, req,
				 may_report_filterinfo,
				 sk_user_ns(NETLINK_CB(cb->skb).sk),
				 NETLINK_CB(cb->skb).portid,
				 cb->nlh->nlmsg_seq, NLM_F_MULTI,
				 sock_i_ino(sk)) < 0)
			goto done;
next:
		num++;
	}
done:
	mutex_unlock(&net->packet.sklist_lock);
	cb->args[0] = num;

	return skb->len;
}

static int packet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
{
	int hdrlen = sizeof(struct packet_diag_req);
	struct net *net = sock_net(skb->sk);
	struct packet_diag_req *req;

	if (nlmsg_len(h) < hdrlen)
		return -EINVAL;

	req = nlmsg_data(h);
	/* Make it possible to support protocol filtering later */
	if (req->sdiag_protocol)
		return -EINVAL;

	if (h->nlmsg_flags & NLM_F_DUMP) {
		struct netlink_dump_control c = {
			.dump = packet_diag_dump,
		};
		return netlink_dump_start(net->diag_nlsk, skb, h, &c);
	} else
		return -EOPNOTSUPP;
}

static const struct sock_diag_handler packet_diag_handler = {
	.family = AF_PACKET,
	.dump = packet_diag_handler_dump,
};

static int __init packet_diag_init(void)
{
	return sock_diag_register(&packet_diag_handler);
}

static void __exit packet_diag_exit(void)
{
	sock_diag_unregister(&packet_diag_handler);
}

module_init(packet_diag_init);
module_exit(packet_diag_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 17 /* AF_PACKET */);
ide will simply retransmit. Well we do retransmit, and then the packet is just dropped again for the same reason. It seems the better way to address this problem is to clear pfmemalloc in the TCP transmit path. pfmemalloc strict control really makes sense on the receive path. Signed-off-by: Eric Dumazet <edumazet@google.com> Acked-by: Josef Bacik <jbacik@fb.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03net: ipv6: Set protocol to kernel for local routesDavid Ahern1-0/+1 IPv6 stack does not set the protocol for local routes, so those routes show up with proto "none": $ ip -6 ro ls table local local ::1 dev lo proto none metric 0 pref medium local 2100:3:: dev lo proto none metric 0 pref medium local 2100:3::4 dev lo proto none metric 0 pref medium local fe80:: dev lo proto none metric 0 pref medium ... Set rt6i_protocol to RTPROT_KERNEL for consistency with IPv4. Now routes show up with proto "kernel": $ ip -6 ro ls table local local ::1 dev lo proto kernel metric 0 pref medium local 2100:3:: dev lo proto kernel metric 0 pref medium local 2100:3::4 dev lo proto kernel metric 0 pref medium local fe80:: dev lo proto kernel metric 0 pref medium ... Signed-off-by: David Ahern <dsa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03bridge: vlan dst_metadata hooks in ingress and egress pathsRoopa Prabhu6-2/+82 - ingress hook: - if port is a tunnel port, use tunnel info in attached dst_metadata to map it to a local vlan - egress hook: - if port is a tunnel port, use tunnel info attached to vlan to set dst_metadata on the skb CC: Nikolay Aleksandrov <nikolay@cumulusnetworks.com> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03bridge: per vlan dst_metadata netlink supportRoopa Prabhu7-48/+641 This patch adds support to attach per vlan tunnel info dst metadata. This enables bridge driver to map vlan to tunnel_info at ingress and egress. It uses the kernel dst_metadata infrastructure. The initial use case is vlan to vni bridging, but the api is generic to extend to any tunnel_info in the future: - Uapi to configure/unconfigure/dump per vlan tunnel data - netlink functions to configure vlan and tunnel_info mapping - Introduces bridge port flag BR_LWT_VLAN to enable attach/detach dst_metadata to bridged packets on ports. off by default. - changes to existing code is mainly refactor some existing vlan handling netlink code + hooks for new vlan tunnel code - I have kept the vlan tunnel code isolated in separate files. - most of the netlink vlan tunnel code is handling of vlan-tunid ranges (follows the vlan range handling code). To conserve space vlan-tunid by default are always dumped in ranges if applicable. Use case: example use for this is a vxlan bridging gateway or vtep which maps vlans to vn-segments (or vnis). iproute2 example (patched and pruned iproute2 output to just show relevant fdb entries): example shows same host mac learnt on two vni's and vlan 100 maps to vni 1000, vlan 101 maps to vni 1001 before (netdev per vni): $bridge fdb show | grep "00:02:00:00:00:03" 00:02:00:00:00:03 dev vxlan1001 vlan 101 master bridge 00:02:00:00:00:03 dev vxlan1001 dst 12.0.0.8 self 00:02:00:00:00:03 dev vxlan1000 vlan 100 master bridge 00:02:00:00:00:03 dev vxlan1000 dst 12.0.0.8 self after this patch with collect metdata in bridged mode (single netdev): $bridge fdb show | grep "00:02:00:00:00:03" 00:02:00:00:00:03 dev vxlan0 vlan 101 master bridge 00:02:00:00:00:03 dev vxlan0 src_vni 1001 dst 12.0.0.8 self 00:02:00:00:00:03 dev vxlan0 vlan 100 master bridge 00:02:00:00:00:03 dev vxlan0 src_vni 1000 dst 12.0.0.8 self CC: Nikolay Aleksandrov <nikolay@cumulusnetworks.com> Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03net/sched: act_ife: Change to use ife moduleYotam Gigi2-78/+33 Use the encode/decode functionality from the ife module instead of using implementation inside the act_ife. Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: Yotam Gigi <yotamg@mellanox.com> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> Signed-off-by: Roman Mashak <mrv@mojatatu.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03net: Introduce ife encapsulation moduleYotam Gigi5-0/+165 This module is responsible for the ife encapsulation protocol encode/decode logics. That module can: - ife_encode: encode skb and reserve space for the ife meta header - ife_decode: decode skb and extract the meta header size - ife_tlv_meta_encode - encodes one tlv entry into the reserved ife header space. - ife_tlv_meta_decode - decodes one tlv entry from the packet - ife_tlv_meta_next - advance to the next tlv Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: Yotam Gigi <yotamg@mellanox.com> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> Signed-off-by: Roman Mashak <mrv@mojatatu.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03net/sched: act_ife: Unexport ife_tlv_meta_encodeYotam Gigi1-2/+2 As the function ife_tlv_meta_encode is not used by any other module, unexport it and make it static for the act_ife module. Signed-off-by: Yotam Gigi <yotamg@mellanox.com> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> Signed-off-by: Roman Mashak <mrv@mojatatu.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03tcp: add tcp_mss_clamp() helperEric Dumazet