#!/bin/bash
#
# Smoke test for tc-BPF driven IP tunnels: ipip, ipip6 (IPv4 over IPv6) and
# ip6ip6 (IPv6 over IPv6).
#
# Topology built by config_device:
#   - three network namespaces at_ns0, at_ns1 and at_ns2, each holding one end
#     (vethN) of a veth pair;
#   - the peer ends (vethNb) stay in the invoking namespace, enslaved to br0.
#
# at_ns0 and at_ns1 each get a conventional tunnel device ($DEV_NS) whose
# remote end is at_ns2, and both carry the *same* inner address. at_ns2 gets a
# single tunnel device ($DEV) created with the `external` keyword; programs
# from tcbpf2_kern.o are attached to its clsact egress/ingress hooks.
#
# NOTE(review): presumably has to run as root, from the directory containing
# tcbpf2_kern.o, with iproute2 and iperf installed - confirm.

# BPF object file (relative path) whose sections are loaded by attach_bpf.
readonly BPF_OBJ=tcbpf2_kern.o
# Kernel trace pipe streamed to stdout while a test is running.
readonly TRACE_PIPE=/sys/kernel/debug/tracing/trace_pipe

# Assign an IPv4 and an IPv6 address to a device inside a namespace and bring
# the device up.
# Arguments: $1 namespace, $2 device, $3 IPv4 addr/len, $4 IPv6 addr/len
function _config_ns_addr {
  local ns=$1 dev=$2 addr4=$3 addr6=$4

  ip netns exec "$ns" ip addr add "$addr4" dev "$dev"
  ip netns exec "$ns" ip addr add "$addr6" dev "$dev" nodad
  ip netns exec "$ns" ip link set dev "$dev" up
}

# Build the underlay: namespaces at_ns0..2, veth pairs veth0..2/veth0b..2b,
# and bridge br0 joining the three *b peers.
function config_device {
  local i

  for i in 0 1 2; do
    ip netns add "at_ns${i}"
    ip link add "veth${i}" type veth peer name "veth${i}b"
    ip link set "veth${i}b" up
    ip link set dev "veth${i}b" mtu 1500
    ip link set "veth${i}" netns "at_ns${i}"
  done

  _config_ns_addr at_ns0 veth0 172.16.1.100/24 2401:db00::1/64
  _config_ns_addr at_ns1 veth1 172.16.1.101/24 2401:db00::2/64
  _config_ns_addr at_ns2 veth2 172.16.1.200/24 2401:db00::3/64

  ip link add br0 type bridge
  ip link set br0 up
  ip link set dev br0 mtu 1500
  for i in 0 1 2; do
    ip link set "veth${i}b" master br0
  done
}

# Create the conventional tunnel device $DEV_NS in a namespace, bring it up
# and give it an inner address.
# Globals:   DEV_NS (read)
# Arguments: $1 namespace, $2 inner addr/len, remaining args are passed to
#            `ip link add dev $DEV_NS` (i.e. "type ... local ... remote ...")
function _add_std_tunnel {
  local ns=$1 inner=$2
  shift 2

  ip netns exec "$ns" ip link add dev "$DEV_NS" "$@"
  ip netns exec "$ns" ip link set dev "$DEV_NS" up
  ip netns exec "$ns" ip addr add dev "$DEV_NS" "$inner"
}

# Create the tunnel device $DEV in at_ns2, bring it up and give it an inner
# address.
# Globals:   DEV (read)
# Arguments: $1 inner addr/len, remaining args are passed to
#            `ip link add dev $DEV` (i.e. "type ... external")
function _add_bpf_tunnel {
  local inner=$1
  shift

  ip netns exec at_ns2 ip link add dev "$DEV" "$@"
  ip netns exec at_ns2 ip link set dev "$DEV" up
  ip netns exec at_ns2 ip addr add dev "$DEV" "$inner"
}

# IPv4-in-IPv4 tunnels.
# Globals: DEV_NS, DEV (read)
function add_ipip_tunnel {
  _add_std_tunnel at_ns0 10.1.1.100/24 \
    type ipip local 172.16.1.100 remote 172.16.1.200
  # same inner IP address in at_ns0 and at_ns1
  _add_std_tunnel at_ns1 10.1.1.100/24 \
    type ipip local 172.16.1.101 remote 172.16.1.200
  _add_bpf_tunnel 10.1.1.200/24 type ipip external
}

# IPv4-in-IPv6 tunnels.
# Tunnel endpoints are host addresses, so they are given without a prefix
# length (the original passed ".../64" here).
# Globals: DEV_NS, DEV (read)
function add_ipip6_tunnel {
  _add_std_tunnel at_ns0 10.1.1.100/24 \
    type ip6tnl mode ipip6 local 2401:db00::1 remote 2401:db00::3
  # same inner IP address in at_ns0 and at_ns1
  _add_std_tunnel at_ns1 10.1.1.100/24 \
    type ip6tnl mode ipip6 local 2401:db00::2 remote 2401:db00::3
  _add_bpf_tunnel 10.1.1.200/24 type ip6tnl mode ipip6 external
}

# IPv6-in-IPv6 tunnels.
# Tunnel endpoints are host addresses, so they are given without a prefix
# length (the original passed ".../64" here).
# Globals: DEV_NS, DEV (read)
function add_ip6ip6_tunnel {
  _add_std_tunnel at_ns0 2601:646::1/64 \
    type ip6tnl mode ip6ip6 local 2401:db00::1 remote 2401:db00::3
  # same inner IP address in at_ns0 and at_ns1
  _add_std_tunnel at_ns1 2601:646::1/64 \
    type ip6tnl mode ip6ip6 local 2401:db00::2 remote 2401:db00::3
  _add_bpf_tunnel 2601:646::2/64 type ip6tnl mode ip6ip6 external
}

# Attach two sections of $BPF_OBJ to a device in at_ns2 via a clsact qdisc.
# Arguments: $1 device
#            $2 section attached on egress
#            $3 section attached on ingress
function attach_bpf {
  local dev=$1 set_sec=$2 get_sec=$3

  ip netns exec at_ns2 tc qdisc add dev "$dev" clsact
  ip netns exec at_ns2 tc filter add dev "$dev" egress \
    bpf da obj "$BPF_OBJ" sec "$set_sec"
  ip netns exec at_ns2 tc filter add dev "$dev" ingress \
    bpf da obj "$BPF_OBJ" sec "$get_sec"
}

# Common preamble of every test: pick the device names, build the underlay
# and start streaming the kernel trace pipe in the background.
# Globals: DEV_NS, DEV (written)
function _setup_test_env {
  DEV_NS=ipip_std
  DEV=ipip_bpf
  config_device
  # tcpdump -nei br0 &
  cat "$TRACE_PIPE" &
}

# Traffic checks shared by the two tests that carry IPv4 inside the tunnel:
# one ping in each direction, then a TCP transfer from at_ns2 to the iperf
# server in at_ns0 (port 5200) and to the one in at_ns1 (port 5201).
function _check_inner_ipv4 {
  ip netns exec at_ns0 ping -c 1 10.1.1.200
  ip netns exec at_ns2 ping -c 1 10.1.1.100
  ip netns exec at_ns0 iperf -sD -p 5200 > /dev/null
  ip netns exec at_ns1 iperf -sD -p 5201 > /dev/null
  sleep 0.2 # give the daemonized servers a moment to start listening
  # tcp check _same_ IP over different tunnels
  ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5200
  ip netns exec at_ns2 iperf -c 10.1.1.100 -n 5k -p 5201
}

# IPv4 over IPv4 tunnel
function test_ipip {
  _setup_test_env
  add_ipip_tunnel
  attach_bpf "$DEV" ipip_set_tunnel ipip_get_tunnel
  _check_inner_ipv4
  cleanup
}

# IPv4 over IPv6 tunnel
function test_ipip6 {
  _setup_test_env
  add_ipip6_tunnel
  attach_bpf "$DEV" ipip6_set_tunnel ipip6_get_tunnel
  _check_inner_ipv4
  cleanup
}

# IPv6 over IPv6 tunnel
function test_ip6ip6 {
  _setup_test_env
  add_ip6ip6_tunnel
  attach_bpf "$DEV" ip6ip6_set_tunnel ip6ip6_get_tunnel
  ip netns exec at_ns0 ping -6 -c 1 2601:646::2
  ip netns exec at_ns2 ping -6 -c 1 2601:646::1
  ip netns exec at_ns0 iperf -6sD -p 5200 > /dev/null
  ip netns exec at_ns1 iperf -6sD -p 5201 > /dev/null
  sleep 0.2 # give the daemonized servers a moment to start listening
  # tcp check _same_ IP over different tunnels
  ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p 5200
  ip netns exec at_ns2 iperf -6c 2601:646::1 -n 5k -p 5201
  cleanup
}

# Tear down everything a test may have created. Best effort: errexit/xtrace
# are switched off while it runs and switched (back) on when it returns, so
# the first call also serves to enable `set -ex` for the rest of the script.
function cleanup {
  set +ex
  # Errors are expected here (objects may not exist, e.g. on the very first
  # call, or veths already gone with their namespace), so stderr is dropped
  # on purpose.
  {
    # NOTE: these two match by process name host-wide, not just our own.
    pkill iperf
    ip netns delete at_ns0
    ip netns delete at_ns1
    ip netns delete at_ns2
    ip link del veth0
    ip link del veth1
    ip link del veth2
    ip link del br0
    pkill tcpdump
    # Only kill trace-pipe readers; a bare `pkill cat` would hit every
    # process whose name merely contains "cat".
    pkill -f "cat ${TRACE_PIPE}"
  } 2> /dev/null
  set -ex
}

cleanup
# From here on errexit is active: make sure a failing step does not leave
# namespaces, links or background processes behind.
trap cleanup EXIT
echo "Testing IP tunnels..."
test_ipip
test_ipip6
test_ip6ip6
echo "*** PASS ***"

# ---------------------------------------------------------------------------
# The text below is not part of the script; it trailed the script in the
# source and is preserved verbatim (commented out so the file still parses).
# ---------------------------------------------------------------------------
# in hooks, from Gao Feng. 10) Fix harmless reference counter underflow in
# IPVS that, however, results in problems with the introduction of the new
# refcount_t type, from David Windsor. 11) Enable LIBCRC32C from nf_ct_sctp
# instead of nf_nat_sctp, from Davide Caratti. 12) Missing documentation on
# nf_tables uapi header, from Liping Zhang. 13) Use rb_entry() helper in
# xt_connlimit, from Geliang Tang. ==================== Signed-off-by: David S.
Miller <davem@davemloft.net> 2017-02-03bridge: uapi: add per vlan tunnel infoRoopa Prabhu2-0/+12 New nested netlink attribute to associate tunnel info per vlan. This is used by bridge driver to send tunnel metadata to bridge ports in vlan tunnel mode. This patch also adds new per port flag IFLA_BRPORT_VLAN_TUNNEL to enable vlan tunnel mode, off by default. One example use for this is a vxlan bridging gateway or vtep which maps vlans to vn-segments (or vnis). User can configure per-vlan tunnel information which the bridge driver can use to bridge vlan into the corresponding vn-segment. Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03vxlan: support fdb and learning in COLLECT_METADATA modeRoopa Prabhu1-0/+1 Vxlan COLLECT_METADATA mode today solves the per-vni netdev scalability problem in l3 networks. It expects all forwarding information to be present in dst_metadata. This patch series enhances collect metadata mode to include the case where only vni is present in dst_metadata, and the vxlan driver can then use the rest of the forwarding information database to make forwarding decisions. There is no change to default COLLECT_METADATA behaviour. These changes only apply to COLLECT_METADATA when used with the bridging use-case with a special dst_metadata tunnel info flag (eg: where vxlan device is part of a bridge). For all this to work, the vxlan driver will need to now support a single fdb table hashed by mac + vni. This series essentially makes this happen. use-case and workflow: vxlan collect metadata device participates in bridging vlan to vn-segments. Bridge driver above the vxlan device, sends the vni corresponding to the vlan in the dst_metadata. vxlan driver will lookup forwarding database with (mac + vni) for the required remote destination information to forward the packet. 
Changes introduced by this patch: - allow learning and forwarding database state in vxlan netdev in COLLECT_METADATA mode. Current behaviour is not changed by default. tunnel info flag IP_TUNNEL_INFO_BRIDGE is used to support the new bridge friendly mode. - A single fdb table hashed by (mac, vni) to allow fdb entries with multiple vnis in the same fdb table - rx path already has the vni - tx path expects a vni in the packet with dst_metadata - prior to this series, fdb remote_dsts carried remote vni and the vxlan device carrying the fdb table represented the source vni. With the vxlan device now representing multiple vnis, this patch adds a src vni attribute to the fdb entry. The remote vni already uses NDA_VNI attribute. This patch introduces NDA_SRC_VNI netlink attribute to represent the src vni in a multi vni fdb table. iproute2 example (patched and pruned iproute2 output to just show relevant fdb entries): example shows same host mac learnt on two vni's. before (netdev per vni): $bridge fdb show | grep "00:02:00:00:00:03" 00:02:00:00:00:03 dev vxlan1001 dst 12.0.0.8 self 00:02:00:00:00:03 dev vxlan1000 dst 12.0.0.8 self after this patch with collect metadata in bridged mode (single netdev): $bridge fdb show | grep "00:02:00:00:00:03" 00:02:00:00:00:03 dev vxlan0 src_vni 1001 dst 12.0.0.8 self 00:02:00:00:00:03 dev vxlan0 src_vni 1000 dst 12.0.0.8 self Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-03net/sched: act_ife: Change to use ife moduleYotam Gigi1-9/+1 Use the encode/decode functionality from the ife module instead of using implementation inside the act_ife. Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: Yotam Gigi <yotamg@mellanox.com> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> Signed-off-by: Roman Mashak <mrv@mojatatu.com> Signed-off-by: David S. 
Miller <davem@davemloft.net> 2017-02-03net: Introduce ife encapsulation moduleYotam Gigi2-0/+19 This module is responsible for the ife encapsulation protocol encode/decode logics. That module can: - ife_encode: encode skb and reserve space for the ife meta header - ife_decode: decode skb and extract the meta header size - ife_tlv_meta_encode - encodes one tlv entry into the reserved ife header space. - ife_tlv_meta_decode - decodes one tlv entry from the packet - ife_tlv_meta_next - advance to the next tlv Reviewed-by: Jiri Pirko <jiri@mellanox.com> Signed-off-by: Yotam Gigi <yotamg@mellanox.com> Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com> Signed-off-by: Roman Mashak <mrv@mojatatu.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-02net: add LINUX_MIB_PFMEMALLOCDROP counterEric Dumazet1-0/+1 Debugging issues caused by pfmemalloc is often tedious. Add a new SNMP counter to more easily diagnose these problems. Signed-off-by: Eric Dumazet <edumazet@google.com> Cc: Josef Bacik <jbacik@fb.com> Acked-by: Josef Bacik <jbacik@fb.com> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-02unix: add ioctl to open a unix socket file with O_PATHAndrey Vagin1-0/+2 This ioctl opens a file to which a socket is bound and returns a file descriptor. The caller has to have CAP_NET_ADMIN in the socket network namespace. Currently it is impossible to get a path and a mount point for a socket file. socket_diag reports address, device ID and inode number for unix sockets. An address can contain a relative path or a file may be moved somewhere. And these properties say nothing about a mount namespace and a mount point of a socket file. With the introduced ioctl, we can get a path by reading /proc/self/fd/X and get mnt_id from /proc/self/fdinfo/X. In CRIU we are going to use this ioctl to dump and restore unix socket. 
Here is an example how it can be used: $ strace -e socket,bind,ioctl ./test /tmp/test_sock socket(AF_UNIX, SOCK_STREAM, 0) = 3 bind(3, {sa_family=AF_UNIX, sun_path="test_sock"}, 11) = 0 ioctl(3, SIOCUNIXFILE, 0) = 4 ^Z $ ss -a | grep test_sock u_str LISTEN 0 1 test_sock 17798 * 0 $ ls -l /proc/760/fd/{3,4} lrwx------ 1 root root 64 Feb 1 09:41 3 -> 'socket:[17798]' l--------- 1 root root 64 Feb 1 09:41 4 -> /tmp/test_sock $ cat /proc/760/fdinfo/4 pos: 0 flags: 012000000 mnt_id: 40 $ cat /proc/self/mountinfo | grep "^40\s" 40 19 0:37 / /tmp rw shared:23 - tmpfs tmpfs rw Signed-off-by: Andrei Vagin <avagin@openvz.org> Signed-off-by: David S. Miller <davem@davemloft.net> 2017-02-02Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/netDavid S. Miller1-1/+3 All merge conflicts were simple overlapping changes. Signed-off-by: David S. Miller <davem@davemloft.net>