/*
 * net/sched/sch_mqprio.c
 *
 * Copyright (c) 2010 John Fastabend <john.r.fastabend@intel.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 */

#include <linux/types.h>
#include <linux/slab.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/module.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/sch_generic.h>

struct mqprio_sched {
        struct Qdisc **qdiscs;
        int hw_owned;
};

static void mqprio_destroy(struct Qdisc *sch)
{
        struct net_device *dev = qdisc_dev(sch);
        struct mqprio_sched *priv = qdisc_priv(sch);
        struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO};
        unsigned int ntx;

        if (priv->qdiscs) {
                for (ntx = 0;
                     ntx < dev->num_tx_queues && priv->qdiscs[ntx];
                     ntx++)
                        qdisc_destroy(priv->qdiscs[ntx]);
                kfree(priv->qdiscs);
        }

        if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
                dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc);
        else
                netdev_set_num_tc(dev, 0);
}

static int mqprio_parse_opt(struct net_device *dev, struct tc_mqprio_qopt *qopt)
{
        int i, j;

        /* Verify num_tc is not out of max range */
        if (qopt->num_tc > TC_MAX_QUEUE)
                return -EINVAL;

        /* Verify priority mapping uses valid tcs */
        for (i = 0; i < TC_BITMASK + 1; i++) {
                if (qopt->prio_tc_map[i] >= qopt->num_tc)
                        return -EINVAL;
        }

        /* net_device does not support requested operation */
        if (qopt->hw && !dev->netdev_ops->ndo_setup_tc)
                return -EINVAL;

        /* If hardware owned, qcount and qoffset are taken from the LLD,
         * so there is no reason to verify them here.
         */
        if (qopt->hw)
                return 0;

        for (i = 0; i < qopt->num_tc; i++) {
                unsigned int last = qopt->offset[i] + qopt->count[i];

                /* Verify the queue count is in tx range; being equal to
                 * real_num_tx_queues indicates the last queue is in use.
                 */
                if (qopt->offset[i] >= dev->real_num_tx_queues ||
                    !qopt->count[i] ||
                    last > dev->real_num_tx_queues)
                        return -EINVAL;

                /* Verify that the offset and counts do not overlap */
                for (j = i + 1; j < qopt->num_tc; j++) {
                        if (last > qopt->offset[j])
                                return -EINVAL;
                }
        }

        return 0;
}
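/* Worked example (editorial illustration, not part of the upstream file):
 * on a device with real_num_tx_queues == 8, this software-owned (hw == 0)
 * configuration passes mqprio_parse_opt():
 *
 *     num_tc = 3
 *     count  = { 2, 2, 4 }    queues per traffic class
 *     offset = { 0, 2, 4 }    first queue of each traffic class
 *     prio_tc_map[] values all in {0, 1, 2}
 *
 * Each class covers [offset[i], offset[i] + count[i]), the ranges must not
 * overlap, and the last range may end exactly at real_num_tx_queues. From
 * userspace this roughly corresponds to (device name illustrative):
 *
 *     tc qdisc add dev eth0 root mqprio num_tc 3 \
 *             map 0 0 1 1 2 2 2 2 0 0 0 0 0 0 0 0 \
 *             queues 2@0 2@2 4@4 hw 0
 */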
static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
{
        struct net_device *dev = qdisc_dev(sch);
        struct mqprio_sched *priv = qdisc_priv(sch);
        struct netdev_queue *dev_queue;
        struct Qdisc *qdisc;
        int i, err = -EOPNOTSUPP;
        struct tc_mqprio_qopt *qopt = NULL;

        BUILD_BUG_ON(TC_MAX_QUEUE != TC_QOPT_MAX_QUEUE);
        BUILD_BUG_ON(TC_BITMASK != TC_QOPT_BITMASK);

        if (sch->parent != TC_H_ROOT)
                return -EOPNOTSUPP;

        if (!netif_is_multiqueue(dev))
                return -EOPNOTSUPP;

        if (!opt || nla_len(opt) < sizeof(*qopt))
                return -EINVAL;

        qopt = nla_data(opt);
        if (mqprio_parse_opt(dev, qopt))
                return -EINVAL;

        /* pre-allocate qdiscs, attachment can't fail */
        priv->qdiscs = kcalloc(dev->num_tx_queues, sizeof(priv->qdiscs[0]),
                               GFP_KERNEL);
        if (priv->qdiscs == NULL) {
                err = -ENOMEM;
                goto err;
        }

        for (i = 0; i < dev->num_tx_queues; i++) {
                dev_queue = netdev_get_tx_queue(dev, i);
                qdisc = qdisc_create_dflt(dev_queue,
                                          get_default_qdisc_ops(dev, i),
                                          TC_H_MAKE(TC_H_MAJ(sch->handle),
                                                    TC_H_MIN(i + 1)));
                if (qdisc == NULL) {
                        err = -ENOMEM;
                        goto err;
                }
                priv->qdiscs[i] = qdisc;
                qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
        }

        /* If the mqprio options indicate that hardware should own the
         * queue mapping, then run ndo_setup_tc; otherwise use the
         * supplied and verified mapping.
         */
        if (qopt->hw) {
                struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO,
                                          { .tc = qopt->num_tc }};

                priv->hw_owned = 1;
                err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc);
                if (err)
                        goto err;
        } else {
                netdev_set_num_tc(dev, qopt->num_tc);
                for (i = 0; i < qopt->num_tc; i++)
                        netdev_set_tc_queue(dev, i,
                                            qopt->count[i], qopt->offset[i]);
        }

        /* Always use supplied priority mappings */
        for (i = 0; i < TC_BITMASK + 1; i++)
                netdev_set_prio_tc_map(dev, i, qopt->prio_tc_map[i]);

        sch->flags |= TCQ_F_MQROOT;
        return 0;

err:
        mqprio_destroy(sch);
        return err;
}

static void mqprio_attach(struct Qdisc *sch)
{
        struct net_device *dev = qdisc_dev(sch);
        struct mqprio_sched *priv = qdisc_priv(sch);
        struct Qdisc *qdisc, *old;
        unsigned int ntx;

        /* Attach underlying qdiscs */
        for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
                qdisc = priv->qdiscs[ntx];
                old = dev_graft_qdisc(qdisc->dev_queue, qdisc);
                if (old)
                        qdisc_destroy(old);
                if (ntx < dev->real_num_tx_queues)
                        qdisc_hash_add(qdisc);
        }
        kfree(priv->qdiscs);
        priv->qdiscs = NULL;
}

static struct netdev_queue *mqprio_queue_get(struct Qdisc *sch,
                                             unsigned long cl)
{
        struct net_device *dev = qdisc_dev(sch);
        unsigned long ntx = cl - 1 - netdev_get_num_tc(dev);

        if (ntx >= dev->num_tx_queues)
                return NULL;
        return netdev_get_tx_queue(dev, ntx);
}

static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
                        struct Qdisc **old)
{
        struct net_device *dev = qdisc_dev(sch);
        struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);

        if (!dev_queue)
                return -EINVAL;

        if (dev->flags & IFF_UP)
                dev_deactivate(dev);

        *old = dev_graft_qdisc(dev_queue, new);

        if (new)
                new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;

        if (dev->flags & IFF_UP)
                dev_activate(dev);

        return 0;
}

static int mqprio_dump(struct Qdisc *sch, struct sk_buff *skb)
{
        struct net_device *dev = qdisc_dev(sch);
        struct mqprio_sched *priv = qdisc_priv(sch);
        unsigned char *b = skb_tail_pointer(skb);
        struct tc_mqprio_qopt opt = { 0 };
        struct Qdisc *qdisc;
        unsigned int i;

        sch->q.qlen = 0;
        memset(&sch->bstats, 0, sizeof(sch->bstats));
        memset(&sch->qstats, 0, sizeof(sch->qstats));

        for (i = 0; i < dev->num_tx_queues; i++) {
                qdisc = rtnl_dereference(netdev_get_tx_queue(dev, i)->qdisc);
                spin_lock_bh(qdisc_lock(qdisc));
                sch->q.qlen += qdisc->q.qlen;
                sch->bstats.bytes += qdisc->bstats.bytes;
                sch->bstats.packets += qdisc->bstats.packets;
                sch->qstats.backlog += qdisc->qstats.backlog;
                sch->qstats.drops += qdisc->qstats.drops;
                sch->qstats.requeues += qdisc->qstats.requeues;
                sch->qstats.overlimits += qdisc->qstats.overlimits;
                spin_unlock_bh(qdisc_lock(qdisc));
        }

        opt.num_tc = netdev_get_num_tc(dev);
        memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map));
        opt.hw = priv->hw_owned;

        for (i = 0; i < netdev_get_num_tc(dev); i++) {
                opt.count[i] = dev->tc_to_txq[i].count;
                opt.offset[i] = dev->tc_to_txq[i].offset;
        }

        if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
                goto nla_put_failure;

        return skb->len;

nla_put_failure:
        nlmsg_trim(skb, b);
        return -1;
}

static struct Qdisc *mqprio_leaf(struct Qdisc *sch, unsigned long cl)
{
        struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);

        if (!dev_queue)
                return NULL;

        return dev_queue->qdisc_sleeping;
}

static unsigned long mqprio_get(struct Qdisc *sch, u32 classid)
{
        struct net_device *dev = qdisc_dev(sch);
        unsigned int ntx = TC_H_MIN(classid);

        if (ntx > dev->num_tx_queues + netdev_get_num_tc(dev))
                return 0;
        return ntx;
}

static void mqprio_put(struct Qdisc *sch, unsigned long cl)
{
}
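/* Class-id layout (summary derived from the handlers above and below):
 * minor numbers 1..num_tc address the virtual traffic-class nodes, while
 * minors num_tc + 1 .. num_tc + num_tx_queues address the per-queue leaf
 * qdiscs. This is why mqprio_queue_get() computes the queue index as
 * cl - 1 - netdev_get_num_tc(dev).
 */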
static int mqprio_dump_class(struct Qdisc *sch, unsigned long cl,
                             struct sk_buff *skb, struct tcmsg *tcm)
{
        struct net_device *dev = qdisc_dev(sch);

        if (cl <= netdev_get_num_tc(dev)) {
                tcm->tcm_parent = TC_H_ROOT;
                tcm->tcm_info = 0;
        } else {
                int i;
                struct netdev_queue *dev_queue;

                dev_queue = mqprio_queue_get(sch, cl);
                tcm->tcm_parent = 0;
                for (i = 0; i < netdev_get_num_tc(dev); i++) {
                        struct netdev_tc_txq tc = dev->tc_to_txq[i];
                        int q_idx = cl - netdev_get_num_tc(dev);

                        if (q_idx > tc.offset &&
                            q_idx <= tc.offset + tc.count) {
                                tcm->tcm_parent =
                                        TC_H_MAKE(TC_H_MAJ(sch->handle),
                                                  TC_H_MIN(i + 1));
                                break;
                        }
                }
                tcm->tcm_info = dev_queue->qdisc_sleeping->handle;
        }
        tcm->tcm_handle |= TC_H_MIN(cl);
        return 0;
}

static int mqprio_dump_class_stats(struct Qdisc *sch, unsigned long cl,
                                   struct gnet_dump *d)
        __releases(d->lock)
        __acquires(d->lock)
{
        struct net_device *dev = qdisc_dev(sch);

        if (cl <= netdev_get_num_tc(dev)) {
                int i;
                __u32 qlen = 0;
                struct Qdisc *qdisc;
                struct gnet_stats_queue qstats = {0};
                struct gnet_stats_basic_packed bstats = {0};
                struct netdev_tc_txq tc = dev->tc_to_txq[cl - 1];

                /* Drop the lock here; it will be reclaimed before touching
                 * statistics. This is required because the d->lock we hold
                 * here is the lock on dev_queue->qdisc_sleeping, which is
                 * also acquired below.
                 */
                if (d->lock)
                        spin_unlock_bh(d->lock);

                for (i = tc.offset; i < tc.offset + tc.count; i++) {
                        struct netdev_queue *q = netdev_get_tx_queue(dev, i);

                        qdisc = rtnl_dereference(q->qdisc);
                        spin_lock_bh(qdisc_lock(qdisc));
                        qlen += qdisc->q.qlen;
                        bstats.bytes += qdisc->bstats.bytes;
                        bstats.packets += qdisc->bstats.packets;
                        qstats.backlog += qdisc->qstats.backlog;
                        qstats.drops += qdisc->qstats.drops;
                        qstats.requeues += qdisc->qstats.requeues;
                        qstats.overlimits += qdisc->qstats.overlimits;
                        spin_unlock_bh(qdisc_lock(qdisc));
                }

                /* Reclaim root sleeping lock before completing stats */
                if (d->lock)
                        spin_lock_bh(d->lock);
                if (gnet_stats_copy_basic(NULL, d, NULL, &bstats) < 0 ||
                    gnet_stats_copy_queue(d, NULL, &qstats, qlen) < 0)
                        return -1;
        } else {
                struct netdev_queue *dev_queue = mqprio_queue_get(sch, cl);

                sch = dev_queue->qdisc_sleeping;
                if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
                                          d, NULL, &sch->bstats) < 0 ||
                    gnet_stats_copy_queue(d, NULL,
                                          &sch->qstats, sch->q.qlen) < 0)
                        return -1;
        }
        return 0;
}

static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
        struct net_device *dev = qdisc_dev(sch);
        unsigned long ntx;

        if (arg->stop)
                return;

        /* Walk hierarchy with a virtual class per tc */
        arg->count = arg->skip;
        for (ntx = arg->skip;
             ntx < dev->num_tx_queues + netdev_get_num_tc(dev);
             ntx++) {
                if (arg->fn(sch, ntx + 1, arg) < 0) {
                        arg->stop = 1;
                        break;
                }
                arg->count++;
        }
}

static const struct Qdisc_class_ops mqprio_class_ops = {
        .graft          = mqprio_graft,
        .leaf           = mqprio_leaf,
        .get            = mqprio_get,
        .put            = mqprio_put,
        .walk           = mqprio_walk,
        .dump           = mqprio_dump_class,
        .dump_stats     = mqprio_dump_class_stats,
};

static struct Qdisc_ops mqprio_qdisc_ops __read_mostly = {
        .cl_ops         = &mqprio_class_ops,
        .id             = "mqprio",
        .priv_size      = sizeof(struct mqprio_sched),
        .init           = mqprio_init,
        .destroy        = mqprio_destroy,
        .attach         = mqprio_attach,
        .dump           = mqprio_dump,
        .owner          = THIS_MODULE,
};

static int __init mqprio_module_init(void)
{
        return register_qdisc(&mqprio_qdisc_ops);
}

static void __exit mqprio_module_exit(void)
{
        unregister_qdisc(&mqprio_qdisc_ops);
}

module_init(mqprio_module_init);
module_exit(mqprio_module_exit);

MODULE_LICENSE("GPL");
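/* Inspection sketch (editorial illustration; eth0 is a placeholder device
 * name): once this qdisc is installed, mqprio_dump() and
 * mqprio_dump_class_stats() back the output of:
 *
 *     tc -s qdisc show dev eth0
 *     tc -s class show dev eth0
 *
 * where the first num_tc class minors report aggregated per-traffic-class
 * counters and the remaining minors report the per-queue leaf qdiscs.
 */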