/*
 * Functions to manage eBPF programs attached to cgroups
 *
 * Copyright (c) 2016 Daniel Mack
 *
 * This file is subject to the terms and conditions of version 2 of the GNU
 * General Public License.  See the file COPYING in the main directory of the
 * Linux distribution for more details.
 */

#include <linux/kernel.h>
#include <linux/atomic.h>
#include <linux/cgroup.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/bpf-cgroup.h>
#include <net/sock.h>

DEFINE_STATIC_KEY_FALSE(cgroup_bpf_enabled_key);
EXPORT_SYMBOL(cgroup_bpf_enabled_key);

/**
 * cgroup_bpf_put() - put references of all bpf programs
 * @cgrp: the cgroup to modify
 */
void cgroup_bpf_put(struct cgroup *cgrp)
{
	unsigned int type;

	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.prog); type++) {
		struct bpf_prog *prog = cgrp->bpf.prog[type];

		if (prog) {
			bpf_prog_put(prog);
			static_branch_dec(&cgroup_bpf_enabled_key);
		}
	}
}

/**
 * cgroup_bpf_inherit() - inherit effective programs from parent
 * @cgrp: the cgroup to modify
 * @parent: the parent to inherit from
 */
void cgroup_bpf_inherit(struct cgroup *cgrp, struct cgroup *parent)
{
	unsigned int type;

	for (type = 0; type < ARRAY_SIZE(cgrp->bpf.effective); type++) {
		struct bpf_prog *e;

		e = rcu_dereference_protected(parent->bpf.effective[type],
					      lockdep_is_held(&cgroup_mutex));
		rcu_assign_pointer(cgrp->bpf.effective[type], e);
	}
}

/**
 * __cgroup_bpf_update() - Update the pinned program of a cgroup, and
 *                         propagate the change to descendants
 * @cgrp: The cgroup which descendants to traverse
 * @parent: The parent of @cgrp, or %NULL if @cgrp is the root
 * @prog: A new program to pin
 * @type: Type of pinning operation (ingress/egress)
 *
 * Each cgroup has a set of two pointers for bpf programs; one for the eBPF
 * program it owns, and one for the program that is effective for execution.
 *
 * If @prog is not %NULL, this function attaches a new program to the cgroup
 * and releases the one that is currently attached, if any. @prog is then made
 * the effective program of type @type in that cgroup.
 *
 * If @prog is %NULL, the currently attached program of type @type is released,
 * and the effective program of the parent cgroup (if any) is inherited to
 * @cgrp.
 *
 * Then, the descendants of @cgrp are walked and the effective program for
 * each of them is set to the effective program of @cgrp unless the
 * descendant has its own program attached, in which case the subbranch is
 * skipped. This ensures that delegated subcgroups with own programs are left
 * untouched.
 *
 * Must be called with cgroup_mutex held.
 */
void __cgroup_bpf_update(struct cgroup *cgrp, struct cgroup *parent,
			 struct bpf_prog *prog, enum bpf_attach_type type)
{
	struct bpf_prog *old_prog, *effective;
	struct cgroup_subsys_state *pos;

	old_prog = xchg(cgrp->bpf.prog + type, prog);

	effective = (!prog && parent) ?
		rcu_dereference_protected(parent->bpf.effective[type],
					  lockdep_is_held(&cgroup_mutex)) :
		prog;

	css_for_each_descendant_pre(pos, &cgrp->self) {
		struct cgroup *desc = container_of(pos, struct cgroup, self);

		/* skip the subtree if the descendant has its own program */
		if (desc->bpf.prog[type] && desc != cgrp)
			pos = css_rightmost_descendant(pos);
		else
			rcu_assign_pointer(desc->bpf.effective[type],
					   effective);
	}

	if (prog)
		static_branch_inc(&cgroup_bpf_enabled_key);

	if (old_prog) {
		bpf_prog_put(old_prog);
		static_branch_dec(&cgroup_bpf_enabled_key);
	}
}

/**
 * __cgroup_bpf_run_filter_skb() - Run a program for packet filtering
 * @sk: The socket sending or receiving traffic
 * @skb: The skb that is being sent or received
 * @type: The type of program to be executed
 *
 * If no socket is passed, or the socket is not of type INET or INET6,
 * this function does nothing and returns 0.
 *
 * The program type passed in via @type must be suitable for network
 * filtering. No further check is performed to assert that.
 *
 * This function will return %-EPERM if an attached program was found
 * and if it returned != 1 during execution. In all other cases, 0 is
 * returned.
 */
int __cgroup_bpf_run_filter_skb(struct sock *sk,
				struct sk_buff *skb,
				enum bpf_attach_type type)
{
	struct bpf_prog *prog;
	struct cgroup *cgrp;
	int ret = 0;

	if (!sk || !sk_fullsock(sk))
		return 0;

	if (sk->sk_family != AF_INET &&
	    sk->sk_family != AF_INET6)
		return 0;

	cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);

	rcu_read_lock();

	prog = rcu_dereference(cgrp->bpf.effective[type]);
	if (prog) {
		unsigned int offset = skb->data - skb_network_header(skb);

		__skb_push(skb, offset);
		ret = bpf_prog_run_save_cb(prog, skb) == 1 ? 0 : -EPERM;
		__skb_pull(skb, offset);
	}

	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_skb);

/**
 * __cgroup_bpf_run_filter_sk() - Run a program on a sock
 * @sk: sock structure to manipulate
 * @type: The type of program to be executed
 *
 * The socket passed is expected to be of type INET or INET6.
 *
 * The program type passed in via @type must be suitable for sock
 * filtering. No further check is performed to assert that.
 *
 * This function will return %-EPERM if an attached program was found
 * and if it returned != 1 during execution. In all other cases, 0 is
 * returned.
 */
int __cgroup_bpf_run_filter_sk(struct sock *sk,
			       enum bpf_attach_type type)
{
	struct cgroup *cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
	struct bpf_prog *prog;
	int ret = 0;

	rcu_read_lock();

	prog = rcu_dereference(cgrp->bpf.effective[type]);
	if (prog)
		ret = BPF_PROG_RUN(prog, sk) == 1 ? 0 : -EPERM;

	rcu_read_unlock();

	return ret;
}
EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
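
For context, a minimal userspace sketch of how a program reaches the attach
path above, using the BPF_PROG_ATTACH command of the bpf(2) syscall that was
merged alongside this file. It is a sketch, not part of the file: cgroup_fd
and prog_fd are hypothetical placeholders for an open cgroup v2 directory and
an already-loaded BPF_PROG_TYPE_CGROUP_SKB program, and error handling is
reduced to the bare minimum.

#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/bpf.h>

static int attach_cgroup_prog(int cgroup_fd, int prog_fd)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.target_fd = cgroup_fd;	/* fd of a cgroup v2 directory */
	attr.attach_bpf_fd = prog_fd;	/* fd returned by BPF_PROG_LOAD */
	attr.attach_type = BPF_CGROUP_INET_INGRESS;

	/* the kernel ends up in __cgroup_bpf_update() with cgroup_mutex held */
	return syscall(__NR_bpf, BPF_PROG_ATTACH, &attr, sizeof(attr));
}

Detaching follows the same shape: presumably BPF_PROG_DETACH with the same
target_fd and attach_type, which makes the kernel call __cgroup_bpf_update()
with a NULL @prog so the parent's effective program is re-inherited.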
percpu-refcount: fix reference leak during percpu-atomic transition
percpu_ref_tryget() and percpu_ref_tryget_live() should return "true" IFF
they acquire a reference. But the return value from
atomic_long_inc_not_zero() is a long and may have high bits set, e.g.
PERCPU_COUNT_BIAS, and the return value of the tryget routines is bool, so
the reference may actually be acquired but the routines return "false",
which results in a reference leak since the caller assumes it does not need
to do a corresponding percpu_ref_put().

This was seen when performing CPU hotplug during I/O, as hangs in
blk_mq_freeze_queue_wait where percpu_ref_kill (blk_mq_freeze_queue_start)
raced with percpu_ref_tryget (blk_mq_timeout_work). Sample stack trace:

  __switch_to+0x2c0/0x450
  __schedule+0x2f8/0x970
  schedule+0x48/0xc0
  blk_mq_freeze_queue_wait+0x94/0x120
  blk_mq_queue_reinit_work+0xb8/0x180
  blk_mq_queue_reinit_prepare+0x84/0xa0
  cpuhp_invoke_callback+0x17c/0x600
  cpuhp_up_callbacks+0x58/0x150
  _cpu_up+0xf0/0x1c0
  do_cpu_up+0x120/0x150
  cpu_subsys_online+0x64/0xe0
  device_online+0xb4/0x120
  online_store+0xb4/0xc0
  dev_attr_store+0x68/0xa0
  sysfs_kf_write+0x80/0xb0
  kernfs_fop_write+0x17c/0x250
  __vfs_write+0x6c/0x1e0
  vfs_write+0xd0/0x270
  SyS_write+0x6c/0x110
  system_call+0x38/0xe0

Examination of the queue showed a single reference (no PERCPU_COUNT_BIAS,
and __PERCPU_REF_DEAD and __PERCPU_REF_ATOMIC set) and no requests.
However, conditions at the time of the race were a count of
PERCPU_COUNT_BIAS + 0 with __PERCPU_REF_DEAD and __PERCPU_REF_ATOMIC set.

The fix is to make the tryget routines use an actual boolean internally
instead of the atomic long result truncated to an int.

Fixes: e625305b3907 ("percpu-refcount: make percpu_ref based on longs instead of ints")
Link: https://bugzilla.kernel.org/show_bug.cgi?id=190751
Signed-off-by: Douglas Miller <dougmill@linux.vnet.ibm.com>
Reviewed-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: stable@vger.kernel.org # v3.18+
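
To make the truncation concrete, here is a standalone sketch (not kernel
code) of why a long holding only PERCPU_COUNT_BIAS survives a bool
conversion but not an int truncation, assuming a 64-bit long:

  #include <stdbool.h>
  #include <stdio.h>

  int main(void)
  {
  	/* A count of PERCPU_COUNT_BIAS + 0: only the top bit is set. */
  	long count = (long)(1UL << (sizeof(long) * 8 - 1));

  	int as_int = (int)count;	/* old code: keeps low 32 bits only -> 0 -> "false" */
  	bool as_bool = count;	/* fixed code: any nonzero long -> true */

  	printf("as_int=%d as_bool=%d\n", as_int, as_bool);
  	return 0;
  }

On a typical 64-bit system this prints "as_int=0 as_bool=1": the int path
reports failure even though the reference was taken, which is exactly the
leak described above.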