#include #include #include #include /* depends on mm.h include */ static DEFINE_MUTEX(swap_cgroup_mutex); struct swap_cgroup_ctrl { struct page **map; unsigned long length; spinlock_t lock; }; static struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; struct swap_cgroup { unsigned short id; }; #define SC_PER_PAGE (PAGE_SIZE/sizeof(struct swap_cgroup)) /* * SwapCgroup implements "lookup" and "exchange" operations. * In typical usage, this swap_cgroup is accessed via memcg's charge/uncharge * against SwapCache. At swap_free(), this is accessed directly from swap. * * This means, * - we have no race in "exchange" when we're accessed via SwapCache because * SwapCache(and its swp_entry) is under lock. * - When called via swap_free(), there is no user of this entry and no race. * Then, we don't need lock around "exchange". * * TODO: we can push these buffers out to HIGHMEM. */ /* * allocate buffer for swap_cgroup. */ static int swap_cgroup_prepare(int type) { struct page *page; struct swap_cgroup_ctrl *ctrl; unsigned long idx, max; ctrl = &swap_cgroup_ctrl[type]; for (idx = 0; idx < ctrl->length; idx++) { page = alloc_page(GFP_KERNEL | __GFP_ZERO); if (!page) goto not_enough_page; ctrl->map[idx] = page; } return 0; not_enough_page: max = idx; for (idx = 0; idx < max; idx++) __free_page(ctrl->map[idx]); return -ENOMEM; } static struct swap_cgroup *lookup_swap_cgroup(swp_entry_t ent, struct swap_cgroup_ctrl **ctrlp) { pgoff_t offset = swp_offset(ent); struct swap_cgroup_ctrl *ctrl; struct page *mappage; struct swap_cgroup *sc; ctrl = &swap_cgroup_ctrl[swp_type(ent)]; if (ctrlp) *ctrlp = ctrl; mappage = ctrl->map[offset / SC_PER_PAGE]; sc = page_address(mappage); return sc + offset % SC_PER_PAGE; } /** * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. * @ent: swap entry to be cmpxchged * @old: old id * @new: new id * * Returns old id at success, 0 at failure. 
* (There is no mem_cgroup using 0 as its id) */ unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new) { struct swap_cgroup_ctrl *ctrl; struct swap_cgroup *sc; unsigned long flags; unsigned short retval; sc = lookup_swap_cgroup(ent, &ctrl); spin_lock_irqsave(&ctrl->lock, flags); retval = sc->id; if (retval == old) sc->id = new; else retval = 0; spin_unlock_irqrestore(&ctrl->lock, flags); return retval; } /** * swap_cgroup_record - record mem_cgroup for this swp_entry. * @ent: swap entry to be recorded into * @id: mem_cgroup to be recorded * * Returns old value at success, 0 at failure. * (Of course, old value can be 0.) */ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) { struct swap_cgroup_ctrl *ctrl; struct swap_cgroup *sc; unsigned short old; unsigned long flags; sc = lookup_swap_cgroup(ent, &ctrl); spin_lock_irqsave(&ctrl->lock, flags); old = sc->id; sc->id = id; spin_unlock_irqrestore(&ctrl->lock, flags); return old; } /** * lookup_swap_cgroup_id - lookup mem_cgroup id tied to swap entry * @ent: swap entry to be looked up. * * Returns ID of mem_cgroup at success. 0 at failure. 
(0 is invalid ID) */ unsigned short lookup_swap_cgroup_id(swp_entry_t ent) { return lookup_swap_cgroup(ent, NULL)->id; } int swap_cgroup_swapon(int type, unsigned long max_pages) { void *array; unsigned long array_size; unsigned long length; struct swap_cgroup_ctrl *ctrl; if (!do_swap_account) return 0; length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); array_size = length * sizeof(void *); array = vzalloc(array_size); if (!array) goto nomem; ctrl = &swap_cgroup_ctrl[type]; mutex_lock(&swap_cgroup_mutex); ctrl->length = length; ctrl->map = array; spin_lock_init(&ctrl->lock); if (swap_cgroup_prepare(type)) { /* memory shortage */ ctrl->map = NULL; ctrl->length = 0; mutex_unlock(&swap_cgroup_mutex); vfree(array); goto nomem; } mutex_unlock(&swap_cgroup_mutex); return 0; nomem: pr_info("couldn't allocate enough memory for swap_cgroup\n"); pr_info("swap_cgroup can be disabled by swapaccount=0 boot option\n"); return -ENOMEM; } void swap_cgroup_swapoff(int type) { struct page **map; unsigned long i, length; struct swap_cgroup_ctrl *ctrl; if (!do_swap_account) return; mutex_lock(&swap_cgroup_mutex); ctrl = &swap_cgroup_ctrl[type]; map = ctrl->map; length = ctrl->length; ctrl->map = NULL; ctrl->length = 0; mutex_unlock(&swap_cgroup_mutex); if (map) { for (i = 0; i < length; i++) { struct page *page = map[i]; if (page) __free_page(page); } vfree(map); } } td>2017-01-28 07:49:42 -0500 commit966d2b04e070bc040319aaebfec09e0144dc3341 (patch) tree4b96156e3d1dd4dfd6039b7c219c9dc4616da52d /include/net/xfrm.h parent1b1bc42c1692e9b62756323c675a44cb1a1f9dbd (diff)
percpu-refcount: fix reference leak during percpu-atomic transition
percpu_ref_tryget() and percpu_ref_tryget_live() should return "true" IFF they acquire a reference. But the return value from atomic_long_inc_not_zero() is a long and may have high bits set, e.g. PERCPU_COUNT_BIAS, and the return value of the tryget routines is bool, so the reference may actually be acquired but the routines return "false", which results in a reference leak since the caller assumes it does not need to do a corresponding percpu_ref_put().

This was seen when performing CPU hotplug during I/O, as hangs in blk_mq_freeze_queue_wait where percpu_ref_kill (blk_mq_freeze_queue_start) raced with percpu_ref_tryget (blk_mq_timeout_work).

Sample stack trace:

__switch_to+0x2c0/0x450
__schedule+0x2f8/0x970
schedule+0x48/0xc0
blk_mq_freeze_queue_wait+0x94/0x120
blk_mq_queue_reinit_work+0xb8/0x180
blk_mq_queue_reinit_prepare+0x84/0xa0
cpuhp_invoke_callback+0x17c/0x600
cpuhp_up_callbacks+0x58/0x150
_cpu_up+0xf0/0x1c0
do_cpu_up+0x120/0x150
cpu_subsys_online+0x64/0xe0
device_online+0xb4/0x120
online_store+0xb4/0xc0
dev_attr_store+0x68/0xa0
sysfs_kf_write+0x80/0xb0
kernfs_fop_write+0x17c/0x250
__vfs_write+0x6c/0x1e0
vfs_write+0xd0/0x270
SyS_write+0x6c/0x110
system_call+0x38/0xe0

Examination of the queue showed a single reference (no PERCPU_COUNT_BIAS, and __PERCPU_REF_DEAD, __PERCPU_REF_ATOMIC set) and no requests. However, conditions at the time of the race are count of PERCPU_COUNT_BIAS + 0 and __PERCPU_REF_DEAD and __PERCPU_REF_ATOMIC set.

The fix is to make the tryget routines use an actual boolean internally instead of the atomic long result truncated to an int.

Fixes: e625305b3907 percpu-refcount: make percpu_ref based on longs instead of ints
Link: https://bugzilla.kernel.org/show_bug.cgi?id=190751
Signed-off-by: Douglas Miller <dougmill@linux.vnet.ibm.com>
Reviewed-by: Jens Axboe <axboe@fb.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Fixes: e625305b3907 ("percpu-refcount: make percpu_ref based on longs instead of ints")
Cc: stable@vger.kernel.org # v3.18+
Diffstat (limited to 'include/net/xfrm.h')