/* * mft.h - Defines for mft record handling in NTFS Linux kernel driver. * Part of the Linux-NTFS project. * * Copyright (c) 2001-2004 Anton Altaparmakov * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published * by the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program/include file is distributed in the hope that it will be * useful, but WITHOUT ANY WARRANTY; without even the implied warranty * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program (in the main directory of the Linux-NTFS * distribution in the file COPYING); if not, write to the Free Software * Foundation,Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _LINUX_NTFS_MFT_H #define _LINUX_NTFS_MFT_H #include #include #include #include "inode.h" extern MFT_RECORD *map_mft_record(ntfs_inode *ni); extern void unmap_mft_record(ntfs_inode *ni); extern MFT_RECORD *map_extent_mft_record(ntfs_inode *base_ni, MFT_REF mref, ntfs_inode **ntfs_ino); static inline void unmap_extent_mft_record(ntfs_inode *ni) { unmap_mft_record(ni); return; } #ifdef NTFS_RW /** * flush_dcache_mft_record_page - flush_dcache_page() for mft records * @ni: ntfs inode structure of mft record * * Call flush_dcache_page() for the page in which an mft record resides. * * This must be called every time an mft record is modified, just after the * modification. */ static inline void flush_dcache_mft_record_page(ntfs_inode *ni) { flush_dcache_page(ni->page); } extern void __mark_mft_record_dirty(ntfs_inode *ni); /** * mark_mft_record_dirty - set the mft record and the page containing it dirty * @ni: ntfs inode describing the mapped mft record * * Set the mapped (extent) mft record of the (base or extent) ntfs inode @ni, * as well as the page containing the mft record, dirty. Also, mark the base * vfs inode dirty. This ensures that any changes to the mft record are * written out to disk. * * NOTE: Do not do anything if the mft record is already marked dirty. */ static inline void mark_mft_record_dirty(ntfs_inode *ni) { if (!NInoTestSetDirty(ni)) __mark_mft_record_dirty(ni); } extern int ntfs_sync_mft_mirror(ntfs_volume *vol, const unsigned long mft_no, MFT_RECORD *m, int sync); extern int write_mft_record_nolock(ntfs_inode *ni, MFT_RECORD *m, int sync); /** * write_mft_record - write out a mapped (extent) mft record * @ni: ntfs inode describing the mapped (extent) mft record * @m: mapped (extent) mft record to write * @sync: if true, wait for i/o completion * * This is just a wrapper for write_mft_record_nolock() (see mft.c), which * locks the page for the duration of the write. This ensures that there are * no race conditions between writing the mft record via the dirty inode code * paths and via the page cache write back code paths or between writing * neighbouring mft records residing in the same page. * * Locking the page also serializes us against ->readpage() if the page is not * uptodate. * * On success, clean the mft record and return 0. On error, leave the mft * record dirty and return -errno. */ static inline int write_mft_record(ntfs_inode *ni, MFT_RECORD *m, int sync) { struct page *page = ni->page; int err; BUG_ON(!page); lock_page(page); err = write_mft_record_nolock(ni, m, sync); unlock_page(page); return err; } extern bool ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no, const MFT_RECORD *m, ntfs_inode **locked_ni); extern ntfs_inode *ntfs_mft_record_alloc(ntfs_volume *vol, const int mode, ntfs_inode *base_ni, MFT_RECORD **mrec); extern int ntfs_extent_mft_record_free(ntfs_inode *ni, MFT_RECORD *m); #endif /* NTFS_RW */ #endif /* _LINUX_NTFS_MFT_H */ /th>Doug Ledford <dledford@redhat.com>2016-04-28 12:00:38 -0400 commitde82bdff62a9078a6e4f1452e2f2604686e51e49 (patch) treee4a48f60bb12776d98795fb9a616ff617806b58f parentf19bd643dbded8672bfeffe9e51322464e4a9239 (diff)
IB/hfi1: Fix deadlock caused by locking with wrong scope
The locking around the interval RB tree is designed to prevent access to the tree while it's being modified. The locking in its current form is too overzealous, which is causing a deadlock in certain cases with the following backtrace: Kernel panic - not syncing: Watchdog detected hard LOCKUP on cpu 0 CPU: 0 PID: 5836 Comm: IMB-MPI1 Tainted: G O 3.12.18-wfr+ #1 0000000000000000 ffff88087f206c50 ffffffff814f1caa ffffffff817b53f0 ffff88087f206cc8 ffffffff814ecd56 0000000000000010 ffff88087f206cd8 ffff88087f206c78 0000000000000000 0000000000000000 0000000000001662 Call Trace: <NMI> [<ffffffff814f1caa>] dump_stack+0x45/0x56 [<ffffffff814ecd56>] panic+0xc2/0x1cb [<ffffffff810d4370>] ? restart_watchdog_hrtimer+0x50/0x50 [<ffffffff810d4432>] watchdog_overflow_callback+0xc2/0xd0 [<ffffffff81109b4e>] __perf_event_overflow+0x8e/0x2b0 [<ffffffff8110a714>] perf_event_overflow+0x14/0x20 [<ffffffff8101c906>] intel_pmu_handle_irq+0x1b6/0x390 [<ffffffff814f927b>] perf_event_nmi_handler+0x2b/0x50 [<ffffffff814f8ad8>] nmi_handle.isra.3+0x88/0x180 [<ffffffff814f8d39>] do_nmi+0x169/0x310 [<ffffffff814f8177>] end_repeat_nmi+0x1e/0x2e [<ffffffff81272600>] ? unmap_single+0x30/0x30 [<ffffffff814f780d>] ? _raw_spin_lock_irqsave+0x2d/0x40 [<ffffffff814f780d>] ? _raw_spin_lock_irqsave+0x2d/0x40 [<ffffffff814f780d>] ? _raw_spin_lock_irqsave+0x2d/0x40 <<EOE>> <IRQ> [<ffffffffa056c4a8>] hfi1_mmu_rb_search+0x38/0x70 [hfi1] [<ffffffffa05919cb>] user_sdma_free_request+0xcb/0x120 [hfi1] [<ffffffffa0593393>] user_sdma_txreq_cb+0x263/0x350 [hfi1] [<ffffffffa057fad7>] ? sdma_txclean+0x27/0x1c0 [hfi1] [<ffffffffa0593130>] ? user_sdma_send_pkts+0x1710/0x1710 [hfi1] [<ffffffffa057fdd6>] sdma_make_progress+0x166/0x480 [hfi1] [<ffffffff810762c9>] ? ttwu_do_wakeup+0x19/0xd0 [<ffffffffa0581c7e>] sdma_engine_interrupt+0x8e/0x100 [hfi1] [<ffffffffa0546bdd>] sdma_interrupt+0x5d/0xa0 [hfi1] [<ffffffff81097e57>] handle_irq_event_percpu+0x47/0x1d0 [<ffffffff81098017>] handle_irq_event+0x37/0x60 [<ffffffff8109aa5f>] handle_edge_irq+0x6f/0x120 [<ffffffff810044af>] handle_irq+0xbf/0x150 [<ffffffff8104c9b7>] ? irq_enter+0x17/0x80 [<ffffffff8150168d>] do_IRQ+0x4d/0xc0 [<ffffffff814f7c6a>] common_interrupt+0x6a/0x6a <EOI> [<ffffffff81073524>] ? finish_task_switch+0x54/0xe0 [<ffffffff814f56c6>] __schedule+0x3b6/0x7e0 [<ffffffff810763a6>] __cond_resched+0x26/0x30 [<ffffffff814f5eda>] _cond_resched+0x3a/0x50 [<ffffffff814f4f82>] down_write+0x12/0x30 [<ffffffffa0591619>] hfi1_release_user_pages+0x69/0x90 [hfi1] [<ffffffffa059173a>] sdma_rb_remove+0x9a/0xc0 [hfi1] [<ffffffffa056c00d>] __mmu_rb_remove.isra.5+0x5d/0x70 [hfi1] [<ffffffffa056c536>] hfi1_mmu_rb_remove+0x56/0x70 [hfi1] [<ffffffffa059427b>] hfi1_user_sdma_process_request+0x74b/0x1160 [hfi1] [<ffffffffa055c763>] hfi1_aio_write+0xc3/0x100 [hfi1] [<ffffffff8116a14c>] do_sync_readv_writev+0x4c/0x80 [<ffffffff8116b58b>] do_readv_writev+0xbb/0x230 [<ffffffff811a9da1>] ? fsnotify+0x241/0x320 [<ffffffff81073524>] ? finish_task_switch+0x54/0xe0 [<ffffffff8116b795>] vfs_writev+0x35/0x60 [<ffffffff8116b8c9>] SyS_writev+0x49/0xc0 [<ffffffff810cd876>] ? __audit_syscall_exit+0x1f6/0x2a0 [<ffffffff814ff992>] system_call_fastpath+0x16/0x1b As evident from the backtrace above, the process was being put to sleep while holding the lock. Limiting the scope of the lock only to the RB tree operation fixes the above error allowing for proper locking and the process being put to sleep when needed. Reviewed-by: Dennis Dalessandro <dennis.dalessandro@intel.com> Reviewed-by: Dean Luick <dean.luick@intel.com> Signed-off-by: Mitko Haralanov <mitko.haralanov@intel.com> Signed-off-by: Doug Ledford <dledford@redhat.com>