From 1cfaba5b49a5e9133b28da6d1a459240c1f61993 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Dec 2017 08:38:30 -0800 Subject: writeback: synchronize sync(2) against cgroup writeback membership switches commit 7fc5854f8c6efae9e7624970ab49a1eac2faefb1 upstream. sync_inodes_sb() can race against cgwb (cgroup writeback) membership switches and fail to writeback some inodes. For example, if an inode switches to another wb while sync_inodes_sb() is in progress, the new wb might not be visible to bdi_split_work_to_wbs() at all or the inode might jump from a wb which hasn't issued writebacks yet to one which already has. This patch adds backing_dev_info->wb_switch_rwsem to synchronize cgwb switch path against sync_inodes_sb() so that sync_inodes_sb() is guaranteed to see all the target wbs and inodes can't jump wbs to escape syncing. v2: Fixed misplaced rwsem init. Spotted by Jiufei. Signed-off-by: Tejun Heo Reported-by: Jiufei Xue Link: http://lkml.kernel.org/r/dc694ae2-f07f-61e1-7097-7c8411cee12d@gmail.com Acked-by: Jan Kara Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/fs-writeback.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f3aea1b8702c..55bff21c71ae 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -331,11 +331,22 @@ struct inode_switch_wbs_context { struct work_struct work; }; +static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) +{ + down_write(&bdi->wb_switch_rwsem); +} + +static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) +{ + up_write(&bdi->wb_switch_rwsem); +} + static void inode_switch_wbs_work_fn(struct work_struct *work) { struct inode_switch_wbs_context *isw = container_of(work, struct inode_switch_wbs_context, work); struct inode *inode = isw->inode; + struct backing_dev_info *bdi = inode_to_bdi(inode); struct address_space *mapping = inode->i_mapping; struct bdi_writeback *old_wb = inode->i_wb; struct bdi_writeback *new_wb = isw->new_wb; @@ -343,6 +354,12 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) bool switched = false; void **slot; + /* + * If @inode switches cgwb membership while sync_inodes_sb() is + * being issued, sync_inodes_sb() might miss it. Synchronize. + */ + down_read(&bdi->wb_switch_rwsem); + /* * By the time control reaches here, RCU grace period has passed * since I_WB_SWITCH assertion and all wb stat update transactions @@ -435,6 +452,8 @@ skip_switch: spin_unlock(&new_wb->list_lock); spin_unlock(&old_wb->list_lock); + up_read(&bdi->wb_switch_rwsem); + if (switched) { wb_wakeup(new_wb); wb_put(old_wb); @@ -475,9 +494,18 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) if (inode->i_state & I_WB_SWITCH) return; + /* + * Avoid starting new switches while sync_inodes_sb() is in + * progress. Otherwise, if the down_write protected issue path + * blocks heavily, we might end up starting a large number of + * switches which will block on the rwsem. + */ + if (!down_read_trylock(&bdi->wb_switch_rwsem)) + return; + isw = kzalloc(sizeof(*isw), GFP_ATOMIC); if (!isw) - return; + goto out_unlock; /* find and pin the new wb */ rcu_read_lock(); @@ -511,12 +539,14 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); - return; + goto out_unlock; out_free: if (isw->new_wb) wb_put(isw->new_wb); kfree(isw); +out_unlock: + up_read(&bdi->wb_switch_rwsem); } /** @@ -894,6 +924,9 @@ fs_initcall(cgroup_writeback_init); #else /* CONFIG_CGROUP_WRITEBACK */ +static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } +static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { } + static struct bdi_writeback * locked_inode_to_wb_and_lock_list(struct inode *inode) __releases(&inode->i_lock) @@ -2408,8 +2441,11 @@ void sync_inodes_sb(struct super_block *sb) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); + /* protect against inode wb switch, see inode_switch_wbs_work_fn() */ + bdi_down_write_wb_switch_rwsem(bdi); bdi_split_work_to_wbs(bdi, &work, false); wb_wait_for_completion(bdi, &done); + bdi_up_write_wb_switch_rwsem(bdi); wait_sb_inodes(sb); } -- cgit v1.2.3 From 54e35658dd06e3768b19b6e0f81bea362f3ae3fa Mon Sep 17 00:00:00 2001 From: Jiufei Xue Date: Fri, 17 May 2019 14:31:44 -0700 Subject: fs/writeback.c: use rcu_barrier() to wait for inflight wb switches going into workqueue when umount commit ec084de929e419e51bcdafaafe567d9e7d0273b7 upstream. synchronize_rcu() didn't wait for call_rcu() callbacks, so inode wb switch may not go to the workqueue after synchronize_rcu(). Thus previous scheduled switches was not finished even flushing the workqueue, which will cause a NULL pointer dereferenced followed below. VFS: Busy inodes after unmount of vdd. Self-destruct in 5 seconds. Have a nice day... BUG: unable to handle kernel NULL pointer dereference at 0000000000000278 evict+0xb3/0x180 iput+0x1b0/0x230 inode_switch_wbs_work_fn+0x3c0/0x6a0 worker_thread+0x4e/0x490 ? process_one_work+0x410/0x410 kthread+0xe6/0x100 ret_from_fork+0x39/0x50 Replace the synchronize_rcu() call with a rcu_barrier() to wait for all pending callbacks to finish. And inc isw_nr_in_flight after call_rcu() in inode_switch_wbs() to make more sense. Link: http://lkml.kernel.org/r/20190429024108.54150-1-jiufei.xue@linux.alibaba.com Signed-off-by: Jiufei Xue Acked-by: Tejun Heo Suggested-by: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/fs-writeback.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 55bff21c71ae..8b93d4b98428 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -530,8 +530,6 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) isw->inode = inode; - atomic_inc(&isw_nr_in_flight); - /* * In addition to synchronizing among switchers, I_WB_SWITCH tells * the RCU protected stat update paths to grab the mapping's @@ -539,6 +537,9 @@ static void inode_switch_wbs(struct inode *inode, int new_wb_id) * Let's continue after I_WB_SWITCH is guaranteed to be visible. */ call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); + + atomic_inc(&isw_nr_in_flight); + goto out_unlock; out_free: @@ -908,7 +909,11 @@ restart: void cgroup_writeback_umount(void) { if (atomic_read(&isw_nr_in_flight)) { - synchronize_rcu(); + /* + * Use rcu_barrier() to wait for all pending callbacks to + * ensure that all in-flight wb switches are in the workqueue. + */ + rcu_barrier(); flush_workqueue(isw_wq); } } -- cgit v1.2.3 From 587a816cbe4cc9e6607e83ffe20c36582deca111 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 13 Jun 2019 15:30:41 -0700 Subject: blkcg, writeback: dead memcgs shouldn't contribute to writeback ownership arbitration [ Upstream commit 6631142229005e1b1c311a09efe9fb3cfdac8559 ] wbc_account_io() collects information on cgroup ownership of writeback pages to determine which cgroup should own the inode. Pages can stay associated with dead memcgs but we want to avoid attributing IOs to dead blkcgs as much as possible as the association is likely to be stale. However, currently, pages associated with dead memcgs contribute to the accounting delaying and/or confusing the arbitration. Fix it by ignoring pages associated with dead memcgs. Signed-off-by: Tejun Heo Cc: Jan Kara Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- fs/fs-writeback.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 8b93d4b98428..baaed9369ab4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -721,6 +721,7 @@ void wbc_detach_inode(struct writeback_control *wbc) void wbc_account_io(struct writeback_control *wbc, struct page *page, size_t bytes) { + struct cgroup_subsys_state *css; int id; /* @@ -732,7 +733,12 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page, if (!wbc->wb) return; - id = mem_cgroup_css_from_page(page)->id; + css = mem_cgroup_css_from_page(page); + /* dead cgroups shouldn't contribute to inode ownership arbitration */ + if (!(css->flags & CSS_ONLINE)) + return; + + id = css->id; if (id == wbc->wb_id) { wbc->wb_bytes += bytes; -- cgit v1.2.3 From d92f4f02d46f7fe990fffb64987200443de3014a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 Nov 2019 12:18:29 -0800 Subject: cgroup,writeback: don't switch wbs immediately on dead wbs if the memcg is dead commit 65de03e251382306a4575b1779c57c87889eee49 upstream. cgroup writeback tries to refresh the associated wb immediately if the current wb is dead. This is to avoid keeping issuing IOs on the stale wb after memcg - blkcg association has changed (ie. when blkcg got disabled / enabled higher up in the hierarchy). Unfortunately, the logic gets triggered spuriously on inodes which are associated with dead cgroups. When the logic is triggered on dead cgroups, the attempt fails only after doing quite a bit of work allocating and initializing a new wb. While c3aab9a0bd91 ("mm/filemap.c: don't initiate writeback if mapping has no dirty pages") alleviated the issue significantly as it now only triggers when the inode has dirty pages. However, the condition can still be triggered before the inode is switched to a different cgroup and the logic simply doesn't make sense. Skip the immediate switching if the associated memcg is dying. This is a simplified version of the following two patches: * https://lore.kernel.org/linux-mm/20190513183053.GA73423@dennisz-mbp/ * http://lkml.kernel.org/r/156355839560.2063.5265687291430814589.stgit@buzz Cc: Konstantin Khlebnikov Fixes: e8a7abf5a5bd ("writeback: disassociate inodes from dying bdi_writebacks") Acked-by: Dennis Zhou Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- fs/fs-writeback.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs/fs-writeback.c') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index baaed9369ab4..882e9d6830df 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -582,10 +582,13 @@ void wbc_attach_and_unlock_inode(struct writeback_control *wbc, spin_unlock(&inode->i_lock); /* - * A dying wb indicates that the memcg-blkcg mapping has changed - * and a new wb is already serving the memcg. Switch immediately. + * A dying wb indicates that either the blkcg associated with the + * memcg changed or the associated memcg is dying. In the first + * case, a replacement wb should already be available and we should + * refresh the wb immediately. In the second case, trying to + * refresh will keep failing. */ - if (unlikely(wb_dying(wbc->wb))) + if (unlikely(wb_dying(wbc->wb) && !css_is_dying(wbc->wb->memcg_css))) inode_switch_wbs(inode, wbc->wb_id); } -- cgit v1.2.3