From 650495dedc34daf8590c708a5b48f82ed2787b75 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 13 May 2013 08:38:35 +0900 Subject: f2fs: fix the inconsistent state of data pages In get_lock_data_page, if there is a data race between get_dnode_of_data for node and grab_cache_page for data, f2fs is able to face with the following BUG_ON(dn.data_blkaddr == NEW_ADDR). kernel BUG at /home/zeus/f2fs_test/src/fs/f2fs/data.c:251! [] get_lock_data_page+0x1ec/0x210 [f2fs] Call Trace: [] f2fs_readdir+0x89/0x210 [f2fs] [] ? fillonedir+0x100/0x100 [] ? fillonedir+0x100/0x100 [] vfs_readdir+0xb8/0xe0 [] sys_getdents+0x8f/0x110 [] system_call_fastpath+0x16/0x1b This bug is able to be occurred when the block address of the data block is changed after f2fs_put_dnode(). In order to avoid that, this patch fixes the lock order of node and data blocks in which the node block lock is covered by the data block lock. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs/f2fs/data.c') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 91ff93b0b0f4..05fb5c6077b8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -233,18 +233,23 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) struct page *page; int err; +repeat: + page = grab_cache_page(mapping, index); + if (!page) + return ERR_PTR(-ENOMEM); + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err) + if (err) { + f2fs_put_page(page, 1); return ERR_PTR(err); + } f2fs_put_dnode(&dn); - if (dn.data_blkaddr == NULL_ADDR) + if (dn.data_blkaddr == NULL_ADDR) { + f2fs_put_page(page, 1); return ERR_PTR(-ENOENT); -repeat: - page = grab_cache_page(mapping, index); - if (!page) - return ERR_PTR(-ENOMEM); + } if (PageUptodate(page)) return page; -- cgit v1.2.3 From 64aa7ed98db489d1c41ef140876ada38498678ab Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 20 May 2013 09:55:50 +0900 Subject: f2fs: change get_new_data_page to pass a locked node page This patch is for passing a locked node page to get_dnode_of_data. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'fs/f2fs/data.c') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 05fb5c6077b8..af7454939362 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -280,8 +280,8 @@ repeat: * Also, caller should grab and release a mutex by calling mutex_lock_op() and * mutex_unlock_op(). */ -struct page *get_new_data_page(struct inode *inode, pgoff_t index, - bool new_i_size) +struct page *get_new_data_page(struct inode *inode, + struct page *npage, pgoff_t index, bool new_i_size) { struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; @@ -289,18 +289,20 @@ struct page *get_new_data_page(struct inode *inode, pgoff_t index, struct dnode_of_data dn; int err; - set_new_dnode(&dn, inode, NULL, NULL, 0); + set_new_dnode(&dn, inode, npage, npage, 0); err = get_dnode_of_data(&dn, index, ALLOC_NODE); if (err) return ERR_PTR(err); if (dn.data_blkaddr == NULL_ADDR) { if (reserve_new_block(&dn)) { - f2fs_put_dnode(&dn); + if (!npage) + f2fs_put_dnode(&dn); return ERR_PTR(-ENOSPC); } } - f2fs_put_dnode(&dn); + if (!npage) + f2fs_put_dnode(&dn); repeat: page = grab_cache_page(mapping, index); if (!page) -- cgit v1.2.3 From 44a83ff6a81d84ab83bcb43a49ff1ba6c7e17cd1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 20 May 2013 10:10:29 +0900 Subject: f2fs: update inode page after creation I found a bug when testing power-off-recovery as follows. [Bug Scenario] 1. create a file 2. fsync the file 3. reboot w/o any sync 4. try to recover the file - found its fsync mark - found its dentry mark : try to recover its dentry - get its file name - get its parent inode number : here we got zero value The reason why we get the wrong parent inode number is that we didn't synchronize the inode page with its newly created inode information perfectly. Especially, previous f2fs stores fi->i_pino and writes it to the cached node page in a wrong order, which incurs the zero-valued i_pino during the recovery. So, this patch modifies the creation flow to fix the synchronization order of inode page with its inode. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/f2fs/data.c') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index af7454939362..c320f7f31327 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -279,6 +279,7 @@ repeat: * * Also, caller should grab and release a mutex by calling mutex_lock_op() and * mutex_unlock_op(). + * Note that, npage is set only by make_empty_dir. */ struct page *get_new_data_page(struct inode *inode, struct page *npage, pgoff_t index, bool new_i_size) -- cgit v1.2.3 From 6f85b3520325a67ee4ac33e75bbcdbc25c79ce69 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 20 May 2013 16:15:22 +0900 Subject: f2fs: avoid RECLAIM_FS-ON-W: deadlock This patch tries to avoid the following deadlock condition of which the reclaim path can trigger f2fs_balance_fs again. ================================= [ INFO: inconsistent lock state ] --------------------------------- inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-W} usage. kswapd0/41 [HC0[0]:SC0[0]:HE1:SE1] takes: (&sbi->gc_mutex){+.+.?.}, at: f2fs_balance_fs+0xe6/0x100 [f2fs] {RECLAIM_FS-ON-W} state was registered at: [] mark_held_locks+0xb9/0x140 [] lockdep_trace_alloc+0x85/0xf0 [] __alloc_pages_nodemask+0x7c/0x9b0 [] alloc_pages_current+0xb8/0x180 [] __page_cache_alloc+0xaf/0xd0 [] find_or_create_page+0x4c/0xb0 [] find_data_page+0x14e/0x210 [f2fs] [] f2fs_gc+0x9eb/0xd90 [f2fs] [] f2fs_balance_fs+0xee/0x100 [f2fs] [] f2fs_setattr+0x6c/0x200 [f2fs] [] notify_change+0x1db/0x3a0 [] do_truncate+0x60/0xa0 [] vfs_truncate+0x185/0x1b0 [] do_sys_truncate+0x5c/0xa0 [] SyS_truncate+0xe/0x10 [] system_call_fastpath+0x16/0x1b Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/f2fs/data.c') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c320f7f31327..1644fffea251 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -199,7 +199,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) if (dn.data_blkaddr == NEW_ADDR) return ERR_PTR(-EINVAL); - page = grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); if (!page) return ERR_PTR(-ENOMEM); @@ -234,7 +234,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) int err; repeat: - page = grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); if (!page) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 35b09d82c3cf3fc0b8b6d923e7fd82ff7926aafc Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 23 May 2013 22:57:53 +0900 Subject: f2fs: push some variables to debug part Some, counters are needed only for the statistical information while debugging. So, those can be controlled using CONFIG_F2FS_STAT_FS, pushing the usage for few variables under this flag. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs/f2fs/data.c') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1644fffea251..93917e31dbdf 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -68,7 +68,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, struct buffer_head *bh_result) { struct f2fs_inode_info *fi = F2FS_I(inode); +#ifdef CONFIG_F2FS_STAT_FS struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); +#endif pgoff_t start_fofs, end_fofs; block_t start_blkaddr; @@ -78,7 +80,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, return 0; } +#ifdef CONFIG_F2FS_STAT_FS sbi->total_hit_ext++; +#endif start_fofs = fi->ext.fofs; end_fofs = fi->ext.fofs + fi->ext.len - 1; start_blkaddr = fi->ext.blk_addr; @@ -96,7 +100,9 @@ static int check_extent_cache(struct inode *inode, pgoff_t pgofs, else bh_result->b_size = UINT_MAX; +#ifdef CONFIG_F2FS_STAT_FS sbi->read_hit_ext++; +#endif read_unlock(&fi->ext.ext_lock); return 1; } -- cgit v1.2.3 From 699489bbbea4fc3b9b735d69941cf4fca91ce1d5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 7 Jun 2013 22:08:23 +0900 Subject: f2fs: sync dir->i_size with its block allocation If new dentry block is allocated and its i_size is updated, we should update its inode block together in order to sync i_size and its block allocation. Otherwise, we can loose additional dentry block due to the unconsistent i_size. Errorneous Scenario ------------------- In the recovery routine, - recovery_dentry | - __f2fs_add_link | | - get_new_data_page | | | - i_size_write(new_i_size) | | | - mark_inode_dirty_sync(dir) | | - update_parent_metadata | | | - mark_inode_dirty(dir) | - write_checkpoint - sync_dirty_dir_inodes - filemap_flush(dentry_blocks) - f2fs_write_data_page - skip to write the last dentry block due to index < i_size In the above flow, new_i_size is not updated to its inode block so that the last dentry block will be lost accordingly. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs/f2fs/data.c') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 93917e31dbdf..5b145fcc2864 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -339,6 +339,8 @@ repeat: if (new_i_size && i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); + /* Only the directory inode sets new_i_size */ + set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); mark_inode_dirty_sync(inode); } return page; -- cgit v1.2.3 From b25958b6ecf1dce087e62b9aa27cf8f2fe9b5c86 Mon Sep 17 00:00:00 2001 From: Haicheng Li Date: Thu, 13 Jun 2013 16:59:29 +0800 Subject: f2fs: optimize do_write_data_page() Since "need_inplace_update() == true" is a very rare case, using unlikely() to give compiler a chance to optimize the code. Signed-off-by: Haicheng Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs/f2fs/data.c') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 5b145fcc2864..6d4a743caf86 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -497,8 +497,9 @@ int do_write_data_page(struct page *page) * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (old_blk_addr != NEW_ADDR && !is_cold_data(page) && - need_inplace_update(inode)) { + if (unlikely(old_blk_addr != NEW_ADDR && + !is_cold_data(page) && + need_inplace_update(inode))) { rewrite_data_page(F2FS_SB(inode->i_sb), page, old_blk_addr); } else { -- cgit v1.2.3 From a1dd3c13ce65b726fddfe72b9d2f1009db983ce6 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 27 Jun 2013 13:04:08 +0900 Subject: f2fs: fix to recover i_size from roll-forward If user requests many data writes and fsync together, the last updated i_size should be stored to the inode block consistently. But, previous write_end just marks the inode as dirty and doesn't update its metadata into its inode block. After that, fsync just writes the inode block with newly updated data index excluding inode metadata updates. So, this patch introduces write_end in which updates inode block too when the i_size is changed. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'fs/f2fs/data.c') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6d4a743caf86..e88f46f122aa 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -701,6 +701,27 @@ err: return err; } +static int f2fs_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = page->mapping->host; + + SetPageUptodate(page); + set_page_dirty(page); + + if (pos + copied > i_size_read(inode)) { + i_size_write(inode, pos + copied); + mark_inode_dirty(inode); + update_inode_page(inode); + } + + unlock_page(page); + page_cache_release(page); + return copied; +} + static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t offset, unsigned long nr_segs) { @@ -757,7 +778,7 @@ const struct address_space_operations f2fs_dblock_aops = { .writepage = f2fs_write_data_page, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, - .write_end = nobh_write_end, + .write_end = f2fs_write_end, .set_page_dirty = f2fs_set_data_page_dirty, .invalidatepage = f2fs_invalidate_data_page, .releasepage = f2fs_release_data_page, -- cgit v1.2.3