From 1f60fbe7274918adb8db2f616e321890730ab7e3 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 23 Apr 2016 22:50:07 -0400 Subject: ext4: allow readdir()'s of large empty directories to be interrupted If a directory has a large number of empty blocks, iterating over all of them can take a long time, leading to scheduler warnings and users getting irritated when they can't kill a process in the middle of one of these long-running readdir operations. Fix this by adding checks to ext4_readdir() and ext4_htree_fill_tree(). This was reverted earlier due to a typo in the original commit where I experimented with using signal_pending() instead of fatal_signal_pending(). The test was in the wrong place if we were going to return signal_pending() since we would end up returning duplicant entries. See 9f2394c9be47 for a more detailed explanation. Added fix as suggested by Linus to check for signal_pending() in in the filldir() functions. Reported-by: Benjamin LaHaise Google-Bug-Id: 27880676 Signed-off-by: Theodore Ts'o --- fs/compat.c | 4 ++++ fs/ext4/dir.c | 5 +++++ fs/ext4/namei.c | 5 +++++ fs/readdir.c | 4 ++++ 4 files changed, 18 insertions(+) diff --git a/fs/compat.c b/fs/compat.c index a71936a3f4cb..f940cb20562f 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -936,6 +936,8 @@ static int compat_filldir(struct dir_context *ctx, const char *name, int namlen, } dirent = buf->previous; if (dirent) { + if (signal_pending(current)) + return -EINTR; if (__put_user(offset, &dirent->d_off)) goto efault; } @@ -1020,6 +1022,8 @@ static int compat_filldir64(struct dir_context *ctx, const char *name, dirent = buf->previous; if (dirent) { + if (signal_pending(current)) + return -EINTR; if (__put_user_unaligned(offset, &dirent->d_off)) goto efault; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 561d7308b393..4173bfe21114 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -150,6 +150,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) while (ctx->pos < inode->i_size) { struct ext4_map_blocks map; + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto errout; + } + cond_resched(); map.m_lblk = ctx->pos >> EXT4_BLOCK_SIZE_BITS(sb); map.m_len = 1; err = ext4_map_blocks(NULL, inode, &map, 0); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 48e4b8907826..c07422d254b6 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1107,6 +1107,11 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, } while (1) { + if (fatal_signal_pending(current)) { + err = -ERESTARTSYS; + goto errout; + } + cond_resched(); block = dx_get_block(frame->at); ret = htree_dirblock_to_tree(dir_file, dir, block, &hinfo, start_hash, start_minor_hash); diff --git a/fs/readdir.c b/fs/readdir.c index e69ef3b79787..5f2d4bee5a73 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -169,6 +169,8 @@ static int filldir(struct dir_context *ctx, const char *name, int namlen, } dirent = buf->previous; if (dirent) { + if (signal_pending(current)) + return -EINTR; if (__put_user(offset, &dirent->d_off)) goto efault; } @@ -248,6 +250,8 @@ static int filldir64(struct dir_context *ctx, const char *name, int namlen, return -EINVAL; dirent = buf->previous; if (dirent) { + if (signal_pending(current)) + return -EINTR; if (__put_user(offset, &dirent->d_off)) goto efault; } -- cgit v1.2.3 From 06bd3c36a733ac27962fea7d6f47168841376824 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 24 Apr 2016 00:56:03 -0400 Subject: ext4: fix data exposure after a crash Huang has reported that in his powerfail testing he is seeing stale block contents in some of recently allocated blocks although he mounts ext4 in data=ordered mode. After some investigation I have found out that indeed when delayed allocation is used, we don't add inode to transaction's list of inodes needing flushing before commit. Originally we were doing that but commit f3b59291a69d removed the logic with a flawed argument that it is not needed. The problem is that although for delayed allocated blocks we write their contents immediately after allocating them, there is no guarantee that the IO scheduler or device doesn't reorder things and thus transaction allocating blocks and attaching them to inode can reach stable storage before actual block contents. Actually whenever we attach freshly allocated blocks to inode using a written extent, we should add inode to transaction's ordered inode list to make sure we properly wait for block contents to be written before committing the transaction. So that is what we do in this patch. This also handles other cases where stale data exposure was possible - like filling hole via mmap in data=ordered,nodelalloc mode. The only exception to the above rule are extending direct IO writes where blkdev_direct_IO() waits for IO to complete before increasing i_size and thus stale data exposure is not possible. For now we don't complicate the code with optimizing this special case since the overhead is pretty low. In case this is observed to be a performance problem we can always handle it using a special flag to ext4_map_blocks(). CC: stable@vger.kernel.org Fixes: f3b59291a69d0b734be1fc8be489fef2dd846d3d Reported-by: "HUANG Weller (CM/ESW12-CN)" Tested-by: "HUANG Weller (CM/ESW12-CN)" Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 981a1fc30eaa..250c2df04a92 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -684,6 +684,21 @@ out_sem: ret = check_block_validity(inode, map); if (ret != 0) return ret; + + /* + * Inodes with freshly allocated blocks where contents will be + * visible after transaction commit must be on transaction's + * ordered data list. + */ + if (map->m_flags & EXT4_MAP_NEW && + !(map->m_flags & EXT4_MAP_UNWRITTEN) && + !(flags & EXT4_GET_BLOCKS_ZERO) && + !IS_NOQUOTA(inode) && + ext4_should_order_data(inode)) { + ret = ext4_jbd2_file_inode(handle, inode); + if (ret) + return ret; + } } return retval; } @@ -1289,15 +1304,6 @@ static int ext4_write_end(struct file *file, int i_size_changed = 0; trace_ext4_write_end(inode, pos, len, copied); - if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) { - ret = ext4_jbd2_file_inode(handle, inode); - if (ret) { - unlock_page(page); - put_page(page); - goto errout; - } - } - if (ext4_has_inline_data(inode)) { ret = ext4_write_inline_data_end(inode, pos, len, copied, page); -- cgit v1.2.3 From 3957ef53a5033bd519b19cf375061be1929bdb5f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 24 Apr 2016 00:56:05 -0400 Subject: ext4: remove EXT4_STATE_ORDERED_MODE This flag is just duplicating what ext4_should_order_data() tells you and is used in a single place. Furthermore it doesn't reflect changes to inode data journalling flag so it may be possibly misleading. Just remove it. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 - fs/ext4/inode.c | 5 +---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 349afebe21ee..f75e9ebd4ca2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1549,7 +1549,6 @@ enum { EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ - EXT4_STATE_ORDERED_MODE, /* data=ordered mode */ EXT4_STATE_EXT_PRECACHED, /* extents have been precached */ }; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 250c2df04a92..8ba46ad06aed 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3540,10 +3540,7 @@ void ext4_set_aops(struct inode *inode) { switch (ext4_inode_journal_mode(inode)) { case EXT4_INODE_ORDERED_DATA_MODE: - ext4_set_inode_state(inode, EXT4_STATE_ORDERED_MODE); - break; case EXT4_INODE_WRITEBACK_DATA_MODE: - ext4_clear_inode_state(inode, EXT4_STATE_ORDERED_MODE); break; case EXT4_INODE_JOURNAL_DATA_MODE: inode->i_mapping->a_ops = &ext4_journalled_aops; @@ -3636,7 +3633,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, } else { err = 0; mark_buffer_dirty(bh); - if (ext4_test_inode_state(inode, EXT4_STATE_ORDERED_MODE)) + if (ext4_should_order_data(inode)) err = ext4_jbd2_file_inode(handle, inode); } -- cgit v1.2.3 From 41617e1a8dec9fe082ba5dec26bacb154eb55482 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 24 Apr 2016 00:56:07 -0400 Subject: jbd2: add support for avoiding data writes during transaction commits Currently when filesystem needs to make sure data is on permanent storage before committing a transaction it adds inode to transaction's inode list. During transaction commit, jbd2 writes back all dirty buffers that have allocated underlying blocks and waits for the IO to finish. However when doing writeback for delayed allocated data, we allocate blocks and immediately submit the data. Thus asking jbd2 to write dirty pages just unnecessarily adds more work to jbd2 possibly writing back other redirtied blocks. Add support to jbd2 to allow filesystem to ask jbd2 to only wait for outstanding data writes before committing a transaction and thus avoid unnecessary writes. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/ext4_jbd2.h | 3 ++- fs/jbd2/commit.c | 4 ++++ fs/jbd2/journal.c | 3 ++- fs/jbd2/transaction.c | 22 ++++++++++++++++++---- fs/ocfs2/journal.h | 2 +- include/linux/jbd2.h | 13 +++++++++++-- 6 files changed, 38 insertions(+), 9 deletions(-) diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 5f5846211095..f1c940b38b30 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -362,7 +362,8 @@ static inline int ext4_journal_force_commit(journal_t *journal) static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) { if (ext4_handle_valid(handle)) - return jbd2_journal_file_inode(handle, EXT4_I(inode)->jinode); + return jbd2_journal_inode_add_write(handle, + EXT4_I(inode)->jinode); return 0; } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 2ad98d6e19f4..70078096117d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -219,6 +219,8 @@ static int journal_submit_data_buffers(journal_t *journal, spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + if (!(jinode->i_flags & JI_WRITE_DATA)) + continue; mapping = jinode->i_vfs_inode->i_mapping; jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); @@ -256,6 +258,8 @@ static int journal_finish_inode_data_buffers(journal_t *journal, /* For locking, see the comment in journal_submit_data_buffers() */ spin_lock(&journal->j_list_lock); list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) { + if (!(jinode->i_flags & JI_WAIT_DATA)) + continue; jinode->i_flags |= JI_COMMIT_RUNNING; spin_unlock(&journal->j_list_lock); err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 435f0b26ac20..b31852f76f46 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -94,7 +94,8 @@ EXPORT_SYMBOL(jbd2_journal_blocks_per_page); EXPORT_SYMBOL(jbd2_journal_invalidatepage); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); -EXPORT_SYMBOL(jbd2_journal_file_inode); +EXPORT_SYMBOL(jbd2_journal_inode_add_write); +EXPORT_SYMBOL(jbd2_journal_inode_add_wait); EXPORT_SYMBOL(jbd2_journal_init_jbd_inode); EXPORT_SYMBOL(jbd2_journal_release_jbd_inode); EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 67c103867bf8..be56c8ca34c2 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2462,7 +2462,8 @@ void jbd2_journal_refile_buffer(journal_t *journal, struct journal_head *jh) /* * File inode in the inode list of the handle's transaction */ -int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) +static int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode, + unsigned long flags) { transaction_t *transaction = handle->h_transaction; journal_t *journal; @@ -2487,12 +2488,14 @@ int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *jinode) * and if jinode->i_next_transaction == transaction, commit code * will only file the inode where we want it. */ - if (jinode->i_transaction == transaction || - jinode->i_next_transaction == transaction) + if ((jinode->i_transaction == transaction || + jinode->i_next_transaction == transaction) && + (jinode->i_flags & flags) == flags) return 0; spin_lock(&journal->j_list_lock); - + jinode->i_flags |= flags; + /* Is inode already attached where we need it? */ if (jinode->i_transaction == transaction || jinode->i_next_transaction == transaction) goto done; @@ -2523,6 +2526,17 @@ done: return 0; } +int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *jinode) +{ + return jbd2_journal_file_inode(handle, jinode, + JI_WRITE_DATA | JI_WAIT_DATA); +} + +int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *jinode) +{ + return jbd2_journal_file_inode(handle, jinode, JI_WAIT_DATA); +} + /* * File truncate and transaction commit interact with each other in a * non-trivial way. If a transaction writing data block A is diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index f4cd3c3e9fb7..497a4171ef61 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -619,7 +619,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) { - return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode); + return jbd2_journal_inode_add_write(handle, &OCFS2_I(inode)->ip_jinode); } static inline int ocfs2_begin_ordered_truncate(struct inode *inode, diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index fd1083c46c61..39511484ad10 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -403,11 +403,19 @@ static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh) /* Flags in jbd_inode->i_flags */ #define __JI_COMMIT_RUNNING 0 -/* Commit of the inode data in progress. We use this flag to protect us from +#define __JI_WRITE_DATA 1 +#define __JI_WAIT_DATA 2 + +/* + * Commit of the inode data in progress. We use this flag to protect us from * concurrent deletion of inode. We cannot use reference to inode for this * since we cannot afford doing last iput() on behalf of kjournald */ #define JI_COMMIT_RUNNING (1 << __JI_COMMIT_RUNNING) +/* Write allocated dirty buffers in this inode before commit */ +#define JI_WRITE_DATA (1 << __JI_WRITE_DATA) +/* Wait for outstanding data writes for this inode before commit */ +#define JI_WAIT_DATA (1 << __JI_WAIT_DATA) /** * struct jbd_inode is the structure linking inodes in ordered mode @@ -1270,7 +1278,8 @@ extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); extern int jbd2_journal_force_commit_nested(journal_t *); -extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); +extern int jbd2_journal_inode_add_write(handle_t *handle, struct jbd2_inode *inode); +extern int jbd2_journal_inode_add_wait(handle_t *handle, struct jbd2_inode *inode); extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, struct jbd2_inode *inode, loff_t new_size); extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); -- cgit v1.2.3 From ee0876bc69ee8d24d524fb2e9e41e3682aaebb11 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 24 Apr 2016 00:56:08 -0400 Subject: ext4: do not ask jbd2 to write data for delalloc buffers Currently we ask jbd2 to write all dirty allocated buffers before committing a transaction when doing writeback of delay allocated blocks. However this is unnecessary since we move all pages to writeback state before dropping a transaction handle and then submit all the necessary IO. We still need the transaction commit to wait for all the outstanding writeback before flushing disk caches during transaction commit to avoid data exposure issues though. Use the new jbd2 capability and ask it to only wait for outstanding writeback during transaction commit when writing back data in ext4_writepages(). Tested-by: "HUANG Weller (CM/ESW12-CN)" Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 3 +++ fs/ext4/ext4_jbd2.h | 12 +++++++++++- fs/ext4/inode.c | 10 +++++++--- fs/ext4/move_extent.c | 2 +- 4 files changed, 22 insertions(+), 5 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index f75e9ebd4ca2..cb00b1119ec9 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -581,6 +581,9 @@ enum { #define EXT4_GET_BLOCKS_ZERO 0x0200 #define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ EXT4_GET_BLOCKS_ZERO) + /* Caller will submit data before dropping transaction handle. This + * allows jbd2 to avoid submitting data before commit. */ +#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 /* * The bit position of these flags must not overlap with any of the diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index f1c940b38b30..09c1ef38cbe6 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -359,7 +359,8 @@ static inline int ext4_journal_force_commit(journal_t *journal) return 0; } -static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) +static inline int ext4_jbd2_inode_add_write(handle_t *handle, + struct inode *inode) { if (ext4_handle_valid(handle)) return jbd2_journal_inode_add_write(handle, @@ -367,6 +368,15 @@ static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) return 0; } +static inline int ext4_jbd2_inode_add_wait(handle_t *handle, + struct inode *inode) +{ + if (ext4_handle_valid(handle)) + return jbd2_journal_inode_add_wait(handle, + EXT4_I(inode)->jinode); + return 0; +} + static inline void ext4_update_inode_fsync_trans(handle_t *handle, struct inode *inode, int datasync) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8ba46ad06aed..17bfa42ac971 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -695,7 +695,10 @@ out_sem: !(flags & EXT4_GET_BLOCKS_ZERO) && !IS_NOQUOTA(inode) && ext4_should_order_data(inode)) { - ret = ext4_jbd2_file_inode(handle, inode); + if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) + ret = ext4_jbd2_inode_add_wait(handle, inode); + else + ret = ext4_jbd2_inode_add_write(handle, inode); if (ret) return ret; } @@ -2319,7 +2322,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * the data was copied into the page cache. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | - EXT4_GET_BLOCKS_METADATA_NOFAIL; + EXT4_GET_BLOCKS_METADATA_NOFAIL | + EXT4_GET_BLOCKS_IO_SUBMIT; dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; @@ -3634,7 +3638,7 @@ static int __ext4_block_zero_page_range(handle_t *handle, err = 0; mark_buffer_dirty(bh); if (ext4_should_order_data(inode)) - err = ext4_jbd2_file_inode(handle, inode); + err = ext4_jbd2_inode_add_write(handle, inode); } unlock: diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 325cef48b39a..a920c5d29fac 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -400,7 +400,7 @@ data_copy: /* Even in case of data=writeback it is reasonable to pin * inode to transaction, to prevent unexpected data loss */ - *err = ext4_jbd2_file_inode(handle, orig_inode); + *err = ext4_jbd2_inode_add_write(handle, orig_inode); unlock_pages: unlock_page(pagep[0]); -- cgit v1.2.3 From 7b8081912d75df1d910d6969f0a374b66ef242bf Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 25 Apr 2016 23:13:17 -0400 Subject: ext4: fix jbd2 handle extension in ext4_ext_truncate_extend_restart() The function jbd2_journal_extend() takes as its argument the number of new credits to be added to the handle. We weren't taking into account the currently unused handle credits; worse, we would try to extend the handle by N credits when it had N credits available. In the case where jbd2_journal_extend() fails because the transaction is too large, when jbd2_journal_restart() gets called, the N credits owned by the handle gets returned to the transaction, and the transaction commit is asynchronously requested, and then start_this_handle() will be able to successfully attach the handle to the current transaction since the required credits are now available. This is mostly harmless, but since ext4_ext_truncate_extend_restart() returns EAGAIN, the truncate machinery will once again try to call ext4_ext_truncate_extend_restart(), which will do the above sequence over and over again until the transaction has committed. This was found while I was debugging a lockup in caused by running xfstests generic/074 in the data=journal case. I'm still not sure why we ended up looping forever, which suggests there may still be another bug hiding in the transaction accounting machinery, but this commit prevents us from looping in the first place. Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 95bf4679ac54..ba2be53a61d2 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -120,9 +120,14 @@ static int ext4_ext_truncate_extend_restart(handle_t *handle, if (!ext4_handle_valid(handle)) return 0; - if (handle->h_buffer_credits > needed) + if (handle->h_buffer_credits >= needed) return 0; - err = ext4_journal_extend(handle, needed); + /* + * If we need to extend the journal get a few extra blocks + * while we're at it for efficiency's sake. + */ + needed += 3; + err = ext4_journal_extend(handle, needed - handle->h_buffer_credits); if (err <= 0) return err; err = ext4_truncate_restart_trans(handle, inode, needed); -- cgit v1.2.3 From 4c54659269ecb799133758330e7ea2a6fa4c65ca Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 25 Apr 2016 23:21:00 -0400 Subject: ext4: handle unwritten or delalloc buffers before enabling data journaling We already allocate delalloc blocks before changing the inode mode into "per-file data journal" mode to prevent delalloc blocks from remaining not allocated, but another issue concerned with "BH_Unwritten" status still exists. For example, by fallocate(), several buffers' status change into "BH_Unwritten", but these buffers cannot be processed by ext4_alloc_da_blocks(). So, they still remain in unwritten status after per-file data journaling is enabled and they cannot be changed into written status any more and, if they are journaled and eventually checkpointed, these unwritten buffer will cause a kernel panic by the below BUG_ON() function of submit_bh_wbc() when they are submitted during checkpointing. static int submit_bh_wbc(int rw, struct buffer_head *bh,... { ... BUG_ON(buffer_unwritten(bh)); Moreover, when "dioread_nolock" option is enabled, the status of a buffer is changed into "BH_Unwritten" after write_begin() completes and the "BH_Unwritten" status will be cleared after I/O is done. Therefore, if a buffer's status is changed into unwrutten but the buffer's I/O is not submitted and completed, it can cause the same problem after enabling per-file data journaling. You can easily generate this bug by executing the following command. ./kvm-xfstests -C 10000 -m nodelalloc,dioread_nolock generic/269 To resolve these problems and define a boundary between the previous mode and per-file data journaling mode, we need to flush and wait all the I/O of buffers of a file before enabling per-file data journaling of the file. Signed-off-by: Daeho Jeong Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/inode.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 17bfa42ac971..779ef4c11bc1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5452,22 +5452,29 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return 0; if (is_journal_aborted(journal)) return -EROFS; - /* We have to allocate physical blocks for delalloc blocks - * before flushing journal. otherwise delalloc blocks can not - * be allocated any more. even more truncate on delalloc blocks - * could trigger BUG by flushing delalloc blocks in journal. - * There is no delalloc block in non-journal data mode. - */ - if (val && test_opt(inode->i_sb, DELALLOC)) { - err = ext4_alloc_da_blocks(inode); - if (err < 0) - return err; - } /* Wait for all existing dio workers */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + /* + * Before flushing the journal and switching inode's aops, we have + * to flush all dirty data the inode has. There can be outstanding + * delayed allocations, there can be unwritten extents created by + * fallocate or buffered writes in dioread_nolock mode covered by + * dirty data which can be converted only after flushing the dirty + * data (and journalled aops don't know how to handle these cases). + */ + if (val) { + down_write(&EXT4_I(inode)->i_mmap_sem); + err = filemap_write_and_wait(inode->i_mapping); + if (err < 0) { + up_write(&EXT4_I(inode)->i_mmap_sem); + ext4_inode_resume_unlocked_dio(inode); + return err; + } + } + jbd2_journal_lock_updates(journal); /* @@ -5492,6 +5499,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); + if (val) + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); /* Finally we can mark the inode as dirty. */ -- cgit v1.2.3 From c8585c6fcaf2011de54c3592e80a634a2b9e1a7f Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Mon, 25 Apr 2016 23:22:35 -0400 Subject: ext4: fix races between changing inode journal mode and ext4_writepages In ext4, there is a race condition between changing inode journal mode and ext4_writepages(). While ext4_writepages() is executed on a non-journalled mode inode, the inode's journal mode could be enabled by ioctl() and then, some pages dirtied after switching the journal mode will be still exposed to ext4_writepages() in non-journaled mode. To resolve this problem, we use fs-wide per-cpu rw semaphore by Jan Kara's suggestion because we don't want to waste ext4_inode_info's space for this extra rare case. Signed-off-by: Daeho Jeong Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/ext4.h | 4 ++++ fs/ext4/inode.c | 15 ++++++++++++--- fs/ext4/super.c | 4 ++++ kernel/locking/percpu-rwsem.c | 1 + 4 files changed, 21 insertions(+), 3 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cb00b1119ec9..ba5aecc07fbc 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -33,6 +33,7 @@ #include #include #include +#include #ifdef __KERNEL__ #include #endif @@ -1508,6 +1509,9 @@ struct ext4_sb_info { struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; + + /* Barrier between changing inodes' journal flags and writepages ops. */ + struct percpu_rw_semaphore s_journal_flag_rwsem; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 779ef4c11bc1..4d8ebbe00456 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2612,11 +2612,14 @@ static int ext4_writepages(struct address_space *mapping, struct blk_plug plug; bool give_up_on_write = false; + percpu_down_read(&sbi->s_journal_flag_rwsem); trace_ext4_writepages(inode, wbc); - if (dax_mapping(mapping)) - return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, - wbc); + if (dax_mapping(mapping)) { + ret = dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, + wbc); + goto out_writepages; + } /* * No pages to write? This is mainly a kludge to avoid starting @@ -2786,6 +2789,7 @@ retry: out_writepages: trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); + percpu_up_read(&sbi->s_journal_flag_rwsem); return ret; } @@ -5436,6 +5440,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) journal_t *journal; handle_t *handle; int err; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); /* * We have to be very careful here: changing a data block's @@ -5475,6 +5480,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) } } + percpu_down_write(&sbi->s_journal_flag_rwsem); jbd2_journal_lock_updates(journal); /* @@ -5491,6 +5497,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) err = jbd2_journal_flush(journal); if (err < 0) { jbd2_journal_unlock_updates(journal); + percpu_up_write(&sbi->s_journal_flag_rwsem); ext4_inode_resume_unlocked_dio(inode); return err; } @@ -5499,6 +5506,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_set_aops(inode); jbd2_journal_unlock_updates(journal); + percpu_up_write(&sbi->s_journal_flag_rwsem); + if (val) up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 304c712dbe12..20c5d52253b4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -859,6 +859,7 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); + percpu_free_rwsem(&sbi->s_journal_flag_rwsem); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) @@ -3930,6 +3931,9 @@ no_journal: if (!err) err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, GFP_KERNEL); + if (!err) + err = percpu_init_rwsem(&sbi->s_journal_flag_rwsem); + if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount6; diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index f231e0bb311c..bec0b647f9cc 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -37,6 +37,7 @@ void percpu_free_rwsem(struct percpu_rw_semaphore *brw) free_percpu(brw->fast_read_ctr); brw->fast_read_ctr = NULL; /* catch use after free bugs */ } +EXPORT_SYMBOL_GPL(percpu_free_rwsem); /* * This is the fast-path for down_read/up_read. If it succeeds we rely -- cgit v1.2.3 From 8d2ae1cbe8a984d7a755755fb53955de2f60a2f9 Mon Sep 17 00:00:00 2001 From: Jakub Wilk Date: Wed, 27 Apr 2016 01:11:21 -0400 Subject: ext4: remove trailing \n from ext4_warning/ext4_error calls Messages passed to ext4_warning() or ext4_error() don't need trailing newlines, because these function add the newlines themselves. Signed-off-by: Jakub Wilk --- fs/ext4/extents.c | 4 ++-- fs/ext4/extents_status.c | 2 +- fs/ext4/file.c | 2 +- fs/ext4/inline.c | 2 +- fs/ext4/mballoc.c | 2 +- fs/ext4/mmp.c | 4 ++-- fs/ext4/namei.c | 2 +- fs/ext4/resize.c | 2 +- 8 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ba2be53a61d2..c53d5a8d2a79 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2588,7 +2588,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, } } else ext4_error(sbi->s_sb, "strange request: removal(2) " - "%u-%u from %u:%u\n", + "%u-%u from %u:%u", from, to, le32_to_cpu(ex->ee_block), ee_len); return 0; } @@ -3743,7 +3743,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, if (ee_block != map->m_lblk || ee_len > map->m_len) { #ifdef EXT4_DEBUG ext4_warning("Inode (%ld) finished: extent logical block %llu," - " len %u; IO logical block %llu, len %u\n", + " len %u; IO logical block %llu, len %u", inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index e38b987ac7f5..37e059202cd2 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -707,7 +707,7 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, (status & EXTENT_STATUS_WRITTEN)) { ext4_warning(inode->i_sb, "Inserting extent [%u/%u] as " " delayed and written which can potentially " - " cause data loss.\n", lblk, len); + " cause data loss.", lblk, len); WARN_ON(1); } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index fa2208bae2e1..3e850b988923 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -378,7 +378,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ext4_encrypted_inode(d_inode(dir)) && !ext4_is_child_context_consistent_with_parent(d_inode(dir), inode)) { ext4_warning(inode->i_sb, - "Inconsistent encryption contexts: %lu/%lu\n", + "Inconsistent encryption contexts: %lu/%lu", (unsigned long) d_inode(dir)->i_ino, (unsigned long) inode->i_ino); dput(dir); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 7bc6c855cc18..ff7538c26992 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1780,7 +1780,7 @@ int empty_inline_dir(struct inode *dir, int *has_inline_data) ext4_warning(dir->i_sb, "bad inline directory (dir #%lu) - " "inode %u, rec_len %u, name_len %d" - "inline size %d\n", + "inline size %d", dir->i_ino, le32_to_cpu(de->inode), le16_to_cpu(de->rec_len), de->name_len, inline_size); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index eeeade76012e..efa111a7606d 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4935,7 +4935,7 @@ int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, * boundary. */ if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - ext4_warning(sb, "too much blocks added to group %u\n", + ext4_warning(sb, "too much blocks added to group %u", block_group); err = -EINVAL; goto error_return; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 24445275d330..23d436d6f8b8 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -121,7 +121,7 @@ void __dump_mmp_msg(struct super_block *sb, struct mmp_struct *mmp, __ext4_warning(sb, function, line, "%s", msg); __ext4_warning(sb, function, line, "MMP failure info: last update time: %llu, last update " - "node: %s, last update device: %s\n", + "node: %s, last update device: %s", (long long unsigned int) le64_to_cpu(mmp->mmp_time), mmp->mmp_nodename, mmp->mmp_bdevname); } @@ -353,7 +353,7 @@ skip: * wait for MMP interval and check mmp_seq. */ if (schedule_timeout_interruptible(HZ * wait_time) != 0) { - ext4_warning(sb, "MMP startup interrupted, failing mount\n"); + ext4_warning(sb, "MMP startup interrupted, failing mount"); goto failed; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index c07422d254b6..6e6b3230ee45 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1618,7 +1618,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi if (nokey) return ERR_PTR(-ENOKEY); ext4_warning(inode->i_sb, - "Inconsistent encryption contexts: %lu/%lu\n", + "Inconsistent encryption contexts: %lu/%lu", (unsigned long) dir->i_ino, (unsigned long) inode->i_ino); return ERR_PTR(-EPERM); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 34038e3598d5..cf681004b196 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -41,7 +41,7 @@ int ext4_resize_begin(struct super_block *sb) */ if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { ext4_warning(sb, "There are errors in the filesystem, " - "so online resizing is not allowed\n"); + "so online resizing is not allowed"); return -EPERM; } -- cgit v1.2.3 From c9eb13a9105e2e418f72e46a2b6da3f49e696902 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 30 Apr 2016 00:48:54 -0400 Subject: ext4: fix hang when processing corrupted orphaned inode list If the orphaned inode list contains inode #5, ext4_iget() returns a bad inode (since the bootloader inode should never be referenced directly). Because of the bad inode, we end up processing the inode repeatedly and this hangs the machine. This can be reproduced via: mke2fs -t ext4 /tmp/foo.img 100 debugfs -w -R "ssv last_orphan 5" /tmp/foo.img mount -o loop /tmp/foo.img /mnt (But don't do this if you are using an unpatched kernel if you care about the system staying functional. :-) This bug was found by the port of American Fuzzy Lop into the kernel to find file system problems[1]. (Since it *only* happens if inode #5 shows up on the orphan list --- 3, 7, 8, etc. won't do it, it's not surprising that AFL needed two hours before it found it.) [1] http://events.linuxfoundation.org/sites/events/files/slides/AFL%20filesystem%20fuzzing%2C%20Vault%202016_0.pdf Cc: stable@vger.kernel.org Reported by: Vegard Nossum Signed-off-by: Theodore Ts'o --- fs/ext4/ialloc.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 237b877d316d..c2caf2df3695 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1183,11 +1183,13 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) goto iget_failed; /* - * If the orphans has i_nlinks > 0 then it should be able to be - * truncated, otherwise it won't be removed from the orphan list - * during processing and an infinite loop will result. + * If the orphans has i_nlinks > 0 then it should be able to + * be truncated, otherwise it won't be removed from the orphan + * list during processing and an infinite loop will result. + * Similarly, it must not be a bad inode. */ - if (inode->i_nlink && !ext4_can_truncate(inode)) + if ((inode->i_nlink && !ext4_can_truncate(inode)) || + is_bad_inode(inode)) goto bad_orphan; if (NEXT_ORPHAN(inode) > max_ino) -- cgit v1.2.3 From 7827a7f6ebfcb7f388dc47fddd48567a314701ba Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 30 Apr 2016 00:49:54 -0400 Subject: ext4: clean up error handling when orphan list is corrupted Instead of just printing warning messages, if the orphan list is corrupted, declare the file system is corrupted. If there are any reserved inodes in the orphaned inode list, declare the file system corrupted and stop right away to avoid doing more potential damage to the file system. Cc: stable@vger.kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/ialloc.c | 49 ++++++++++++++++++++++--------------------------- 1 file changed, 22 insertions(+), 27 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index c2caf2df3695..3da4cf8d18b6 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1150,25 +1150,20 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) unsigned long max_ino = le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count); ext4_group_t block_group; int bit; - struct buffer_head *bitmap_bh; + struct buffer_head *bitmap_bh = NULL; struct inode *inode = NULL; - long err = -EIO; + int err = -EFSCORRUPTED; - /* Error cases - e2fsck has already cleaned up for us */ - if (ino > max_ino) { - ext4_warning(sb, "bad orphan ino %lu! e2fsck was run?", ino); - err = -EFSCORRUPTED; - goto error; - } + if (ino < EXT4_FIRST_INO(sb) || ino > max_ino) + goto bad_orphan; block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = ext4_read_inode_bitmap(sb, block_group); if (IS_ERR(bitmap_bh)) { - err = PTR_ERR(bitmap_bh); - ext4_warning(sb, "inode bitmap error %ld for orphan %lu", - ino, err); - goto error; + ext4_error(sb, "inode bitmap error %ld for orphan %lu", + ino, PTR_ERR(bitmap_bh)); + return (struct inode *) bitmap_bh; } /* Having the inode bit set should be a 100% indicator that this @@ -1179,8 +1174,12 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) goto bad_orphan; inode = ext4_iget(sb, ino); - if (IS_ERR(inode)) - goto iget_failed; + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + ext4_error(sb, "couldn't read orphan inode %lu (err %d)", + ino, err); + return inode; + } /* * If the orphans has i_nlinks > 0 then it should be able to @@ -1197,29 +1196,25 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) brelse(bitmap_bh); return inode; -iget_failed: - err = PTR_ERR(inode); - inode = NULL; bad_orphan: - ext4_warning(sb, "bad orphan inode %lu! e2fsck was run?", ino); - printk(KERN_WARNING "ext4_test_bit(bit=%d, block=%llu) = %d\n", - bit, (unsigned long long)bitmap_bh->b_blocknr, - ext4_test_bit(bit, bitmap_bh->b_data)); - printk(KERN_WARNING "inode=%p\n", inode); + ext4_error(sb, "bad orphan inode %lu", ino); + if (bitmap_bh) + printk(KERN_ERR "ext4_test_bit(bit=%d, block=%llu) = %d\n", + bit, (unsigned long long)bitmap_bh->b_blocknr, + ext4_test_bit(bit, bitmap_bh->b_data)); if (inode) { - printk(KERN_WARNING "is_bad_inode(inode)=%d\n", + printk(KERN_ERR "is_bad_inode(inode)=%d\n", is_bad_inode(inode)); - printk(KERN_WARNING "NEXT_ORPHAN(inode)=%u\n", + printk(KERN_ERR "NEXT_ORPHAN(inode)=%u\n", NEXT_ORPHAN(inode)); - printk(KERN_WARNING "max_ino=%lu\n", max_ino); - printk(KERN_WARNING "i_nlink=%u\n", inode->i_nlink); + printk(KERN_ERR "max_ino=%lu\n", max_ino); + printk(KERN_ERR "i_nlink=%u\n", inode->i_nlink); /* Avoid freeing blocks if we got a bad deleted inode */ if (inode->i_nlink == 0) inode->i_blocks = 0; iput(inode); } brelse(bitmap_bh); -error: return ERR_PTR(err); } -- cgit v1.2.3 From ff0bc08454917964291f72ee5b8eca66de4bc250 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Thu, 5 May 2016 10:52:38 -0400 Subject: ext4: fix check of dqget() return value in ext4_ioctl_setproject() A failed call to dqget() returns an ERR_PTR() and not null. Fix the check in ext4_ioctl_setproject() to handle this correctly. Fixes: 9b7365fc1c82 ("ext4: add FS_IOC_FSSETXATTR/FS_IOC_FSGETXATTR interface support") Cc: stable@vger.kernel.org # v4.5 Signed-off-by: Seth Forshee Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- fs/ext4/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index eae5917c534e..0acf8cacb2be 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -365,7 +365,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) struct dquot *transfer_to[MAXQUOTAS] = { }; transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); - if (transfer_to[PRJQUOTA]) { + if (!IS_ERR(transfer_to[PRJQUOTA])) { err = __dquot_transfer(inode, transfer_to); dqput(transfer_to[PRJQUOTA]); if (err) -- cgit v1.2.3 From 74177f55b70e2f2be770dd28684dd6d17106a4ba Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 5 May 2016 11:10:15 -0400 Subject: ext4: fix oops on corrupted filesystem When filesystem is corrupted in the right way, it can happen ext4_mark_iloc_dirty() in ext4_orphan_add() returns error and we subsequently remove inode from the in-memory orphan list. However this deletion is done with list_del(&EXT4_I(inode)->i_orphan) and thus we leave i_orphan list_head with a stale content. Later we can look at this content causing list corruption, oops, or other issues. The reported trace looked like: WARNING: CPU: 0 PID: 46 at lib/list_debug.c:53 __list_del_entry+0x6b/0x100() list_del corruption, 0000000061c1d6e0->next is LIST_POISON1 0000000000100100) CPU: 0 PID: 46 Comm: ext4.exe Not tainted 4.1.0-rc4+ #250 Stack: 60462947 62219960 602ede24 62219960 602ede24 603ca293 622198f0 602f02eb 62219950 6002c12c 62219900 601b4d6b Call Trace: [<6005769c>] ? vprintk_emit+0x2dc/0x5c0 [<602ede24>] ? printk+0x0/0x94 [<600190bc>] show_stack+0xdc/0x1a0 [<602ede24>] ? printk+0x0/0x94 [<602ede24>] ? printk+0x0/0x94 [<602f02eb>] dump_stack+0x2a/0x2c [<6002c12c>] warn_slowpath_common+0x9c/0xf0 [<601b4d6b>] ? __list_del_entry+0x6b/0x100 [<6002c254>] warn_slowpath_fmt+0x94/0xa0 [<602f4d09>] ? __mutex_lock_slowpath+0x239/0x3a0 [<6002c1c0>] ? warn_slowpath_fmt+0x0/0xa0 [<60023ebf>] ? set_signals+0x3f/0x50 [<600a205a>] ? kmem_cache_free+0x10a/0x180 [<602f4e88>] ? mutex_lock+0x18/0x30 [<601b4d6b>] __list_del_entry+0x6b/0x100 [<601177ec>] ext4_orphan_del+0x22c/0x2f0 [<6012f27c>] ? __ext4_journal_start_sb+0x2c/0xa0 [<6010b973>] ? ext4_truncate+0x383/0x390 [<6010bc8b>] ext4_write_begin+0x30b/0x4b0 [<6001bb50>] ? copy_from_user+0x0/0xb0 [<601aa840>] ? iov_iter_fault_in_readable+0xa0/0xc0 [<60072c4f>] generic_perform_write+0xaf/0x1e0 [<600c4166>] ? file_update_time+0x46/0x110 [<60072f0f>] __generic_file_write_iter+0x18f/0x1b0 [<6010030f>] ext4_file_write_iter+0x15f/0x470 [<60094e10>] ? unlink_file_vma+0x0/0x70 [<6009b180>] ? unlink_anon_vmas+0x0/0x260 [<6008f169>] ? free_pgtables+0xb9/0x100 [<600a6030>] __vfs_write+0xb0/0x130 [<600a61d5>] vfs_write+0xa5/0x170 [<600a63d6>] SyS_write+0x56/0xe0 [<6029fcb0>] ? __libc_waitpid+0x0/0xa0 [<6001b698>] handle_syscall+0x68/0x90 [<6002633d>] userspace+0x4fd/0x600 [<6002274f>] ? save_registers+0x1f/0x40 [<60028bd7>] ? arch_prctl+0x177/0x1b0 [<60017bd5>] fork_handler+0x85/0x90 Fix the problem by using list_del_init() as we always should with i_orphan list. CC: stable@vger.kernel.org Reported-by: Vegard Nossum Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 6e6b3230ee45..672c99a2051b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2833,7 +2833,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) * list entries can cause panics at unmount time. */ mutex_lock(&sbi->s_orphan_lock); - list_del(&EXT4_I(inode)->i_orphan); + list_del_init(&EXT4_I(inode)->i_orphan); mutex_unlock(&sbi->s_orphan_lock); } } -- cgit v1.2.3 From b5cb316cdf3a3f5f6125412b0f6065185240cfdc Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Thu, 5 May 2016 17:38:03 -0400 Subject: ext4: address UBSAN warning in mb_find_order_for_block() Currently, in mb_find_order_for_block(), there's a loop like the following: while (order <= e4b->bd_blkbits + 1) { ... bb += 1 << (e4b->bd_blkbits - order); } Note that the updated bb is used in the loop's next iteration only. However, at the last iteration, that is at order == e4b->bd_blkbits + 1, the shift count becomes negative (c.f. C99 6.5.7(3)) and UBSAN reports UBSAN: Undefined behaviour in fs/ext4/mballoc.c:1281:11 shift exponent -1 is negative [...] Call Trace: [] dump_stack+0xbc/0x117 [] ? _atomic_dec_and_lock+0x169/0x169 [] ubsan_epilogue+0xd/0x4e [] __ubsan_handle_shift_out_of_bounds+0x1fb/0x254 [] ? __ubsan_handle_load_invalid_value+0x158/0x158 [] ? ext4_mb_generate_from_pa+0x590/0x590 [] ? ext4_read_block_bitmap_nowait+0x598/0xe80 [] mb_find_order_for_block+0x1ce/0x240 [...] Unless compilers start to do some fancy transformations (which at least GCC 6.0.0 doesn't currently do), the issue is of cosmetic nature only: the such calculated value of bb is never used again. Silence UBSAN by introducing another variable, bb_incr, holding the next increment to apply to bb and adjust that one by right shifting it by one position per loop iteration. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=114701 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=112161 Cc: stable@vger.kernel.org Signed-off-by: Nicolai Stange Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index efa111a7606d..49e444b51a0c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1266,6 +1266,7 @@ static void ext4_mb_unload_buddy(struct ext4_buddy *e4b) static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) { int order = 1; + int bb_incr = 1 << (e4b->bd_blkbits - 1); void *bb; BUG_ON(e4b->bd_bitmap == e4b->bd_buddy); @@ -1278,7 +1279,8 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) /* this block is part of buddy of order 'order' */ return order; } - bb += 1 << (e4b->bd_blkbits - order); + bb += bb_incr; + bb_incr >>= 1; order++; } return 0; -- cgit v1.2.3 From 935244cd54b86ca46e69bc6604d2adfb1aec2d42 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Thu, 5 May 2016 19:46:19 -0400 Subject: ext4: silence UBSAN in ext4_mb_init() Currently, in ext4_mb_init(), there's a loop like the following: do { ... offset += 1 << (sb->s_blocksize_bits - i); i++; } while (i <= sb->s_blocksize_bits + 1); Note that the updated offset is used in the loop's next iteration only. However, at the last iteration, that is at i == sb->s_blocksize_bits + 1, the shift count becomes equal to (unsigned)-1 > 31 (c.f. C99 6.5.7(3)) and UBSAN reports UBSAN: Undefined behaviour in fs/ext4/mballoc.c:2621:15 shift exponent 4294967295 is too large for 32-bit type 'int' [...] Call Trace: [] dump_stack+0xbc/0x117 [] ? _atomic_dec_and_lock+0x169/0x169 [] ubsan_epilogue+0xd/0x4e [] __ubsan_handle_shift_out_of_bounds+0x1fb/0x254 [] ? __ubsan_handle_load_invalid_value+0x158/0x158 [] ? kmem_cache_alloc+0x101/0x390 [] ? ext4_mb_init+0x13b/0xfd0 [] ? create_cache+0x57/0x1f0 [] ? create_cache+0x11a/0x1f0 [] ? mutex_lock+0x38/0x60 [] ? mutex_unlock+0x1b/0x50 [] ? put_online_mems+0x5b/0xc0 [] ? kmem_cache_create+0x117/0x2c0 [] ext4_mb_init+0xc49/0xfd0 [...] Observe that the mentioned shift exponent, 4294967295, equals (unsigned)-1. Unless compilers start to do some fancy transformations (which at least GCC 6.0.0 doesn't currently do), the issue is of cosmetic nature only: the such calculated value of offset is never used again. Silence UBSAN by introducing another variable, offset_incr, holding the next increment to apply to offset and adjust that one by right shifting it by one position per loop iteration. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=114701 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=112161 Cc: stable@vger.kernel.org Signed-off-by: Nicolai Stange Signed-off-by: Theodore Ts'o --- fs/ext4/mballoc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 49e444b51a0c..c1ab3ec30423 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2585,7 +2585,7 @@ int ext4_mb_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned i, j; - unsigned offset; + unsigned offset, offset_incr; unsigned max; int ret; @@ -2614,11 +2614,13 @@ int ext4_mb_init(struct super_block *sb) i = 1; offset = 0; + offset_incr = 1 << (sb->s_blocksize_bits - 1); max = sb->s_blocksize << 2; do { sbi->s_mb_offsets[i] = offset; sbi->s_mb_maxs[i] = max; - offset += 1 << (sb->s_blocksize_bits - i); + offset += offset_incr; + offset_incr = offset_incr >> 1; max = max >> 1; i++; } while (i <= sb->s_blocksize_bits + 1); -- cgit v1.2.3 From 32157de29a3140bfaa74c6e0fae18c78fe84a5df Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 5 May 2016 22:09:49 -0400 Subject: ext4: remove unnecessary bio get/put ext4_io_submit() used to check for EOPNOTSUPP after bio submission, which is why it had to get an extra reference to the bio before submitting it. But since we no longer touch the bio after submission, get rid of the redundant get/put of the bio. If we do get the extra reference, we enter the slower path of having to flag this bio as now having external references. Signed-off-by: Jens Axboe Signed-off-by: Theodore Ts'o --- fs/ext4/page-io.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index e4fc8ea45d78..2a01df9cc1c3 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -342,9 +342,7 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { int io_op = io->io_wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE; - bio_get(io->io_bio); submit_bio(io_op, io->io_bio); - bio_put(io->io_bio); } io->io_bio = NULL; } -- cgit v1.2.3 From 466c3fb618b8520b75be37fcb115e9610663b945 Mon Sep 17 00:00:00 2001 From: Luis de Bethencourt Date: Thu, 5 May 2016 22:35:54 -0400 Subject: jbd2: remove excess descriptions for handle_s Commit bf6993276f74 ("jbd2: Use tracepoints for history file") removed the members j_history, j_history_max and j_history_cur from struct handle_s but the descriptions stayed lingering. Removing them. Signed-off-by: Luis de Bethencourt Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara --- include/linux/jbd2.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 39511484ad10..efb232c5f668 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -789,9 +789,6 @@ jbd2_time_diff(unsigned long start, unsigned long end) * @j_wbufsize: maximum number of buffer_heads allowed in j_wbuf, the * number that will fit in j_blocksize * @j_last_sync_writer: most recent pid which did a synchronous write - * @j_history: Buffer storing the transactions statistics history - * @j_history_max: Maximum number of transactions in the statistics history - * @j_history_cur: Current number of transactions in the statistics history * @j_history_lock: Protect the transactions statistics history * @j_proc_entry: procfs entry for the jbd statistics directory * @j_stats: Overall statistics -- cgit v1.2.3 From 816cd71b0c723a7296d14aadb8ff1ba42f6181d2 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Thu, 5 May 2016 22:43:04 -0400 Subject: ext4: remove unmeetable inconsisteny check from ext4_find_extent() ext4_find_extent(), stripped down to the parts relevant to this patch, reads as ppos = 0; i = depth; while (i) { --i; ++ppos; if (unlikely(ppos > depth)) { ... ret = -EFSCORRUPTED; goto err; } } Due to the loop's bounds, the condition ppos > depth can never be met. Remove this dead code. Signed-off-by: Nicolai Stange Signed-off-by: Theodore Ts'o --- fs/ext4/extents.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c53d5a8d2a79..2a2eef9c14e4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -912,13 +912,6 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, eh = ext_block_hdr(bh); ppos++; - if (unlikely(ppos > depth)) { - put_bh(bh); - EXT4_ERROR_INODE(inode, - "ppos %d > depth %d", ppos, depth); - ret = -EFSCORRUPTED; - goto err; - } path[ppos].p_bh = bh; path[ppos].p_hdr = eh; } -- cgit v1.2.3 From aef39ab1534d1a07061e0ee7e428499e127232eb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 13 May 2016 00:38:15 -0400 Subject: dax: call get_blocks() with create == 1 for write faults to unwritten extents Currently, __dax_fault() does not call get_blocks() callback with create argument set, when we got back unwritten extent from the initial get_blocks() call during a write fault. This is because originally filesystems were supposed to convert unwritten extents to written ones using complete_unwritten() callback. Later this was abandoned in favor of using pre-zeroed blocks however the condition whether get_blocks() needs to be called with create == 1 remained. Fix the condition so that filesystems are not forced to zero-out and convert unwritten extents when get_blocks() is called with create == 0 (which introduces unnecessary overhead for read faults and can be problematic as the filesystem may possibly be read-only). Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/dax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dax.c b/fs/dax.c index 75ba46d82a76..2494255c5785 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -667,7 +667,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (error) goto unlock_page; - if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) { + if (!buffer_mapped(&bh) && !vmf->cow_page) { if (vmf->flags & FAULT_FLAG_WRITE) { error = get_block(inode, block, &bh, 1); count_vm_event(PGMAJFAULT); -- cgit v1.2.3 From 7cb476f834d0dc2092e04eeafe9cbe509f6380eb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 13 May 2016 00:38:16 -0400 Subject: ext4: handle transient ENOSPC properly for DAX ext4_dax_get_blocks() was accidentally omitted fixing get blocks handlers to properly handle transient ENOSPC errors. Fix it now to use ext4_get_blocks_trans() helper which takes care of these errors. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 75 +++++++++++++++------------------------------------------ 1 file changed, 20 insertions(+), 55 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4d8ebbe00456..32825dee81d4 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3232,72 +3232,37 @@ static int ext4_releasepage(struct page *page, gfp_t wait) int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - int ret, err; - int credits; - struct ext4_map_blocks map; - handle_t *handle = NULL; - int flags = 0; + int ret; ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n", inode->i_ino, create); - map.m_lblk = iblock; - map.m_len = bh_result->b_size >> inode->i_blkbits; - credits = ext4_chunk_trans_blocks(inode, map.m_len); - if (create) { - flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO; - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - return ret; - } - } + if (!create) + return _ext4_get_block(inode, iblock, bh_result, 0); - ret = ext4_map_blocks(handle, inode, &map, flags); - if (create) { - err = ext4_journal_stop(handle); - if (ret >= 0 && err < 0) - ret = err; - } - if (ret <= 0) - goto out; - if (map.m_flags & EXT4_MAP_UNWRITTEN) { - int err2; + ret = ext4_get_block_trans(inode, iblock, bh_result, + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_CREATE_ZERO); + if (ret < 0) + return ret; + if (buffer_unwritten(bh_result)) { /* * We are protected by i_mmap_sem so we know block cannot go * away from under us even though we dropped i_data_sem. * Convert extent to written and write zeros there. - * - * Note: We may get here even when create == 0. */ - handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - - err = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO); - if (err < 0) - ret = err; - err2 = ext4_journal_stop(handle); - if (err2 < 0 && ret > 0) - ret = err2; - } -out: - WARN_ON_ONCE(ret == 0 && create); - if (ret > 0) { - map_bh(bh_result, inode->i_sb, map.m_pblk); - /* - * At least for now we have to clear BH_New so that DAX code - * doesn't attempt to zero blocks again in a racy way. - */ - map.m_flags &= ~EXT4_MAP_NEW; - ext4_update_bh_state(bh_result, map.m_flags); - bh_result->b_size = map.m_len << inode->i_blkbits; - ret = 0; + ret = ext4_get_block_trans(inode, iblock, bh_result, + EXT4_GET_BLOCKS_CONVERT | + EXT4_GET_BLOCKS_CREATE_ZERO); + if (ret < 0) + return ret; } - return ret; + /* + * At least for now we have to clear BH_New so that DAX code + * doesn't attempt to zero blocks again in a racy way. + */ + clear_buffer_new(bh_result); + return 0; } #endif -- cgit v1.2.3 From dbc427ce4028580f1244b5b57ca1cbea31aad1e7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 13 May 2016 00:42:40 -0400 Subject: ext4: fix race in transient ENOSPC detection When there are blocks to free in the running transaction, block allocator can return ENOSPC although the filesystem has some blocks to free. We use ext4_should_retry_alloc() to force commit of the current transaction and return whether anything was committed so that it makes sense to retry the allocation. However the transaction may get committed after block allocation fails but before we call ext4_should_retry_alloc(). So ext4_should_retry_alloc() returns false because there is nothing to commit and we wrongly return ENOSPC. Fix the race by unconditionally returning 1 from ext4_should_retry_alloc() when we tried to commit a transaction. This should not add any unnecessary retries since we had a transaction running a while ago when trying to allocate blocks and we want to retry the allocation once that transaction has committed anyway. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/balloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index fe1f50fe764f..3020fd70c392 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -610,7 +610,8 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); - return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); + jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); + return 1; } /* -- cgit v1.2.3 From 914f82a32d026884743fb3de9f6f0a5908a9d5dd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 13 May 2016 00:44:16 -0400 Subject: ext4: refactor direct IO code Currently ext4 direct IO handling is split between ext4_ext_direct_IO() and ext4_ind_direct_IO(). However the extent based function calls into the indirect based one for some cases and for example it is not able to handle file extending. Previously it was not also properly handling retries in case of ENOSPC errors. With DAX things would get even more contrieved so just refactor the direct IO code and instead of indirect / extent split do the split to read vs writes. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 2 - fs/ext4/indirect.c | 127 --------------------------------------------------- fs/ext4/inode.c | 131 ++++++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 114 insertions(+), 146 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ba5aecc07fbc..89e1bcb21341 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2587,8 +2587,6 @@ extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk, /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset); extern int ext4_ind_calc_metadata_amount(struct inode *inode, sector_t lblock); extern int ext4_ind_trans_blocks(struct inode *inode, int nrblocks); extern void ext4_ind_truncate(handle_t *, struct inode *inode); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 3027fa681de5..bc15c2c17633 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -648,133 +648,6 @@ out: return err; } -/* - * O_DIRECT for ext3 (or indirect map) based files - * - * If the O_DIRECT write will extend the file then add this inode to the - * orphan list. So recovery will truncate it back to the original size - * if the machine crashes during the write. - * - * If the O_DIRECT write is intantiating holes inside i_size and the machine - * crashes then stale disk data _may_ be exposed inside the file. But current - * VFS code falls back into buffered path in that case so we are safe. - */ -ssize_t ext4_ind_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) -{ - struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; - struct ext4_inode_info *ei = EXT4_I(inode); - handle_t *handle; - ssize_t ret; - int orphan = 0; - size_t count = iov_iter_count(iter); - int retries = 0; - - if (iov_iter_rw(iter) == WRITE) { - loff_t final_size = offset + count; - - if (final_size > inode->i_size) { - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - ret = ext4_orphan_add(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out; - } - orphan = 1; - ei->i_disksize = inode->i_size; - ext4_journal_stop(handle); - } - } - -retry: - if (iov_iter_rw(iter) == READ && ext4_should_dioread_nolock(inode)) { - /* - * Nolock dioread optimization may be dynamically disabled - * via ext4_inode_block_unlocked_dio(). Check inode's state - * while holding extra i_dio_count ref. - */ - inode_dio_begin(inode); - smp_mb(); - if (unlikely(ext4_test_inode_state(inode, - EXT4_STATE_DIOREAD_LOCK))) { - inode_dio_end(inode); - goto locked; - } - if (IS_DAX(inode)) - ret = dax_do_io(iocb, inode, iter, offset, - ext4_dio_get_block, NULL, 0); - else - ret = __blockdev_direct_IO(iocb, inode, - inode->i_sb->s_bdev, iter, - offset, ext4_dio_get_block, - NULL, NULL, 0); - inode_dio_end(inode); - } else { -locked: - if (IS_DAX(inode)) - ret = dax_do_io(iocb, inode, iter, offset, - ext4_dio_get_block, NULL, DIO_LOCKING); - else - ret = blockdev_direct_IO(iocb, inode, iter, offset, - ext4_dio_get_block); - - if (unlikely(iov_iter_rw(iter) == WRITE && ret < 0)) { - loff_t isize = i_size_read(inode); - loff_t end = offset + count; - - if (end > isize) - ext4_truncate_failed_write(inode); - } - } - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry; - - if (orphan) { - int err; - - /* Credits for sb + inode write */ - handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); - if (IS_ERR(handle)) { - /* This is really bad luck. We've written the data - * but cannot extend i_size. Bail out and pretend - * the write failed... */ - ret = PTR_ERR(handle); - if (inode->i_nlink) - ext4_orphan_del(NULL, inode); - - goto out; - } - if (inode->i_nlink) - ext4_orphan_del(handle, inode); - if (ret > 0) { - loff_t end = offset + ret; - if (end > inode->i_size) { - ei->i_disksize = end; - i_size_write(inode, end); - /* - * We're going to return a positive `ret' - * here due to non-zero-length I/O, so there's - * no way of reporting error returns from - * ext4_mark_inode_dirty() to userspace. So - * ignore it. - */ - ext4_mark_inode_dirty(handle, inode); - } - } - err = ext4_journal_stop(handle); - if (ret == 0) - ret = err; - } -out: - return ret; -} - /* * Calculate the number of metadata blocks need to reserve * to allocate a new block at @lblocks for non extent file based file diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 32825dee81d4..4879e93c91d3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3295,7 +3295,9 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, } /* - * For ext4 extent files, ext4 will do direct-io write to holes, + * Handling of direct IO writes. + * + * For ext4 extent files, ext4 will do direct-io write even to holes, * preallocated extents, and those write extend the file, no need to * fall back to buffered IO. * @@ -3313,21 +3315,37 @@ static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, * if the machine crashes during the write. * */ -static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) +static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + struct ext4_inode_info *ei = EXT4_I(inode); ssize_t ret; size_t count = iov_iter_count(iter); int overwrite = 0; get_block_t *get_block_func = NULL; int dio_flags = 0; loff_t final_size = offset + count; + int orphan = 0; + handle_t *handle; - /* Use the old path for reads and writes beyond i_size. */ - if (iov_iter_rw(iter) != WRITE || final_size > inode->i_size) - return ext4_ind_direct_IO(iocb, iter, offset); + if (final_size > inode->i_size) { + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + ret = ext4_orphan_add(handle, inode); + if (ret) { + ext4_journal_stop(handle); + goto out; + } + orphan = 1; + ei->i_disksize = inode->i_size; + ext4_journal_stop(handle); + } BUG_ON(iocb->private == NULL); @@ -3336,8 +3354,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * conversion. This also disallows race between truncate() and * overwrite DIO as i_dio_count needs to be incremented under i_mutex. */ - if (iov_iter_rw(iter) == WRITE) - inode_dio_begin(inode); + inode_dio_begin(inode); /* If we do a overwrite dio, i_mutex locking can be released */ overwrite = *((int *)iocb->private); @@ -3346,7 +3363,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, inode_unlock(inode); /* - * We could direct write to holes and fallocate. + * For extent mapped files we could direct write to holes and fallocate. * * Allocated blocks to fill the hole are marked as unwritten to prevent * parallel buffered read to expose the stale data before DIO complete @@ -3368,7 +3385,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, iocb->private = NULL; if (overwrite) get_block_func = ext4_dio_get_block_overwrite; - else if (is_sync_kiocb(iocb)) { + else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || + round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { + get_block_func = ext4_dio_get_block; + dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; + } else if (is_sync_kiocb(iocb)) { get_block_func = ext4_dio_get_block_unwritten_sync; dio_flags = DIO_LOCKING; } else { @@ -3378,10 +3399,11 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, #ifdef CONFIG_EXT4_FS_ENCRYPTION BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); #endif - if (IS_DAX(inode)) + if (IS_DAX(inode)) { + dio_flags &= ~DIO_SKIP_HOLES; ret = dax_do_io(iocb, inode, iter, offset, get_block_func, ext4_end_io_dio, dio_flags); - else + } else ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, offset, get_block_func, @@ -3401,12 +3423,87 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, ext4_clear_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN); } - if (iov_iter_rw(iter) == WRITE) - inode_dio_end(inode); + inode_dio_end(inode); /* take i_mutex locking again if we do a ovewrite dio */ if (overwrite) inode_lock(inode); + if (ret < 0 && final_size > inode->i_size) + ext4_truncate_failed_write(inode); + + /* Handle extending of i_size after direct IO write */ + if (orphan) { + int err; + + /* Credits for sb + inode write */ + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + /* This is really bad luck. We've written the data + * but cannot extend i_size. Bail out and pretend + * the write failed... */ + ret = PTR_ERR(handle); + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + + goto out; + } + if (inode->i_nlink) + ext4_orphan_del(handle, inode); + if (ret > 0) { + loff_t end = offset + ret; + if (end > inode->i_size) { + ei->i_disksize = end; + i_size_write(inode, end); + /* + * We're going to return a positive `ret' + * here due to non-zero-length I/O, so there's + * no way of reporting error returns from + * ext4_mark_inode_dirty() to userspace. So + * ignore it. + */ + ext4_mark_inode_dirty(handle, inode); + } + } + err = ext4_journal_stop(handle); + if (ret == 0) + ret = err; + } +out: + return ret; +} + +static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter, + loff_t offset) +{ + int unlocked = 0; + struct inode *inode = iocb->ki_filp->f_mapping->host; + ssize_t ret; + + if (ext4_should_dioread_nolock(inode)) { + /* + * Nolock dioread optimization may be dynamically disabled + * via ext4_inode_block_unlocked_dio(). Check inode's state + * while holding extra i_dio_count ref. + */ + inode_dio_begin(inode); + smp_mb(); + if (unlikely(ext4_test_inode_state(inode, + EXT4_STATE_DIOREAD_LOCK))) + inode_dio_end(inode); + else + unlocked = 1; + } + if (IS_DAX(inode)) { + ret = dax_do_io(iocb, inode, iter, offset, ext4_dio_get_block, + NULL, unlocked ? 0 : DIO_LOCKING); + } else { + ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, + iter, offset, ext4_dio_get_block, + NULL, NULL, + unlocked ? 0 : DIO_LOCKING); + } + if (unlocked) + inode_dio_end(inode); return ret; } @@ -3434,10 +3531,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return 0; trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_direct_IO(iocb, iter, offset); + if (iov_iter_rw(iter) == READ) + ret = ext4_direct_IO_read(iocb, iter, offset); else - ret = ext4_ind_direct_IO(iocb, iter, offset); + ret = ext4_direct_IO_write(iocb, iter, offset); trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret); return ret; } -- cgit v1.2.3 From 12735f881952c32b31bc4e433768f18489f79ec9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 13 May 2016 00:51:15 -0400 Subject: ext4: pre-zero allocated blocks for DAX IO Currently ext4 treats DAX IO the same way as direct IO. I.e., it allocates unwritten extents before IO is done and converts unwritten extents afterwards. However this way DAX IO can race with page fault to the same area: ext4_ext_direct_IO() dax_fault() dax_io() get_block() - allocates unwritten extent copy_from_iter_pmem() get_block() - converts unwritten block to written and zeroes it out ext4_convert_unwritten_extents() So data written with DAX IO gets lost. Similarly dax_new_buf() called from dax_io() can overwrite data that has been already written to the block via mmap. Fix the problem by using pre-zeroed blocks for DAX IO the same way as we use them for DAX mmap. The downside of this solution is that every allocating write writes each block twice (once zeros, once data). Fixing the race with locking is possible as well however we would need to lock-out faults for the whole range written to by DAX IO. And that is not easy to do without locking-out faults for the whole file which seems too aggressive. Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 11 +++++++++-- fs/ext4/file.c | 4 ++-- fs/ext4/inode.c | 43 +++++++++++++++++++++++++++++++++---------- 3 files changed, 44 insertions(+), 14 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 89e1bcb21341..b84aa1ca480a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -2527,8 +2527,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); +int ext4_dax_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_dio_get_block(struct inode *inode, sector_t iblock, @@ -3334,6 +3334,13 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } } +static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len) +{ + int blksize = 1 << inode->i_blkbits; + + return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize); +} + #endif /* __KERNEL__ */ #define EFSBADCRC EBADMSG /* Bad CRC detected */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 3e850b988923..37e28082885a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -207,7 +207,7 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL); + result = __dax_fault(vma, vmf, ext4_dax_get_block, NULL); if (write) { if (!IS_ERR(handle)) @@ -243,7 +243,7 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, result = VM_FAULT_SIGBUS; else result = __dax_pmd_fault(vma, addr, pmd, flags, - ext4_dax_mmap_get_block, NULL); + ext4_dax_get_block, NULL); if (write) { if (!IS_ERR(handle)) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4879e93c91d3..f9ab1e8cc416 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3229,13 +3229,17 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } #ifdef CONFIG_FS_DAX -int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +/* + * Get block function for DAX IO and mmap faults. It takes care of converting + * unwritten extents to written ones and initializes new / converted blocks + * to zeros. + */ +int ext4_dax_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { int ret; - ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n", - inode->i_ino, create); + ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create); if (!create) return _ext4_get_block(inode, iblock, bh_result, 0); @@ -3247,9 +3251,9 @@ int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, if (buffer_unwritten(bh_result)) { /* - * We are protected by i_mmap_sem so we know block cannot go - * away from under us even though we dropped i_data_sem. - * Convert extent to written and write zeros there. + * We are protected by i_mmap_sem or i_mutex so we know block + * cannot go away from under us even though we dropped + * i_data_sem. Convert extent to written and write zeros there. */ ret = ext4_get_block_trans(inode, iblock, bh_result, EXT4_GET_BLOCKS_CONVERT | @@ -3264,6 +3268,14 @@ int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, clear_buffer_new(bh_result); return 0; } +#else +/* Just define empty function, it will never get called. */ +int ext4_dax_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + BUG(); + return 0; +} #endif static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, @@ -3385,8 +3397,20 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter, iocb->private = NULL; if (overwrite) get_block_func = ext4_dio_get_block_overwrite; - else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || - round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { + else if (IS_DAX(inode)) { + /* + * We can avoid zeroing for aligned DAX writes beyond EOF. Other + * writes need zeroing either because they can race with page + * faults or because they use partial blocks. + */ + if (round_down(offset, 1<i_blkbits) >= inode->i_size && + ext4_aligned_io(inode, offset, count)) + get_block_func = ext4_dio_get_block; + else + get_block_func = ext4_dax_get_block; + dio_flags = DIO_LOCKING; + } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || + round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { get_block_func = ext4_dio_get_block; dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; } else if (is_sync_kiocb(iocb)) { @@ -3400,7 +3424,6 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter, BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); #endif if (IS_DAX(inode)) { - dio_flags &= ~DIO_SKIP_HOLES; ret = dax_do_io(iocb, inode, iter, offset, get_block_func, ext4_end_io_dio, dio_flags); } else -- cgit v1.2.3