summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/async-thread.c14
-rw-r--r--fs/btrfs/backref.c21
-rw-r--r--fs/btrfs/block-group.c4
-rw-r--r--fs/btrfs/ctree.c19
-rw-r--r--fs/btrfs/disk-io.c14
-rw-r--r--fs/btrfs/extent_io.c6
-rw-r--r--fs/btrfs/extent_io.h2
-rw-r--r--fs/btrfs/inode.c11
-rw-r--r--fs/btrfs/ioctl.c6
-rw-r--r--fs/btrfs/qgroup.c47
-rw-r--r--fs/btrfs/root-tree.c3
-rw-r--r--fs/btrfs/send.c4
-rw-r--r--fs/btrfs/tree-log.c75
-rw-r--r--fs/btrfs/volumes.c2
14 files changed, 194 insertions, 34 deletions
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 11be02459b87..eb592b92aa9c 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -237,6 +237,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq,
ordered_list);
if (!test_bit(WORK_DONE_BIT, &work->flags))
break;
+ /*
+ * Orders all subsequent loads after reading WORK_DONE_BIT,
+ * paired with the smp_mb__before_atomic in btrfs_work_helper
+ * this guarantees that the ordered function will see all
+ * updates from ordinary work function.
+ */
+ smp_rmb();
/*
* we are going to call the ordered done function, but
@@ -325,6 +332,13 @@ static void btrfs_work_helper(struct work_struct *normal_work)
thresh_exec_hook(wq);
work->func(work);
if (need_order) {
+ /*
+ * Ensures all memory accesses done in the work function are
+ * ordered before setting the WORK_DONE_BIT. Ensuring the thread
+ * which is going to executed the ordered work sees them.
+ * Pairs with the smp_rmb in run_ordered_work.
+ */
+ smp_mb__before_atomic();
set_bit(WORK_DONE_BIT, &work->flags);
run_ordered_work(wq, work);
}
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 7f644a58db51..c701a19fac53 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1208,7 +1208,12 @@ again:
ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret == 0);
+ if (ret == 0) {
+ /* This shouldn't happen, indicates a bug or fs corruption. */
+ ASSERT(ret != 0);
+ ret = -EUCLEAN;
+ goto out;
+ }
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (trans && likely(trans->type != __TRANS_DUMMY) &&
@@ -1356,10 +1361,18 @@ again:
goto out;
if (!ret && extent_item_pos) {
/*
- * we've recorded that parent, so we must extend
- * its inode list here
+ * We've recorded that parent, so we must extend
+ * its inode list here.
+ *
+ * However if there was corruption we may not
+ * have found an eie, return an error in this
+ * case.
*/
- BUG_ON(!eie);
+ ASSERT(eie);
+ if (!eie) {
+ ret = -EUCLEAN;
+ goto out;
+ }
while (eie->next)
eie = eie->next;
eie->next = ref->inode_list;
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index e98d6ea35ea8..bcf19dfb0af3 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2388,7 +2388,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_path *path = NULL;
LIST_HEAD(dirty);
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
int loops = 0;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -2455,7 +2454,6 @@ again:
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
/*
@@ -2556,7 +2554,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
int should_put;
struct btrfs_path *path;
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
path = btrfs_alloc_path();
if (!path)
@@ -2614,7 +2611,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
list_add_tail(&cache->io_list, io);
} else {
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index dac30b00d14b..822c615840e8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2658,12 +2658,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
- int root_lock;
+ int root_lock = 0;
int level = 0;
- /* We try very hard to do read locks on the root */
- root_lock = BTRFS_READ_LOCK;
-
if (p->search_commit_root) {
/*
* The commit roots are read only so we always do read locks,
@@ -2701,6 +2698,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
goto out;
}
+ /* We try very hard to do read locks on the root */
+ root_lock = BTRFS_READ_LOCK;
+
/*
* If the level is set to maximum, we can skip trying to get the read
* lock.
@@ -2727,6 +2727,17 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
level = btrfs_header_level(b);
out:
+ /*
+ * The root may have failed to write out at some point, and thus is no
+ * longer valid, return an error in this case.
+ */
+ if (!extent_buffer_uptodate(b)) {
+ if (root_lock)
+ btrfs_tree_unlock_rw(b, root_lock);
+ free_extent_buffer(b);
+ return ERR_PTR(-EIO);
+ }
+
p->nodes[level] = b;
if (!p->skip_locking)
p->locks[level] = root_lock;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1499531bc151..f18c6d97932e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -3636,11 +3636,23 @@ static void btrfs_end_empty_barrier(struct bio *bio)
*/
static void write_dev_flush(struct btrfs_device *device)
{
- struct request_queue *q = bdev_get_queue(device->bdev);
struct bio *bio = device->flush_bio;
+#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ /*
+ * When a disk has write caching disabled, we skip submission of a bio
+ * with flush and sync requests before writing the superblock, since
+ * it's not needed. However when the integrity checker is enabled, this
+ * results in reports that there are metadata blocks referred by a
+ * superblock that were not properly flushed. So don't skip the bio
+ * submission only when the integrity checker is enabled for the sake
+ * of simplicity, since this is a debug tool and not meant for use in
+ * non-debug builds.
+ */
+ struct request_queue *q = bdev_get_queue(device->bdev);
if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
return;
+#endif
bio_reset(bio);
bio->bi_end_io = btrfs_end_empty_barrier;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9108a73423f7..95ddeb477797 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3755,6 +3755,12 @@ static void set_btree_ioerr(struct page *page)
return;
/*
+ * A read may stumble upon this buffer later, make sure that it gets an
+ * error and knows there was an error.
+ */
+ clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+
+ /*
* If we error out, we should add back the dirty_metadata_bytes
* to make it consistent.
*/
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index fcf1807cc8dd..c8def2bdf247 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -202,7 +202,7 @@ struct extent_buffer {
*/
struct extent_changeset {
/* How many bytes are set/cleared in this operation */
- unsigned int bytes_changed;
+ u64 bytes_changed;
/* Changed ranges */
struct ulist range_changed;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b859ed50cf46..7755a0362a3a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -10808,9 +10808,19 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis,
struct btrfs_swap_info *bsi)
{
unsigned long nr_pages;
+ unsigned long max_pages;
u64 first_ppage, first_ppage_reported, next_ppage;
int ret;
+ /*
+ * Our swapfile may have had its size extended after the swap header was
+ * written. In that case activating the swapfile should not go beyond
+ * the max size set in the swap header.
+ */
+ if (bsi->nr_pages >= sis->max)
+ return 0;
+
+ max_pages = sis->max - bsi->nr_pages;
first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
PAGE_SIZE) >> PAGE_SHIFT;
@@ -10818,6 +10828,7 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis,
if (first_ppage >= next_ppage)
return 0;
nr_pages = next_ppage - first_ppage;
+ nr_pages = min(nr_pages, max_pages);
first_ppage_reported = first_ppage;
if (bsi->start == 0)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e9d3eb7f0e2b..675112aa998f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3027,10 +3027,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
inode_lock(inode);
err = btrfs_delete_subvolume(dir, dentry);
inode_unlock(inode);
- if (!err) {
- fsnotify_rmdir(dir, dentry);
- d_delete(dentry);
- }
+ if (!err)
+ d_delete_notify(dir, dentry);
out_dput:
dput(dentry);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index bb034e19a2a8..5a3006c75d63 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -890,6 +890,14 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
int ret = 0;
int slot;
+ /*
+ * We need to have subvol_sem write locked, to prevent races between
+ * concurrent tasks trying to enable quotas, because we will unlock
+ * and relock qgroup_ioctl_lock before setting fs_info->quota_root
+ * and before setting BTRFS_FS_QUOTA_ENABLED.
+ */
+ lockdep_assert_held_write(&fs_info->subvol_sem);
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (fs_info->quota_root)
goto out;
@@ -1035,8 +1043,19 @@ out_add_root:
goto out_free_path;
}
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ /*
+ * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
+ * a deadlock with tasks concurrently doing other qgroup operations, such
+ * adding/removing qgroups or adding/deleting qgroup relations for example,
+ * because all qgroup operations first start or join a transaction and then
+ * lock the qgroup_ioctl_lock mutex.
+ * We are safe from a concurrent task trying to enable quotas, by calling
+ * this function, since we are serialized by fs_info->subvol_sem.
+ */
ret = btrfs_commit_transaction(trans);
trans = NULL;
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (ret)
goto out_free_path;
@@ -1086,12 +1105,34 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
struct btrfs_trans_handle *trans = NULL;
int ret = 0;
+ /*
+ * We need to have subvol_sem write locked, to prevent races between
+ * concurrent tasks trying to disable quotas, because we will unlock
+ * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
+ */
+ lockdep_assert_held_write(&fs_info->subvol_sem);
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
goto out;
+
+ /*
+ * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
+ * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
+ * to lock that mutex while holding a transaction handle and the rescan
+ * worker needs to commit a transaction.
+ */
mutex_unlock(&fs_info->qgroup_ioctl_lock);
/*
+ * Request qgroup rescan worker to complete and wait for it. This wait
+ * must be done before transaction start for quota disable since it may
+ * deadlock with transaction by the qgroup rescan worker.
+ */
+ clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ btrfs_qgroup_wait_for_completion(fs_info, false);
+
+ /*
* 1 For the root item
*
* We should also reserve enough items for the quota tree deletion in
@@ -1106,14 +1147,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
goto out;
}
if (!fs_info->quota_root)
goto out;
- clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
- btrfs_qgroup_wait_for_completion(fs_info, false);
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
@@ -3285,6 +3325,9 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
btrfs_warn(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
ret = -EINVAL;
+ } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+ /* Quota disable is in progress */
+ ret = -EBUSY;
}
if (ret) {
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 612411c74550..0d07ebe511e7 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -371,7 +371,8 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
key.offset = ref_id;
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- BUG_ON(ret < 0);
+ if (ret < 0)
+ goto out;
if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index fc688af57c23..e258fc484cea 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -5005,6 +5005,10 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
+ btrfs_err(fs_info,
+ "send: IO error at offset %llu for inode %llu root %llu",
+ page_offset(page), sctx->cur_ino,
+ sctx->send_root->root_key.objectid);
put_page(page);
ret = -EIO;
break;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7bc4477d7ee7..b7bfecfc2ea3 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1308,6 +1308,15 @@ again:
inode, name, namelen);
kfree(name);
iput(dir);
+ /*
+ * Whenever we need to check if a name exists or not, we
+ * check the subvolume tree. So after an unlink we must
+ * run delayed items, so that future checks for a name
+ * during log replay see that the name does not exists
+ * anymore.
+ */
+ if (!ret)
+ ret = btrfs_run_delayed_items(trans);
if (ret)
goto out;
goto again;
@@ -1559,6 +1568,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
*/
if (!ret && inode->i_nlink == 0)
inc_nlink(inode);
+ /*
+ * Whenever we need to check if a name exists or
+ * not, we check the subvolume tree. So after an
+ * unlink we must run delayed items, so that future
+ * checks for a name during log replay see that the
+ * name does not exists anymore.
+ */
+ if (!ret)
+ ret = btrfs_run_delayed_items(trans);
}
if (ret < 0)
goto out;
@@ -4249,7 +4267,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
/*
* Log all prealloc extents beyond the inode's i_size to make sure we do not
- * lose them after doing a fast fsync and replaying the log. We scan the
+ * lose them after doing a full/fast fsync and replaying the log. We scan the
* subvolume's root instead of iterating the inode's extent map tree because
* otherwise we can log incorrect extent items based on extent map conversion.
* That can happen due to the fact that extent maps are merged when they
@@ -5042,6 +5060,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx,
bool *need_log_inode_item)
{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
struct btrfs_root *root = inode->root;
int ins_start_slot = 0;
int ins_nr = 0;
@@ -5062,13 +5081,21 @@ again:
if (min_key->type > max_key->type)
break;
- if (min_key->type == BTRFS_INODE_ITEM_KEY)
+ if (min_key->type == BTRFS_INODE_ITEM_KEY) {
*need_log_inode_item = false;
-
- if ((min_key->type == BTRFS_INODE_REF_KEY ||
- min_key->type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid &&
- !recursive_logging) {
+ } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
+ min_key->offset >= i_size) {
+ /*
+ * Extents at and beyond eof are logged with
+ * btrfs_log_prealloc_extents().
+ * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
+ * and no keys greater than that, so bail out.
+ */
+ break;
+ } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
+ min_key->type == BTRFS_INODE_EXTREF_KEY) &&
+ inode->generation == trans->transid &&
+ !recursive_logging) {
u64 other_ino = 0;
u64 other_parent = 0;
@@ -5099,10 +5126,8 @@ again:
btrfs_release_path(path);
goto next_key;
}
- }
-
- /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
- if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
if (ins_nr == 0)
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
@@ -5155,9 +5180,21 @@ next_key:
break;
}
}
- if (ins_nr)
+ if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
ins_nr, inode_only, logged_isize);
+ if (ret)
+ return ret;
+ }
+
+ if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
+ /*
+ * Release the path because otherwise we might attempt to double
+ * lock the same leaf with btrfs_log_prealloc_extents() below.
+ */
+ btrfs_release_path(path);
+ ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
+ }
return ret;
}
@@ -5258,6 +5295,18 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
}
/*
+ * For symlinks, we must always log their content, which is stored in an
+ * inline extent, otherwise we could end up with an empty symlink after
+ * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
+ * one attempts to create an empty symlink).
+ * We don't need to worry about flushing delalloc, because when we create
+ * the inline extent when the symlink is created (we never have delalloc
+ * for symlinks).
+ */
+ if (S_ISLNK(inode->vfs_inode.i_mode))
+ inode_only = LOG_INODE_ALL;
+
+ /*
* a brute force approach to making sure we get the most uptodate
* copies of everything.
*/
@@ -5670,7 +5719,7 @@ process_leaf:
}
ctx->log_new_dentries = false;
- if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
+ if (type == BTRFS_FT_DIR)
log_mode = LOG_INODE_ALL;
ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
log_mode, 0, LLONG_MAX, ctx);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 344d18de1f08..8898682c9103 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4320,10 +4320,12 @@ static int balance_kthread(void *data)
struct btrfs_fs_info *fs_info = data;
int ret = 0;
+ sb_start_write(fs_info->sb);
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl)
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
mutex_unlock(&fs_info->balance_mutex);
+ sb_end_write(fs_info->sb);
return ret;
}