summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/backref.c1
-rw-r--r--fs/btrfs/block-group.c61
-rw-r--r--fs/btrfs/block-rsv.c3
-rw-r--r--fs/btrfs/ctree.c6
-rw-r--r--fs/btrfs/ctree.h15
-rw-r--r--fs/btrfs/delayed-inode.c2
-rw-r--r--fs/btrfs/dev-replace.c40
-rw-r--r--fs/btrfs/disk-io.c25
-rw-r--r--fs/btrfs/export.c8
-rw-r--r--fs/btrfs/export.h5
-rw-r--r--fs/btrfs/extent-tree.c100
-rw-r--r--fs/btrfs/extent_io.c71
-rw-r--r--fs/btrfs/extent_io.h6
-rw-r--r--fs/btrfs/file-item.c6
-rw-r--r--fs/btrfs/file.c33
-rw-r--r--fs/btrfs/free-space-cache.c6
-rw-r--r--fs/btrfs/inode.c400
-rw-r--r--fs/btrfs/ioctl.c58
-rw-r--r--fs/btrfs/print-tree.c12
-rw-r--r--fs/btrfs/qgroup.c28
-rw-r--r--fs/btrfs/qgroup.h2
-rw-r--r--fs/btrfs/ref-verify.c2
-rw-r--r--fs/btrfs/relocation.c68
-rw-r--r--fs/btrfs/scrub.c124
-rw-r--r--fs/btrfs/send.c86
-rw-r--r--fs/btrfs/space-info.c95
-rw-r--r--fs/btrfs/space-info.h4
-rw-r--r--fs/btrfs/super.c62
-rw-r--r--fs/btrfs/sysfs.c3
-rw-r--r--fs/btrfs/transaction.c90
-rw-r--r--fs/btrfs/transaction.h15
-rw-r--r--fs/btrfs/tree-checker.c42
-rw-r--r--fs/btrfs/tree-log.c332
-rw-r--r--fs/btrfs/volumes.c87
34 files changed, 1228 insertions, 670 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index e5d85311d5d5..86e280edf804 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1422,6 +1422,7 @@ static int btrfs_find_all_roots_safe(struct btrfs_trans_handle *trans,
if (ret < 0 && ret != -ENOENT) {
ulist_free(tmp);
ulist_free(*roots);
+ *roots = NULL;
return ret;
}
node = ulist_next(tmp, &uiter);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 2fead6c3c687..ace49a999ece 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -910,7 +910,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
- goto out_put_group;
+ goto out;
}
/*
@@ -948,7 +948,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
if (ret) {
btrfs_add_delayed_iput(inode);
- goto out_put_group;
+ goto out;
}
clear_nlink(inode);
/* One for the block groups ref */
@@ -971,13 +971,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
if (ret < 0)
- goto out_put_group;
+ goto out;
if (ret > 0)
btrfs_release_path(path);
if (ret == 0) {
ret = btrfs_del_item(trans, tree_root, path);
if (ret)
- goto out_put_group;
+ goto out;
btrfs_release_path(path);
}
@@ -986,6 +986,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
&fs_info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
+ /* Once for the block groups rbtree */
+ btrfs_put_block_group(block_group);
+
if (fs_info->first_logical_byte == block_group->key.objectid)
fs_info->first_logical_byte = (u64)-1;
spin_unlock(&fs_info->block_group_cache_lock);
@@ -1094,10 +1097,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
ret = remove_block_group_free_space(trans, block_group);
if (ret)
- goto out_put_group;
-
- /* Once for the block groups rbtree */
- btrfs_put_block_group(block_group);
+ goto out;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0)
@@ -1120,10 +1120,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
free_extent_map(em);
}
-out_put_group:
+out:
/* Once for the lookup reference */
btrfs_put_block_group(block_group);
-out:
if (remove_rsv)
btrfs_delayed_refs_rsv_release(fs_info, 1);
btrfs_free_path(path);
@@ -1167,7 +1166,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
free_extent_map(em);
return btrfs_start_transaction_fallback_global_rsv(fs_info->extent_root,
- num_items, 1);
+ num_items);
}
/*
@@ -1187,7 +1186,6 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
{
struct btrfs_space_info *sinfo = cache->space_info;
u64 num_bytes;
- u64 sinfo_used;
u64 min_allocable_bytes;
int ret = -ENOSPC;
@@ -1214,20 +1212,38 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
num_bytes = cache->key.offset - cache->reserved - cache->pinned -
cache->bytes_super - btrfs_block_group_used(&cache->item);
- sinfo_used = btrfs_space_info_used(sinfo, true);
/*
- * sinfo_used + num_bytes should always <= sinfo->total_bytes.
- *
- * Here we make sure if we mark this bg RO, we still have enough
- * free space as buffer (if min_allocable_bytes is not 0).
+ * Data never overcommits, even in mixed mode, so do just the straight
+ * check of left over space in how much we have allocated.
*/
- if (sinfo_used + num_bytes + min_allocable_bytes <=
- sinfo->total_bytes) {
+ if (force) {
+ ret = 0;
+ } else if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA) {
+ u64 sinfo_used = btrfs_space_info_used(sinfo, true);
+
+ /*
+ * Here we make sure if we mark this bg RO, we still have enough
+ * free space as buffer.
+ */
+ if (sinfo_used + num_bytes <= sinfo->total_bytes)
+ ret = 0;
+ } else {
+ /*
+ * We overcommit metadata, so we need to do the
+ * btrfs_can_overcommit check here, and we need to pass in
+ * BTRFS_RESERVE_NO_FLUSH to give ourselves the most amount of
+ * leeway to allow us to mark this block group as read only.
+ */
+ if (btrfs_can_overcommit(cache->fs_info, sinfo, num_bytes,
+ BTRFS_RESERVE_NO_FLUSH))
+ ret = 0;
+ }
+
+ if (!ret) {
sinfo->bytes_readonly += num_bytes;
cache->ro++;
list_add_tail(&cache->ro_list, &sinfo->ro_bgs);
- ret = 0;
}
out:
spin_unlock(&cache->lock);
@@ -1236,9 +1252,6 @@ out:
btrfs_info(cache->fs_info,
"unable to make block group %llu ro",
cache->key.objectid);
- btrfs_info(cache->fs_info,
- "sinfo_used=%llu bg_num_bytes=%llu min_allocable=%llu",
- sinfo_used, num_bytes, min_allocable_bytes);
btrfs_dump_space_info(cache->fs_info, cache->space_info, 0, 0);
}
return ret;
@@ -2169,7 +2182,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
return 0;
}
- if (trans->aborted)
+ if (TRANS_ABORTED(trans))
return 0;
again:
inode = lookup_free_space_inode(block_group, path);
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index d07bd41a7c1e..343400d49bd1 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -5,6 +5,7 @@
#include "block-rsv.h"
#include "space-info.h"
#include "transaction.h"
+#include "block-group.h"
static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv,
@@ -313,6 +314,8 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
else
block_rsv->full = 0;
+ if (block_rsv->size >= sinfo->total_bytes)
+ sinfo->force_alloc = CHUNK_ALLOC_FORCE;
spin_unlock(&block_rsv->lock);
spin_unlock(&sinfo->lock);
}
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index a989105d39c8..c05127f50637 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1339,6 +1339,8 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
btrfs_tree_read_unlock_blocking(eb);
free_extent_buffer(eb);
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
+ eb_rewin, btrfs_header_level(eb_rewin));
btrfs_tree_read_lock(eb_rewin);
__tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
WARN_ON(btrfs_header_nritems(eb_rewin) >
@@ -1412,7 +1414,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (!eb)
return NULL;
- btrfs_tree_read_lock(eb);
if (old_root) {
btrfs_set_header_bytenr(eb, eb->start);
btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
@@ -1420,6 +1421,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
btrfs_set_header_level(eb, old_root->level);
btrfs_set_header_generation(eb, old_generation);
}
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
+ btrfs_header_level(eb));
+ btrfs_tree_read_lock(eb);
if (tm)
__tree_mod_log_rewind(fs_info, eb, time_seq, tm);
else
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 169075550a5a..23b4f38e2392 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -940,6 +940,8 @@ enum {
BTRFS_ROOT_DEAD_RELOC_TREE,
/* Mark dead root stored on device whose cleanup needs to be resumed */
BTRFS_ROOT_DEAD_TREE,
+ /* The root has a log tree. Used only for subvolume roots. */
+ BTRFS_ROOT_HAS_LOG_TREE,
};
/*
@@ -988,8 +990,10 @@ struct btrfs_root {
wait_queue_head_t log_writer_wait;
wait_queue_head_t log_commit_wait[2];
struct list_head log_ctxs[2];
+ /* Used only for log trees of subvolumes, not for the log root tree */
atomic_t log_writers;
atomic_t log_commit[2];
+ /* Used only for log trees of subvolumes, not for the log root tree */
atomic_t log_batch;
int log_transid;
/* No matter the commit succeeds or not*/
@@ -2411,7 +2415,7 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 num_bytes);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
- u64 objectid, u64 offset, u64 bytenr);
+ u64 objectid, u64 offset, u64 bytenr, bool strict);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
@@ -2465,6 +2469,7 @@ enum btrfs_reserve_flush_enum {
BTRFS_RESERVE_FLUSH_LIMIT,
BTRFS_RESERVE_FLUSH_EVICT,
BTRFS_RESERVE_FLUSH_ALL,
+ BTRFS_RESERVE_FLUSH_ALL_STEAL,
};
enum btrfs_flush_state {
@@ -2816,7 +2821,7 @@ struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes);
+ u64 *ram_bytes, bool strict);
void __btrfs_del_delalloc_inode(struct btrfs_root *root,
struct btrfs_inode *inode);
@@ -2951,6 +2956,8 @@ int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
loff_t btrfs_remap_file_range(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t len, unsigned int remap_flags);
+int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes);
/* tree-defrag.c */
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -2960,6 +2967,8 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
unsigned long new_flags);
int btrfs_sync_fs(struct super_block *sb, int wait);
+char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+ u64 subvol_objectid);
static inline __printf(2, 3) __cold
void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
@@ -3161,7 +3170,7 @@ do { \
/* Report first abort since mount */ \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
- if ((errno) != -EIO) { \
+ if ((errno) != -EIO && (errno) != -EROFS) { \
WARN(1, KERN_DEBUG \
"BTRFS: Transaction aborted (error %d)\n", \
(errno)); \
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 5bcccfbcc7c1..a34ee9c2f315 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1151,7 +1151,7 @@ static int __btrfs_run_delayed_items(struct btrfs_trans_handle *trans, int nr)
int ret = 0;
bool count = (nr > 0);
- if (trans->aborted)
+ if (TRANS_ABORTED(trans))
return -EIO;
path = btrfs_alloc_path();
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 48890826b5e6..196bd241e701 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -562,6 +562,37 @@ static void btrfs_rm_dev_replace_unblocked(struct btrfs_fs_info *fs_info)
wake_up(&fs_info->dev_replace.replace_wait);
}
+/*
+ * When finishing the device replace, before swapping the source device with the
+ * target device we must update the chunk allocation state in the target device,
+ * as it is empty because replace works by directly copying the chunks and not
+ * through the normal chunk allocation path.
+ */
+static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev,
+ struct btrfs_device *tgtdev)
+{
+ struct extent_state *cached_state = NULL;
+ u64 start = 0;
+ u64 found_start;
+ u64 found_end;
+ int ret = 0;
+
+ lockdep_assert_held(&srcdev->fs_info->chunk_mutex);
+
+ while (!find_first_extent_bit(&srcdev->alloc_state, start,
+ &found_start, &found_end,
+ CHUNK_ALLOCATED, &cached_state)) {
+ ret = set_extent_bits(&tgtdev->alloc_state, found_start,
+ found_end, CHUNK_ALLOCATED);
+ if (ret)
+ break;
+ start = found_end + 1;
+ }
+
+ free_extent_state(cached_state);
+ return ret;
+}
+
static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
@@ -636,8 +667,14 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
dev_replace->time_stopped = ktime_get_real_seconds();
dev_replace->item_needs_writeback = 1;
- /* replace old device with new one in mapping tree */
+ /*
+ * Update allocation state in the new device and replace the old device
+ * with the new one in the mapping tree.
+ */
if (!scrub_ret) {
+ scrub_ret = btrfs_set_target_alloc_state(src_device, tgt_device);
+ if (scrub_ret)
+ goto error;
btrfs_dev_replace_update_device_in_mapping_tree(fs_info,
src_device,
tgt_device);
@@ -648,6 +685,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
+error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 273d1ccdd45d..cd65ef7c7c3f 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -649,16 +649,15 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
goto err;
if (memcmp_extent_buffer(eb, result, 0, csum_size)) {
- u32 val;
- u32 found = 0;
-
- memcpy(&found, result, csum_size);
+ u8 val[BTRFS_CSUM_SIZE] = { 0 };
read_extent_buffer(eb, &val, 0, csum_size);
btrfs_warn_rl(fs_info,
- "%s checksum verify failed on %llu wanted %x found %x level %d",
+ "%s checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d",
fs_info->sb->s_id, eb->start,
- val, found, btrfs_header_level(eb));
+ CSUM_FMT_VALUE(csum_size, val),
+ CSUM_FMT_VALUE(csum_size, result),
+ btrfs_header_level(eb));
ret = -EUCLEAN;
goto err;
}
@@ -1475,9 +1474,16 @@ int btrfs_init_fs_root(struct btrfs_root *root)
spin_lock_init(&root->ino_cache_lock);
init_waitqueue_head(&root->ino_cache_wait);
- ret = get_anon_bdev(&root->anon_dev);
- if (ret)
- goto fail;
+ /*
+ * Don't assign anonymous block device to roots that are not exposed to
+ * userspace, the id pool is limited to 1M
+ */
+ if (is_fstree(root->root_key.objectid) &&
+ btrfs_root_refs(&root->root_item) > 0) {
+ ret = get_anon_bdev(&root->anon_dev);
+ if (ret)
+ goto fail;
+ }
mutex_lock(&root->objectid_mutex);
ret = btrfs_find_highest_objectid(root,
@@ -4470,6 +4476,7 @@ static void btrfs_cleanup_bg_io(struct btrfs_block_group_cache *cache)
cache->io_ctl.inode = NULL;
iput(inode);
}
+ ASSERT(cache->io_ctl.pages == NULL);
btrfs_put_block_group(cache);
}
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index ddf28ecf17f9..93cceeba484c 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -57,9 +57,9 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
return type;
}
-static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation,
- int check_generation)
+struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+ u64 root_objectid, u32 generation,
+ int check_generation)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root;
@@ -152,7 +152,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
}
-static struct dentry *btrfs_get_parent(struct dentry *child)
+struct dentry *btrfs_get_parent(struct dentry *child)
{
struct inode *dir = d_inode(child);
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
index 57488ecd7d4e..f32f4113c976 100644
--- a/fs/btrfs/export.h
+++ b/fs/btrfs/export.h
@@ -18,4 +18,9 @@ struct btrfs_fid {
u64 parent_root_objectid;
} __attribute__ ((packed));
+struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
+ u64 root_objectid, u32 generation,
+ int check_generation);
+struct dentry *btrfs_get_parent(struct dentry *child);
+
#endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 47ecf7216b3e..388449101705 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -402,12 +402,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
ASSERT(eb->fs_info);
/*
- * Every shared one has parent tree
- * block, which must be aligned to
- * nodesize.
+ * Every shared one has parent tree block,
+ * which must be aligned to sector size.
*/
if (offset &&
- IS_ALIGNED(offset, eb->fs_info->nodesize))
+ IS_ALIGNED(offset, eb->fs_info->sectorsize))
return type;
}
} else if (is_data == BTRFS_REF_TYPE_DATA) {
@@ -416,12 +415,11 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
if (type == BTRFS_SHARED_DATA_REF_KEY) {
ASSERT(eb->fs_info);
/*
- * Every shared one has parent tree
- * block, which must be aligned to
- * nodesize.
+ * Every shared one has parent tree block,
+ * which must be aligned to sector size.
*/
if (offset &&
- IS_ALIGNED(offset, eb->fs_info->nodesize))
+ IS_ALIGNED(offset, eb->fs_info->sectorsize))
return type;
}
} else {
@@ -431,8 +429,9 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
}
btrfs_print_leaf((struct extent_buffer *)eb);
- btrfs_err(eb->fs_info, "eb %llu invalid extent inline ref type %d",
- eb->start, type);
+ btrfs_err(eb->fs_info,
+ "eb %llu iref 0x%lx invalid extent inline ref type %d",
+ eb->start, (unsigned long)iref, type);
WARN_ON(1);
return BTRFS_REF_TYPE_INVALID;
@@ -1306,8 +1305,10 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 num_bytes, u64 *actual_bytes)
{
- int ret;
+ int ret = 0;
u64 discarded_bytes = 0;
+ u64 end = bytenr + num_bytes;
+ u64 cur = bytenr;
struct btrfs_bio *bbio = NULL;
@@ -1316,15 +1317,23 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
* associated to its stripes that don't go away while we are discarding.
*/
btrfs_bio_counter_inc_blocked(fs_info);
- /* Tell the block device(s) that the sectors can be discarded */
- ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, bytenr, &num_bytes,
- &bbio, 0);
- /* Error condition is -ENOMEM */
- if (!ret) {
- struct btrfs_bio_stripe *stripe = bbio->stripes;
+ while (cur < end) {
+ struct btrfs_bio_stripe *stripe;
int i;
+ num_bytes = end - cur;
+ /* Tell the block device(s) that the sectors can be discarded */
+ ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
+ &num_bytes, &bbio, 0);
+ /*
+ * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
+ * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
+ * thus we can't continue anyway.
+ */
+ if (ret < 0)
+ goto out;
+ stripe = bbio->stripes;
for (i = 0; i < bbio->num_stripes; i++, stripe++) {
u64 bytes;
struct request_queue *req_q;
@@ -1341,10 +1350,19 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
stripe->physical,
stripe->length,
&bytes);
- if (!ret)
+ if (!ret) {
discarded_bytes += bytes;
- else if (ret != -EOPNOTSUPP)
- break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
+ } else if (ret != -EOPNOTSUPP) {
+ /*
+ * Logic errors or -ENOMEM, or -EIO, but
+ * unlikely to happen.
+ *
+ * And since there are two loops, explicitly
+ * go to out to avoid confusion.
+ */
+ btrfs_put_bbio(bbio);
+ goto out;
+ }
/*
* Just in case we get back EOPNOTSUPP for some reason,
@@ -1354,7 +1372,9 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
ret = 0;
}
btrfs_put_bbio(bbio);
+ cur += num_bytes;
}
+out:
btrfs_bio_counter_dec(fs_info);
if (actual_bytes)
@@ -1561,7 +1581,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
int err = 0;
int metadata = !extent_op->is_data;
- if (trans->aborted)
+ if (TRANS_ABORTED(trans))
return 0;
if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
@@ -1681,7 +1701,7 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
{
int ret = 0;
- if (trans->aborted) {
+ if (TRANS_ABORTED(trans)) {
if (insert_reserved)
btrfs_pin_extent(trans->fs_info, node->bytenr,
node->num_bytes, 1);
@@ -2169,7 +2189,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
int run_all = count == (unsigned long)-1;
/* We'll clean this up in btrfs_cleanup_transaction */
- if (trans->aborted)
+ if (TRANS_ABORTED(trans))
return 0;
if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
@@ -2320,7 +2340,8 @@ static noinline int check_delayed_ref(struct btrfs_root *root,
static noinline int check_committed_ref(struct btrfs_root *root,
struct btrfs_path *path,
- u64 objectid, u64 offset, u64 bytenr)
+ u64 objectid, u64 offset, u64 bytenr,
+ bool strict)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *extent_root = fs_info->extent_root;
@@ -2362,9 +2383,13 @@ static noinline int check_committed_ref(struct btrfs_root *root,
btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
goto out;
- /* If extent created before last snapshot => it's definitely shared */
- if (btrfs_extent_generation(leaf, ei) <=
- btrfs_root_last_snapshot(&root->root_item))
+ /*
+ * If extent created before last snapshot => it's shared unless the
+ * snapshot has been deleted. Use the heuristic if strict is false.
+ */
+ if (!strict &&
+ (btrfs_extent_generation(leaf, ei) <=
+ btrfs_root_last_snapshot(&root->root_item)))
goto out;
iref = (struct btrfs_extent_inline_ref *)(ei + 1);
@@ -2389,7 +2414,7 @@ out:
}
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
- u64 bytenr)
+ u64 bytenr, bool strict)
{
struct btrfs_path *path;
int ret;
@@ -2400,7 +2425,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
do {
ret = check_committed_ref(root, path, objectid,
- offset, bytenr);
+ offset, bytenr, strict);
if (ret && ret != -ENOENT)
goto out;
@@ -2892,7 +2917,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
else
unpin = &fs_info->freed_extents[0];
- while (!trans->aborted) {
+ while (!TRANS_ABORTED(trans)) {
struct extent_state *cached_state = NULL;
mutex_lock(&fs_info->unused_bg_unpin_mutex);
@@ -2924,7 +2949,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
u64 trimmed = 0;
ret = -EROFS;
- if (!trans->aborted)
+ if (!TRANS_ABORTED(trans))
ret = btrfs_discard_extent(fs_info,
block_group->key.objectid,
block_group->key.offset,
@@ -4441,7 +4466,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return ERR_PTR(-EUCLEAN);
}
- btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
+ btrfs_set_buffer_lockdep_class(owner, buf, level);
btrfs_tree_lock(buf);
btrfs_clean_tree_block(buf);
clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
@@ -5221,7 +5246,14 @@ int btrfs_drop_snapshot(struct btrfs_root *root,
goto out;
}
- trans = btrfs_start_transaction(tree_root, 0);
+ /*
+ * Use join to avoid potential EINTR from transaction start. See
+ * wait_reserve_ticket and the whole reservation callchain.
+ */
+ if (for_reloc)
+ trans = btrfs_join_transaction(tree_root);
+ else
+ trans = btrfs_start_transaction(tree_root, 0);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
goto out_free;
@@ -5417,8 +5449,6 @@ out:
*/
if (!for_reloc && !root_dropped)
btrfs_add_dead_root(root);
- if (err && err != -EAGAIN)
- btrfs_handle_fs_error(fs_info, err, NULL);
return err;
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 8aab286f2028..60c21cfb1948 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1923,7 +1923,8 @@ static int __process_pages_contig(struct address_space *mapping,
if (!PageDirty(pages[i]) ||
pages[i]->mapping != mapping) {
unlock_page(pages[i]);
- put_page(pages[i]);
+ for (; i < ret; i++)
+ put_page(pages[i]);
err = -EAGAIN;
goto out;
}
@@ -4072,7 +4073,7 @@ retry:
if (!test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) {
ret = flush_write_bio(&epd);
} else {
- ret = -EUCLEAN;
+ ret = -EROFS;
end_write_bio(&epd, ret);
}
return ret;
@@ -4466,20 +4467,32 @@ int try_release_extent_mapping(struct page *page, gfp_t mask)
free_extent_map(em);
break;
}
- if (!test_range_bit(tree, em->start,
- extent_map_end(em) - 1,
- EXTENT_LOCKED, 0, NULL)) {
+ if (test_range_bit(tree, em->start,
+ extent_map_end(em) - 1,
+ EXTENT_LOCKED, 0, NULL))
+ goto next;
+ /*
+ * If it's not in the list of modified extents, used
+ * by a fast fsync, we can remove it. If it's being
+ * logged we can safely remove it since fsync took an
+ * extra reference on the em.
+ */
+ if (list_empty(&em->list) ||
+ test_bit(EXTENT_FLAG_LOGGING, &em->flags)) {
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&btrfs_inode->runtime_flags);
remove_extent_mapping(map, em);
/* once for the rb tree */
free_extent_map(em);
}
+next:
start = extent_map_end(em);
write_unlock(&map->lock);
/* once for us */
free_extent_map(em);
+
+ cond_resched(); /* Allow large-extent preemption. */
}
}
return try_release_extent_state(tree, page, mask);
@@ -5025,25 +5038,28 @@ struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
static void check_buffer_tree_ref(struct extent_buffer *eb)
{
int refs;
- /* the ref bit is tricky. We have to make sure it is set
- * if we have the buffer dirty. Otherwise the
- * code to free a buffer can end up dropping a dirty
- * page
+ /*
+ * The TREE_REF bit is first set when the extent_buffer is added
+ * to the radix tree. It is also reset, if unset, when a new reference
+ * is created by find_extent_buffer.
*
- * Once the ref bit is set, it won't go away while the
- * buffer is dirty or in writeback, and it also won't
- * go away while we have the reference count on the
- * eb bumped.
+ * It is only cleared in two cases: freeing the last non-tree
+ * reference to the extent_buffer when its STALE bit is set or
+ * calling releasepage when the tree reference is the only reference.
*
- * We can't just set the ref bit without bumping the
- * ref on the eb because free_extent_buffer might
- * see the ref bit and try to clear it. If this happens
- * free_extent_buffer might end up dropping our original
- * ref by mistake and freeing the page before we are able
- * to add one more ref.
+ * In both cases, care is taken to ensure that the extent_buffer's
+ * pages are not under io. However, releasepage can be concurrently
+ * called with creating new references, which is prone to race
+ * conditions between the calls to check_buffer_tree_ref in those
+ * codepaths and clearing TREE_REF in try_release_extent_buffer.
*
- * So bump the ref count first, then set the bit. If someone
- * beat us to it, drop the ref we added.
+ * The actual lifetime of the extent_buffer in the radix tree is
+ * adequately protected by the refcount, but the TREE_REF bit and
+ * its corresponding reference are not. To protect against this
+ * class of races, we call check_buffer_tree_ref from the codepaths
+ * which trigger io after they set eb->io_pages. Note that once io is
+ * initiated, TREE_REF can no longer be cleared, so that is the
+ * moment at which any such race is best fixed.
*/
refs = atomic_read(&eb->refs);
if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags))
@@ -5493,6 +5509,11 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num)
clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
eb->read_mirror = 0;
atomic_set(&eb->io_pages, num_reads);
+ /*
+ * It is possible for releasepage to clear the TREE_REF bit before we
+ * set io_pages. See check_buffer_tree_ref for a more detailed comment.
+ */
+ check_buffer_tree_ref(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
@@ -5586,9 +5607,9 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv,
}
}
-int read_extent_buffer_to_user(const struct extent_buffer *eb,
- void __user *dstv,
- unsigned long start, unsigned long len)
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+ void __user *dstv,
+ unsigned long start, unsigned long len)
{
size_t cur;
size_t offset;
@@ -5609,7 +5630,7 @@ int read_extent_buffer_to_user(const struct extent_buffer *eb,
cur = min(len, (PAGE_SIZE - offset));
kaddr = page_address(page);
- if (copy_to_user(dst, kaddr + offset, cur)) {
+ if (probe_user_write(dst, kaddr + offset, cur)) {
ret = -EFAULT;
break;
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index cf3424d58fec..bc858c8cef0a 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -457,9 +457,9 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
void read_extent_buffer(const struct extent_buffer *eb, void *dst,
unsigned long start,
unsigned long len);
-int read_extent_buffer_to_user(const struct extent_buffer *eb,
- void __user *dst, unsigned long start,
- unsigned long len);
+int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb,
+ void __user *dst, unsigned long start,
+ unsigned long len);
void write_extent_buffer_fsid(struct extent_buffer *eb, const void *src);
void write_extent_buffer_chunk_tree_uuid(struct extent_buffer *eb,
const void *src);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index f62a179f85bb..2b8f29c07668 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -798,10 +798,12 @@ again:
nritems = btrfs_header_nritems(path->nodes[0]);
if (!nritems || (path->slots[0] >= nritems - 1)) {
ret = btrfs_next_leaf(root, path);
- if (ret == 1)
+ if (ret < 0) {
+ goto out;
+ } else if (ret > 0) {
found_next = 1;
- if (ret != 0)
goto insert;
+ }
slot = path->slots[0];
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 3cfbccacef7f..4126513e2429 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1546,8 +1546,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
return ret;
}
-static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
- size_t *write_bytes)
+int btrfs_check_can_nocow(struct btrfs_inode *inode, loff_t pos,
+ size_t *write_bytes)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
@@ -1568,7 +1568,7 @@ static noinline int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
num_bytes = lockend - lockstart + 1;
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
- NULL, NULL, NULL);
+ NULL, NULL, NULL, false);
if (ret <= 0) {
ret = 0;
btrfs_end_write_no_snapshotting(root);
@@ -1647,7 +1647,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
if (ret < 0) {
if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
BTRFS_INODE_PREALLOC)) &&
- check_can_nocow(BTRFS_I(inode), pos,
+ btrfs_check_can_nocow(BTRFS_I(inode), pos,
&write_bytes) > 0) {
/*
* For nodata cow case, no need to reserve
@@ -1919,13 +1919,28 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
pos = iocb->ki_pos;
count = iov_iter_count(from);
if (iocb->ki_flags & IOCB_NOWAIT) {
+ size_t nocow_bytes = count;
+
/*
* We will allocate space in case nodatacow is not set,
* so bail
*/
if (!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
BTRFS_INODE_PREALLOC)) ||
- check_can_nocow(BTRFS_I(inode), pos, &count) <= 0) {
+ btrfs_check_can_nocow(BTRFS_I(inode), pos,
+ &nocow_bytes) <= 0) {
+ inode_unlock(inode);
+ return -EAGAIN;
+ }
+
+ /* check_can_nocow() locks the snapshot lock on success */
+ btrfs_end_write_no_snapshotting(root);
+ /*
+ * There are holes in the range or parts of the range that must
+ * be COWed (shared extents, RO block groups, etc), so just bail
+ * out.
+ */
+ if (nocow_bytes < count) {
inode_unlock(inode);
return -EAGAIN;
}
@@ -3130,14 +3145,14 @@ reserve_space:
if (ret < 0)
goto out;
space_reserved = true;
- ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
- alloc_start, bytes_to_reserve);
- if (ret)
- goto out;
ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
&cached_state);
if (ret)
goto out;
+ ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
+ alloc_start, bytes_to_reserve);
+ if (ret)
+ goto out;
ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
alloc_end - alloc_start,
i_blocksize(inode),
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index d86ada9c3c54..6e6be922b937 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1166,7 +1166,6 @@ static int __btrfs_wait_cache_io(struct btrfs_root *root,
ret = update_cache_item(trans, root, inode, path, offset,
io_ctl->entries, io_ctl->bitmaps);
out:
- io_ctl_free(io_ctl);
if (ret) {
invalidate_inode_pages2(inode->i_mapping);
BTRFS_I(inode)->generation = 0;
@@ -1329,6 +1328,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
* them out later
*/
io_ctl_drop_pages(io_ctl);
+ io_ctl_free(io_ctl);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
i_size_read(inode) - 1, &cached_state);
@@ -2166,7 +2166,7 @@ out:
static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
struct btrfs_free_space *info, bool update_stat)
{
- struct btrfs_free_space *left_info;
+ struct btrfs_free_space *left_info = NULL;
struct btrfs_free_space *right_info;
bool merged = false;
u64 offset = info->offset;
@@ -2181,7 +2181,7 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl,
if (right_info && rb_prev(&right_info->offset_index))
left_info = rb_entry(rb_prev(&right_info->offset_index),
struct btrfs_free_space, offset_index);
- else
+ else if (!right_info)
left_info = tree_search_offset(ctl, offset - 1, 0, 0);
if (right_info && !right_info->bitmap) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 94b0df3fb3c8..67b49b94c9cd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -49,6 +49,7 @@
#include "qgroup.h"
#include "delalloc-space.h"
#include "block-group.h"
+#include "space-info.h"
struct btrfs_iget_args {
struct btrfs_key *location;
@@ -640,12 +641,18 @@ cont:
page_error_op |
PAGE_END_WRITEBACK);
- for (i = 0; i < nr_pages; i++) {
- WARN_ON(pages[i]->mapping);
- put_page(pages[i]);
+ /*
+ * Ensure we only free the compressed pages if we have
+ * them allocated, as we can still reach here with
+ * inode_need_compress() == false.
+ */
+ if (pages) {
+ for (i = 0; i < nr_pages; i++) {
+ WARN_ON(pages[i]->mapping);
+ put_page(pages[i]);
+ }
+ kfree(pages);
}
- kfree(pages);
-
return 0;
}
}
@@ -974,6 +981,7 @@ static noinline int cow_file_range(struct inode *inode,
u64 num_bytes;
unsigned long ram_size;
u64 cur_alloc_size = 0;
+ u64 min_alloc_size;
u64 blocksize = fs_info->sectorsize;
struct btrfs_key ins;
struct extent_map *em;
@@ -1024,10 +1032,26 @@ static noinline int cow_file_range(struct inode *inode,
btrfs_drop_extent_cache(BTRFS_I(inode), start,
start + num_bytes - 1, 0);
+ /*
+ * Relocation relies on the relocated extents to have exactly the same
+ * size as the original extents. Normally writeback for relocation data
+ * extents follows a NOCOW path because relocation preallocates the
+ * extents. However, due to an operation such as scrub turning a block
+ * group to RO mode, it may fallback to COW mode, so we must make sure
+ * an extent allocated during COW has exactly the requested size and can
+ * not be split into smaller extents, otherwise relocation breaks and
+ * fails during the stage where it updates the bytenr of file extent
+ * items.
+ */
+ if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
+ min_alloc_size = num_bytes;
+ else
+ min_alloc_size = fs_info->sectorsize;
+
while (num_bytes > 0) {
cur_alloc_size = num_bytes;
ret = btrfs_reserve_extent(root, cur_alloc_size, cur_alloc_size,
- fs_info->sectorsize, 0, alloc_hint,
+ min_alloc_size, 0, alloc_hint,
&ins, 1, 1);
if (ret < 0)
goto out_unlock;
@@ -1132,7 +1156,7 @@ out_unlock:
*/
if (extent_reserved) {
extent_clear_unlock_delalloc(inode, start,
- start + cur_alloc_size,
+ start + cur_alloc_size - 1,
locked_page,
clear_bits,
page_ops);
@@ -1322,6 +1346,73 @@ static noinline int csum_exist_in_range(struct btrfs_fs_info *fs_info,
return 1;
}
+static int fallback_to_cow(struct inode *inode, struct page *locked_page,
+ const u64 start, const u64 end,
+ int *page_started, unsigned long *nr_written)
+{
+ const bool is_space_ino = btrfs_is_free_space_inode(BTRFS_I(inode));
+ const bool is_reloc_ino = (BTRFS_I(inode)->root->root_key.objectid ==
+ BTRFS_DATA_RELOC_TREE_OBJECTID);
+ const u64 range_bytes = end + 1 - start;
+ struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ u64 range_start = start;
+ u64 count;
+
+ /*
+ * If EXTENT_NORESERVE is set it means that when the buffered write was
+ * made we had not enough available data space and therefore we did not
+ * reserve data space for it, since we though we could do NOCOW for the
+ * respective file range (either there is prealloc extent or the inode
+ * has the NOCOW bit set).
+ *
+ * However when we need to fallback to COW mode (because for example the
+ * block group for the corresponding extent was turned to RO mode by a
+ * scrub or relocation) we need to do the following:
+ *
+ * 1) We increment the bytes_may_use counter of the data space info.
+ * If COW succeeds, it allocates a new data extent and after doing
+ * that it decrements the space info's bytes_may_use counter and
+ * increments its bytes_reserved counter by the same amount (we do
+ * this at btrfs_add_reserved_bytes()). So we need to increment the
+ * bytes_may_use counter to compensate (when space is reserved at
+ * buffered write time, the bytes_may_use counter is incremented);
+ *
+ * 2) We clear the EXTENT_NORESERVE bit from the range. We do this so
+ * that if the COW path fails for any reason, it decrements (through
+ * extent_clear_unlock_delalloc()) the bytes_may_use counter of the
+ * data space info, which we incremented in the step above.
+ *
+ * If we need to fallback to cow and the inode corresponds to a free
+ * space cache inode or an inode of the data relocation tree, we must
+ * also increment bytes_may_use of the data space_info for the same
+ * reason. Space caches and relocated data extents always get a prealloc
+ * extent for them, however scrub or balance may have set the block
+ * group that contains that extent to RO mode and therefore force COW
+ * when starting writeback.
+ */
+ count = count_range_bits(io_tree, &range_start, end, range_bytes,
+ EXTENT_NORESERVE, 0);
+ if (count > 0 || is_space_ino || is_reloc_ino) {
+ u64 bytes = count;
+ struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+ struct btrfs_space_info *sinfo = fs_info->data_sinfo;
+
+ if (is_space_ino || is_reloc_ino)
+ bytes = range_bytes;
+
+ spin_lock(&sinfo->lock);
+ btrfs_space_info_update_bytes_may_use(fs_info, sinfo, bytes);
+ spin_unlock(&sinfo->lock);
+
+ if (count > 0)
+ clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE,
+ 0, 0, NULL);
+ }
+
+ return cow_file_range(inode, locked_page, start, end, page_started,
+ nr_written, 1);
+}
+
/*
* when nowcow writeback call back. This checks for snapshots or COW copies
* of the extents that exist in the file, and COWs the file as required.
@@ -1487,7 +1578,7 @@ next_slot:
goto out_check;
ret = btrfs_cross_ref_exist(root, ino,
found_key.offset -
- extent_offset, disk_bytenr);
+ extent_offset, disk_bytenr, false);
if (ret) {
/*
* ret could be -EIO if the above fails to read
@@ -1569,15 +1660,11 @@ out_check:
* NOCOW, following one which needs to be COW'ed
*/
if (cow_start != (u64)-1) {
- ret = cow_file_range(inode, locked_page,
- cow_start, found_key.offset - 1,
- page_started, nr_written, 1);
- if (ret) {
- if (nocow)
- btrfs_dec_nocow_writers(fs_info,
- disk_bytenr);
+ ret = fallback_to_cow(inode, locked_page, cow_start,
+ found_key.offset - 1,
+ page_started, nr_written);
+ if (ret)
goto error;
- }
cow_start = (u64)-1;
}
@@ -1593,9 +1680,6 @@ out_check:
ram_bytes, BTRFS_COMPRESS_NONE,
BTRFS_ORDERED_PREALLOC);
if (IS_ERR(em)) {
- if (nocow)
- btrfs_dec_nocow_writers(fs_info,
- disk_bytenr);
ret = PTR_ERR(em);
goto error;
}
@@ -1660,8 +1744,8 @@ out_check:
if (cow_start != (u64)-1) {
cur_offset = end;
- ret = cow_file_range(inode, locked_page, cow_start, end,
- page_started, nr_written, 1);
+ ret = fallback_to_cow(inode, locked_page, cow_start, end,
+ page_started, nr_written);
if (ret)
goto error;
}
@@ -4250,7 +4334,7 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir)
* 1 for the inode ref
* 1 for the inode
*/
- return btrfs_start_transaction_fallback_global_rsv(root, 5, 5);
+ return btrfs_start_transaction_fallback_global_rsv(root, 5);
}
static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
@@ -4603,6 +4687,8 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
}
}
+ free_anon_bdev(dest->anon_dev);
+ dest->anon_dev = 0;
out_end_trans:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
@@ -5047,11 +5133,13 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
char *kaddr;
+ bool only_release_metadata = false;
u32 blocksize = fs_info->sectorsize;
pgoff_t index = from >> PAGE_SHIFT;
unsigned offset = from & (blocksize - 1);
struct page *page;
gfp_t mask = btrfs_alloc_write_mask(mapping);
+ size_t write_bytes = blocksize;
int ret = 0;
u64 block_start;
u64 block_end;
@@ -5063,11 +5151,27 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len,
block_start = round_down(from, blocksize);
block_end = block_start + blocksize - 1;
- ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
- block_start, blocksize);
- if (ret)
- goto out;
+ ret = btrfs_check_data_free_space(inode, &data_reserved, block_start,
+ blocksize);
+ if (ret < 0) {
+ if ((BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW |
+ BTRFS_INODE_PREALLOC)) &&
+ btrfs_check_can_nocow(BTRFS_I(inode), block_start,
+ &write_bytes) > 0) {
+ /* For nocow case, no need to reserve data space */
+ only_release_metadata = true;
+ } else {
+ goto out;
+ }
+ }
+ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), blocksize);
+ if (ret < 0) {
+ if (!only_release_metadata)
+ btrfs_free_reserved_data_space(inode, data_reserved,
+ block_start, blocksize);
+ goto out;
+ }
again:
page = find_or_create_page(mapping, index, mask);
if (!page) {
@@ -5136,14 +5240,26 @@ again:
set_page_dirty(page);
unlock_extent_cached(io_tree, block_start, block_end, &cached_state);
+ if (only_release_metadata)
+ set_extent_bit(&BTRFS_I(inode)->io_tree, block_start,
+ block_end, EXTENT_NORESERVE, NULL, NULL,
+ GFP_NOFS);
+
out_unlock:
- if (ret)
- btrfs_delalloc_release_space(inode, data_reserved, block_start,
- blocksize, true);
+ if (ret) {
+ if (only_release_metadata)
+ btrfs_delalloc_release_metadata(BTRFS_I(inode),
+ blocksize, true);
+ else
+ btrfs_delalloc_release_space(inode, data_reserved,
+ block_start, blocksize, true);
+ }
btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize);
unlock_page(page);
put_page(page);
out:
+ if (only_release_metadata)
+ btrfs_end_write_no_snapshotting(BTRFS_I(inode)->root);
extent_changeset_free(data_reserved);
return ret;
}
@@ -7108,7 +7224,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
/* Only regular file could have regular/prealloc extent */
if (!S_ISREG(inode->vfs_inode.i_mode)) {
- ret = -EUCLEAN;
+ err = -EUCLEAN;
btrfs_crit(fs_info,
"regular/prealloc extent found for non-regular inode %llu",
btrfs_ino(inode));
@@ -7443,7 +7559,7 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
*/
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
- u64 *ram_bytes)
+ u64 *ram_bytes, bool strict)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_path *path;
@@ -7521,8 +7637,9 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
* Do the same check as in btrfs_cross_ref_exist but without the
* unnecessary search.
*/
- if (btrfs_file_extent_generation(leaf, fi) <=
- btrfs_root_last_snapshot(&root->root_item))
+ if (!strict &&
+ (btrfs_file_extent_generation(leaf, fi) <=
+ btrfs_root_last_snapshot(&root->root_item)))
goto out;
backref_offset = btrfs_file_extent_offset(leaf, fi);
@@ -7558,7 +7675,8 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
*/
ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)),
- key.offset - backref_offset, disk_bytenr);
+ key.offset - backref_offset, disk_bytenr,
+ strict);
if (ret) {
ret = 0;
goto out;
@@ -7779,7 +7897,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
block_start = em->block_start + (start - em->start);
if (can_nocow_extent(inode, start, &len, &orig_start,
- &orig_block_len, &ram_bytes) == 1 &&
+ &orig_block_len, &ram_bytes, false) == 1 &&
btrfs_inc_nocow_writers(fs_info, block_start)) {
struct extent_map *em2;
@@ -8498,14 +8616,64 @@ err:
return ret;
}
-static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
+/*
+ * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked
+ * or ordered extents whether or not we submit any bios.
+ */
+static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio,
+ struct inode *inode,
+ loff_t file_offset)
{
- struct inode *inode = dip->inode;
+ const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
+ struct btrfs_dio_private *dip;
+ struct bio *bio;
+
+ dip = kzalloc(sizeof(*dip), GFP_NOFS);
+ if (!dip)
+ return NULL;
+
+ bio = btrfs_bio_clone(dio_bio);
+ bio->bi_private = dip;
+ btrfs_io_bio(bio)->logical = file_offset;
+
+ dip->private = dio_bio->bi_private;
+ dip->inode = inode;
+ dip->logical_offset = file_offset;
+ dip->bytes = dio_bio->bi_iter.bi_size;
+ dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
+ dip->orig_bio = bio;
+ dip->dio_bio = dio_bio;
+ atomic_set(&dip->pending_bios, 1);
+
+ if (write) {
+ struct btrfs_dio_data *dio_data = current->journal_info;
+
+ /*
+ * Setting range start and end to the same value means that
+ * no cleanup will happen in btrfs_direct_IO
+ */
+ dio_data->unsubmitted_oe_range_end = dip->logical_offset +
+ dip->bytes;
+ dio_data->unsubmitted_oe_range_start =
+ dio_data->unsubmitted_oe_range_end;
+
+ bio->bi_end_io = btrfs_endio_direct_write;
+ } else {
+ bio->bi_end_io = btrfs_endio_direct_read;
+ dip->subio_endio = btrfs_subio_endio_read;
+ }
+ return dip;
+}
+
+static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
+ loff_t file_offset)
+{
+ const bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_dio_private *dip;
struct bio *bio;
- struct bio *orig_bio = dip->orig_bio;
- u64 start_sector = orig_bio->bi_iter.bi_sector;
- u64 file_offset = dip->logical_offset;
+ struct bio *orig_bio;
+ u64 start_sector;
int async_submit = 0;
u64 submit_len;
int clone_offset = 0;
@@ -8514,11 +8682,24 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
blk_status_t status;
struct btrfs_io_geometry geom;
+ dip = btrfs_create_dio_private(dio_bio, inode, file_offset);
+ if (!dip) {
+ if (!write) {
+ unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
+ file_offset + dio_bio->bi_iter.bi_size - 1);
+ }
+ dio_bio->bi_status = BLK_STS_RESOURCE;
+ dio_end_io(dio_bio);
+ return;
+ }
+
+ orig_bio = dip->orig_bio;
+ start_sector = orig_bio->bi_iter.bi_sector;
submit_len = orig_bio->bi_iter.bi_size;
ret = btrfs_get_io_geometry(fs_info, btrfs_op(orig_bio),
start_sector << 9, submit_len, &geom);
if (ret)
- return -EIO;
+ goto out_err;
if (geom.len >= submit_len) {
bio = orig_bio;
@@ -8534,7 +8715,6 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
/* bio split */
ASSERT(geom.len <= INT_MAX);
- atomic_inc(&dip->pending_bios);
do {
clone_len = min_t(int, submit_len, geom.len);
@@ -8582,9 +8762,10 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip)
submit:
status = btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
if (!status)
- return 0;
+ return;
- bio_put(bio);
+ if (bio != orig_bio)
+ bio_put(bio);
out_err:
dip->errors = 1;
/*
@@ -8595,107 +8776,6 @@ out_err:
*/
if (atomic_dec_and_test(&dip->pending_bios))
bio_io_error(dip->orig_bio);
-
- /* bio_end_io() will handle error, so we needn't return it */
- return 0;
-}
-
-static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode,
- loff_t file_offset)
-{
- struct btrfs_dio_private *dip = NULL;
- struct bio *bio = NULL;
- struct btrfs_io_bio *io_bio;
- bool write = (bio_op(dio_bio) == REQ_OP_WRITE);
- int ret = 0;
-
- bio = btrfs_bio_clone(dio_bio);
-
- dip = kzalloc(sizeof(*dip), GFP_NOFS);
- if (!dip) {
- ret = -ENOMEM;
- goto free_ordered;
- }
-
- dip->private = dio_bio->bi_private;
- dip->inode = inode;
- dip->logical_offset = file_offset;
- dip->bytes = dio_bio->bi_iter.bi_size;
- dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9;
- bio->bi_private = dip;
- dip->orig_bio = bio;
- dip->dio_bio = dio_bio;
- atomic_set(&dip->pending_bios, 0);
- io_bio = btrfs_io_bio(bio);
- io_bio->logical = file_offset;
-
- if (write) {
- bio->bi_end_io = btrfs_endio_direct_write;
- } else {
- bio->bi_end_io = btrfs_endio_direct_read;
- dip->subio_endio = btrfs_subio_endio_read;
- }
-
- /*
- * Reset the range for unsubmitted ordered extents (to a 0 length range)
- * even if we fail to submit a bio, because in such case we do the
- * corresponding error handling below and it must not be done a second
- * time by btrfs_direct_IO().
- */
- if (write) {
- struct btrfs_dio_data *dio_data = current->journal_info;
-
- dio_data->unsubmitted_oe_range_end = dip->logical_offset +
- dip->bytes;
- dio_data->unsubmitted_oe_range_start =
- dio_data->unsubmitted_oe_range_end;
- }
-
- ret = btrfs_submit_direct_hook(dip);
- if (!ret)
- return;
-
- btrfs_io_bio_free_csum(io_bio);
-
-free_ordered:
- /*
- * If we arrived here it means either we failed to submit the dip
- * or we either failed to clone the dio_bio or failed to allocate the
- * dip. If we cloned the dio_bio and allocated the dip, we can just
- * call bio_endio against our io_bio so that we get proper resource
- * cleanup if we fail to submit the dip, otherwise, we must do the
- * same as btrfs_endio_direct_[write|read] because we can't call these
- * callbacks - they require an allocated dip and a clone of dio_bio.
- */
- if (bio && dip) {
- bio_io_error(bio);
- /*
- * The end io callbacks free our dip, do the final put on bio
- * and all the cleanup and final put for dio_bio (through
- * dio_end_io()).
- */
- dip = NULL;
- bio = NULL;
- } else {
- if (write)
- __endio_write_update_ordered(inode,
- file_offset,
- dio_bio->bi_iter.bi_size,
- false);
- else
- unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
- file_offset + dio_bio->bi_iter.bi_size - 1);
-
- dio_bio->bi_status = BLK_STS_IOERR;
- /*
- * Releases and cleans up our dio_bio, no need to bio_put()
- * nor bio_endio()/bio_io_error() against dio_bio.
- */
- dio_end_io(dio_bio);
- }
- if (bio)
- bio_put(bio);
- kfree(dip);
}
static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
@@ -8772,9 +8852,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
dio_data.overwrite = 1;
inode_unlock(inode);
relock = true;
- } else if (iocb->ki_flags & IOCB_NOWAIT) {
- ret = -EAGAIN;
- goto out;
}
ret = btrfs_delalloc_reserve_space(inode, &data_reserved,
offset, count);
@@ -8997,20 +9074,17 @@ again:
/*
* Qgroup reserved space handler
* Page here will be either
- * 1) Already written to disk
- * In this case, its reserved space is released from data rsv map
- * and will be freed by delayed_ref handler finally.
- * So even we call qgroup_free_data(), it won't decrease reserved
- * space.
- * 2) Not written to disk
- * This means the reserved space should be freed here. However,
- * if a truncate invalidates the page (by clearing PageDirty)
- * and the page is accounted for while allocating extent
- * in btrfs_check_data_free_space() we let delayed_ref to
- * free the entire extent.
+ * 1) Already written to disk or ordered extent already submitted
+ * Then its QGROUP_RESERVED bit in io_tree is already cleaned.
+ * Qgroup will be handled by its qgroup_record then.
+ * btrfs_qgroup_free_data() call will do nothing here.
+ *
+ * 2) Not written to disk yet
+ * Then btrfs_qgroup_free_data() call will clear the QGROUP_RESERVED
+ * bit of its io_tree, and free the qgroup reserved data space.
+ * Since the IO will never happen for this page.
*/
- if (PageDirty(page))
- btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
+ btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE);
if (!inode_evicting) {
clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED |
EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
@@ -9485,7 +9559,7 @@ void btrfs_destroy_inode(struct inode *inode)
btrfs_put_ordered_extent(ordered);
}
}
- btrfs_qgroup_check_reserved_leak(inode);
+ btrfs_qgroup_check_reserved_leak(BTRFS_I(inode));
inode_tree_del(inode);
btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0);
}
@@ -10947,7 +11021,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file,
free_extent_map(em);
em = NULL;
- ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL);
+ ret = can_nocow_extent(inode, start, &len, NULL, NULL, NULL, true);
if (ret < 0) {
goto out;
} else if (ret) {
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d88b8d8897cc..63394b450afc 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -167,8 +167,11 @@ static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
return 0;
}
-/* Check if @flags are a supported and valid set of FS_*_FL flags */
-static int check_fsflags(unsigned int flags)
+/*
+ * Check if @flags are a supported and valid set of FS_*_FL flags and that
+ * the old and new flags are not conflicting
+ */
+static int check_fsflags(unsigned int old_flags, unsigned int flags)
{
if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
FS_NOATIME_FL | FS_NODUMP_FL | \
@@ -177,9 +180,19 @@ static int check_fsflags(unsigned int flags)
FS_NOCOW_FL))
return -EOPNOTSUPP;
+ /* COMPR and NOCOMP on new/old are valid */
if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
return -EINVAL;
+ if ((flags & FS_COMPR_FL) && (flags & FS_NOCOW_FL))
+ return -EINVAL;
+
+ /* NOCOW and compression options are mutually exclusive */
+ if ((old_flags & FS_NOCOW_FL) && (flags & (FS_COMPR_FL | FS_NOCOMP_FL)))
+ return -EINVAL;
+ if ((flags & FS_NOCOW_FL) && (old_flags & (FS_COMPR_FL | FS_NOCOMP_FL)))
+ return -EINVAL;
+
return 0;
}
@@ -193,7 +206,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
unsigned int fsflags, old_fsflags;
int ret;
const char *comp = NULL;
- u32 binode_flags = binode->flags;
+ u32 binode_flags;
if (!inode_owner_or_capable(inode))
return -EPERM;
@@ -204,22 +217,23 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
if (copy_from_user(&fsflags, arg, sizeof(fsflags)))
return -EFAULT;
- ret = check_fsflags(fsflags);
- if (ret)
- return ret;
-
ret = mnt_want_write_file(file);
if (ret)
return ret;
inode_lock(inode);
-
fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
+
ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags);
if (ret)
goto out_unlock;
+ ret = check_fsflags(old_fsflags, fsflags);
+ if (ret)
+ goto out_unlock;
+
+ binode_flags = binode->flags;
if (fsflags & FS_SYNC_FL)
binode_flags |= BTRFS_INODE_SYNC;
else
@@ -2091,9 +2105,14 @@ static noinline int copy_to_sk(struct btrfs_path *path,
sh.len = item_len;
sh.transid = found_transid;
- /* copy search result header */
- if (copy_to_user(ubuf + *sk_offset, &sh, sizeof(sh))) {
- ret = -EFAULT;
+ /*
+ * Copy search result header. If we fault then loop again so we
+ * can fault in the pages and -EFAULT there if there's a
+ * problem. Otherwise we'll fault and then copy the buffer in
+ * properly this next time through
+ */
+ if (probe_user_write(ubuf + *sk_offset, &sh, sizeof(sh))) {
+ ret = 0;
goto out;
}
@@ -2101,10 +2120,14 @@ static noinline int copy_to_sk(struct btrfs_path *path,
if (item_len) {
char __user *up = ubuf + *sk_offset;
- /* copy the item */
- if (read_extent_buffer_to_user(leaf, up,
- item_off, item_len)) {
- ret = -EFAULT;
+ /*
+ * Copy the item, same behavior as above, but reset the
+ * * sk_offset so we copy the full thing again.
+ */
+ if (read_extent_buffer_to_user_nofault(leaf, up,
+ item_off, item_len)) {
+ ret = 0;
+ *sk_offset -= sizeof(sh);
goto out;
}
@@ -2192,6 +2215,11 @@ static noinline int search_ioctl(struct inode *inode,
key.offset = sk->min_offset;
while (1) {
+ ret = fault_in_pages_writeable(ubuf + sk_offset,
+ *buf_size - sk_offset);
+ if (ret)
+ break;
+
ret = btrfs_search_forward(root, &key, path, sk->min_transid);
if (ret != 0) {
if (ret > 0)
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index 9cb50577d982..f4edadf1067f 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -95,9 +95,10 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
* offset is supposed to be a tree block which
* must be aligned to nodesize.
*/
- if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
- pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n",
- offset, (unsigned long long)eb->fs_info->nodesize);
+ if (!IS_ALIGNED(offset, eb->fs_info->sectorsize))
+ pr_info(
+ "\t\t\t(parent %llu not aligned to sectorsize %u)\n",
+ offset, eb->fs_info->sectorsize);
break;
case BTRFS_EXTENT_DATA_REF_KEY:
dref = (struct btrfs_extent_data_ref *)(&iref->offset);
@@ -112,8 +113,9 @@ static void print_extent_item(struct extent_buffer *eb, int slot, int type)
* must be aligned to nodesize.
*/
if (!IS_ALIGNED(offset, eb->fs_info->nodesize))
- pr_info("\t\t\t(parent %llu is NOT ALIGNED to nodesize %llu)\n",
- offset, (unsigned long long)eb->fs_info->nodesize);
+ pr_info(
+ "\t\t\t(parent %llu not aligned to sectorsize %u)\n",
+ offset, eb->fs_info->sectorsize);
break;
default:
pr_cont("(extent %llu has INVALID ref type %d)\n",
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 590defdf8860..04fd02e6124d 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -2636,6 +2636,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
struct btrfs_root *quota_root;
struct btrfs_qgroup *srcgroup;
struct btrfs_qgroup *dstgroup;
+ bool need_rescan = false;
u32 level_size = 0;
u64 nums;
@@ -2779,6 +2780,13 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
goto unlock;
}
++i_qgroups;
+
+ /*
+ * If we're doing a snapshot, and adding the snapshot to a new
+ * qgroup, the numbers are guaranteed to be incorrect.
+ */
+ if (srcid)
+ need_rescan = true;
}
for (i = 0; i < inherit->num_ref_copies; ++i, i_qgroups += 2) {
@@ -2798,6 +2806,9 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
dst->rfer = src->rfer - level_size;
dst->rfer_cmpr = src->rfer_cmpr - level_size;
+
+ /* Manually tweaking numbers certainly needs a rescan */
+ need_rescan = true;
}
for (i = 0; i < inherit->num_excl_copies; ++i, i_qgroups += 2) {
struct btrfs_qgroup *src;
@@ -2816,6 +2827,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, u64 srcid,
dst->excl = src->excl + level_size;
dst->excl_cmpr = src->excl_cmpr + level_size;
+ need_rescan = true;
}
unlock:
@@ -2823,6 +2835,8 @@ unlock:
out:
if (!committing)
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ if (need_rescan)
+ fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
return ret;
}
@@ -3755,7 +3769,7 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes)
* Check qgroup reserved space leaking, normally at destroy inode
* time
*/
-void btrfs_qgroup_check_reserved_leak(struct inode *inode)
+void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode)
{
struct extent_changeset changeset;
struct ulist_node *unode;
@@ -3763,19 +3777,19 @@ void btrfs_qgroup_check_reserved_leak(struct inode *inode)
int ret;
extent_changeset_init(&changeset);
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
+ ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1,
EXTENT_QGROUP_RESERVED, &changeset);
WARN_ON(ret < 0);
if (WARN_ON(changeset.bytes_changed)) {
ULIST_ITER_INIT(&iter);
while ((unode = ulist_next(&changeset.range_changed, &iter))) {
- btrfs_warn(BTRFS_I(inode)->root->fs_info,
- "leaking qgroup reserved space, ino: %lu, start: %llu, end: %llu",
- inode->i_ino, unode->val, unode->aux);
+ btrfs_warn(inode->root->fs_info,
+ "leaking qgroup reserved space, ino: %llu, start: %llu, end: %llu",
+ btrfs_ino(inode), unode->val, unode->aux);
}
- btrfs_qgroup_free_refroot(BTRFS_I(inode)->root->fs_info,
- BTRFS_I(inode)->root->root_key.objectid,
+ btrfs_qgroup_free_refroot(inode->root->fs_info,
+ inode->root->root_key.objectid,
changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA);
}
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index 17e8ac992c50..b0420c4f5d0e 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -399,7 +399,7 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root);
*/
void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes);
-void btrfs_qgroup_check_reserved_leak(struct inode *inode);
+void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode);
/* btrfs_qgroup_swapped_blocks related functions */
void btrfs_qgroup_init_swapped_blocks(
diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c
index 454a1015d026..9a2f15f4c80e 100644
--- a/fs/btrfs/ref-verify.c
+++ b/fs/btrfs/ref-verify.c
@@ -286,6 +286,8 @@ static struct block_entry *add_block_entry(struct btrfs_fs_info *fs_info,
exist_re = insert_root_entry(&exist->roots, re);
if (exist_re)
kfree(re);
+ } else {
+ kfree(re);
}
kfree(be);
return exist;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index e890f09e2073..1bc57f7b91cf 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -1468,6 +1468,9 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
int clear_rsv = 0;
int ret;
+ if (!rc)
+ return 0;
+
/*
* The subvolume has reloc tree but the swap is finished, no need to
* create/update the dead reloc tree
@@ -1475,13 +1478,25 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
if (reloc_root_is_dead(root))
return 0;
+ /*
+ * This is subtle but important. We do not do
+ * record_root_in_transaction for reloc roots, instead we record their
+ * corresponding fs root, and then here we update the last trans for the
+ * reloc root. This means that we have to do this for the entire life
+ * of the reloc root, regardless of which stage of the relocation we are
+ * in.
+ */
if (root->reloc_root) {
reloc_root = root->reloc_root;
reloc_root->last_trans = trans->transid;
return 0;
}
- if (!rc || !rc->create_reloc_tree ||
+ /*
+ * We are merging reloc roots, we do not need new reloc trees. Also
+ * reloc trees never need their own reloc tree.
+ */
+ if (!rc->create_reloc_tree ||
root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
return 0;
@@ -2312,12 +2327,20 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
btrfs_unlock_up_safe(path, 0);
}
- min_reserved = fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
+ /*
+ * In merge_reloc_root(), we modify the upper level pointer to swap the
+ * tree blocks between reloc tree and subvolume tree. Thus for tree
+ * block COW, we COW at most from level 1 to root level for each tree.
+ *
+ * Thus the needed metadata size is at most root_level * nodesize,
+ * and * 2 since we have two trees to COW.
+ */
+ min_reserved = fs_info->nodesize * btrfs_root_level(root_item) * 2;
memset(&next_key, 0, sizeof(next_key));
while (1) {
ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved,
- BTRFS_RESERVE_FLUSH_ALL);
+ BTRFS_RESERVE_FLUSH_LIMIT);
if (ret) {
err = ret;
goto out;
@@ -2328,6 +2351,18 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
trans = NULL;
goto out;
}
+
+ /*
+ * At this point we no longer have a reloc_control, so we can't
+ * depend on btrfs_init_reloc_root to update our last_trans.
+ *
+ * But that's ok, we started the trans handle on our
+ * corresponding fs_root, which means it's been added to the
+ * dirty list. At commit time we'll still call
+ * btrfs_update_reloc_root() and update our root item
+ * appropriately.
+ */
+ reloc_root->last_trans = trans->transid;
trans->block_rsv = rc->block_rsv;
replaced = 0;
@@ -2525,12 +2560,10 @@ again:
reloc_root = list_entry(reloc_roots.next,
struct btrfs_root, root_list);
+ root = read_fs_root(fs_info, reloc_root->root_key.offset);
if (btrfs_root_refs(&reloc_root->root_item) > 0) {
- root = read_fs_root(fs_info,
- reloc_root->root_key.offset);
BUG_ON(IS_ERR(root));
BUG_ON(root->reloc_root != reloc_root);
-
ret = merge_reloc_root(rc, root);
if (ret) {
if (list_empty(&reloc_root->root_list))
@@ -2539,6 +2572,13 @@ again:
goto out;
}
} else {
+ if (!IS_ERR(root)) {
+ if (root->reloc_root == reloc_root)
+ root->reloc_root = NULL;
+ clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
+ &root->state);
+ }
+
list_del_init(&reloc_root->root_list);
/* Don't forget to queue this reloc root for cleanup */
list_add_tail(&reloc_root->reloc_dirty_list,
@@ -4329,6 +4369,18 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
return rc;
}
+static void free_reloc_control(struct reloc_control *rc)
+{
+ struct mapping_node *node, *tmp;
+
+ free_reloc_roots(&rc->reloc_roots);
+ rbtree_postorder_for_each_entry_safe(node, tmp,
+ &rc->reloc_root_tree.rb_root, rb_node)
+ kfree(node);
+
+ kfree(rc);
+}
+
/*
* Print the block group being relocated
*/
@@ -4461,7 +4513,7 @@ out:
btrfs_dec_block_group_ro(rc->block_group);
iput(rc->data_inode);
btrfs_put_block_group(rc->block_group);
- kfree(rc);
+ free_reloc_control(rc);
return err;
}
@@ -4634,7 +4686,7 @@ out_clean:
err = ret;
out_unset:
unset_reloc_control(rc);
- kfree(rc);
+ free_reloc_control(rc);
out:
if (!list_empty(&reloc_roots))
free_reloc_roots(&reloc_roots);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a7b043fd7a57..93d7cb56e44b 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3717,7 +3717,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
struct btrfs_fs_info *fs_info = sctx->fs_info;
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
- return -EIO;
+ return -EROFS;
/* Seed devices of a new filesystem has their own generation. */
if (scrub_dev->fs_devices != fs_info->fs_devices)
@@ -3742,50 +3742,84 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
return 0;
}
+static void scrub_workers_put(struct btrfs_fs_info *fs_info)
+{
+ if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
+ &fs_info->scrub_lock)) {
+ struct btrfs_workqueue *scrub_workers = NULL;
+ struct btrfs_workqueue *scrub_wr_comp = NULL;
+ struct btrfs_workqueue *scrub_parity = NULL;
+
+ scrub_workers = fs_info->scrub_workers;
+ scrub_wr_comp = fs_info->scrub_wr_completion_workers;
+ scrub_parity = fs_info->scrub_parity_workers;
+
+ fs_info->scrub_workers = NULL;
+ fs_info->scrub_wr_completion_workers = NULL;
+ fs_info->scrub_parity_workers = NULL;
+ mutex_unlock(&fs_info->scrub_lock);
+
+ btrfs_destroy_workqueue(scrub_workers);
+ btrfs_destroy_workqueue(scrub_wr_comp);
+ btrfs_destroy_workqueue(scrub_parity);
+ }
+}
+
/*
* get a reference count on fs_info->scrub_workers. start worker if necessary
*/
static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
int is_dev_replace)
{
+ struct btrfs_workqueue *scrub_workers = NULL;
+ struct btrfs_workqueue *scrub_wr_comp = NULL;
+ struct btrfs_workqueue *scrub_parity = NULL;
unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
int max_active = fs_info->thread_pool_size;
+ int ret = -ENOMEM;
- lockdep_assert_held(&fs_info->scrub_lock);
+ if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
+ return 0;
- if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
- ASSERT(fs_info->scrub_workers == NULL);
- fs_info->scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub",
- flags, is_dev_replace ? 1 : max_active, 4);
- if (!fs_info->scrub_workers)
- goto fail_scrub_workers;
-
- ASSERT(fs_info->scrub_wr_completion_workers == NULL);
- fs_info->scrub_wr_completion_workers =
- btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
- max_active, 2);
- if (!fs_info->scrub_wr_completion_workers)
- goto fail_scrub_wr_completion_workers;
+ scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags,
+ is_dev_replace ? 1 : max_active, 4);
+ if (!scrub_workers)
+ goto fail_scrub_workers;
- ASSERT(fs_info->scrub_parity_workers == NULL);
- fs_info->scrub_parity_workers =
- btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
+ scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags,
max_active, 2);
- if (!fs_info->scrub_parity_workers)
- goto fail_scrub_parity_workers;
+ if (!scrub_wr_comp)
+ goto fail_scrub_wr_completion_workers;
+ scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags,
+ max_active, 2);
+ if (!scrub_parity)
+ goto fail_scrub_parity_workers;
+
+ mutex_lock(&fs_info->scrub_lock);
+ if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
+ ASSERT(fs_info->scrub_workers == NULL &&
+ fs_info->scrub_wr_completion_workers == NULL &&
+ fs_info->scrub_parity_workers == NULL);
+ fs_info->scrub_workers = scrub_workers;
+ fs_info->scrub_wr_completion_workers = scrub_wr_comp;
+ fs_info->scrub_parity_workers = scrub_parity;
refcount_set(&fs_info->scrub_workers_refcnt, 1);
- } else {
- refcount_inc(&fs_info->scrub_workers_refcnt);
+ mutex_unlock(&fs_info->scrub_lock);
+ return 0;
}
- return 0;
+ /* Other thread raced in and created the workers for us */
+ refcount_inc(&fs_info->scrub_workers_refcnt);
+ mutex_unlock(&fs_info->scrub_lock);
+ ret = 0;
+ btrfs_destroy_workqueue(scrub_parity);
fail_scrub_parity_workers:
- btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
+ btrfs_destroy_workqueue(scrub_wr_comp);
fail_scrub_wr_completion_workers:
- btrfs_destroy_workqueue(fs_info->scrub_workers);
+ btrfs_destroy_workqueue(scrub_workers);
fail_scrub_workers:
- return -ENOMEM;
+ return ret;
}
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
@@ -3796,9 +3830,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
int ret;
struct btrfs_device *dev;
unsigned int nofs_flag;
- struct btrfs_workqueue *scrub_workers = NULL;
- struct btrfs_workqueue *scrub_wr_comp = NULL;
- struct btrfs_workqueue *scrub_parity = NULL;
if (btrfs_fs_closing(fs_info))
return -EAGAIN;
@@ -3845,13 +3876,17 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
if (IS_ERR(sctx))
return PTR_ERR(sctx);
+ ret = scrub_workers_get(fs_info, is_dev_replace);
+ if (ret)
+ goto out_free_ctx;
+
mutex_lock(&fs_info->fs_devices->device_list_mutex);
dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
!is_dev_replace)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -ENODEV;
- goto out_free_ctx;
+ goto out;
}
if (!is_dev_replace && !readonly &&
@@ -3860,7 +3895,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
btrfs_err_in_rcu(fs_info, "scrub: device %s is not writable",
rcu_str_deref(dev->name));
ret = -EROFS;
- goto out_free_ctx;
+ goto out;
}
mutex_lock(&fs_info->scrub_lock);
@@ -3869,7 +3904,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -EIO;
- goto out_free_ctx;
+ goto out;
}
down_read(&fs_info->dev_replace.rwsem);
@@ -3880,17 +3915,10 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_unlock(&fs_info->scrub_lock);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
ret = -EINPROGRESS;
- goto out_free_ctx;
+ goto out;
}
up_read(&fs_info->dev_replace.rwsem);
- ret = scrub_workers_get(fs_info, is_dev_replace);
- if (ret) {
- mutex_unlock(&fs_info->scrub_lock);
- mutex_unlock(&fs_info->fs_devices->device_list_mutex);
- goto out_free_ctx;
- }
-
sctx->readonly = readonly;
dev->scrub_ctx = sctx;
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
@@ -3943,24 +3971,14 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_lock(&fs_info->scrub_lock);
dev->scrub_ctx = NULL;
- if (refcount_dec_and_test(&fs_info->scrub_workers_refcnt)) {
- scrub_workers = fs_info->scrub_workers;
- scrub_wr_comp = fs_info->scrub_wr_completion_workers;
- scrub_parity = fs_info->scrub_parity_workers;
-
- fs_info->scrub_workers = NULL;
- fs_info->scrub_wr_completion_workers = NULL;
- fs_info->scrub_parity_workers = NULL;
- }
mutex_unlock(&fs_info->scrub_lock);
- btrfs_destroy_workqueue(scrub_workers);
- btrfs_destroy_workqueue(scrub_wr_comp);
- btrfs_destroy_workqueue(scrub_parity);
+ scrub_workers_put(fs_info);
scrub_put_ctx(sctx);
return ret;
-
+out:
+ scrub_workers_put(fs_info);
out_free_ctx:
scrub_free_ctx(sctx);
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 3eb0fec2488a..b0e5dfb9be7a 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -23,6 +23,7 @@
#include "btrfs_inode.h"
#include "transaction.h"
#include "compression.h"
+#include "xattr.h"
/*
* Maximum number of references an extent can have in order for us to attempt to
@@ -1256,12 +1257,21 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
*/
if (found->root == bctx->sctx->send_root) {
/*
- * TODO for the moment we don't accept clones from the inode
- * that is currently send. We may change this when
- * BTRFS_IOC_CLONE_RANGE supports cloning from and to the same
- * file.
+ * If the source inode was not yet processed we can't issue a
+ * clone operation, as the source extent does not exist yet at
+ * the destination of the stream.
*/
- if (ino >= bctx->cur_objectid)
+ if (ino > bctx->cur_objectid)
+ return 0;
+ /*
+ * We clone from the inode currently being sent as long as the
+ * source extent is already processed, otherwise we could try
+ * to clone from an extent that does not exist yet at the
+ * destination of the stream.
+ */
+ if (ino == bctx->cur_objectid &&
+ offset + bctx->extent_len >
+ bctx->sctx->cur_inode_next_write_offset)
return 0;
}
@@ -4536,6 +4546,10 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key,
struct fs_path *p;
struct posix_acl_xattr_header dummy_acl;
+ /* Capabilities are emitted by finish_inode_if_needed */
+ if (!strncmp(name, XATTR_NAME_CAPS, name_len))
+ return 0;
+
p = fs_path_alloc();
if (!p)
return -ENOMEM;
@@ -5098,6 +5112,64 @@ static int send_extent_data(struct send_ctx *sctx,
return 0;
}
+/*
+ * Search for a capability xattr related to sctx->cur_ino. If the capability is
+ * found, call send_set_xattr function to emit it.
+ *
+ * Return 0 if there isn't a capability, or when the capability was emitted
+ * successfully, or < 0 if an error occurred.
+ */
+static int send_capabilities(struct send_ctx *sctx)
+{
+ struct fs_path *fspath = NULL;
+ struct btrfs_path *path;
+ struct btrfs_dir_item *di;
+ struct extent_buffer *leaf;
+ unsigned long data_ptr;
+ char *buf = NULL;
+ int buf_len;
+ int ret = 0;
+
+ path = alloc_path_for_send();
+ if (!path)
+ return -ENOMEM;
+
+ di = btrfs_lookup_xattr(NULL, sctx->send_root, path, sctx->cur_ino,
+ XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), 0);
+ if (!di) {
+ /* There is no xattr for this inode */
+ goto out;
+ } else if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
+ goto out;
+ }
+
+ leaf = path->nodes[0];
+ buf_len = btrfs_dir_data_len(leaf, di);
+
+ fspath = fs_path_alloc();
+ buf = kmalloc(buf_len, GFP_KERNEL);
+ if (!fspath || !buf) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath);
+ if (ret < 0)
+ goto out;
+
+ data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di);
+ read_extent_buffer(leaf, buf, data_ptr, buf_len);
+
+ ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS,
+ strlen(XATTR_NAME_CAPS), buf, buf_len);
+out:
+ kfree(buf);
+ fs_path_free(fspath);
+ btrfs_free_path(path);
+ return ret;
+}
+
static int clone_range(struct send_ctx *sctx,
struct clone_root *clone_root,
const u64 disk_byte,
@@ -6001,6 +6073,10 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end)
goto out;
}
+ ret = send_capabilities(sctx);
+ if (ret < 0)
+ goto out;
+
/*
* If other directory inodes depended on our current directory
* inode's move/rename, now do their move/rename operations.
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index e8a4b0ebe97f..90500b6c41fc 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -160,10 +160,9 @@ static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global)
return (global->size << 1);
}
-static int can_overcommit(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info, u64 bytes,
- enum btrfs_reserve_flush_enum flush,
- bool system_chunk)
+int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush)
{
u64 profile;
u64 avail;
@@ -174,7 +173,7 @@ static int can_overcommit(struct btrfs_fs_info *fs_info,
if (space_info->flags & BTRFS_BLOCK_GROUP_DATA)
return 0;
- if (system_chunk)
+ if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM)
profile = btrfs_system_alloc_profile(fs_info);
else
profile = btrfs_metadata_alloc_profile(fs_info);
@@ -228,8 +227,8 @@ again:
/* Check and see if our ticket can be satisified now. */
if ((used + ticket->bytes <= space_info->total_bytes) ||
- can_overcommit(fs_info, space_info, ticket->bytes, flush,
- false)) {
+ btrfs_can_overcommit(fs_info, space_info, ticket->bytes,
+ flush)) {
btrfs_space_info_update_bytes_may_use(fs_info,
space_info,
ticket->bytes);
@@ -304,8 +303,8 @@ again:
cache->key.objectid, cache->key.offset,
btrfs_block_group_used(&cache->item), cache->pinned,
cache->reserved, cache->ro ? "[readonly]" : "");
- btrfs_dump_free_space(cache, bytes);
spin_unlock(&cache->lock);
+ btrfs_dump_free_space(cache, bytes);
}
if (++index < BTRFS_NR_RAID_TYPES)
goto again;
@@ -462,6 +461,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
struct reserve_ticket *ticket = NULL;
struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
+ struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv;
struct btrfs_trans_handle *trans;
u64 bytes_needed;
u64 reclaim_bytes = 0;
@@ -524,6 +524,11 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
spin_lock(&delayed_refs_rsv->lock);
reclaim_bytes += delayed_refs_rsv->reserved;
spin_unlock(&delayed_refs_rsv->lock);
+
+ spin_lock(&trans_rsv->lock);
+ reclaim_bytes += trans_rsv->reserved;
+ spin_unlock(&trans_rsv->lock);
+
if (reclaim_bytes >= bytes_needed)
goto commit;
bytes_needed -= reclaim_bytes;
@@ -628,8 +633,7 @@ static void flush_space(struct btrfs_fs_info *fs_info,
static inline u64
btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
- struct btrfs_space_info *space_info,
- bool system_chunk)
+ struct btrfs_space_info *space_info)
{
struct reserve_ticket *ticket;
u64 used;
@@ -644,14 +648,14 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
return to_reclaim;
to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
- if (can_overcommit(fs_info, space_info, to_reclaim,
- BTRFS_RESERVE_FLUSH_ALL, system_chunk))
+ if (btrfs_can_overcommit(fs_info, space_info, to_reclaim,
+ BTRFS_RESERVE_FLUSH_ALL))
return 0;
used = btrfs_space_info_used(space_info, true);
- if (can_overcommit(fs_info, space_info, SZ_1M,
- BTRFS_RESERVE_FLUSH_ALL, system_chunk))
+ if (btrfs_can_overcommit(fs_info, space_info, SZ_1M,
+ BTRFS_RESERVE_FLUSH_ALL))
expected = div_factor_fine(space_info->total_bytes, 95);
else
expected = div_factor_fine(space_info->total_bytes, 90);
@@ -667,7 +671,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info,
static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
- u64 used, bool system_chunk)
+ u64 used)
{
u64 thresh = div_factor_fine(space_info->total_bytes, 98);
@@ -675,14 +679,41 @@ static inline int need_do_async_reclaim(struct btrfs_fs_info *fs_info,
if ((space_info->bytes_used + space_info->bytes_reserved) >= thresh)
return 0;
- if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info,
- system_chunk))
+ if (!btrfs_calc_reclaim_metadata_size(fs_info, space_info))
return 0;
return (used >= thresh && !btrfs_fs_closing(fs_info) &&
!test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state));
}
+static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info,
+ struct reserve_ticket *ticket)
+{
+ struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
+ u64 min_bytes;
+
+ if (global_rsv->space_info != space_info)
+ return false;
+
+ spin_lock(&global_rsv->lock);
+ min_bytes = div_factor(global_rsv->size, 5);
+ if (global_rsv->reserved < min_bytes + ticket->bytes) {
+ spin_unlock(&global_rsv->lock);
+ return false;
+ }
+ global_rsv->reserved -= ticket->bytes;
+ ticket->bytes = 0;
+ list_del_init(&ticket->list);
+ wake_up(&ticket->wait);
+ space_info->tickets_id++;
+ if (global_rsv->reserved < global_rsv->size)
+ global_rsv->full = 0;
+ spin_unlock(&global_rsv->lock);
+
+ return true;
+}
+
/*
* maybe_fail_all_tickets - we've exhausted our flushing, start failing tickets
* @fs_info - fs_info for this fs
@@ -715,6 +746,10 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info,
ticket = list_first_entry(&space_info->tickets,
struct reserve_ticket, list);
+ if (ticket->steal &&
+ steal_from_global_rsv(fs_info, space_info, ticket))
+ return true;
+
/*
* may_commit_transaction will avoid committing the transaction
* if it doesn't feel like the space reclaimed by the commit
@@ -767,8 +802,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
spin_lock(&space_info->lock);
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
- false);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
if (!to_reclaim) {
space_info->flush = 0;
spin_unlock(&space_info->lock);
@@ -787,8 +821,7 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work)
return;
}
to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info,
- space_info,
- false);
+ space_info);
if (last_tickets_id == space_info->tickets_id) {
flush_state++;
} else {
@@ -860,8 +893,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info,
int flush_state;
spin_lock(&space_info->lock);
- to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info,
- false);
+ to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info);
if (!to_reclaim) {
spin_unlock(&space_info->lock);
return;
@@ -934,6 +966,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
switch (flush) {
case BTRFS_RESERVE_FLUSH_ALL:
+ case BTRFS_RESERVE_FLUSH_ALL_STEAL:
wait_reserve_ticket(fs_info, space_info, ticket);
break;
case BTRFS_RESERVE_FLUSH_LIMIT:
@@ -992,8 +1025,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info,
static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info,
u64 orig_bytes,
- enum btrfs_reserve_flush_enum flush,
- bool system_chunk)
+ enum btrfs_reserve_flush_enum flush)
{
struct reserve_ticket ticket;
u64 used;
@@ -1015,8 +1047,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
*/
if (!pending_tickets &&
((used + orig_bytes <= space_info->total_bytes) ||
- can_overcommit(fs_info, space_info, orig_bytes, flush,
- system_chunk))) {
+ btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) {
btrfs_space_info_update_bytes_may_use(fs_info, space_info,
orig_bytes);
ret = 0;
@@ -1033,7 +1064,9 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
ticket.bytes = orig_bytes;
ticket.error = 0;
init_waitqueue_head(&ticket.wait);
- if (flush == BTRFS_RESERVE_FLUSH_ALL) {
+ ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL);
+ if (flush == BTRFS_RESERVE_FLUSH_ALL ||
+ flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) {
list_add_tail(&ticket.list, &space_info->tickets);
if (!space_info->flush) {
space_info->flush = 1;
@@ -1056,8 +1089,7 @@ static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
* the async reclaim as we will panic.
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags) &&
- need_do_async_reclaim(fs_info, space_info,
- used, system_chunk) &&
+ need_do_async_reclaim(fs_info, space_info, used) &&
!work_busy(&fs_info->async_reclaim_work)) {
trace_btrfs_trigger_flush(fs_info, space_info->flags,
orig_bytes, flush, "preempt");
@@ -1094,10 +1126,9 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
int ret;
- bool system_chunk = (root == fs_info->chunk_root);
ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info,
- orig_bytes, flush, system_chunk);
+ orig_bytes, flush);
if (ret == -ENOSPC &&
unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
if (block_rsv != global_rsv &&
diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h
index 8867e84aa33d..b9cffc62cafa 100644
--- a/fs/btrfs/space-info.h
+++ b/fs/btrfs/space-info.h
@@ -72,6 +72,7 @@ struct btrfs_space_info {
struct reserve_ticket {
u64 bytes;
int error;
+ bool steal;
struct list_head list;
wait_queue_head_t wait;
};
@@ -128,6 +129,9 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root,
enum btrfs_reserve_flush_enum flush);
void btrfs_try_granting_tickets(struct btrfs_fs_info *fs_info,
struct btrfs_space_info *space_info);
+int btrfs_can_overcommit(struct btrfs_fs_info *fs_info,
+ struct btrfs_space_info *space_info, u64 bytes,
+ enum btrfs_reserve_flush_enum flush);
static inline void btrfs_space_info_free_bytes_may_use(
struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index aea24202cd35..6a2ae208ff80 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -241,7 +241,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *fs_info = trans->fs_info;
- trans->aborted = errno;
+ WRITE_ONCE(trans->aborted, errno);
/* Nothing used. The other threads that have joined this
* transaction may be able to continue. */
if (!trans->dirty && list_empty(&trans->new_bgs)) {
@@ -435,6 +435,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
char *compress_type;
bool compress_force = false;
enum btrfs_compression_type saved_compress_type;
+ int saved_compress_level;
bool saved_compress_force;
int no_compress = 0;
@@ -517,6 +518,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
info->compress_type : BTRFS_COMPRESS_NONE;
saved_compress_force =
btrfs_test_opt(info, FORCE_COMPRESS);
+ saved_compress_level = info->compress_level;
if (token == Opt_compress ||
token == Opt_compress_force ||
strncmp(args[0].from, "zlib", 4) == 0) {
@@ -542,6 +544,7 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
} else if (strncmp(args[0].from, "lzo", 3) == 0) {
compress_type = "lzo";
info->compress_type = BTRFS_COMPRESS_LZO;
+ info->compress_level = 0;
btrfs_set_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, NODATACOW);
btrfs_clear_opt(info->mount_opt, NODATASUM);
@@ -561,6 +564,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
no_compress = 0;
} else if (strncmp(args[0].from, "no", 2) == 0) {
compress_type = "no";
+ info->compress_level = 0;
+ info->compress_type = 0;
btrfs_clear_opt(info->mount_opt, COMPRESS);
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
compress_force = false;
@@ -581,11 +586,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
*/
btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
}
- if ((btrfs_test_opt(info, COMPRESS) &&
- (info->compress_type != saved_compress_type ||
- compress_force != saved_compress_force)) ||
- (!btrfs_test_opt(info, COMPRESS) &&
- no_compress == 1)) {
+ if (no_compress == 1) {
+ btrfs_info(info, "use no compression");
+ } else if ((info->compress_type != saved_compress_type) ||
+ (compress_force != saved_compress_force) ||
+ (info->compress_level != saved_compress_level)) {
btrfs_info(info, "%s %s compression, level %d",
(compress_force) ? "force" : "use",
compress_type, info->compress_level);
@@ -1005,8 +1010,8 @@ out:
return error;
}
-static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
- u64 subvol_objectid)
+char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
+ u64 subvol_objectid)
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_root *fs_root;
@@ -1287,6 +1292,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
{
struct btrfs_fs_info *info = btrfs_sb(dentry->d_sb);
const char *compress_type;
+ const char *subvol_name;
if (btrfs_test_opt(info, DEGRADED))
seq_puts(seq, ",degraded");
@@ -1371,8 +1377,13 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
seq_puts(seq, ",ref_verify");
seq_printf(seq, ",subvolid=%llu",
BTRFS_I(d_inode(dentry))->root->root_key.objectid);
- seq_puts(seq, ",subvol=");
- seq_dentry(seq, dentry, " \t\n\\");
+ subvol_name = btrfs_get_subvol_name_from_objectid(info,
+ BTRFS_I(d_inode(dentry))->root->root_key.objectid);
+ if (!IS_ERR(subvol_name)) {
+ seq_puts(seq, ",subvol=");
+ seq_escape(seq, subvol_name, " \t\n\\");
+ kfree(subvol_name);
+ }
return 0;
}
@@ -1417,8 +1428,8 @@ static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid,
goto out;
}
}
- subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb),
- subvol_objectid);
+ subvol_name = btrfs_get_subvol_name_from_objectid(
+ btrfs_sb(mnt->mnt_sb), subvol_objectid);
if (IS_ERR(subvol_name)) {
root = ERR_CAST(subvol_name);
subvol_name = NULL;
@@ -1848,6 +1859,12 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
set_bit(BTRFS_FS_OPEN, &fs_info->flags);
}
out:
+ /*
+ * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS,
+ * since the absence of the flag means it can be toggled off by remount.
+ */
+ *flags |= SB_I_VERSION;
+
wake_up_process(fs_info->transaction_kthread);
btrfs_remount_cleanup(fs_info, old_opts);
return 0;
@@ -2254,9 +2271,7 @@ static int btrfs_unfreeze(struct super_block *sb)
static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
{
struct btrfs_fs_info *fs_info = btrfs_sb(root->d_sb);
- struct btrfs_fs_devices *cur_devices;
struct btrfs_device *dev, *first_dev = NULL;
- struct list_head *head;
/*
* Lightweight locking of the devices. We should not need
@@ -2266,18 +2281,13 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
* least until the rcu_read_unlock.
*/
rcu_read_lock();
- cur_devices = fs_info->fs_devices;
- while (cur_devices) {
- head = &cur_devices->devices;
- list_for_each_entry_rcu(dev, head, dev_list) {
- if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
- continue;
- if (!dev->name)
- continue;
- if (!first_dev || dev->devid < first_dev->devid)
- first_dev = dev;
- }
- cur_devices = cur_devices->seed;
+ list_for_each_entry_rcu(dev, &fs_info->fs_devices->devices, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state))
+ continue;
+ if (!dev->name)
+ continue;
+ if (!first_dev || dev->devid < first_dev->devid)
+ first_dev = dev;
}
if (first_dev)
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index f6d3c80f2e28..5c299e1f2297 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -975,7 +975,9 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
{
int error = 0;
struct btrfs_device *dev;
+ unsigned int nofs_flag;
+ nofs_flag = memalloc_nofs_save();
list_for_each_entry(dev, &fs_devices->devices, dev_list) {
struct hd_struct *disk;
struct kobject *disk_kobj;
@@ -994,6 +996,7 @@ int btrfs_sysfs_add_device_link(struct btrfs_fs_devices *fs_devices,
if (error)
break;
}
+ memalloc_nofs_restore(nofs_flag);
return error;
}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index cdca0f656594..c346ee7ec18d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -21,6 +21,7 @@
#include "dev-replace.h"
#include "qgroup.h"
#include "block-group.h"
+#include "space-info.h"
#define BTRFS_ROOT_TRANS_TAG 0
@@ -173,7 +174,7 @@ loop:
cur_trans = fs_info->running_transaction;
if (cur_trans) {
- if (cur_trans->aborted) {
+ if (TRANS_ABORTED(cur_trans)) {
spin_unlock(&fs_info->trans_lock);
return cur_trans->aborted;
}
@@ -389,7 +390,7 @@ static inline int is_transaction_blocked(struct btrfs_transaction *trans)
{
return (trans->state >= TRANS_STATE_BLOCKED &&
trans->state < TRANS_STATE_UNBLOCKED &&
- !trans->aborted);
+ !TRANS_ABORTED(trans));
}
/* wait for commit against the current transaction to become unblocked
@@ -408,7 +409,7 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info)
wait_event(fs_info->transaction_wait,
cur_trans->state >= TRANS_STATE_UNBLOCKED ||
- cur_trans->aborted);
+ TRANS_ABORTED(cur_trans));
btrfs_put_transaction(cur_trans);
} else {
spin_unlock(&fs_info->trans_lock);
@@ -451,6 +452,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
u64 num_bytes = 0;
u64 qgroup_reserved = 0;
bool reloc_reserved = false;
+ bool do_chunk_alloc = false;
int ret;
/* Send isn't supposed to start transactions. */
@@ -491,7 +493,8 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
* refill that amount for whatever is missing in the reserve.
*/
num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
- if (delayed_refs_rsv->full == 0) {
+ if (flush == BTRFS_RESERVE_FLUSH_ALL &&
+ delayed_refs_rsv->full == 0) {
delayed_refs_bytes = num_bytes;
num_bytes <<= 1;
}
@@ -512,6 +515,9 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
delayed_refs_bytes);
num_bytes -= delayed_refs_bytes;
}
+
+ if (rsv->space_info->force_alloc)
+ do_chunk_alloc = true;
} else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
!delayed_refs_rsv->full) {
/*
@@ -594,6 +600,19 @@ got_it:
current->journal_info = h;
/*
+ * If the space_info is marked ALLOC_FORCE then we'll get upgraded to
+ * ALLOC_FORCE the first run through, and then we won't allocate for
+ * anybody else who races in later. We don't care about the return
+ * value here.
+ */
+ if (do_chunk_alloc && num_bytes) {
+ u64 flags = h->block_rsv->space_info->flags;
+
+ btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags),
+ CHUNK_ALLOC_NO_FORCE);
+ }
+
+ /*
* btrfs_record_root_in_trans() needs to alloc new extents, and may
* call btrfs_join_transaction() while we're also starting a
* transaction.
@@ -627,43 +646,10 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
struct btrfs_root *root,
- unsigned int num_items,
- int min_factor)
+ unsigned int num_items)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct btrfs_trans_handle *trans;
- u64 num_bytes;
- int ret;
-
- /*
- * We have two callers: unlink and block group removal. The
- * former should succeed even if we will temporarily exceed
- * quota and the latter operates on the extent root so
- * qgroup enforcement is ignored anyway.
- */
- trans = start_transaction(root, num_items, TRANS_START,
- BTRFS_RESERVE_FLUSH_ALL, false);
- if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
- return trans;
-
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return trans;
-
- num_bytes = btrfs_calc_insert_metadata_size(fs_info, num_items);
- ret = btrfs_cond_migrate_bytes(fs_info, &fs_info->trans_block_rsv,
- num_bytes, min_factor);
- if (ret) {
- btrfs_end_transaction(trans);
- return ERR_PTR(ret);
- }
-
- trans->block_rsv = &fs_info->trans_block_rsv;
- trans->bytes_reserved = num_bytes;
- trace_btrfs_space_reservation(fs_info, "transaction",
- trans->transid, num_bytes, 1);
-
- return trans;
+ return start_transaction(root, num_items, TRANS_START,
+ BTRFS_RESERVE_FLUSH_ALL_STEAL, false);
}
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
@@ -884,10 +870,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
if (throttle)
btrfs_run_delayed_iputs(info);
- if (trans->aborted ||
+ if (TRANS_ABORTED(trans) ||
test_bit(BTRFS_FS_STATE_ERROR, &info->fs_state)) {
wake_up_process(info->transaction_kthread);
- err = -EIO;
+ if (TRANS_ABORTED(trans))
+ err = trans->aborted;
+ else
+ err = -EROFS;
}
kmem_cache_free(btrfs_trans_handle_cachep, trans);
@@ -1741,7 +1730,8 @@ static void wait_current_trans_commit_start(struct btrfs_fs_info *fs_info,
struct btrfs_transaction *trans)
{
wait_event(fs_info->transaction_blocked_wait,
- trans->state >= TRANS_STATE_COMMIT_START || trans->aborted);
+ trans->state >= TRANS_STATE_COMMIT_START ||
+ TRANS_ABORTED(trans));
}
/*
@@ -1753,7 +1743,8 @@ static void wait_current_trans_commit_start_and_unblock(
struct btrfs_transaction *trans)
{
wait_event(fs_info->transaction_wait,
- trans->state >= TRANS_STATE_UNBLOCKED || trans->aborted);
+ trans->state >= TRANS_STATE_UNBLOCKED ||
+ TRANS_ABORTED(trans));
}
/*
@@ -1971,7 +1962,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
trans->dirty = true;
/* Stop the commit early if ->aborted is set */
- if (unlikely(READ_ONCE(cur_trans->aborted))) {
+ if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
btrfs_end_transaction(trans);
return ret;
@@ -2045,7 +2036,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wait_for_commit(cur_trans);
- if (unlikely(cur_trans->aborted))
+ if (TRANS_ABORTED(cur_trans))
ret = cur_trans->aborted;
btrfs_put_transaction(cur_trans);
@@ -2064,7 +2055,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
spin_unlock(&fs_info->trans_lock);
wait_for_commit(prev_trans);
- ret = prev_trans->aborted;
+ ret = READ_ONCE(prev_trans->aborted);
btrfs_put_transaction(prev_trans);
if (ret)
@@ -2118,8 +2109,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
wait_event(cur_trans->writer_wait,
atomic_read(&cur_trans->num_writers) == 1);
- /* ->aborted might be set after the previous check, so check it */
- if (unlikely(READ_ONCE(cur_trans->aborted))) {
+ if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
goto scrub_continue;
}
@@ -2237,7 +2227,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
* The tasks which save the space cache and inode cache may also
* update ->aborted, check it.
*/
- if (unlikely(READ_ONCE(cur_trans->aborted))) {
+ if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
mutex_unlock(&fs_info->tree_log_mutex);
mutex_unlock(&fs_info->reloc_mutex);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 2c5a6f6e5bb0..7291a2a93075 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -116,6 +116,10 @@ struct btrfs_trans_handle {
struct btrfs_block_rsv *orig_rsv;
refcount_t use_count;
unsigned int type;
+ /*
+ * Error code of transaction abort, set outside of locks and must use
+ * the READ_ONCE/WRITE_ONCE access
+ */
short aborted;
bool adding_csums;
bool allocating_chunk;
@@ -127,6 +131,14 @@ struct btrfs_trans_handle {
struct list_head new_bgs;
};
+/*
+ * The abort status can be changed between calls and is not protected by locks.
+ * This accepts btrfs_transaction and btrfs_trans_handle as types. Once it's
+ * set to a non-zero value it does not change, so the macro should be in checks
+ * but is not necessary for further reads of the value.
+ */
+#define TRANS_ABORTED(trans) (unlikely(READ_ONCE((trans)->aborted)))
+
struct btrfs_pending_snapshot {
struct dentry *dentry;
struct inode *dir;
@@ -181,8 +193,7 @@ struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root,
unsigned int num_items);
struct btrfs_trans_handle *btrfs_start_transaction_fallback_global_rsv(
struct btrfs_root *root,
- unsigned int num_items,
- int min_factor);
+ unsigned int num_items);
struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root);
struct btrfs_trans_handle *btrfs_join_transaction_nostart(struct btrfs_root *root);
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index 0e44db066641..84b8d6ebf98f 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -674,6 +674,44 @@ int btrfs_check_chunk_valid(struct extent_buffer *leaf,
return 0;
}
+/*
+ * Enhanced version of chunk item checker.
+ *
+ * The common btrfs_check_chunk_valid() doesn't check item size since it needs
+ * to work on super block sys_chunk_array which doesn't have full item ptr.
+ */
+static int check_leaf_chunk_item(struct extent_buffer *leaf,
+ struct btrfs_chunk *chunk,
+ struct btrfs_key *key, int slot)
+{
+ int num_stripes;
+
+ if (btrfs_item_size_nr(leaf, slot) < sizeof(struct btrfs_chunk)) {
+ chunk_err(leaf, chunk, key->offset,
+ "invalid chunk item size: have %u expect [%zu, %u)",
+ btrfs_item_size_nr(leaf, slot),
+ sizeof(struct btrfs_chunk),
+ BTRFS_LEAF_DATA_SIZE(leaf->fs_info));
+ return -EUCLEAN;
+ }
+
+ num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+ /* Let btrfs_check_chunk_valid() handle this error type */
+ if (num_stripes == 0)
+ goto out;
+
+ if (btrfs_chunk_item_size(num_stripes) !=
+ btrfs_item_size_nr(leaf, slot)) {
+ chunk_err(leaf, chunk, key->offset,
+ "invalid chunk item size: have %u expect %lu",
+ btrfs_item_size_nr(leaf, slot),
+ btrfs_chunk_item_size(num_stripes));
+ return -EUCLEAN;
+ }
+out:
+ return btrfs_check_chunk_valid(leaf, chunk, key->offset);
+}
+
__printf(3, 4)
__cold
static void dev_item_err(const struct extent_buffer *eb, int slot,
@@ -772,7 +810,7 @@ static int check_inode_item(struct extent_buffer *leaf,
/* Here we use super block generation + 1 to handle log tree */
if (btrfs_inode_generation(leaf, iitem) > super_gen + 1) {
inode_item_err(fs_info, leaf, slot,
- "invalid inode generation: has %llu expect (0, %llu]",
+ "invalid inode transid: has %llu expect [0, %llu]",
btrfs_inode_generation(leaf, iitem),
super_gen + 1);
return -EUCLEAN;
@@ -1265,7 +1303,7 @@ static int check_leaf_item(struct extent_buffer *leaf,
break;
case BTRFS_CHUNK_ITEM_KEY:
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
- ret = btrfs_check_chunk_valid(leaf, chunk, key->offset);
+ ret = check_leaf_chunk_item(leaf, chunk, key, slot);
break;
case BTRFS_DEV_ITEM_KEY:
ret = check_dev_item(leaf, key, slot);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7d464b049507..7042b84edc89 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -167,6 +167,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
if (ret)
goto out;
+ set_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
clear_bit(BTRFS_ROOT_MULTI_LOG_TASKS, &root->state);
root->log_start_pid = current->pid;
}
@@ -193,6 +194,9 @@ static int join_running_log_trans(struct btrfs_root *root)
{
int ret = -ENOENT;
+ if (!test_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state))
+ return ret;
+
mutex_lock(&root->log_mutex);
if (root->log_root) {
ret = 0;
@@ -3136,29 +3140,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
btrfs_init_log_ctx(&root_log_ctx, NULL);
mutex_lock(&log_root_tree->log_mutex);
- atomic_inc(&log_root_tree->log_batch);
- atomic_inc(&log_root_tree->log_writers);
index2 = log_root_tree->log_transid % 2;
list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
root_log_ctx.log_transid = log_root_tree->log_transid;
- mutex_unlock(&log_root_tree->log_mutex);
-
- mutex_lock(&log_root_tree->log_mutex);
-
/*
* Now we are safe to update the log_root_tree because we're under the
* log_mutex, and we're a current writer so we're holding the commit
* open until we drop the log_mutex.
*/
ret = update_log_root(trans, log, &new_root_item);
-
- if (atomic_dec_and_test(&log_root_tree->log_writers)) {
- /* atomic_dec_and_test implies a barrier */
- cond_wake_up_nomb(&log_root_tree->log_writer_wait);
- }
-
if (ret) {
if (!list_empty(&root_log_ctx.list))
list_del_init(&root_log_ctx.list);
@@ -3204,8 +3196,6 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
root_log_ctx.log_transid - 1);
}
- wait_for_writer(log_root_tree);
-
/*
* now that we've moved on to the tree of log tree roots,
* check the full commit flag again
@@ -3327,6 +3317,7 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
if (root->log_root) {
free_log_tree(trans, root->log_root);
root->log_root = NULL;
+ clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state);
}
return 0;
}
@@ -3482,11 +3473,13 @@ fail:
btrfs_free_path(path);
out_unlock:
mutex_unlock(&dir->log_mutex);
- if (ret == -ENOSPC) {
+ if (err == -ENOSPC) {
btrfs_set_log_full_commit(trans);
- ret = 0;
- } else if (ret < 0)
- btrfs_abort_transaction(trans, ret);
+ err = 0;
+ } else if (err < 0 && err != -ENOENT) {
+ /* ENOENT can be returned if the entry hasn't been fsynced yet */
+ btrfs_abort_transaction(trans, err);
+ }
btrfs_end_log_trans(root);
@@ -4049,11 +4042,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
fs_info->csum_root,
ds + cs, ds + cs + cl - 1,
&ordered_sums, 0);
- if (ret) {
- btrfs_release_path(dst_path);
- kfree(ins_data);
- return ret;
- }
+ if (ret)
+ break;
}
}
}
@@ -4066,7 +4056,6 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
* we have to do this after the loop above to avoid changing the
* log tree while trying to change the log tree.
*/
- ret = 0;
while (!list_empty(&ordered_sums)) {
struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
struct btrfs_ordered_sum,
@@ -5007,6 +4996,138 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
return ret;
}
+static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode,
+ struct btrfs_key *min_key,
+ const struct btrfs_key *max_key,
+ struct btrfs_path *path,
+ struct btrfs_path *dst_path,
+ const u64 logged_isize,
+ const bool recursive_logging,
+ const int inode_only,
+ struct btrfs_log_ctx *ctx,
+ bool *need_log_inode_item)
+{
+ struct btrfs_root *root = inode->root;
+ int ins_start_slot = 0;
+ int ins_nr = 0;
+ int ret;
+
+ while (1) {
+ ret = btrfs_search_forward(root, min_key, path, trans->transid);
+ if (ret < 0)
+ return ret;
+ if (ret > 0) {
+ ret = 0;
+ break;
+ }
+again:
+ /* Note, ins_nr might be > 0 here, cleanup outside the loop */
+ if (min_key->objectid != max_key->objectid)
+ break;
+ if (min_key->type > max_key->type)
+ break;
+
+ if (min_key->type == BTRFS_INODE_ITEM_KEY)
+ *need_log_inode_item = false;
+
+ if ((min_key->type == BTRFS_INODE_REF_KEY ||
+ min_key->type == BTRFS_INODE_EXTREF_KEY) &&
+ inode->generation == trans->transid &&
+ !recursive_logging) {
+ u64 other_ino = 0;
+ u64 other_parent = 0;
+
+ ret = btrfs_check_ref_name_override(path->nodes[0],
+ path->slots[0], min_key, inode,
+ &other_ino, &other_parent);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0 && ctx &&
+ other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
+ if (ins_nr > 0) {
+ ins_nr++;
+ } else {
+ ins_nr = 1;
+ ins_start_slot = path->slots[0];
+ }
+ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot, ins_nr,
+ inode_only, logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+
+ ret = log_conflicting_inodes(trans, root, path,
+ ctx, other_ino, other_parent);
+ if (ret)
+ return ret;
+ btrfs_release_path(path);
+ goto next_key;
+ }
+ }
+
+ /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
+ if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ if (ins_nr == 0)
+ goto next_slot;
+ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ goto next_slot;
+ }
+
+ if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
+ ins_nr++;
+ goto next_slot;
+ } else if (!ins_nr) {
+ ins_start_slot = path->slots[0];
+ ins_nr = 1;
+ goto next_slot;
+ }
+
+ ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 1;
+ ins_start_slot = path->slots[0];
+next_slot:
+ path->slots[0]++;
+ if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+ btrfs_item_key_to_cpu(path->nodes[0], min_key,
+ path->slots[0]);
+ goto again;
+ }
+ if (ins_nr) {
+ ret = copy_items(trans, inode, dst_path, path,
+ ins_start_slot, ins_nr, inode_only,
+ logged_isize);
+ if (ret < 0)
+ return ret;
+ ins_nr = 0;
+ }
+ btrfs_release_path(path);
+next_key:
+ if (min_key->offset < (u64)-1) {
+ min_key->offset++;
+ } else if (min_key->type < max_key->type) {
+ min_key->type++;
+ min_key->offset = 0;
+ } else {
+ break;
+ }
+ }
+ if (ins_nr)
+ ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
+ ins_nr, inode_only, logged_isize);
+
+ return ret;
+}
+
/* log a single inode in the tree log.
* At least one parent directory for this inode must exist in the tree
* or be logged already.
@@ -5028,17 +5149,13 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
const loff_t end,
struct btrfs_log_ctx *ctx)
{
- struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_path *path;
struct btrfs_path *dst_path;
struct btrfs_key min_key;
struct btrfs_key max_key;
struct btrfs_root *log = root->log_root;
int err = 0;
- int ret;
- int nritems;
- int ins_start_slot = 0;
- int ins_nr;
+ int ret = 0;
bool fast_search = false;
u64 ino = btrfs_ino(inode);
struct extent_map_tree *em_tree = &inode->extent_tree;
@@ -5074,15 +5191,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
max_key.offset = (u64)-1;
/*
- * Only run delayed items if we are a dir or a new file.
- * Otherwise commit the delayed inode only, which is needed in
- * order for the log replay code to mark inodes for link count
- * fixup (create temporary BTRFS_TREE_LOG_FIXUP_OBJECTID items).
+ * Only run delayed items if we are a directory. We want to make sure
+ * all directory indexes hit the fs/subvolume tree so we can find them
+ * and figure out which index ranges have to be logged.
+ *
+ * Otherwise commit the delayed inode only if the full sync flag is set,
+ * as we want to make sure an up to date version is in the subvolume
+ * tree so copy_inode_items_to_log() / copy_items() can find it and copy
+ * it to the log tree. For a non full sync, we always log the inode item
+ * based on the in-memory struct btrfs_inode which is always up to date.
*/
- if (S_ISDIR(inode->vfs_inode.i_mode) ||
- inode->generation > fs_info->last_trans_committed)
+ if (S_ISDIR(inode->vfs_inode.i_mode))
ret = btrfs_commit_inode_delayed_items(trans, inode);
- else
+ else if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
ret = btrfs_commit_inode_delayed_inode(inode);
if (ret) {
@@ -5169,139 +5290,12 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
goto out_unlock;
}
- while (1) {
- ins_nr = 0;
- ret = btrfs_search_forward(root, &min_key,
- path, trans->transid);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- if (ret != 0)
- break;
-again:
- /* note, ins_nr might be > 0 here, cleanup outside the loop */
- if (min_key.objectid != ino)
- break;
- if (min_key.type > max_key.type)
- break;
-
- if (min_key.type == BTRFS_INODE_ITEM_KEY)
- need_log_inode_item = false;
-
- if ((min_key.type == BTRFS_INODE_REF_KEY ||
- min_key.type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid &&
- !recursive_logging) {
- u64 other_ino = 0;
- u64 other_parent = 0;
-
- ret = btrfs_check_ref_name_override(path->nodes[0],
- path->slots[0], &min_key, inode,
- &other_ino, &other_parent);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- } else if (ret > 0 && ctx &&
- other_ino != btrfs_ino(BTRFS_I(ctx->inode))) {
- if (ins_nr > 0) {
- ins_nr++;
- } else {
- ins_nr = 1;
- ins_start_slot = path->slots[0];
- }
- ret = copy_items(trans, inode, dst_path, path,
- ins_start_slot,
- ins_nr, inode_only,
- logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ins_nr = 0;
-
- err = log_conflicting_inodes(trans, root, path,
- ctx, other_ino, other_parent);
- if (err)
- goto out_unlock;
- btrfs_release_path(path);
- goto next_key;
- }
- }
-
- /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
- if (min_key.type == BTRFS_XATTR_ITEM_KEY) {
- if (ins_nr == 0)
- goto next_slot;
- ret = copy_items(trans, inode, dst_path, path,
- ins_start_slot,
- ins_nr, inode_only, logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ins_nr = 0;
- goto next_slot;
- }
-
- if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
- ins_nr++;
- goto next_slot;
- } else if (!ins_nr) {
- ins_start_slot = path->slots[0];
- ins_nr = 1;
- goto next_slot;
- }
-
- ret = copy_items(trans, inode, dst_path, path,
- ins_start_slot, ins_nr, inode_only,
- logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ins_nr = 1;
- ins_start_slot = path->slots[0];
-next_slot:
-
- nritems = btrfs_header_nritems(path->nodes[0]);
- path->slots[0]++;
- if (path->slots[0] < nritems) {
- btrfs_item_key_to_cpu(path->nodes[0], &min_key,
- path->slots[0]);
- goto again;
- }
- if (ins_nr) {
- ret = copy_items(trans, inode, dst_path, path,
- ins_start_slot,
- ins_nr, inode_only, logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ins_nr = 0;
- }
- btrfs_release_path(path);
-next_key:
- if (min_key.offset < (u64)-1) {
- min_key.offset++;
- } else if (min_key.type < max_key.type) {
- min_key.type++;
- min_key.offset = 0;
- } else {
- break;
- }
- }
- if (ins_nr) {
- ret = copy_items(trans, inode, dst_path, path,
- ins_start_slot, ins_nr, inode_only,
- logged_isize);
- if (ret < 0) {
- err = ret;
- goto out_unlock;
- }
- ins_nr = 0;
- }
+ err = copy_inode_items_to_log(trans, inode, &min_key, &max_key,
+ path, dst_path, logged_isize,
+ recursive_logging, inode_only, ctx,
+ &need_log_inode_item);
+ if (err)
+ goto out_unlock;
btrfs_release_path(path);
btrfs_release_path(dst_path);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 3e64f49c394b..e798caee978e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4,6 +4,7 @@
*/
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/buffer_head.h>
@@ -219,7 +220,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
*
* global::fs_devs - add, remove, updates to the global list
*
- * does not protect: manipulation of the fs_devices::devices list!
+ * does not protect: manipulation of the fs_devices::devices list in general
+ * but in mount context it could be used to exclude list modifications by eg.
+ * scan ioctl
*
* btrfs_device::name - renames (write side), read is RCU
*
@@ -232,6 +235,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
* may be used to exclude some operations from running concurrently without any
* modifications to the list (see write_all_supers)
*
+ * Is not required at mount and close times, because our device list is
+ * protected by the uuid_mutex at that point.
+ *
* balance_mutex
* -------------
* protects balance structures (status, state) and context accessed from
@@ -778,6 +784,11 @@ static int btrfs_free_stale_devices(const char *path,
return ret;
}
+/*
+ * This is only used on mount, and we are protected from competing things
+ * messing with our fs_devices by the uuid_mutex, thus we do not need the
+ * fs_devices->device_list_mutex here.
+ */
static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
struct btrfs_device *device, fmode_t flags,
void *holder)
@@ -1223,6 +1234,8 @@ again:
&device->dev_state)) {
if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
&device->dev_state) &&
+ !test_bit(BTRFS_DEV_STATE_MISSING,
+ &device->dev_state) &&
(!latest_dev ||
device->generation > latest_dev->generation)) {
latest_dev = device;
@@ -1416,8 +1429,14 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
int ret;
lockdep_assert_held(&uuid_mutex);
+ /*
+ * The device_list_mutex cannot be taken here in case opening the
+ * underlying device takes further locks like bd_mutex.
+ *
+ * We also don't need the lock here as this is called during mount and
+ * exclusion is provided by uuid_mutex
+ */
- mutex_lock(&fs_devices->device_list_mutex);
if (fs_devices->opened) {
fs_devices->opened++;
ret = 0;
@@ -1425,7 +1444,6 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
list_sort(NULL, &fs_devices->devices, devid_cmp);
ret = open_fs_devices(fs_devices, flags, holder);
}
- mutex_unlock(&fs_devices->device_list_mutex);
return ret;
}
@@ -2769,8 +2787,18 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path
ret = btrfs_commit_transaction(trans);
}
- /* Update ctime/mtime for libblkid */
+ /*
+ * Now that we have written a new super block to this device, check all
+ * other fs_devices list if device_path alienates any other scanned
+ * device.
+ * We can ignore the return value as it typically returns -EINVAL and
+ * only succeeds if the device was an alien.
+ */
+ btrfs_forget_devices(device_path);
+
+ /* Update ctime/mtime for blkid or udev */
update_dev_time(device_path);
+
return ret;
error_sysfs:
@@ -3271,7 +3299,7 @@ static int insert_balance_item(struct btrfs_fs_info *fs_info,
if (!path)
return -ENOMEM;
- trans = btrfs_start_transaction(root, 0);
+ trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
if (IS_ERR(trans)) {
btrfs_free_path(path);
return PTR_ERR(trans);
@@ -4234,7 +4262,22 @@ int btrfs_balance(struct btrfs_fs_info *fs_info,
mutex_lock(&fs_info->balance_mutex);
if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
btrfs_info(fs_info, "balance: paused");
- else if (ret == -ECANCELED && atomic_read(&fs_info->balance_cancel_req))
+ /*
+ * Balance can be canceled by:
+ *
+ * - Regular cancel request
+ * Then ret == -ECANCELED and balance_cancel_req > 0
+ *
+ * - Fatal signal to "btrfs" process
+ * Either the signal caught by wait_reserve_ticket() and callers
+ * got -EINTR, or caught by btrfs_should_cancel_balance() and
+ * got -ECANCELED.
+ * Either way, in this case balance_cancel_req = 0, and
+ * ret == -EINTR or ret == -ECANCELED.
+ *
+ * So here we only check the return value to catch canceled balance.
+ */
+ else if (ret == -ECANCELED || ret == -EINTR)
btrfs_info(fs_info, "balance: canceled");
else
btrfs_info(fs_info, "balance: ended with status: %d", ret);
@@ -4526,6 +4569,7 @@ static int btrfs_uuid_scan_kthread(void *data)
goto skip;
}
update_tree:
+ btrfs_release_path(path);
if (!btrfs_is_empty_uuid(root_item.uuid)) {
ret = btrfs_uuid_tree_add(trans, root_item.uuid,
BTRFS_UUID_KEY_SUBVOL,
@@ -4550,6 +4594,7 @@ update_tree:
}
skip:
+ btrfs_release_path(path);
if (trans) {
ret = btrfs_end_transaction(trans);
trans = NULL;
@@ -4557,7 +4602,6 @@ skip:
break;
}
- btrfs_release_path(path);
if (key.offset < (u64)-1) {
key.offset++;
} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
@@ -5632,12 +5676,13 @@ void btrfs_put_bbio(struct btrfs_bio *bbio)
* replace.
*/
static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
- u64 logical, u64 length,
+ u64 logical, u64 *length_ret,
struct btrfs_bio **bbio_ret)
{
struct extent_map *em;
struct map_lookup *map;
struct btrfs_bio *bbio;
+ u64 length = *length_ret;
u64 offset;
u64 stripe_nr;
u64 stripe_nr_end;
@@ -5670,7 +5715,8 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
}
offset = logical - em->start;
- length = min_t(u64, em->len - offset, length);
+ length = min_t(u64, em->start + em->len - logical, length);
+ *length_ret = length;
stripe_len = map->stripe_len;
/*
@@ -6085,7 +6131,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
if (op == BTRFS_MAP_DISCARD)
return __btrfs_map_block_for_discard(fs_info, logical,
- *length, bbio_ret);
+ length, bbio_ret);
ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
if (ret < 0)
@@ -6665,8 +6711,17 @@ static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
u64 devid, u8 *dev_uuid)
{
struct btrfs_device *device;
+ unsigned int nofs_flag;
+ /*
+ * We call this under the chunk_mutex, so we want to use NOFS for this
+ * allocation, however we don't want to change btrfs_alloc_device() to
+ * always do NOFS because we use it in a lot of other GFP_KERNEL safe
+ * places.
+ */
+ nofs_flag = memalloc_nofs_save();
device = btrfs_alloc_device(NULL, &devid, dev_uuid);
+ memalloc_nofs_restore(nofs_flag);
if (IS_ERR(device))
return device;
@@ -7255,7 +7310,14 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* otherwise we don't need it.
*/
mutex_lock(&uuid_mutex);
- mutex_lock(&fs_info->chunk_mutex);
+
+ /*
+ * It is possible for mount and umount to race in such a way that
+ * we execute this code path, but open_fs_devices failed to clear
+ * total_rw_bytes. We certainly want it cleared before reading the
+ * device items, so clear it here.
+ */
+ fs_info->fs_devices->total_rw_bytes = 0;
/*
* Read all device items, and then all the chunk items. All
@@ -7292,7 +7354,9 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
struct btrfs_chunk *chunk;
chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+ mutex_lock(&fs_info->chunk_mutex);
ret = read_one_chunk(&found_key, leaf, chunk);
+ mutex_unlock(&fs_info->chunk_mutex);
if (ret)
goto error;
}
@@ -7322,7 +7386,6 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
}
ret = 0;
error:
- mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&uuid_mutex);
btrfs_free_path(path);