summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/backref.c2
-rw-r--r--fs/btrfs/compression.c31
-rw-r--r--fs/btrfs/compression.h3
-rw-r--r--fs/btrfs/ctree.c9
-rw-r--r--fs/btrfs/delayed-inode.c13
-rw-r--r--fs/btrfs/delayed-ref.c3
-rw-r--r--fs/btrfs/dev-replace.c29
-rw-r--r--fs/btrfs/extent-tree.c9
-rw-r--r--fs/btrfs/file.c7
-rw-r--r--fs/btrfs/free-space-cache.c6
-rw-r--r--fs/btrfs/inode.c27
-rw-r--r--fs/btrfs/props.c6
-rw-r--r--fs/btrfs/qgroup.c45
-rw-r--r--fs/btrfs/reada.c5
-rw-r--r--fs/btrfs/send.c102
-rw-r--r--fs/btrfs/super.c6
-rw-r--r--fs/btrfs/transaction.c10
-rw-r--r--fs/btrfs/tree-log.c84
-rw-r--r--fs/btrfs/volumes.c5
-rw-r--r--fs/btrfs/volumes.h6
20 files changed, 277 insertions, 131 deletions
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index d826fbaf7d50..e4d5e6eae409 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1290,8 +1290,6 @@ again:
ret = -EIO;
goto out;
}
- btrfs_tree_read_lock(eb);
- btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
if (!path->skip_locking) {
btrfs_tree_read_lock(eb);
btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 280384bf34f1..ccd9c709375e 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -43,6 +43,37 @@
#include "extent_io.h"
#include "extent_map.h"
+static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
+
+const char* btrfs_compress_type2str(enum btrfs_compression_type type)
+{
+ switch (type) {
+ case BTRFS_COMPRESS_ZLIB:
+ case BTRFS_COMPRESS_LZO:
+ case BTRFS_COMPRESS_ZSTD:
+ case BTRFS_COMPRESS_NONE:
+ return btrfs_compress_types[type];
+ }
+
+ return NULL;
+}
+
+bool btrfs_compress_is_valid_type(const char *str, size_t len)
+{
+ int i;
+
+ for (i = 1; i < ARRAY_SIZE(btrfs_compress_types); i++) {
+ size_t comp_len = strlen(btrfs_compress_types[i]);
+
+ if (len < comp_len)
+ continue;
+
+ if (!strncmp(btrfs_compress_types[i], str, comp_len))
+ return true;
+ }
+ return false;
+}
+
static int btrfs_decompress_bio(struct compressed_bio *cb);
static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index d2781ff8f994..0b185e277df4 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -130,6 +130,9 @@ extern const struct btrfs_compress_op btrfs_zlib_compress;
extern const struct btrfs_compress_op btrfs_lzo_compress;
extern const struct btrfs_compress_op btrfs_zstd_compress;
+const char* btrfs_compress_type2str(enum btrfs_compression_type type);
+bool btrfs_compress_is_valid_type(const char *str, size_t len);
+
int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end);
#endif
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index d1b9900ebc9b..d2263caff307 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1414,6 +1414,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
struct tree_mod_elem *tm;
struct extent_buffer *eb = NULL;
struct extent_buffer *eb_root;
+ u64 eb_root_owner = 0;
struct extent_buffer *old;
struct tree_mod_root *old_root = NULL;
u64 old_generation = 0;
@@ -1448,6 +1449,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
free_extent_buffer(old);
}
} else if (old_root) {
+ eb_root_owner = btrfs_header_owner(eb_root);
btrfs_tree_read_unlock(eb_root);
free_extent_buffer(eb_root);
eb = alloc_dummy_extent_buffer(fs_info, logical);
@@ -1465,7 +1467,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
if (old_root) {
btrfs_set_header_bytenr(eb, eb->start);
btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
- btrfs_set_header_owner(eb, btrfs_header_owner(eb_root));
+ btrfs_set_header_owner(eb, eb_root_owner);
btrfs_set_header_level(eb, old_root->level);
btrfs_set_header_generation(eb, old_generation);
}
@@ -2986,6 +2988,10 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
again:
b = get_old_root(root, time_seq);
+ if (!b) {
+ ret = -EIO;
+ goto done;
+ }
level = btrfs_header_level(b);
p->locks[level] = BTRFS_READ_LOCK;
@@ -5492,6 +5498,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
advance_left = advance_right = 0;
while (1) {
+ cond_resched();
if (advance_left && !left_end_reached) {
ret = tree_advance(fs_info, left_path, &left_level,
left_root_level,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 04f39111fafb..87414fc9e268 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1975,12 +1975,19 @@ void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
}
inode_id = delayed_nodes[n - 1]->inode_id + 1;
-
- for (i = 0; i < n; i++)
- refcount_inc(&delayed_nodes[i]->refs);
+ for (i = 0; i < n; i++) {
+ /*
+ * Don't increase refs in case the node is dead and
+ * about to be removed from the tree in the loop below
+ */
+ if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
+ delayed_nodes[i] = NULL;
+ }
spin_unlock(&root->inode_lock);
for (i = 0; i < n; i++) {
+ if (!delayed_nodes[i])
+ continue;
__btrfs_kill_delayed_node(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i]);
}
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 93ffa898df6d..d56bd3625468 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -195,8 +195,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
ref->in_tree = 0;
btrfs_put_delayed_ref(ref);
atomic_dec(&delayed_refs->num_entries);
- if (trans->delayed_ref_updates)
- trans->delayed_ref_updates--;
}
static bool merge_ref(struct btrfs_trans_handle *trans,
@@ -458,7 +456,6 @@ add_tail:
if (ref->action == BTRFS_ADD_DELAYED_REF)
list_add_tail(&ref->add_list, &href->ref_add_list);
atomic_inc(&root->num_entries);
- trans->delayed_ref_updates++;
spin_unlock(&href->lock);
return ret;
}
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f86457713e60..f1e9dd246ab0 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -512,18 +512,27 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
}
btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
- trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans)) {
- mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
- return PTR_ERR(trans);
+ while (1) {
+ trans = btrfs_start_transaction(root, 0);
+ if (IS_ERR(trans)) {
+ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+ return PTR_ERR(trans);
+ }
+ ret = btrfs_commit_transaction(trans);
+ WARN_ON(ret);
+ mutex_lock(&uuid_mutex);
+ /* keep away write_all_supers() during the finishing procedure */
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ mutex_lock(&fs_info->chunk_mutex);
+ if (src_device->has_pending_chunks) {
+ mutex_unlock(&root->fs_info->chunk_mutex);
+ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+ mutex_unlock(&uuid_mutex);
+ } else {
+ break;
+ }
}
- ret = btrfs_commit_transaction(trans);
- WARN_ON(ret);
- mutex_lock(&uuid_mutex);
- /* keep away write_all_supers() during the finishing procedure */
- mutex_lock(&fs_info->fs_devices->device_list_mutex);
- mutex_lock(&fs_info->chunk_mutex);
btrfs_dev_replace_lock(dev_replace, 1);
dev_replace->replace_state =
scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 49766721b2b1..fd15f396b3a0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7706,6 +7706,14 @@ search:
*/
if ((flags & extra) && !(block_group->flags & extra))
goto loop;
+
+ /*
+ * This block group has different flags than we want.
+ * It's possible that we have MIXED_GROUP flag but no
+ * block group is mixed. Just skip such block group.
+ */
+ btrfs_release_block_group(block_group, delalloc);
+ continue;
}
have_block_group:
@@ -10247,6 +10255,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info)
btrfs_err(info,
"bg %llu is a mixed block group but filesystem hasn't enabled mixed block groups",
cache->key.objectid);
+ btrfs_put_block_group(cache);
ret = -EINVAL;
goto error;
}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 97958ecaeed9..bf654d48eb46 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1625,6 +1625,7 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file,
break;
}
+ only_release_metadata = false;
sector_offset = pos & (fs_info->sectorsize - 1);
reserve_bytes = round_up(write_bytes + sector_offset,
fs_info->sectorsize);
@@ -1778,7 +1779,6 @@ again:
set_extent_bit(&BTRFS_I(inode)->io_tree, lockstart,
lockend, EXTENT_NORESERVE, NULL,
NULL, GFP_NOFS);
- only_release_metadata = false;
}
btrfs_drop_pages(pages, num_pages);
@@ -2784,6 +2784,11 @@ out_only_mutex:
* for detecting, at fsync time, if the inode isn't yet in the
* log tree or it's there but not up to date.
*/
+ struct timespec now = current_time(inode);
+
+ inode_inc_iversion(inode);
+ inode->i_mtime = now;
+ inode->i_ctime = now;
trans = btrfs_start_transaction(root, 1);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9f31b81a5e27..abeb26d48d0a 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -398,6 +398,12 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, struct inode *inode
if (uptodate && !PageUptodate(page)) {
btrfs_readpage(NULL, page);
lock_page(page);
+ if (page->mapping != inode->i_mapping) {
+ btrfs_err(BTRFS_I(inode)->root->fs_info,
+ "free space cache page truncated");
+ io_ctl_drop_pages(io_ctl);
+ return -EIO;
+ }
if (!PageUptodate(page)) {
btrfs_err(BTRFS_I(inode)->root->fs_info,
"error reading free space cache");
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ea7b65c025c2..739f45b04b52 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -405,10 +405,31 @@ static noinline int add_async_extent(struct async_cow *cow,
return 0;
}
+/*
+ * Check if the inode has flags compatible with compression
+ */
+static inline bool inode_can_compress(struct inode *inode)
+{
+ if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW ||
+ BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
+ return false;
+ return true;
+}
+
+/*
+ * Check if the inode needs to be submitted to compression, based on mount
+ * options, defragmentation, properties or heuristics.
+ */
static inline int inode_need_compress(struct inode *inode, u64 start, u64 end)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ if (!inode_can_compress(inode)) {
+ WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG),
+ KERN_ERR "BTRFS: unexpected compression for ino %llu\n",
+ btrfs_ino(BTRFS_I(inode)));
+ return 0;
+ }
/* force compress */
if (btrfs_test_opt(fs_info, FORCE_COMPRESS))
return 1;
@@ -1626,7 +1647,8 @@ static int run_delalloc_range(void *private_data, struct page *locked_page,
} else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) {
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
- } else if (!inode_need_compress(inode, start, end)) {
+ } else if (!inode_can_compress(inode) ||
+ !inode_need_compress(inode, start, end)) {
ret = cow_file_range(inode, locked_page, start, end, end,
page_started, nr_written, 1, NULL);
} else {
@@ -9817,6 +9839,9 @@ static int btrfs_rename_exchange(struct inode *old_dir,
goto out_notrans;
}
+ if (dest != root)
+ btrfs_record_root_in_trans(trans, dest);
+
/*
* We need to find a free sequence number both in the source and
* in the destination directory for the exchange.
diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c
index 266f9069307b..b9c7d8508e35 100644
--- a/fs/btrfs/props.c
+++ b/fs/btrfs/props.c
@@ -386,11 +386,7 @@ int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
static int prop_compression_validate(const char *value, size_t len)
{
- if (!strncmp("lzo", value, 3))
- return 0;
- else if (!strncmp("zlib", value, 4))
- return 0;
- else if (!strncmp("zstd", value, 4))
+ if (btrfs_compress_is_valid_type(value, len))
return 0;
return -EINVAL;
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index d6d6e9593e39..cb6e8cb0de94 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -722,10 +722,10 @@ out:
return ret;
}
-static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
- struct btrfs_fs_info *fs_info,
- struct btrfs_root *root)
+static int update_qgroup_status_item(struct btrfs_trans_handle *trans)
{
+ struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_root *quota_root = fs_info->quota_root;
struct btrfs_path *path;
struct btrfs_key key;
struct extent_buffer *l;
@@ -741,7 +741,7 @@ static int update_qgroup_status_item(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
- ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+ ret = btrfs_search_slot(trans, quota_root, &key, path, 0, 1);
if (ret > 0)
ret = -ENOENT;
@@ -2110,7 +2110,7 @@ int btrfs_run_qgroups(struct btrfs_trans_handle *trans,
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_ON;
spin_unlock(&fs_info->qgroup_lock);
- ret = update_qgroup_status_item(trans, fs_info, quota_root);
+ ret = update_qgroup_status_item(trans);
if (ret)
fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -2645,9 +2645,6 @@ out:
btrfs_free_path(path);
mutex_lock(&fs_info->qgroup_rescan_lock);
- if (!btrfs_fs_closing(fs_info))
- fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
-
if (err > 0 &&
fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) {
fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
@@ -2663,16 +2660,30 @@ out:
trans = btrfs_start_transaction(fs_info->quota_root, 1);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
+ trans = NULL;
btrfs_err(fs_info,
"fail to start transaction for status update: %d",
err);
- goto done;
}
- ret = update_qgroup_status_item(trans, fs_info, fs_info->quota_root);
- if (ret < 0) {
- err = ret;
- btrfs_err(fs_info, "fail to update qgroup status: %d", err);
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
+ if (!btrfs_fs_closing(fs_info))
+ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN;
+ if (trans) {
+ ret = update_qgroup_status_item(trans);
+ if (ret < 0) {
+ err = ret;
+ btrfs_err(fs_info, "fail to update qgroup status: %d",
+ err);
+ }
}
+ fs_info->qgroup_rescan_running = false;
+ complete_all(&fs_info->qgroup_rescan_completion);
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ if (!trans)
+ return;
+
btrfs_end_transaction(trans);
if (btrfs_fs_closing(fs_info)) {
@@ -2683,12 +2694,6 @@ out:
} else {
btrfs_err(fs_info, "qgroup scan failed with %d", err);
}
-
-done:
- mutex_lock(&fs_info->qgroup_rescan_lock);
- fs_info->qgroup_rescan_running = false;
- mutex_unlock(&fs_info->qgroup_rescan_lock);
- complete_all(&fs_info->qgroup_rescan_completion);
}
/*
@@ -2951,7 +2956,7 @@ static int qgroup_free_reserved_data(struct inode *inode,
* EXTENT_QGROUP_RESERVED, we won't double free.
* So not need to rush.
*/
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree,
+ ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
free_start, free_start + free_len - 1,
EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index ab852b8e3e37..3a4e15b39cc1 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -759,6 +759,7 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
u64 total = 0;
int i;
+again:
do {
enqueued = 0;
mutex_lock(&fs_devices->device_list_mutex);
@@ -770,6 +771,10 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info)
mutex_unlock(&fs_devices->device_list_mutex);
total += enqueued;
} while (enqueued && total < 10000);
+ if (fs_devices->seed) {
+ fs_devices = fs_devices->seed;
+ goto again;
+ }
if (enqueued == 0)
return;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 14c4062a6e58..1211fdcd425d 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -37,6 +37,14 @@
#include "compression.h"
/*
+ * Maximum number of references an extent can have in order for us to attempt to
+ * issue clone operations instead of write operations. This currently exists to
+ * avoid hitting limitations of the backreference walking code (taking a lot of
+ * time and using too much memory for extents with large number of references).
+ */
+#define SEND_MAX_EXTENT_REFS 64
+
+/*
* A fs_path is a helper to dynamically build path names with unknown size.
* It reallocates the internal buffer on demand.
* It allows fast adding of path elements on the right side (normal path) and
@@ -1324,6 +1332,7 @@ static int find_extent_clone(struct send_ctx *sctx,
struct clone_root *cur_clone_root;
struct btrfs_key found_key;
struct btrfs_path *tmp_path;
+ struct btrfs_extent_item *ei;
int compressed;
u32 i;
@@ -1373,7 +1382,6 @@ static int find_extent_clone(struct send_ctx *sctx,
ret = extent_from_logical(fs_info, disk_byte, tmp_path,
&found_key, &flags);
up_read(&fs_info->commit_root_sem);
- btrfs_release_path(tmp_path);
if (ret < 0)
goto out;
@@ -1382,6 +1390,21 @@ static int find_extent_clone(struct send_ctx *sctx,
goto out;
}
+ ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0],
+ struct btrfs_extent_item);
+ /*
+ * Backreference walking (iterate_extent_inodes() below) is currently
+ * too expensive when an extent has a large number of references, both
+ * in time spent and used memory. So for now just fallback to write
+ * operations instead of clone operations when an extent has more than
+ * a certain amount of references.
+ */
+ if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) {
+ ret = -ENOENT;
+ goto out;
+ }
+ btrfs_release_path(tmp_path);
+
/*
* Setup the clone roots.
*/
@@ -6130,68 +6153,21 @@ static int changed_extent(struct send_ctx *sctx,
{
int ret = 0;
- if (sctx->cur_ino != sctx->cmp_key->objectid) {
-
- if (result == BTRFS_COMPARE_TREE_CHANGED) {
- struct extent_buffer *leaf_l;
- struct extent_buffer *leaf_r;
- struct btrfs_file_extent_item *ei_l;
- struct btrfs_file_extent_item *ei_r;
-
- leaf_l = sctx->left_path->nodes[0];
- leaf_r = sctx->right_path->nodes[0];
- ei_l = btrfs_item_ptr(leaf_l,
- sctx->left_path->slots[0],
- struct btrfs_file_extent_item);
- ei_r = btrfs_item_ptr(leaf_r,
- sctx->right_path->slots[0],
- struct btrfs_file_extent_item);
-
- /*
- * We may have found an extent item that has changed
- * only its disk_bytenr field and the corresponding
- * inode item was not updated. This case happens due to
- * very specific timings during relocation when a leaf
- * that contains file extent items is COWed while
- * relocation is ongoing and its in the stage where it
- * updates data pointers. So when this happens we can
- * safely ignore it since we know it's the same extent,
- * but just at different logical and physical locations
- * (when an extent is fully replaced with a new one, we
- * know the generation number must have changed too,
- * since snapshot creation implies committing the current
- * transaction, and the inode item must have been updated
- * as well).
- * This replacement of the disk_bytenr happens at
- * relocation.c:replace_file_extents() through
- * relocation.c:btrfs_reloc_cow_block().
- */
- if (btrfs_file_extent_generation(leaf_l, ei_l) ==
- btrfs_file_extent_generation(leaf_r, ei_r) &&
- btrfs_file_extent_ram_bytes(leaf_l, ei_l) ==
- btrfs_file_extent_ram_bytes(leaf_r, ei_r) &&
- btrfs_file_extent_compression(leaf_l, ei_l) ==
- btrfs_file_extent_compression(leaf_r, ei_r) &&
- btrfs_file_extent_encryption(leaf_l, ei_l) ==
- btrfs_file_extent_encryption(leaf_r, ei_r) &&
- btrfs_file_extent_other_encoding(leaf_l, ei_l) ==
- btrfs_file_extent_other_encoding(leaf_r, ei_r) &&
- btrfs_file_extent_type(leaf_l, ei_l) ==
- btrfs_file_extent_type(leaf_r, ei_r) &&
- btrfs_file_extent_disk_bytenr(leaf_l, ei_l) !=
- btrfs_file_extent_disk_bytenr(leaf_r, ei_r) &&
- btrfs_file_extent_disk_num_bytes(leaf_l, ei_l) ==
- btrfs_file_extent_disk_num_bytes(leaf_r, ei_r) &&
- btrfs_file_extent_offset(leaf_l, ei_l) ==
- btrfs_file_extent_offset(leaf_r, ei_r) &&
- btrfs_file_extent_num_bytes(leaf_l, ei_l) ==
- btrfs_file_extent_num_bytes(leaf_r, ei_r))
- return 0;
- }
-
- inconsistent_snapshot_error(sctx, result, "extent");
- return -EIO;
- }
+ /*
+ * We have found an extent item that changed without the inode item
+ * having changed. This can happen either after relocation (where the
+ * disk_bytenr of an extent item is replaced at
+ * relocation.c:replace_file_extents()) or after deduplication into a
+ * file in both the parent and send snapshots (where an extent item can
+ * get modified or replaced with a new one). Note that deduplication
+ * updates the inode item, but it only changes the iversion (sequence
+ * field in the inode item) of the inode, so if a file is deduplicated
+ * the same amount of times in both the parent and send snapshots, its
+ * iversion becames the same in both snapshots, whence the inode item is
+ * the same on both snapshots.
+ */
+ if (sctx->cur_ino != sctx->cmp_key->objectid)
+ return 0;
if (!sctx->cur_inode_new_gen && !sctx->cur_inode_deleted) {
if (result != BTRFS_COMPARE_TREE_DELETED)
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 49a02bf091ae..204d585e012a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1863,7 +1863,7 @@ restore:
}
/* Used to sort the devices by max_avail(descending sort) */
-static int btrfs_cmp_device_free_bytes(const void *dev_info1,
+static inline int btrfs_cmp_device_free_bytes(const void *dev_info1,
const void *dev_info2)
{
if (((struct btrfs_device_info *)dev_info1)->max_avail >
@@ -1892,8 +1892,8 @@ static inline void btrfs_descending_sort_devices(
* The helper to calc the free space on the devices that can be used to store
* file data.
*/
-static int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
- u64 *free_bytes)
+static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info,
+ u64 *free_bytes)
{
struct btrfs_device_info *devices_info;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 73c1fbca0c35..fa8f56e6f665 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -2052,6 +2052,16 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
}
} else {
spin_unlock(&fs_info->trans_lock);
+ /*
+ * The previous transaction was aborted and was already removed
+ * from the list of transactions at fs_info->trans_list. So we
+ * abort to prevent writing a new superblock that reflects a
+ * corrupt state (pointing to trees with unwritten nodes/leafs).
+ */
+ if (test_bit(BTRFS_FS_STATE_TRANS_ABORTED, &fs_info->fs_state)) {
+ ret = -EROFS;
+ goto cleanup_transaction;
+ }
}
extwriter_counter_dec(cur_trans, trans->type);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8ac6a64d0422..e35301e5fe8e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2729,7 +2729,8 @@ out:
* in the tree of log roots
*/
static int update_log_root(struct btrfs_trans_handle *trans,
- struct btrfs_root *log)
+ struct btrfs_root *log,
+ struct btrfs_root_item *root_item)
{
struct btrfs_fs_info *fs_info = log->fs_info;
int ret;
@@ -2737,10 +2738,10 @@ static int update_log_root(struct btrfs_trans_handle *trans,
if (log->log_transid == 1) {
/* insert root item on the first sync */
ret = btrfs_insert_root(trans, fs_info->log_root_tree,
- &log->root_key, &log->root_item);
+ &log->root_key, root_item);
} else {
ret = btrfs_update_root(trans, fs_info->log_root_tree,
- &log->root_key, &log->root_item);
+ &log->root_key, root_item);
}
return ret;
}
@@ -2836,6 +2837,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_root *log = root->log_root;
struct btrfs_root *log_root_tree = fs_info->log_root_tree;
+ struct btrfs_root_item new_root_item;
int log_transid = 0;
struct btrfs_log_ctx root_log_ctx;
struct blk_plug plug;
@@ -2901,18 +2903,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
goto out;
}
+ /*
+ * We _must_ update under the root->log_mutex in order to make sure we
+ * have a consistent view of the log root we are trying to commit at
+ * this moment.
+ *
+ * We _must_ copy this into a local copy, because we are not holding the
+ * log_root_tree->log_mutex yet. This is important because when we
+ * commit the log_root_tree we must have a consistent view of the
+ * log_root_tree when we update the super block to point at the
+ * log_root_tree bytenr. If we update the log_root_tree here we'll race
+ * with the commit and possibly point at the new block which we may not
+ * have written out.
+ */
btrfs_set_root_node(&log->root_item, log->node);
+ memcpy(&new_root_item, &log->root_item, sizeof(new_root_item));
root->log_transid++;
log->log_transid = root->log_transid;
root->log_start_pid = 0;
/*
- * Update or create log root item under the root's log_mutex to prevent
- * races with concurrent log syncs that can lead to failure to update
- * log root item because it was not created yet.
- */
- ret = update_log_root(trans, log);
- /*
* IO has been started, blocks of the log tree have WRITTEN flag set
* in their headers. new modifications of the log will be written to
* new positions. so it's safe to allow log writers to go in.
@@ -2932,6 +2942,14 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_unlock(&log_root_tree->log_mutex);
mutex_lock(&log_root_tree->log_mutex);
+
+ /*
+ * Now we are safe to update the log_root_tree because we're under the
+ * log_mutex, and we're a current writer so we're holding the commit
+ * open until we drop the log_mutex.
+ */
+ ret = update_log_root(trans, log, &new_root_item);
+
if (atomic_dec_and_test(&log_root_tree->log_writers)) {
/*
* Implicit memory barrier after atomic_dec_and_test
@@ -3153,6 +3171,30 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
}
/*
+ * Check if an inode was logged in the current transaction. We can't always rely
+ * on an inode's logged_trans value, because it's an in-memory only field and
+ * therefore not persisted. This means that its value is lost if the inode gets
+ * evicted and loaded again from disk (in which case it has a value of 0, and
+ * certainly it is smaller then any possible transaction ID), when that happens
+ * the full_sync flag is set in the inode's runtime flags, so on that case we
+ * assume eviction happened and ignore the logged_trans value, assuming the
+ * worst case, that the inode was logged before in the current transaction.
+ */
+static bool inode_logged(struct btrfs_trans_handle *trans,
+ struct btrfs_inode *inode)
+{
+ if (inode->logged_trans == trans->transid)
+ return true;
+
+ if (inode->last_trans == trans->transid &&
+ test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) &&
+ !test_bit(BTRFS_FS_LOG_RECOVERING, &trans->fs_info->flags))
+ return true;
+
+ return false;
+}
+
+/*
* If both a file and directory are logged, and unlinks or renames are
* mixed in, we have a few interesting corners:
*
@@ -3186,7 +3228,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
int bytes_del = 0;
u64 dir_ino = btrfs_ino(dir);
- if (dir->logged_trans < trans->transid)
+ if (!inode_logged(trans, dir))
return 0;
ret = join_running_log_trans(root);
@@ -3291,7 +3333,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
u64 index;
int ret;
- if (inode->logged_trans < trans->transid)
+ if (!inode_logged(trans, inode))
return 0;
ret = join_running_log_trans(root);
@@ -5098,7 +5140,7 @@ again:
BTRFS_I(other_inode),
LOG_OTHER_INODE, 0, LLONG_MAX,
ctx);
- iput(other_inode);
+ btrfs_add_delayed_iput(other_inode);
if (err)
goto out_unlock;
else
@@ -5266,9 +5308,19 @@ log_extents:
}
}
+ /*
+ * Don't update last_log_commit if we logged that an inode exists after
+ * it was loaded to memory (full_sync bit set).
+ * This is to prevent data loss when we do a write to the inode, then
+ * the inode gets evicted after all delalloc was flushed, then we log
+ * it exists (due to a rename for example) and then fsync it. This last
+ * fsync would do nothing (not logging the extents previously written).
+ */
spin_lock(&inode->lock);
inode->logged_trans = trans->transid;
- inode->last_log_commit = inode->last_sub_trans;
+ if (inode_only != LOG_INODE_EXISTS ||
+ !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ inode->last_log_commit = inode->last_sub_trans;
spin_unlock(&inode->lock);
out_unlock:
if (unlikely(err))
@@ -5505,7 +5557,7 @@ process_leaf:
}
if (btrfs_inode_in_log(BTRFS_I(di_inode), trans->transid)) {
- iput(di_inode);
+ btrfs_add_delayed_iput(di_inode);
break;
}
@@ -5517,7 +5569,7 @@ process_leaf:
if (!ret &&
btrfs_must_commit_transaction(trans, BTRFS_I(di_inode)))
ret = 1;
- iput(di_inode);
+ btrfs_add_delayed_iput(di_inode);
if (ret)
goto next_dir_inode;
if (ctx->log_new_dentries) {
@@ -5664,7 +5716,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans,
if (!ret && ctx && ctx->log_new_dentries)
ret = log_new_dir_dentries(trans, root,
BTRFS_I(dir_inode), ctx);
- iput(dir_inode);
+ btrfs_add_delayed_iput(dir_inode);
if (ret)
goto out;
}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 38ed8e259e00..358e930df4ac 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -4851,6 +4851,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
for (i = 0; i < map->num_stripes; i++) {
num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
+ map->stripes[i].dev->has_pending_chunks = true;
}
atomic64_sub(stripe_size * map->num_stripes, &info->free_chunk_space);
@@ -5018,8 +5019,7 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map)
if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
BTRFS_BLOCK_GROUP_RAID10 |
- BTRFS_BLOCK_GROUP_RAID5 |
- BTRFS_BLOCK_GROUP_DUP)) {
+ BTRFS_BLOCK_GROUP_RAID5)) {
max_errors = 1;
} else if (map->type & BTRFS_BLOCK_GROUP_RAID6) {
max_errors = 2;
@@ -7310,6 +7310,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_fs_info *fs_info,
for (i = 0; i < map->num_stripes; i++) {
dev = map->stripes[i].dev;
dev->commit_bytes_used = dev->bytes_used;
+ dev->has_pending_chunks = false;
}
}
mutex_unlock(&fs_info->chunk_mutex);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 76fb6e84f201..07b805d08e55 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -61,6 +61,11 @@ struct btrfs_device {
spinlock_t io_lock ____cacheline_aligned;
int running_pending;
+ /* When true means this device has pending chunk alloc in
+ * current transaction. Protected by chunk_mutex.
+ */
+ bool has_pending_chunks;
+
/* regular prio bios */
struct btrfs_pending_bios pending_bios;
/* sync bios */
@@ -312,7 +317,6 @@ struct btrfs_bio {
u64 map_type; /* get from map_lookup->type */
bio_end_io_t *end_io;
struct bio *orig_bio;
- unsigned long flags;
void *private;
atomic_t error;
int max_errors;