summaryrefslogtreecommitdiff
path: root/fs/btrfs
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs')
-rw-r--r--fs/btrfs/Kconfig2
-rw-r--r--fs/btrfs/async-thread.c14
-rw-r--r--fs/btrfs/backref.c21
-rw-r--r--fs/btrfs/block-group.c6
-rw-r--r--fs/btrfs/btrfs_inode.h15
-rw-r--r--fs/btrfs/compression.c2
-rw-r--r--fs/btrfs/ctree.c21
-rw-r--r--fs/btrfs/ctree.h13
-rw-r--r--fs/btrfs/delalloc-space.c2
-rw-r--r--fs/btrfs/delayed-inode.c21
-rw-r--r--fs/btrfs/disk-io.c68
-rw-r--r--fs/btrfs/extent-tree.c4
-rw-r--r--fs/btrfs/extent_io.c8
-rw-r--r--fs/btrfs/extent_io.h2
-rw-r--r--fs/btrfs/file.c17
-rw-r--r--fs/btrfs/free-space-cache.c2
-rw-r--r--fs/btrfs/inode.c152
-rw-r--r--fs/btrfs/ioctl.c6
-rw-r--r--fs/btrfs/qgroup.c377
-rw-r--r--fs/btrfs/qgroup.h5
-rw-r--r--fs/btrfs/root-tree.c3
-rw-r--r--fs/btrfs/send.c15
-rw-r--r--fs/btrfs/space-info.c5
-rw-r--r--fs/btrfs/transaction.c22
-rw-r--r--fs/btrfs/transaction.h17
-rw-r--r--fs/btrfs/tree-log.c270
-rw-r--r--fs/btrfs/tree-log.h14
-rw-r--r--fs/btrfs/volumes.c29
28 files changed, 702 insertions, 431 deletions
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 38651fae7f21..0aa1bee24d80 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -14,6 +14,8 @@ config BTRFS_FS
select RAID6_PQ
select XOR_BLOCKS
select SRCU
+ depends on !PPC_256K_PAGES # powerpc
+ depends on !PAGE_SIZE_256KB # hexagon
help
Btrfs is a general purpose copy-on-write filesystem with extents,
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 11be02459b87..eb592b92aa9c 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -237,6 +237,13 @@ static void run_ordered_work(struct __btrfs_workqueue *wq,
ordered_list);
if (!test_bit(WORK_DONE_BIT, &work->flags))
break;
+ /*
+ * Orders all subsequent loads after reading WORK_DONE_BIT,
+ * paired with the smp_mb__before_atomic in btrfs_work_helper
+ * this guarantees that the ordered function will see all
+ * updates from ordinary work function.
+ */
+ smp_rmb();
/*
* we are going to call the ordered done function, but
@@ -325,6 +332,13 @@ static void btrfs_work_helper(struct work_struct *normal_work)
thresh_exec_hook(wq);
work->func(work);
if (need_order) {
+ /*
+ * Ensures all memory accesses done in the work function are
+ * ordered before setting the WORK_DONE_BIT. Ensuring the thread
+ * which is going to executed the ordered work sees them.
+ * Pairs with the smp_rmb in run_ordered_work.
+ */
+ smp_mb__before_atomic();
set_bit(WORK_DONE_BIT, &work->flags);
run_ordered_work(wq, work);
}
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 7f644a58db51..c701a19fac53 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -1208,7 +1208,12 @@ again:
ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
- BUG_ON(ret == 0);
+ if (ret == 0) {
+ /* This shouldn't happen, indicates a bug or fs corruption. */
+ ASSERT(ret != 0);
+ ret = -EUCLEAN;
+ goto out;
+ }
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (trans && likely(trans->type != __TRANS_DUMMY) &&
@@ -1356,10 +1361,18 @@ again:
goto out;
if (!ret && extent_item_pos) {
/*
- * we've recorded that parent, so we must extend
- * its inode list here
+ * We've recorded that parent, so we must extend
+ * its inode list here.
+ *
+ * However if there was corruption we may not
+ * have found an eie, return an error in this
+ * case.
*/
- BUG_ON(!eie);
+ ASSERT(eie);
+ if (!eie) {
+ ret = -EUCLEAN;
+ goto out;
+ }
while (eie->next)
eie = eie->next;
eie->next = ref->inode_list;
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index a352c1704042..bcf19dfb0af3 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2388,7 +2388,6 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
struct btrfs_path *path = NULL;
LIST_HEAD(dirty);
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
int loops = 0;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -2455,7 +2454,6 @@ again:
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
/*
@@ -2556,7 +2554,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
int should_put;
struct btrfs_path *path;
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
path = btrfs_alloc_path();
if (!path)
@@ -2614,7 +2611,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
list_add_tail(&cache->io_list, io);
} else {
@@ -2637,7 +2633,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
* finished yet (no block group item in the extent tree
* yet, etc). If this is the case, wait for all free
* space endio workers to finish and retry. This is a
- * a very rare case so no need for a more efficient and
+ * very rare case so no need for a more efficient and
* complex approach.
*/
if (ret == -ENOENT) {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index f853835c409c..f3ff57b93158 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -268,6 +268,21 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
mod);
}
+/*
+ * Called every time after doing a buffered, direct IO or memory mapped write.
+ *
+ * This is to ensure that if we write to a file that was previously fsynced in
+ * the current transaction, then try to fsync it again in the same transaction,
+ * we will know that there were changes in the file and that it needs to be
+ * logged.
+ */
+static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
+{
+ spin_lock(&inode->lock);
+ inode->last_sub_trans = inode->root->log_transid;
+ spin_unlock(&inode->lock);
+}
+
static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
{
int ret = 0;
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b05b361e2062..28f78e4f2c87 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -273,7 +273,7 @@ static void end_compressed_bio_write(struct bio *bio)
cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0],
cb->start, cb->start + cb->len - 1,
- bio->bi_status == BLK_STS_OK);
+ !cb->errors);
cb->compressed_pages[0]->mapping = NULL;
end_compressed_writeback(inode, cb);
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ab69e3563b12..822c615840e8 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -2658,12 +2658,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct extent_buffer *b;
- int root_lock;
+ int root_lock = 0;
int level = 0;
- /* We try very hard to do read locks on the root */
- root_lock = BTRFS_READ_LOCK;
-
if (p->search_commit_root) {
/*
* The commit roots are read only so we always do read locks,
@@ -2701,6 +2698,9 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
goto out;
}
+ /* We try very hard to do read locks on the root */
+ root_lock = BTRFS_READ_LOCK;
+
/*
* If the level is set to maximum, we can skip trying to get the read
* lock.
@@ -2727,6 +2727,17 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
level = btrfs_header_level(b);
out:
+ /*
+ * The root may have failed to write out at some point, and thus is no
+ * longer valid, return an error in this case.
+ */
+ if (!extent_buffer_uptodate(b)) {
+ if (root_lock)
+ btrfs_tree_unlock_rw(b, root_lock);
+ free_extent_buffer(b);
+ return ERR_PTR(-EIO);
+ }
+
p->nodes[level] = b;
if (!p->skip_locking)
p->locks[level] = root_lock;
@@ -5232,7 +5243,7 @@ again:
slot--;
/*
* check this node pointer against the min_trans parameters.
- * If it is too old, old, skip to the next one.
+ * If it is too old, skip to the next one.
*/
while (slot < nritems) {
u64 gen;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7960359dbc70..cd77c0621a55 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -505,11 +505,6 @@ enum {
*/
BTRFS_FS_EXCL_OP,
/*
- * To info transaction_kthread we need an immediate commit so it
- * doesn't need to wait for commit_interval
- */
- BTRFS_FS_NEED_ASYNC_COMMIT,
- /*
* Indicate that balance has been set up from the ioctl and is in the
* main phase. The fs_info::balance_ctl is initialized.
* Set and cleared while holding fs_info::balance_mutex.
@@ -832,7 +827,10 @@ struct btrfs_fs_info {
*/
struct ulist *qgroup_ulist;
- /* protect user change for quota operations */
+ /*
+ * Protect user change for quota operations. If a transaction is needed,
+ * it must be started before locking this lock.
+ */
struct mutex qgroup_ioctl_lock;
/* list of dirty qgroups to be written at next commit */
@@ -945,6 +943,8 @@ enum {
BTRFS_ROOT_DEAD_TREE,
/* The root has a log tree. Used only for subvolume roots. */
BTRFS_ROOT_HAS_LOG_TREE,
+ /* Qgroup flushing is in progress */
+ BTRFS_ROOT_QGROUP_FLUSHING,
};
/*
@@ -1097,6 +1097,7 @@ struct btrfs_root {
spinlock_t qgroup_meta_rsv_lock;
u64 qgroup_meta_rsv_pertrans;
u64 qgroup_meta_rsv_prealloc;
+ wait_queue_head_t qgroup_flush_wait;
/* Number of active swapfiles */
atomic_t nr_swapfiles;
diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index db9f2c58eb4a..f4f531c4aa96 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -151,7 +151,7 @@ int btrfs_check_data_free_space(struct inode *inode,
return ret;
/* Use new btrfs_qgroup_reserve_data to reserve precious data space. */
- ret = btrfs_qgroup_reserve_data(inode, reserved, start, len);
+ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), reserved, start, len);
if (ret < 0)
btrfs_free_reserved_data_space_noquota(inode, start, len);
else
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 7dad8794ee38..e96890475bac 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -627,7 +627,8 @@ static int btrfs_delayed_inode_reserve_metadata(
*/
if (!src_rsv || (!trans->bytes_reserved &&
src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
- ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true);
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes,
+ BTRFS_QGROUP_RSV_META_PREALLOC, true);
if (ret < 0)
return ret;
ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
@@ -1033,12 +1034,10 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
nofs_flag = memalloc_nofs_save();
ret = btrfs_lookup_inode(trans, root, path, &key, mod);
memalloc_nofs_restore(nofs_flag);
- if (ret > 0) {
- btrfs_release_path(path);
- return -ENOENT;
- } else if (ret < 0) {
- return ret;
- }
+ if (ret > 0)
+ ret = -ENOENT;
+ if (ret < 0)
+ goto out;
leaf = path->nodes[0];
inode_item = btrfs_item_ptr(leaf, path->slots[0],
@@ -1076,6 +1075,14 @@ err_out:
btrfs_delayed_inode_release_metadata(fs_info, node, (ret < 0));
btrfs_release_delayed_inode(node);
+ /*
+ * If we fail to update the delayed inode we need to abort the
+ * transaction, because we could leave the inode with the improper
+ * counts behind.
+ */
+ if (ret && ret != -ENOENT)
+ btrfs_abort_transaction(trans, ret);
+
return ret;
search:
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e6aa94a583e9..f18c6d97932e 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1154,6 +1154,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
mutex_init(&root->log_mutex);
mutex_init(&root->ordered_extent_mutex);
mutex_init(&root->delalloc_mutex);
+ init_waitqueue_head(&root->qgroup_flush_wait);
init_waitqueue_head(&root->log_writer_wait);
init_waitqueue_head(&root->log_commit_wait[0]);
init_waitqueue_head(&root->log_commit_wait[1]);
@@ -1747,8 +1748,7 @@ static int transaction_kthread(void *arg)
}
now = ktime_get_seconds();
- if (cur->state < TRANS_STATE_BLOCKED &&
- !test_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags) &&
+ if (cur->state < TRANS_STATE_COMMIT_START &&
(now < cur->start_time ||
now - cur->start_time < fs_info->commit_interval)) {
spin_unlock(&fs_info->trans_lock);
@@ -2816,7 +2816,7 @@ int open_ctree(struct super_block *sb,
}
/*
- * Verify the type first, if that or the the checksum value are
+ * Verify the type first, if that or the checksum value are
* corrupted, we'll find out
*/
csum_type = btrfs_super_csum_type((struct btrfs_super_block *)bh->b_data);
@@ -2894,6 +2894,29 @@ int open_ctree(struct super_block *sb,
*/
fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+ /*
+ * Flag our filesystem as having big metadata blocks if they are bigger
+ * than the page size
+ */
+ if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
+ if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
+ btrfs_info(fs_info,
+ "flagging fs with big metadata feature");
+ features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
+ }
+
+ /* Set up fs_info before parsing mount options */
+ nodesize = btrfs_super_nodesize(disk_super);
+ sectorsize = btrfs_super_sectorsize(disk_super);
+ stripesize = sectorsize;
+ fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
+ fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
+
+ /* Cache block sizes */
+ fs_info->nodesize = nodesize;
+ fs_info->sectorsize = sectorsize;
+ fs_info->stripesize = stripesize;
+
ret = btrfs_parse_options(fs_info, options, sb->s_flags);
if (ret) {
err = ret;
@@ -2921,28 +2944,6 @@ int open_ctree(struct super_block *sb,
btrfs_info(fs_info, "has skinny extents");
/*
- * flag our filesystem as having big metadata blocks if
- * they are bigger than the page size
- */
- if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) {
- if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA))
- btrfs_info(fs_info,
- "flagging fs with big metadata feature");
- features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA;
- }
-
- nodesize = btrfs_super_nodesize(disk_super);
- sectorsize = btrfs_super_sectorsize(disk_super);
- stripesize = sectorsize;
- fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids));
- fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids));
-
- /* Cache block sizes */
- fs_info->nodesize = nodesize;
- fs_info->sectorsize = sectorsize;
- fs_info->stripesize = stripesize;
-
- /*
* mixed block groups end up with duplicate but slightly offset
* extent buffers for the same range. It leads to corruptions
*/
@@ -3144,7 +3145,8 @@ retry_root_backup:
goto fail_sysfs;
}
- if (!sb_rdonly(sb) && !btrfs_check_rw_degradable(fs_info, NULL)) {
+ if (!sb_rdonly(sb) && fs_info->fs_devices->missing_devices &&
+ !btrfs_check_rw_degradable(fs_info, NULL)) {
btrfs_warn(fs_info,
"writable mount is not allowed due to too many missing devices");
goto fail_sysfs;
@@ -3634,11 +3636,23 @@ static void btrfs_end_empty_barrier(struct bio *bio)
*/
static void write_dev_flush(struct btrfs_device *device)
{
- struct request_queue *q = bdev_get_queue(device->bdev);
struct bio *bio = device->flush_bio;
+#ifndef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+ /*
+ * When a disk has write caching disabled, we skip submission of a bio
+ * with flush and sync requests before writing the superblock, since
+ * it's not needed. However when the integrity checker is enabled, this
+ * results in reports that there are metadata blocks referred by a
+ * superblock that were not properly flushed. So don't skip the bio
+ * submission only when the integrity checker is enabled for the sake
+ * of simplicity, since this is a debug tool and not meant for use in
+ * non-debug builds.
+ */
+ struct request_queue *q = bdev_get_queue(device->bdev);
if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags))
return;
+#endif
bio_reset(bio);
bio->bi_end_io = btrfs_end_empty_barrier;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 832b40293907..19d2104c0462 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4596,6 +4596,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
out_free_delayed:
btrfs_free_delayed_extent_op(extent_op);
out_free_buf:
+ btrfs_tree_unlock(buf);
free_extent_buffer(buf);
out_free_reserved:
btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
@@ -5768,6 +5769,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
mutex_lock(&fs_info->fs_devices->device_list_mutex);
devices = &fs_info->fs_devices->devices;
list_for_each_entry(device, devices, dev_list) {
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ continue;
+
ret = btrfs_trim_free_extents(device, &group_trimmed);
if (ret) {
dev_failed++;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index eca3abc1a7cd..95ddeb477797 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3152,7 +3152,7 @@ static int __do_readpage(struct extent_io_tree *tree,
/*
* If we have a file range that points to a compressed extent
- * and it's followed by a consecutive file range that points to
+ * and it's followed by a consecutive file range that points
* to the same compressed extent (possibly with a different
* offset and/or length, so it either points to the whole extent
* or only part of it), we must make sure we do not submit a
@@ -3755,6 +3755,12 @@ static void set_btree_ioerr(struct page *page)
return;
/*
+ * A read may stumble upon this buffer later, make sure that it gets an
+ * error and knows there was an error.
+ */
+ clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags);
+
+ /*
* If we error out, we should add back the dirty_metadata_bytes
* to make it consistent.
*/
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index fcf1807cc8dd..c8def2bdf247 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -202,7 +202,7 @@ struct extent_buffer {
*/
struct extent_changeset {
/* How many bytes are set/cleared in this operation */
- unsigned int bytes_changed;
+ u64 bytes_changed;
/* Changed ranges */
struct ulist range_changed;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f6308a7b761d..1279359ed172 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2004,14 +2004,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
inode_unlock(inode);
- /*
- * We also have to set last_sub_trans to the current log transid,
- * otherwise subsequent syncs to a file that's been synced in this
- * transaction will appear to have already occurred.
- */
- spin_lock(&BTRFS_I(inode)->lock);
- BTRFS_I(inode)->last_sub_trans = root->log_transid;
- spin_unlock(&BTRFS_I(inode)->lock);
+ btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
+
if (num_written > 0)
num_written = generic_write_sync(iocb, num_written);
@@ -3149,7 +3143,7 @@ reserve_space:
&cached_state);
if (ret)
goto out;
- ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
+ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
alloc_start, bytes_to_reserve);
if (ret) {
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
@@ -3322,8 +3316,9 @@ static long btrfs_fallocate(struct file *file, int mode,
free_extent_map(em);
break;
}
- ret = btrfs_qgroup_reserve_data(inode, &data_reserved,
- cur_offset, last_byte - cur_offset);
+ ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
+ &data_reserved, cur_offset,
+ last_byte - cur_offset);
if (ret < 0) {
cur_offset = last_byte;
free_extent_map(em);
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 23f59d463e24..d2d32fed8f2e 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1339,7 +1339,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
/*
* at this point the pages are under IO and we're happy,
- * The caller is responsible for waiting on them and updating the
+ * The caller is responsible for waiting on them and updating
* the cache and the inode
*/
io_ctl->entries = entries;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 64dd702a5448..7755a0362a3a 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1200,11 +1200,6 @@ static noinline void async_cow_submit(struct btrfs_work *work)
nr_pages = (async_chunk->end - async_chunk->start + PAGE_SIZE) >>
PAGE_SHIFT;
- /* atomic_sub_return implies a barrier */
- if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
- 5 * SZ_1M)
- cond_wake_up_nomb(&fs_info->async_submit_wait);
-
/*
* ->inode could be NULL if async_chunk_start has failed to compress,
* in which case we don't have anything to submit, yet we need to
@@ -1213,6 +1208,11 @@ static noinline void async_cow_submit(struct btrfs_work *work)
*/
if (async_chunk->inode)
submit_compressed_extents(async_chunk);
+
+ /* atomic_sub_return implies a barrier */
+ if (atomic_sub_return(nr_pages, &fs_info->async_delalloc_pages) <
+ 5 * SZ_1M)
+ cond_wake_up_nomb(&fs_info->async_submit_wait);
}
static noinline void async_cow_free(struct btrfs_work *work)
@@ -6375,7 +6375,7 @@ static int btrfs_dirty_inode(struct inode *inode)
return PTR_ERR(trans);
ret = btrfs_update_inode(trans, root, inode);
- if (ret && ret == -ENOSPC) {
+ if (ret && (ret == -ENOSPC || ret == -EDQUOT)) {
/* whoops, lets try again with the full transaction */
btrfs_end_transaction(trans);
trans = btrfs_start_transaction(root, 1);
@@ -6992,7 +6992,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
drop_inode = 1;
} else {
struct dentry *parent = dentry->d_parent;
- int ret;
err = btrfs_update_inode(trans, root, inode);
if (err)
@@ -7007,12 +7006,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
goto fail;
}
d_instantiate(dentry, inode);
- ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent,
- true, NULL);
- if (ret == BTRFS_NEED_TRANS_COMMIT) {
- err = btrfs_commit_transaction(trans);
- trans = NULL;
- }
+ btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent);
}
fail:
@@ -9256,9 +9250,7 @@ again:
set_page_dirty(page);
SetPageUptodate(page);
- BTRFS_I(inode)->last_trans = fs_info->generation;
- BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
- BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
+ btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
@@ -9699,27 +9691,25 @@ static int btrfs_rename_exchange(struct inode *old_dir,
struct inode *new_inode = new_dentry->d_inode;
struct inode *old_inode = old_dentry->d_inode;
struct timespec64 ctime = current_time(old_inode);
- struct dentry *parent;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
u64 new_ino = btrfs_ino(BTRFS_I(new_inode));
u64 old_idx = 0;
u64 new_idx = 0;
int ret;
+ int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
- struct btrfs_log_ctx ctx_root;
- struct btrfs_log_ctx ctx_dest;
- bool sync_log_root = false;
- bool sync_log_dest = false;
- bool commit_transaction = false;
- /* we only allow rename subvolume link between subvolumes */
- if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
+ /*
+ * For non-subvolumes allow exchange only within one subvolume, in the
+ * same inode namespace. Two subvolumes (represented as directory) can
+ * be exchanged as they're a logical link and have a fixed inode number.
+ */
+ if (root != dest &&
+ (old_ino != BTRFS_FIRST_FREE_OBJECTID ||
+ new_ino != BTRFS_FIRST_FREE_OBJECTID))
return -EXDEV;
- btrfs_init_log_ctx(&ctx_root, old_inode);
- btrfs_init_log_ctx(&ctx_dest, new_inode);
-
/* close the race window with snapshot create/destroy ioctl */
if (old_ino == BTRFS_FIRST_FREE_OBJECTID ||
new_ino == BTRFS_FIRST_FREE_OBJECTID)
@@ -9861,30 +9851,14 @@ static int btrfs_rename_exchange(struct inode *old_dir,
BTRFS_I(new_inode)->dir_index = new_idx;
if (root_log_pinned) {
- parent = new_dentry->d_parent;
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
- BTRFS_I(old_dir), parent,
- false, &ctx_root);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log_root = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
+ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
+ new_dentry->d_parent);
btrfs_end_log_trans(root);
root_log_pinned = false;
}
if (dest_log_pinned) {
- if (!commit_transaction) {
- parent = old_dentry->d_parent;
- ret = btrfs_log_new_name(trans, BTRFS_I(new_inode),
- BTRFS_I(new_dir), parent,
- false, &ctx_dest);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log_dest = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
- }
+ btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir),
+ old_dentry->d_parent);
btrfs_end_log_trans(dest);
dest_log_pinned = false;
}
@@ -9917,46 +9891,13 @@ out_fail:
dest_log_pinned = false;
}
}
- if (!ret && sync_log_root && !commit_transaction) {
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root,
- &ctx_root);
- if (ret)
- commit_transaction = true;
- }
- if (!ret && sync_log_dest && !commit_transaction) {
- ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root,
- &ctx_dest);
- if (ret)
- commit_transaction = true;
- }
- if (commit_transaction) {
- /*
- * We may have set commit_transaction when logging the new name
- * in the destination root, in which case we left the source
- * root context in the list of log contextes. So make sure we
- * remove it to avoid invalid memory accesses, since the context
- * was allocated in our stack frame.
- */
- if (sync_log_root) {
- mutex_lock(&root->log_mutex);
- list_del_init(&ctx_root.list);
- mutex_unlock(&root->log_mutex);
- }
- ret = btrfs_commit_transaction(trans);
- } else {
- int ret2;
-
- ret2 = btrfs_end_transaction(trans);
- ret = ret ? ret : ret2;
- }
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
out_notrans:
if (new_ino == BTRFS_FIRST_FREE_OBJECTID ||
old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
- ASSERT(list_empty(&ctx_root.list));
- ASSERT(list_empty(&ctx_dest.list));
-
return ret;
}
@@ -10024,11 +9965,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode = d_inode(old_dentry);
u64 index = 0;
int ret;
+ int ret2;
u64 old_ino = btrfs_ino(BTRFS_I(old_inode));
bool log_pinned = false;
- struct btrfs_log_ctx ctx;
- bool sync_log = false;
- bool commit_transaction = false;
if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID)
return -EPERM;
@@ -10178,17 +10117,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
BTRFS_I(old_inode)->dir_index = index;
if (log_pinned) {
- struct dentry *parent = new_dentry->d_parent;
-
- btrfs_init_log_ctx(&ctx, old_inode);
- ret = btrfs_log_new_name(trans, BTRFS_I(old_inode),
- BTRFS_I(old_dir), parent,
- false, &ctx);
- if (ret == BTRFS_NEED_LOG_SYNC)
- sync_log = true;
- else if (ret == BTRFS_NEED_TRANS_COMMIT)
- commit_transaction = true;
- ret = 0;
+ btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir),
+ new_dentry->d_parent);
btrfs_end_log_trans(root);
log_pinned = false;
}
@@ -10225,23 +10155,8 @@ out_fail:
btrfs_end_log_trans(root);
log_pinned = false;
}
- if (!ret && sync_log) {
- ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx);
- if (ret)
- commit_transaction = true;
- } else if (sync_log) {
- mutex_lock(&root->log_mutex);
- list_del(&ctx.list);
- mutex_unlock(&root->log_mutex);
- }
- if (commit_transaction) {
- ret = btrfs_commit_transaction(trans);
- } else {
- int ret2;
-
- ret2 = btrfs_end_transaction(trans);
- ret = ret ? ret : ret2;
- }
+ ret2 = btrfs_end_transaction(trans);
+ ret = ret ? ret : ret2;
out_notrans:
if (old_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&fs_info->subvol_sem);
@@ -10893,9 +10808,19 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis,
struct btrfs_swap_info *bsi)
{
unsigned long nr_pages;
+ unsigned long max_pages;
u64 first_ppage, first_ppage_reported, next_ppage;
int ret;
+ /*
+ * Our swapfile may have had its size extended after the swap header was
+ * written. In that case activating the swapfile should not go beyond
+ * the max size set in the swap header.
+ */
+ if (bsi->nr_pages >= sis->max)
+ return 0;
+
+ max_pages = sis->max - bsi->nr_pages;
first_ppage = ALIGN(bsi->block_start, PAGE_SIZE) >> PAGE_SHIFT;
next_ppage = ALIGN_DOWN(bsi->block_start + bsi->block_len,
PAGE_SIZE) >> PAGE_SHIFT;
@@ -10903,6 +10828,7 @@ static int btrfs_add_swap_extent(struct swap_info_struct *sis,
if (first_ppage >= next_ppage)
return 0;
nr_pages = next_ppage - first_ppage;
+ nr_pages = min(nr_pages, max_pages);
first_ppage_reported = first_ppage;
if (bsi->start == 0)
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e9d3eb7f0e2b..675112aa998f 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -3027,10 +3027,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
inode_lock(inode);
err = btrfs_delete_subvolume(dir, dentry);
inode_unlock(inode);
- if (!err) {
- fsnotify_rmdir(dir, dentry);
- d_delete(dentry);
- }
+ if (!err)
+ d_delete_notify(dir, dentry);
out_dput:
dput(dentry);
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index cd8e81c02f63..5a3006c75d63 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -11,7 +11,6 @@
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/btrfs.h>
-#include <linux/sizes.h>
#include "ctree.h"
#include "transaction.h"
@@ -887,20 +886,44 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
struct btrfs_key found_key;
struct btrfs_qgroup *qgroup = NULL;
struct btrfs_trans_handle *trans = NULL;
+ struct ulist *ulist = NULL;
int ret = 0;
int slot;
+ /*
+ * We need to have subvol_sem write locked, to prevent races between
+ * concurrent tasks trying to enable quotas, because we will unlock
+ * and relock qgroup_ioctl_lock before setting fs_info->quota_root
+ * and before setting BTRFS_FS_QUOTA_ENABLED.
+ */
+ lockdep_assert_held_write(&fs_info->subvol_sem);
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (fs_info->quota_root)
goto out;
- fs_info->qgroup_ulist = ulist_alloc(GFP_KERNEL);
- if (!fs_info->qgroup_ulist) {
+ ulist = ulist_alloc(GFP_KERNEL);
+ if (!ulist) {
ret = -ENOMEM;
goto out;
}
/*
+ * Unlock qgroup_ioctl_lock before starting the transaction. This is to
+ * avoid lock acquisition inversion problems (reported by lockdep) between
+ * qgroup_ioctl_lock and the vfs freeze semaphores, acquired when we
+ * start a transaction.
+ * After we started the transaction lock qgroup_ioctl_lock again and
+ * check if someone else created the quota root in the meanwhile. If so,
+ * just return success and release the transaction handle.
+ *
+ * Also we don't need to worry about someone else calling
+ * btrfs_sysfs_add_qgroups() after we unlock and getting an error because
+ * that function returns 0 (success) when the sysfs entries already exist.
+ */
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+ /*
* 1 for quota root item
* 1 for BTRFS_QGROUP_STATUS item
*
@@ -909,12 +932,20 @@ int btrfs_quota_enable(struct btrfs_fs_info *fs_info)
* would be a lot of overkill.
*/
trans = btrfs_start_transaction(tree_root, 2);
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
trans = NULL;
goto out;
}
+ if (fs_info->quota_root)
+ goto out;
+
+ fs_info->qgroup_ulist = ulist;
+ ulist = NULL;
+
/*
* initially create the quota tree
*/
@@ -1012,8 +1043,19 @@ out_add_root:
goto out_free_path;
}
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ /*
+ * Commit the transaction while not holding qgroup_ioctl_lock, to avoid
+ * a deadlock with tasks concurrently doing other qgroup operations, such
+ * adding/removing qgroups or adding/deleting qgroup relations for example,
+ * because all qgroup operations first start or join a transaction and then
+ * lock the qgroup_ioctl_lock mutex.
+ * We are safe from a concurrent task trying to enable quotas, by calling
+ * this function, since we are serialized by fs_info->subvol_sem.
+ */
ret = btrfs_commit_transaction(trans);
trans = NULL;
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (ret)
goto out_free_path;
@@ -1047,10 +1089,13 @@ out:
if (ret) {
ulist_free(fs_info->qgroup_ulist);
fs_info->qgroup_ulist = NULL;
- if (trans)
- btrfs_end_transaction(trans);
}
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ if (ret && trans)
+ btrfs_end_transaction(trans);
+ else if (trans)
+ ret = btrfs_end_transaction(trans);
+ ulist_free(ulist);
return ret;
}
@@ -1060,24 +1105,55 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
struct btrfs_trans_handle *trans = NULL;
int ret = 0;
+ /*
+ * We need to have subvol_sem write locked, to prevent races between
+ * concurrent tasks trying to disable quotas, because we will unlock
+ * and relock qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
+ */
+ lockdep_assert_held_write(&fs_info->subvol_sem);
+
mutex_lock(&fs_info->qgroup_ioctl_lock);
if (!fs_info->quota_root)
goto out;
/*
+ * Unlock the qgroup_ioctl_lock mutex before waiting for the rescan worker to
+ * complete. Otherwise we can deadlock because btrfs_remove_qgroup() needs
+ * to lock that mutex while holding a transaction handle and the rescan
+ * worker needs to commit a transaction.
+ */
+ mutex_unlock(&fs_info->qgroup_ioctl_lock);
+
+ /*
+ * Request qgroup rescan worker to complete and wait for it. This wait
+ * must be done before transaction start for quota disable since it may
+ * deadlock with transaction by the qgroup rescan worker.
+ */
+ clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
+ btrfs_qgroup_wait_for_completion(fs_info, false);
+
+ /*
* 1 For the root item
*
* We should also reserve enough items for the quota tree deletion in
* btrfs_clean_quota_tree but this is not done.
+ *
+ * Also, we must always start a transaction without holding the mutex
+ * qgroup_ioctl_lock, see btrfs_quota_enable().
*/
trans = btrfs_start_transaction(fs_info->tree_root, 1);
+
+ mutex_lock(&fs_info->qgroup_ioctl_lock);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
+ trans = NULL;
+ set_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
goto out;
}
- clear_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags);
- btrfs_qgroup_wait_for_completion(fs_info, false);
+ if (!fs_info->quota_root)
+ goto out;
+
spin_lock(&fs_info->qgroup_lock);
quota_root = fs_info->quota_root;
fs_info->quota_root = NULL;
@@ -1089,13 +1165,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
ret = btrfs_clean_quota_tree(trans, quota_root);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto end_trans;
+ goto out;
}
ret = btrfs_del_root(trans, &quota_root->root_key);
if (ret) {
btrfs_abort_transaction(trans, ret);
- goto end_trans;
+ goto out;
}
list_del(&quota_root->dirty_list);
@@ -1109,10 +1185,13 @@ int btrfs_quota_disable(struct btrfs_fs_info *fs_info)
free_extent_buffer(quota_root->commit_root);
kfree(quota_root);
-end_trans:
- ret = btrfs_end_transaction(trans);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
+ if (ret && trans)
+ btrfs_end_transaction(trans);
+ else if (trans)
+ ret = btrfs_end_transaction(trans);
+
return ret;
}
@@ -2262,7 +2341,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info,
* Update qgroup rfer/excl counters.
* Rfer update is easy, codes can explain themselves.
*
- * Excl update is tricky, the update is split into 2 part.
+ * Excl update is tricky, the update is split into 2 parts.
* Part 1: Possible exclusive <-> sharing detect:
* | A | !A |
* -------------------------------------
@@ -2840,20 +2919,8 @@ out:
return ret;
}
-/*
- * Two limits to commit transaction in advance.
- *
- * For RATIO, it will be 1/RATIO of the remaining limit as threshold.
- * For SIZE, it will be in byte unit as threshold.
- */
-#define QGROUP_FREE_RATIO 32
-#define QGROUP_FREE_SIZE SZ_32M
-static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
- const struct btrfs_qgroup *qg, u64 num_bytes)
+static bool qgroup_check_limits(const struct btrfs_qgroup *qg, u64 num_bytes)
{
- u64 free;
- u64 threshold;
-
if ((qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_RFER) &&
qgroup_rsv_total(qg) + (s64)qg->rfer + num_bytes > qg->max_rfer)
return false;
@@ -2862,32 +2929,6 @@ static bool qgroup_check_limits(struct btrfs_fs_info *fs_info,
qgroup_rsv_total(qg) + (s64)qg->excl + num_bytes > qg->max_excl)
return false;
- /*
- * Even if we passed the check, it's better to check if reservation
- * for meta_pertrans is pushing us near limit.
- * If there is too much pertrans reservation or it's near the limit,
- * let's try commit transaction to free some, using transaction_kthread
- */
- if ((qg->lim_flags & (BTRFS_QGROUP_LIMIT_MAX_RFER |
- BTRFS_QGROUP_LIMIT_MAX_EXCL))) {
- if (qg->lim_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) {
- free = qg->max_excl - qgroup_rsv_total(qg) - qg->excl;
- threshold = min_t(u64, qg->max_excl / QGROUP_FREE_RATIO,
- QGROUP_FREE_SIZE);
- } else {
- free = qg->max_rfer - qgroup_rsv_total(qg) - qg->rfer;
- threshold = min_t(u64, qg->max_rfer / QGROUP_FREE_RATIO,
- QGROUP_FREE_SIZE);
- }
-
- /*
- * Use transaction_kthread to commit transaction, so we no
- * longer need to bother nested transaction nor lock context.
- */
- if (free < threshold)
- btrfs_commit_transaction_locksafe(fs_info);
- }
-
return true;
}
@@ -2937,7 +2978,7 @@ static int qgroup_reserve(struct btrfs_root *root, u64 num_bytes, bool enforce,
qg = unode_aux_to_qgroup(unode);
- if (enforce && !qgroup_check_limits(fs_info, qg, num_bytes)) {
+ if (enforce && !qgroup_check_limits(qg, num_bytes)) {
ret = -EDQUOT;
goto out;
}
@@ -3284,6 +3325,9 @@ qgroup_rescan_init(struct btrfs_fs_info *fs_info, u64 progress_objectid,
btrfs_warn(fs_info,
"qgroup rescan init failed, qgroup is not enabled");
ret = -EINVAL;
+ } else if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
+ /* Quota disable is in progress */
+ ret = -EBUSY;
}
if (ret) {
@@ -3411,28 +3455,150 @@ btrfs_qgroup_rescan_resume(struct btrfs_fs_info *fs_info)
}
}
+#define rbtree_iterate_from_safe(node, next, start) \
+ for (node = start; node && ({ next = rb_next(node); 1;}); node = next)
+
+static int qgroup_unreserve_range(struct btrfs_inode *inode,
+ struct extent_changeset *reserved, u64 start,
+ u64 len)
+{
+ struct rb_node *node;
+ struct rb_node *next;
+ struct ulist_node *entry = NULL;
+ int ret = 0;
+
+ node = reserved->range_changed.root.rb_node;
+ while (node) {
+ entry = rb_entry(node, struct ulist_node, rb_node);
+ if (entry->val < start)
+ node = node->rb_right;
+ else if (entry)
+ node = node->rb_left;
+ else
+ break;
+ }
+
+ /* Empty changeset */
+ if (!entry)
+ return 0;
+
+ if (entry->val > start && rb_prev(&entry->rb_node))
+ entry = rb_entry(rb_prev(&entry->rb_node), struct ulist_node,
+ rb_node);
+
+ rbtree_iterate_from_safe(node, next, &entry->rb_node) {
+ u64 entry_start;
+ u64 entry_end;
+ u64 entry_len;
+ int clear_ret;
+
+ entry = rb_entry(node, struct ulist_node, rb_node);
+ entry_start = entry->val;
+ entry_end = entry->aux;
+ entry_len = entry_end - entry_start + 1;
+
+ if (entry_start >= start + len)
+ break;
+ if (entry_start + entry_len <= start)
+ continue;
+ /*
+ * Now the entry is in [start, start + len), revert the
+ * EXTENT_QGROUP_RESERVED bit.
+ */
+ clear_ret = clear_extent_bits(&inode->io_tree, entry_start,
+ entry_end, EXTENT_QGROUP_RESERVED);
+ if (!ret && clear_ret < 0)
+ ret = clear_ret;
+
+ ulist_del(&reserved->range_changed, entry->val, entry->aux);
+ if (likely(reserved->bytes_changed >= entry_len)) {
+ reserved->bytes_changed -= entry_len;
+ } else {
+ WARN_ON(1);
+ reserved->bytes_changed = 0;
+ }
+ }
+
+ return ret;
+}
+
/*
- * Reserve qgroup space for range [start, start + len).
+ * Try to free some space for qgroup.
*
- * This function will either reserve space from related qgroups or doing
- * nothing if the range is already reserved.
+ * For qgroup, there are only 3 ways to free qgroup space:
+ * - Flush nodatacow write
+ * Any nodatacow write will free its reserved data space at run_delalloc_range().
+ * In theory, we should only flush nodatacow inodes, but it's not yet
+ * possible, so we need to flush the whole root.
*
- * Return 0 for successful reserve
- * Return <0 for error (including -EQUOT)
+ * - Wait for ordered extents
+ * When ordered extents are finished, their reserved metadata is finally
+ * converted to per_trans status, which can be freed by later commit
+ * transaction.
*
- * NOTE: this function may sleep for memory allocation.
- * if btrfs_qgroup_reserve_data() is called multiple times with
- * same @reserved, caller must ensure when error happens it's OK
- * to free *ALL* reserved space.
+ * - Commit transaction
+ * This would free the meta_per_trans space.
+ * In theory this shouldn't provide much space, but any more qgroup space
+ * is needed.
*/
-int btrfs_qgroup_reserve_data(struct inode *inode,
+static int try_flush_qgroup(struct btrfs_root *root)
+{
+ struct btrfs_trans_handle *trans;
+ int ret;
+ bool can_commit = true;
+
+ /*
+ * We don't want to run flush again and again, so if there is a running
+ * one, we won't try to start a new flush, but exit directly.
+ */
+ if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
+ wait_event(root->qgroup_flush_wait,
+ !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
+ return 0;
+ }
+
+ /*
+ * If current process holds a transaction, we shouldn't flush, as we
+ * assume all space reservation happens before a transaction handle is
+ * held.
+ *
+ * But there are cases like btrfs_delayed_item_reserve_metadata() where
+ * we try to reserve space with one transction handle already held.
+ * In that case we can't commit transaction, but at least try to end it
+ * and hope the started data writes can free some space.
+ */
+ if (current->journal_info &&
+ current->journal_info != BTRFS_SEND_TRANS_STUB)
+ can_commit = false;
+
+ ret = btrfs_start_delalloc_snapshot(root);
+ if (ret < 0)
+ goto out;
+ btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
+
+ trans = btrfs_join_transaction(root);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ goto out;
+ }
+
+ if (can_commit)
+ ret = btrfs_commit_transaction(trans);
+ else
+ ret = btrfs_end_transaction(trans);
+out:
+ clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
+ wake_up(&root->qgroup_flush_wait);
+ return ret;
+}
+
+static int qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved_ret, u64 start,
u64 len)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
- struct ulist_node *unode;
- struct ulist_iterator uiter;
+ struct btrfs_root *root = inode->root;
struct extent_changeset *reserved;
+ bool new_reserved = false;
u64 orig_reserved;
u64 to_reserve;
int ret;
@@ -3445,6 +3611,7 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
if (WARN_ON(!reserved_ret))
return -EINVAL;
if (!*reserved_ret) {
+ new_reserved = true;
*reserved_ret = extent_changeset_alloc();
if (!*reserved_ret)
return -ENOMEM;
@@ -3452,15 +3619,15 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
reserved = *reserved_ret;
/* Record already reserved space */
orig_reserved = reserved->bytes_changed;
- ret = set_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
+ ret = set_record_extent_bits(&inode->io_tree, start,
start + len -1, EXTENT_QGROUP_RESERVED, reserved);
/* Newly reserved space */
to_reserve = reserved->bytes_changed - orig_reserved;
- trace_btrfs_qgroup_reserve_data(inode, start, len,
+ trace_btrfs_qgroup_reserve_data(&inode->vfs_inode, start, len,
to_reserve, QGROUP_RESERVE);
if (ret < 0)
- goto cleanup;
+ goto out;
ret = qgroup_reserve(root, to_reserve, true, BTRFS_QGROUP_RSV_DATA);
if (ret < 0)
goto cleanup;
@@ -3468,23 +3635,49 @@ int btrfs_qgroup_reserve_data(struct inode *inode,
return ret;
cleanup:
- /* cleanup *ALL* already reserved ranges */
- ULIST_ITER_INIT(&uiter);
- while ((unode = ulist_next(&reserved->range_changed, &uiter)))
- clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val,
- unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL);
- /* Also free data bytes of already reserved one */
- btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid,
- orig_reserved, BTRFS_QGROUP_RSV_DATA);
- extent_changeset_release(reserved);
+ qgroup_unreserve_range(inode, reserved, start, len);
+out:
+ if (new_reserved) {
+ extent_changeset_release(reserved);
+ kfree(reserved);
+ *reserved_ret = NULL;
+ }
return ret;
}
+/*
+ * Reserve qgroup space for range [start, start + len).
+ *
+ * This function will either reserve space from related qgroups or do nothing
+ * if the range is already reserved.
+ *
+ * Return 0 for successful reservation
+ * Return <0 for error (including -EQUOT)
+ *
+ * NOTE: This function may sleep for memory allocation, dirty page flushing and
+ * commit transaction. So caller should not hold any dirty page locked.
+ */
+int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
+ struct extent_changeset **reserved_ret, u64 start,
+ u64 len)
+{
+ int ret;
+
+ ret = qgroup_reserve_data(inode, reserved_ret, start, len);
+ if (ret <= 0 && ret != -EDQUOT)
+ return ret;
+
+ ret = try_flush_qgroup(inode->root);
+ if (ret < 0)
+ return ret;
+ return qgroup_reserve_data(inode, reserved_ret, start, len);
+}
+
/* Free ranges specified by @reserved, normally in error path */
-static int qgroup_free_reserved_data(struct inode *inode,
+static int qgroup_free_reserved_data(struct btrfs_inode *inode,
struct extent_changeset *reserved, u64 start, u64 len)
{
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ struct btrfs_root *root = inode->root;
struct ulist_node *unode;
struct ulist_iterator uiter;
struct extent_changeset changeset;
@@ -3520,8 +3713,8 @@ static int qgroup_free_reserved_data(struct inode *inode,
* EXTENT_QGROUP_RESERVED, we won't double free.
* So not need to rush.
*/
- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree,
- free_start, free_start + free_len - 1,
+ ret = clear_record_extent_bits(&inode->io_tree, free_start,
+ free_start + free_len - 1,
EXTENT_QGROUP_RESERVED, &changeset);
if (ret < 0)
goto out;
@@ -3550,7 +3743,8 @@ static int __btrfs_qgroup_release_data(struct inode *inode,
/* In release case, we shouldn't have @reserved */
WARN_ON(!free && reserved);
if (free && reserved)
- return qgroup_free_reserved_data(inode, reserved, start, len);
+ return qgroup_free_reserved_data(BTRFS_I(inode), reserved,
+ start, len);
extent_changeset_init(&changeset);
ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, start,
start + len -1, EXTENT_QGROUP_RESERVED, &changeset);
@@ -3649,8 +3843,8 @@ static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes,
return num_bytes;
}
-int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
- enum btrfs_qgroup_rsv_type type, bool enforce)
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce)
{
struct btrfs_fs_info *fs_info = root->fs_info;
int ret;
@@ -3676,6 +3870,21 @@ int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
return ret;
}
+int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce)
+{
+ int ret;
+
+ ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
+ if (ret <= 0 && ret != -EDQUOT)
+ return ret;
+
+ ret = try_flush_qgroup(root);
+ if (ret < 0)
+ return ret;
+ return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
+}
+
void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h
index b0420c4f5d0e..0a2659685ad6 100644
--- a/fs/btrfs/qgroup.h
+++ b/fs/btrfs/qgroup.h
@@ -344,12 +344,13 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid,
#endif
/* New io_tree based accurate qgroup reserve API */
-int btrfs_qgroup_reserve_data(struct inode *inode,
+int btrfs_qgroup_reserve_data(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
int btrfs_qgroup_release_data(struct inode *inode, u64 start, u64 len);
int btrfs_qgroup_free_data(struct inode *inode,
struct extent_changeset *reserved, u64 start, u64 len);
-
+int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
+ enum btrfs_qgroup_rsv_type type, bool enforce);
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce);
/* Reserve metadata space for pertrans and prealloc type */
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index 612411c74550..0d07ebe511e7 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -371,7 +371,8 @@ int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
key.offset = ref_id;
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- BUG_ON(ret < 0);
+ if (ret < 0)
+ goto out;
if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index ecdefa7262d2..e258fc484cea 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -4087,6 +4087,17 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move)
if (ret < 0)
goto out;
} else {
+ /*
+ * If we previously orphanized a directory that
+ * collided with a new reference that we already
+ * processed, recompute the current path because
+ * that directory may be part of the path.
+ */
+ if (orphanized_dir) {
+ ret = refresh_ref_path(sctx, cur);
+ if (ret < 0)
+ goto out;
+ }
ret = send_unlink(sctx, cur->full_path);
if (ret < 0)
goto out;
@@ -4994,6 +5005,10 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len)
lock_page(page);
if (!PageUptodate(page)) {
unlock_page(page);
+ btrfs_err(fs_info,
+ "send: IO error at offset %llu for inode %llu root %llu",
+ page_offset(page), sctx->cur_ino,
+ sctx->send_root->root_key.objectid);
put_page(page);
ret = -EIO;
break;
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 90500b6c41fc..1cd39f6a9c3a 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -262,9 +262,10 @@ static void __btrfs_dump_space_info(struct btrfs_fs_info *fs_info,
{
lockdep_assert_held(&info->lock);
- btrfs_info(fs_info, "space_info %llu has %llu free, is %sfull",
+ /* The free space could be negative in case of overcommit */
+ btrfs_info(fs_info, "space_info %llu has %lld free, is %sfull",
info->flags,
- info->total_bytes - btrfs_space_info_used(info, true),
+ (s64)(info->total_bytes - btrfs_space_info_used(info, true)),
info->full ? "" : "not ");
btrfs_info(fs_info,
"space_info total=%llu, used=%llu, pinned=%llu, reserved=%llu, may_use=%llu, readonly=%llu",
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index aca6c467d776..e6cb95b81787 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -27,7 +27,6 @@
static const unsigned int btrfs_blocked_trans_types[TRANS_STATE_MAX] = {
[TRANS_STATE_RUNNING] = 0U,
- [TRANS_STATE_BLOCKED] = __TRANS_START,
[TRANS_STATE_COMMIT_START] = (__TRANS_START | __TRANS_ATTACH),
[TRANS_STATE_COMMIT_DOING] = (__TRANS_START |
__TRANS_ATTACH |
@@ -388,7 +387,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
static inline int is_transaction_blocked(struct btrfs_transaction *trans)
{
- return (trans->state >= TRANS_STATE_BLOCKED &&
+ return (trans->state >= TRANS_STATE_COMMIT_START &&
trans->state < TRANS_STATE_UNBLOCKED &&
!TRANS_ABORTED(trans));
}
@@ -580,7 +579,7 @@ again:
INIT_LIST_HEAD(&h->new_bgs);
smp_mb();
- if (cur_trans->state >= TRANS_STATE_BLOCKED &&
+ if (cur_trans->state >= TRANS_STATE_COMMIT_START &&
may_wait_transaction(fs_info, type)) {
current->journal_info = h;
btrfs_commit_transaction(h);
@@ -797,7 +796,7 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
struct btrfs_transaction *cur_trans = trans->transaction;
smp_mb();
- if (cur_trans->state >= TRANS_STATE_BLOCKED ||
+ if (cur_trans->state >= TRANS_STATE_COMMIT_START ||
cur_trans->delayed_refs.flushing)
return 1;
@@ -830,7 +829,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_fs_info *info = trans->fs_info;
struct btrfs_transaction *cur_trans = trans->transaction;
- int lock = (trans->type != TRANS_JOIN_NOLOCK);
int err = 0;
if (refcount_read(&trans->use_count) > 1) {
@@ -846,13 +844,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
btrfs_trans_release_chunk_metadata(trans);
- if (lock && READ_ONCE(cur_trans->state) == TRANS_STATE_BLOCKED) {
- if (throttle)
- return btrfs_commit_transaction(trans);
- else
- wake_up_process(info->transaction_kthread);
- }
-
if (trans->type & __TRANS_FREEZABLE)
sb_end_intwrite(info->sb);
@@ -1274,8 +1265,10 @@ int btrfs_defrag_root(struct btrfs_root *root)
while (1) {
trans = btrfs_start_transaction(root, 0);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ break;
+ }
ret = btrfs_defrag_leaves(trans, root);
@@ -2304,7 +2297,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
*/
cur_trans->state = TRANS_STATE_COMPLETED;
wake_up(&cur_trans->commit_wait);
- clear_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
spin_lock(&fs_info->trans_lock);
list_del_init(&cur_trans->list);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 7291a2a93075..cbede328bda5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -13,7 +13,6 @@
enum btrfs_trans_state {
TRANS_STATE_RUNNING,
- TRANS_STATE_BLOCKED,
TRANS_STATE_COMMIT_START,
TRANS_STATE_COMMIT_DOING,
TRANS_STATE_UNBLOCKED,
@@ -161,7 +160,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
spin_lock(&BTRFS_I(inode)->lock);
BTRFS_I(inode)->last_trans = trans->transaction->transid;
BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
- BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
+ BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans - 1;
spin_unlock(&BTRFS_I(inode)->lock);
}
@@ -208,20 +207,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root);
int btrfs_commit_transaction(struct btrfs_trans_handle *trans);
int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans,
int wait_for_unblock);
-
-/*
- * Try to commit transaction asynchronously, so this is safe to call
- * even holding a spinlock.
- *
- * It's done by informing transaction_kthread to commit transaction without
- * waiting for commit interval.
- */
-static inline void btrfs_commit_transaction_locksafe(
- struct btrfs_fs_info *fs_info)
-{
- set_bit(BTRFS_FS_NEED_ASYNC_COMMIT, &fs_info->flags);
- wake_up_process(fs_info->transaction_kthread);
-}
int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans);
int btrfs_should_end_transaction(struct btrfs_trans_handle *trans);
void btrfs_throttle(struct btrfs_fs_info *fs_info);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 4ff381c23cef..b7bfecfc2ea3 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -174,7 +174,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans,
atomic_inc(&root->log_batch);
atomic_inc(&root->log_writers);
- if (ctx) {
+ if (ctx && !ctx->logging_new_name) {
int index = root->log_transid % 2;
list_add_tail(&ctx->list, &root->log_ctxs[index]);
ctx->log_transid = root->log_transid;
@@ -719,7 +719,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
*/
ret = btrfs_lookup_data_extent(fs_info, ins.objectid,
ins.offset);
- if (ret == 0) {
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
btrfs_init_generic_ref(&ref,
BTRFS_ADD_DELAYED_REF,
ins.objectid, ins.offset, 0);
@@ -898,9 +900,11 @@ out:
}
/*
- * helper function to see if a given name and sequence number found
- * in an inode back reference are already in a directory and correctly
- * point to this inode
+ * See if a given name and sequence number found in an inode back reference are
+ * already in a directory and correctly point to this inode.
+ *
+ * Returns: < 0 on error, 0 if the directory entry does not exists and 1 if it
+ * exists.
*/
static noinline int inode_in_dir(struct btrfs_root *root,
struct btrfs_path *path,
@@ -909,29 +913,35 @@ static noinline int inode_in_dir(struct btrfs_root *root,
{
struct btrfs_dir_item *di;
struct btrfs_key location;
- int match = 0;
+ int ret = 0;
di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
index, name, name_len, 0);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ if (PTR_ERR(di) != -ENOENT)
+ ret = PTR_ERR(di);
+ goto out;
+ } else if (di) {
btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
if (location.objectid != objectid)
goto out;
- } else
+ } else {
goto out;
- btrfs_release_path(path);
+ }
+ btrfs_release_path(path);
di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
- if (di && !IS_ERR(di)) {
- btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
- if (location.objectid != objectid)
- goto out;
- } else
+ if (IS_ERR(di)) {
+ ret = PTR_ERR(di);
goto out;
- match = 1;
+ } else if (di) {
+ btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
+ if (location.objectid == objectid)
+ ret = 1;
+ }
out:
btrfs_release_path(path);
- return match;
+ return ret;
}
/*
@@ -1158,7 +1168,10 @@ next:
/* look for a conflicting sequence number */
di = btrfs_lookup_dir_index_item(trans, root, path, btrfs_ino(dir),
ref_index, name, namelen, 0);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ if (PTR_ERR(di) != -ENOENT)
+ return PTR_ERR(di);
+ } else if (di) {
ret = drop_one_dir_item(trans, root, path, dir, di);
if (ret)
return ret;
@@ -1168,7 +1181,9 @@ next:
/* look for a conflicting name */
di = btrfs_lookup_dir_item(trans, root, path, btrfs_ino(dir),
name, namelen, 0);
- if (di && !IS_ERR(di)) {
+ if (IS_ERR(di)) {
+ return PTR_ERR(di);
+ } else if (di) {
ret = drop_one_dir_item(trans, root, path, dir, di);
if (ret)
return ret;
@@ -1293,6 +1308,15 @@ again:
inode, name, namelen);
kfree(name);
iput(dir);
+ /*
+ * Whenever we need to check if a name exists or not, we
+ * check the subvolume tree. So after an unlink we must
+ * run delayed items, so that future checks for a name
+ * during log replay see that the name does not exists
+ * anymore.
+ */
+ if (!ret)
+ ret = btrfs_run_delayed_items(trans);
if (ret)
goto out;
goto again;
@@ -1493,10 +1517,12 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- /* if we already have a perfect match, we're done */
- if (!inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
- btrfs_ino(BTRFS_I(inode)), ref_index,
- name, namelen)) {
+ ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)),
+ btrfs_ino(BTRFS_I(inode)), ref_index,
+ name, namelen);
+ if (ret < 0) {
+ goto out;
+ } else if (ret == 0) {
/*
* look for a conflicting back reference in the
* metadata. if we find one we have to unlink that name
@@ -1542,6 +1568,15 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
*/
if (!ret && inode->i_nlink == 0)
inc_nlink(inode);
+ /*
+ * Whenever we need to check if a name exists or
+ * not, we check the subvolume tree. So after an
+ * unlink we must run delayed items, so that future
+ * checks for a name during log replay see that the
+ * name does not exists anymore.
+ */
+ if (!ret)
+ ret = btrfs_run_delayed_items(trans);
}
if (ret < 0)
goto out;
@@ -1554,6 +1589,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
btrfs_update_inode(trans, root, inode);
}
+ /* Else, ret == 1, we already have a perfect match, we're done. */
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
kfree(name);
@@ -1942,8 +1978,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
struct btrfs_key log_key;
struct inode *dir;
u8 log_type;
- int exists;
- int ret = 0;
+ bool exists;
+ int ret;
bool update_size = (key->type == BTRFS_DIR_INDEX_KEY);
bool name_added = false;
@@ -1963,12 +1999,12 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
name_len);
btrfs_dir_item_key_to_cpu(eb, di, &log_key);
- exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
- if (exists == 0)
- exists = 1;
- else
- exists = 0;
+ ret = btrfs_lookup_inode(trans, root, path, &log_key, 0);
btrfs_release_path(path);
+ if (ret < 0)
+ goto out;
+ exists = (ret == 0);
+ ret = 0;
if (key->type == BTRFS_DIR_ITEM_KEY) {
dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
@@ -1983,7 +2019,14 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
ret = -EINVAL;
goto out;
}
- if (IS_ERR_OR_NULL(dst_di)) {
+
+ if (dst_di == ERR_PTR(-ENOENT))
+ dst_di = NULL;
+
+ if (IS_ERR(dst_di)) {
+ ret = PTR_ERR(dst_di);
+ goto out;
+ } else if (!dst_di) {
/* we need a sequence number to insert, so we only
* do inserts for the BTRFS_DIR_INDEX_KEY types
*/
@@ -2465,7 +2508,9 @@ again:
else {
ret = find_dir_range(log, path, dirid, key_type,
&range_start, &range_end);
- if (ret != 0)
+ if (ret < 0)
+ goto out;
+ else if (ret > 0)
break;
}
@@ -4222,7 +4267,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans,
/*
* Log all prealloc extents beyond the inode's i_size to make sure we do not
- * lose them after doing a fast fsync and replaying the log. We scan the
+ * lose them after doing a full/fast fsync and replaying the log. We scan the
* subvolume's root instead of iterating the inode's extent map tree because
* otherwise we can log incorrect extent items based on extent map conversion.
* That can happen due to the fact that extent maps are merged when they
@@ -4923,7 +4968,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans,
* Check the inode's logged_trans only instead of
* btrfs_inode_in_log(). This is because the last_log_commit of
* the inode is not updated when we only log that it exists and
- * and it has the full sync bit set (see btrfs_log_inode()).
+ * it has the full sync bit set (see btrfs_log_inode()).
*/
if (BTRFS_I(inode)->logged_trans == trans->transid) {
spin_unlock(&BTRFS_I(inode)->lock);
@@ -5015,6 +5060,7 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans,
struct btrfs_log_ctx *ctx,
bool *need_log_inode_item)
{
+ const u64 i_size = i_size_read(&inode->vfs_inode);
struct btrfs_root *root = inode->root;
int ins_start_slot = 0;
int ins_nr = 0;
@@ -5035,13 +5081,21 @@ again:
if (min_key->type > max_key->type)
break;
- if (min_key->type == BTRFS_INODE_ITEM_KEY)
+ if (min_key->type == BTRFS_INODE_ITEM_KEY) {
*need_log_inode_item = false;
-
- if ((min_key->type == BTRFS_INODE_REF_KEY ||
- min_key->type == BTRFS_INODE_EXTREF_KEY) &&
- inode->generation == trans->transid &&
- !recursive_logging) {
+ } else if (min_key->type == BTRFS_EXTENT_DATA_KEY &&
+ min_key->offset >= i_size) {
+ /*
+ * Extents at and beyond eof are logged with
+ * btrfs_log_prealloc_extents().
+ * Only regular files have BTRFS_EXTENT_DATA_KEY keys,
+ * and no keys greater than that, so bail out.
+ */
+ break;
+ } else if ((min_key->type == BTRFS_INODE_REF_KEY ||
+ min_key->type == BTRFS_INODE_EXTREF_KEY) &&
+ inode->generation == trans->transid &&
+ !recursive_logging) {
u64 other_ino = 0;
u64 other_parent = 0;
@@ -5072,10 +5126,8 @@ again:
btrfs_release_path(path);
goto next_key;
}
- }
-
- /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */
- if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ } else if (min_key->type == BTRFS_XATTR_ITEM_KEY) {
+ /* Skip xattrs, logged later with btrfs_log_all_xattrs() */
if (ins_nr == 0)
goto next_slot;
ret = copy_items(trans, inode, dst_path, path,
@@ -5128,9 +5180,21 @@ next_key:
break;
}
}
- if (ins_nr)
+ if (ins_nr) {
ret = copy_items(trans, inode, dst_path, path, ins_start_slot,
ins_nr, inode_only, logged_isize);
+ if (ret)
+ return ret;
+ }
+
+ if (inode_only == LOG_INODE_ALL && S_ISREG(inode->vfs_inode.i_mode)) {
+ /*
+ * Release the path because otherwise we might attempt to double
+ * lock the same leaf with btrfs_log_prealloc_extents() below.
+ */
+ btrfs_release_path(path);
+ ret = btrfs_log_prealloc_extents(trans, inode, dst_path);
+ }
return ret;
}
@@ -5231,6 +5295,18 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
}
/*
+ * For symlinks, we must always log their content, which is stored in an
+ * inline extent, otherwise we could end up with an empty symlink after
+ * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
+ * one attempts to create an empty symlink).
+ * We don't need to worry about flushing delalloc, because when we create
+ * the inline extent when the symlink is created (we never have delalloc
+ * for symlinks).
+ */
+ if (S_ISLNK(inode->vfs_inode.i_mode))
+ inode_only = LOG_INODE_ALL;
+
+ /*
* a brute force approach to making sure we get the most uptodate
* copies of everything.
*/
@@ -5379,19 +5455,34 @@ log_extents:
}
/*
- * Don't update last_log_commit if we logged that an inode exists after
- * it was loaded to memory (full_sync bit set).
- * This is to prevent data loss when we do a write to the inode, then
- * the inode gets evicted after all delalloc was flushed, then we log
- * it exists (due to a rename for example) and then fsync it. This last
- * fsync would do nothing (not logging the extents previously written).
+ * If we are logging that an ancestor inode exists as part of logging a
+ * new name from a link or rename operation, don't mark the inode as
+ * logged - otherwise if an explicit fsync is made against an ancestor,
+ * the fsync considers the inode in the log and doesn't sync the log,
+ * resulting in the ancestor missing after a power failure unless the
+ * log was synced as part of an fsync against any other unrelated inode.
+ * So keep it simple for this case and just don't flag the ancestors as
+ * logged.
*/
- spin_lock(&inode->lock);
- inode->logged_trans = trans->transid;
- if (inode_only != LOG_INODE_EXISTS ||
- !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
- inode->last_log_commit = inode->last_sub_trans;
- spin_unlock(&inode->lock);
+ if (!ctx ||
+ !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name &&
+ &inode->vfs_inode != ctx->inode)) {
+ spin_lock(&inode->lock);
+ inode->logged_trans = trans->transid;
+ /*
+ * Don't update last_log_commit if we logged that an inode exists
+ * after it was loaded to memory (full_sync bit set).
+ * This is to prevent data loss when we do a write to the inode,
+ * then the inode gets evicted after all delalloc was flushed,
+ * then we log it exists (due to a rename for example) and then
+ * fsync it. This last fsync would do nothing (not logging the
+ * extents previously written).
+ */
+ if (inode_only != LOG_INODE_EXISTS ||
+ !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
+ inode->last_log_commit = inode->last_sub_trans;
+ spin_unlock(&inode->lock);
+ }
out_unlock:
mutex_unlock(&inode->log_mutex);
@@ -5628,7 +5719,7 @@ process_leaf:
}
ctx->log_new_dentries = false;
- if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
+ if (type == BTRFS_FT_DIR)
log_mode = LOG_INODE_ALL;
ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
log_mode, 0, LLONG_MAX, ctx);
@@ -6327,6 +6418,7 @@ next:
error:
if (wc.trans)
btrfs_end_transaction(wc.trans);
+ clear_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags);
btrfs_free_path(path);
return ret;
}
@@ -6416,26 +6508,12 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
/*
* Call this after adding a new name for a file and it will properly
* update the log to reflect the new name.
- *
- * @ctx can not be NULL when @sync_log is false, and should be NULL when it's
- * true (because it's not used).
- *
- * Return value depends on whether @sync_log is true or false.
- * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT
- * otherwise.
- * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to
- * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log,
- * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be
- * committed (without attempting to sync the log).
*/
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent,
- bool sync_log, struct btrfs_log_ctx *ctx)
+ struct dentry *parent)
{
- struct btrfs_fs_info *fs_info = trans->fs_info;
- int ret;
+ struct btrfs_log_ctx ctx;
/*
* this will force the logging code to walk the dentry chain
@@ -6448,36 +6526,20 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
* if this inode hasn't been logged and directory we're renaming it
* from hasn't been logged, we don't need to log it
*/
- if (inode->logged_trans <= fs_info->last_trans_committed &&
- (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed))
- return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT :
- BTRFS_DONT_NEED_LOG_SYNC;
-
- if (sync_log) {
- struct btrfs_log_ctx ctx2;
-
- btrfs_init_log_ctx(&ctx2, &inode->vfs_inode);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, &ctx2);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
-
- ret = btrfs_sync_log(trans, inode->root, &ctx2);
- if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
- return BTRFS_DONT_NEED_TRANS_COMMIT;
- }
-
- ASSERT(ctx);
- ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
- LOG_INODE_EXISTS, ctx);
- if (ret == BTRFS_NO_LOG_SYNC)
- return BTRFS_DONT_NEED_LOG_SYNC;
- else if (ret)
- return BTRFS_NEED_TRANS_COMMIT;
+ if (!inode_logged(trans, inode) &&
+ (!old_dir || !inode_logged(trans, old_dir)))
+ return;
- return BTRFS_NEED_LOG_SYNC;
+ btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
+ ctx.logging_new_name = true;
+ /*
+ * We don't care about the return value. If we fail to log the new name
+ * then we know the next attempt to sync the log will fallback to a full
+ * transaction commit (due to a call to btrfs_set_log_full_commit()), so
+ * we don't need to worry about getting a log committed that has an
+ * inconsistent state after a rename operation.
+ */
+ btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX,
+ LOG_INODE_EXISTS, &ctx);
}
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 132e43d29034..ddfc6789d9bf 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -16,6 +16,7 @@ struct btrfs_log_ctx {
int log_ret;
int log_transid;
bool log_new_dentries;
+ bool logging_new_name;
struct inode *inode;
struct list_head list;
};
@@ -26,6 +27,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx,
ctx->log_ret = 0;
ctx->log_transid = 0;
ctx->log_new_dentries = false;
+ ctx->logging_new_name = false;
ctx->inode = inode;
INIT_LIST_HEAD(&ctx->list);
}
@@ -67,16 +69,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
int for_rename);
void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir);
-/* Return values for btrfs_log_new_name() */
-enum {
- BTRFS_DONT_NEED_TRANS_COMMIT,
- BTRFS_NEED_TRANS_COMMIT,
- BTRFS_DONT_NEED_LOG_SYNC,
- BTRFS_NEED_LOG_SYNC,
-};
-int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+void btrfs_log_new_name(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, struct btrfs_inode *old_dir,
- struct dentry *parent,
- bool sync_log, struct btrfs_log_ctx *ctx);
+ struct dentry *parent);
#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index db3b17bca11f..8898682c9103 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -742,6 +742,8 @@ static int btrfs_free_stale_devices(const char *path,
struct btrfs_device *device, *tmp_device;
int ret = 0;
+ lockdep_assert_held(&uuid_mutex);
+
if (path)
ret = -ENOENT;
@@ -1181,11 +1183,12 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
struct btrfs_device *orig_dev;
int ret = 0;
+ lockdep_assert_held(&uuid_mutex);
+
fs_devices = alloc_fs_devices(orig->fsid, NULL);
if (IS_ERR(fs_devices))
return fs_devices;
- mutex_lock(&orig->device_list_mutex);
fs_devices->total_devices = orig->total_devices;
list_for_each_entry(orig_dev, &orig->devices, dev_list) {
@@ -1217,10 +1220,8 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
device->fs_devices = fs_devices;
fs_devices->num_devices++;
}
- mutex_unlock(&orig->device_list_mutex);
return fs_devices;
error:
- mutex_unlock(&orig->device_list_mutex);
free_fs_devices(fs_devices);
return ERR_PTR(ret);
}
@@ -1266,6 +1267,7 @@ again:
if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
list_del_init(&device->dev_alloc_list);
clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
+ fs_devices->rw_devices--;
}
list_del_init(&device->dev_list);
fs_devices->num_devices--;
@@ -1310,8 +1312,13 @@ static void btrfs_close_one_device(struct btrfs_device *device)
fs_devices->rw_devices--;
}
- if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+ if (device->devid == BTRFS_DEV_REPLACE_DEVID)
+ clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
+
+ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
+ clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
fs_devices->missing_devices--;
+ }
btrfs_close_bdev(device);
@@ -2155,8 +2162,11 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
u64 num_devices;
int ret = 0;
- mutex_lock(&uuid_mutex);
-
+ /*
+ * The device list in fs_devices is accessed without locks (neither
+ * uuid_mutex nor device_list_mutex) as it won't change on a mounted
+ * filesystem and another device rm cannot run.
+ */
num_devices = btrfs_num_devices(fs_info);
ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
@@ -2167,7 +2177,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
if (IS_ERR(device)) {
if (PTR_ERR(device) == -ENOENT &&
- strcmp(device_path, "missing") == 0)
+ device_path && strcmp(device_path, "missing") == 0)
ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
else
ret = PTR_ERR(device);
@@ -2200,11 +2210,9 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
mutex_unlock(&fs_info->chunk_mutex);
}
- mutex_unlock(&uuid_mutex);
ret = btrfs_shrink_device(device, 0);
if (!ret)
btrfs_reada_remove_dev(device);
- mutex_lock(&uuid_mutex);
if (ret)
goto error_undo;
@@ -2286,7 +2294,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
}
out:
- mutex_unlock(&uuid_mutex);
return ret;
error_undo:
@@ -4313,10 +4320,12 @@ static int balance_kthread(void *data)
struct btrfs_fs_info *fs_info = data;
int ret = 0;
+ sb_start_write(fs_info->sb);
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl)
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
mutex_unlock(&fs_info->balance_mutex);
+ sb_end_write(fs_info->sb);
return ret;
}