From c46effa601f869f3d20a7386a745d9c002838eb8 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 14 Oct 2013 12:59:45 +0800 Subject: Btrfs: introduce a head ref rbtree The way how we process delayed refs is 1) get a bunch of head refs, 2) pick up one head ref, 3) go one node back for any delayed ref updates. The head ref is also linked in the same rbtree as the delayed ref is, so in 1) stage, we have to walk one by one including not only head refs, but delayed refs. When we have a great number of delayed refs pending to process, this'll cost time a lot. Here we introduce a head ref specific rbtree, it only has head refs, so troubles go away. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9c01509dd8ab..d15b4fc07554 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2438,6 +2438,10 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, ref->in_tree = 0; rb_erase(&ref->rb_node, &delayed_refs->root); + if (btrfs_delayed_ref_is_head(ref)) { + rb_erase(&locked_ref->href_node, + &delayed_refs->href_root); + } delayed_refs->num_entries--; if (!btrfs_delayed_ref_is_head(ref)) { /* @@ -2640,7 +2644,7 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, { struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; + struct btrfs_delayed_ref_head *head; struct list_head cluster; int ret; u64 delayed_start; @@ -2770,18 +2774,18 @@ again: spin_lock(&delayed_refs->lock); } - node = rb_first(&delayed_refs->root); + node = rb_first(&delayed_refs->href_root); if (!node) goto out; count = (unsigned long)-1; while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - if (btrfs_delayed_ref_is_head(ref)) { - struct btrfs_delayed_ref_head *head; + head = rb_entry(node, struct btrfs_delayed_ref_head, + href_node); + if (btrfs_delayed_ref_is_head(&head->node)) { + struct btrfs_delayed_ref_node *ref; - head = btrfs_delayed_node_to_head(ref); + ref = &head->node; atomic_inc(&ref->refs); spin_unlock(&delayed_refs->lock); @@ -2795,6 +2799,8 @@ again: btrfs_put_delayed_ref(ref); cond_resched(); goto again; + } else { + WARN_ON(1); } node = rb_next(node); } @@ -5956,6 +5962,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, */ head->node.in_tree = 0; rb_erase(&head->node.rb_node, &delayed_refs->root); + rb_erase(&head->href_node, &delayed_refs->href_root); delayed_refs->num_entries--; -- cgit v1.2.3 From 6ab0a2029ceaedb78af807871820708b7353e3be Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 1 Nov 2013 13:07:04 -0400 Subject: btrfs: publish allocation data in sysfs While trying to debug ENOSPC issues, it's helpful to understand what the kernel's view of the available space is. We export this information via ioctl, but sysfs files are more easily used. Signed-off-by: Jeff Mahoney Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 77 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d15b4fc07554..e094d02ea76a 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -35,6 +35,7 @@ #include "locking.h" #include "free-space-cache.h" #include "math.h" +#include "sysfs.h" #undef SCRAMBLE_DELAYED_REFS @@ -3408,6 +3409,23 @@ int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) return readonly; } +static const char *alloc_name(u64 flags) +{ + switch (flags) { + case BTRFS_BLOCK_GROUP_METADATA|BTRFS_BLOCK_GROUP_DATA: + return "mixed"; + case BTRFS_BLOCK_GROUP_METADATA: + return "metadata"; + case BTRFS_BLOCK_GROUP_DATA: + return "data"; + case BTRFS_BLOCK_GROUP_SYSTEM: + return "system"; + default: + WARN_ON(1); + return "invalid-combination"; + }; +} + static int update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, struct btrfs_space_info **space_info) @@ -3463,11 +3481,21 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->chunk_alloc = 0; found->flush = 0; init_waitqueue_head(&found->wait); + + ret = kobject_init_and_add(&found->kobj, &space_info_ktype, + info->space_info_kobj, "%s", + alloc_name(found->flags)); + if (ret) { + kfree(found); + return ret; + } + *space_info = found; list_add_rcu(&found->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) info->data_sinfo = found; - return 0; + + return ret; } static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) @@ -6152,11 +6180,29 @@ int __get_raid_index(u64 flags) return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ } -static int get_block_group_index(struct btrfs_block_group_cache *cache) +int get_block_group_index(struct btrfs_block_group_cache *cache) { return __get_raid_index(cache->flags); } +static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { + [BTRFS_RAID_RAID10] = "raid10", + [BTRFS_RAID_RAID1] = "raid1", + [BTRFS_RAID_DUP] = "dup", + [BTRFS_RAID_RAID0] = "raid0", + [BTRFS_RAID_SINGLE] = "single", + [BTRFS_RAID_RAID5] = "raid5", + [BTRFS_RAID_RAID6] = "raid6", +}; + +const char *get_raid_name(enum btrfs_raid_types type) +{ + if (type >= BTRFS_NR_RAID_TYPES) + return NULL; + + return btrfs_raid_type_names[type]; +} + enum btrfs_loop_type { LOOP_CACHING_NOWAIT = 0, LOOP_CACHING_WAIT = 1, @@ -8340,6 +8386,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) release_global_block_rsv(info); while (!list_empty(&info->space_info)) { + int i; + space_info = list_entry(info->space_info.next, struct btrfs_space_info, list); @@ -8350,9 +8398,17 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) dump_space_info(space_info, 0, 0); } } - percpu_counter_destroy(&space_info->total_bytes_pinned); list_del(&space_info->list); - kfree(space_info); + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + struct kobject *kobj; + kobj = &space_info->block_group_kobjs[i]; + if (kobj->parent) { + kobject_del(kobj); + kobject_put(kobj); + } + } + kobject_del(&space_info->kobj); + kobject_put(&space_info->kobj); } return 0; } @@ -8363,6 +8419,19 @@ static void __link_block_group(struct btrfs_space_info *space_info, int index = get_block_group_index(cache); down_write(&space_info->groups_sem); + if (list_empty(&space_info->block_groups[index])) { + struct kobject *kobj = &space_info->block_group_kobjs[index]; + int ret; + + kobject_get(&space_info->kobj); /* put in release */ + ret = kobject_init_and_add(kobj, &btrfs_raid_ktype, + &space_info->kobj, + get_raid_name(index)); + if (ret) { + pr_warn("btrfs: failed to add kobject for block cache. ignoring.\n"); + kobject_put(&space_info->kobj); + } + } list_add_tail(&cache->list, &space_info->block_groups[index]); up_write(&space_info->groups_sem); } @@ -8803,8 +8872,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, * are still on the list after taking the semaphore */ list_del_init(&block_group->list); - if (list_empty(&block_group->space_info->block_groups[index])) + if (list_empty(&block_group->space_info->block_groups[index])) { + kobject_del(&block_group->space_info->block_group_kobjs[index]); + kobject_put(&block_group->space_info->block_group_kobjs[index]); clear_avail_alloc_bits(root->fs_info, block_group->flags); + } up_write(&block_group->space_info->groups_sem); if (block_group->cached == BTRFS_CACHE_STARTED) -- cgit v1.2.3 From 4b447bfac6c6b367a594a22cb220152f835fc0ed Mon Sep 17 00:00:00 2001 From: Valentina Giusti Date: Mon, 4 Nov 2013 22:34:24 +0100 Subject: btrfs: remove unused variable from find_free_extent The variable found_uncached_bg in find_free_extent is not used since commit 285ff5af6ce358e73f53b55c9efadd4335f4c2ff (Btrfs: remove the ideal caching code) Signed-off-by: Valentina Giusti Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e094d02ea76a..fe651f4d1b93 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6239,7 +6239,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, int index = __get_raid_index(flags); int alloc_type = (flags & BTRFS_BLOCK_GROUP_DATA) ? RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; - bool found_uncached_bg = false; bool failed_cluster_refill = false; bool failed_alloc = false; bool use_cluster = true; @@ -6357,7 +6356,6 @@ search: have_block_group: cached = block_group_cache_done(block_group); if (unlikely(!cached)) { - found_uncached_bg = true; ret = cache_block_group(block_group, 0); BUG_ON(ret < 0); ret = 0; -- cgit v1.2.3 From 1b8e5df6d9b676f6d31fb098ffdc7d18732729d7 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Wed, 20 Nov 2013 16:50:23 -0500 Subject: btrfs: fix static checker warnings This patch fixes the following warnings: fs/btrfs/extent-tree.c:6201:12: sparse: symbol 'get_raid_name' was not declared. Should it be static? fs/btrfs/extent-tree.c:8430:9: error: format not a string literal and no format arguments [-Werror=format-security] get_raid_name(index)); Signed-off-by: Jeff Mahoney Reviewed-by: Kees Cook Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index fe651f4d1b93..f08f6dda949f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6195,7 +6195,7 @@ static const char *btrfs_raid_type_names[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID6] = "raid6", }; -const char *get_raid_name(enum btrfs_raid_types type) +static const char *get_raid_name(enum btrfs_raid_types type) { if (type >= BTRFS_NR_RAID_TYPES) return NULL; @@ -8423,7 +8423,7 @@ static void __link_block_group(struct btrfs_space_info *space_info, kobject_get(&space_info->kobj); /* put in release */ ret = kobject_init_and_add(kobj, &btrfs_raid_ktype, - &space_info->kobj, + &space_info->kobj, "%s", get_raid_name(index)); if (ret) { pr_warn("btrfs: failed to add kobject for block cache. ignoring.\n"); -- cgit v1.2.3 From 536cd96401dcb986f681bac385b5146b45a44e66 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 17 Dec 2013 12:01:12 +0800 Subject: Btrfs: fix double initialization of the raid kobject We met the following oops when doing space balance: kobject (ffff88081b590278): tried to init an initialized object, something is seriously wrong. ... Call Trace: [] dump_stack+0x49/0x5f [] kobject_init+0x89/0xa0 [] kobject_init_and_add+0x2a/0x70 [] ? clear_extent_bit+0x199/0x470 [btrfs] [] __link_block_group+0xfc/0x120 [btrfs] [] btrfs_make_block_group+0x24b/0x370 [btrfs] [] __btrfs_alloc_chunk+0x54b/0x7e0 [btrfs] [] btrfs_alloc_chunk+0x3f/0x50 [btrfs] [] do_chunk_alloc+0x363/0x440 [btrfs] [] btrfs_check_data_free_space+0x104/0x310 [btrfs] [] btrfs_write_dirty_block_groups+0x48d/0x600 [btrfs] [] commit_cowonly_roots+0x184/0x250 [btrfs] ... Steps to reproduce: # mkfs.btrfs -f # mount -o nospace_cache # btrfs balance start # dd if=/dev/zero of=/tmpfile bs=1M count=1 The reason of this problem is that we initialized the raid kobject when we added a block group into a empty raid list. As we know, when we mounted a btrfs filesystem, the raid list was empty, we would initialize the raid kobject when we added the first block group. But if there was not data stored in the block group, the block group would be freed when doing balance, and the raid list would be empty. And then if we allocated a new block group and added it into the raid list, we would initialize the raid kobject again, the oops happened. Fix this problem by initializing the raid kobject just when mounting the fs. Signed-off-by: Miao Xie Reported-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index f08f6dda949f..9e524b0a7b3c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3463,8 +3463,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, return ret; } - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { INIT_LIST_HEAD(&found->block_groups[i]); + kobject_init(&found->block_group_kobjs[i], &btrfs_raid_ktype); + } init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK; @@ -8422,9 +8424,8 @@ static void __link_block_group(struct btrfs_space_info *space_info, int ret; kobject_get(&space_info->kobj); /* put in release */ - ret = kobject_init_and_add(kobj, &btrfs_raid_ktype, - &space_info->kobj, "%s", - get_raid_name(index)); + ret = kobject_add(kobj, &space_info->kobj, "%s", + get_raid_name(index)); if (ret) { pr_warn("btrfs: failed to add kobject for block cache. ignoring.\n"); kobject_put(&space_info->kobj); -- cgit v1.2.3 From efe120a067c8674a8ae21b194f0e68f098b61ee2 Mon Sep 17 00:00:00 2001 From: Frank Holton Date: Fri, 20 Dec 2013 11:37:06 -0500 Subject: Btrfs: convert printk to btrfs_ and fix BTRFS prefix Convert all applicable cases of printk and pr_* to the btrfs_* macros. Fix all uses of the BTRFS prefix. Signed-off-by: Frank Holton Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9e524b0a7b3c..1c82bead2c08 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6637,12 +6637,12 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, int index = 0; spin_lock(&info->lock); - printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n", + printk(KERN_INFO "BTRFS: space_info %llu has %llu free, is %sfull\n", info->flags, info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved - info->bytes_readonly, (info->full) ? "" : "not "); - printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, " + printk(KERN_INFO "BTRFS: space_info total=%llu, used=%llu, pinned=%llu, " "reserved=%llu, may_use=%llu, readonly=%llu\n", info->total_bytes, info->bytes_used, info->bytes_pinned, info->bytes_reserved, info->bytes_may_use, @@ -6656,7 +6656,9 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes, again: list_for_each_entry(cache, &info->block_groups[index], list) { spin_lock(&cache->lock); - printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n", + printk(KERN_INFO "BTRFS: " + "block group %llu has %llu bytes, " + "%llu used %llu pinned %llu reserved %s\n", cache->key.objectid, cache->key.offset, btrfs_block_group_used(&cache->item), cache->pinned, cache->reserved, cache->ro ? "[readonly]" : ""); @@ -7019,7 +7021,7 @@ again: /*DEFAULT_RATELIMIT_BURST*/ 1); if (__ratelimit(&_rs)) WARN(1, KERN_DEBUG - "btrfs: block rsv returned %d\n", ret); + "BTRFS: block rsv returned %d\n", ret); } try_reserve: ret = reserve_metadata_bytes(root, block_rsv, blocksize, @@ -7767,7 +7769,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, btrfs_end_transaction_throttle(trans, tree_root); if (!for_reloc && btrfs_need_cleaner_sleep(root)) { - pr_debug("btrfs: drop snapshot early exit\n"); + pr_debug("BTRFS: drop snapshot early exit\n"); err = -EAGAIN; goto out_free; } @@ -8427,7 +8429,7 @@ static void __link_block_group(struct btrfs_space_info *space_info, ret = kobject_add(kobj, &space_info->kobj, "%s", get_raid_name(index)); if (ret) { - pr_warn("btrfs: failed to add kobject for block cache. ignoring.\n"); + pr_warn("BTRFS: failed to add kobject for block cache. ignoring.\n"); kobject_put(&space_info->kobj); } } -- cgit v1.2.3 From c9ea7b24ce5863d65efb1134319cede160674d41 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 19 Sep 2013 10:02:11 -0400 Subject: Btrfs: stop caching thread if extent_commit_sem is contended We can starve out the transaction commit with a bunch of caching threads all running at the same time. This is because we will only drop the extent_commit_sem if we need_resched(), which isn't likely to happen since we will be reading a lot from the disk so have already schedule()'ed plenty. Alex observed that he could starve out a transaction commit for up to a minute with 32 caching threads all running at once. This will allow us to drop the extent_commit_sem to allow the transaction commit to swap the commit_root out and then all the cachers will start back up. Here is an explanation provided by Igno So, just to fill in what happens in this loop: mutex_unlock(&caching_ctl->mutex); cond_resched(); goto again; where 'again:' takes caching_ctl->mutex and fs_info->extent_commit_sem again: again: mutex_lock(&caching_ctl->mutex); /* need to make sure the commit_root doesn't disappear */ down_read(&fs_info->extent_commit_sem); So, if I'm reading the code correct, there can be a fair amount of concurrency here: there may be multiple 'caching kthreads' per filesystem active, while there's one fs_info->extent_commit_sem per filesystem AFAICS. So, what happens if there are a lot of CPUs all busy holding the ->extent_commit_sem rwsem read-locked and a writer arrives? They'd all rush to try to release the fs_info->extent_commit_sem, and they'd block in the down_read() because there's a writer waiting. So there's a guarantee of forward progress. This should answer akpm's concern I think. Thanks, Acked-by: Ingo Molnar Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1c82bead2c08..3d19dcc553aa 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -442,7 +442,8 @@ next: if (ret) break; - if (need_resched()) { + if (need_resched() || + rwsem_is_contended(&fs_info->extent_commit_sem)) { caching_ctl->progress = last; btrfs_release_path(path); up_read(&fs_info->extent_commit_sem); -- cgit v1.2.3 From 17504584f52af944960f1ad16752aa10a0755b3b Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Sun, 29 Dec 2013 21:44:50 +0800 Subject: Btrfs: return free space to global_rsv as much as possible @full is not protected within global_rsv.lock, so we may think global_rsv is already full but in fact it's not, so we miss the opportunity to return free space to global_rsv directly when we release other block_rsvs. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3d19dcc553aa..41fe80b9db47 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4674,7 +4674,7 @@ void btrfs_block_rsv_release(struct btrfs_root *root, u64 num_bytes) { struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv; - if (global_rsv->full || global_rsv == block_rsv || + if (global_rsv == block_rsv || block_rsv->space_info != global_rsv->space_info) global_rsv = NULL; block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv, -- cgit v1.2.3 From 90515e7f5d7d24cbb2a4038a3f1b5cfa2921aa17 Mon Sep 17 00:00:00 2001 From: Wang Shilong Date: Tue, 7 Jan 2014 17:26:58 +0800 Subject: Btrfs: handle EAGAIN case properly in btrfs_drop_snapshot() We may return early in btrfs_drop_snapshot(), we shouldn't call btrfs_std_err() for this case, fix it. Cc: stable@vger.kernel.org Signed-off-by: Wang Shilong Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 41fe80b9db47..77acc0873895 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -7835,7 +7835,7 @@ out: */ if (!for_reloc && root_dropped == false) btrfs_add_dead_root(root); - if (err) + if (err && err != -EAGAIN) btrfs_std_error(root->fs_info, err); return err; } -- cgit v1.2.3 From d7df2c796d7eedd72a334dc89c65e1fec8171431 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 23 Jan 2014 09:21:38 -0500 Subject: Btrfs: attach delayed ref updates to delayed ref heads Currently we have two rb-trees, one for delayed ref heads and one for all of the delayed refs, including the delayed ref heads. When we process the delayed refs we have to hold onto the delayed ref lock for all of the selecting and merging and such, which results in quite a bit of lock contention. This was solved by having a waitqueue and only one flusher at a time, however this hurts if we get a lot of delayed refs queued up. So instead just have an rb tree for the delayed ref heads, and then attach the delayed ref updates to an rb tree that is per delayed ref head. Then we only need to take the delayed ref lock when adding new delayed refs and when selecting a delayed ref head to process, all the rest of the time we deal with a per delayed ref head lock which will be much less contentious. The locking rules for this get a little more complicated since we have to lock up to 3 things to properly process delayed refs, but I will address that problem later. For now this passes all of xfstests and my overnight stress tests. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 317 ++++++++++++++++--------------------------------- 1 file changed, 102 insertions(+), 215 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 77acc0873895..c77156c77de7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -857,12 +857,14 @@ again: btrfs_put_delayed_ref(&head->node); goto search_again; } + spin_lock(&head->lock); if (head->extent_op && head->extent_op->update_flags) extent_flags |= head->extent_op->flags_to_set; else BUG_ON(num_refs == 0); num_refs += head->node.ref_mod; + spin_unlock(&head->lock); mutex_unlock(&head->mutex); } spin_unlock(&delayed_refs->lock); @@ -2287,40 +2289,33 @@ static noinline struct btrfs_delayed_ref_node * select_delayed_ref(struct btrfs_delayed_ref_head *head) { struct rb_node *node; - struct btrfs_delayed_ref_node *ref; - int action = BTRFS_ADD_DELAYED_REF; -again: + struct btrfs_delayed_ref_node *ref, *last = NULL;; + /* * select delayed ref of type BTRFS_ADD_DELAYED_REF first. * this prevents ref count from going down to zero when * there still are pending delayed ref. */ - node = rb_prev(&head->node.rb_node); - while (1) { - if (!node) - break; + node = rb_first(&head->ref_root); + while (node) { ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr != head->node.bytenr) - break; - if (ref->action == action) + if (ref->action == BTRFS_ADD_DELAYED_REF) return ref; - node = rb_prev(node); - } - if (action == BTRFS_ADD_DELAYED_REF) { - action = BTRFS_DROP_DELAYED_REF; - goto again; + else if (last == NULL) + last = ref; + node = rb_next(node); } - return NULL; + return last; } /* * Returns 0 on success or if called with an already aborted transaction. * Returns -ENOMEM or -EIO on failure and will abort the transaction. */ -static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct list_head *cluster) +static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + unsigned long nr) { struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_node *ref; @@ -2328,23 +2323,26 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_extent_op *extent_op; struct btrfs_fs_info *fs_info = root->fs_info; int ret; - int count = 0; + unsigned long count = 0; int must_insert_reserved = 0; delayed_refs = &trans->transaction->delayed_refs; while (1) { if (!locked_ref) { - /* pick a new head ref from the cluster list */ - if (list_empty(cluster)) + if (count >= nr) break; - locked_ref = list_entry(cluster->next, - struct btrfs_delayed_ref_head, cluster); + spin_lock(&delayed_refs->lock); + locked_ref = btrfs_select_ref_head(trans); + if (!locked_ref) { + spin_unlock(&delayed_refs->lock); + break; + } /* grab the lock that says we are going to process * all the refs for this head */ ret = btrfs_delayed_ref_lock(trans, locked_ref); - + spin_unlock(&delayed_refs->lock); /* * we may have dropped the spin lock to get the head * mutex lock, and that might have given someone else @@ -2365,6 +2363,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, * finish. If we merged anything we need to re-loop so we can * get a good ref. */ + spin_lock(&locked_ref->lock); btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, locked_ref); @@ -2376,17 +2375,14 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, if (ref && ref->seq && btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) { - /* - * there are still refs with lower seq numbers in the - * process of being added. Don't run this ref yet. - */ - list_del_init(&locked_ref->cluster); + spin_unlock(&locked_ref->lock); btrfs_delayed_ref_unlock(locked_ref); - locked_ref = NULL; + spin_lock(&delayed_refs->lock); + locked_ref->processing = 0; delayed_refs->num_heads_ready++; spin_unlock(&delayed_refs->lock); + locked_ref = NULL; cond_resched(); - spin_lock(&delayed_refs->lock); continue; } @@ -2401,6 +2397,8 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, locked_ref->extent_op = NULL; if (!ref) { + + /* All delayed refs have been processed, Go ahead * and send the head node to run_one_delayed_ref, * so that any accounting fixes can happen @@ -2413,8 +2411,7 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } if (extent_op) { - spin_unlock(&delayed_refs->lock); - + spin_unlock(&locked_ref->lock); ret = run_delayed_extent_op(trans, root, ref, extent_op); btrfs_free_delayed_extent_op(extent_op); @@ -2428,23 +2425,38 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, */ if (must_insert_reserved) locked_ref->must_insert_reserved = 1; + locked_ref->processing = 0; btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret); - spin_lock(&delayed_refs->lock); btrfs_delayed_ref_unlock(locked_ref); return ret; } - - goto next; + continue; } - } - ref->in_tree = 0; - rb_erase(&ref->rb_node, &delayed_refs->root); - if (btrfs_delayed_ref_is_head(ref)) { + /* + * Need to drop our head ref lock and re-aqcuire the + * delayed ref lock and then re-check to make sure + * nobody got added. + */ + spin_unlock(&locked_ref->lock); + spin_lock(&delayed_refs->lock); + spin_lock(&locked_ref->lock); + if (rb_first(&locked_ref->ref_root)) { + spin_unlock(&locked_ref->lock); + spin_unlock(&delayed_refs->lock); + continue; + } + ref->in_tree = 0; + delayed_refs->num_heads--; rb_erase(&locked_ref->href_node, &delayed_refs->href_root); + spin_unlock(&delayed_refs->lock); + } else { + ref->in_tree = 0; + rb_erase(&ref->rb_node, &locked_ref->ref_root); } - delayed_refs->num_entries--; + atomic_dec(&delayed_refs->num_entries); + if (!btrfs_delayed_ref_is_head(ref)) { /* * when we play the delayed ref, also correct the @@ -2461,20 +2473,18 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, default: WARN_ON(1); } - } else { - list_del_init(&locked_ref->cluster); } - spin_unlock(&delayed_refs->lock); + spin_unlock(&locked_ref->lock); ret = run_one_delayed_ref(trans, root, ref, extent_op, must_insert_reserved); btrfs_free_delayed_extent_op(extent_op); if (ret) { + locked_ref->processing = 0; btrfs_delayed_ref_unlock(locked_ref); btrfs_put_delayed_ref(ref); btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret); - spin_lock(&delayed_refs->lock); return ret; } @@ -2490,11 +2500,9 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans, } btrfs_put_delayed_ref(ref); count++; -next: cond_resched(); - spin_lock(&delayed_refs->lock); } - return count; + return 0; } #ifdef SCRAMBLE_DELAYED_REFS @@ -2576,16 +2584,6 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, return ret; } -static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq, - int count) -{ - int val = atomic_read(&delayed_refs->ref_seq); - - if (val < seq || val >= seq + count) - return 1; - return 0; -} - static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) { u64 num_bytes; @@ -2647,12 +2645,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct rb_node *node; struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *head; - struct list_head cluster; int ret; - u64 delayed_start; int run_all = count == (unsigned long)-1; int run_most = 0; - int loops; /* We'll clean this up in btrfs_cleanup_transaction */ if (trans->aborted) @@ -2664,121 +2659,31 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info); delayed_refs = &trans->transaction->delayed_refs; - INIT_LIST_HEAD(&cluster); if (count == 0) { - count = delayed_refs->num_entries * 2; + count = atomic_read(&delayed_refs->num_entries) * 2; run_most = 1; } - if (!run_all && !run_most) { - int old; - int seq = atomic_read(&delayed_refs->ref_seq); - -progress: - old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); - if (old) { - DEFINE_WAIT(__wait); - if (delayed_refs->flushing || - !btrfs_should_throttle_delayed_refs(trans, root)) - return 0; - - prepare_to_wait(&delayed_refs->wait, &__wait, - TASK_UNINTERRUPTIBLE); - - old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1); - if (old) { - schedule(); - finish_wait(&delayed_refs->wait, &__wait); - - if (!refs_newer(delayed_refs, seq, 256)) - goto progress; - else - return 0; - } else { - finish_wait(&delayed_refs->wait, &__wait); - goto again; - } - } - - } else { - atomic_inc(&delayed_refs->procs_running_refs); - } - again: - loops = 0; - spin_lock(&delayed_refs->lock); - #ifdef SCRAMBLE_DELAYED_REFS delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); #endif - - while (1) { - if (!(run_all || run_most) && - !btrfs_should_throttle_delayed_refs(trans, root)) - break; - - /* - * go find something we can process in the rbtree. We start at - * the beginning of the tree, and then build a cluster - * of refs to process starting at the first one we are able to - * lock - */ - delayed_start = delayed_refs->run_delayed_start; - ret = btrfs_find_ref_cluster(trans, &cluster, - delayed_refs->run_delayed_start); - if (ret) - break; - - ret = run_clustered_refs(trans, root, &cluster); - if (ret < 0) { - btrfs_release_ref_cluster(&cluster); - spin_unlock(&delayed_refs->lock); - btrfs_abort_transaction(trans, root, ret); - atomic_dec(&delayed_refs->procs_running_refs); - wake_up(&delayed_refs->wait); - return ret; - } - - atomic_add(ret, &delayed_refs->ref_seq); - - count -= min_t(unsigned long, ret, count); - - if (count == 0) - break; - - if (delayed_start >= delayed_refs->run_delayed_start) { - if (loops == 0) { - /* - * btrfs_find_ref_cluster looped. let's do one - * more cycle. if we don't run any delayed ref - * during that cycle (because we can't because - * all of them are blocked), bail out. - */ - loops = 1; - } else { - /* - * no runnable refs left, stop trying - */ - BUG_ON(run_all); - break; - } - } - if (ret) { - /* refs were run, let's reset staleness detection */ - loops = 0; - } + ret = __btrfs_run_delayed_refs(trans, root, count); + if (ret < 0) { + btrfs_abort_transaction(trans, root, ret); + return ret; } if (run_all) { - if (!list_empty(&trans->new_bgs)) { - spin_unlock(&delayed_refs->lock); + if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); - spin_lock(&delayed_refs->lock); - } + spin_lock(&delayed_refs->lock); node = rb_first(&delayed_refs->href_root); - if (!node) + if (!node) { + spin_unlock(&delayed_refs->lock); goto out; + } count = (unsigned long)-1; while (node) { @@ -2807,16 +2712,10 @@ again: node = rb_next(node); } spin_unlock(&delayed_refs->lock); - schedule_timeout(1); + cond_resched(); goto again; } out: - atomic_dec(&delayed_refs->procs_running_refs); - smp_mb(); - if (waitqueue_active(&delayed_refs->wait)) - wake_up(&delayed_refs->wait); - - spin_unlock(&delayed_refs->lock); assert_qgroups_uptodate(trans); return 0; } @@ -2858,12 +2757,13 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, struct rb_node *node; int ret = 0; - ret = -ENOENT; delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(trans, bytenr); - if (!head) - goto out; + if (!head) { + spin_unlock(&delayed_refs->lock); + return 0; + } if (!mutex_trylock(&head->mutex)) { atomic_inc(&head->node.refs); @@ -2880,40 +2780,35 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(&head->node); return -EAGAIN; } + spin_unlock(&delayed_refs->lock); - node = rb_prev(&head->node.rb_node); - if (!node) - goto out_unlock; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - if (ref->bytenr != bytenr) - goto out_unlock; + spin_lock(&head->lock); + node = rb_first(&head->ref_root); + while (node) { + ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); + node = rb_next(node); - ret = 1; - if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) - goto out_unlock; + /* If it's a shared ref we know a cross reference exists */ + if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { + ret = 1; + break; + } - data_ref = btrfs_delayed_node_to_data_ref(ref); + data_ref = btrfs_delayed_node_to_data_ref(ref); - node = rb_prev(node); - if (node) { - int seq = ref->seq; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - if (ref->bytenr == bytenr && ref->seq == seq) - goto out_unlock; + /* + * If our ref doesn't match the one we're currently looking at + * then we have a cross reference. + */ + if (data_ref->root != root->root_key.objectid || + data_ref->objectid != objectid || + data_ref->offset != offset) { + ret = 1; + break; + } } - - if (data_ref->root != root->root_key.objectid || - data_ref->objectid != objectid || data_ref->offset != offset) - goto out_unlock; - - ret = 0; -out_unlock: + spin_unlock(&head->lock); mutex_unlock(&head->mutex); -out: - spin_unlock(&delayed_refs->lock); return ret; } @@ -5953,8 +5848,6 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, { struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_delayed_ref_node *ref; - struct rb_node *node; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; @@ -5963,14 +5856,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, if (!head) goto out; - node = rb_prev(&head->node.rb_node); - if (!node) - goto out; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - - /* there are still entries for this ref, we can't drop it */ - if (ref->bytenr == bytenr) + spin_lock(&head->lock); + if (rb_first(&head->ref_root)) goto out; if (head->extent_op) { @@ -5992,20 +5879,19 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, * ahead and process it. */ head->node.in_tree = 0; - rb_erase(&head->node.rb_node, &delayed_refs->root); rb_erase(&head->href_node, &delayed_refs->href_root); - delayed_refs->num_entries--; + atomic_dec(&delayed_refs->num_entries); /* * we don't take a ref on the node because we're removing it from the * tree, so we just steal the ref the tree was holding. */ delayed_refs->num_heads--; - if (list_empty(&head->cluster)) + if (head->processing == 0) delayed_refs->num_heads_ready--; - - list_del_init(&head->cluster); + head->processing = 0; + spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); BUG_ON(head->extent_op); @@ -6016,6 +5902,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, btrfs_put_delayed_ref(&head->node); return ret; out: + spin_unlock(&head->lock); spin_unlock(&delayed_refs->lock); return 0; } -- cgit v1.2.3 From 0a2b2a844af616addc87cac3cc18dcaba2a9d0fb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 23 Jan 2014 10:54:11 -0500 Subject: Btrfs: throttle delayed refs better On one of our gluster clusters we noticed some pretty big lag spikes. This turned out to be because our transaction commit was taking like 3 minutes to complete. This is because we have like 30 gigs of metadata, so our global reserve would end up being the max which is like 512 mb. So our throttling code would allow a ridiculous amount of delayed refs to build up and then they'd all get run at transaction commit time, and for a cold mounted file system that could take up to 3 minutes to run. So fix the throttling to be based on both the size of the global reserve and how long it takes us to run delayed refs. This patch tracks the time it takes to run delayed refs and then only allows 1 seconds worth of outstanding delayed refs at a time. This way it will auto-tune itself from cold cache up to when everything is in memory and it no longer has to go to disk. This makes our transaction commits take much less time to run. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c77156c77de7..b5322596d60b 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2322,8 +2322,10 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *locked_ref = NULL; struct btrfs_delayed_extent_op *extent_op; struct btrfs_fs_info *fs_info = root->fs_info; + ktime_t start = ktime_get(); int ret; unsigned long count = 0; + unsigned long actual_count = 0; int must_insert_reserved = 0; delayed_refs = &trans->transaction->delayed_refs; @@ -2452,6 +2454,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, &delayed_refs->href_root); spin_unlock(&delayed_refs->lock); } else { + actual_count++; ref->in_tree = 0; rb_erase(&ref->rb_node, &locked_ref->ref_root); } @@ -2502,6 +2505,26 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, count++; cond_resched(); } + + /* + * We don't want to include ref heads since we can have empty ref heads + * and those will drastically skew our runtime down since we just do + * accounting, no actual extent tree updates. + */ + if (actual_count > 0) { + u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start)); + u64 avg; + + /* + * We weigh the current average higher than our current runtime + * to avoid large swings in the average. + */ + spin_lock(&delayed_refs->lock); + avg = fs_info->avg_delayed_ref_runtime * 3 + runtime; + avg = div64_u64(avg, 4); + fs_info->avg_delayed_ref_runtime = avg; + spin_unlock(&delayed_refs->lock); + } return 0; } @@ -2600,7 +2623,7 @@ static inline u64 heads_to_leaves(struct btrfs_root *root, u64 heads) return div64_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(root)); } -int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, +int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_block_rsv *global_rsv; @@ -2629,6 +2652,22 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, return ret; } +int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + u64 num_entries = + atomic_read(&trans->transaction->delayed_refs.num_entries); + u64 avg_runtime; + + smp_mb(); + avg_runtime = fs_info->avg_delayed_ref_runtime; + if (num_entries * avg_runtime >= NSEC_PER_SEC) + return 1; + + return btrfs_check_space_for_delayed_refs(trans, root); +} + /* * this starts processing the delayed reference count updates and * extent insertions we have queued up so far. count can be -- cgit v1.2.3 From 14a958e678cd77cae475b60ca46c0797b1c006a1 Mon Sep 17 00:00:00 2001 From: Filipe David Borba Manana Date: Sun, 12 Jan 2014 02:22:46 +0000 Subject: Btrfs: fix btrfs boot when compiled as built-in After the change titled "Btrfs: add support for inode properties", if btrfs was built-in the kernel (i.e. not as a module), it would cause a kernel panic, as reported recently by Fengguang: [ 2.024722] BUG: unable to handle kernel NULL pointer dereference at (null) [ 2.027814] IP: [] crc32c+0xc/0x6b [ 2.028684] PGD 0 [ 2.028684] Oops: 0000 [#1] SMP [ 2.028684] Modules linked in: [ 2.028684] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.13.0-rc7-04795-ga7b57c2 #1 [ 2.028684] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 2.028684] task: ffff88000edba100 ti: ffff88000edd6000 task.ti: ffff88000edd6000 [ 2.028684] RIP: 0010:[] [] crc32c+0xc/0x6b [ 2.028684] RSP: 0000:ffff88000edd7e58 EFLAGS: 00010246 [ 2.028684] RAX: 0000000000000000 RBX: ffffffff82295550 RCX: 0000000000000000 [ 2.028684] RDX: 0000000000000011 RSI: ffffffff81efe393 RDI: 00000000fffffffe [ 2.028684] RBP: ffff88000edd7e60 R08: 0000000000000003 R09: 0000000000015d20 [ 2.028684] R10: ffffffff81ef225e R11: ffffffff811b0222 R12: ffffffffffffffff [ 2.028684] R13: 0000000000000239 R14: 0000000000000000 R15: 0000000000000000 [ 2.028684] FS: 0000000000000000(0000) GS:ffff88000fa00000(0000) knlGS:0000000000000000 [ 2.028684] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 2.028684] CR2: 0000000000000000 CR3: 000000000220c000 CR4: 00000000000006f0 [ 2.028684] Stack: [ 2.028684] ffffffff82295550 ffff88000edd7e80 ffffffff8238af62 ffffffff8238ac05 [ 2.028684] 0000000000000000 ffff88000edd7e98 ffffffff8238ac0f ffffffff8238ac05 [ 2.028684] ffff88000edd7f08 ffffffff810002ba ffff88000edd7f00 ffffffff810e2404 [ 2.028684] Call Trace: [ 2.028684] [] btrfs_props_init+0x4f/0x96 [ 2.028684] [] ? ftrace_define_fields_btrfs_space_reservation+0x145/0x145 [ 2.028684] [] init_btrfs_fs+0xa/0xf0 [ 2.028684] [] ? ftrace_define_fields_btrfs_space_reservation+0x145/0x145 [ 2.028684] [] do_one_initcall+0xa4/0x13a [ 2.028684] [] ? parse_args+0x25f/0x33d [ 2.028684] [] kernel_init_freeable+0x1aa/0x230 [ 2.028684] [] ? do_early_param+0x88/0x88 [ 2.028684] [] ? rest_init+0x89/0x89 [ 2.028684] [] kernel_init+0xe/0x109 The issue here is that the initialization function of btrfs (super.c:init_btrfs_fs) started using crc32c (from lib/libcrc32c.c). But when it needs to call crc32c (as part of the properties initialization routine), the libcrc32c is not yet initialized, so crc32c derreferenced a NULL pointer (lib/libcrc32c.c:tfm), causing the kernel panic on boot. The approach to fix this is to use crypto component directly to use its crc32c (which is basically what lib/libcrc32c.c is, a wrapper around crypto). This is what ext4 is doing as well, it uses crypto directly to get crc32c functionality. Verified this works both when btrfs is built-in and when it's loadable kernel module. Signed-off-by: Filipe David Borba Manana Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b5322596d60b..db1e32ffcfbb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1074,11 +1074,11 @@ static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset) __le64 lenum; lenum = cpu_to_le64(root_objectid); - high_crc = crc32c(high_crc, &lenum, sizeof(lenum)); + high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(owner); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); lenum = cpu_to_le64(offset); - low_crc = crc32c(low_crc, &lenum, sizeof(lenum)); + low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum)); return ((u64)high_crc << 31) ^ (u64)low_crc; } -- cgit v1.2.3 From 920e4a58d27ea146b34674cf9565ab0373f9ca51 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 Jan 2014 20:00:55 +0800 Subject: Btrfs: cleanup the redundant code for the block group allocation and init Signed-off-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 94 +++++++++++++++++++++++--------------------------- 1 file changed, 44 insertions(+), 50 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index db1e32ffcfbb..1efcc262be47 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -8364,6 +8364,41 @@ static void __link_block_group(struct btrfs_space_info *space_info, up_write(&space_info->groups_sem); } +static struct btrfs_block_group_cache * +btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return NULL; + + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_NOFS); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + + cache->key.objectid = start; + cache->key.offset = size; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + + cache->sectorsize = root->sectorsize; + cache->fs_info = root->fs_info; + cache->full_stripe_len = btrfs_full_stripe_len(root, + &root->fs_info->mapping_tree, + start); + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->new_bg_list); + btrfs_init_free_space_ctl(cache); + + return cache; +} + int btrfs_read_block_groups(struct btrfs_root *root) { struct btrfs_path *path; @@ -8399,26 +8434,16 @@ int btrfs_read_block_groups(struct btrfs_root *root) break; if (ret != 0) goto error; + leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - cache = kzalloc(sizeof(*cache), GFP_NOFS); + + cache = btrfs_create_block_group_cache(root, found_key.objectid, + found_key.offset); if (!cache) { ret = -ENOMEM; goto error; } - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - ret = -ENOMEM; - goto error; - } - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - cache->fs_info = info; - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); if (need_clear) { /* @@ -8439,16 +8464,10 @@ int btrfs_read_block_groups(struct btrfs_root *root) read_extent_buffer(leaf, &cache->item, btrfs_item_ptr_offset(leaf, path->slots[0]), sizeof(cache->item)); - memcpy(&cache->key, &found_key, sizeof(found_key)); + cache->flags = btrfs_block_group_flags(&cache->item); key.objectid = found_key.objectid + found_key.offset; btrfs_release_path(path); - cache->flags = btrfs_block_group_flags(&cache->item); - cache->sectorsize = root->sectorsize; - cache->full_stripe_len = btrfs_full_stripe_len(root, - &root->fs_info->mapping_tree, - found_key.objectid); - btrfs_init_free_space_ctl(cache); /* * We need to exclude the super stripes now so that the space @@ -8462,8 +8481,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) * case. */ free_excluded_extents(root, cache); - kfree(cache->free_space_ctl); - kfree(cache); + btrfs_put_block_group(cache); goto error; } @@ -8594,38 +8612,15 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, root->fs_info->last_trans_log_full_commit = trans->transid; - cache = kzalloc(sizeof(*cache), GFP_NOFS); + cache = btrfs_create_block_group_cache(root, chunk_offset, size); if (!cache) return -ENOMEM; - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - return -ENOMEM; - } - - cache->key.objectid = chunk_offset; - cache->key.offset = size; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = root->sectorsize; - cache->fs_info = root->fs_info; - cache->full_stripe_len = btrfs_full_stripe_len(root, - &root->fs_info->mapping_tree, - chunk_offset); - - atomic_set(&cache->count, 1); - spin_lock_init(&cache->lock); - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->new_bg_list); - - btrfs_init_free_space_ctl(cache); btrfs_set_block_group_used(&cache->item, bytes_used); btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); - cache->flags = type; btrfs_set_block_group_flags(&cache->item, type); + cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; ret = exclude_super_stripes(root, cache); @@ -8635,8 +8630,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, * case. */ free_excluded_extents(root, cache); - kfree(cache->free_space_ctl); - kfree(cache); + btrfs_put_block_group(cache); return ret; } -- cgit v1.2.3 From 215a63d139b1e04ce4b595eeca84671782eb5758 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 Jan 2014 20:00:56 +0800 Subject: Btrfs: cleanup the code of used_block_group in find_free_extent() used_block_group is just used for the space cluster which doesn't belong to the current block group, the other place needn't use it. Or the logic of code seems unclear. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 33 +++++++++++++-------------------- 1 file changed, 13 insertions(+), 20 deletions(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 1efcc262be47..b55a4fd13ecc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6159,7 +6159,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, struct btrfs_root *root = orig_root->fs_info->extent_root; struct btrfs_free_cluster *last_ptr = NULL; struct btrfs_block_group_cache *block_group = NULL; - struct btrfs_block_group_cache *used_block_group; u64 search_start = 0; u64 max_extent_size = 0; int empty_cluster = 2 * 1024 * 1024; @@ -6220,7 +6219,6 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, if (search_start == hint_byte) { block_group = btrfs_lookup_block_group(root->fs_info, search_start); - used_block_group = block_group; /* * we don't want to use the block group if it doesn't match our * allocation bits, or if its not cached. @@ -6257,7 +6255,6 @@ search: u64 offset; int cached; - used_block_group = block_group; btrfs_get_block_group(block_group); search_start = block_group->key.objectid; @@ -6300,6 +6297,7 @@ have_block_group: * lets look there */ if (last_ptr) { + struct btrfs_block_group_cache *used_block_group; unsigned long aligned_cluster; /* * the refill lock keeps out other @@ -6310,10 +6308,8 @@ have_block_group: if (used_block_group != block_group && (!used_block_group || used_block_group->ro || - !block_group_bits(used_block_group, flags))) { - used_block_group = block_group; + !block_group_bits(used_block_group, flags))) goto refill_cluster; - } if (used_block_group != block_group) btrfs_get_block_group(used_block_group); @@ -6328,16 +6324,17 @@ have_block_group: spin_unlock(&last_ptr->refill_lock); trace_btrfs_reserve_extent_cluster(root, block_group, search_start, num_bytes); + if (used_block_group != block_group) { + btrfs_put_block_group(block_group); + block_group = used_block_group; + } goto checks; } WARN_ON(last_ptr->block_group != used_block_group); - if (used_block_group != block_group) { + if (used_block_group != block_group) btrfs_put_block_group(used_block_group); - used_block_group = block_group; - } refill_cluster: - BUG_ON(used_block_group != block_group); /* If we are on LOOP_NO_EMPTY_SIZE, we can't * set up a new clusters, so lets just skip it * and let the allocator find whatever block @@ -6456,25 +6453,25 @@ unclustered_alloc: goto loop; } checks: - search_start = stripe_align(root, used_block_group, + search_start = stripe_align(root, block_group, offset, num_bytes); /* move on to the next group */ if (search_start + num_bytes > - used_block_group->key.objectid + used_block_group->key.offset) { - btrfs_add_free_space(used_block_group, offset, num_bytes); + block_group->key.objectid + block_group->key.offset) { + btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } if (offset < search_start) - btrfs_add_free_space(used_block_group, offset, + btrfs_add_free_space(block_group, offset, search_start - offset); BUG_ON(offset > search_start); - ret = btrfs_update_reserved_bytes(used_block_group, num_bytes, + ret = btrfs_update_reserved_bytes(block_group, num_bytes, alloc_type); if (ret == -EAGAIN) { - btrfs_add_free_space(used_block_group, offset, num_bytes); + btrfs_add_free_space(block_group, offset, num_bytes); goto loop; } @@ -6484,16 +6481,12 @@ checks: trace_btrfs_reserve_extent(orig_root, block_group, search_start, num_bytes); - if (used_block_group != block_group) - btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); break; loop: failed_cluster_refill = false; failed_alloc = false; BUG_ON(index != get_block_group_index(block_group)); - if (used_block_group != block_group) - btrfs_put_block_group(used_block_group); btrfs_put_block_group(block_group); } up_read(&space_info->groups_sem); -- cgit v1.2.3 From 89d4346a36a00ab1f9bd71f929564e9fc1c7c539 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 Jan 2014 20:00:57 +0800 Subject: Btrfs: fix wrong block group in trace during the free space allocation We allocate the free space from the former block group, not the current one, so should use the former one to output the trace information. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index b55a4fd13ecc..73b55d94b953 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6323,7 +6323,8 @@ have_block_group: /* we have a block, we're done */ spin_unlock(&last_ptr->refill_lock); trace_btrfs_reserve_extent_cluster(root, - block_group, search_start, num_bytes); + used_block_group, + search_start, num_bytes); if (used_block_group != block_group) { btrfs_put_block_group(block_group); block_group = used_block_group; -- cgit v1.2.3 From cf93da7bcf450cb4595055d491a0519cb39e68ed Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 29 Jan 2014 07:02:40 -0800 Subject: Btrfs: fix spin_unlock in check_ref_cleanup Our goto out should have gone a little farther. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs/btrfs/extent-tree.c') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 73b55d94b953..9c9ecc93ae2c 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5893,7 +5893,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, spin_lock(&delayed_refs->lock); head = btrfs_find_delayed_ref_head(trans, bytenr); if (!head) - goto out; + goto out_delayed_unlock; spin_lock(&head->lock); if (rb_first(&head->ref_root)) @@ -5942,6 +5942,8 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, return ret; out: spin_unlock(&head->lock); + +out_delayed_unlock: spin_unlock(&delayed_refs->lock); return 0; } -- cgit v1.2.3