From 581bb050941b4f220f84d3e5ed6dace3d42dd382 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 20 Apr 2011 10:06:11 +0800 Subject: Btrfs: Cache free inode numbers in memory Currently btrfs stores the highest objectid of the fs tree, and it always returns (highest+1) inode number when we create a file, so inode numbers won't be reclaimed when we delete files, so we'll run out of inode numbers as we keep create/delete files in 32bits machines. This fixes it, and it works similarly to how we cache free space in block cgroups. We start a kernel thread to read the file tree. By scanning inode items, we know which chunks of inode numbers are free, and we cache them in an rb-tree. Because we are searching the commit root, we have to carefully handle the cross-transaction case. The rb-tree is a hybrid extent+bitmap tree, so if we have too many small chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram of extents, and a bitmap will be used if we exceed this threshold. The extents threshold is adjusted in runtime. Signed-off-by: Li Zefan --- fs/btrfs/inode-map.c | 341 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 336 insertions(+), 5 deletions(-) (limited to 'fs/btrfs/inode-map.c') diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index c05a08f4c411..5be62df90c4f 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -16,11 +16,343 @@ * Boston, MA 021110-1307, USA. */ +#include +#include +#include + #include "ctree.h" #include "disk-io.h" +#include "free-space-cache.h" +#include "inode-map.h" #include "transaction.h" -int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) +static int caching_kthread(void *data) +{ + struct btrfs_root *root = data; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + u64 last = (u64)-1; + int slot; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* Since the commit root is read-only, we can safely skip locking. */ + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = 2; + + key.objectid = BTRFS_FIRST_FREE_OBJECTID; + key.offset = 0; + key.type = BTRFS_INODE_ITEM_KEY; +again: + /* need to make sure the commit_root doesn't disappear */ + mutex_lock(&root->fs_commit_mutex); + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + smp_mb(); + if (fs_info->closing > 1) + goto out; + + leaf = path->nodes[0]; + slot = path->slots[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + else if (ret > 0) + break; + + if (need_resched() || + btrfs_transaction_in_commit(fs_info)) { + leaf = path->nodes[0]; + + if (btrfs_header_nritems(leaf) == 0) { + WARN_ON(1); + break; + } + + /* + * Save the key so we can advances forward + * in the next search. + */ + btrfs_item_key_to_cpu(leaf, &key, 0); + btrfs_release_path(root, path); + root->cache_progress = last; + mutex_unlock(&root->fs_commit_mutex); + schedule_timeout(1); + goto again; + } else + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + + if (key.type != BTRFS_INODE_ITEM_KEY) + goto next; + + if (key.objectid >= BTRFS_LAST_FREE_OBJECTID) + break; + + if (last != (u64)-1 && last + 1 != key.objectid) { + __btrfs_add_free_space(ctl, last + 1, + key.objectid - last - 1); + wake_up(&root->cache_wait); + } + + last = key.objectid; +next: + path->slots[0]++; + } + + if (last < BTRFS_LAST_FREE_OBJECTID - 1) { + __btrfs_add_free_space(ctl, last + 1, + BTRFS_LAST_FREE_OBJECTID - last - 1); + } + + spin_lock(&root->cache_lock); + root->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&root->cache_lock); + + root->cache_progress = (u64)-1; + btrfs_unpin_free_ino(root); +out: + wake_up(&root->cache_wait); + mutex_unlock(&root->fs_commit_mutex); + + btrfs_free_path(path); + + return ret; +} + +static void start_caching(struct btrfs_root *root) +{ + struct task_struct *tsk; + + spin_lock(&root->cache_lock); + if (root->cached != BTRFS_CACHE_NO) { + spin_unlock(&root->cache_lock); + return; + } + + root->cached = BTRFS_CACHE_STARTED; + spin_unlock(&root->cache_lock); + + tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", + root->root_key.objectid); + BUG_ON(IS_ERR(tsk)); +} + +int btrfs_find_free_ino(struct btrfs_root *root, u64 *objectid) +{ +again: + *objectid = btrfs_find_ino_for_alloc(root); + + if (*objectid != 0) + return 0; + + start_caching(root); + + wait_event(root->cache_wait, + root->cached == BTRFS_CACHE_FINISHED || + root->free_ino_ctl->free_space > 0); + + if (root->cached == BTRFS_CACHE_FINISHED && + root->free_ino_ctl->free_space == 0) + return -ENOSPC; + else + goto again; +} + +void btrfs_return_ino(struct btrfs_root *root, u64 objectid) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; +again: + if (root->cached == BTRFS_CACHE_FINISHED) { + __btrfs_add_free_space(ctl, objectid, 1); + } else { + /* + * If we are in the process of caching free ino chunks, + * to avoid adding the same inode number to the free_ino + * tree twice due to cross transaction, we'll leave it + * in the pinned tree until a transaction is committed + * or the caching work is done. + */ + + mutex_lock(&root->fs_commit_mutex); + spin_lock(&root->cache_lock); + if (root->cached == BTRFS_CACHE_FINISHED) { + spin_unlock(&root->cache_lock); + mutex_unlock(&root->fs_commit_mutex); + goto again; + } + spin_unlock(&root->cache_lock); + + start_caching(root); + + if (objectid <= root->cache_progress) + __btrfs_add_free_space(ctl, objectid, 1); + else + __btrfs_add_free_space(pinned, objectid, 1); + + mutex_unlock(&root->fs_commit_mutex); + } +} + +/* + * When a transaction is committed, we'll move those inode numbers which + * are smaller than root->cache_progress from pinned tree to free_ino tree, + * and others will just be dropped, because the commit root we were + * searching has changed. + * + * Must be called with root->fs_commit_mutex held + */ +void btrfs_unpin_free_ino(struct btrfs_root *root) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct rb_root *rbroot = &root->free_ino_pinned->free_space_offset; + struct btrfs_free_space *info; + struct rb_node *n; + u64 count; + + while (1) { + n = rb_first(rbroot); + if (!n) + break; + + info = rb_entry(n, struct btrfs_free_space, offset_index); + BUG_ON(info->bitmap); + + if (info->offset > root->cache_progress) + goto free; + else if (info->offset + info->bytes > root->cache_progress) + count = root->cache_progress - info->offset + 1; + else + count = info->bytes; + + __btrfs_add_free_space(ctl, info->offset, count); +free: + rb_erase(&info->offset_index, rbroot); + kfree(info); + } +} + +#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space)) +#define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8) + +/* + * The goal is to keep the memory used by the free_ino tree won't + * exceed the memory if we use bitmaps only. + */ +static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) +{ + struct btrfs_free_space *info; + struct rb_node *n; + int max_ino; + int max_bitmaps; + + n = rb_last(&ctl->free_space_offset); + if (!n) { + ctl->extents_thresh = INIT_THRESHOLD; + return; + } + info = rb_entry(n, struct btrfs_free_space, offset_index); + + /* + * Find the maximum inode number in the filesystem. Note we + * ignore the fact that this can be a bitmap, because we are + * not doing precise calculation. + */ + max_ino = info->bytes - 1; + + max_bitmaps = ALIGN(max_ino, INODES_PER_BITMAP) / INODES_PER_BITMAP; + if (max_bitmaps <= ctl->total_bitmaps) { + ctl->extents_thresh = 0; + return; + } + + ctl->extents_thresh = (max_bitmaps - ctl->total_bitmaps) * + PAGE_CACHE_SIZE / sizeof(*info); +} + +/* + * We don't fall back to bitmap, if we are below the extents threshold + * or this chunk of inode numbers is a big one. + */ +static bool use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + if (ctl->free_extents < ctl->extents_thresh || + info->bytes > INODES_PER_BITMAP / 10) + return false; + + return true; +} + +static struct btrfs_free_space_op free_ino_op = { + .recalc_thresholds = recalculate_thresholds, + .use_bitmap = use_bitmap, +}; + +static void pinned_recalc_thresholds(struct btrfs_free_space_ctl *ctl) +{ +} + +static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + /* + * We always use extents for two reasons: + * + * - The pinned tree is only used during the process of caching + * work. + * - Make code simpler. See btrfs_unpin_free_ino(). + */ + return false; +} + +static struct btrfs_free_space_op pinned_free_ino_op = { + .recalc_thresholds = pinned_recalc_thresholds, + .use_bitmap = pinned_use_bitmap, +}; + +void btrfs_init_free_ino_ctl(struct btrfs_root *root) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_free_space_ctl *pinned = root->free_ino_pinned; + + spin_lock_init(&ctl->tree_lock); + ctl->unit = 1; + ctl->start = 0; + ctl->private = NULL; + ctl->op = &free_ino_op; + + /* + * Initially we allow to use 16K of ram to cache chunks of + * inode numbers before we resort to bitmaps. This is somewhat + * arbitrary, but it will be adjusted in runtime. + */ + ctl->extents_thresh = INIT_THRESHOLD; + + spin_lock_init(&pinned->tree_lock); + pinned->unit = 1; + pinned->start = 0; + pinned->private = NULL; + pinned->extents_thresh = 0; + pinned->op = &pinned_free_ino_op; +} + +static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) { struct btrfs_path *path; int ret; @@ -55,15 +387,14 @@ error: return ret; } -int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - u64 dirid, u64 *objectid) +int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) { int ret; mutex_lock(&root->objectid_mutex); if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { - ret = btrfs_find_highest_inode(root, &root->highest_objectid); + ret = btrfs_find_highest_objectid(root, + &root->highest_objectid); if (ret) goto out; } -- cgit v1.2.3 From 82d5902d9c681be37ffa9d70482907f9f0b7ec1f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 20 Apr 2011 10:33:24 +0800 Subject: Btrfs: Support reading/writing on disk free ino cache This is similar to block group caching. We dedicate a special inode in fs tree to save free ino cache. At the very first time we create/delete a file after mount, the free ino cache will be loaded from disk into memory. When the fs tree is commited, the cache will be written back to disk. To keep compatibility, we check the root generation against the generation of the special inode when loading the cache, so the loading will fail if the btrfs filesystem was mounted in an older kernel before. Signed-off-by: Li Zefan --- fs/btrfs/inode-map.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) (limited to 'fs/btrfs/inode-map.c') diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 5be62df90c4f..7967e85c72f5 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -137,6 +137,7 @@ out: static void start_caching(struct btrfs_root *root) { struct task_struct *tsk; + int ret; spin_lock(&root->cache_lock); if (root->cached != BTRFS_CACHE_NO) { @@ -147,6 +148,14 @@ static void start_caching(struct btrfs_root *root) root->cached = BTRFS_CACHE_STARTED; spin_unlock(&root->cache_lock); + ret = load_free_ino_cache(root->fs_info, root); + if (ret == 1) { + spin_lock(&root->cache_lock); + root->cached = BTRFS_CACHE_FINISHED; + spin_unlock(&root->cache_lock); + return; + } + tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", root->root_key.objectid); BUG_ON(IS_ERR(tsk)); @@ -352,6 +361,84 @@ void btrfs_init_free_ino_ctl(struct btrfs_root *root) pinned->op = &pinned_free_ino_op; } +int btrfs_save_ino_cache(struct btrfs_root *root, + struct btrfs_trans_handle *trans) +{ + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; + struct btrfs_path *path; + struct inode *inode; + u64 alloc_hint = 0; + int ret; + int prealloc; + bool retry = false; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; +again: + inode = lookup_free_ino_inode(root, path); + if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { + ret = PTR_ERR(inode); + goto out; + } + + if (IS_ERR(inode)) { + BUG_ON(retry); + retry = true; + + ret = create_free_ino_inode(root, trans, path); + if (ret) + goto out; + goto again; + } + + BTRFS_I(inode)->generation = 0; + ret = btrfs_update_inode(trans, root, inode); + WARN_ON(ret); + + if (i_size_read(inode) > 0) { + ret = btrfs_truncate_free_space_cache(root, trans, path, inode); + if (ret) + goto out_put; + } + + spin_lock(&root->cache_lock); + if (root->cached != BTRFS_CACHE_FINISHED) { + ret = -1; + spin_unlock(&root->cache_lock); + goto out_put; + } + spin_unlock(&root->cache_lock); + + spin_lock(&ctl->tree_lock); + prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents; + prealloc = ALIGN(prealloc, PAGE_CACHE_SIZE); + prealloc += ctl->total_bitmaps * PAGE_CACHE_SIZE; + spin_unlock(&ctl->tree_lock); + + /* Just to make sure we have enough space */ + prealloc += 8 * PAGE_CACHE_SIZE; + + ret = btrfs_check_data_free_space(inode, prealloc); + if (ret) + goto out_put; + + ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc, + prealloc, prealloc, &alloc_hint); + if (ret) + goto out_put; + btrfs_free_reserved_data_space(inode, prealloc); + +out_put: + iput(inode); +out: + if (ret == 0) + ret = btrfs_write_out_ino_cache(root, trans, path); + + btrfs_free_path(path); + return ret; +} + static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) { struct btrfs_path *path; -- cgit v1.2.3 From a47d6b70e280401d553e7cac6f5750870de1ad21 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 26 May 2011 06:38:30 +0000 Subject: Btrfs: setup free ino caching in a more asynchronous way For a filesystem that has lots of files in it, the first time we mount it with free ino caching support, it can take quite a long time to setup the caching before we can create new files. Here we fill the cache with [highest_ino, BTRFS_LAST_FREE_OBJECTID] before we start the caching thread to search through the extent tree. Signed-off-by: Li Zefan Signed-off-by: Chris Mason --- fs/btrfs/inode-map.c | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/inode-map.c') diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 000970512624..3262cd17a12f 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -60,12 +60,12 @@ again: while (1) { smp_mb(); - if (fs_info->closing > 1) + if (fs_info->closing) goto out; leaf = path->nodes[0]; slot = path->slots[0]; - if (path->slots[0] >= btrfs_header_nritems(leaf)) { + if (slot >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); if (ret < 0) goto out; @@ -100,7 +100,7 @@ again: if (key.type != BTRFS_INODE_ITEM_KEY) goto next; - if (key.objectid >= BTRFS_LAST_FREE_OBJECTID) + if (key.objectid >= root->highest_objectid) break; if (last != (u64)-1 && last + 1 != key.objectid) { @@ -114,9 +114,9 @@ next: path->slots[0]++; } - if (last < BTRFS_LAST_FREE_OBJECTID - 1) { + if (last < root->highest_objectid - 1) { __btrfs_add_free_space(ctl, last + 1, - BTRFS_LAST_FREE_OBJECTID - last - 1); + root->highest_objectid - last - 1); } spin_lock(&root->cache_lock); @@ -136,8 +136,10 @@ out: static void start_caching(struct btrfs_root *root) { + struct btrfs_free_space_ctl *ctl = root->free_ino_ctl; struct task_struct *tsk; int ret; + u64 objectid; spin_lock(&root->cache_lock); if (root->cached != BTRFS_CACHE_NO) { @@ -156,6 +158,19 @@ static void start_caching(struct btrfs_root *root) return; } + /* + * It can be quite time-consuming to fill the cache by searching + * through the extent tree, and this can keep ino allocation path + * waiting. Therefore at start we quickly find out the highest + * inode number and we know we can use inode numbers which fall in + * [highest_ino + 1, BTRFS_LAST_FREE_OBJECTID]. + */ + ret = btrfs_find_free_objectid(root, &objectid); + if (!ret && objectid <= BTRFS_LAST_FREE_OBJECTID) { + __btrfs_add_free_space(ctl, objectid, + BTRFS_LAST_FREE_OBJECTID - objectid + 1); + } + tsk = kthread_run(caching_kthread, root, "btrfs-ino-cache-%llu\n", root->root_key.objectid); BUG_ON(IS_ERR(tsk)); @@ -209,7 +224,8 @@ again: start_caching(root); - if (objectid <= root->cache_progress) + if (objectid <= root->cache_progress || + objectid > root->highest_objectid) __btrfs_add_free_space(ctl, objectid, 1); else __btrfs_add_free_space(pinned, objectid, 1); -- cgit v1.2.3