From 581bb050941b4f220f84d3e5ed6dace3d42dd382 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 20 Apr 2011 10:06:11 +0800 Subject: Btrfs: Cache free inode numbers in memory Currently btrfs stores the highest objectid of the fs tree, and it always returns (highest+1) inode number when we create a file, so inode numbers won't be reclaimed when we delete files, so we'll run out of inode numbers as we keep create/delete files in 32bits machines. This fixes it, and it works similarly to how we cache free space in block cgroups. We start a kernel thread to read the file tree. By scanning inode items, we know which chunks of inode numbers are free, and we cache them in an rb-tree. Because we are searching the commit root, we have to carefully handle the cross-transaction case. The rb-tree is a hybrid extent+bitmap tree, so if we have too many small chunks of inode numbers, we'll use bitmaps. Initially we allow 16K ram of extents, and a bitmap will be used if we exceed this threshold. The extents threshold is adjusted in runtime. Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f580a3a5d2fc..e1835f8eec93 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -50,6 +50,7 @@ #include "print-tree.h" #include "volumes.h" #include "locking.h" +#include "inode-map.h" /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -323,8 +324,7 @@ static noinline int create_subvol(struct btrfs_root *root, u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; - ret = btrfs_find_free_objectid(NULL, root->fs_info->tree_root, - 0, &objectid); + ret = btrfs_find_free_objectid(root->fs_info->tree_root, &objectid); if (ret) { dput(parent); return ret; -- cgit v1.2.3 From 33345d01522f8152f99dc84a3e7a1a45707f387f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 20 Apr 2011 10:31:50 +0800 Subject: Btrfs: Always use 64bit inode number There's a potential problem in 32bit system when we exhaust 32bit inode numbers and start to allocate big inode numbers, because btrfs uses inode->i_ino in many places. So here we always use BTRFS_I(inode)->location.objectid, which is an u64 variable. There are 2 exceptions that BTRFS_I(inode)->location.objectid != inode->i_ino: the btree inode (0 vs 1) and empty subvol dirs (256 vs 2), and inode->i_ino will be used in those cases. Another reason to make this change is I'm going to use a special inode to save free ino cache, and the inode number must be > (u64)-256. Signed-off-by: Li Zefan --- fs/btrfs/ioctl.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e1835f8eec93..01dccb4a70bb 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -416,7 +416,7 @@ static noinline int create_subvol(struct btrfs_root *root, BUG_ON(ret); ret = btrfs_insert_dir_item(trans, root, - name, namelen, dir->i_ino, &key, + name, namelen, btrfs_ino(dir), &key, BTRFS_FT_DIR, index); if (ret) goto fail; @@ -427,7 +427,7 @@ static noinline int create_subvol(struct btrfs_root *root, ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, objectid, root->root_key.objectid, - dir->i_ino, index, name, namelen); + btrfs_ino(dir), index, name, namelen); BUG_ON(ret); @@ -1123,7 +1123,7 @@ static noinline int btrfs_ioctl_subvol_getflags(struct file *file, int ret = 0; u64 flags = 0; - if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) return -EINVAL; down_read(&root->fs_info->subvol_sem); @@ -1150,7 +1150,7 @@ static noinline int btrfs_ioctl_subvol_setflags(struct file *file, if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; - if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) return -EINVAL; if (copy_from_user(&flags, arg, sizeof(flags))) @@ -1633,7 +1633,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_dput; } - if (inode->i_ino != BTRFS_FIRST_FREE_OBJECTID) { + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { err = -EINVAL; goto out_dput; } @@ -1919,7 +1919,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, } /* clone data */ - key.objectid = src->i_ino; + key.objectid = btrfs_ino(src); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; @@ -1946,7 +1946,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_item_key_to_cpu(leaf, &key, slot); if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || - key.objectid != src->i_ino) + key.objectid != btrfs_ino(src)) break; if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { @@ -1989,7 +1989,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, goto next; memcpy(&new_key, &key, sizeof(new_key)); - new_key.objectid = inode->i_ino; + new_key.objectid = btrfs_ino(inode); if (off <= key.offset) new_key.offset = key.offset + destoff - off; else @@ -2043,7 +2043,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, ret = btrfs_inc_extent_ref(trans, root, disko, diskl, 0, root->root_key.objectid, - inode->i_ino, + btrfs_ino(inode), new_key.offset - datao); BUG_ON(ret); } -- cgit v1.2.3 From b3b4aa74b58bded927f579fff787fb6fa1c0393c Mon Sep 17 00:00:00 2001 From: David Sterba Date: Thu, 21 Apr 2011 01:20:15 +0200 Subject: btrfs: drop unused parameter from btrfs_release_path parameter tree root it's not used since commit 5f39d397dfbe140a14edecd4e73c34ce23c4f9ee ("Btrfs: Create extent_buffer interface for large blocksizes") Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ffb48d6c5433..d11fc6548e15 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1396,7 +1396,7 @@ static noinline int search_ioctl(struct inode *inode, } ret = copy_to_sk(root, path, &key, sk, args->buf, &sk_offset, &num_found); - btrfs_release_path(root, path); + btrfs_release_path(path); if (ret || num_found >= sk->nr_items) break; @@ -1503,7 +1503,7 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info, if (key.offset == BTRFS_FIRST_FREE_OBJECTID) break; - btrfs_release_path(root, path); + btrfs_release_path(path); key.objectid = key.offset; key.offset = (u64)-1; dirid = key.objectid; @@ -1982,7 +1982,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, datal = btrfs_file_extent_ram_bytes(leaf, extent); } - btrfs_release_path(root, path); + btrfs_release_path(path); if (key.offset + datal <= off || key.offset >= off+len) @@ -2092,7 +2092,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, } btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(root, path); + btrfs_release_path(path); inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -2113,12 +2113,12 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, btrfs_end_transaction(trans, root); } next: - btrfs_release_path(root, path); + btrfs_release_path(path); key.offset++; } ret = 0; out: - btrfs_release_path(root, path); + btrfs_release_path(path); unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); out_unlock: mutex_unlock(&src->i_mutex); -- cgit v1.2.3 From 475f63874d739d7842a56da94687f18d583ae654 Mon Sep 17 00:00:00 2001 From: Jan Schmidt Date: Fri, 11 Mar 2011 15:41:01 +0100 Subject: btrfs: new ioctls for scrub adds ioctls necessary to start and cancel scrubs, to get current progress and to get info about devices to be scrubbed. Note that the scrub is done per-device and that the ioctl only returns after the scrub for this devices is finished or has been canceled. Signed-off-by: Arne Jansen --- fs/btrfs/ioctl.c | 131 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f580a3a5d2fc..205cd011d2f3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1803,6 +1803,75 @@ static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) return ret; } +static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_fs_info_args fi_args; + struct btrfs_device *device; + struct btrfs_device *next; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + fi_args.num_devices = fs_devices->num_devices; + fi_args.max_id = 0; + memcpy(&fi_args.fsid, root->fs_info->fsid, sizeof(fi_args.fsid)); + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { + if (device->devid > fi_args.max_id) + fi_args.max_id = device->devid; + } + mutex_unlock(&fs_devices->device_list_mutex); + + if (copy_to_user(arg, &fi_args, sizeof(fi_args))) + return -EFAULT; + + return 0; +} + +static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_dev_info_args *di_args; + struct btrfs_device *dev; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + int ret = 0; + char *s_uuid = NULL; + char empty_uuid[BTRFS_UUID_SIZE] = {0}; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + di_args = memdup_user(arg, sizeof(*di_args)); + if (IS_ERR(di_args)) + return PTR_ERR(di_args); + + if (memcmp(empty_uuid, di_args->uuid, BTRFS_UUID_SIZE) != 0) + s_uuid = di_args->uuid; + + mutex_lock(&fs_devices->device_list_mutex); + dev = btrfs_find_device(root, di_args->devid, s_uuid, NULL); + mutex_unlock(&fs_devices->device_list_mutex); + + if (!dev) { + ret = -ENODEV; + goto out; + } + + di_args->devid = dev->devid; + di_args->bytes_used = dev->bytes_used; + di_args->total_bytes = dev->total_bytes; + memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); + strncpy(di_args->path, dev->name, sizeof(di_args->path)); + +out: + if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args))) + ret = -EFAULT; + + kfree(di_args); + return ret; +} + static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, u64 off, u64 olen, u64 destoff) { @@ -2465,6 +2534,58 @@ static noinline long btrfs_ioctl_wait_sync(struct file *file, void __user *argp) return btrfs_wait_for_commit(root, transid); } +static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) +{ + int ret; + struct btrfs_ioctl_scrub_args *sa; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, + &sa->progress); + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + kfree(sa); + return ret; +} + +static long btrfs_ioctl_scrub_cancel(struct btrfs_root *root, void __user *arg) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return btrfs_scrub_cancel(root); +} + +static long btrfs_ioctl_scrub_progress(struct btrfs_root *root, + void __user *arg) +{ + struct btrfs_ioctl_scrub_args *sa; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + sa = memdup_user(arg, sizeof(*sa)); + if (IS_ERR(sa)) + return PTR_ERR(sa); + + ret = btrfs_scrub_progress(root, sa->devid, &sa->progress); + + if (copy_to_user(arg, sa, sizeof(*sa))) + ret = -EFAULT; + + kfree(sa); + return ret; +} + long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2504,6 +2625,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_add_dev(root, argp); case BTRFS_IOC_RM_DEV: return btrfs_ioctl_rm_dev(root, argp); + case BTRFS_IOC_FS_INFO: + return btrfs_ioctl_fs_info(root, argp); + case BTRFS_IOC_DEV_INFO: + return btrfs_ioctl_dev_info(root, argp); case BTRFS_IOC_BALANCE: return btrfs_balance(root->fs_info->dev_root); case BTRFS_IOC_CLONE: @@ -2527,6 +2652,12 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_start_sync(file, argp); case BTRFS_IOC_WAIT_SYNC: return btrfs_ioctl_wait_sync(file, argp); + case BTRFS_IOC_SCRUB: + return btrfs_ioctl_scrub(root, argp); + case BTRFS_IOC_SCRUB_CANCEL: + return btrfs_ioctl_scrub_cancel(root, argp); + case BTRFS_IOC_SCRUB_PROGRESS: + return btrfs_ioctl_scrub_progress(root, argp); } return -ENOTTY; -- cgit v1.2.3 From 8628764e1a5e1998a42b9713e9edea7753653d01 Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Wed, 23 Mar 2011 16:34:19 +0100 Subject: btrfs: add readonly flag setting the readonly flag prevents writes in case an error is detected Signed-off-by: Arne Jansen --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 205cd011d2f3..f0a74f014748 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2547,7 +2547,7 @@ static long btrfs_ioctl_scrub(struct btrfs_root *root, void __user *arg) return PTR_ERR(sa); ret = btrfs_scrub_dev(root, sa->devid, sa->start, sa->end, - &sa->progress); + &sa->progress, sa->flags & BTRFS_SCRUB_READONLY); if (copy_to_user(arg, sa, sizeof(*sa))) ret = -EFAULT; -- cgit v1.2.3 From 16cdcec736cd214350cdb591bf1091f8beedefa0 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 22 Apr 2011 18:12:22 +0800 Subject: btrfs: implement delayed inode items operation Changelog V5 -> V6: - Fix oom when the memory load is high, by storing the delayed nodes into the root's radix tree, and letting btrfs inodes go. Changelog V4 -> V5: - Fix the race on adding the delayed node to the inode, which is spotted by Chris Mason. - Merge Chris Mason's incremental patch into this patch. - Fix deadlock between readdir() and memory fault, which is reported by Itaru Kitayama. Changelog V3 -> V4: - Fix nested lock, which is reported by Itaru Kitayama, by updating space cache inode in time. Changelog V2 -> V3: - Fix the race between the delayed worker and the task which does delayed items balance, which is reported by Tsutomu Itoh. - Modify the patch address David Sterba's comment. - Fix the bug of the cpu recursion spinlock, reported by Chris Mason Changelog V1 -> V2: - break up the global rb-tree, use a list to manage the delayed nodes, which is created for every directory and file, and used to manage the delayed directory name index items and the delayed inode item. - introduce a worker to deal with the delayed nodes. Compare with Ext3/4, the performance of file creation and deletion on btrfs is very poor. the reason is that btrfs must do a lot of b+ tree insertions, such as inode item, directory name item, directory name index and so on. If we can do some delayed b+ tree insertion or deletion, we can improve the performance, so we made this patch which implemented delayed directory name index insertion/deletion and delayed inode update. Implementation: - introduce a delayed root object into the filesystem, that use two lists to manage the delayed nodes which are created for every file/directory. One is used to manage all the delayed nodes that have delayed items. And the other is used to manage the delayed nodes which is waiting to be dealt with by the work thread. - Every delayed node has two rb-tree, one is used to manage the directory name index which is going to be inserted into b+ tree, and the other is used to manage the directory name index which is going to be deleted from b+ tree. - introduce a worker to deal with the delayed operation. This worker is used to deal with the works of the delayed directory name index items insertion and deletion and the delayed inode update. When the delayed items is beyond the lower limit, we create works for some delayed nodes and insert them into the work queue of the worker, and then go back. When the delayed items is beyond the upper bound, we create works for all the delayed nodes that haven't been dealt with, and insert them into the work queue of the worker, and then wait for that the untreated items is below some threshold value. - When we want to insert a directory name index into b+ tree, we just add the information into the delayed inserting rb-tree. And then we check the number of the delayed items and do delayed items balance. (The balance policy is above.) - When we want to delete a directory name index from the b+ tree, we search it in the inserting rb-tree at first. If we look it up, just drop it. If not, add the key of it into the delayed deleting rb-tree. Similar to the delayed inserting rb-tree, we also check the number of the delayed items and do delayed items balance. (The same to inserting manipulation) - When we want to update the metadata of some inode, we cached the data of the inode into the delayed node. the worker will flush it into the b+ tree after dealing with the delayed insertion and deletion. - We will move the delayed node to the tail of the list after we access the delayed node, By this way, we can cache more delayed items and merge more inode updates. - If we want to commit transaction, we will deal with all the delayed node. - the delayed node will be freed when we free the btrfs inode. - Before we log the inode items, we commit all the directory name index items and the delayed inode update. I did a quick test by the benchmark tool[1] and found we can improve the performance of file creation by ~15%, and file deletion by ~20%. Before applying this patch: Create files: Total files: 50000 Total time: 1.096108 Average time: 0.000022 Delete files: Total files: 50000 Total time: 1.510403 Average time: 0.000030 After applying this patch: Create files: Total files: 50000 Total time: 0.932899 Average time: 0.000019 Delete files: Total files: 50000 Total time: 1.215732 Average time: 0.000024 [1] http://marc.info/?l=linux-btrfs&m=128212635122920&q=p3 Many thanks for Kitayama-san's help! Signed-off-by: Miao Xie Reviewed-by: David Sterba Tested-by: Tsutomu Itoh Tested-by: Itaru Kitayama Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2616f7ed4799..df59401af742 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -422,7 +422,7 @@ static noinline int create_subvol(struct btrfs_root *root, BUG_ON(ret); ret = btrfs_insert_dir_item(trans, root, - name, namelen, dir->i_ino, &key, + name, namelen, dir, &key, BTRFS_FT_DIR, index); if (ret) goto fail; -- cgit v1.2.3 From e2156867159ae7b3bc38ef1c26ea0ee30a895ef8 Mon Sep 17 00:00:00 2001 From: Hugo Mills Date: Sat, 14 May 2011 17:43:41 +0000 Subject: btrfs: Ensure the tree search ioctl returns the right number of records Btrfs's tree search ioctl has a field to indicate that no more than a given number of records should be returned. The ioctl doesn't honour this, as the tested value is not incremented until the end of the copy_to_sk function. This patch removes an unnecessary local variable, and updates the num_found counter as each key is found in the tree. Signed-off-by: Hugo Mills Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2616f7ed4799..ce773fb736a1 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1279,7 +1279,6 @@ static noinline int copy_to_sk(struct btrfs_root *root, int nritems; int i; int slot; - int found = 0; int ret = 0; leaf = path->nodes[0]; @@ -1326,7 +1325,7 @@ static noinline int copy_to_sk(struct btrfs_root *root, item_off, item_len); *sk_offset += item_len; } - found++; + (*num_found)++; if (*num_found >= sk->nr_items) break; @@ -1345,7 +1344,6 @@ advance_key: } else ret = 1; overflow: - *num_found += found; return ret; } -- cgit v1.2.3 From 1f78160ce1b1b8e657e2248118c4d91f881763f0 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 20 Apr 2011 10:09:16 +0000 Subject: Btrfs: using rcu lock in the reader side of devices list fs_devices->devices is only updated on remove and add device paths, so we can use rcu to protect it in the reader side Signed-off-by: Xiao Guangrong Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ce773fb736a1..0de71feb8e1c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -281,8 +281,9 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - mutex_lock(&fs_info->fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_info->fs_devices->devices, + dev_list) { if (!device->bdev) continue; q = bdev_get_queue(device->bdev); @@ -292,7 +293,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg) minlen); } } - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + rcu_read_unlock(); if (!num_devices) return -EOPNOTSUPP; -- cgit v1.2.3 From 4cb5300bc839b8a943eb19c9f27f25470e22d0ca Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 24 May 2011 15:35:30 -0400 Subject: Btrfs: add mount -o auto_defrag This will detect small random writes into files and queue the up for an auto defrag process. It isn't well suited to database workloads yet, but works for smaller files such as rpm, sqlite or bdb databases. Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 448 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 346 insertions(+), 102 deletions(-) (limited to 'fs/btrfs/ioctl.c') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index c4f17e4e2c9c..85e818ce00c5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -656,6 +656,106 @@ out_unlock: return error; } +/* + * When we're defragging a range, we don't want to kick it off again + * if it is really just waiting for delalloc to send it down. + * If we find a nice big extent or delalloc range for the bytes in the + * file you want to defrag, we return 0 to let you know to skip this + * part of the file + */ +static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh) +{ + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 end; + + read_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); + read_unlock(&em_tree->lock); + + if (em) { + end = extent_map_end(em); + free_extent_map(em); + if (end - offset > thresh) + return 0; + } + /* if we already have a nice delalloc here, just stop */ + thresh /= 2; + end = count_range_bits(io_tree, &offset, offset + thresh, + thresh, EXTENT_DELALLOC, 1); + if (end >= thresh) + return 0; + return 1; +} + +/* + * helper function to walk through a file and find extents + * newer than a specific transid, and smaller than thresh. + * + * This is used by the defragging code to find new and small + * extents + */ +static int find_new_extents(struct btrfs_root *root, + struct inode *inode, u64 newer_than, + u64 *off, int thresh) +{ + struct btrfs_path *path; + struct btrfs_key min_key; + struct btrfs_key max_key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *extent; + int type; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + min_key.objectid = inode->i_ino; + min_key.type = BTRFS_EXTENT_DATA_KEY; + min_key.offset = *off; + + max_key.objectid = inode->i_ino; + max_key.type = (u8)-1; + max_key.offset = (u64)-1; + + path->keep_locks = 1; + + while(1) { + ret = btrfs_search_forward(root, &min_key, &max_key, + path, 0, newer_than); + if (ret != 0) + goto none; + if (min_key.objectid != inode->i_ino) + goto none; + if (min_key.type != BTRFS_EXTENT_DATA_KEY) + goto none; + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG && + btrfs_file_extent_num_bytes(leaf, extent) < thresh && + check_defrag_in_cache(inode, min_key.offset, thresh)) { + *off = min_key.offset; + btrfs_free_path(path); + return 0; + } + + if (min_key.offset == (u64)-1) + goto none; + + min_key.offset++; + btrfs_release_path(path); + } +none: + btrfs_free_path(path); + return -ENOENT; +} + static int should_defrag_range(struct inode *inode, u64 start, u64 len, int thresh, u64 *last_len, u64 *skip, u64 *defrag_end) @@ -665,10 +765,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; int ret = 1; - - if (thresh == 0) - thresh = 256 * 1024; - /* * make sure that once we start defragging and extent, we keep on * defragging it @@ -727,27 +823,176 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len, return ret; } -static int btrfs_defrag_file(struct file *file, - struct btrfs_ioctl_defrag_range_args *range) +/* + * it doesn't do much good to defrag one or two pages + * at a time. This pulls in a nice chunk of pages + * to COW and defrag. + * + * It also makes sure the delalloc code has enough + * dirty data to avoid making new small extents as part + * of the defrag + * + * It's a good idea to start RA on this range + * before calling this. + */ +static int cluster_pages_for_defrag(struct inode *inode, + struct page **pages, + unsigned long start_index, + int num_pages) { - struct inode *inode = fdentry(file)->d_inode; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + unsigned long file_end; + u64 isize = i_size_read(inode); + u64 page_start; + u64 page_end; + int ret; + int i; + int i_done; struct btrfs_ordered_extent *ordered; - struct page *page; + struct extent_state *cached_state = NULL; + + if (isize == 0) + return 0; + file_end = (isize - 1) >> PAGE_CACHE_SHIFT; + + ret = btrfs_delalloc_reserve_space(inode, + num_pages << PAGE_CACHE_SHIFT); + if (ret) + return ret; +again: + ret = 0; + i_done = 0; + + /* step one, lock all the pages */ + for (i = 0; i < num_pages; i++) { + struct page *page; + page = grab_cache_page(inode->i_mapping, + start_index + i); + if (!page) + break; + + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + ret = -EIO; + break; + } + } + isize = i_size_read(inode); + file_end = (isize - 1) >> PAGE_CACHE_SHIFT; + if (!isize || page->index > file_end || + page->mapping != inode->i_mapping) { + /* whoops, we blew past eof, skip this page */ + unlock_page(page); + page_cache_release(page); + break; + } + pages[i] = page; + i_done++; + } + if (!i_done || ret) + goto out; + + if (!(inode->i_sb->s_flags & MS_ACTIVE)) + goto out; + + /* + * so now we have a nice long stream of locked + * and up to date pages, lets wait on them + */ + for (i = 0; i < i_done; i++) + wait_on_page_writeback(pages[i]); + + page_start = page_offset(pages[0]); + page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE; + + lock_extent_bits(&BTRFS_I(inode)->io_tree, + page_start, page_end - 1, 0, &cached_state, + GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, page_end - 1); + if (ordered && + ordered->file_offset + ordered->len > page_start && + ordered->file_offset < page_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + page_start, page_end - 1, + &cached_state, GFP_NOFS); + for (i = 0; i < i_done; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + btrfs_wait_ordered_range(inode, page_start, + page_end - page_start); + goto again; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + + clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, + page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, &cached_state, + GFP_NOFS); + + if (i_done != num_pages) { + atomic_inc(&BTRFS_I(inode)->outstanding_extents); + btrfs_delalloc_release_space(inode, + (num_pages - i_done) << PAGE_CACHE_SHIFT); + } + + + btrfs_set_extent_delalloc(inode, page_start, page_end - 1, + &cached_state); + + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + page_start, page_end - 1, &cached_state, + GFP_NOFS); + + for (i = 0; i < i_done; i++) { + clear_page_dirty_for_io(pages[i]); + ClearPageChecked(pages[i]); + set_page_extent_mapped(pages[i]); + set_page_dirty(pages[i]); + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + return i_done; +out: + for (i = 0; i < i_done; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + btrfs_delalloc_release_space(inode, num_pages << PAGE_CACHE_SHIFT); + return ret; + +} + +int btrfs_defrag_file(struct inode *inode, struct file *file, + struct btrfs_ioctl_defrag_range_args *range, + u64 newer_than, unsigned long max_to_defrag) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_super_block *disk_super; + struct file_ra_state *ra = NULL; unsigned long last_index; - unsigned long ra_pages = root->fs_info->bdi.ra_pages; - unsigned long total_read = 0; u64 features; - u64 page_start; - u64 page_end; u64 last_len = 0; u64 skip = 0; u64 defrag_end = 0; + u64 newer_off = range->start; + int newer_left = 0; unsigned long i; int ret; + int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; + int extent_thresh = range->extent_thresh; + int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + u64 new_align = ~((u64)128 * 1024 - 1); + struct page **pages = NULL; + + if (extent_thresh == 0) + extent_thresh = 256 * 1024; if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { if (range->compress_type > BTRFS_COMPRESS_TYPES) @@ -759,6 +1004,27 @@ static int btrfs_defrag_file(struct file *file, if (inode->i_size == 0) return 0; + /* + * if we were not given a file, allocate a readahead + * context + */ + if (!file) { + ra = kzalloc(sizeof(*ra), GFP_NOFS); + if (!ra) + return -ENOMEM; + file_ra_state_init(ra, inode->i_mapping); + } else { + ra = &file->f_ra; + } + + pages = kmalloc(sizeof(struct page *) * newer_cluster, + GFP_NOFS); + if (!pages) { + ret = -ENOMEM; + goto out_ra; + } + + /* find the last page to defrag */ if (range->start + range->len > range->start) { last_index = min_t(u64, inode->i_size - 1, range->start + range->len - 1) >> PAGE_CACHE_SHIFT; @@ -766,11 +1032,37 @@ static int btrfs_defrag_file(struct file *file, last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT; } - i = range->start >> PAGE_CACHE_SHIFT; - while (i <= last_index) { - if (!should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, + if (newer_than) { + ret = find_new_extents(root, inode, newer_than, + &newer_off, 64 * 1024); + if (!ret) { + range->start = newer_off; + /* + * we always align our defrag to help keep + * the extents in the file evenly spaced + */ + i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; + newer_left = newer_cluster; + } else + goto out_ra; + } else { + i = range->start >> PAGE_CACHE_SHIFT; + } + if (!max_to_defrag) + max_to_defrag = last_index - 1; + + while (i <= last_index && defrag_count < max_to_defrag) { + /* + * make sure we stop running if someone unmounts + * the FS + */ + if (!(inode->i_sb->s_flags & MS_ACTIVE)) + break; + + if (!newer_than && + !should_defrag_range(inode, (u64)i << PAGE_CACHE_SHIFT, PAGE_CACHE_SIZE, - range->extent_thresh, + extent_thresh, &last_len, &skip, &defrag_end)) { unsigned long next; @@ -782,92 +1074,39 @@ static int btrfs_defrag_file(struct file *file, i = max(i + 1, next); continue; } - - if (total_read % ra_pages == 0) { - btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, - min(last_index, i + ra_pages - 1)); - } - total_read++; - mutex_lock(&inode->i_mutex); if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = compress_type; - ret = btrfs_delalloc_reserve_space(inode, PAGE_CACHE_SIZE); - if (ret) - goto err_unlock; -again: - if (inode->i_size == 0 || - i > ((inode->i_size - 1) >> PAGE_CACHE_SHIFT)) { - ret = 0; - goto err_reservations; - } + btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster); - page = grab_cache_page(inode->i_mapping, i); - if (!page) { - ret = -ENOMEM; - goto err_reservations; - } - - if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); - lock_page(page); - if (!PageUptodate(page)) { - unlock_page(page); - page_cache_release(page); - ret = -EIO; - goto err_reservations; - } - } - - if (page->mapping != inode->i_mapping) { - unlock_page(page); - page_cache_release(page); - goto again; - } - - wait_on_page_writeback(page); + ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster); + if (ret < 0) + goto out_ra; - if (PageDirty(page)) { - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); - goto loop_unlock; - } + defrag_count += ret; + balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret); + i += ret; - page_start = (u64)page->index << PAGE_CACHE_SHIFT; - page_end = page_start + PAGE_CACHE_SIZE - 1; - lock_extent(io_tree, page_start, page_end, GFP_NOFS); + if (newer_than) { + if (newer_off == (u64)-1) + break; - ordered = btrfs_lookup_ordered_extent(inode, page_start); - if (ordered) { - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); - unlock_page(page); - page_cache_release(page); - btrfs_start_ordered_extent(inode, ordered, 1); - btrfs_put_ordered_extent(ordered); - goto again; + newer_off = max(newer_off + 1, + (u64)i << PAGE_CACHE_SHIFT); + + ret = find_new_extents(root, inode, + newer_than, &newer_off, + 64 * 1024); + if (!ret) { + range->start = newer_off; + i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; + newer_left = newer_cluster; + } else { + break; + } + } else { + i++; } - set_page_extent_mapped(page); - - /* - * this makes sure page_mkwrite is called on the - * page if it is dirtied again later - */ - clear_page_dirty_for_io(page); - clear_extent_bits(&BTRFS_I(inode)->io_tree, page_start, - page_end, EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, GFP_NOFS); - - btrfs_set_extent_delalloc(inode, page_start, page_end, NULL); - ClearPageChecked(page); - set_page_dirty(page); - unlock_extent(io_tree, page_start, page_end, GFP_NOFS); - -loop_unlock: - unlock_page(page); - page_cache_release(page); - mutex_unlock(&inode->i_mutex); - - balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); - i++; } if ((range->flags & BTRFS_DEFRAG_RANGE_START_IO)) @@ -899,12 +1138,14 @@ loop_unlock: btrfs_set_super_incompat_flags(disk_super, features); } - return 0; + if (!file) + kfree(ra); + return defrag_count; -err_reservations: - btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE); -err_unlock: - mutex_unlock(&inode->i_mutex); +out_ra: + if (!file) + kfree(ra); + kfree(pages); return ret; } @@ -1756,7 +1997,10 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) /* the rest are all set to zero by kzalloc */ range->len = (u64)-1; } - ret = btrfs_defrag_file(file, range); + ret = btrfs_defrag_file(fdentry(file)->d_inode, file, + range, 0, 0); + if (ret > 0) + ret = 0; kfree(range); break; default: -- cgit v1.2.3