From a542ad1bafc7df9fc16de8a6894b350a4df75572 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Mon, 13 Jun 2011 19:52:59 +0200
Subject: btrfs: added helper functions to iterate backrefs

These helper functions iterate back references and call a function for each
backref. There is also a function to resolve an inode to a path in the
file system.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/Makefile  |   3 +-
 fs/btrfs/backref.c | 776 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/backref.h |  62 +++++
 fs/btrfs/ioctl.h   |  11 +
 4 files changed, 851 insertions(+), 1 deletion(-)
 create mode 100644 fs/btrfs/backref.c
 create mode 100644 fs/btrfs/backref.h

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 40e6ac08c21f..89b6ce3634fd 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
-	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
+	   compression.o delayed-ref.o relocation.o delayed-inode.o backref.o \
+	   scrub.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
new file mode 100644
index 000000000000..2351df0de450
--- /dev/null
+++ b/fs/btrfs/backref.c
@@ -0,0 +1,776 @@
+/*
+ * Copyright (C) 2011 STRATO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "ctree.h"
+#include "disk-io.h"
+#include "backref.h"
+
+struct __data_ref {
+	struct list_head list;
+	u64 inum;
+	u64 root;
+	u64 extent_data_item_offset;
+};
+
+struct __shared_ref {
+	struct list_head list;
+	u64 disk_byte;
+};
+
+static int __inode_info(u64 inum, u64 ioff, u8 key_type,
+			struct btrfs_root *fs_root, struct btrfs_path *path,
+			struct btrfs_key *found_key)
+{
+	int ret;
+	struct btrfs_key key;
+	struct extent_buffer *eb;
+
+	key.type = key_type;
+	key.objectid = inum;
+	key.offset = ioff;
+
+	ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+
+	eb = path->nodes[0];
+	if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
+		ret = btrfs_next_leaf(fs_root, path);
+		if (ret)
+			return ret;
+		eb = path->nodes[0];
+	}
+
+	btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
+	if (found_key->type != key.type || found_key->objectid != key.objectid)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * this makes the path point to (inum INODE_ITEM ioff)
+ */
+int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
+			struct btrfs_path *path)
+{
+	struct btrfs_key key;
+	return __inode_info(inum, ioff, BTRFS_INODE_ITEM_KEY, fs_root, path,
+				&key);
+}
+
+static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
+				struct btrfs_path *path,
+				struct btrfs_key *found_key)
+{
+	return __inode_info(inum, ioff, BTRFS_INODE_REF_KEY, fs_root, path,
+				found_key);
+}
+
+/*
+ * this iterates to turn a btrfs_inode_ref into a full filesystem path. elements
+ * of the path are separated by '/' and the path is guaranteed to be
+ * 0-terminated. the path is only given within the current file system.
+ * Therefore, it never starts with a '/'. the caller is responsible to provide
+ * "size" bytes in "dest". the dest buffer will be filled backwards. finally,
+ * the start point of the resulting string is returned. this pointer is within
+ * dest, normally.
+ * in case the path buffer would overflow, the pointer is decremented further
+ * as if output was written to the buffer, though no more output is actually
+ * generated. that way, the caller can determine how much space would be
+ * required for the path to fit into the buffer. in that case, the returned
+ * value will be smaller than dest. callers must check this!
+ */
+static char *iref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+				struct btrfs_inode_ref *iref,
+				struct extent_buffer *eb_in, u64 parent,
+				char *dest, u32 size)
+{
+	u32 len;
+	int slot;
+	u64 next_inum;
+	int ret;
+	s64 bytes_left = size - 1;
+	struct extent_buffer *eb = eb_in;
+	struct btrfs_key found_key;
+
+	if (bytes_left >= 0)
+		dest[bytes_left] = '\0';
+
+	while (1) {
+		len = btrfs_inode_ref_name_len(eb, iref);
+		bytes_left -= len;
+		if (bytes_left >= 0)
+			read_extent_buffer(eb, dest + bytes_left,
+						(unsigned long)(iref + 1), len);
+		if (eb != eb_in)
+			free_extent_buffer(eb);
+		ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
+		if (ret)
+			break;
+		next_inum = found_key.offset;
+
+		/* regular exit ahead */
+		if (parent == next_inum)
+			break;
+
+		slot = path->slots[0];
+		eb = path->nodes[0];
+		/* make sure we can use eb after releasing the path */
+		if (eb != eb_in)
+			atomic_inc(&eb->refs);
+		btrfs_release_path(path);
+
+		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+		parent = next_inum;
+		--bytes_left;
+		if (bytes_left >= 0)
+			dest[bytes_left] = '/';
+	}
+
+	btrfs_release_path(path);
+
+	if (ret)
+		return ERR_PTR(ret);
+
+	return dest + bytes_left;
+}
+
+/*
+ * this makes the path point to (logical EXTENT_ITEM *)
+ * returns BTRFS_EXTENT_FLAG_DATA for data, BTRFS_EXTENT_FLAG_TREE_BLOCK for
+ * tree blocks and <0 on error.
+ */
+int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
+			struct btrfs_path *path, struct btrfs_key *found_key)
+{
+	int ret;
+	u64 flags;
+	u32 item_size;
+	struct extent_buffer *eb;
+	struct btrfs_extent_item *ei;
+	struct btrfs_key key;
+
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.objectid = logical;
+	key.offset = (u64)-1;
+
+	ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+	ret = btrfs_previous_item(fs_info->extent_root, path,
+					0, BTRFS_EXTENT_ITEM_KEY);
+	if (ret < 0)
+		return ret;
+
+	btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]);
+	if (found_key->type != BTRFS_EXTENT_ITEM_KEY ||
+	    found_key->objectid > logical ||
+	    found_key->objectid + found_key->offset <= logical)
+		return -ENOENT;
+
+	eb = path->nodes[0];
+	item_size = btrfs_item_size_nr(eb, path->slots[0]);
+	BUG_ON(item_size < sizeof(*ei));
+
+	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+	flags = btrfs_extent_flags(eb, ei);
+
+	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+		return BTRFS_EXTENT_FLAG_TREE_BLOCK;
+	if (flags & BTRFS_EXTENT_FLAG_DATA)
+		return BTRFS_EXTENT_FLAG_DATA;
+
+	return -EIO;
+}
+
+/*
+ * helper function to iterate extent inline refs. ptr must point to a 0 value
+ * for the first call and may be modified. it is used to track state.
+ * if more refs exist, 0 is returned and the next call to
+ * __get_extent_inline_ref must pass the modified ptr parameter to get the
+ * next ref. after the last ref was processed, 1 is returned.
+ * returns <0 on error
+ */
+static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
+				struct btrfs_extent_item *ei, u32 item_size,
+				struct btrfs_extent_inline_ref **out_eiref,
+				int *out_type)
+{
+	unsigned long end;
+	u64 flags;
+	struct btrfs_tree_block_info *info;
+
+	if (!*ptr) {
+		/* first call */
+		flags = btrfs_extent_flags(eb, ei);
+		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+			info = (struct btrfs_tree_block_info *)(ei + 1);
+			*out_eiref =
+				(struct btrfs_extent_inline_ref *)(info + 1);
+		} else {
+			*out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
+		}
+		*ptr = (unsigned long)*out_eiref;
+		if ((void *)*ptr >= (void *)ei + item_size)
+			return -ENOENT;
+	}
+
+	end = (unsigned long)ei + item_size;
+	*out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
+	*out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
+
+	*ptr += btrfs_extent_inline_ref_size(*out_type);
+	WARN_ON(*ptr > end);
+	if (*ptr == end)
+		return 1; /* last */
+
+	return 0;
+}
+
+/*
+ * reads the tree block backref for an extent. tree level and root are returned
+ * through out_level and out_root. ptr must point to a 0 value for the first
+ * call and may be modified (see __get_extent_inline_ref comment).
+ * returns 0 if data was provided, 1 if there was no more data to provide or
+ * <0 on error.
+ */
+int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
+				struct btrfs_extent_item *ei, u32 item_size,
+				u64 *out_root, u8 *out_level)
+{
+	int ret;
+	int type;
+	struct btrfs_tree_block_info *info;
+	struct btrfs_extent_inline_ref *eiref;
+
+	if (*ptr == (unsigned long)-1)
+		return 1;
+
+	while (1) {
+		ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
+						&eiref, &type);
+		if (ret < 0)
+			return ret;
+
+		if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+		    type == BTRFS_SHARED_BLOCK_REF_KEY)
+			break;
+
+		if (ret == 1)
+			return 1;
+	}
+
+	/* we can treat both ref types equally here */
+	info = (struct btrfs_tree_block_info *)(ei + 1);
+	*out_root = btrfs_extent_inline_ref_offset(eb, eiref);
+	*out_level = btrfs_tree_block_level(eb, info);
+
+	if (ret == 1)
+		*ptr = (unsigned long)-1;
+
+	return 0;
+}
+
+static int __data_list_add(struct list_head *head, u64 inum,
+				u64 extent_data_item_offset, u64 root)
+{
+	struct __data_ref *ref;
+
+	ref = kmalloc(sizeof(*ref), GFP_NOFS);
+	if (!ref)
+		return -ENOMEM;
+
+	ref->inum = inum;
+	ref->extent_data_item_offset = extent_data_item_offset;
+	ref->root = root;
+	list_add_tail(&ref->list, head);
+
+	return 0;
+}
+
+static int __data_list_add_eb(struct list_head *head, struct extent_buffer *eb,
+				struct btrfs_extent_data_ref *dref)
+{
+	return __data_list_add(head, btrfs_extent_data_ref_objectid(eb, dref),
+				btrfs_extent_data_ref_offset(eb, dref),
+				btrfs_extent_data_ref_root(eb, dref));
+}
+
+static int __shared_list_add(struct list_head *head, u64 disk_byte)
+{
+	struct __shared_ref *ref;
+
+	ref = kmalloc(sizeof(*ref), GFP_NOFS);
+	if (!ref)
+		return -ENOMEM;
+
+	ref->disk_byte = disk_byte;
+	list_add_tail(&ref->list, head);
+
+	return 0;
+}
+
+static int __iter_shared_inline_ref_inodes(struct btrfs_fs_info *fs_info,
+					   u64 logical, u64 inum,
+					   u64 extent_data_item_offset,
+					   u64 extent_offset,
+					   struct btrfs_path *path,
+					   struct list_head *data_refs,
+					   iterate_extent_inodes_t *iterate,
+					   void *ctx)
+{
+	u64 ref_root;
+	u32 item_size;
+	struct btrfs_key key;
+	struct extent_buffer *eb;
+	struct btrfs_extent_item *ei;
+	struct btrfs_extent_inline_ref *eiref;
+	struct __data_ref *ref;
+	int ret;
+	int type;
+	int last;
+	unsigned long ptr = 0;
+
+	WARN_ON(!list_empty(data_refs));
+	ret = extent_from_logical(fs_info, logical, path, &key);
+	if (ret & BTRFS_EXTENT_FLAG_DATA)
+		ret = -EIO;
+	if (ret < 0)
+		goto out;
+
+	eb = path->nodes[0];
+	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+	item_size = btrfs_item_size_nr(eb, path->slots[0]);
+
+	ret = 0;
+	ref_root = 0;
+	/*
+	 * as done in iterate_extent_inodes, we first build a list of refs to
+	 * iterate, then free the path and then iterate them to avoid deadlocks.
+	 */
+	do {
+		last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
+						&eiref, &type);
+		if (last < 0) {
+			ret = last;
+			goto out;
+		}
+		if (type == BTRFS_TREE_BLOCK_REF_KEY ||
+		    type == BTRFS_SHARED_BLOCK_REF_KEY) {
+			ref_root = btrfs_extent_inline_ref_offset(eb, eiref);
+			ret = __data_list_add(data_refs, inum,
+						extent_data_item_offset,
+						ref_root);
+		}
+	} while (!ret && !last);
+
+	btrfs_release_path(path);
+
+	if (ref_root == 0) {
+		printk(KERN_ERR "btrfs: failed to find tree block ref "
+			"for shared data backref %llu\n", logical);
+		WARN_ON(1);
+		ret = -EIO;
+	}
+
+out:
+	while (!list_empty(data_refs)) {
+		ref = list_first_entry(data_refs, struct __data_ref, list);
+		list_del(&ref->list);
+		if (!ret)
+			ret = iterate(ref->inum, extent_offset +
+					ref->extent_data_item_offset,
+					ref->root, ctx);
+		kfree(ref);
+	}
+
+	return ret;
+}
+
+static int __iter_shared_inline_ref(struct btrfs_fs_info *fs_info,
+				    u64 logical, u64 orig_extent_item_objectid,
+				    u64 extent_offset, struct btrfs_path *path,
+				    struct list_head *data_refs,
+				    iterate_extent_inodes_t *iterate,
+				    void *ctx)
+{
+	u64 disk_byte;
+	struct btrfs_key key;
+	struct btrfs_file_extent_item *fi;
+	struct extent_buffer *eb;
+	int slot;
+	int nritems;
+	int ret;
+	int found = 0;
+
+	eb = read_tree_block(fs_info->tree_root, logical,
+				fs_info->tree_root->leafsize, 0);
+	if (!eb)
+		return -EIO;
+
+	/*
+	 * from the shared data ref, we only have the leaf but we need
+	 * the key. thus, we must look into all items and see that we
+	 * find one (some) with a reference to our extent item.
+	 */
+	nritems = btrfs_header_nritems(eb);
+	for (slot = 0; slot < nritems; ++slot) {
+		btrfs_item_key_to_cpu(eb, &key, slot);
+		if (key.type != BTRFS_EXTENT_DATA_KEY)
+			continue;
+		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
+		if (!fi) {
+			free_extent_buffer(eb);
+			return -EIO;
+		}
+		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
+		if (disk_byte != orig_extent_item_objectid) {
+			if (found)
+				break;
+			else
+				continue;
+		}
+		++found;
+		ret = __iter_shared_inline_ref_inodes(fs_info, logical,
+							key.objectid,
+							key.offset,
+							extent_offset, path,
+							data_refs,
+							iterate, ctx);
+		if (ret)
+			break;
+	}
+
+	if (!found) {
+		printk(KERN_ERR "btrfs: failed to follow shared data backref "
+			"to parent %llu\n", logical);
+		WARN_ON(1);
+		ret = -EIO;
+	}
+
+	free_extent_buffer(eb);
+	return ret;
+}
+
+/*
+ * calls iterate() for every inode that references the extent identified by
+ * the given parameters. will use the path given as a parameter and return it
+ * released.
+ * when the iterator function returns a non-zero value, iteration stops.
+ */
+int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
+				struct btrfs_path *path,
+				u64 extent_item_objectid,
+				u64 extent_offset,
+				iterate_extent_inodes_t *iterate, void *ctx)
+{
+	unsigned long ptr = 0;
+	int last;
+	int ret;
+	int type;
+	u64 logical;
+	u32 item_size;
+	struct btrfs_extent_inline_ref *eiref;
+	struct btrfs_extent_data_ref *dref;
+	struct extent_buffer *eb;
+	struct btrfs_extent_item *ei;
+	struct btrfs_key key;
+	struct list_head data_refs = LIST_HEAD_INIT(data_refs);
+	struct list_head shared_refs = LIST_HEAD_INIT(shared_refs);
+	struct __data_ref *ref_d;
+	struct __shared_ref *ref_s;
+
+	eb = path->nodes[0];
+	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+	item_size = btrfs_item_size_nr(eb, path->slots[0]);
+
+	/* first we iterate the inline refs, ... */
+	do {
+		last = __get_extent_inline_ref(&ptr, eb, ei, item_size,
+						&eiref, &type);
+		if (last == -ENOENT) {
+			ret = 0;
+			break;
+		}
+		if (last < 0) {
+			ret = last;
+			break;
+		}
+
+		if (type == BTRFS_EXTENT_DATA_REF_KEY) {
+			dref = (struct btrfs_extent_data_ref *)(&eiref->offset);
+			ret = __data_list_add_eb(&data_refs, eb, dref);
+		} else if (type == BTRFS_SHARED_DATA_REF_KEY) {
+			logical = btrfs_extent_inline_ref_offset(eb, eiref);
+			ret = __shared_list_add(&shared_refs, logical);
+		}
+	} while (!ret && !last);
+
+	/* ... then we proceed to in-tree references and ... */
+	while (!ret) {
+		++path->slots[0];
+		if (path->slots[0] > btrfs_header_nritems(eb)) {
+			ret = btrfs_next_leaf(fs_info->extent_root, path);
+			if (ret) {
+				if (ret == 1)
+					ret = 0; /* we're done */
+				break;
+			}
+			eb = path->nodes[0];
+		}
+		btrfs_item_key_to_cpu(eb, &key, path->slots[0]);
+		if (key.objectid != extent_item_objectid)
+			break;
+		if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
+			dref = btrfs_item_ptr(eb, path->slots[0],
+						struct btrfs_extent_data_ref);
+			ret = __data_list_add_eb(&data_refs, eb, dref);
+		} else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
+			ret = __shared_list_add(&shared_refs, key.offset);
+		}
+	}
+
+	btrfs_release_path(path);
+
+	/*
+	 * ... only at the very end we can process the refs we found. this is
+	 * because the iterator function we call is allowed to make tree lookups
+	 * and we have to avoid deadlocks. additionally, we need more tree
+	 * lookups ourselves for shared data refs.
+	 */
+	while (!list_empty(&data_refs)) {
+		ref_d = list_first_entry(&data_refs, struct __data_ref, list);
+		list_del(&ref_d->list);
+		if (!ret)
+			ret = iterate(ref_d->inum, extent_offset +
+					ref_d->extent_data_item_offset,
+					ref_d->root, ctx);
+		kfree(ref_d);
+	}
+
+	while (!list_empty(&shared_refs)) {
+		ref_s = list_first_entry(&shared_refs, struct __shared_ref,
+					list);
+		list_del(&ref_s->list);
+		if (!ret)
+			ret = __iter_shared_inline_ref(fs_info,
+							ref_s->disk_byte,
+							extent_item_objectid,
+							extent_offset, path,
+							&data_refs,
+							iterate, ctx);
+		kfree(ref_s);
+	}
+
+	return ret;
+}
+
+int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
+				struct btrfs_path *path,
+				iterate_extent_inodes_t *iterate, void *ctx)
+{
+	int ret;
+	u64 offset;
+	struct btrfs_key found_key;
+
+	ret = extent_from_logical(fs_info, logical, path,
+					&found_key);
+	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+		ret = -EINVAL;
+	if (ret < 0)
+		return ret;
+
+	offset = logical - found_key.objectid;
+	ret = iterate_extent_inodes(fs_info, path, found_key.objectid,
+					offset, iterate, ctx);
+
+	return ret;
+}
+
+static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
+				struct btrfs_path *path,
+				iterate_irefs_t *iterate, void *ctx)
+{
+	int ret;
+	int slot;
+	u32 cur;
+	u32 len;
+	u32 name_len;
+	u64 parent = 0;
+	int found = 0;
+	struct extent_buffer *eb;
+	struct btrfs_item *item;
+	struct btrfs_inode_ref *iref;
+	struct btrfs_key found_key;
+
+	while (1) {
+		ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
+					&found_key);
+		if (ret < 0)
+			break;
+		if (ret) {
+			ret = found ? 0 : -ENOENT;
+			break;
+		}
+		++found;
+
+		parent = found_key.offset;
+		slot = path->slots[0];
+		eb = path->nodes[0];
+		/* make sure we can use eb after releasing the path */
+		atomic_inc(&eb->refs);
+		btrfs_release_path(path);
+
+		item = btrfs_item_nr(eb, slot);
+		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
+
+		for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
+			name_len = btrfs_inode_ref_name_len(eb, iref);
+			/* path must be released before calling iterate()! */
+			ret = iterate(parent, iref, eb, ctx);
+			if (ret) {
+				free_extent_buffer(eb);
+				break;
+			}
+			len = sizeof(*iref) + name_len;
+			iref = (struct btrfs_inode_ref *)((char *)iref + len);
+		}
+		free_extent_buffer(eb);
+	}
+
+	btrfs_release_path(path);
+
+	return ret;
+}
+
+/*
+ * returns 0 if the path could be dumped (probably truncated)
+ * returns <0 in case of an error
+ */
+static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
+				struct extent_buffer *eb, void *ctx)
+{
+	struct inode_fs_paths *ipath = ctx;
+	char *fspath;
+	char *fspath_min;
+	int i = ipath->fspath->elem_cnt;
+	const int s_ptr = sizeof(char *);
+	u32 bytes_left;
+
+	bytes_left = ipath->fspath->bytes_left > s_ptr ?
+					ipath->fspath->bytes_left - s_ptr : 0;
+
+	fspath_min = (char *)ipath->fspath->str + (i + 1) * s_ptr;
+	fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
+				inum, fspath_min, bytes_left);
+	if (IS_ERR(fspath))
+		return PTR_ERR(fspath);
+
+	if (fspath > fspath_min) {
+		ipath->fspath->str[i] = fspath;
+		++ipath->fspath->elem_cnt;
+		ipath->fspath->bytes_left = fspath - fspath_min;
+	} else {
+		++ipath->fspath->elem_missed;
+		ipath->fspath->bytes_missing += fspath_min - fspath;
+		ipath->fspath->bytes_left = 0;
+	}
+
+	return 0;
+}
+
+/*
+ * this dumps all file system paths to the inode into the ipath struct, provided
+ * is has been created large enough. each path is zero-terminated and accessed
+ * from ipath->fspath->str[i].
+ * when it returns, there are ipath->fspath->elem_cnt number of paths available
+ * in ipath->fspath->str[]. when the allocated space wasn't sufficient, the
+ * number of missed paths in recored in ipath->fspath->elem_missed, otherwise,
+ * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
+ * have been needed to return all paths.
+ */
+int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
+{
+	return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
+				inode_to_path, ipath);
+}
+
+/*
+ * allocates space to return multiple file system paths for an inode.
+ * total_bytes to allocate are passed, note that space usable for actual path
+ * information will be total_bytes - sizeof(struct inode_fs_paths).
+ * the returned pointer must be freed with free_ipath() in the end.
+ */
+struct btrfs_data_container *init_data_container(u32 total_bytes)
+{
+	struct btrfs_data_container *data;
+	size_t alloc_bytes;
+
+	alloc_bytes = max_t(size_t, total_bytes, sizeof(*data));
+	data = kmalloc(alloc_bytes, GFP_NOFS);
+	if (!data)
+		return ERR_PTR(-ENOMEM);
+
+	if (total_bytes >= sizeof(*data)) {
+		data->bytes_left = total_bytes - sizeof(*data);
+		data->bytes_missing = 0;
+	} else {
+		data->bytes_missing = sizeof(*data) - total_bytes;
+		data->bytes_left = 0;
+	}
+
+	data->elem_cnt = 0;
+	data->elem_missed = 0;
+
+	return data;
+}
+
+/*
+ * allocates space to return multiple file system paths for an inode.
+ * total_bytes to allocate are passed, note that space usable for actual path
+ * information will be total_bytes - sizeof(struct inode_fs_paths).
+ * the returned pointer must be freed with free_ipath() in the end.
+ */
+struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
+					struct btrfs_path *path)
+{
+	struct inode_fs_paths *ifp;
+	struct btrfs_data_container *fspath;
+
+	fspath = init_data_container(total_bytes);
+	if (IS_ERR(fspath))
+		return (void *)fspath;
+
+	ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
+	if (!ifp) {
+		kfree(fspath);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	ifp->btrfs_path = path;
+	ifp->fspath = fspath;
+	ifp->fs_root = fs_root;
+
+	return ifp;
+}
+
+void free_ipath(struct inode_fs_paths *ipath)
+{
+	kfree(ipath);
+}
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
new file mode 100644
index 000000000000..92618837cb8f
--- /dev/null
+++ b/fs/btrfs/backref.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (C) 2011 STRATO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_BACKREF__
+#define __BTRFS_BACKREF__
+
+#include "ioctl.h"
+
+struct inode_fs_paths {
+	struct btrfs_path		*btrfs_path;
+	struct btrfs_root		*fs_root;
+	struct btrfs_data_container	*fspath;
+};
+
+typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
+		void *ctx);
+typedef int (iterate_irefs_t)(u64 parent, struct btrfs_inode_ref *iref,
+				struct extent_buffer *eb, void *ctx);
+
+int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
+			struct btrfs_path *path);
+
+int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
+			struct btrfs_path *path, struct btrfs_key *found_key);
+
+int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
+				struct btrfs_extent_item *ei, u32 item_size,
+				u64 *out_root, u8 *out_level);
+
+int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
+				struct btrfs_path *path,
+				u64 extent_item_objectid,
+				u64 extent_offset,
+				iterate_extent_inodes_t *iterate, void *ctx);
+
+int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
+				struct btrfs_path *path,
+				iterate_extent_inodes_t *iterate, void *ctx);
+
+int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
+
+struct btrfs_data_container *init_data_container(u32 total_bytes);
+struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
+					struct btrfs_path *path);
+void free_ipath(struct inode_fs_paths *ipath);
+
+#endif
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index ad1ea789fcb4..1857e3871843 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -193,6 +193,17 @@ struct btrfs_ioctl_space_args {
 	struct btrfs_ioctl_space_info spaces[0];
 };
 
+struct btrfs_data_container {
+	__u32	bytes_left;	/* out -- bytes not needed to deliver output */
+	__u32	bytes_missing;	/* out -- additional bytes needed for result */
+	__u32	elem_cnt;	/* out */
+	__u32	elem_missed;	/* out */
+	union {
+		char	*str[0];	/* out */
+		__u64	val[0];		/* out */
+	};
+};
+
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
-- 
cgit v1.2.3


From 13db62b7a1e8c64763a93c155091620f85ff8920 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Mon, 13 Jun 2011 19:56:13 +0200
Subject: btrfs scrub: added unverified_errors

In normal operation, scrub is reading data sequentially in large portions.
In case of an i/o error, we try to find the corrupted area(s) by issuing
page sized read requests. With this commit we increment the
unverified_errors counter if all of the small size requests succeed.

Userland patches carrying such conspicous events to the administrator should
already be around.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/scrub.c | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5efb5d..35099fa97d56 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -201,18 +201,25 @@ nomem:
  * recheck_error gets called for every page in the bio, even though only
  * one may be bad
  */
-static void scrub_recheck_error(struct scrub_bio *sbio, int ix)
+static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
 {
+	struct scrub_dev *sdev = sbio->sdev;
+	u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
+
 	if (sbio->err) {
-		if (scrub_fixup_io(READ, sbio->sdev->dev->bdev,
-				   (sbio->physical + ix * PAGE_SIZE) >> 9,
+		if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
 				   sbio->bio->bi_io_vec[ix].bv_page) == 0) {
 			if (scrub_fixup_check(sbio, ix) == 0)
-				return;
+				return 0;
 		}
 	}
 
+	spin_lock(&sdev->stat_lock);
+	++sdev->stat.read_errors;
+	spin_unlock(&sdev->stat_lock);
+
 	scrub_fixup(sbio, ix);
+	return 1;
 }
 
 static int scrub_fixup_check(struct scrub_bio *sbio, int ix)
@@ -382,8 +389,14 @@ static void scrub_checksum(struct btrfs_work *work)
 	int ret;
 
 	if (sbio->err) {
+		ret = 0;
 		for (i = 0; i < sbio->count; ++i)
-			scrub_recheck_error(sbio, i);
+			ret |= scrub_recheck_error(sbio, i);
+		if (!ret) {
+			spin_lock(&sdev->stat_lock);
+			++sdev->stat.unverified_errors;
+			spin_unlock(&sdev->stat_lock);
+		}
 
 		sbio->bio->bi_flags &= ~(BIO_POOL_MASK - 1);
 		sbio->bio->bi_flags |= 1 << BIO_UPTODATE;
@@ -396,10 +409,6 @@ static void scrub_checksum(struct btrfs_work *work)
 			bi->bv_offset = 0;
 			bi->bv_len = PAGE_SIZE;
 		}
-
-		spin_lock(&sdev->stat_lock);
-		++sdev->stat.read_errors;
-		spin_unlock(&sdev->stat_lock);
 		goto out;
 	}
 	for (i = 0; i < sbio->count; ++i) {
@@ -420,8 +429,14 @@ static void scrub_checksum(struct btrfs_work *work)
 			WARN_ON(1);
 		}
 		kunmap_atomic(buffer, KM_USER0);
-		if (ret)
-			scrub_recheck_error(sbio, i);
+		if (ret) {
+			ret = scrub_recheck_error(sbio, i);
+			if (!ret) {
+				spin_lock(&sdev->stat_lock);
+				++sdev->stat.unverified_errors;
+				spin_unlock(&sdev->stat_lock);
+			}
+		}
 	}
 
 out:
-- 
cgit v1.2.3


From 558540c17771eaf89b1a3be39aa2c8bc837da1a6 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Mon, 13 Jun 2011 19:59:12 +0200
Subject: btrfs scrub: print paths of corrupted files

While scrubbing, we may encounter various errors. Previously, a logical
address was printed to the log only. Now, all paths belonging to that
address are resolved and printed separately. That should work for hardlinks
as well as reflinks.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/scrub.c | 169 +++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 163 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 35099fa97d56..221fd5c48736 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -17,10 +17,12 @@
  */
 
 #include <linux/blkdev.h>
+#include <linux/ratelimit.h>
 #include "ctree.h"
 #include "volumes.h"
 #include "disk-io.h"
 #include "ordered-data.h"
+#include "backref.h"
 
 /*
  * This is only the first step towards a full-features scrub. It reads all
@@ -100,6 +102,19 @@ struct scrub_dev {
 	spinlock_t		stat_lock;
 };
 
+struct scrub_warning {
+	struct btrfs_path	*path;
+	u64			extent_item_size;
+	char			*scratch_buf;
+	char			*msg_buf;
+	const char		*errstr;
+	sector_t		sector;
+	u64			logical;
+	struct btrfs_device	*dev;
+	int			msg_bufsize;
+	int			scratch_bufsize;
+};
+
 static void scrub_free_csums(struct scrub_dev *sdev)
 {
 	while (!list_empty(&sdev->csum_list)) {
@@ -195,6 +210,143 @@ nomem:
 	return ERR_PTR(-ENOMEM);
 }
 
+static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
+{
+	u64 isize;
+	u32 nlink;
+	int ret;
+	int i;
+	struct extent_buffer *eb;
+	struct btrfs_inode_item *inode_item;
+	struct scrub_warning *swarn = ctx;
+	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
+	struct inode_fs_paths *ipath = NULL;
+	struct btrfs_root *local_root;
+	struct btrfs_key root_key;
+
+	root_key.objectid = root;
+	root_key.type = BTRFS_ROOT_ITEM_KEY;
+	root_key.offset = (u64)-1;
+	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
+	if (IS_ERR(local_root)) {
+		ret = PTR_ERR(local_root);
+		goto err;
+	}
+
+	ret = inode_item_info(inum, 0, local_root, swarn->path);
+	if (ret) {
+		btrfs_release_path(swarn->path);
+		goto err;
+	}
+
+	eb = swarn->path->nodes[0];
+	inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
+					struct btrfs_inode_item);
+	isize = btrfs_inode_size(eb, inode_item);
+	nlink = btrfs_inode_nlink(eb, inode_item);
+	btrfs_release_path(swarn->path);
+
+	ipath = init_ipath(4096, local_root, swarn->path);
+	ret = paths_from_inode(inum, ipath);
+
+	if (ret < 0)
+		goto err;
+
+	/*
+	 * we deliberately ignore the bit ipath might have been too small to
+	 * hold all of the paths here
+	 */
+	for (i = 0; i < ipath->fspath->elem_cnt; ++i)
+		printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
+			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
+			"length %llu, links %u (path: %s)\n", swarn->errstr,
+			swarn->logical, swarn->dev->name,
+			(unsigned long long)swarn->sector, root, inum, offset,
+			min(isize - offset, (u64)PAGE_SIZE), nlink,
+			ipath->fspath->str[i]);
+
+	free_ipath(ipath);
+	return 0;
+
+err:
+	printk(KERN_WARNING "btrfs: %s at logical %llu on dev "
+		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
+		"resolving failed with ret=%d\n", swarn->errstr,
+		swarn->logical, swarn->dev->name,
+		(unsigned long long)swarn->sector, root, inum, offset, ret);
+
+	free_ipath(ipath);
+	return 0;
+}
+
+static void scrub_print_warning(const char *errstr, struct scrub_bio *sbio,
+				int ix)
+{
+	struct btrfs_device *dev = sbio->sdev->dev;
+	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
+	struct btrfs_path *path;
+	struct btrfs_key found_key;
+	struct extent_buffer *eb;
+	struct btrfs_extent_item *ei;
+	struct scrub_warning swarn;
+	u32 item_size;
+	int ret;
+	u64 ref_root;
+	u8 ref_level;
+	unsigned long ptr = 0;
+	const int bufsize = 4096;
+	u64 extent_offset;
+
+	path = btrfs_alloc_path();
+
+	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
+	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
+	swarn.sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
+	swarn.logical = sbio->logical + ix * PAGE_SIZE;
+	swarn.errstr = errstr;
+	swarn.dev = dev;
+	swarn.msg_bufsize = bufsize;
+	swarn.scratch_bufsize = bufsize;
+
+	if (!path || !swarn.scratch_buf || !swarn.msg_buf)
+		goto out;
+
+	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key);
+	if (ret < 0)
+		goto out;
+
+	extent_offset = swarn.logical - found_key.objectid;
+	swarn.extent_item_size = found_key.offset;
+
+	eb = path->nodes[0];
+	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
+	item_size = btrfs_item_size_nr(eb, path->slots[0]);
+
+	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
+		do {
+			ret = tree_backref_for_extent(&ptr, eb, ei, item_size,
+							&ref_root, &ref_level);
+			printk(KERN_WARNING "%s at logical %llu on dev %s, "
+				"sector %llu: metadata %s (level %d) in tree "
+				"%llu\n", errstr, swarn.logical, dev->name,
+				(unsigned long long)swarn.sector,
+				ref_level ? "node" : "leaf",
+				ret < 0 ? -1 : ref_level,
+				ret < 0 ? -1 : ref_root);
+		} while (ret != 1);
+	} else {
+		swarn.path = path;
+		iterate_extent_inodes(fs_info, path, found_key.objectid,
+					extent_offset,
+					scrub_print_warning_inode, &swarn);
+	}
+
+out:
+	btrfs_free_path(path);
+	kfree(swarn.scratch_buf);
+	kfree(swarn.msg_buf);
+}
+
 /*
  * scrub_recheck_error gets called when either verification of the page
  * failed or the bio failed to read, e.g. with EIO. In the latter case,
@@ -205,6 +357,8 @@ static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
 {
 	struct scrub_dev *sdev = sbio->sdev;
 	u64 sector = (sbio->physical + ix * PAGE_SIZE) >> 9;
+	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
+					DEFAULT_RATELIMIT_BURST);
 
 	if (sbio->err) {
 		if (scrub_fixup_io(READ, sbio->sdev->dev->bdev, sector,
@@ -212,6 +366,11 @@ static int scrub_recheck_error(struct scrub_bio *sbio, int ix)
 			if (scrub_fixup_check(sbio, ix) == 0)
 				return 0;
 		}
+		if (__ratelimit(&_rs))
+			scrub_print_warning("i/o error", sbio, ix);
+	} else {
+		if (__ratelimit(&_rs))
+			scrub_print_warning("checksum error", sbio, ix);
 	}
 
 	spin_lock(&sdev->stat_lock);
@@ -326,9 +485,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 	++sdev->stat.corrected_errors;
 	spin_unlock(&sdev->stat_lock);
 
-	if (printk_ratelimit())
-		printk(KERN_ERR "btrfs: fixed up at %llu\n",
-		       (unsigned long long)logical);
+	printk_ratelimited(KERN_ERR "btrfs: fixed up error at logical %llu\n",
+			       (unsigned long long)logical);
 	return;
 
 uncorrectable:
@@ -337,9 +495,8 @@ uncorrectable:
 	++sdev->stat.uncorrectable_errors;
 	spin_unlock(&sdev->stat_lock);
 
-	if (printk_ratelimit())
-		printk(KERN_ERR "btrfs: unable to fixup at %llu\n",
-			 (unsigned long long)logical);
+	printk_ratelimited(KERN_ERR "btrfs: unable to fixup (regular) error at "
+				"logical %llu\n", (unsigned long long)logical);
 }
 
 static int scrub_fixup_io(int rw, struct block_device *bdev, sector_t sector,
-- 
cgit v1.2.3


From 193ea74b2729e6ddc08fb6bde6e15a3bd4d94071 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Mon, 13 Jun 2011 19:56:54 +0200
Subject: btrfs scrub: bugfix: mirror_num off by one

Fix the mirror_num determination in scrub_stripe. The rest of the scrub code
did not use mirror_num for anything important and that error went unnoticed.
The nodatasum fixup patch of this set depends on a correct mirror_num.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/scrub.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 221fd5c48736..59caf8fcd1c7 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -452,7 +452,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 	 * first find a good copy
 	 */
 	for (i = 0; i < multi->num_stripes; ++i) {
-		if (i == sbio->spag[ix].mirror_num)
+		if (i + 1 == sbio->spag[ix].mirror_num)
 			continue;
 
 		if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
@@ -930,21 +930,21 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 		offset = map->stripe_len * num;
 		increment = map->stripe_len * map->num_stripes;
-		mirror_num = 0;
+		mirror_num = 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		int factor = map->num_stripes / map->sub_stripes;
 		offset = map->stripe_len * (num / map->sub_stripes);
 		increment = map->stripe_len * factor;
-		mirror_num = num % map->sub_stripes;
+		mirror_num = num % map->sub_stripes + 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
 		increment = map->stripe_len;
-		mirror_num = num % map->num_stripes;
+		mirror_num = num % map->num_stripes + 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		increment = map->stripe_len;
-		mirror_num = num % map->num_stripes;
+		mirror_num = num % map->num_stripes + 1;
 	} else {
 		increment = map->stripe_len;
-		mirror_num = 0;
+		mirror_num = 1;
 	}
 
 	path = btrfs_alloc_path();
-- 
cgit v1.2.3


From 8ddc7d9cd0a00062247c732b96386ec2462bdbc7 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Mon, 13 Jun 2011 20:02:58 +0200
Subject: btrfs: add mirror_num to extent_read_full_page

Currently, extent_read_full_page always assumes we are trying to read mirror
0, which generally is the best we can do. To add flexibility, pass it as a
parameter. This will be needed by scrub fixup code.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/disk-io.c   | 2 +-
 fs/btrfs/extent_io.c | 6 +++---
 fs/btrfs/extent_io.h | 2 +-
 fs/btrfs/inode.c     | 2 +-
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07b3ac662e19..dc0343802535 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -908,7 +908,7 @@ static int btree_readpage(struct file *file, struct page *page)
 {
 	struct extent_io_tree *tree;
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	return extent_read_full_page(tree, page, btree_get_extent);
+	return extent_read_full_page(tree, page, btree_get_extent, 0);
 }
 
 static int btree_releasepage(struct page *page, gfp_t gfp_flags)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d418164a35f1..afebb95e3490 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2076,16 +2076,16 @@ out:
 }
 
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
-			    get_extent_t *get_extent)
+			    get_extent_t *get_extent, int mirror_num)
 {
 	struct bio *bio = NULL;
 	unsigned long bio_flags = 0;
 	int ret;
 
-	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
+	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
 				      &bio_flags);
 	if (bio)
-		ret = submit_one_bio(READ, bio, 0, bio_flags);
+		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
 	return ret;
 }
 
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7b2f0c3e7929..a9dd994bf826 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -185,7 +185,7 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
 		    gfp_t mask);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
-			  get_extent_t *get_extent);
+			  get_extent_t *get_extent, int mirror_num);
 int __init extent_io_init(void);
 void extent_io_exit(void);
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b2d004ad66a0..efee8c7cfa45 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6289,7 +6289,7 @@ int btrfs_readpage(struct file *file, struct page *page)
 {
 	struct extent_io_tree *tree;
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	return extent_read_full_page(tree, page, btrfs_get_extent);
+	return extent_read_full_page(tree, page, btrfs_get_extent, 0);
 }
 
 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
-- 
cgit v1.2.3


From e12fa9cd390f8e93a9144bd99bd6f6ed316fbc1e Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Fri, 17 Jun 2011 15:55:21 +0200
Subject: btrfs scrub: use int for mirror_num, not u64

the rest of the code uses int mirror_num, and so should scrub

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/scrub.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 59caf8fcd1c7..41a01147b959 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -65,7 +65,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix);
 struct scrub_page {
 	u64			flags;  /* extent flags */
 	u64			generation;
-	u64			mirror_num;
+	int			mirror_num;
 	int			have_csum;
 	u8			csum[BTRFS_CSUM_SIZE];
 };
@@ -776,7 +776,7 @@ nomem:
 }
 
 static int scrub_page(struct scrub_dev *sdev, u64 logical, u64 len,
-		      u64 physical, u64 flags, u64 gen, u64 mirror_num,
+		      u64 physical, u64 flags, u64 gen, int mirror_num,
 		      u8 *csum, int force)
 {
 	struct scrub_bio *sbio;
@@ -873,7 +873,7 @@ static int scrub_find_csum(struct scrub_dev *sdev, u64 logical, u64 len,
 
 /* scrub extent tries to collect up to 64 kB for each bio */
 static int scrub_extent(struct scrub_dev *sdev, u64 logical, u64 len,
-			u64 physical, u64 flags, u64 gen, u64 mirror_num)
+			u64 physical, u64 flags, u64 gen, int mirror_num)
 {
 	int ret;
 	u8 csum[BTRFS_CSUM_SIZE];
@@ -919,7 +919,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 	u64 physical;
 	u64 logical;
 	u64 generation;
-	u64 mirror_num;
+	int mirror_num;
 
 	u64 increment = map->stripe_len;
 	u64 offset;
-- 
cgit v1.2.3


From 0ef8e45158f97dde4801b535e25f70f7caf01a27 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Mon, 13 Jun 2011 20:04:15 +0200
Subject: btrfs scrub: add fixup code for errors on nodatasum files

This removes a FIXME comment and introduces the first part of nodatasum
fixup: It gets the corresponding inode for a logical address and triggers a
regular readpage for the corrupted sector.

Once we have on-the-fly error correction our error will be automatically
corrected. The correction code is expected to clear the newly introduced
EXTENT_DAMAGED flag, making scrub report that error as "corrected" instead
of "uncorrectable" eventually.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/extent_io.h |   1 +
 fs/btrfs/scrub.c     | 188 +++++++++++++++++++++++++++++++++++++++++++++++++--
 2 files changed, 183 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index a9dd994bf826..435d454b9926 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,6 +17,7 @@
 #define EXTENT_NODATASUM (1 << 10)
 #define EXTENT_DO_ACCOUNTING (1 << 11)
 #define EXTENT_FIRST_DELALLOC (1 << 12)
+#define EXTENT_DAMAGED (1 << 13)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
 
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 41a01147b959..db09f01c0e4f 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -22,6 +22,7 @@
 #include "volumes.h"
 #include "disk-io.h"
 #include "ordered-data.h"
+#include "transaction.h"
 #include "backref.h"
 
 /*
@@ -89,6 +90,7 @@ struct scrub_dev {
 	int			first_free;
 	int			curr;
 	atomic_t		in_flight;
+	atomic_t		fixup_cnt;
 	spinlock_t		list_lock;
 	wait_queue_head_t	list_wait;
 	u16			csum_size;
@@ -102,6 +104,14 @@ struct scrub_dev {
 	spinlock_t		stat_lock;
 };
 
+struct scrub_fixup_nodatasum {
+	struct scrub_dev	*sdev;
+	u64			logical;
+	struct btrfs_root	*root;
+	struct btrfs_work	work;
+	int			mirror_num;
+};
+
 struct scrub_warning {
 	struct btrfs_path	*path;
 	u64			extent_item_size;
@@ -190,12 +200,13 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
 
 		if (i != SCRUB_BIOS_PER_DEV-1)
 			sdev->bios[i]->next_free = i + 1;
-		 else
+		else
 			sdev->bios[i]->next_free = -1;
 	}
 	sdev->first_free = 0;
 	sdev->curr = -1;
 	atomic_set(&sdev->in_flight, 0);
+	atomic_set(&sdev->fixup_cnt, 0);
 	atomic_set(&sdev->cancel_req, 0);
 	sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
 	INIT_LIST_HEAD(&sdev->csum_list);
@@ -347,6 +358,151 @@ out:
 	kfree(swarn.msg_buf);
 }
 
+static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
+{
+	struct page *page;
+	unsigned long index;
+	struct scrub_fixup_nodatasum *fixup = ctx;
+	int ret;
+	int corrected;
+	struct btrfs_key key;
+	struct inode *inode;
+	u64 end = offset + PAGE_SIZE - 1;
+	struct btrfs_root *local_root;
+
+	key.objectid = root;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+	local_root = btrfs_read_fs_root_no_name(fixup->root->fs_info, &key);
+	if (IS_ERR(local_root))
+		return PTR_ERR(local_root);
+
+	key.type = BTRFS_INODE_ITEM_KEY;
+	key.objectid = inum;
+	key.offset = 0;
+	inode = btrfs_iget(fixup->root->fs_info->sb, &key, local_root, NULL);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	ret = set_extent_bit(&BTRFS_I(inode)->io_tree, offset, end,
+				EXTENT_DAMAGED, 0, NULL, NULL, GFP_NOFS);
+
+	/* set_extent_bit should either succeed or give proper error */
+	WARN_ON(ret > 0);
+	if (ret)
+		return ret < 0 ? ret : -EFAULT;
+
+	index = offset >> PAGE_CACHE_SHIFT;
+
+	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+	if (!page)
+		return -ENOMEM;
+
+	ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
+					btrfs_get_extent, fixup->mirror_num);
+	wait_on_page_locked(page);
+	corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, end,
+					EXTENT_DAMAGED, 0, NULL);
+
+	if (corrected)
+		WARN_ON(!PageUptodate(page));
+	else
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, end,
+					EXTENT_DAMAGED, 0, 0, NULL, GFP_NOFS);
+
+	put_page(page);
+	iput(inode);
+
+	if (ret < 0)
+		return ret;
+
+	if (ret == 0 && corrected) {
+		/*
+		 * we only need to call readpage for one of the inodes belonging
+		 * to this extent. so make iterate_extent_inodes stop
+		 */
+		return 1;
+	}
+
+	return -EIO;
+}
+
+static void scrub_fixup_nodatasum(struct btrfs_work *work)
+{
+	int ret;
+	struct scrub_fixup_nodatasum *fixup;
+	struct scrub_dev *sdev;
+	struct btrfs_trans_handle *trans = NULL;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_path *path;
+	int uncorrectable = 0;
+
+	fixup = container_of(work, struct scrub_fixup_nodatasum, work);
+	sdev = fixup->sdev;
+	fs_info = fixup->root->fs_info;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		spin_lock(&sdev->stat_lock);
+		++sdev->stat.malloc_errors;
+		spin_unlock(&sdev->stat_lock);
+		uncorrectable = 1;
+		goto out;
+	}
+
+	trans = btrfs_join_transaction(fixup->root);
+	if (IS_ERR(trans)) {
+		uncorrectable = 1;
+		goto out;
+	}
+
+	/*
+	 * the idea is to trigger a regular read through the standard path. we
+	 * read a page from the (failed) logical address by specifying the
+	 * corresponding copynum of the failed sector. thus, that readpage is
+	 * expected to fail.
+	 * that is the point where on-the-fly error correction will kick in
+	 * (once it's finished) and rewrite the failed sector if a good copy
+	 * can be found.
+	 */
+	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
+						path, scrub_fixup_readpage,
+						fixup);
+	if (ret < 0) {
+		uncorrectable = 1;
+		goto out;
+	}
+	WARN_ON(ret != 1);
+
+	spin_lock(&sdev->stat_lock);
+	++sdev->stat.corrected_errors;
+	spin_unlock(&sdev->stat_lock);
+
+out:
+	if (trans && !IS_ERR(trans))
+		btrfs_end_transaction(trans, fixup->root);
+	if (uncorrectable) {
+		spin_lock(&sdev->stat_lock);
+		++sdev->stat.uncorrectable_errors;
+		spin_unlock(&sdev->stat_lock);
+		printk_ratelimited(KERN_ERR "btrfs: unable to fixup "
+					"(nodatasum) error at logical %llu\n",
+					fixup->logical);
+	}
+
+	btrfs_free_path(path);
+	kfree(fixup);
+
+	/* see caller why we're pretending to be paused in the scrub counters */
+	mutex_lock(&fs_info->scrub_lock);
+	atomic_dec(&fs_info->scrubs_running);
+	atomic_dec(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
+	atomic_dec(&sdev->fixup_cnt);
+	wake_up(&fs_info->scrub_pause_wait);
+	wake_up(&sdev->list_wait);
+}
+
 /*
  * scrub_recheck_error gets called when either verification of the page
  * failed or the bio failed to read, e.g. with EIO. In the latter case,
@@ -417,6 +573,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
 	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
 	struct btrfs_multi_bio *multi = NULL;
+	struct scrub_fixup_nodatasum *fixup;
 	u64 logical = sbio->logical + ix * PAGE_SIZE;
 	u64 length;
 	int i;
@@ -425,12 +582,30 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 
 	if ((sbio->spag[ix].flags & BTRFS_EXTENT_FLAG_DATA) &&
 	    (sbio->spag[ix].have_csum == 0)) {
+		fixup = kzalloc(sizeof(*fixup), GFP_NOFS);
+		if (!fixup)
+			goto uncorrectable;
+		fixup->sdev = sdev;
+		fixup->logical = logical;
+		fixup->root = fs_info->extent_root;
+		fixup->mirror_num = sbio->spag[ix].mirror_num;
 		/*
-		 * nodatasum, don't try to fix anything
-		 * FIXME: we can do better, open the inode and trigger a
-		 * writeback
+		 * increment scrubs_running to prevent cancel requests from
+		 * completing as long as a fixup worker is running. we must also
+		 * increment scrubs_paused to prevent deadlocking on pause
+		 * requests used for transactions commits (as the worker uses a
+		 * transaction context). it is safe to regard the fixup worker
+		 * as paused for all matters practical. effectively, we only
+		 * avoid cancellation requests from completing.
 		 */
-		goto uncorrectable;
+		mutex_lock(&fs_info->scrub_lock);
+		atomic_inc(&fs_info->scrubs_running);
+		atomic_inc(&fs_info->scrubs_paused);
+		mutex_unlock(&fs_info->scrub_lock);
+		atomic_inc(&sdev->fixup_cnt);
+		fixup->work.func = scrub_fixup_nodatasum;
+		btrfs_queue_worker(&fs_info->scrub_workers, &fixup->work);
+		return;
 	}
 
 	length = PAGE_SIZE;
@@ -1425,10 +1600,11 @@ int btrfs_scrub_dev(struct btrfs_root *root, u64 devid, u64 start, u64 end,
 		ret = scrub_enumerate_chunks(sdev, start, end);
 
 	wait_event(sdev->list_wait, atomic_read(&sdev->in_flight) == 0);
-
 	atomic_dec(&fs_info->scrubs_running);
 	wake_up(&fs_info->scrub_pause_wait);
 
+	wait_event(sdev->list_wait, atomic_read(&sdev->fixup_cnt) == 0);
+
 	if (progress)
 		memcpy(progress, &sdev->stat, sizeof(*progress));
 
-- 
cgit v1.2.3


From d7728c960dccf775b92f2c4139f1216275a45c44 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Thu, 7 Jul 2011 16:48:38 +0200
Subject: btrfs: new ioctls to do logical->inode and inode->path resolving

these ioctls make use of the new functions initially added for scrub. they
return all inodes belonging to a logical address (BTRFS_IOC_LOGICAL_INO) and
all paths belonging to an inode (BTRFS_IOC_INO_PATHS).

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/ioctl.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/ioctl.h |  19 ++++++++
 2 files changed, 162 insertions(+)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 538f65a79ec5..7f57efa76d11 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -51,6 +51,7 @@
 #include "volumes.h"
 #include "locking.h"
 #include "inode-map.h"
+#include "backref.h"
 
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
@@ -2855,6 +2856,144 @@ static long btrfs_ioctl_scrub_progress(struct btrfs_root *root,
 	return ret;
 }
 
+static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
+{
+	int ret = 0;
+	int i;
+	unsigned long rel_ptr;
+	int size;
+	struct btrfs_ioctl_ino_path_args *ipa;
+	struct inode_fs_paths *ipath = NULL;
+	struct btrfs_path *path;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ipa = memdup_user(arg, sizeof(*ipa));
+	if (IS_ERR(ipa)) {
+		ret = PTR_ERR(ipa);
+		ipa = NULL;
+		goto out;
+	}
+
+	size = min_t(u32, ipa->size, 4096);
+	ipath = init_ipath(size, root, path);
+	if (IS_ERR(ipath)) {
+		ret = PTR_ERR(ipath);
+		ipath = NULL;
+		goto out;
+	}
+
+	ret = paths_from_inode(ipa->inum, ipath);
+	if (ret < 0)
+		goto out;
+
+	for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
+		rel_ptr = ipath->fspath->str[i] - (char *)ipath->fspath->str;
+		ipath->fspath->str[i] = (void *)rel_ptr;
+	}
+
+	ret = copy_to_user(ipa->fspath, ipath->fspath, size);
+	if (ret) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+out:
+	btrfs_free_path(path);
+	free_ipath(ipath);
+	kfree(ipa);
+
+	return ret;
+}
+
+static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
+{
+	struct btrfs_data_container *inodes = ctx;
+	const size_t c = 3 * sizeof(u64);
+
+	if (inodes->bytes_left >= c) {
+		inodes->bytes_left -= c;
+		inodes->val[inodes->elem_cnt] = inum;
+		inodes->val[inodes->elem_cnt + 1] = offset;
+		inodes->val[inodes->elem_cnt + 2] = root;
+		inodes->elem_cnt += 3;
+	} else {
+		inodes->bytes_missing += c - inodes->bytes_left;
+		inodes->bytes_left = 0;
+		inodes->elem_missed += 3;
+	}
+
+	return 0;
+}
+
+static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
+					void __user *arg)
+{
+	int ret = 0;
+	int size;
+	u64 extent_offset;
+	struct btrfs_ioctl_logical_ino_args *loi;
+	struct btrfs_data_container *inodes = NULL;
+	struct btrfs_path *path = NULL;
+	struct btrfs_key key;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	loi = memdup_user(arg, sizeof(*loi));
+	if (IS_ERR(loi)) {
+		ret = PTR_ERR(loi);
+		loi = NULL;
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	size = min_t(u32, loi->size, 4096);
+	inodes = init_data_container(size);
+	if (IS_ERR(inodes)) {
+		ret = PTR_ERR(inodes);
+		inodes = NULL;
+		goto out;
+	}
+
+	ret = extent_from_logical(root->fs_info, loi->logical, path, &key);
+
+	if (ret & BTRFS_EXTENT_FLAG_TREE_BLOCK)
+		ret = -ENOENT;
+	if (ret < 0)
+		goto out;
+
+	extent_offset = loi->logical - key.objectid;
+	ret = iterate_extent_inodes(root->fs_info, path, key.objectid,
+					extent_offset, build_ino_list, inodes);
+
+	if (ret < 0)
+		goto out;
+
+	ret = copy_to_user(loi->inodes, inodes, size);
+	if (ret)
+		ret = -EFAULT;
+
+out:
+	btrfs_free_path(path);
+	kfree(inodes);
+	kfree(loi);
+
+	return ret;
+}
+
 long btrfs_ioctl(struct file *file, unsigned int
 		cmd, unsigned long arg)
 {
@@ -2912,6 +3051,10 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_tree_search(file, argp);
 	case BTRFS_IOC_INO_LOOKUP:
 		return btrfs_ioctl_ino_lookup(file, argp);
+	case BTRFS_IOC_INO_PATHS:
+		return btrfs_ioctl_ino_to_path(root, argp);
+	case BTRFS_IOC_LOGICAL_INO:
+		return btrfs_ioctl_logical_to_ino(root, argp);
 	case BTRFS_IOC_SPACE_INFO:
 		return btrfs_ioctl_space_info(root, argp);
 	case BTRFS_IOC_SYNC:
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 1857e3871843..2da30d4950e6 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -204,6 +204,20 @@ struct btrfs_data_container {
 	};
 };
 
+struct btrfs_ioctl_ino_path_args {
+	__u64				inum;		/* in */
+	__u32				size;		/* in */
+	__u64				reserved[4];
+	struct btrfs_data_container	*fspath;	/* out */
+};
+
+struct btrfs_ioctl_logical_ino_args {
+	__u64				logical;	/* in */
+	__u32				size;		/* in */
+	__u64				reserved[4];
+	struct btrfs_data_container	*inodes;	/* out */
+};
+
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
 				   struct btrfs_ioctl_vol_args)
 #define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \
@@ -259,4 +273,9 @@ struct btrfs_data_container {
 				 struct btrfs_ioctl_dev_info_args)
 #define BTRFS_IOC_FS_INFO _IOR(BTRFS_IOCTL_MAGIC, 31, \
 			       struct btrfs_ioctl_fs_info_args)
+#define BTRFS_IOC_INO_PATHS _IOWR(BTRFS_IOCTL_MAGIC, 35, \
+					struct btrfs_ioctl_ino_path_args)
+#define BTRFS_IOC_LOGICAL_INO _IOWR(BTRFS_IOCTL_MAGIC, 36, \
+					struct btrfs_ioctl_ino_path_args)
+
 #endif
-- 
cgit v1.2.3


From a1d3c4786a4b9c71c0767aa656a759968f7554b6 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Thu, 4 Aug 2011 17:15:33 +0200
Subject: btrfs: btrfs_multi_bio replaced with btrfs_bio

btrfs_bio is a bio abstraction able to split and not complete after the last
bio has returned (like the old btrfs_multi_bio). Additionally, btrfs_bio
tracks the mirror_num used to read data which can be used for error
correction purposes.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/extent-tree.c |  10 ++--
 fs/btrfs/scrub.c       |  20 ++++----
 fs/btrfs/volumes.c     | 128 ++++++++++++++++++++++++++-----------------------
 fs/btrfs/volumes.h     |  10 ++--
 4 files changed, 90 insertions(+), 78 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5be06a2462f..119f842c1d4f 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1770,18 +1770,18 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 {
 	int ret;
 	u64 discarded_bytes = 0;
-	struct btrfs_multi_bio *multi = NULL;
+	struct btrfs_bio *bbio = NULL;
 
 
 	/* Tell the block device(s) that the sectors can be discarded */
 	ret = btrfs_map_block(&root->fs_info->mapping_tree, REQ_DISCARD,
-			      bytenr, &num_bytes, &multi, 0);
+			      bytenr, &num_bytes, &bbio, 0);
 	if (!ret) {
-		struct btrfs_bio_stripe *stripe = multi->stripes;
+		struct btrfs_bio_stripe *stripe = bbio->stripes;
 		int i;
 
 
-		for (i = 0; i < multi->num_stripes; i++, stripe++) {
+		for (i = 0; i < bbio->num_stripes; i++, stripe++) {
 			if (!stripe->dev->can_discard)
 				continue;
 
@@ -1800,7 +1800,7 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
 			 */
 			ret = 0;
 		}
-		kfree(multi);
+		kfree(bbio);
 	}
 
 	if (actual_bytes)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index db09f01c0e4f..97142a218f0a 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -572,7 +572,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 	struct scrub_dev *sdev = sbio->sdev;
 	struct btrfs_fs_info *fs_info = sdev->dev->dev_root->fs_info;
 	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
-	struct btrfs_multi_bio *multi = NULL;
+	struct btrfs_bio *bbio = NULL;
 	struct scrub_fixup_nodatasum *fixup;
 	u64 logical = sbio->logical + ix * PAGE_SIZE;
 	u64 length;
@@ -610,8 +610,8 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 
 	length = PAGE_SIZE;
 	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length,
-			      &multi, 0);
-	if (ret || !multi || length < PAGE_SIZE) {
+			      &bbio, 0);
+	if (ret || !bbio || length < PAGE_SIZE) {
 		printk(KERN_ERR
 		       "scrub_fixup: btrfs_map_block failed us for %llu\n",
 		       (unsigned long long)logical);
@@ -619,19 +619,19 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 		return;
 	}
 
-	if (multi->num_stripes == 1)
+	if (bbio->num_stripes == 1)
 		/* there aren't any replicas */
 		goto uncorrectable;
 
 	/*
 	 * first find a good copy
 	 */
-	for (i = 0; i < multi->num_stripes; ++i) {
+	for (i = 0; i < bbio->num_stripes; ++i) {
 		if (i + 1 == sbio->spag[ix].mirror_num)
 			continue;
 
-		if (scrub_fixup_io(READ, multi->stripes[i].dev->bdev,
-				   multi->stripes[i].physical >> 9,
+		if (scrub_fixup_io(READ, bbio->stripes[i].dev->bdev,
+				   bbio->stripes[i].physical >> 9,
 				   sbio->bio->bi_io_vec[ix].bv_page)) {
 			/* I/O-error, this is not a good copy */
 			continue;
@@ -640,7 +640,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 		if (scrub_fixup_check(sbio, ix) == 0)
 			break;
 	}
-	if (i == multi->num_stripes)
+	if (i == bbio->num_stripes)
 		goto uncorrectable;
 
 	if (!sdev->readonly) {
@@ -655,7 +655,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 		}
 	}
 
-	kfree(multi);
+	kfree(bbio);
 	spin_lock(&sdev->stat_lock);
 	++sdev->stat.corrected_errors;
 	spin_unlock(&sdev->stat_lock);
@@ -665,7 +665,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 	return;
 
 uncorrectable:
-	kfree(multi);
+	kfree(bbio);
 	spin_lock(&sdev->stat_lock);
 	++sdev->stat.uncorrectable_errors;
 	spin_unlock(&sdev->stat_lock);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f2a4cc79da61..9db4d7962f9b 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -2848,7 +2848,7 @@ static int find_live_mirror(struct map_lookup *map, int first, int num,
 
 static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 			     u64 logical, u64 *length,
-			     struct btrfs_multi_bio **multi_ret,
+			     struct btrfs_bio **bbio_ret,
 			     int mirror_num)
 {
 	struct extent_map *em;
@@ -2866,18 +2866,18 @@ static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 	int i;
 	int num_stripes;
 	int max_errors = 0;
-	struct btrfs_multi_bio *multi = NULL;
+	struct btrfs_bio *bbio = NULL;
 
-	if (multi_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
+	if (bbio_ret && !(rw & (REQ_WRITE | REQ_DISCARD)))
 		stripes_allocated = 1;
 again:
-	if (multi_ret) {
-		multi = kzalloc(btrfs_multi_bio_size(stripes_allocated),
+	if (bbio_ret) {
+		bbio = kzalloc(btrfs_bio_size(stripes_allocated),
 				GFP_NOFS);
-		if (!multi)
+		if (!bbio)
 			return -ENOMEM;
 
-		atomic_set(&multi->error, 0);
+		atomic_set(&bbio->error, 0);
 	}
 
 	read_lock(&em_tree->lock);
@@ -2898,7 +2898,7 @@ again:
 	if (mirror_num > map->num_stripes)
 		mirror_num = 0;
 
-	/* if our multi bio struct is too small, back off and try again */
+	/* if our btrfs_bio struct is too small, back off and try again */
 	if (rw & REQ_WRITE) {
 		if (map->type & (BTRFS_BLOCK_GROUP_RAID1 |
 				 BTRFS_BLOCK_GROUP_DUP)) {
@@ -2917,11 +2917,11 @@ again:
 			stripes_required = map->num_stripes;
 		}
 	}
-	if (multi_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
+	if (bbio_ret && (rw & (REQ_WRITE | REQ_DISCARD)) &&
 	    stripes_allocated < stripes_required) {
 		stripes_allocated = map->num_stripes;
 		free_extent_map(em);
-		kfree(multi);
+		kfree(bbio);
 		goto again;
 	}
 	stripe_nr = offset;
@@ -2950,7 +2950,7 @@ again:
 		*length = em->len - offset;
 	}
 
-	if (!multi_ret)
+	if (!bbio_ret)
 		goto out;
 
 	num_stripes = 1;
@@ -2975,13 +2975,17 @@ again:
 			stripe_index = find_live_mirror(map, 0,
 					    map->num_stripes,
 					    current->pid % map->num_stripes);
+			mirror_num = stripe_index + 1;
 		}
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
-		if (rw & (REQ_WRITE | REQ_DISCARD))
+		if (rw & (REQ_WRITE | REQ_DISCARD)) {
 			num_stripes = map->num_stripes;
-		else if (mirror_num)
+		} else if (mirror_num) {
 			stripe_index = mirror_num - 1;
+		} else {
+			mirror_num = 1;
+		}
 
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		int factor = map->num_stripes / map->sub_stripes;
@@ -3001,6 +3005,7 @@ again:
 			stripe_index = find_live_mirror(map, stripe_index,
 					      map->sub_stripes, stripe_index +
 					      current->pid % map->sub_stripes);
+			mirror_num = stripe_index + 1;
 		}
 	} else {
 		/*
@@ -3009,15 +3014,16 @@ again:
 		 * stripe_index is the number of our device in the stripe array
 		 */
 		stripe_index = do_div(stripe_nr, map->num_stripes);
+		mirror_num = stripe_index + 1;
 	}
 	BUG_ON(stripe_index >= map->num_stripes);
 
 	if (rw & REQ_DISCARD) {
 		for (i = 0; i < num_stripes; i++) {
-			multi->stripes[i].physical =
+			bbio->stripes[i].physical =
 				map->stripes[stripe_index].physical +
 				stripe_offset + stripe_nr * map->stripe_len;
-			multi->stripes[i].dev = map->stripes[stripe_index].dev;
+			bbio->stripes[i].dev = map->stripes[stripe_index].dev;
 
 			if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 				u64 stripes;
@@ -3038,16 +3044,16 @@ again:
 				}
 				stripes = stripe_nr_end - 1 - j;
 				do_div(stripes, map->num_stripes);
-				multi->stripes[i].length = map->stripe_len *
+				bbio->stripes[i].length = map->stripe_len *
 					(stripes - stripe_nr + 1);
 
 				if (i == 0) {
-					multi->stripes[i].length -=
+					bbio->stripes[i].length -=
 						stripe_offset;
 					stripe_offset = 0;
 				}
 				if (stripe_index == last_stripe)
-					multi->stripes[i].length -=
+					bbio->stripes[i].length -=
 						stripe_end_offset;
 			} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 				u64 stripes;
@@ -3072,11 +3078,11 @@ again:
 				}
 				stripes = stripe_nr_end - 1 - j;
 				do_div(stripes, factor);
-				multi->stripes[i].length = map->stripe_len *
+				bbio->stripes[i].length = map->stripe_len *
 					(stripes - stripe_nr + 1);
 
 				if (i < map->sub_stripes) {
-					multi->stripes[i].length -=
+					bbio->stripes[i].length -=
 						stripe_offset;
 					if (i == map->sub_stripes - 1)
 						stripe_offset = 0;
@@ -3084,11 +3090,11 @@ again:
 				if (stripe_index >= last_stripe &&
 				    stripe_index <= (last_stripe +
 						     map->sub_stripes - 1)) {
-					multi->stripes[i].length -=
+					bbio->stripes[i].length -=
 						stripe_end_offset;
 				}
 			} else
-				multi->stripes[i].length = *length;
+				bbio->stripes[i].length = *length;
 
 			stripe_index++;
 			if (stripe_index == map->num_stripes) {
@@ -3099,19 +3105,20 @@ again:
 		}
 	} else {
 		for (i = 0; i < num_stripes; i++) {
-			multi->stripes[i].physical =
+			bbio->stripes[i].physical =
 				map->stripes[stripe_index].physical +
 				stripe_offset +
 				stripe_nr * map->stripe_len;
-			multi->stripes[i].dev =
+			bbio->stripes[i].dev =
 				map->stripes[stripe_index].dev;
 			stripe_index++;
 		}
 	}
-	if (multi_ret) {
-		*multi_ret = multi;
-		multi->num_stripes = num_stripes;
-		multi->max_errors = max_errors;
+	if (bbio_ret) {
+		*bbio_ret = bbio;
+		bbio->num_stripes = num_stripes;
+		bbio->max_errors = max_errors;
+		bbio->mirror_num = mirror_num;
 	}
 out:
 	free_extent_map(em);
@@ -3120,9 +3127,9 @@ out:
 
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		      u64 logical, u64 *length,
-		      struct btrfs_multi_bio **multi_ret, int mirror_num)
+		      struct btrfs_bio **bbio_ret, int mirror_num)
 {
-	return __btrfs_map_block(map_tree, rw, logical, length, multi_ret,
+	return __btrfs_map_block(map_tree, rw, logical, length, bbio_ret,
 				 mirror_num);
 }
 
@@ -3191,28 +3198,28 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 	return 0;
 }
 
-static void end_bio_multi_stripe(struct bio *bio, int err)
+static void btrfs_end_bio(struct bio *bio, int err)
 {
-	struct btrfs_multi_bio *multi = bio->bi_private;
+	struct btrfs_bio *bbio = bio->bi_private;
 	int is_orig_bio = 0;
 
 	if (err)
-		atomic_inc(&multi->error);
+		atomic_inc(&bbio->error);
 
-	if (bio == multi->orig_bio)
+	if (bio == bbio->orig_bio)
 		is_orig_bio = 1;
 
-	if (atomic_dec_and_test(&multi->stripes_pending)) {
+	if (atomic_dec_and_test(&bbio->stripes_pending)) {
 		if (!is_orig_bio) {
 			bio_put(bio);
-			bio = multi->orig_bio;
+			bio = bbio->orig_bio;
 		}
-		bio->bi_private = multi->private;
-		bio->bi_end_io = multi->end_io;
+		bio->bi_private = bbio->private;
+		bio->bi_end_io = bbio->end_io;
 		/* only send an error to the higher layers if it is
 		 * beyond the tolerance of the multi-bio
 		 */
-		if (atomic_read(&multi->error) > multi->max_errors) {
+		if (atomic_read(&bbio->error) > bbio->max_errors) {
 			err = -EIO;
 		} else if (err) {
 			/*
@@ -3222,7 +3229,7 @@ static void end_bio_multi_stripe(struct bio *bio, int err)
 			set_bit(BIO_UPTODATE, &bio->bi_flags);
 			err = 0;
 		}
-		kfree(multi);
+		kfree(bbio);
 
 		bio_endio(bio, err);
 	} else if (!is_orig_bio) {
@@ -3302,20 +3309,20 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	u64 logical = (u64)bio->bi_sector << 9;
 	u64 length = 0;
 	u64 map_length;
-	struct btrfs_multi_bio *multi = NULL;
 	int ret;
 	int dev_nr = 0;
 	int total_devs = 1;
+	struct btrfs_bio *bbio = NULL;
 
 	length = bio->bi_size;
 	map_tree = &root->fs_info->mapping_tree;
 	map_length = length;
 
-	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi,
+	ret = btrfs_map_block(map_tree, rw, logical, &map_length, &bbio,
 			      mirror_num);
 	BUG_ON(ret);
 
-	total_devs = multi->num_stripes;
+	total_devs = bbio->num_stripes;
 	if (map_length < length) {
 		printk(KERN_CRIT "mapping failed logical %llu bio len %llu "
 		       "len %llu\n", (unsigned long long)logical,
@@ -3323,25 +3330,28 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		       (unsigned long long)map_length);
 		BUG();
 	}
-	multi->end_io = first_bio->bi_end_io;
-	multi->private = first_bio->bi_private;
-	multi->orig_bio = first_bio;
-	atomic_set(&multi->stripes_pending, multi->num_stripes);
+
+	bbio->orig_bio = first_bio;
+	bbio->private = first_bio->bi_private;
+	bbio->end_io = first_bio->bi_end_io;
+	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
 
 	while (dev_nr < total_devs) {
-		if (total_devs > 1) {
-			if (dev_nr < total_devs - 1) {
-				bio = bio_clone(first_bio, GFP_NOFS);
-				BUG_ON(!bio);
-			} else {
-				bio = first_bio;
-			}
-			bio->bi_private = multi;
-			bio->bi_end_io = end_bio_multi_stripe;
+		if (dev_nr < total_devs - 1) {
+			bio = bio_clone(first_bio, GFP_NOFS);
+			BUG_ON(!bio);
+		} else {
+			bio = first_bio;
 		}
-		bio->bi_sector = multi->stripes[dev_nr].physical >> 9;
-		dev = multi->stripes[dev_nr].dev;
+		bio->bi_private = bbio;
+		bio->bi_end_io = btrfs_end_bio;
+		bio->bi_sector = bbio->stripes[dev_nr].physical >> 9;
+		dev = bbio->stripes[dev_nr].dev;
 		if (dev && dev->bdev && (rw != WRITE || dev->writeable)) {
+			pr_debug("btrfs_map_bio: rw %d, secor=%llu, dev=%lu "
+				 "(%s id %llu), size=%u\n", rw,
+				 (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev,
+				 dev->name, dev->devid, bio->bi_size);
 			bio->bi_bdev = dev->bdev;
 			if (async_submit)
 				schedule_bio(root, dev, rw, bio);
@@ -3354,8 +3364,6 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 		}
 		dev_nr++;
 	}
-	if (total_devs == 1)
-		kfree(multi);
 	return 0;
 }
 
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6d866db4e177..71f4f3f67495 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -136,7 +136,10 @@ struct btrfs_bio_stripe {
 	u64 length; /* only used for discard mappings */
 };
 
-struct btrfs_multi_bio {
+struct btrfs_bio;
+typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err);
+
+struct btrfs_bio {
 	atomic_t stripes_pending;
 	bio_end_io_t *end_io;
 	struct bio *orig_bio;
@@ -144,6 +147,7 @@ struct btrfs_multi_bio {
 	atomic_t error;
 	int max_errors;
 	int num_stripes;
+	int mirror_num;
 	struct btrfs_bio_stripe stripes[];
 };
 
@@ -171,7 +175,7 @@ struct map_lookup {
 int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
 				   u64 end, u64 *length);
 
-#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \
+#define btrfs_bio_size(n) (sizeof(struct btrfs_bio) + \
 			    (sizeof(struct btrfs_bio_stripe) * (n)))
 
 int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
@@ -180,7 +184,7 @@ int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 			   u64 chunk_offset, u64 start, u64 num_bytes);
 int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw,
 		    u64 logical, u64 *length,
-		    struct btrfs_multi_bio **multi_ret, int mirror_num);
+		    struct btrfs_bio **bbio_ret, int mirror_num);
 int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 		     u64 chunk_start, u64 physical, u64 devid,
 		     u64 **logical, int *naddrs, int *stripe_len);
-- 
cgit v1.2.3


From 1503140d3ec2be9b917d2f8f7c64cb77b79a215b Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Thu, 16 Jun 2011 14:37:03 +0200
Subject: btrfs: Do not use bio->bi_bdev after submission

The block layer modifies bio->bi_bdev and bio->bi_sector while working on
the bio, they do _not_ come back unmodified in the completion callback.

To call add_page, we need at least some bi_bdev set, which is why the code
was working, previously. With this patch, we use the latest_bdev from
fsinfo instead of the leftover in the bio. This gives us the possibility to
use the bi_bdev field for another purpose.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index efee8c7cfa45..936a6fabaa9f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1915,7 +1915,7 @@ static int btrfs_io_failed_hook(struct bio *failed_bio,
 	bio->bi_private = state;
 	bio->bi_end_io = failed_bio->bi_end_io;
 	bio->bi_sector = failrec->logical >> 9;
-	bio->bi_bdev = failed_bio->bi_bdev;
+	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 	bio->bi_size = 0;
 
 	bio_add_page(bio, page, failrec->len, start - page_offset(page));
-- 
cgit v1.2.3


From 2774b2ca3d49124bf1ae89e8d575b3dab4221266 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Thu, 16 Jun 2011 12:05:19 +0200
Subject: btrfs: Put mirror_num in bi_bdev

The error correction code wants to make sure that only the bad mirror is
rewritten. Thus, we need to know which mirror is the bad one. I did not
find a more apropriate field than bi_bdev. But I think using this is fine,
because it is modified by the block layer, anyway, and should not be read
after the bio returned.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/volumes.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9db4d7962f9b..18baac5a3f6c 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -3216,6 +3216,8 @@ static void btrfs_end_bio(struct bio *bio, int err)
 		}
 		bio->bi_private = bbio->private;
 		bio->bi_end_io = bbio->end_io;
+		bio->bi_bdev = (struct block_device *)
+					(unsigned long)bbio->mirror_num;
 		/* only send an error to the higher layers if it is
 		 * beyond the tolerance of the multi-bio
 		 */
-- 
cgit v1.2.3


From 4a54c8c165b66300830a67349fc7595e3fc442f7 Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Fri, 22 Jul 2011 15:41:52 +0200
Subject: btrfs: Moved repair code from inode.c to extent_io.c

The raid-retry code in inode.c can be generalized so that it works for
metadata as well. Thus, this patch moves it to extent_io.c and makes the
raid-retry code a raid-repair code.

Repair works that way: Whenever a read error occurs and we have more
mirrors to try, note the failed mirror, and retry another. If we find a
good one, check if we did note a failure earlier and if so, do not allow
the read to complete until after the bad sector was written with the good
data we just fetched. As we have the extent locked while reading, no one
can change the data in between.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/extent_io.c | 387 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/btrfs/extent_io.h |  10 +-
 fs/btrfs/inode.c     | 155 +--------------------
 3 files changed, 393 insertions(+), 159 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index afebb95e3490..624ef10d36cc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
 #include "compat.h"
 #include "ctree.h"
 #include "btrfs_inode.h"
+#include "volumes.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -1599,6 +1600,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
 	return 0;
 }
 
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data.  This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors.  If another mirror has good data, the page is set up to date
+ * and things continue.  If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
+struct io_failure_record {
+	struct page *page;
+	u64 start;
+	u64 len;
+	u64 logical;
+	unsigned long bio_flags;
+	int this_mirror;
+	int failed_mirror;
+	int in_validation;
+};
+
+static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
+				int did_repair)
+{
+	int ret;
+	int err = 0;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+
+	set_state_private(failure_tree, rec->start, 0);
+	ret = clear_extent_bits(failure_tree, rec->start,
+				rec->start + rec->len - 1,
+				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+	if (ret)
+		err = ret;
+
+	if (did_repair) {
+		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+					rec->start + rec->len - 1,
+					EXTENT_DAMAGED, GFP_NOFS);
+		if (ret && !err)
+			err = ret;
+	}
+
+	kfree(rec);
+	return err;
+}
+
+static void repair_io_failure_callback(struct bio *bio, int err)
+{
+	complete(bio->bi_private);
+}
+
+/*
+ * this bypasses the standard btrfs submit functions deliberately, as
+ * the standard behavior is to write all copies in a raid setup. here we only
+ * want to write the one bad copy. so we do the mapping for ourselves and issue
+ * submit_bio directly.
+ * to avoid any synchonization issues, wait for the data after writing, which
+ * actually prevents the read that triggered the error from finishing.
+ * currently, there can be no more than two copies of every data bit. thus,
+ * exactly one rewrite is required.
+ */
+int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+			u64 length, u64 logical, struct page *page,
+			int mirror_num)
+{
+	struct bio *bio;
+	struct btrfs_device *dev;
+	DECLARE_COMPLETION_ONSTACK(compl);
+	u64 map_length = 0;
+	u64 sector;
+	struct btrfs_bio *bbio = NULL;
+	int ret;
+
+	BUG_ON(!mirror_num);
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	if (!bio)
+		return -EIO;
+	bio->bi_private = &compl;
+	bio->bi_end_io = repair_io_failure_callback;
+	bio->bi_size = 0;
+	map_length = length;
+
+	ret = btrfs_map_block(map_tree, WRITE, logical,
+			      &map_length, &bbio, mirror_num);
+	if (ret) {
+		bio_put(bio);
+		return -EIO;
+	}
+	BUG_ON(mirror_num != bbio->mirror_num);
+	sector = bbio->stripes[mirror_num-1].physical >> 9;
+	bio->bi_sector = sector;
+	dev = bbio->stripes[mirror_num-1].dev;
+	kfree(bbio);
+	if (!dev || !dev->bdev || !dev->writeable) {
+		bio_put(bio);
+		return -EIO;
+	}
+	bio->bi_bdev = dev->bdev;
+	bio_add_page(bio, page, length, start-page_offset(page));
+	submit_bio(WRITE_SYNC, bio);
+	wait_for_completion(&compl);
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+		/* try to remap that extent elsewhere? */
+		bio_put(bio);
+		return -EIO;
+	}
+
+	printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
+			"sector %llu)\n", page->mapping->host->i_ino, start,
+			dev->name, sector);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
+static int clean_io_failure(u64 start, struct page *page)
+{
+	u64 private;
+	u64 private_failure;
+	struct io_failure_record *failrec;
+	struct btrfs_mapping_tree *map_tree;
+	struct extent_state *state;
+	int num_copies;
+	int did_repair = 0;
+	int ret;
+	struct inode *inode = page->mapping->host;
+
+	private = 0;
+	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+				(u64)-1, 1, EXTENT_DIRTY, 0);
+	if (!ret)
+		return 0;
+
+	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
+				&private_failure);
+	if (ret)
+		return 0;
+
+	failrec = (struct io_failure_record *)(unsigned long) private_failure;
+	BUG_ON(!failrec->this_mirror);
+
+	if (failrec->in_validation) {
+		/* there was no real error, just free the record */
+		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
+			 failrec->start);
+		did_repair = 1;
+		goto out;
+	}
+
+	spin_lock(&BTRFS_I(inode)->io_tree.lock);
+	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+					    failrec->start,
+					    EXTENT_LOCKED);
+	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+
+	if (state && state->start == failrec->start) {
+		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
+		num_copies = btrfs_num_copies(map_tree, failrec->logical,
+						failrec->len);
+		if (num_copies > 1)  {
+			ret = repair_io_failure(map_tree, start, failrec->len,
+						failrec->logical, page,
+						failrec->failed_mirror);
+			did_repair = !ret;
+		}
+	}
+
+out:
+	if (!ret)
+		ret = free_io_failure(inode, failrec, did_repair);
+
+	return ret;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. does not investigate in remapping the
+ * failed extent elsewhere, hoping the device will be smart enough to do this as
+ * needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, struct page *page,
+				u64 start, u64 end, int failed_mirror,
+				struct extent_state *state)
+{
+	struct io_failure_record *failrec = NULL;
+	u64 private;
+	struct extent_map *em;
+	struct inode *inode = page->mapping->host;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct bio *bio;
+	int num_copies;
+	int ret;
+	int read_mode;
+	u64 logical;
+
+	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+	ret = get_state_private(failure_tree, start, &private);
+	if (ret) {
+		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+		if (!failrec)
+			return -ENOMEM;
+		failrec->start = start;
+		failrec->len = end - start + 1;
+		failrec->this_mirror = 0;
+		failrec->bio_flags = 0;
+		failrec->in_validation = 0;
+
+		read_lock(&em_tree->lock);
+		em = lookup_extent_mapping(em_tree, start, failrec->len);
+		if (!em) {
+			read_unlock(&em_tree->lock);
+			kfree(failrec);
+			return -EIO;
+		}
+
+		if (em->start > start || em->start + em->len < start) {
+			free_extent_map(em);
+			em = NULL;
+		}
+		read_unlock(&em_tree->lock);
+
+		if (!em || IS_ERR(em)) {
+			kfree(failrec);
+			return -EIO;
+		}
+		logical = start - em->start;
+		logical = em->block_start + logical;
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+			logical = em->block_start;
+			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+			extent_set_compress_type(&failrec->bio_flags,
+						 em->compress_type);
+		}
+		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
+			 "len=%llu\n", logical, start, failrec->len);
+		failrec->logical = logical;
+		free_extent_map(em);
+
+		/* set the bits in the private failure tree */
+		ret = set_extent_bits(failure_tree, start, end,
+					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+		if (ret >= 0)
+			ret = set_state_private(failure_tree, start,
+						(u64)(unsigned long)failrec);
+		/* set the bits in the inode's tree */
+		if (ret >= 0)
+			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
+						GFP_NOFS);
+		if (ret < 0) {
+			kfree(failrec);
+			return ret;
+		}
+	} else {
+		failrec = (struct io_failure_record *)(unsigned long)private;
+		pr_debug("bio_readpage_error: (found) logical=%llu, "
+			 "start=%llu, len=%llu, validation=%d\n",
+			 failrec->logical, failrec->start, failrec->len,
+			 failrec->in_validation);
+		/*
+		 * when data can be on disk more than twice, add to failrec here
+		 * (e.g. with a list for failed_mirror) to make
+		 * clean_io_failure() clean all those errors at once.
+		 */
+	}
+	num_copies = btrfs_num_copies(
+			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
+			      failrec->logical, failrec->len);
+	if (num_copies == 1) {
+		/*
+		 * we only have a single copy of the data, so don't bother with
+		 * all the retry and error correction code that follows. no
+		 * matter what the error is, it is very likely to persist.
+		 */
+		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
+			 "state=%p, num_copies=%d, next_mirror %d, "
+			 "failed_mirror %d\n", state, num_copies,
+			 failrec->this_mirror, failed_mirror);
+		free_io_failure(inode, failrec, 0);
+		return -EIO;
+	}
+
+	if (!state) {
+		spin_lock(&tree->lock);
+		state = find_first_extent_bit_state(tree, failrec->start,
+						    EXTENT_LOCKED);
+		if (state && state->start != failrec->start)
+			state = NULL;
+		spin_unlock(&tree->lock);
+	}
+
+	/*
+	 * there are two premises:
+	 *	a) deliver good data to the caller
+	 *	b) correct the bad sectors on disk
+	 */
+	if (failed_bio->bi_vcnt > 1) {
+		/*
+		 * to fulfill b), we need to know the exact failing sectors, as
+		 * we don't want to rewrite any more than the failed ones. thus,
+		 * we need separate read requests for the failed bio
+		 *
+		 * if the following BUG_ON triggers, our validation request got
+		 * merged. we need separate requests for our algorithm to work.
+		 */
+		BUG_ON(failrec->in_validation);
+		failrec->in_validation = 1;
+		failrec->this_mirror = failed_mirror;
+		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+	} else {
+		/*
+		 * we're ready to fulfill a) and b) alongside. get a good copy
+		 * of the failed sector and if we succeed, we have setup
+		 * everything for repair_io_failure to do the rest for us.
+		 */
+		if (failrec->in_validation) {
+			BUG_ON(failrec->this_mirror != failed_mirror);
+			failrec->in_validation = 0;
+			failrec->this_mirror = 0;
+		}
+		failrec->failed_mirror = failed_mirror;
+		failrec->this_mirror++;
+		if (failrec->this_mirror == failed_mirror)
+			failrec->this_mirror++;
+		read_mode = READ_SYNC;
+	}
+
+	if (!state || failrec->this_mirror > num_copies) {
+		pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
+			 "next_mirror %d, failed_mirror %d\n", state,
+			 num_copies, failrec->this_mirror, failed_mirror);
+		free_io_failure(inode, failrec, 0);
+		return -EIO;
+	}
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	bio->bi_private = state;
+	bio->bi_end_io = failed_bio->bi_end_io;
+	bio->bi_sector = failrec->logical >> 9;
+	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+	bio->bi_size = 0;
+
+	bio_add_page(bio, page, failrec->len, start - page_offset(page));
+
+	pr_debug("bio_readpage_error: submitting new read[%#x] to "
+		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
+		 failrec->this_mirror, num_copies, failrec->in_validation);
+
+	tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
+					failrec->bio_flags, 0);
+	return 0;
+}
+
 /* lots and lots of room for performance fixes in the end_bio funcs */
 
 /*
@@ -1697,6 +2060,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		struct extent_state *cached = NULL;
 		struct extent_state *state;
 
+		pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
+			 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
+			 (long int)bio->bi_bdev);
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
 
 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1727,11 +2093,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 							      state);
 			if (ret)
 				uptodate = 0;
+			else
+				clean_io_failure(start, page);
 		}
-		if (!uptodate && tree->ops &&
-		    tree->ops->readpage_io_failed_hook) {
-			ret = tree->ops->readpage_io_failed_hook(bio, page,
-							 start, end, NULL);
+		if (!uptodate) {
+			u64 failed_mirror;
+			failed_mirror = (u64)bio->bi_bdev;
+			if (tree->ops && tree->ops->readpage_io_failed_hook)
+				ret = tree->ops->readpage_io_failed_hook(
+						bio, page, start, end,
+						failed_mirror, NULL);
+			else
+				ret = bio_readpage_error(bio, page, start, end,
+							 failed_mirror, NULL);
 			if (ret == 0) {
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -1811,6 +2185,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
 					   mirror_num, bio_flags, start);
 	else
 		submit_bio(rw, bio);
+
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
 		ret = -EOPNOTSUPP;
 	bio_put(bio);
@@ -2926,7 +3301,7 @@ out:
 	return ret;
 }
 
-static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+inline struct page *extent_buffer_page(struct extent_buffer *eb,
 					      unsigned long i)
 {
 	struct page *p;
@@ -2951,7 +3326,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 	return p;
 }
 
-static inline unsigned long num_extent_pages(u64 start, u64 len)
+inline unsigned long num_extent_pages(u64 start, u64 len)
 {
 	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
 		(start >> PAGE_CACHE_SHIFT);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 435d454b9926..a8e20b672922 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -68,7 +68,7 @@ struct extent_io_ops {
 			      unsigned long bio_flags);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
-				       u64 start, u64 end,
+				       u64 start, u64 end, u64 failed_mirror,
 				       struct extent_state *state);
 	int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
 					u64 start, u64 end,
@@ -252,6 +252,8 @@ void free_extent_buffer(struct extent_buffer *eb);
 int read_extent_buffer_pages(struct extent_io_tree *tree,
 			     struct extent_buffer *eb, u64 start, int wait,
 			     get_extent_t *get_extent, int mirror_num);
+unsigned long num_extent_pages(u64 start, u64 len);
+struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
 
 static inline void extent_buffer_get(struct extent_buffer *eb)
 {
@@ -301,4 +303,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 struct bio *
 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 		gfp_t gfp_flags);
+
+struct btrfs_mapping_tree;
+
+int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+			u64 length, u64 logical, struct page *page,
+			int mirror_num);
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 936a6fabaa9f..9327f45434e8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -45,10 +45,10 @@
 #include "btrfs_inode.h"
 #include "ioctl.h"
 #include "print-tree.h"
-#include "volumes.h"
 #include "ordered-data.h"
 #include "xattr.h"
 #include "tree-log.h"
+#include "volumes.h"
 #include "compression.h"
 #include "locking.h"
 #include "free-space-cache.h"
@@ -1818,154 +1818,10 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 	return btrfs_finish_ordered_io(page->mapping->host, start, end);
 }
 
-/*
- * When IO fails, either with EIO or csum verification fails, we
- * try other mirrors that might have a good copy of the data.  This
- * io_failure_record is used to record state as we go through all the
- * mirrors.  If another mirror has good data, the page is set up to date
- * and things continue.  If a good mirror can't be found, the original
- * bio end_io callback is called to indicate things have failed.
- */
-struct io_failure_record {
-	struct page *page;
-	u64 start;
-	u64 len;
-	u64 logical;
-	unsigned long bio_flags;
-	int last_mirror;
-};
-
-static int btrfs_io_failed_hook(struct bio *failed_bio,
-			 struct page *page, u64 start, u64 end,
-			 struct extent_state *state)
-{
-	struct io_failure_record *failrec = NULL;
-	u64 private;
-	struct extent_map *em;
-	struct inode *inode = page->mapping->host;
-	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	struct bio *bio;
-	int num_copies;
-	int ret;
-	int rw;
-	u64 logical;
-
-	ret = get_state_private(failure_tree, start, &private);
-	if (ret) {
-		failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
-		if (!failrec)
-			return -ENOMEM;
-		failrec->start = start;
-		failrec->len = end - start + 1;
-		failrec->last_mirror = 0;
-		failrec->bio_flags = 0;
-
-		read_lock(&em_tree->lock);
-		em = lookup_extent_mapping(em_tree, start, failrec->len);
-		if (em->start > start || em->start + em->len < start) {
-			free_extent_map(em);
-			em = NULL;
-		}
-		read_unlock(&em_tree->lock);
-
-		if (IS_ERR_OR_NULL(em)) {
-			kfree(failrec);
-			return -EIO;
-		}
-		logical = start - em->start;
-		logical = em->block_start + logical;
-		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
-			logical = em->block_start;
-			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
-			extent_set_compress_type(&failrec->bio_flags,
-						 em->compress_type);
-		}
-		failrec->logical = logical;
-		free_extent_map(em);
-		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
-				EXTENT_DIRTY, GFP_NOFS);
-		set_state_private(failure_tree, start,
-				 (u64)(unsigned long)failrec);
-	} else {
-		failrec = (struct io_failure_record *)(unsigned long)private;
-	}
-	num_copies = btrfs_num_copies(
-			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
-			      failrec->logical, failrec->len);
-	failrec->last_mirror++;
-	if (!state) {
-		spin_lock(&BTRFS_I(inode)->io_tree.lock);
-		state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
-						    failrec->start,
-						    EXTENT_LOCKED);
-		if (state && state->start != failrec->start)
-			state = NULL;
-		spin_unlock(&BTRFS_I(inode)->io_tree.lock);
-	}
-	if (!state || failrec->last_mirror > num_copies) {
-		set_state_private(failure_tree, failrec->start, 0);
-		clear_extent_bits(failure_tree, failrec->start,
-				  failrec->start + failrec->len - 1,
-				  EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
-		kfree(failrec);
-		return -EIO;
-	}
-	bio = bio_alloc(GFP_NOFS, 1);
-	bio->bi_private = state;
-	bio->bi_end_io = failed_bio->bi_end_io;
-	bio->bi_sector = failrec->logical >> 9;
-	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
-	bio->bi_size = 0;
-
-	bio_add_page(bio, page, failrec->len, start - page_offset(page));
-	if (failed_bio->bi_rw & REQ_WRITE)
-		rw = WRITE;
-	else
-		rw = READ;
-
-	ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
-						      failrec->last_mirror,
-						      failrec->bio_flags, 0);
-	return ret;
-}
-
-/*
- * each time an IO finishes, we do a fast check in the IO failure tree
- * to see if we need to process or clean up an io_failure_record
- */
-static int btrfs_clean_io_failures(struct inode *inode, u64 start)
-{
-	u64 private;
-	u64 private_failure;
-	struct io_failure_record *failure;
-	int ret;
-
-	private = 0;
-	if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
-			     (u64)-1, 1, EXTENT_DIRTY, 0)) {
-		ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
-					start, &private_failure);
-		if (ret == 0) {
-			failure = (struct io_failure_record *)(unsigned long)
-				   private_failure;
-			set_state_private(&BTRFS_I(inode)->io_failure_tree,
-					  failure->start, 0);
-			clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
-					  failure->start,
-					  failure->start + failure->len - 1,
-					  EXTENT_DIRTY | EXTENT_LOCKED,
-					  GFP_NOFS);
-			kfree(failure);
-		}
-	}
-	return 0;
-}
-
 /*
  * when reads are done, we need to check csums to verify the data is correct
- * if there's a match, we allow the bio to finish.  If not, we go through
- * the io_failure_record routines to find good copies
+ * if there's a match, we allow the bio to finish.  If not, the code in
+ * extent_io.c will try to find good copies for us.
  */
 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 			       struct extent_state *state)
@@ -2011,10 +1867,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 
 	kunmap_atomic(kaddr, KM_USER0);
 good:
-	/* if the io failure tree for this inode is non-empty,
-	 * check to see if we've recovered from a failed IO
-	 */
-	btrfs_clean_io_failures(inode, start);
 	return 0;
 
 zeroit:
@@ -7420,7 +7272,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
 	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
 	.writepage_start_hook = btrfs_writepage_start_hook,
-	.readpage_io_failed_hook = btrfs_io_failed_hook,
 	.set_bit_hook = btrfs_set_bit_hook,
 	.clear_bit_hook = btrfs_clear_bit_hook,
 	.merge_extent_hook = btrfs_merge_extent_hook,
-- 
cgit v1.2.3


From 5da6fcbc4eb50c0f55d520750332f5a6ab13508c Mon Sep 17 00:00:00 2001
From: Jan Schmidt <list.btrfs@jan-o-sch.net>
Date: Thu, 4 Aug 2011 18:11:04 +0200
Subject: btrfs: integrating raid-repair and scrub-fixup-nodatasum

This ties nodatasum fixup in scrub together with raid repair patches. While
both series are working fine alone, scrub will report uncorrectable errors
if they occur in a nodatasum extent *and* the page is in the page cache.

Previously, we would have triggered readpage to find good data and do the
repair. However, readpage wouldn't read anything in the case where the page
is up to date in the cache. So, we simply take that good data we have and
call repair_io_failure directly (unless the page in the cache is dirty).

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
---
 fs/btrfs/scrub.c | 92 +++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 67 insertions(+), 25 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 97142a218f0a..eba42e5fd5fd 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -24,6 +24,7 @@
 #include "ordered-data.h"
 #include "transaction.h"
 #include "backref.h"
+#include "extent_io.h"
 
 /*
  * This is only the first step towards a full-features scrub. It reads all
@@ -360,13 +361,13 @@ out:
 
 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
 {
-	struct page *page;
+	struct page *page = NULL;
 	unsigned long index;
 	struct scrub_fixup_nodatasum *fixup = ctx;
 	int ret;
-	int corrected;
+	int corrected = 0;
 	struct btrfs_key key;
-	struct inode *inode;
+	struct inode *inode = NULL;
 	u64 end = offset + PAGE_SIZE - 1;
 	struct btrfs_root *local_root;
 
@@ -384,34 +385,75 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *ctx)
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 
-	ret = set_extent_bit(&BTRFS_I(inode)->io_tree, offset, end,
-				EXTENT_DAMAGED, 0, NULL, NULL, GFP_NOFS);
-
-	/* set_extent_bit should either succeed or give proper error */
-	WARN_ON(ret > 0);
-	if (ret)
-		return ret < 0 ? ret : -EFAULT;
-
 	index = offset >> PAGE_CACHE_SHIFT;
 
 	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
-	if (!page)
-		return -ENOMEM;
+	if (!page) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (PageUptodate(page)) {
+		struct btrfs_mapping_tree *map_tree;
+		if (PageDirty(page)) {
+			/*
+			 * we need to write the data to the defect sector. the
+			 * data that was in that sector is not in memory,
+			 * because the page was modified. we must not write the
+			 * modified page to that sector.
+			 *
+			 * TODO: what could be done here: wait for the delalloc
+			 *       runner to write out that page (might involve
+			 *       COW) and see whether the sector is still
+			 *       referenced afterwards.
+			 *
+			 * For the meantime, we'll treat this error
+			 * incorrectable, although there is a chance that a
+			 * later scrub will find the bad sector again and that
+			 * there's no dirty page in memory, then.
+			 */
+			ret = -EIO;
+			goto out;
+		}
+		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
+		ret = repair_io_failure(map_tree, offset, PAGE_SIZE,
+					fixup->logical, page,
+					fixup->mirror_num);
+		unlock_page(page);
+		corrected = !ret;
+	} else {
+		/*
+		 * we need to get good data first. the general readpage path
+		 * will call repair_io_failure for us, we just have to make
+		 * sure we read the bad mirror.
+		 */
+		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
+					EXTENT_DAMAGED, GFP_NOFS);
+		if (ret) {
+			/* set_extent_bits should give proper error */
+			WARN_ON(ret > 0);
+			if (ret > 0)
+				ret = -EFAULT;
+			goto out;
+		}
 
-	ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
-					btrfs_get_extent, fixup->mirror_num);
-	wait_on_page_locked(page);
-	corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset, end,
-					EXTENT_DAMAGED, 0, NULL);
+		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
+						btrfs_get_extent,
+						fixup->mirror_num);
+		wait_on_page_locked(page);
 
-	if (corrected)
-		WARN_ON(!PageUptodate(page));
-	else
-		clear_extent_bit(&BTRFS_I(inode)->io_tree, offset, end,
-					EXTENT_DAMAGED, 0, 0, NULL, GFP_NOFS);
+		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
+						end, EXTENT_DAMAGED, 0, NULL);
+		if (!corrected)
+			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
+						EXTENT_DAMAGED, GFP_NOFS);
+	}
 
-	put_page(page);
-	iput(inode);
+out:
+	if (page)
+		put_page(page);
+	if (inode)
+		iput(inode);
 
 	if (ret < 0)
 		return ret;
-- 
cgit v1.2.3


From bb82ab88dfdb12948af58989c75bfe904bc1b09d Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Fri, 10 Jun 2011 14:06:53 +0200
Subject: btrfs: add an extra wait mode to read_extent_buffer_pages

read_extent_buffer_pages currently has two modes, either trigger a read
without waiting for anything, or wait for the I/O to finish. The former
also bails when it's unable to lock the page. This patch now adds an
additional parameter to allow it to block on page lock, but don't wait
for completion.

Changes v5:
 - merge the 2 wait parameters into one and define WAIT_NONE, WAIT_COMPLETE and
   WAIT_PAGE_LOCK

Change v6:
 - fix bug introduced in v5

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/disk-io.c   | 5 +++--
 fs/btrfs/extent_io.c | 7 +++----
 fs/btrfs/extent_io.h | 3 +++
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07b3ac662e19..e6170e142cad 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -367,7 +367,8 @@ static int btree_read_extent_buffer_pages(struct btrfs_root *root,
 	clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
 	io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree;
 	while (1) {
-		ret = read_extent_buffer_pages(io_tree, eb, start, 1,
+		ret = read_extent_buffer_pages(io_tree, eb, start,
+					       WAIT_COMPLETE,
 					       btree_get_extent, mirror_num);
 		if (!ret &&
 		    !verify_parent_transid(io_tree, eb, parent_transid))
@@ -974,7 +975,7 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 	if (!buf)
 		return 0;
 	read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree,
-				 buf, 0, 0, btree_get_extent, 0);
+				 buf, 0, WAIT_NONE, btree_get_extent, 0);
 	free_extent_buffer(buf);
 	return ret;
 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d418164a35f1..823028e73cf8 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3349,8 +3349,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 }
 
 int read_extent_buffer_pages(struct extent_io_tree *tree,
-			     struct extent_buffer *eb,
-			     u64 start, int wait,
+			     struct extent_buffer *eb, u64 start, int wait,
 			     get_extent_t *get_extent, int mirror_num)
 {
 	unsigned long i;
@@ -3386,7 +3385,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
-		if (!wait) {
+		if (wait == WAIT_NONE) {
 			if (!trylock_page(page))
 				goto unlock_exit;
 		} else {
@@ -3430,7 +3429,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	if (bio)
 		submit_one_bio(READ, bio, mirror_num, bio_flags);
 
-	if (ret || !wait)
+	if (ret || wait != WAIT_COMPLETE)
 		return ret;
 
 	for (i = start_i; i < num_pages; i++) {
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7b2f0c3e7929..6d74c6b34691 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -248,6 +248,9 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree,
 struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree,
 					 u64 start, unsigned long len);
 void free_extent_buffer(struct extent_buffer *eb);
+#define WAIT_NONE	0
+#define WAIT_COMPLETE	1
+#define WAIT_PAGE_LOCK	2
 int read_extent_buffer_pages(struct extent_io_tree *tree,
 			     struct extent_buffer *eb, u64 start, int wait,
 			     get_extent_t *get_extent, int mirror_num);
-- 
cgit v1.2.3


From ab0fff03055d2d1b01a7581badeba18db9c4f55c Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Mon, 23 May 2011 14:25:41 +0200
Subject: btrfs: add READAHEAD extent buffer flag

Add a READAHEAD extent buffer flag.
Add a function to trigger a read with this flag set.

Changes v2:
 - use extent buffer flags instead of extent state flags

Changes v5:
 - adapt to changed read_extent_buffer_pages interface
 - don't return eb from reada_tree_block_flagged if it has CORRUPT flag set

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/disk-io.c   | 32 ++++++++++++++++++++++++++++++++
 fs/btrfs/disk-io.h   |  2 ++
 fs/btrfs/extent_io.h |  1 +
 3 files changed, 35 insertions(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e6170e142cad..1220d04072c8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -980,6 +980,38 @@ int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 	return ret;
 }
 
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+			 int mirror_num, struct extent_buffer **eb)
+{
+	struct extent_buffer *buf = NULL;
+	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct extent_io_tree *io_tree = &BTRFS_I(btree_inode)->io_tree;
+	int ret;
+
+	buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
+	if (!buf)
+		return 0;
+
+	set_bit(EXTENT_BUFFER_READAHEAD, &buf->bflags);
+
+	ret = read_extent_buffer_pages(io_tree, buf, 0, WAIT_PAGE_LOCK,
+				       btree_get_extent, mirror_num);
+	if (ret) {
+		free_extent_buffer(buf);
+		return ret;
+	}
+
+	if (test_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags)) {
+		free_extent_buffer(buf);
+		return -EIO;
+	} else if (extent_buffer_uptodate(io_tree, buf, NULL)) {
+		*eb = buf;
+	} else {
+		free_extent_buffer(buf);
+	}
+	return 0;
+}
+
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize)
 {
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bec3ea4bd67f..b3bdb5c1390f 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -40,6 +40,8 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 				      u32 blocksize, u64 parent_transid);
 int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize,
 			 u64 parent_transid);
+int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize,
+			 int mirror_num, struct extent_buffer **eb);
 struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root,
 						   u64 bytenr, u32 blocksize);
 int clean_tree_block(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 6d74c6b34691..fcaf49bcb880 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -32,6 +32,7 @@
 #define EXTENT_BUFFER_BLOCKING 1
 #define EXTENT_BUFFER_DIRTY 2
 #define EXTENT_BUFFER_CORRUPT 3
+#define EXTENT_BUFFER_READAHEAD 4	/* this got triggered by readahead */
 
 /* these are flags for extent_clear_unlock_delalloc */
 #define EXTENT_CLEAR_UNLOCK_PAGE 0x1
-- 
cgit v1.2.3


From 90519d66abbccc251d14719ac76f191f70826e40 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Mon, 23 May 2011 14:30:00 +0200
Subject: btrfs: state information for readahead

Add state information for readahead to btrfs_fs_info and btrfs_device

Changes v2:
 - don't wait in radix_trees
 - add own set of workers for readahead

Reviewed-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/ctree.h   |  5 +++++
 fs/btrfs/disk-io.c | 10 ++++++++++
 fs/btrfs/volumes.c |  8 ++++++++
 fs/btrfs/volumes.h |  8 ++++++++
 4 files changed, 31 insertions(+)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03912c5c6f49..f71fd24cc152 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -1036,6 +1036,7 @@ struct btrfs_fs_info {
 	struct btrfs_workers endio_freespace_worker;
 	struct btrfs_workers submit_workers;
 	struct btrfs_workers caching_workers;
+	struct btrfs_workers readahead_workers;
 
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
@@ -1119,6 +1120,10 @@ struct btrfs_fs_info {
 	u64 fs_state;
 
 	struct btrfs_delayed_root *delayed_root;
+
+	/* readahead tree */
+	spinlock_t reada_lock;
+	struct radix_tree_root reada_tree;
 };
 
 /*
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 1220d04072c8..fd28c40e9ec3 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1711,6 +1711,10 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->defrag_inodes = RB_ROOT;
 	fs_info->trans_no_join = 0;
 
+	/* readahead state */
+	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
+	spin_lock_init(&fs_info->reada_lock);
+
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
 
@@ -1903,6 +1907,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_workers(&fs_info->delayed_workers, "delayed-meta",
 			   fs_info->thread_pool_size,
 			   &fs_info->generic_worker);
+	btrfs_init_workers(&fs_info->readahead_workers, "readahead",
+			   fs_info->thread_pool_size,
+			   &fs_info->generic_worker);
 
 	/*
 	 * endios are largely parallel and should have a very
@@ -1913,6 +1920,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	fs_info->endio_write_workers.idle_thresh = 2;
 	fs_info->endio_meta_write_workers.idle_thresh = 2;
+	fs_info->readahead_workers.idle_thresh = 2;
 
 	btrfs_start_workers(&fs_info->workers, 1);
 	btrfs_start_workers(&fs_info->generic_worker, 1);
@@ -1926,6 +1934,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_start_workers(&fs_info->endio_freespace_worker, 1);
 	btrfs_start_workers(&fs_info->delayed_workers, 1);
 	btrfs_start_workers(&fs_info->caching_workers, 1);
+	btrfs_start_workers(&fs_info->readahead_workers, 1);
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
@@ -2650,6 +2659,7 @@ int close_ctree(struct btrfs_root *root)
 	btrfs_stop_workers(&fs_info->submit_workers);
 	btrfs_stop_workers(&fs_info->delayed_workers);
 	btrfs_stop_workers(&fs_info->caching_workers);
+	btrfs_stop_workers(&fs_info->readahead_workers);
 
 	btrfs_close_devices(fs_info->fs_devices);
 	btrfs_mapping_tree_free(&fs_info->mapping_tree);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f2a4cc79da61..1dccce5bc93d 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -366,6 +366,14 @@ static noinline int device_list_add(const char *path,
 		}
 		INIT_LIST_HEAD(&device->dev_alloc_list);
 
+		/* init readahead state */
+		spin_lock_init(&device->reada_lock);
+		device->reada_curr_zone = NULL;
+		atomic_set(&device->reada_in_flight, 0);
+		device->reada_next = 0;
+		INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
+		INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+
 		mutex_lock(&fs_devices->device_list_mutex);
 		list_add_rcu(&device->dev_list, &fs_devices->devices);
 		mutex_unlock(&fs_devices->device_list_mutex);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 6d866db4e177..2a751246188a 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -92,6 +92,14 @@ struct btrfs_device {
 	struct btrfs_work work;
 	struct rcu_head rcu;
 	struct work_struct rcu_work;
+
+	/* readahead state */
+	spinlock_t reada_lock;
+	atomic_t reada_in_flight;
+	u64 reada_next;
+	struct reada_zone *reada_curr_zone;
+	struct radix_tree_root reada_zones;
+	struct radix_tree_root reada_extents;
 };
 
 struct btrfs_fs_devices {
-- 
cgit v1.2.3


From 7414a03fbf9e75fbbf2a3c16828cd862e572aa44 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Mon, 23 May 2011 14:33:49 +0200
Subject: btrfs: initial readahead code and prototypes

This is the implementation for the generic read ahead framework.

To trigger a readahead, btrfs_reada_add must be called. It will start
a read ahead for the given range [start, end) on tree root. The returned
handle can either be used to wait on the readahead to finish
(btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).

The read ahead works as follows:
On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
reada_start_machine will then search for extents to prefetch and trigger
some reads. When a read finishes for a node, all contained node/leaf
pointers that lie in the given range will also be enqueued. The reads will
be triggered in sequential order, thus giving a big win over a naive
enumeration. It will also make use of multi-device layouts. Each disk
will have its on read pointer and all disks will by utilized in parallel.
Also will no two disks read both sides of a mirror simultaneously, as this
would waste seeking capacity. Instead both disks will read different parts
of the filesystem.
Any number of readaheads can be started in parallel. The read order will be
determined globally, i.e. 2 parallel readaheads will normally finish faster
than the 2 started one after another.

Changes v2:
 - protect root->node by transaction instead of node_lock
 - fix missed branches:
    The readahead had a too simple check to determine if a branch from
    a node should be checked or not. It now also records the upper bound
    of each node to see if the requested RA range lies within.
 - use KERN_CONT to debug output, to avoid line breaks
 - defer reada_start_machine to worker to avoid deadlock

Changes v3:
 - protect root->node by rcu

Changes v5:
 - changed EIO-semantics of reada_tree_block_flagged
 - remove spin_lock from reada_control and make elems an atomic_t
 - remove unused read_total from reada_control
 - kill reada_key_cmp, use btrfs_comp_cpu_keys instead
 - use kref-style release functions where possible
 - return struct reada_control * instead of void * from btrfs_reada_add

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/Makefile |   3 +-
 fs/btrfs/ctree.h  |  16 +
 fs/btrfs/reada.c  | 949 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 967 insertions(+), 1 deletion(-)
 create mode 100644 fs/btrfs/reada.c

diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 40e6ac08c21f..bdd6fb238ce1 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,6 +7,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
-	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o
+	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
+	   reada.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f71fd24cc152..370af767440d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2702,4 +2702,20 @@ int btrfs_scrub_cancel_devid(struct btrfs_root *root, u64 devid);
 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 			 struct btrfs_scrub_progress *progress);
 
+/* reada.c */
+struct reada_control {
+	struct btrfs_root	*root;		/* tree to prefetch */
+	struct btrfs_key	key_start;
+	struct btrfs_key	key_end;	/* exclusive */
+	atomic_t		elems;
+	struct kref		refcnt;
+	wait_queue_head_t	wait;
+};
+struct reada_control *btrfs_reada_add(struct btrfs_root *root,
+			      struct btrfs_key *start, struct btrfs_key *end);
+int btrfs_reada_wait(void *handle);
+void btrfs_reada_detach(void *handle);
+int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+			 u64 start, int err);
+
 #endif
diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
new file mode 100644
index 000000000000..2b701d082227
--- /dev/null
+++ b/fs/btrfs/reada.c
@@ -0,0 +1,949 @@
+/*
+ * Copyright (C) 2011 STRATO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h>
+#include <linux/blkdev.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/workqueue.h>
+#include "ctree.h"
+#include "volumes.h"
+#include "disk-io.h"
+#include "transaction.h"
+
+#undef DEBUG
+
+/*
+ * This is the implementation for the generic read ahead framework.
+ *
+ * To trigger a readahead, btrfs_reada_add must be called. It will start
+ * a read ahead for the given range [start, end) on tree root. The returned
+ * handle can either be used to wait on the readahead to finish
+ * (btrfs_reada_wait), or to send it to the background (btrfs_reada_detach).
+ *
+ * The read ahead works as follows:
+ * On btrfs_reada_add, the root of the tree is inserted into a radix_tree.
+ * reada_start_machine will then search for extents to prefetch and trigger
+ * some reads. When a read finishes for a node, all contained node/leaf
+ * pointers that lie in the given range will also be enqueued. The reads will
+ * be triggered in sequential order, thus giving a big win over a naive
+ * enumeration. It will also make use of multi-device layouts. Each disk
+ * will have its on read pointer and all disks will by utilized in parallel.
+ * Also will no two disks read both sides of a mirror simultaneously, as this
+ * would waste seeking capacity. Instead both disks will read different parts
+ * of the filesystem.
+ * Any number of readaheads can be started in parallel. The read order will be
+ * determined globally, i.e. 2 parallel readaheads will normally finish faster
+ * than the 2 started one after another.
+ */
+
+#define MAX_MIRRORS 2
+#define MAX_IN_FLIGHT 6
+
+struct reada_extctl {
+	struct list_head	list;
+	struct reada_control	*rc;
+	u64			generation;
+};
+
+struct reada_extent {
+	u64			logical;
+	struct btrfs_key	top;
+	u32			blocksize;
+	int			err;
+	struct list_head	extctl;
+	struct kref		refcnt;
+	spinlock_t		lock;
+	struct reada_zone	*zones[MAX_MIRRORS];
+	int			nzones;
+	struct btrfs_device	*scheduled_for;
+};
+
+struct reada_zone {
+	u64			start;
+	u64			end;
+	u64			elems;
+	struct list_head	list;
+	spinlock_t		lock;
+	int			locked;
+	struct btrfs_device	*device;
+	struct btrfs_device	*devs[MAX_MIRRORS]; /* full list, incl self */
+	int			ndevs;
+	struct kref		refcnt;
+};
+
+struct reada_machine_work {
+	struct btrfs_work	work;
+	struct btrfs_fs_info	*fs_info;
+};
+
+static void reada_extent_put(struct btrfs_fs_info *, struct reada_extent *);
+static void reada_control_release(struct kref *kref);
+static void reada_zone_release(struct kref *kref);
+static void reada_start_machine(struct btrfs_fs_info *fs_info);
+static void __reada_start_machine(struct btrfs_fs_info *fs_info);
+
+static int reada_add_block(struct reada_control *rc, u64 logical,
+			   struct btrfs_key *top, int level, u64 generation);
+
+/* recurses */
+/* in case of err, eb might be NULL */
+static int __readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+			    u64 start, int err)
+{
+	int level = 0;
+	int nritems;
+	int i;
+	u64 bytenr;
+	u64 generation;
+	struct reada_extent *re;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct list_head list;
+	unsigned long index = start >> PAGE_CACHE_SHIFT;
+	struct btrfs_device *for_dev;
+
+	if (eb)
+		level = btrfs_header_level(eb);
+
+	/* find extent */
+	spin_lock(&fs_info->reada_lock);
+	re = radix_tree_lookup(&fs_info->reada_tree, index);
+	if (re)
+		kref_get(&re->refcnt);
+	spin_unlock(&fs_info->reada_lock);
+
+	if (!re)
+		return -1;
+
+	spin_lock(&re->lock);
+	/*
+	 * just take the full list from the extent. afterwards we
+	 * don't need the lock anymore
+	 */
+	list_replace_init(&re->extctl, &list);
+	for_dev = re->scheduled_for;
+	re->scheduled_for = NULL;
+	spin_unlock(&re->lock);
+
+	if (err == 0) {
+		nritems = level ? btrfs_header_nritems(eb) : 0;
+		generation = btrfs_header_generation(eb);
+		/*
+		 * FIXME: currently we just set nritems to 0 if this is a leaf,
+		 * effectively ignoring the content. In a next step we could
+		 * trigger more readahead depending from the content, e.g.
+		 * fetch the checksums for the extents in the leaf.
+		 */
+	} else {
+		/*
+		 * this is the error case, the extent buffer has not been
+		 * read correctly. We won't access anything from it and
+		 * just cleanup our data structures. Effectively this will
+		 * cut the branch below this node from read ahead.
+		 */
+		nritems = 0;
+		generation = 0;
+	}
+
+	for (i = 0; i < nritems; i++) {
+		struct reada_extctl *rec;
+		u64 n_gen;
+		struct btrfs_key key;
+		struct btrfs_key next_key;
+
+		btrfs_node_key_to_cpu(eb, &key, i);
+		if (i + 1 < nritems)
+			btrfs_node_key_to_cpu(eb, &next_key, i + 1);
+		else
+			next_key = re->top;
+		bytenr = btrfs_node_blockptr(eb, i);
+		n_gen = btrfs_node_ptr_generation(eb, i);
+
+		list_for_each_entry(rec, &list, list) {
+			struct reada_control *rc = rec->rc;
+
+			/*
+			 * if the generation doesn't match, just ignore this
+			 * extctl. This will probably cut off a branch from
+			 * prefetch. Alternatively one could start a new (sub-)
+			 * prefetch for this branch, starting again from root.
+			 * FIXME: move the generation check out of this loop
+			 */
+#ifdef DEBUG
+			if (rec->generation != generation) {
+				printk(KERN_DEBUG "generation mismatch for "
+						"(%llu,%d,%llu) %llu != %llu\n",
+				       key.objectid, key.type, key.offset,
+				       rec->generation, generation);
+			}
+#endif
+			if (rec->generation == generation &&
+			    btrfs_comp_cpu_keys(&key, &rc->key_end) < 0 &&
+			    btrfs_comp_cpu_keys(&next_key, &rc->key_start) > 0)
+				reada_add_block(rc, bytenr, &next_key,
+						level - 1, n_gen);
+		}
+	}
+	/*
+	 * free extctl records
+	 */
+	while (!list_empty(&list)) {
+		struct reada_control *rc;
+		struct reada_extctl *rec;
+
+		rec = list_first_entry(&list, struct reada_extctl, list);
+		list_del(&rec->list);
+		rc = rec->rc;
+		kfree(rec);
+
+		kref_get(&rc->refcnt);
+		if (atomic_dec_and_test(&rc->elems)) {
+			kref_put(&rc->refcnt, reada_control_release);
+			wake_up(&rc->wait);
+		}
+		kref_put(&rc->refcnt, reada_control_release);
+
+		reada_extent_put(fs_info, re);	/* one ref for each entry */
+	}
+	reada_extent_put(fs_info, re);	/* our ref */
+	if (for_dev)
+		atomic_dec(&for_dev->reada_in_flight);
+
+	return 0;
+}
+
+/*
+ * start is passed separately in case eb in NULL, which may be the case with
+ * failed I/O
+ */
+int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
+			 u64 start, int err)
+{
+	int ret;
+
+	ret = __readahead_hook(root, eb, start, err);
+
+	reada_start_machine(root->fs_info);
+
+	return ret;
+}
+
+static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
+					  struct btrfs_device *dev, u64 logical,
+					  struct btrfs_multi_bio *multi)
+{
+	int ret;
+	int looped = 0;
+	struct reada_zone *zone;
+	struct btrfs_block_group_cache *cache = NULL;
+	u64 start;
+	u64 end;
+	int i;
+
+again:
+	zone = NULL;
+	spin_lock(&fs_info->reada_lock);
+	ret = radix_tree_gang_lookup(&dev->reada_zones, (void **)&zone,
+				     logical >> PAGE_CACHE_SHIFT, 1);
+	if (ret == 1)
+		kref_get(&zone->refcnt);
+	spin_unlock(&fs_info->reada_lock);
+
+	if (ret == 1) {
+		if (logical >= zone->start && logical < zone->end)
+			return zone;
+		spin_lock(&fs_info->reada_lock);
+		kref_put(&zone->refcnt, reada_zone_release);
+		spin_unlock(&fs_info->reada_lock);
+	}
+
+	if (looped)
+		return NULL;
+
+	cache = btrfs_lookup_block_group(fs_info, logical);
+	if (!cache)
+		return NULL;
+
+	start = cache->key.objectid;
+	end = start + cache->key.offset - 1;
+	btrfs_put_block_group(cache);
+
+	zone = kzalloc(sizeof(*zone), GFP_NOFS);
+	if (!zone)
+		return NULL;
+
+	zone->start = start;
+	zone->end = end;
+	INIT_LIST_HEAD(&zone->list);
+	spin_lock_init(&zone->lock);
+	zone->locked = 0;
+	kref_init(&zone->refcnt);
+	zone->elems = 0;
+	zone->device = dev; /* our device always sits at index 0 */
+	for (i = 0; i < multi->num_stripes; ++i) {
+		/* bounds have already been checked */
+		zone->devs[i] = multi->stripes[i].dev;
+	}
+	zone->ndevs = multi->num_stripes;
+
+	spin_lock(&fs_info->reada_lock);
+	ret = radix_tree_insert(&dev->reada_zones,
+				(unsigned long)zone->end >> PAGE_CACHE_SHIFT,
+				zone);
+	spin_unlock(&fs_info->reada_lock);
+
+	if (ret) {
+		kfree(zone);
+		looped = 1;
+		goto again;
+	}
+
+	return zone;
+}
+
+static struct reada_extent *reada_find_extent(struct btrfs_root *root,
+					      u64 logical,
+					      struct btrfs_key *top, int level)
+{
+	int ret;
+	int looped = 0;
+	struct reada_extent *re = NULL;
+	struct btrfs_fs_info *fs_info = root->fs_info;
+	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
+	struct btrfs_multi_bio *multi = NULL;
+	struct btrfs_device *dev;
+	u32 blocksize;
+	u64 length;
+	int nzones = 0;
+	int i;
+	unsigned long index = logical >> PAGE_CACHE_SHIFT;
+
+again:
+	spin_lock(&fs_info->reada_lock);
+	re = radix_tree_lookup(&fs_info->reada_tree, index);
+	if (re)
+		kref_get(&re->refcnt);
+	spin_unlock(&fs_info->reada_lock);
+
+	if (re || looped)
+		return re;
+
+	re = kzalloc(sizeof(*re), GFP_NOFS);
+	if (!re)
+		return NULL;
+
+	blocksize = btrfs_level_size(root, level);
+	re->logical = logical;
+	re->blocksize = blocksize;
+	re->top = *top;
+	INIT_LIST_HEAD(&re->extctl);
+	spin_lock_init(&re->lock);
+	kref_init(&re->refcnt);
+
+	/*
+	 * map block
+	 */
+	length = blocksize;
+	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &multi, 0);
+	if (ret || !multi || length < blocksize)
+		goto error;
+
+	if (multi->num_stripes > MAX_MIRRORS) {
+		printk(KERN_ERR "btrfs readahead: more than %d copies not "
+				"supported", MAX_MIRRORS);
+		goto error;
+	}
+
+	for (nzones = 0; nzones < multi->num_stripes; ++nzones) {
+		struct reada_zone *zone;
+
+		dev = multi->stripes[nzones].dev;
+		zone = reada_find_zone(fs_info, dev, logical, multi);
+		if (!zone)
+			break;
+
+		re->zones[nzones] = zone;
+		spin_lock(&zone->lock);
+		if (!zone->elems)
+			kref_get(&zone->refcnt);
+		++zone->elems;
+		spin_unlock(&zone->lock);
+		spin_lock(&fs_info->reada_lock);
+		kref_put(&zone->refcnt, reada_zone_release);
+		spin_unlock(&fs_info->reada_lock);
+	}
+	re->nzones = nzones;
+	if (nzones == 0) {
+		/* not a single zone found, error and out */
+		goto error;
+	}
+
+	/* insert extent in reada_tree + all per-device trees, all or nothing */
+	spin_lock(&fs_info->reada_lock);
+	ret = radix_tree_insert(&fs_info->reada_tree, index, re);
+	if (ret) {
+		spin_unlock(&fs_info->reada_lock);
+		if (ret != -ENOMEM) {
+			/* someone inserted the extent in the meantime */
+			looped = 1;
+		}
+		goto error;
+	}
+	for (i = 0; i < nzones; ++i) {
+		dev = multi->stripes[i].dev;
+		ret = radix_tree_insert(&dev->reada_extents, index, re);
+		if (ret) {
+			while (--i >= 0) {
+				dev = multi->stripes[i].dev;
+				BUG_ON(dev == NULL);
+				radix_tree_delete(&dev->reada_extents, index);
+			}
+			BUG_ON(fs_info == NULL);
+			radix_tree_delete(&fs_info->reada_tree, index);
+			spin_unlock(&fs_info->reada_lock);
+			goto error;
+		}
+	}
+	spin_unlock(&fs_info->reada_lock);
+
+	return re;
+
+error:
+	while (nzones) {
+		struct reada_zone *zone;
+
+		--nzones;
+		zone = re->zones[nzones];
+		kref_get(&zone->refcnt);
+		spin_lock(&zone->lock);
+		--zone->elems;
+		if (zone->elems == 0) {
+			/*
+			 * no fs_info->reada_lock needed, as this can't be
+			 * the last ref
+			 */
+			kref_put(&zone->refcnt, reada_zone_release);
+		}
+		spin_unlock(&zone->lock);
+
+		spin_lock(&fs_info->reada_lock);
+		kref_put(&zone->refcnt, reada_zone_release);
+		spin_unlock(&fs_info->reada_lock);
+	}
+	kfree(re);
+	if (looped)
+		goto again;
+	return NULL;
+}
+
+static void reada_kref_dummy(struct kref *kr)
+{
+}
+
+static void reada_extent_put(struct btrfs_fs_info *fs_info,
+			     struct reada_extent *re)
+{
+	int i;
+	unsigned long index = re->logical >> PAGE_CACHE_SHIFT;
+
+	spin_lock(&fs_info->reada_lock);
+	if (!kref_put(&re->refcnt, reada_kref_dummy)) {
+		spin_unlock(&fs_info->reada_lock);
+		return;
+	}
+
+	radix_tree_delete(&fs_info->reada_tree, index);
+	for (i = 0; i < re->nzones; ++i) {
+		struct reada_zone *zone = re->zones[i];
+
+		radix_tree_delete(&zone->device->reada_extents, index);
+	}
+
+	spin_unlock(&fs_info->reada_lock);
+
+	for (i = 0; i < re->nzones; ++i) {
+		struct reada_zone *zone = re->zones[i];
+
+		kref_get(&zone->refcnt);
+		spin_lock(&zone->lock);
+		--zone->elems;
+		if (zone->elems == 0) {
+			/* no fs_info->reada_lock needed, as this can't be
+			 * the last ref */
+			kref_put(&zone->refcnt, reada_zone_release);
+		}
+		spin_unlock(&zone->lock);
+
+		spin_lock(&fs_info->reada_lock);
+		kref_put(&zone->refcnt, reada_zone_release);
+		spin_unlock(&fs_info->reada_lock);
+	}
+	if (re->scheduled_for)
+		atomic_dec(&re->scheduled_for->reada_in_flight);
+
+	kfree(re);
+}
+
+static void reada_zone_release(struct kref *kref)
+{
+	struct reada_zone *zone = container_of(kref, struct reada_zone, refcnt);
+
+	radix_tree_delete(&zone->device->reada_zones,
+			  zone->end >> PAGE_CACHE_SHIFT);
+
+	kfree(zone);
+}
+
+static void reada_control_release(struct kref *kref)
+{
+	struct reada_control *rc = container_of(kref, struct reada_control,
+						refcnt);
+
+	kfree(rc);
+}
+
+static int reada_add_block(struct reada_control *rc, u64 logical,
+			   struct btrfs_key *top, int level, u64 generation)
+{
+	struct btrfs_root *root = rc->root;
+	struct reada_extent *re;
+	struct reada_extctl *rec;
+
+	re = reada_find_extent(root, logical, top, level); /* takes one ref */
+	if (!re)
+		return -1;
+
+	rec = kzalloc(sizeof(*rec), GFP_NOFS);
+	if (!rec) {
+		reada_extent_put(root->fs_info, re);
+		return -1;
+	}
+
+	rec->rc = rc;
+	rec->generation = generation;
+	atomic_inc(&rc->elems);
+
+	spin_lock(&re->lock);
+	list_add_tail(&rec->list, &re->extctl);
+	spin_unlock(&re->lock);
+
+	/* leave the ref on the extent */
+
+	return 0;
+}
+
+/*
+ * called with fs_info->reada_lock held
+ */
+static void reada_peer_zones_set_lock(struct reada_zone *zone, int lock)
+{
+	int i;
+	unsigned long index = zone->end >> PAGE_CACHE_SHIFT;
+
+	for (i = 0; i < zone->ndevs; ++i) {
+		struct reada_zone *peer;
+		peer = radix_tree_lookup(&zone->devs[i]->reada_zones, index);
+		if (peer && peer->device != zone->device)
+			peer->locked = lock;
+	}
+}
+
+/*
+ * called with fs_info->reada_lock held
+ */
+static int reada_pick_zone(struct btrfs_device *dev)
+{
+	struct reada_zone *top_zone = NULL;
+	struct reada_zone *top_locked_zone = NULL;
+	u64 top_elems = 0;
+	u64 top_locked_elems = 0;
+	unsigned long index = 0;
+	int ret;
+
+	if (dev->reada_curr_zone) {
+		reada_peer_zones_set_lock(dev->reada_curr_zone, 0);
+		kref_put(&dev->reada_curr_zone->refcnt, reada_zone_release);
+		dev->reada_curr_zone = NULL;
+	}
+	/* pick the zone with the most elements */
+	while (1) {
+		struct reada_zone *zone;
+
+		ret = radix_tree_gang_lookup(&dev->reada_zones,
+					     (void **)&zone, index, 1);
+		if (ret == 0)
+			break;
+		index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
+		if (zone->locked) {
+			if (zone->elems > top_locked_elems) {
+				top_locked_elems = zone->elems;
+				top_locked_zone = zone;
+			}
+		} else {
+			if (zone->elems > top_elems) {
+				top_elems = zone->elems;
+				top_zone = zone;
+			}
+		}
+	}
+	if (top_zone)
+		dev->reada_curr_zone = top_zone;
+	else if (top_locked_zone)
+		dev->reada_curr_zone = top_locked_zone;
+	else
+		return 0;
+
+	dev->reada_next = dev->reada_curr_zone->start;
+	kref_get(&dev->reada_curr_zone->refcnt);
+	reada_peer_zones_set_lock(dev->reada_curr_zone, 1);
+
+	return 1;
+}
+
+static int reada_start_machine_dev(struct btrfs_fs_info *fs_info,
+				   struct btrfs_device *dev)
+{
+	struct reada_extent *re = NULL;
+	int mirror_num = 0;
+	struct extent_buffer *eb = NULL;
+	u64 logical;
+	u32 blocksize;
+	int ret;
+	int i;
+	int need_kick = 0;
+
+	spin_lock(&fs_info->reada_lock);
+	if (dev->reada_curr_zone == NULL) {
+		ret = reada_pick_zone(dev);
+		if (!ret) {
+			spin_unlock(&fs_info->reada_lock);
+			return 0;
+		}
+	}
+	/*
+	 * FIXME currently we issue the reads one extent at a time. If we have
+	 * a contiguous block of extents, we could also coagulate them or use
+	 * plugging to speed things up
+	 */
+	ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
+				     dev->reada_next >> PAGE_CACHE_SHIFT, 1);
+	if (ret == 0 || re->logical >= dev->reada_curr_zone->end) {
+		ret = reada_pick_zone(dev);
+		if (!ret) {
+			spin_unlock(&fs_info->reada_lock);
+			return 0;
+		}
+		re = NULL;
+		ret = radix_tree_gang_lookup(&dev->reada_extents, (void **)&re,
+					dev->reada_next >> PAGE_CACHE_SHIFT, 1);
+	}
+	if (ret == 0) {
+		spin_unlock(&fs_info->reada_lock);
+		return 0;
+	}
+	dev->reada_next = re->logical + re->blocksize;
+	kref_get(&re->refcnt);
+
+	spin_unlock(&fs_info->reada_lock);
+
+	/*
+	 * find mirror num
+	 */
+	for (i = 0; i < re->nzones; ++i) {
+		if (re->zones[i]->device == dev) {
+			mirror_num = i + 1;
+			break;
+		}
+	}
+	logical = re->logical;
+	blocksize = re->blocksize;
+
+	spin_lock(&re->lock);
+	if (re->scheduled_for == NULL) {
+		re->scheduled_for = dev;
+		need_kick = 1;
+	}
+	spin_unlock(&re->lock);
+
+	reada_extent_put(fs_info, re);
+
+	if (!need_kick)
+		return 0;
+
+	atomic_inc(&dev->reada_in_flight);
+	ret = reada_tree_block_flagged(fs_info->extent_root, logical, blocksize,
+			 mirror_num, &eb);
+	if (ret)
+		__readahead_hook(fs_info->extent_root, NULL, logical, ret);
+	else if (eb)
+		__readahead_hook(fs_info->extent_root, eb, eb->start, ret);
+
+	if (eb)
+		free_extent_buffer(eb);
+
+	return 1;
+
+}
+
+static void reada_start_machine_worker(struct btrfs_work *work)
+{
+	struct reada_machine_work *rmw;
+	struct btrfs_fs_info *fs_info;
+
+	rmw = container_of(work, struct reada_machine_work, work);
+	fs_info = rmw->fs_info;
+
+	kfree(rmw);
+
+	__reada_start_machine(fs_info);
+}
+
+static void __reada_start_machine(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_device *device;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	u64 enqueued;
+	u64 total = 0;
+	int i;
+
+	do {
+		enqueued = 0;
+		list_for_each_entry(device, &fs_devices->devices, dev_list) {
+			if (atomic_read(&device->reada_in_flight) <
+			    MAX_IN_FLIGHT)
+				enqueued += reada_start_machine_dev(fs_info,
+								    device);
+		}
+		total += enqueued;
+	} while (enqueued && total < 10000);
+
+	if (enqueued == 0)
+		return;
+
+	/*
+	 * If everything is already in the cache, this is effectively single
+	 * threaded. To a) not hold the caller for too long and b) to utilize
+	 * more cores, we broke the loop above after 10000 iterations and now
+	 * enqueue to workers to finish it. This will distribute the load to
+	 * the cores.
+	 */
+	for (i = 0; i < 2; ++i)
+		reada_start_machine(fs_info);
+}
+
+static void reada_start_machine(struct btrfs_fs_info *fs_info)
+{
+	struct reada_machine_work *rmw;
+
+	rmw = kzalloc(sizeof(*rmw), GFP_NOFS);
+	if (!rmw) {
+		/* FIXME we cannot handle this properly right now */
+		BUG();
+	}
+	rmw->work.func = reada_start_machine_worker;
+	rmw->fs_info = fs_info;
+
+	btrfs_queue_worker(&fs_info->readahead_workers, &rmw->work);
+}
+
+#ifdef DEBUG
+static void dump_devs(struct btrfs_fs_info *fs_info, int all)
+{
+	struct btrfs_device *device;
+	struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
+	unsigned long index;
+	int ret;
+	int i;
+	int j;
+	int cnt;
+
+	spin_lock(&fs_info->reada_lock);
+	list_for_each_entry(device, &fs_devices->devices, dev_list) {
+		printk(KERN_DEBUG "dev %lld has %d in flight\n", device->devid,
+			atomic_read(&device->reada_in_flight));
+		index = 0;
+		while (1) {
+			struct reada_zone *zone;
+			ret = radix_tree_gang_lookup(&device->reada_zones,
+						     (void **)&zone, index, 1);
+			if (ret == 0)
+				break;
+			printk(KERN_DEBUG "  zone %llu-%llu elems %llu locked "
+				"%d devs", zone->start, zone->end, zone->elems,
+				zone->locked);
+			for (j = 0; j < zone->ndevs; ++j) {
+				printk(KERN_CONT " %lld",
+					zone->devs[j]->devid);
+			}
+			if (device->reada_curr_zone == zone)
+				printk(KERN_CONT " curr off %llu",
+					device->reada_next - zone->start);
+			printk(KERN_CONT "\n");
+			index = (zone->end >> PAGE_CACHE_SHIFT) + 1;
+		}
+		cnt = 0;
+		index = 0;
+		while (all) {
+			struct reada_extent *re = NULL;
+
+			ret = radix_tree_gang_lookup(&device->reada_extents,
+						     (void **)&re, index, 1);
+			if (ret == 0)
+				break;
+			printk(KERN_DEBUG
+				"  re: logical %llu size %u empty %d for %lld",
+				re->logical, re->blocksize,
+				list_empty(&re->extctl), re->scheduled_for ?
+				re->scheduled_for->devid : -1);
+
+			for (i = 0; i < re->nzones; ++i) {
+				printk(KERN_CONT " zone %llu-%llu devs",
+					re->zones[i]->start,
+					re->zones[i]->end);
+				for (j = 0; j < re->zones[i]->ndevs; ++j) {
+					printk(KERN_CONT " %lld",
+						re->zones[i]->devs[j]->devid);
+				}
+			}
+			printk(KERN_CONT "\n");
+			index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+			if (++cnt > 15)
+				break;
+		}
+	}
+
+	index = 0;
+	cnt = 0;
+	while (all) {
+		struct reada_extent *re = NULL;
+
+		ret = radix_tree_gang_lookup(&fs_info->reada_tree, (void **)&re,
+					     index, 1);
+		if (ret == 0)
+			break;
+		if (!re->scheduled_for) {
+			index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+			continue;
+		}
+		printk(KERN_DEBUG
+			"re: logical %llu size %u list empty %d for %lld",
+			re->logical, re->blocksize, list_empty(&re->extctl),
+			re->scheduled_for ? re->scheduled_for->devid : -1);
+		for (i = 0; i < re->nzones; ++i) {
+			printk(KERN_CONT " zone %llu-%llu devs",
+				re->zones[i]->start,
+				re->zones[i]->end);
+			for (i = 0; i < re->nzones; ++i) {
+				printk(KERN_CONT " zone %llu-%llu devs",
+					re->zones[i]->start,
+					re->zones[i]->end);
+				for (j = 0; j < re->zones[i]->ndevs; ++j) {
+					printk(KERN_CONT " %lld",
+						re->zones[i]->devs[j]->devid);
+				}
+			}
+		}
+		printk(KERN_CONT "\n");
+		index = (re->logical >> PAGE_CACHE_SHIFT) + 1;
+	}
+	spin_unlock(&fs_info->reada_lock);
+}
+#endif
+
+/*
+ * interface
+ */
+struct reada_control *btrfs_reada_add(struct btrfs_root *root,
+			struct btrfs_key *key_start, struct btrfs_key *key_end)
+{
+	struct reada_control *rc;
+	u64 start;
+	u64 generation;
+	int level;
+	struct extent_buffer *node;
+	static struct btrfs_key max_key = {
+		.objectid = (u64)-1,
+		.type = (u8)-1,
+		.offset = (u64)-1
+	};
+
+	rc = kzalloc(sizeof(*rc), GFP_NOFS);
+	if (!rc)
+		return ERR_PTR(-ENOMEM);
+
+	rc->root = root;
+	rc->key_start = *key_start;
+	rc->key_end = *key_end;
+	atomic_set(&rc->elems, 0);
+	init_waitqueue_head(&rc->wait);
+	kref_init(&rc->refcnt);
+	kref_get(&rc->refcnt); /* one ref for having elements */
+
+	node = btrfs_root_node(root);
+	start = node->start;
+	level = btrfs_header_level(node);
+	generation = btrfs_header_generation(node);
+	free_extent_buffer(node);
+
+	reada_add_block(rc, start, &max_key, level, generation);
+
+	reada_start_machine(root->fs_info);
+
+	return rc;
+}
+
+#ifdef DEBUG
+int btrfs_reada_wait(void *handle)
+{
+	struct reada_control *rc = handle;
+
+	while (atomic_read(&rc->elems)) {
+		wait_event_timeout(rc->wait, atomic_read(&rc->elems) == 0,
+				   5 * HZ);
+		dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
+	}
+
+	dump_devs(rc->root->fs_info, rc->elems < 10 ? 1 : 0);
+
+	kref_put(&rc->refcnt, reada_control_release);
+
+	return 0;
+}
+#else
+int btrfs_reada_wait(void *handle)
+{
+	struct reada_control *rc = handle;
+
+	while (atomic_read(&rc->elems)) {
+		wait_event(rc->wait, atomic_read(&rc->elems) == 0);
+	}
+
+	kref_put(&rc->refcnt, reada_control_release);
+
+	return 0;
+}
+#endif
+
+void btrfs_reada_detach(void *handle)
+{
+	struct reada_control *rc = handle;
+
+	kref_put(&rc->refcnt, reada_control_release);
+}
-- 
cgit v1.2.3


From 4bb31e928d1a47f5bd046ecb176b8eff7c589fc0 Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Fri, 10 Jun 2011 13:55:54 +0200
Subject: btrfs: hooks for readahead

This adds the hooks needed for readahead. In the readpage_end_io_hook,
the extent state is checked for the EXTENT_READAHEAD flag. Only in this
case the readahead hook is called, to keep the impact on non-ra as low
as possible.
Additionally, a hook for a failed IO is added, otherwise readahead would
wait indefinitely for the extent to finish.

Changes for v2:
 - eliminate race condition

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/disk-io.c   | 37 +++++++++++++++++++++++++++++++++++++
 fs/btrfs/extent_io.c |  2 +-
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index fd28c40e9ec3..2151828aa142 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -609,11 +609,47 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	end = min_t(u64, eb->len, PAGE_CACHE_SIZE);
 	end = eb->start + end - 1;
 err:
+	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
+		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+		btree_readahead_hook(root, eb, eb->start, ret);
+	}
+
 	free_extent_buffer(eb);
 out:
 	return ret;
 }
 
+static int btree_io_failed_hook(struct bio *failed_bio,
+			 struct page *page, u64 start, u64 end,
+			 struct extent_state *state)
+{
+	struct extent_io_tree *tree;
+	unsigned long len;
+	struct extent_buffer *eb;
+	struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+
+	tree = &BTRFS_I(page->mapping->host)->io_tree;
+	if (page->private == EXTENT_PAGE_PRIVATE)
+		goto out;
+	if (!page->private)
+		goto out;
+
+	len = page->private >> 2;
+	WARN_ON(len == 0);
+
+	eb = alloc_extent_buffer(tree, start, len, page);
+	if (eb == NULL)
+		goto out;
+
+	if (test_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) {
+		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
+		btree_readahead_hook(root, eb, eb->start, -EIO);
+	}
+
+out:
+	return -EIO;	/* we fixed nothing */
+}
+
 static void end_workqueue_bio(struct bio *bio, int err)
 {
 	struct end_io_wq *end_io_wq = bio->bi_private;
@@ -3166,6 +3202,7 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root)
 static struct extent_io_ops btree_extent_io_ops = {
 	.write_cache_pages_lock_hook = btree_lock_page_hook,
 	.readpage_end_io_hook = btree_readpage_end_io_hook,
+	.readpage_io_failed_hook = btree_io_failed_hook,
 	.submit_bio_hook = btree_submit_bio_hook,
 	/* note we're sharing with inode.c for the merge bio hook */
 	.merge_bio_hook = btrfs_merge_bio_hook,
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 823028e73cf8..deba714236d7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1731,7 +1731,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		if (!uptodate && tree->ops &&
 		    tree->ops->readpage_io_failed_hook) {
 			ret = tree->ops->readpage_io_failed_hook(bio, page,
-							 start, end, NULL);
+							 start, end, state);
 			if (ret == 0) {
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
-- 
cgit v1.2.3


From 7a26285eea8eb92e0088db011571d887d4551b0f Mon Sep 17 00:00:00 2001
From: Arne Jansen <sensille@gmx.net>
Date: Fri, 10 Jun 2011 12:39:23 +0200
Subject: btrfs: use readahead API for scrub

Scrub uses a simple tree-enumeration to bring the relevant portions
of the extent- and csum-tree into the page cache before starting the
scrub-I/O. This is now replaced by using the new readahead-API.
During readahead the scrub is being accounted as paused, so it won't
hold off transaction commits.

This change raises the average disk bandwith utilisation on my test
volume from 70% to 90%. On another volume, the time for a test run
went down from 89s to 43s.

Changes v5:
 - reada1/2 are now of type struct reada_control *

Signed-off-by: Arne Jansen <sensille@gmx.net>
---
 fs/btrfs/scrub.c | 112 +++++++++++++++++++++++++------------------------------
 1 file changed, 50 insertions(+), 62 deletions(-)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5efb5d..f930f2776589 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -29,15 +29,12 @@
  * any can be found.
  *
  * Future enhancements:
- *  - To enhance the performance, better read-ahead strategies for the
- *    extent-tree can be employed.
  *  - In case an unrepairable extent is encountered, track which files are
  *    affected and report them
  *  - In case of a read error on files with nodatasum, map the file and read
  *    the extent to trigger a writeback of the good copy
  *  - track and record media errors, throw out bad devices
  *  - add a mode to also read unallocated space
- *  - make the prefetch cancellable
  */
 
 struct scrub_bio;
@@ -741,13 +738,16 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 	int slot;
 	int i;
 	u64 nstripes;
-	int start_stripe;
 	struct extent_buffer *l;
 	struct btrfs_key key;
 	u64 physical;
 	u64 logical;
 	u64 generation;
 	u64 mirror_num;
+	struct reada_control *reada1;
+	struct reada_control *reada2;
+	struct btrfs_key key_start;
+	struct btrfs_key key_end;
 
 	u64 increment = map->stripe_len;
 	u64 offset;
@@ -779,81 +779,67 @@ static noinline_for_stack int scrub_stripe(struct scrub_dev *sdev,
 	if (!path)
 		return -ENOMEM;
 
-	path->reada = 2;
 	path->search_commit_root = 1;
 	path->skip_locking = 1;
 
 	/*
-	 * find all extents for each stripe and just read them to get
-	 * them into the page cache
-	 * FIXME: we can do better. build a more intelligent prefetching
+	 * trigger the readahead for extent tree csum tree and wait for
+	 * completion. During readahead, the scrub is officially paused
+	 * to not hold off transaction commits
 	 */
 	logical = base + offset;
-	physical = map->stripes[num].physical;
-	ret = 0;
-	for (i = 0; i < nstripes; ++i) {
-		key.objectid = logical;
-		key.type = BTRFS_EXTENT_ITEM_KEY;
-		key.offset = (u64)0;
-
-		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
-		if (ret < 0)
-			goto out_noplug;
-
-		/*
-		 * we might miss half an extent here, but that doesn't matter,
-		 * as it's only the prefetch
-		 */
-		while (1) {
-			l = path->nodes[0];
-			slot = path->slots[0];
-			if (slot >= btrfs_header_nritems(l)) {
-				ret = btrfs_next_leaf(root, path);
-				if (ret == 0)
-					continue;
-				if (ret < 0)
-					goto out_noplug;
 
-				break;
-			}
-			btrfs_item_key_to_cpu(l, &key, slot);
+	wait_event(sdev->list_wait,
+		   atomic_read(&sdev->in_flight) == 0);
+	atomic_inc(&fs_info->scrubs_paused);
+	wake_up(&fs_info->scrub_pause_wait);
 
-			if (key.objectid >= logical + map->stripe_len)
-				break;
+	/* FIXME it might be better to start readahead at commit root */
+	key_start.objectid = logical;
+	key_start.type = BTRFS_EXTENT_ITEM_KEY;
+	key_start.offset = (u64)0;
+	key_end.objectid = base + offset + nstripes * increment;
+	key_end.type = BTRFS_EXTENT_ITEM_KEY;
+	key_end.offset = (u64)0;
+	reada1 = btrfs_reada_add(root, &key_start, &key_end);
+
+	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	key_start.type = BTRFS_EXTENT_CSUM_KEY;
+	key_start.offset = logical;
+	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
+	key_end.type = BTRFS_EXTENT_CSUM_KEY;
+	key_end.offset = base + offset + nstripes * increment;
+	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
+
+	if (!IS_ERR(reada1))
+		btrfs_reada_wait(reada1);
+	if (!IS_ERR(reada2))
+		btrfs_reada_wait(reada2);
 
-			path->slots[0]++;
-		}
-		btrfs_release_path(path);
-		logical += increment;
-		physical += map->stripe_len;
-		cond_resched();
+	mutex_lock(&fs_info->scrub_lock);
+	while (atomic_read(&fs_info->scrub_pause_req)) {
+		mutex_unlock(&fs_info->scrub_lock);
+		wait_event(fs_info->scrub_pause_wait,
+		   atomic_read(&fs_info->scrub_pause_req) == 0);
+		mutex_lock(&fs_info->scrub_lock);
 	}
+	atomic_dec(&fs_info->scrubs_paused);
+	mutex_unlock(&fs_info->scrub_lock);
+	wake_up(&fs_info->scrub_pause_wait);
 
 	/*
 	 * collect all data csums for the stripe to avoid seeking during
 	 * the scrub. This might currently (crc32) end up to be about 1MB
 	 */
-	start_stripe = 0;
 	blk_start_plug(&plug);
-again:
-	logical = base + offset + start_stripe * increment;
-	for (i = start_stripe; i < nstripes; ++i) {
-		ret = btrfs_lookup_csums_range(csum_root, logical,
-					       logical + map->stripe_len - 1,
-					       &sdev->csum_list, 1);
-		if (ret)
-			goto out;
 
-		logical += increment;
-		cond_resched();
-	}
 	/*
 	 * now find all extents for each stripe and scrub them
 	 */
-	logical = base + offset + start_stripe * increment;
-	physical = map->stripes[num].physical + start_stripe * map->stripe_len;
+	logical = base + offset;
+	physical = map->stripes[num].physical;
 	ret = 0;
-	for (i = start_stripe; i < nstripes; ++i) {
+	for (i = 0; i < nstripes; ++i) {
 		/*
 		 * canceled?
 		 */
@@ -882,11 +868,14 @@ again:
 			atomic_dec(&fs_info->scrubs_paused);
 			mutex_unlock(&fs_info->scrub_lock);
 			wake_up(&fs_info->scrub_pause_wait);
-			scrub_free_csums(sdev);
-			start_stripe = i;
-			goto again;
 		}
 
+		ret = btrfs_lookup_csums_range(csum_root, logical,
+					       logical + map->stripe_len - 1,
+					       &sdev->csum_list, 1);
+		if (ret)
+			goto out;
+
 		key.objectid = logical;
 		key.type = BTRFS_EXTENT_ITEM_KEY;
 		key.offset = (u64)0;
@@ -982,7 +971,6 @@ next:
 
 out:
 	blk_finish_plug(&plug);
-out_noplug:
 	btrfs_free_path(path);
 	return ret < 0 ? ret : 0;
 }
-- 
cgit v1.2.3


From f1bdcc0a8278aa42cb77331275890aac85a4e7cd Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 14 Jul 2011 14:28:08 -0400
Subject: Btrfs: move stuff around in btrfs_inode to get better packing

Moving things around to give us better packing in the btrfs_inode.  This reduces
the size of our inode by 8 bytes.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/btrfs_inode.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d9f99a16edd6..bf325f40cf92 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -115,9 +115,6 @@ struct btrfs_inode {
 	 */
 	u64 disk_i_size;
 
-	/* flags field from the on disk inode */
-	u32 flags;
-
 	/*
 	 * if this is a directory then index_cnt is the counter for the index
 	 * number for new files that are created
@@ -131,6 +128,9 @@ struct btrfs_inode {
 	 */
 	u64 last_unlink_trans;
 
+	/* flags field from the on disk inode */
+	u32 flags;
+
 	/*
 	 * Counters to keep track of the number of extent item's we may use due
 	 * to delalloc and such.  outstanding_extents is the number of extent
-- 
cgit v1.2.3


From 0cbbdf7c9c46467bfb7129c30236f36a679ab244 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 14 Jul 2011 16:02:04 -0400
Subject: Btrfs: kill reserved_bytes in inode

reserved_bytes is not used for anything in the inode, remove it.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/btrfs_inode.h | 5 -----
 fs/btrfs/extent-tree.c | 2 --
 fs/btrfs/inode.c       | 1 -
 3 files changed, 8 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index bf325f40cf92..c70fb10a307b 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -103,11 +103,6 @@ struct btrfs_inode {
 	 */
 	u64 delalloc_bytes;
 
-	/* total number of bytes that may be used for this inode for
-	 * delalloc
-	 */
-	u64 reserved_bytes;
-
 	/*
 	 * the size of the file stored in the metadata on disk.  data=ordered
 	 * means the in-memory i_size might be larger than the size on disk
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f5be06a2462f..03edac4f7771 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3122,7 +3122,6 @@ commit_trans:
 		return -ENOSPC;
 	}
 	data_sinfo->bytes_may_use += bytes;
-	BTRFS_I(inode)->reserved_bytes += bytes;
 	spin_unlock(&data_sinfo->lock);
 
 	return 0;
@@ -3144,7 +3143,6 @@ void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 	data_sinfo = BTRFS_I(inode)->space_info;
 	spin_lock(&data_sinfo->lock);
 	data_sinfo->bytes_may_use -= bytes;
-	BTRFS_I(inode)->reserved_bytes -= bytes;
 	spin_unlock(&data_sinfo->lock);
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b2d004ad66a0..156c3a0da792 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6755,7 +6755,6 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->last_sub_trans = 0;
 	ei->logged_trans = 0;
 	ei->delalloc_bytes = 0;
-	ei->reserved_bytes = 0;
 	ei->disk_i_size = 0;
 	ei->flags = 0;
 	ei->index_cnt = (u64)-1;
-- 
cgit v1.2.3


From ba5b8958dabbd7890a6929af1ffc0d87187765dc Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 25 Jul 2011 15:40:35 -0400
Subject: Btrfs: use d_obtain_alias when mounting subvol/subvolid

Currently what we do is just wrong.  We either

1) Alloc a new "root" dentry with sb->s_root as it's parent which is just wrong
as we could walk into this subvol later on via another path and hilarity could
ensue.  Also we don't check the return value of d_splice_alias which isn't good
either.

or

2) Do a d_find_alias() which we could have lost our dentry from cache at this
point and found nothing.

So use d_obtain_alias().  In the case that we already have the inode/dentry in
cache we will get the correct dentry.  If not we will get a disconnected dentry
tree so if we walk into it later on everything will be connected up properly.
Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/super.c | 25 +------------------------
 1 file changed, 1 insertion(+), 24 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 15634d4648d7..244fa46c50b8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -492,7 +492,6 @@ static struct dentry *get_default_root(struct super_block *sb,
 	struct btrfs_path *path;
 	struct btrfs_key location;
 	struct inode *inode;
-	struct dentry *dentry;
 	u64 dir_id;
 	int new = 0;
 
@@ -566,29 +565,7 @@ setup_root:
 		return dget(sb->s_root);
 	}
 
-	if (new) {
-		const struct qstr name = { .name = "/", .len = 1 };
-
-		/*
-		 * New inode, we need to make the dentry a sibling of s_root so
-		 * everything gets cleaned up properly on unmount.
-		 */
-		dentry = d_alloc(sb->s_root, &name);
-		if (!dentry) {
-			iput(inode);
-			return ERR_PTR(-ENOMEM);
-		}
-		d_splice_alias(inode, dentry);
-	} else {
-		/*
-		 * We found the inode in cache, just find a dentry for it and
-		 * put the reference to the inode we just got.
-		 */
-		dentry = d_find_alias(inode);
-		iput(inode);
-	}
-
-	return dentry;
+	return d_obtain_alias(inode);
 }
 
 static int btrfs_fill_super(struct super_block *sb,
-- 
cgit v1.2.3


From 830c4adbd04a79f806d4fa579546f36a71b727c1 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 25 Jul 2011 15:55:42 -0400
Subject: Btrfs: fix how we mount subvol=<whatever>

We've only been able to mount with subvol=<whatever> where whatever was a subvol
within whatever root we had as the default.  This allows us to mount -o
subvol=path/to/subvol/you/want relative from the normal fs_tree root.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/super.c | 199 +++++++++++++++++++++++++++++++++++++------------------
 1 file changed, 135 insertions(+), 64 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 244fa46c50b8..934789f7fd33 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -40,6 +40,7 @@
 #include <linux/magic.h>
 #include <linux/slab.h>
 #include <linux/cleancache.h>
+#include <linux/mnt_namespace.h>
 #include "compat.h"
 #include "delayed-inode.h"
 #include "ctree.h"
@@ -58,6 +59,7 @@
 #include <trace/events/btrfs.h>
 
 static const struct super_operations btrfs_super_ops;
+static struct file_system_type btrfs_fs_type;
 
 static const char *btrfs_decode_error(struct btrfs_fs_info *fs_info, int errno,
 				      char nbuf[16])
@@ -411,7 +413,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 	int intarg;
 
 	if (!options)
-		goto out;
+		return 0;
 
 	/*
 	 * strsep changes the string, duplicate it because parse_options
@@ -460,26 +462,15 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 			error = btrfs_scan_one_device(match_strdup(&args[0]),
 					flags, holder, fs_devices);
 			if (error)
-				goto out_free_opts;
+				goto out;
 			break;
 		default:
 			break;
 		}
 	}
 
- out_free_opts:
+out:
 	kfree(orig);
- out:
-	/*
-	 * If no subvolume name is specified we use the default one.  Allocate
-	 * a copy of the string "." here so that code later in the
-	 * mount path doesn't care if it's the default volume or another one.
-	 */
-	if (!*subvol_name) {
-		*subvol_name = kstrdup(".", GFP_KERNEL);
-		if (!*subvol_name)
-			return -ENOMEM;
-	}
 	return error;
 }
 
@@ -730,6 +721,118 @@ static int btrfs_set_super(struct super_block *s, void *data)
 	return set_anon_super(s, data);
 }
 
+/*
+ * This will strip out the subvol=%s argument for an argument string and add
+ * subvolid=0 to make sure we get the actual tree root for path walking to the
+ * subvol we want.
+ */
+static char *setup_root_args(char *args)
+{
+	unsigned copied = 0;
+	unsigned len = strlen(args) + 2;
+	char *pos;
+	char *ret;
+
+	/*
+	 * We need the same args as before, but minus
+	 *
+	 * subvol=a
+	 *
+	 * and add
+	 *
+	 * subvolid=0
+	 *
+	 * which is a difference of 2 characters, so we allocate strlen(args) +
+	 * 2 characters.
+	 */
+	ret = kzalloc(len * sizeof(char), GFP_NOFS);
+	if (!ret)
+		return NULL;
+	pos = strstr(args, "subvol=");
+
+	/* This shouldn't happen, but just in case.. */
+	if (!pos) {
+		kfree(ret);
+		return NULL;
+	}
+
+	/*
+	 * The subvol=<> arg is not at the front of the string, copy everybody
+	 * up to that into ret.
+	 */
+	if (pos != args) {
+		*pos = '\0';
+		strcpy(ret, args);
+		copied += strlen(args);
+		pos++;
+	}
+
+	strncpy(ret + copied, "subvolid=0", len - copied);
+
+	/* Length of subvolid=0 */
+	copied += 10;
+
+	/*
+	 * If there is no , after the subvol= option then we know there's no
+	 * other options and we can just return.
+	 */
+	pos = strchr(pos, ',');
+	if (!pos)
+		return ret;
+
+	/* Copy the rest of the arguments into our buffer */
+	strncpy(ret + copied, pos, len - copied);
+	copied += strlen(pos);
+
+	return ret;
+}
+
+static struct dentry *mount_subvol(const char *subvol_name, int flags,
+				   const char *device_name, char *data)
+{
+	struct super_block *s;
+	struct dentry *root;
+	struct vfsmount *mnt;
+	struct mnt_namespace *ns_private;
+	char *newargs;
+	struct path path;
+	int error;
+
+	newargs = setup_root_args(data);
+	if (!newargs)
+		return ERR_PTR(-ENOMEM);
+	mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name,
+			     newargs);
+	kfree(newargs);
+	if (IS_ERR(mnt))
+		return ERR_CAST(mnt);
+
+	ns_private = create_mnt_ns(mnt);
+	if (IS_ERR(ns_private)) {
+		mntput(mnt);
+		return ERR_CAST(ns_private);
+	}
+
+	/*
+	 * This will trigger the automount of the subvol so we can just
+	 * drop the mnt we have here and return the dentry that we
+	 * found.
+	 */
+	error = vfs_path_lookup(mnt->mnt_root, mnt, subvol_name,
+				LOOKUP_FOLLOW, &path);
+	put_mnt_ns(ns_private);
+	if (error)
+		return ERR_PTR(error);
+
+	/* Get a ref to the sb and the dentry we found and return it */
+	s = path.mnt->mnt_sb;
+	atomic_inc(&s->s_active);
+	root = dget(path.dentry);
+	path_put(&path);
+	down_write(&s->s_umount);
+
+	return root;
+}
 
 /*
  * Find a superblock for the given device / mount point.
@@ -761,13 +864,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	if (error)
 		return ERR_PTR(error);
 
+	if (subvol_name) {
+		root = mount_subvol(subvol_name, flags, device_name, data);
+		kfree(subvol_name);
+		return root;
+	}
+
 	error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices);
 	if (error)
-		goto error_free_subvol_name;
+		return ERR_PTR(error);
 
 	error = btrfs_open_devices(fs_devices, mode, fs_type);
 	if (error)
-		goto error_free_subvol_name;
+		return ERR_PTR(error);
 
 	if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) {
 		error = -EACCES;
@@ -792,14 +901,15 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 
 	bdev = fs_devices->latest_bdev;
 	s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
-	if (IS_ERR(s))
-		goto error_s;
+	if (IS_ERR(s)) {
+		error = PTR_ERR(s);
+		goto error_close_devices;
+	}
 
 	if (s->s_root) {
 		if ((flags ^ s->s_flags) & MS_RDONLY) {
 			deactivate_locked_super(s);
-			error = -EBUSY;
-			goto error_close_devices;
+			return ERR_PTR(-EBUSY);
 		}
 
 		btrfs_close_devices(fs_devices);
@@ -814,64 +924,25 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 					 flags & MS_SILENT ? 1 : 0);
 		if (error) {
 			deactivate_locked_super(s);
-			goto error_free_subvol_name;
+			return ERR_PTR(error);
 		}
 
 		btrfs_sb(s)->fs_info->bdev_holder = fs_type;
 		s->s_flags |= MS_ACTIVE;
 	}
 
-	/* if they gave us a subvolume name bind mount into that */
-	if (strcmp(subvol_name, ".")) {
-		struct dentry *new_root;
-
-		root = get_default_root(s, subvol_rootid);
-		if (IS_ERR(root)) {
-			error = PTR_ERR(root);
-			deactivate_locked_super(s);
-			goto error_free_subvol_name;
-		}
-
-		mutex_lock(&root->d_inode->i_mutex);
-		new_root = lookup_one_len(subvol_name, root,
-				      strlen(subvol_name));
-		mutex_unlock(&root->d_inode->i_mutex);
-
-		if (IS_ERR(new_root)) {
-			dput(root);
-			deactivate_locked_super(s);
-			error = PTR_ERR(new_root);
-			goto error_free_subvol_name;
-		}
-		if (!new_root->d_inode) {
-			dput(root);
-			dput(new_root);
-			deactivate_locked_super(s);
-			error = -ENXIO;
-			goto error_free_subvol_name;
-		}
-		dput(root);
-		root = new_root;
-	} else {
-		root = get_default_root(s, subvol_objectid);
-		if (IS_ERR(root)) {
-			error = PTR_ERR(root);
-			deactivate_locked_super(s);
-			goto error_free_subvol_name;
-		}
+	root = get_default_root(s, subvol_objectid);
+	if (IS_ERR(root)) {
+		deactivate_locked_super(s);
+		return root;
 	}
 
-	kfree(subvol_name);
 	return root;
 
-error_s:
-	error = PTR_ERR(s);
 error_close_devices:
 	btrfs_close_devices(fs_devices);
 	kfree(fs_info);
 	kfree(tree_root);
-error_free_subvol_name:
-	kfree(subvol_name);
 	return ERR_PTR(error);
 }
 
-- 
cgit v1.2.3


From fb25e9141ab843794d5cdef3936ccb58435e2371 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 26 Jul 2011 17:00:46 -0400
Subject: Btrfs: use bytes_may_use for all ENOSPC reservations

We have been using bytes_reserved for metadata reservations, which is wrong
since we use that to keep track of outstanding reservations from the allocator.
This resulted in us doing a lot of silly things to make sure we don't allocate a
bunch of metadata chunks since we never had a real view of how much space was
actually in use by metadata.

This passes Arne's enospc test and xfstests as well as my own enospc tests.
Hopefully this will get us moving in the right direction.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h            |   2 -
 fs/btrfs/extent-tree.c      | 163 ++++++++++++++++++++++++--------------------
 fs/btrfs/free-space-cache.c |  29 ++++++--
 3 files changed, 112 insertions(+), 82 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 03912c5c6f49..332cbdc86ad4 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2196,8 +2196,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      u64 root_objectid, u64 owner, u64 offset);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
-				u64 num_bytes, int reserve, int sinfo);
 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 03edac4f7771..fbe6278f466b 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -52,6 +52,21 @@ enum {
 	CHUNK_ALLOC_LIMITED = 2,
 };
 
+/*
+ * Control how reservations are dealt with.
+ *
+ * RESERVE_FREE - freeing a reservation.
+ * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
+ *   ENOSPC accounting
+ * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
+ *   bytes_may_use as the ENOSPC accounting is done elsewhere
+ */
+enum {
+	RESERVE_FREE = 0,
+	RESERVE_ALLOC = 1,
+	RESERVE_ALLOC_NO_ACCOUNT = 2,
+};
+
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc);
@@ -81,6 +96,8 @@ static int find_next_key(struct btrfs_path *path, int level,
 			 struct btrfs_key *key);
 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 			    int dump_block_groups);
+static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+				       u64 num_bytes, int reserve);
 
 static noinline int
 block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -3128,9 +3145,7 @@ commit_trans:
 }
 
 /*
- * called when we are clearing an delalloc extent from the
- * inode's io_tree or there was an error for whatever reason
- * after calling btrfs_check_data_free_space
+ * Called if we need to clear a data reservation for this inode.
  */
 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
 {
@@ -3163,6 +3178,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
 			      struct btrfs_space_info *sinfo, u64 alloc_bytes,
 			      int force)
 {
+	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
 	u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
 	u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
 	u64 thresh;
@@ -3170,6 +3186,13 @@ static int should_alloc_chunk(struct btrfs_root *root,
 	if (force == CHUNK_ALLOC_FORCE)
 		return 1;
 
+	/*
+	 * We need to take into account the global rsv because for all intents
+	 * and purposes it's used space.  Don't worry about locking the
+	 * global_rsv, it doesn't change except when the transaction commits.
+	 */
+	num_allocated += global_rsv->size;
+
 	/*
 	 * in limited mode, we want to have some free space up to
 	 * about 1% of the FS size.
@@ -3317,7 +3340,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	space_info = block_rsv->space_info;
 
 	smp_mb();
-	reserved = space_info->bytes_reserved;
+	reserved = space_info->bytes_may_use;
 	progress = space_info->reservation_progress;
 
 	if (reserved == 0)
@@ -3341,9 +3364,9 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 		writeback_inodes_sb_nr_if_idle(root->fs_info->sb, nr_pages);
 
 		spin_lock(&space_info->lock);
-		if (reserved > space_info->bytes_reserved)
-			reclaimed += reserved - space_info->bytes_reserved;
-		reserved = space_info->bytes_reserved;
+		if (reserved > space_info->bytes_may_use)
+			reclaimed += reserved - space_info->bytes_may_use;
+		reserved = space_info->bytes_may_use;
 		spin_unlock(&space_info->lock);
 
 		loops++;
@@ -3401,7 +3424,6 @@ static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
 	int ret = 0;
 	bool committed = false;
 	bool flushing = false;
-
 again:
 	ret = 0;
 	spin_lock(&space_info->lock);
@@ -3443,7 +3465,7 @@ again:
 	if (unused <= space_info->total_bytes) {
 		unused = space_info->total_bytes - unused;
 		if (unused >= num_bytes) {
-			space_info->bytes_reserved += orig_bytes;
+			space_info->bytes_may_use += orig_bytes;
 			ret = 0;
 		} else {
 			/*
@@ -3614,7 +3636,7 @@ static void block_rsv_release_bytes(struct btrfs_block_rsv *block_rsv,
 		}
 		if (num_bytes) {
 			spin_lock(&space_info->lock);
-			space_info->bytes_reserved -= num_bytes;
+			space_info->bytes_may_use -= num_bytes;
 			space_info->reservation_progress++;
 			spin_unlock(&space_info->lock);
 		}
@@ -3825,12 +3847,12 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 	if (sinfo->total_bytes > num_bytes) {
 		num_bytes = sinfo->total_bytes - num_bytes;
 		block_rsv->reserved += num_bytes;
-		sinfo->bytes_reserved += num_bytes;
+		sinfo->bytes_may_use += num_bytes;
 	}
 
 	if (block_rsv->reserved >= block_rsv->size) {
 		num_bytes = block_rsv->reserved - block_rsv->size;
-		sinfo->bytes_reserved -= num_bytes;
+		sinfo->bytes_may_use -= num_bytes;
 		sinfo->reservation_progress++;
 		block_rsv->reserved = block_rsv->size;
 		block_rsv->full = 1;
@@ -4133,7 +4155,6 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 			btrfs_set_block_group_used(&cache->item, old_val);
 			cache->reserved -= num_bytes;
 			cache->space_info->bytes_reserved -= num_bytes;
-			cache->space_info->reservation_progress++;
 			cache->space_info->bytes_used += num_bytes;
 			cache->space_info->disk_used += num_bytes * factor;
 			spin_unlock(&cache->lock);
@@ -4185,7 +4206,6 @@ static int pin_down_extent(struct btrfs_root *root,
 	if (reserved) {
 		cache->reserved -= num_bytes;
 		cache->space_info->bytes_reserved -= num_bytes;
-		cache->space_info->reservation_progress++;
 	}
 	spin_unlock(&cache->lock);
 	spin_unlock(&cache->space_info->lock);
@@ -4212,46 +4232,55 @@ int btrfs_pin_extent(struct btrfs_root *root,
 	return 0;
 }
 
-/*
- * update size of reserved extents. this function may return -EAGAIN
- * if 'reserve' is true or 'sinfo' is false.
+/**
+ * btrfs_update_reserved_bytes - update the block_group and space info counters
+ * @cache:	The cache we are manipulating
+ * @num_bytes:	The number of bytes in question
+ * @reserve:	One of the reservation enums
+ *
+ * This is called by the allocator when it reserves space, or by somebody who is
+ * freeing space that was never actually used on disk.  For example if you
+ * reserve some space for a new leaf in transaction A and before transaction A
+ * commits you free that leaf, you call this with reserve set to 0 in order to
+ * clear the reservation.
+ *
+ * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
+ * ENOSPC accounting.  For data we handle the reservation through clearing the
+ * delalloc bits in the io_tree.  We have to do this since we could end up
+ * allocating less disk space for the amount of data we have reserved in the
+ * case of compression.
+ *
+ * If this is a reservation and the block group has become read only we cannot
+ * make the reservation and return -EAGAIN, otherwise this function always
+ * succeeds.
  */
-int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
-				u64 num_bytes, int reserve, int sinfo)
+static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
+				       u64 num_bytes, int reserve)
 {
+	struct btrfs_space_info *space_info = cache->space_info;
 	int ret = 0;
-	if (sinfo) {
-		struct btrfs_space_info *space_info = cache->space_info;
-		spin_lock(&space_info->lock);
-		spin_lock(&cache->lock);
-		if (reserve) {
-			if (cache->ro) {
-				ret = -EAGAIN;
-			} else {
-				cache->reserved += num_bytes;
-				space_info->bytes_reserved += num_bytes;
-			}
-		} else {
-			if (cache->ro)
-				space_info->bytes_readonly += num_bytes;
-			cache->reserved -= num_bytes;
-			space_info->bytes_reserved -= num_bytes;
-			space_info->reservation_progress++;
-		}
-		spin_unlock(&cache->lock);
-		spin_unlock(&space_info->lock);
-	} else {
-		spin_lock(&cache->lock);
+	spin_lock(&space_info->lock);
+	spin_lock(&cache->lock);
+	if (reserve != RESERVE_FREE) {
 		if (cache->ro) {
 			ret = -EAGAIN;
 		} else {
-			if (reserve)
-				cache->reserved += num_bytes;
-			else
-				cache->reserved -= num_bytes;
+			cache->reserved += num_bytes;
+			space_info->bytes_reserved += num_bytes;
+			if (reserve == RESERVE_ALLOC) {
+				BUG_ON(space_info->bytes_may_use < num_bytes);
+				space_info->bytes_may_use -= num_bytes;
+			}
 		}
-		spin_unlock(&cache->lock);
+	} else {
+		if (cache->ro)
+			space_info->bytes_readonly += num_bytes;
+		cache->reserved -= num_bytes;
+		space_info->bytes_reserved -= num_bytes;
+		space_info->reservation_progress++;
 	}
+	spin_unlock(&cache->lock);
+	spin_unlock(&space_info->lock);
 	return ret;
 }
 
@@ -4322,7 +4351,7 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 		} else if (cache->reserved_pinned > 0) {
 			len = min(len, cache->reserved_pinned);
 			cache->reserved_pinned -= len;
-			cache->space_info->bytes_reserved += len;
+			cache->space_info->bytes_may_use += len;
 		}
 		spin_unlock(&cache->lock);
 		spin_unlock(&cache->space_info->lock);
@@ -4701,27 +4730,8 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
 
 		btrfs_add_free_space(cache, buf->start, buf->len);
-		ret = btrfs_update_reserved_bytes(cache, buf->len, 0, 0);
-		if (ret == -EAGAIN) {
-			/* block group became read-only */
-			btrfs_update_reserved_bytes(cache, buf->len, 0, 1);
-			goto out;
-		}
+		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
 
-		ret = 1;
-		spin_lock(&block_rsv->lock);
-		if (block_rsv->reserved < block_rsv->size) {
-			block_rsv->reserved += buf->len;
-			ret = 0;
-		}
-		spin_unlock(&block_rsv->lock);
-
-		if (ret) {
-			spin_lock(&cache->space_info->lock);
-			cache->space_info->bytes_reserved -= buf->len;
-			cache->space_info->reservation_progress++;
-			spin_unlock(&cache->space_info->lock);
-		}
 		goto out;
 	}
 pin:
@@ -4881,6 +4891,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	int last_ptr_loop = 0;
 	int loop = 0;
 	int index = 0;
+	int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
+		RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
 	bool found_uncached_bg = false;
 	bool failed_cluster_refill = false;
 	bool failed_alloc = false;
@@ -5200,8 +5212,8 @@ checks:
 					     search_start - offset);
 		BUG_ON(offset > search_start);
 
-		ret = btrfs_update_reserved_bytes(block_group, num_bytes, 1,
-					    (data & BTRFS_BLOCK_GROUP_DATA));
+		ret = btrfs_update_reserved_bytes(block_group, num_bytes,
+						  alloc_type);
 		if (ret == -EAGAIN) {
 			btrfs_add_free_space(block_group, offset, num_bytes);
 			goto loop;
@@ -5323,7 +5335,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 	int index = 0;
 
 	spin_lock(&info->lock);
-	printk(KERN_INFO "space_info has %llu free, is %sfull\n",
+	printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
+	       (unsigned long long)info->flags,
 	       (unsigned long long)(info->total_bytes - info->bytes_used -
 				    info->bytes_pinned - info->bytes_reserved -
 				    info->bytes_readonly),
@@ -5425,7 +5438,7 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 		ret = btrfs_discard_extent(root, start, len, NULL);
 
 	btrfs_add_free_space(cache, start, len);
-	btrfs_update_reserved_bytes(cache, len, 0, 1);
+	btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
 	btrfs_put_block_group(cache);
 
 	trace_btrfs_reserved_extent_free(root, start, len);
@@ -5628,7 +5641,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 		put_caching_control(caching_ctl);
 	}
 
-	ret = btrfs_update_reserved_bytes(block_group, ins->offset, 1, 1);
+	ret = btrfs_update_reserved_bytes(block_group, ins->offset,
+					  RESERVE_ALLOC_NO_ACCOUNT);
 	BUG_ON(ret);
 	btrfs_put_block_group(block_group);
 	ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
@@ -6594,7 +6608,7 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 	    cache->reserved_pinned + num_bytes + min_allocable_bytes <=
 	    sinfo->total_bytes) {
 		sinfo->bytes_readonly += num_bytes;
-		sinfo->bytes_reserved += cache->reserved_pinned;
+		sinfo->bytes_may_use += cache->reserved_pinned;
 		cache->reserved_pinned = 0;
 		cache->ro = 1;
 		ret = 0;
@@ -6962,7 +6976,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 					struct btrfs_space_info,
 					list);
 		if (space_info->bytes_pinned > 0 ||
-		    space_info->bytes_reserved > 0) {
+		    space_info->bytes_reserved > 0 ||
+		    space_info->bytes_may_use > 0) {
 			WARN_ON(1);
 			dump_space_info(space_info, 0, 0);
 		}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 41ac927401d0..79c16a68a2bc 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -2472,9 +2472,19 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
 		spin_unlock(&ctl->tree_lock);
 
 		if (bytes >= minlen) {
-			int update_ret;
-			update_ret = btrfs_update_reserved_bytes(block_group,
-								 bytes, 1, 1);
+			struct btrfs_space_info *space_info;
+			int update = 0;
+
+			space_info = block_group->space_info;
+			spin_lock(&space_info->lock);
+			spin_lock(&block_group->lock);
+			if (!block_group->ro) {
+				block_group->reserved += bytes;
+				space_info->bytes_reserved += bytes;
+				update = 1;
+			}
+			spin_unlock(&block_group->lock);
+			spin_unlock(&space_info->lock);
 
 			ret = btrfs_error_discard_extent(fs_info->extent_root,
 							 start,
@@ -2482,9 +2492,16 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
 							 &actually_trimmed);
 
 			btrfs_add_free_space(block_group, start, bytes);
-			if (!update_ret)
-				btrfs_update_reserved_bytes(block_group,
-							    bytes, 0, 1);
+			if (update) {
+				spin_lock(&space_info->lock);
+				spin_lock(&block_group->lock);
+				if (block_group->ro)
+					space_info->bytes_readonly += bytes;
+				block_group->reserved -= bytes;
+				space_info->bytes_reserved -= bytes;
+				spin_unlock(&space_info->lock);
+				spin_unlock(&block_group->lock);
+			}
 
 			if (ret)
 				break;
-- 
cgit v1.2.3


From 9e4871070b5f070cacf26525389d56e0345ba156 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 1 Aug 2011 12:08:18 -0400
Subject: Btrfs: skip looking for delalloc if we don't have ->fill_delalloc

We always look for delalloc bytes in our io_tree so we can fill in delalloc.
This is fine in most cases, but if we're writing out the btree_inode this is
just a superfluous tree search on the io_tree, and if we have a lot of metadata
dirty this could be an expensive check.  So instead check to see if our io_tree
has a ->fill_delalloc op, and if not don't even bother doing the lookup.
Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent_io.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d418164a35f1..7d5e55632809 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2136,6 +2136,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	int compressed;
 	int write_flags;
 	unsigned long nr_written = 0;
+	bool fill_delalloc = true;
 
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		write_flags = WRITE_SYNC;
@@ -2166,10 +2167,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
 	set_page_extent_mapped(page);
 
+	if (!tree->ops || !tree->ops->fill_delalloc)
+		fill_delalloc = false;
+
 	delalloc_start = start;
 	delalloc_end = 0;
 	page_started = 0;
-	if (!epd->extent_locked) {
+	if (!epd->extent_locked && fill_delalloc) {
 		u64 delalloc_to_write = 0;
 		/*
 		 * make sure the wbc mapping index is at least updated
-- 
cgit v1.2.3


From 7709cde33f12db71efb377fae4ae7aab6c94ebc6 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 4 Aug 2011 10:25:02 -0400
Subject: Btrfs: calculate checksum space correctly

We have not been reserving enough space for checksums.  We were just reserving
bytes for the checksum items themselves, we were not taking into account having
to cow the tree and such.  This patch adds a csum_bytes counter to the inode for
keeping track of the number of bytes outstanding we have for checksums.  Then we
calculate how many leaves would be required for the checksums we are given and
use that to reserve space.  This adds a significant amount of bytes to our
reservations, but we will handle this later.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/btrfs_inode.h |   6 +++
 fs/btrfs/extent-tree.c | 117 +++++++++++++++++++++++++++++++++++++++++++++----
 fs/btrfs/inode.c       |   3 ++
 3 files changed, 118 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index c70fb10a307b..5a5d325a3935 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -123,6 +123,12 @@ struct btrfs_inode {
 	 */
 	u64 last_unlink_trans;
 
+	/*
+	 * Number of bytes outstanding that are going to need csums.  This is
+	 * used in ENOSPC accounting.
+	 */
+	u64 csum_bytes;
+
 	/* flags field from the on disk inode */
 	u32 flags;
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fbe6278f466b..4add1ac2dda0 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3984,11 +3984,19 @@ int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
 	return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
 }
 
+/**
+ * drop_outstanding_extent - drop an outstanding extent
+ * @inode: the inode we're dropping the extent for
+ *
+ * This is called when we are freeing up an outstanding extent, either called
+ * after an error or after an extent is written.  This will return the number of
+ * reserved extents that need to be freed.  This must be called with
+ * BTRFS_I(inode)->lock held.
+ */
 static unsigned drop_outstanding_extent(struct inode *inode)
 {
 	unsigned dropped_extents = 0;
 
-	spin_lock(&BTRFS_I(inode)->lock);
 	BUG_ON(!BTRFS_I(inode)->outstanding_extents);
 	BTRFS_I(inode)->outstanding_extents--;
 
@@ -3998,19 +4006,70 @@ static unsigned drop_outstanding_extent(struct inode *inode)
 	 */
 	if (BTRFS_I(inode)->outstanding_extents >=
 	    BTRFS_I(inode)->reserved_extents)
-		goto out;
+		return 0;
 
 	dropped_extents = BTRFS_I(inode)->reserved_extents -
 		BTRFS_I(inode)->outstanding_extents;
 	BTRFS_I(inode)->reserved_extents -= dropped_extents;
-out:
-	spin_unlock(&BTRFS_I(inode)->lock);
 	return dropped_extents;
 }
 
-static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes)
+/**
+ * calc_csum_metadata_size - return the amount of metada space that must be
+ *	reserved/free'd for the given bytes.
+ * @inode: the inode we're manipulating
+ * @num_bytes: the number of bytes in question
+ * @reserve: 1 if we are reserving space, 0 if we are freeing space
+ *
+ * This adjusts the number of csum_bytes in the inode and then returns the
+ * correct amount of metadata that must either be reserved or freed.  We
+ * calculate how many checksums we can fit into one leaf and then divide the
+ * number of bytes that will need to be checksumed by this value to figure out
+ * how many checksums will be required.  If we are adding bytes then the number
+ * may go up and we will return the number of additional bytes that must be
+ * reserved.  If it is going down we will return the number of bytes that must
+ * be freed.
+ *
+ * This must be called with BTRFS_I(inode)->lock held.
+ */
+static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
+				   int reserve)
 {
-	return num_bytes >>= 3;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 csum_size;
+	int num_csums_per_leaf;
+	int num_csums;
+	int old_csums;
+
+	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
+	    BTRFS_I(inode)->csum_bytes == 0)
+		return 0;
+
+	old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+	if (reserve)
+		BTRFS_I(inode)->csum_bytes += num_bytes;
+	else
+		BTRFS_I(inode)->csum_bytes -= num_bytes;
+	csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
+	num_csums_per_leaf = (int)div64_u64(csum_size,
+					    sizeof(struct btrfs_csum_item) +
+					    sizeof(struct btrfs_disk_key));
+	num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
+	num_csums = num_csums + num_csums_per_leaf - 1;
+	num_csums = num_csums / num_csums_per_leaf;
+
+	old_csums = old_csums + num_csums_per_leaf - 1;
+	old_csums = old_csums / num_csums_per_leaf;
+
+	/* No change, no need to reserve more */
+	if (old_csums == num_csums)
+		return 0;
+
+	if (reserve)
+		return btrfs_calc_trans_metadata_size(root,
+						      num_csums - old_csums);
+
+	return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
 }
 
 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
@@ -4037,9 +4096,9 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
 		to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
 	}
+	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
 	spin_unlock(&BTRFS_I(inode)->lock);
 
-	to_reserve += calc_csum_metadata_size(inode, num_bytes);
 	ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
 	if (ret) {
 		unsigned dropped;
@@ -4047,8 +4106,11 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 		 * We don't need the return value since our reservation failed,
 		 * we just need to clean up our counter.
 		 */
+		spin_lock(&BTRFS_I(inode)->lock);
 		dropped = drop_outstanding_extent(inode);
 		WARN_ON(dropped > 1);
+		BTRFS_I(inode)->csum_bytes -= num_bytes;
+		spin_unlock(&BTRFS_I(inode)->lock);
 		return ret;
 	}
 
@@ -4057,6 +4119,15 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	return 0;
 }
 
+/**
+ * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
+ * @inode: the inode to release the reservation for
+ * @num_bytes: the number of bytes we're releasing
+ *
+ * This will release the metadata reservation for an inode.  This can be called
+ * once we complete IO for a given set of bytes to release their metadata
+ * reservations.
+ */
 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4064,9 +4135,11 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 	unsigned dropped;
 
 	num_bytes = ALIGN(num_bytes, root->sectorsize);
+	spin_lock(&BTRFS_I(inode)->lock);
 	dropped = drop_outstanding_extent(inode);
 
-	to_free = calc_csum_metadata_size(inode, num_bytes);
+	to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+	spin_unlock(&BTRFS_I(inode)->lock);
 	if (dropped > 0)
 		to_free += btrfs_calc_trans_metadata_size(root, dropped);
 
@@ -4074,6 +4147,21 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
 				to_free);
 }
 
+/**
+ * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
+ * @inode: inode we're writing to
+ * @num_bytes: the number of bytes we want to allocate
+ *
+ * This will do the following things
+ *
+ * o reserve space in the data space info for num_bytes
+ * o reserve space in the metadata space info based on number of outstanding
+ *   extents and how much csums will be needed
+ * o add to the inodes ->delalloc_bytes
+ * o add it to the fs_info's delalloc inodes list.
+ *
+ * This will return 0 for success and -ENOSPC if there is no space left.
+ */
 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
 {
 	int ret;
@@ -4091,6 +4179,19 @@ int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
 	return 0;
 }
 
+/**
+ * btrfs_delalloc_release_space - release data and metadata space for delalloc
+ * @inode: inode we're releasing space for
+ * @num_bytes: the number of bytes we want to free up
+ *
+ * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
+ * called in the case that we don't need the metadata AND data reservations
+ * anymore.  So if there is an error or we insert an inline extent.
+ *
+ * This function will release the metadata space that was not used and will
+ * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
+ * list if there are no delalloc bytes left.
+ */
 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
 {
 	btrfs_delalloc_release_metadata(inode, num_bytes);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 156c3a0da792..98b9fa2d77f7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6757,6 +6757,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->delalloc_bytes = 0;
 	ei->disk_i_size = 0;
 	ei->flags = 0;
+	ei->csum_bytes = 0;
 	ei->index_cnt = (u64)-1;
 	ei->last_unlink_trans = 0;
 
@@ -6802,6 +6803,8 @@ void btrfs_destroy_inode(struct inode *inode)
 	WARN_ON(inode->i_data.nrpages);
 	WARN_ON(BTRFS_I(inode)->outstanding_extents);
 	WARN_ON(BTRFS_I(inode)->reserved_extents);
+	WARN_ON(BTRFS_I(inode)->delalloc_bytes);
+	WARN_ON(BTRFS_I(inode)->csum_bytes);
 
 	/*
 	 * This can happen where we create an inode, but somebody else also
-- 
cgit v1.2.3


From dba68306f3fae681b1005137f130f5bcfdfed34a Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 4 Aug 2011 15:34:57 -0400
Subject: Btrfs: kill the orphan space calculation for snapshots

This patch kills off the calculation for the amount of space needed for the
orphan operations during a snapshot.  The thing is we only do snapshots on
commit, so any space that is in the block_rsv->freed[] isn't going to be in the
new snapshot anyway, so there isn't any reason to require that space to be
reserved for the snapshot to occur.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h       |  5 ---
 fs/btrfs/inode.c       | 83 --------------------------------------------------
 fs/btrfs/transaction.c |  2 --
 3 files changed, 90 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 332cbdc86ad4..4ef777e85c89 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2577,11 +2577,6 @@ int btrfs_update_inode(struct btrfs_trans_handle *trans,
 int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode);
 int btrfs_orphan_cleanup(struct btrfs_root *root);
-void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
-				struct btrfs_pending_snapshot *pending,
-				u64 *bytes_to_reserve);
-void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
-				struct btrfs_pending_snapshot *pending);
 void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root);
 int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 98b9fa2d77f7..5f5f8a577e69 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2079,89 +2079,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root)
 	up_read(&root->fs_info->cleanup_work_sem);
 }
 
-/*
- * calculate extra metadata reservation when snapshotting a subvolume
- * contains orphan files.
- */
-void btrfs_orphan_pre_snapshot(struct btrfs_trans_handle *trans,
-				struct btrfs_pending_snapshot *pending,
-				u64 *bytes_to_reserve)
-{
-	struct btrfs_root *root;
-	struct btrfs_block_rsv *block_rsv;
-	u64 num_bytes;
-	int index;
-
-	root = pending->root;
-	if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
-		return;
-
-	block_rsv = root->orphan_block_rsv;
-
-	/* orphan block reservation for the snapshot */
-	num_bytes = block_rsv->size;
-
-	/*
-	 * after the snapshot is created, COWing tree blocks may use more
-	 * space than it frees. So we should make sure there is enough
-	 * reserved space.
-	 */
-	index = trans->transid & 0x1;
-	if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
-		num_bytes += block_rsv->size -
-			     (block_rsv->reserved + block_rsv->freed[index]);
-	}
-
-	*bytes_to_reserve += num_bytes;
-}
-
-void btrfs_orphan_post_snapshot(struct btrfs_trans_handle *trans,
-				struct btrfs_pending_snapshot *pending)
-{
-	struct btrfs_root *root = pending->root;
-	struct btrfs_root *snap = pending->snap;
-	struct btrfs_block_rsv *block_rsv;
-	u64 num_bytes;
-	int index;
-	int ret;
-
-	if (!root->orphan_block_rsv || list_empty(&root->orphan_list))
-		return;
-
-	/* refill source subvolume's orphan block reservation */
-	block_rsv = root->orphan_block_rsv;
-	index = trans->transid & 0x1;
-	if (block_rsv->reserved + block_rsv->freed[index] < block_rsv->size) {
-		num_bytes = block_rsv->size -
-			    (block_rsv->reserved + block_rsv->freed[index]);
-		ret = btrfs_block_rsv_migrate(&pending->block_rsv,
-					      root->orphan_block_rsv,
-					      num_bytes);
-		BUG_ON(ret);
-	}
-
-	/* setup orphan block reservation for the snapshot */
-	block_rsv = btrfs_alloc_block_rsv(snap);
-	BUG_ON(!block_rsv);
-
-	btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
-	snap->orphan_block_rsv = block_rsv;
-
-	num_bytes = root->orphan_block_rsv->size;
-	ret = btrfs_block_rsv_migrate(&pending->block_rsv,
-				      block_rsv, num_bytes);
-	BUG_ON(ret);
-
-#if 0
-	/* insert orphan item for the snapshot */
-	WARN_ON(!root->orphan_item_inserted);
-	ret = btrfs_insert_orphan_item(trans, root->fs_info->tree_root,
-				       snap->root_key.objectid);
-	BUG_ON(ret);
-	snap->orphan_item_inserted = 1;
-#endif
-}
-
 enum btrfs_orphan_cleanup_state {
 	ORPHAN_CLEANUP_STARTED	= 1,
 	ORPHAN_CLEANUP_DONE	= 2,
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e24b7964a155..3e20cc8c1c06 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -911,7 +911,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	}
 
 	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
-	btrfs_orphan_pre_snapshot(trans, pending, &to_reserve);
 
 	if (to_reserve > 0) {
 		ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
@@ -1002,7 +1001,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	BUG_ON(IS_ERR(pending->snap));
 
 	btrfs_reloc_post_snapshot(trans, pending);
-	btrfs_orphan_post_snapshot(trans, pending);
 fail:
 	kfree(new_root_item);
 	trans->block_rsv = rsv;
-- 
cgit v1.2.3


From 37be25bcb6d731914e126f8de59c4367f0d66b80 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 5 Aug 2011 10:25:38 -0400
Subject: Btrfs: kill the durable block rsv stuff

This is confusing code and isn't used by anything anymore, so delete it.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h       |  11 ------
 fs/btrfs/disk-io.c     |   2 -
 fs/btrfs/extent-tree.c | 100 +++++++++----------------------------------------
 fs/btrfs/inode.c       |   4 --
 fs/btrfs/relocation.c  |   1 -
 5 files changed, 17 insertions(+), 101 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 4ef777e85c89..c5ceba4078cc 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -772,13 +772,10 @@ struct btrfs_space_info {
 struct btrfs_block_rsv {
 	u64 size;
 	u64 reserved;
-	u64 freed[2];
 	struct btrfs_space_info *space_info;
-	struct list_head list;
 	spinlock_t lock;
 	atomic_t usage;
 	unsigned int priority:8;
-	unsigned int durable:1;
 	unsigned int refill_used:1;
 	unsigned int full:1;
 };
@@ -840,7 +837,6 @@ struct btrfs_block_group_cache {
 	spinlock_t lock;
 	u64 pinned;
 	u64 reserved;
-	u64 reserved_pinned;
 	u64 bytes_super;
 	u64 flags;
 	u64 sectorsize;
@@ -919,11 +915,6 @@ struct btrfs_fs_info {
 
 	struct btrfs_block_rsv empty_block_rsv;
 
-	/* list of block reservations that cross multiple transactions */
-	struct list_head durable_block_rsv_list;
-
-	struct mutex durable_block_rsv_mutex;
-
 	u64 generation;
 	u64 last_trans_committed;
 
@@ -2238,8 +2229,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv);
-void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
-				 struct btrfs_block_rsv *rsv);
 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 07b3ac662e19..0b5643a68d57 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1665,8 +1665,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_block_rsv(&fs_info->trans_block_rsv);
 	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
 	btrfs_init_block_rsv(&fs_info->empty_block_rsv);
-	INIT_LIST_HEAD(&fs_info->durable_block_rsv_list);
-	mutex_init(&fs_info->durable_block_rsv_mutex);
 	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4add1ac2dda0..30c0558eae84 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -121,7 +121,6 @@ void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
 	if (atomic_dec_and_test(&cache->count)) {
 		WARN_ON(cache->pinned > 0);
 		WARN_ON(cache->reserved > 0);
-		WARN_ON(cache->reserved_pinned > 0);
 		kfree(cache->free_space_ctl);
 		kfree(cache);
 	}
@@ -3662,7 +3661,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
 	spin_lock_init(&rsv->lock);
 	atomic_set(&rsv->usage, 1);
 	rsv->priority = 6;
-	INIT_LIST_HEAD(&rsv->list);
 }
 
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
@@ -3685,25 +3683,10 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
 {
 	if (rsv && atomic_dec_and_test(&rsv->usage)) {
 		btrfs_block_rsv_release(root, rsv, (u64)-1);
-		if (!rsv->durable)
-			kfree(rsv);
+		kfree(rsv);
 	}
 }
 
-/*
- * make the block_rsv struct be able to capture freed space.
- * the captured space will re-add to the the block_rsv struct
- * after transaction commit
- */
-void btrfs_add_durable_block_rsv(struct btrfs_fs_info *fs_info,
-				 struct btrfs_block_rsv *block_rsv)
-{
-	block_rsv->durable = 1;
-	mutex_lock(&fs_info->durable_block_rsv_mutex);
-	list_add_tail(&block_rsv->list, &fs_info->durable_block_rsv_list);
-	mutex_unlock(&fs_info->durable_block_rsv_mutex);
-}
-
 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
@@ -3745,9 +3728,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 		ret = 0;
 	} else {
 		num_bytes -= block_rsv->reserved;
-		if (block_rsv->durable &&
-		    block_rsv->freed[0] + block_rsv->freed[1] >= num_bytes)
-			commit_trans = 1;
+		commit_trans = 1;
 	}
 	spin_unlock(&block_rsv->lock);
 	if (!ret)
@@ -3763,8 +3744,18 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 	}
 
 	if (commit_trans) {
+		struct btrfs_space_info *sinfo = block_rsv->space_info;
+
 		if (trans)
 			return -EAGAIN;
+
+		spin_lock(&sinfo->lock);
+		if (sinfo->bytes_pinned < num_bytes) {
+			spin_unlock(&sinfo->lock);
+			return -ENOSPC;
+		}
+		spin_unlock(&sinfo->lock);
+
 		trans = btrfs_join_transaction(root);
 		BUG_ON(IS_ERR(trans));
 		ret = btrfs_commit_transaction(trans, root);
@@ -3885,10 +3876,6 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 	fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
 
-	btrfs_add_durable_block_rsv(fs_info, &fs_info->global_block_rsv);
-
-	btrfs_add_durable_block_rsv(fs_info, &fs_info->delalloc_block_rsv);
-
 	update_global_block_rsv(fs_info);
 }
 
@@ -4447,13 +4434,8 @@ static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
 		spin_lock(&cache->lock);
 		cache->pinned -= len;
 		cache->space_info->bytes_pinned -= len;
-		if (cache->ro) {
+		if (cache->ro)
 			cache->space_info->bytes_readonly += len;
-		} else if (cache->reserved_pinned > 0) {
-			len = min(len, cache->reserved_pinned);
-			cache->reserved_pinned -= len;
-			cache->space_info->bytes_may_use += len;
-		}
 		spin_unlock(&cache->lock);
 		spin_unlock(&cache->space_info->lock);
 	}
@@ -4468,11 +4450,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct extent_io_tree *unpin;
-	struct btrfs_block_rsv *block_rsv;
-	struct btrfs_block_rsv *next_rsv;
 	u64 start;
 	u64 end;
-	int idx;
 	int ret;
 
 	if (fs_info->pinned_extents == &fs_info->freed_extents[0])
@@ -4495,30 +4474,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
 		cond_resched();
 	}
 
-	mutex_lock(&fs_info->durable_block_rsv_mutex);
-	list_for_each_entry_safe(block_rsv, next_rsv,
-				 &fs_info->durable_block_rsv_list, list) {
-
-		idx = trans->transid & 0x1;
-		if (block_rsv->freed[idx] > 0) {
-			block_rsv_add_bytes(block_rsv,
-					    block_rsv->freed[idx], 0);
-			block_rsv->freed[idx] = 0;
-		}
-		if (atomic_read(&block_rsv->usage) == 0) {
-			btrfs_block_rsv_release(root, block_rsv, (u64)-1);
-
-			if (block_rsv->freed[0] == 0 &&
-			    block_rsv->freed[1] == 0) {
-				list_del_init(&block_rsv->list);
-				kfree(block_rsv);
-			}
-		} else {
-			btrfs_block_rsv_release(root, block_rsv, 0);
-		}
-	}
-	mutex_unlock(&fs_info->durable_block_rsv_mutex);
-
 	return 0;
 }
 
@@ -4820,36 +4775,18 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
 			ret = check_ref_cleanup(trans, root, buf->start);
 			if (!ret)
-				goto pin;
+				goto out;
 		}
 
 		if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
 			pin_down_extent(root, cache, buf->start, buf->len, 1);
-			goto pin;
+			goto out;
 		}
 
 		WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
 
 		btrfs_add_free_space(cache, buf->start, buf->len);
 		btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
-
-		goto out;
-	}
-pin:
-	if (block_rsv->durable && !cache->ro) {
-		ret = 0;
-		spin_lock(&cache->lock);
-		if (!cache->ro) {
-			cache->reserved_pinned += buf->len;
-			ret = 1;
-		}
-		spin_unlock(&cache->lock);
-
-		if (ret) {
-			spin_lock(&block_rsv->lock);
-			block_rsv->freed[trans->transid & 0x1] += buf->len;
-			spin_unlock(&block_rsv->lock);
-		}
 	}
 out:
 	/*
@@ -6705,12 +6642,9 @@ static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 		    cache->bytes_super - btrfs_block_group_used(&cache->item);
 
 	if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
-	    sinfo->bytes_may_use + sinfo->bytes_readonly +
-	    cache->reserved_pinned + num_bytes + min_allocable_bytes <=
-	    sinfo->total_bytes) {
+	    sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
+	    min_allocable_bytes <= sinfo->total_bytes) {
 		sinfo->bytes_readonly += num_bytes;
-		sinfo->bytes_may_use += cache->reserved_pinned;
-		cache->reserved_pinned = 0;
 		cache->ro = 1;
 		ret = 0;
 	}
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5f5f8a577e69..6402a41b9023 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2164,9 +2164,6 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 	}
 	spin_unlock(&root->orphan_lock);
 
-	if (block_rsv)
-		btrfs_add_durable_block_rsv(root->fs_info, block_rsv);
-
 	/* grab metadata reservation from transaction handle */
 	if (reserve) {
 		ret = btrfs_orphan_reserve_metadata(trans, inode);
@@ -6505,7 +6502,6 @@ static int btrfs_truncate(struct inode *inode)
 	rsv = btrfs_alloc_block_rsv(root);
 	if (!rsv)
 		return -ENOMEM;
-	btrfs_add_durable_block_rsv(root->fs_info, rsv);
 
 	trans = btrfs_start_transaction(root, 4);
 	if (IS_ERR(trans)) {
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 59bb1764273d..545b04358249 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3651,7 +3651,6 @@ int prepare_to_relocate(struct reloc_control *rc)
 		return ret;
 
 	rc->block_rsv->refill_used = 1;
-	btrfs_add_durable_block_rsv(rc->extent_root->fs_info, rc->block_rsv);
 
 	memset(&rc->cluster, 0, sizeof(rc->cluster));
 	rc->search_start = rc->block_group->key.objectid;
-- 
cgit v1.2.3


From 4289a667a0d7c6b134898cac7bfbe950267c305c Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 5 Aug 2011 13:22:24 -0400
Subject: Btrfs: fix how we reserve space for deleting inodes

I converted btrfs_truncate to do sane reservations for truncate, but didn't
convert btrfs_evict_inode.  Basically we need to save the orphan_rsv for
deleting the orphan item, and do normal reservations for our truncate.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/inode.c | 49 ++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 38 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6402a41b9023..fe3891e240b3 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3527,6 +3527,8 @@ void btrfs_evict_inode(struct inode *inode)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_block_rsv *rsv;
+	u64 min_size = btrfs_calc_trans_metadata_size(root, 2);
 	unsigned long nr;
 	int ret;
 
@@ -3554,22 +3556,44 @@ void btrfs_evict_inode(struct inode *inode)
 		goto no_delete;
 	}
 
+	rsv = btrfs_alloc_block_rsv(root);
+	if (!rsv) {
+		btrfs_orphan_del(NULL, inode);
+		goto no_delete;
+	}
+
 	btrfs_i_size_write(inode, 0);
 
+	/*
+	 * This is a bit simpler than btrfs_truncate since
+	 *
+	 * 1) We've already reserved our space for our orphan item in the
+	 *    unlink.
+	 * 2) We're going to delete the inode item, so we don't need to update
+	 *    it at all.
+	 *
+	 * So we just need to reserve some slack space in case we add bytes when
+	 * doing the truncate.
+	 */
 	while (1) {
-		trans = btrfs_join_transaction(root);
-		BUG_ON(IS_ERR(trans));
-		trans->block_rsv = root->orphan_block_rsv;
-
-		ret = btrfs_block_rsv_check(trans, root,
-					    root->orphan_block_rsv, 0, 5);
+		ret = btrfs_block_rsv_check(NULL, root, rsv, min_size, 0);
 		if (ret) {
-			BUG_ON(ret != -EAGAIN);
-			ret = btrfs_commit_transaction(trans, root);
-			BUG_ON(ret);
-			continue;
+			printk(KERN_WARNING "Could not get space for a "
+			       "delete, will truncate on mount\n");
+			btrfs_orphan_del(NULL, inode);
+			btrfs_free_block_rsv(root, rsv);
+			goto no_delete;
+		}
+
+		trans = btrfs_start_transaction(root, 0);
+		if (IS_ERR(trans)) {
+			btrfs_orphan_del(NULL, inode);
+			btrfs_free_block_rsv(root, rsv);
+			goto no_delete;
 		}
 
+		trans->block_rsv = rsv;
+
 		ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0);
 		if (ret != -EAGAIN)
 			break;
@@ -3578,14 +3602,17 @@ void btrfs_evict_inode(struct inode *inode)
 		btrfs_end_transaction(trans, root);
 		trans = NULL;
 		btrfs_btree_balance_dirty(root, nr);
-
 	}
 
+	btrfs_free_block_rsv(root, rsv);
+
 	if (ret == 0) {
+		trans->block_rsv = root->orphan_block_rsv;
 		ret = btrfs_orphan_del(trans, inode);
 		BUG_ON(ret);
 	}
 
+	trans->block_rsv = &root->fs_info->trans_block_rsv;
 	if (!(root == root->fs_info->tree_root ||
 	      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID))
 		btrfs_return_ino(root, btrfs_ino(inode));
-- 
cgit v1.2.3


From 6ab60601d518d563ca1a47eaa399096e69d3b64a Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 8 Aug 2011 08:24:46 -0400
Subject: Btrfs: ratelimit the generation printk for the free space cache

A user reported getting spammed when moving to 3.0 by this message.  Since we
switched to the normal checksumming infrastructure all old free space caches
will be wrong and need to be regenerated so people are likely to see this
message a lot, so ratelimit it so it doesn't fill up their logs and freak them
out.  Thanks,

Reported-by: Andrew Lutomirski <luto@mit.edu>
Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/free-space-cache.c | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 79c16a68a2bc..ecc1a4f85d20 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -20,6 +20,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/math64.h>
+#include <linux/ratelimit.h>
 #include "ctree.h"
 #include "free-space-cache.h"
 #include "transaction.h"
@@ -341,11 +342,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 
 			gen = addr;
 			if (*gen != BTRFS_I(inode)->generation) {
-				printk(KERN_ERR "btrfs: space cache generation"
-				       " (%llu) does not match inode (%llu)\n",
-				       (unsigned long long)*gen,
-				       (unsigned long long)
-				       BTRFS_I(inode)->generation);
+				printk_ratelimited(KERN_ERR "btrfs: space cache"
+					" generation (%llu) does not match "
+					"inode (%llu)\n",
+					(unsigned long long)*gen,
+					(unsigned long long)
+					BTRFS_I(inode)->generation);
 				kunmap(page);
 				unlock_page(page);
 				page_cache_release(page);
-- 
cgit v1.2.3


From dabdb6408cb801644fa613c7432da012640b348c Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 8 Aug 2011 12:50:18 -0400
Subject: Btrfs: kill unused parts of block_rsv

The priority and refill_used flags are not used anymore, and neither is the
usage counter, so just remove them from btrfs_block_rsv.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h       |  3 ---
 fs/btrfs/extent-tree.c | 23 ++++++-----------------
 fs/btrfs/relocation.c  |  2 --
 3 files changed, 6 insertions(+), 22 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index c5ceba4078cc..58a06dea4791 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -774,9 +774,6 @@ struct btrfs_block_rsv {
 	u64 reserved;
 	struct btrfs_space_info *space_info;
 	spinlock_t lock;
-	atomic_t usage;
-	unsigned int priority:8;
-	unsigned int refill_used:1;
 	unsigned int full:1;
 };
 
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 30c0558eae84..5395cc639270 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3659,8 +3659,6 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv)
 {
 	memset(rsv, 0, sizeof(*rsv));
 	spin_lock_init(&rsv->lock);
-	atomic_set(&rsv->usage, 1);
-	rsv->priority = 6;
 }
 
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
@@ -3681,10 +3679,8 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root)
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv)
 {
-	if (rsv && atomic_dec_and_test(&rsv->usage)) {
-		btrfs_block_rsv_release(root, rsv, (u64)-1);
-		kfree(rsv);
-	}
+	btrfs_block_rsv_release(root, rsv, (u64)-1);
+	kfree(rsv);
 }
 
 int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
@@ -3734,13 +3730,10 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 	if (!ret)
 		return 0;
 
-	if (block_rsv->refill_used) {
-		ret = reserve_metadata_bytes(trans, root, block_rsv,
-					     num_bytes, 0);
-		if (!ret) {
-			block_rsv_add_bytes(block_rsv, num_bytes, 0);
-			return 0;
-		}
+	ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 0);
+	if (!ret) {
+		block_rsv_add_bytes(block_rsv, num_bytes, 0);
+		return 0;
 	}
 
 	if (commit_trans) {
@@ -3859,16 +3852,12 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 
 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
 	fs_info->chunk_block_rsv.space_info = space_info;
-	fs_info->chunk_block_rsv.priority = 10;
 
 	space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
 	fs_info->global_block_rsv.space_info = space_info;
-	fs_info->global_block_rsv.priority = 10;
-	fs_info->global_block_rsv.refill_used = 1;
 	fs_info->delalloc_block_rsv.space_info = space_info;
 	fs_info->trans_block_rsv.space_info = space_info;
 	fs_info->empty_block_rsv.space_info = space_info;
-	fs_info->empty_block_rsv.priority = 10;
 
 	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 545b04358249..aeaed99e9cfe 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3650,8 +3650,6 @@ int prepare_to_relocate(struct reloc_control *rc)
 	if (ret)
 		return ret;
 
-	rc->block_rsv->refill_used = 1;
-
 	memset(&rc->cluster, 0, sizeof(rc->cluster));
 	rc->search_start = rc->block_group->key.objectid;
 	rc->extents_found = 0;
-- 
cgit v1.2.3


From 13553e5221d6901a33b3f2157a389de085c161fe Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 8 Aug 2011 13:33:21 -0400
Subject: Btrfs: don't try to commit in btrfs_block_rsv_check

We will try and reserve metadata bytes in btrfs_block_rsv_check and if we cannot
because we have a transaction open it will return EAGAIN, so we do not need to
try and commit the transaction again.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 29 ++++-------------------------
 1 file changed, 4 insertions(+), 25 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5395cc639270..6356ef2f0c80 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3708,7 +3708,6 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 			  u64 min_reserved, int min_factor)
 {
 	u64 num_bytes = 0;
-	int commit_trans = 0;
 	int ret = -ENOSPC;
 
 	if (!block_rsv)
@@ -3720,13 +3719,12 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 	if (min_reserved > num_bytes)
 		num_bytes = min_reserved;
 
-	if (block_rsv->reserved >= num_bytes) {
+	if (block_rsv->reserved >= num_bytes)
 		ret = 0;
-	} else {
+	else
 		num_bytes -= block_rsv->reserved;
-		commit_trans = 1;
-	}
 	spin_unlock(&block_rsv->lock);
+
 	if (!ret)
 		return 0;
 
@@ -3736,26 +3734,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 		return 0;
 	}
 
-	if (commit_trans) {
-		struct btrfs_space_info *sinfo = block_rsv->space_info;
-
-		if (trans)
-			return -EAGAIN;
-
-		spin_lock(&sinfo->lock);
-		if (sinfo->bytes_pinned < num_bytes) {
-			spin_unlock(&sinfo->lock);
-			return -ENOSPC;
-		}
-		spin_unlock(&sinfo->lock);
-
-		trans = btrfs_join_transaction(root);
-		BUG_ON(IS_ERR(trans));
-		ret = btrfs_commit_transaction(trans, root);
-		return 0;
-	}
-
-	return -ENOSPC;
+	return ret;
 }
 
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
-- 
cgit v1.2.3


From 907cbcebd4e5f641faf08601f216b1ceb6cb3bdf Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 8 Aug 2011 13:46:15 -0400
Subject: Btrfs: optimize how we account for space in truncate

Currently we're starting and stopping a transaction for no real reason, so kill
that and just reserve enough space as if we can truncate all in one transaction.
Also use btrfs_block_rsv_check() for our reserve to minimize the amount of space
we may have to allocate for our slack space.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/inode.c | 58 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index fe3891e240b3..4d057c084de6 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -6482,6 +6482,7 @@ static int btrfs_truncate(struct inode *inode)
 	struct btrfs_trans_handle *trans;
 	unsigned long nr;
 	u64 mask = root->sectorsize - 1;
+	u64 min_size = btrfs_calc_trans_metadata_size(root, 2);
 
 	ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
 	if (ret)
@@ -6530,17 +6531,21 @@ static int btrfs_truncate(struct inode *inode)
 	if (!rsv)
 		return -ENOMEM;
 
-	trans = btrfs_start_transaction(root, 4);
+	/*
+	 * 2 for the truncate slack space
+	 * 1 for the orphan item we're going to add
+	 * 1 for the orphan item deletion
+	 * 1 for updating the inode.
+	 */
+	trans = btrfs_start_transaction(root, 5);
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
 		goto out;
 	}
 
-	/*
-	 * Reserve space for the truncate process.  Truncate should be adding
-	 * space, but if there are snapshots it may end up using space.
-	 */
-	ret = btrfs_truncate_reserve_metadata(trans, root, rsv);
+	/* Migrate the slack space for the truncate to our reserve */
+	ret = btrfs_block_rsv_migrate(&root->fs_info->trans_block_rsv, rsv,
+				      min_size);
 	BUG_ON(ret);
 
 	ret = btrfs_orphan_add(trans, inode);
@@ -6549,21 +6554,6 @@ static int btrfs_truncate(struct inode *inode)
 		goto out;
 	}
 
-	nr = trans->blocks_used;
-	btrfs_end_transaction(trans, root);
-	btrfs_btree_balance_dirty(root, nr);
-
-	/*
-	 * Ok so we've already migrated our bytes over for the truncate, so here
-	 * just reserve the one slot we need for updating the inode.
-	 */
-	trans = btrfs_start_transaction(root, 1);
-	if (IS_ERR(trans)) {
-		err = PTR_ERR(trans);
-		goto out;
-	}
-	trans->block_rsv = rsv;
-
 	/*
 	 * setattr is responsible for setting the ordered_data_close flag,
 	 * but that is only tested during the last file release.  That
@@ -6585,20 +6575,30 @@ static int btrfs_truncate(struct inode *inode)
 		btrfs_add_ordered_operation(trans, root, inode);
 
 	while (1) {
+		ret = btrfs_block_rsv_check(trans, root, rsv, min_size, 0);
+		if (ret) {
+			/*
+			 * This can only happen with the original transaction we
+			 * started above, every other time we shouldn't have a
+			 * transaction started yet.
+			 */
+			if (ret == -EAGAIN)
+				goto end_trans;
+			err = ret;
+			break;
+		}
+
 		if (!trans) {
-			trans = btrfs_start_transaction(root, 3);
+			/* Just need the 1 for updating the inode */
+			trans = btrfs_start_transaction(root, 1);
 			if (IS_ERR(trans)) {
 				err = PTR_ERR(trans);
 				goto out;
 			}
-
-			ret = btrfs_truncate_reserve_metadata(trans, root,
-							      rsv);
-			BUG_ON(ret);
-
-			trans->block_rsv = rsv;
 		}
 
+		trans->block_rsv = rsv;
+
 		ret = btrfs_truncate_inode_items(trans, root, inode,
 						 inode->i_size,
 						 BTRFS_EXTENT_DATA_KEY);
@@ -6613,7 +6613,7 @@ static int btrfs_truncate(struct inode *inode)
 			err = ret;
 			break;
 		}
-
+end_trans:
 		nr = trans->blocks_used;
 		btrfs_end_transaction(trans, root);
 		trans = NULL;
-- 
cgit v1.2.3


From 5e962c7850c273b483acc747b41bd5cddf631049 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 8 Aug 2011 14:03:37 -0400
Subject: Btrfs: kill btrfs_truncate_reserve_metadata

Since we've optimized the truncate path, we no longer require this function.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h       |  3 ---
 fs/btrfs/extent-tree.c | 31 -------------------------------
 2 files changed, 34 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 58a06dea4791..22a9347a3908 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2240,9 +2240,6 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 void btrfs_block_rsv_release(struct btrfs_root *root,
 			     struct btrfs_block_rsv *block_rsv,
 			     u64 num_bytes);
-int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *root,
-				    struct btrfs_block_rsv *rsv);
 int btrfs_set_block_group_ro(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *cache);
 int btrfs_set_block_group_rw(struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6356ef2f0c80..d05967e9d613 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3858,37 +3858,6 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
 }
 
-int btrfs_truncate_reserve_metadata(struct btrfs_trans_handle *trans,
-				    struct btrfs_root *root,
-				    struct btrfs_block_rsv *rsv)
-{
-	struct btrfs_block_rsv *trans_rsv = &root->fs_info->trans_block_rsv;
-	u64 num_bytes;
-	int ret;
-
-	/*
-	 * Truncate should be freeing data, but give us 2 items just in case it
-	 * needs to use some space.  We may want to be smarter about this in the
-	 * future.
-	 */
-	num_bytes = btrfs_calc_trans_metadata_size(root, 2);
-
-	/* We already have enough bytes, just return */
-	if (rsv->reserved >= num_bytes)
-		return 0;
-
-	num_bytes -= rsv->reserved;
-
-	/*
-	 * You should have reserved enough space before hand to do this, so this
-	 * should not fail.
-	 */
-	ret = block_rsv_migrate_bytes(trans_rsv, rsv, num_bytes);
-	BUG_ON(ret);
-
-	return 0;
-}
-
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root)
 {
-- 
cgit v1.2.3


From 1b9c332b6c92e992b1971a08412c6f460a54b514 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 17 Aug 2011 10:19:52 -0400
Subject: Btrfs: only reserve space in fallocate if we have to do a preallocate

Lukas found a problem where if he tries to fallocate over the same region twice
and the first fallocate took up all the space we would fail with ENOSPC.  This
is because we reserve the total space we want to use for fallocate, regardless
of wether or not we will have to actually preallocate.  So instead move the
check into the loop where we actually have to do the preallocate.  Thanks,

Tested-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/file.c | 22 ++++++++++++++++------
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index e4e57d59edb7..de569af766fe 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1615,10 +1615,6 @@ static long btrfs_fallocate(struct file *file, int mode,
 			goto out;
 	}
 
-	ret = btrfs_check_data_free_space(inode, alloc_end - alloc_start);
-	if (ret)
-		goto out;
-
 	locked_end = alloc_end - 1;
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
@@ -1664,11 +1660,27 @@ static long btrfs_fallocate(struct file *file, int mode,
 		if (em->block_start == EXTENT_MAP_HOLE ||
 		    (cur_offset >= inode->i_size &&
 		     !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
+
+			/*
+			 * Make sure we have enough space before we do the
+			 * allocation.
+			 */
+			ret = btrfs_check_data_free_space(inode, last_byte -
+							  cur_offset);
+			if (ret) {
+				free_extent_map(em);
+				break;
+			}
+
 			ret = btrfs_prealloc_file_range(inode, mode, cur_offset,
 							last_byte - cur_offset,
 							1 << inode->i_blkbits,
 							offset + len,
 							&alloc_hint);
+
+			/* Let go of our reservation. */
+			btrfs_free_reserved_data_space(inode, last_byte -
+						       cur_offset);
 			if (ret < 0) {
 				free_extent_map(em);
 				break;
@@ -1694,8 +1706,6 @@ static long btrfs_fallocate(struct file *file, int mode,
 	}
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
 			     &cached_state, GFP_NOFS);
-
-	btrfs_free_reserved_data_space(inode, alloc_end - alloc_start);
 out:
 	mutex_unlock(&inode->i_mutex);
 	return ret;
-- 
cgit v1.2.3


From 07127184efb629f1336c0592bfdacec258cab731 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 19 Aug 2011 10:29:59 -0400
Subject: Btrfs: reduce the amount of space needed for truncates

With btrfs_truncate_inode_items we always return if we have to go to another
leaf, which makes us do our reservation again.  This means we will only ever
modify one leaf at a time, so we only need 1 items worth of slack space.  Also,
since we are deleting we will not be creating nodes as we go down, if anything
we'll be free'ing them as we merge them together, so make a different
calculation for truncate which will only have the worst case useage of COW'ing
the entire path down to the leaf.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h | 11 +++++++++++
 fs/btrfs/inode.c |  8 ++++----
 2 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 22a9347a3908..2e18b068841b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2125,6 +2125,17 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 		3 * num_items;
 }
 
+/*
+ * Doing a truncate won't result in new nodes or leaves, just what we need for
+ * COW.
+ */
+static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root,
+						 unsigned num_items)
+{
+	return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) *
+		num_items;
+}
+
 void btrfs_put_block_group(struct btrfs_block_group_cache *cache);
 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, unsigned long count);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4d057c084de6..8316b570db55 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3528,7 +3528,7 @@ void btrfs_evict_inode(struct inode *inode)
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_block_rsv *rsv;
-	u64 min_size = btrfs_calc_trans_metadata_size(root, 2);
+	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 	unsigned long nr;
 	int ret;
 
@@ -6482,7 +6482,7 @@ static int btrfs_truncate(struct inode *inode)
 	struct btrfs_trans_handle *trans;
 	unsigned long nr;
 	u64 mask = root->sectorsize - 1;
-	u64 min_size = btrfs_calc_trans_metadata_size(root, 2);
+	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 
 	ret = btrfs_truncate_page(inode->i_mapping, inode->i_size);
 	if (ret)
@@ -6532,12 +6532,12 @@ static int btrfs_truncate(struct inode *inode)
 		return -ENOMEM;
 
 	/*
-	 * 2 for the truncate slack space
+	 * 1 for the truncate slack space
 	 * 1 for the orphan item we're going to add
 	 * 1 for the orphan item deletion
 	 * 1 for updating the inode.
 	 */
-	trans = btrfs_start_transaction(root, 5);
+	trans = btrfs_start_transaction(root, 4);
 	if (IS_ERR(trans)) {
 		err = PTR_ERR(trans);
 		goto out;
-- 
cgit v1.2.3


From 482e6dc5261406fdb921946e70b51467b0305bad Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 19 Aug 2011 10:31:56 -0400
Subject: Btrfs: allow callers to specify if flushing can occur for
 btrfs_block_rsv_check

If you run xfstest 224 it you will get lots of messages about not being able to
delete inodes and that they will be cleaned up next mount.  This is because
btrfs_block_rsv_check was not calling reserve_metadata_bytes with the ability to
flush, so if there was not enough space, it simply failed.  But in truncate and
evict case we could easily flush space to try and get enough space to do our
work, so make btrfs_block_rsv_check take a flush argument to pass down to
reserve_metadata_bytes.  Now xfstests 224 runs fine without all those
complaints.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h            | 2 +-
 fs/btrfs/extent-tree.c      | 4 ++--
 fs/btrfs/free-space-cache.c | 2 +-
 fs/btrfs/inode.c            | 6 +++---
 fs/btrfs/relocation.c       | 4 ++--
 fs/btrfs/transaction.c      | 2 +-
 6 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 2e18b068841b..caa73cd8c00a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2244,7 +2244,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv,
-			  u64 min_reserved, int min_factor);
+			  u64 min_reserved, int min_factor, int flush);
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d05967e9d613..a71fcf506531 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3705,7 +3705,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv,
-			  u64 min_reserved, int min_factor)
+			  u64 min_reserved, int min_factor, int flush)
 {
 	u64 num_bytes = 0;
 	int ret = -ENOSPC;
@@ -3728,7 +3728,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 	if (!ret)
 		return 0;
 
-	ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 0);
+	ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, flush);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
 		return 0;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ecc1a4f85d20..b0122e19db6b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -199,7 +199,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 	trans->block_rsv = root->orphan_block_rsv;
 	ret = btrfs_block_rsv_check(trans, root,
 				    root->orphan_block_rsv,
-				    0, 5);
+				    0, 5, 0);
 	if (ret)
 		return ret;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8316b570db55..e40b9239660d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3576,10 +3576,10 @@ void btrfs_evict_inode(struct inode *inode)
 	 * doing the truncate.
 	 */
 	while (1) {
-		ret = btrfs_block_rsv_check(NULL, root, rsv, min_size, 0);
+		ret = btrfs_block_rsv_check(NULL, root, rsv, min_size, 0, 1);
 		if (ret) {
 			printk(KERN_WARNING "Could not get space for a "
-			       "delete, will truncate on mount\n");
+			       "delete, will truncate on mount %d\n", ret);
 			btrfs_orphan_del(NULL, inode);
 			btrfs_free_block_rsv(root, rsv);
 			goto no_delete;
@@ -6575,7 +6575,7 @@ static int btrfs_truncate(struct inode *inode)
 		btrfs_add_ordered_operation(trans, root, inode);
 
 	while (1) {
-		ret = btrfs_block_rsv_check(trans, root, rsv, min_size, 0);
+		ret = btrfs_block_rsv_check(trans, root, rsv, min_size, 0, 1);
 		if (ret) {
 			/*
 			 * This can only happen with the original transaction we
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index aeaed99e9cfe..fd9ac66434b0 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2042,7 +2042,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 		trans->block_rsv = rc->block_rsv;
 
 		ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
-					    min_reserved, 0);
+					    min_reserved, 0, 0);
 		if (ret) {
 			BUG_ON(ret != -EAGAIN);
 			ret = btrfs_commit_transaction(trans, root);
@@ -3775,7 +3775,7 @@ restart:
 		}
 
 		ret = btrfs_block_rsv_check(trans, rc->extent_root,
-					    rc->block_rsv, 0, 5);
+					    rc->block_rsv, 0, 5, 0);
 		if (ret < 0) {
 			if (ret != -EAGAIN) {
 				err = ret;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 3e20cc8c1c06..a1d8c322c1ba 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -419,7 +419,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
 {
 	int ret;
 	ret = btrfs_block_rsv_check(trans, root,
-				    &root->fs_info->global_block_rsv, 0, 5);
+				    &root->fs_info->global_block_rsv, 0, 5, 0);
 	return ret ? 1 : 0;
 }
 
-- 
cgit v1.2.3


From a9b5fcddce594a408a48d523087b5bb64ce82469 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 19 Aug 2011 12:06:12 -0400
Subject: Btrfs: fix call to btrfs_search_slot in free space cache

We are setting ins_len to 1 even tho we are just modifying an item that should
be there already.  This may cause the search stuff to split nodes on the way
down needelessly.  Set this to 0 since we aren't inserting anything.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/free-space-cache.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index b0122e19db6b..701ef5951e3f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -802,7 +802,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	key.offset = offset;
 	key.type = 0;
 
-	ret = btrfs_search_slot(trans, root, &key, path, 1, 1);
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 	if (ret < 0) {
 		ret = -1;
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
-- 
cgit v1.2.3


From 7ed49f187c82821e35f8869399bcf90822a74a23 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 19 Aug 2011 15:45:52 -0400
Subject: Btrfs: fix space leak when we fail to make an allocation

When changing back to using a spin_lock to protect the extent counters I decided
that since we would only be dropping our original extent, it was ok to just drop
the extent and return.  However since somebody else could have come in and done
a reservation, we need to do the normal song and dance to clear the reservation
out properly.  So calculate how much space we need to free, and then subtract
what we just attempted to reserve.  If it's more then we know we need to drop
those bytes from the delalloc block rsv.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a71fcf506531..99ab5716baad 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4025,16 +4025,24 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 
 	ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
 	if (ret) {
+		u64 to_free = 0;
 		unsigned dropped;
-		/*
-		 * We don't need the return value since our reservation failed,
-		 * we just need to clean up our counter.
-		 */
+
 		spin_lock(&BTRFS_I(inode)->lock);
 		dropped = drop_outstanding_extent(inode);
-		WARN_ON(dropped > 1);
-		BTRFS_I(inode)->csum_bytes -= num_bytes;
+		to_free = calc_csum_metadata_size(inode, num_bytes, 0);
 		spin_unlock(&BTRFS_I(inode)->lock);
+		to_free += btrfs_calc_trans_metadata_size(root, dropped);
+
+		/*
+		 * Somebody could have come in and twiddled with the
+		 * reservation, so if we have to free more than we would have
+		 * reserved from this reservation go ahead and release those
+		 * bytes.
+		 */
+		to_free -= to_reserve;
+		if (to_free)
+			btrfs_block_rsv_release(root, block_rsv, to_free);
 		return ret;
 	}
 
-- 
cgit v1.2.3


From 7f70150896ebd1169d9c43484c8c424f755353c4 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 22 Aug 2011 15:23:19 -0400
Subject: Btrfs: don't increase the block_rsv's size when emergency allocating
 space

If we have to emergency reserve space we need to not increase the block_rsv
size, otherwise we'll leak space.  Take for instance delalloc, say we reserve
4k, and we use that 4k, and then we have to emergency allocate another 4k, we
bump the size up to 8k, however we've only accounted for 4k in reservations in
all of our supporting logic, so we'll go to free the 4k and end up having a size
of 4k, which will cause us to later not free as much space.  I saw this doing
testing where I wasn't reserving enough space for something but was still
leaking space, very frustrating.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 99ab5716baad..1f1d3e8dcec9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -5707,9 +5707,6 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 		ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
 					     0);
 		if (!ret) {
-			spin_lock(&block_rsv->lock);
-			block_rsv->size += blocksize;
-			spin_unlock(&block_rsv->lock);
 			return block_rsv;
 		} else if (ret && block_rsv != global_rsv) {
 			ret = block_rsv_use_bytes(global_rsv, blocksize);
-- 
cgit v1.2.3


From 4a33854257764c2ec6337ee0c8ecafb64f8e29e1 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 29 Aug 2011 11:01:31 -0400
Subject: Btrfs: set truncate block rsv's size

While debugging a different issue I noticed that we were always reserving space
when we tried to use our truncate block rsv's.  This is because they didn't have
a ->size value, so use_block_rsv just assumes there is nothing reserved and it
does a reserve_metadata_bytes.  This is because btrfs_check_block_rsv() doesn't
actually add to the size of the block rsv.  That seems to be the right thing to
do so set ->size to the minimum truncate size we need, since we will always only
refill to that size anyway, and this way everything works out correctly.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/inode.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e40b9239660d..06ae5b173fd7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3561,6 +3561,7 @@ void btrfs_evict_inode(struct inode *inode)
 		btrfs_orphan_del(NULL, inode);
 		goto no_delete;
 	}
+	rsv->size = min_size;
 
 	btrfs_i_size_write(inode, 0);
 
@@ -6530,6 +6531,7 @@ static int btrfs_truncate(struct inode *inode)
 	rsv = btrfs_alloc_block_rsv(root);
 	if (!rsv)
 		return -ENOMEM;
+	rsv->size = min_size;
 
 	/*
 	 * 1 for the truncate slack space
-- 
cgit v1.2.3


From 300e4f8a56f263797568c95b71c949f9f02e4534 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 29 Aug 2011 14:06:00 -0400
Subject: Btrfs: put the block group cache after we commit the super

In moving some enospc stuff around I noticed that when we unmount we are often
evicting the free space cache inodes before we do our last commit.  This isn't
bad, but it makes us constantly have to re-read the inodes back.  So instead
don't evict the cache until after we do our last commit, this will make things a
little less crappy and makes a future enospc change work properly.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/disk-io.c          | 4 ++--
 fs/btrfs/free-space-cache.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0b5643a68d57..4965a0179b31 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2543,8 +2543,6 @@ int close_ctree(struct btrfs_root *root)
 	/* clear out the rbtree of defraggable inodes */
 	btrfs_run_defrag_inodes(root->fs_info);
 
-	btrfs_put_block_group_cache(fs_info);
-
 	/*
 	 * Here come 2 situations when btrfs is broken to flip readonly:
 	 *
@@ -2570,6 +2568,8 @@ int close_ctree(struct btrfs_root *root)
 			printk(KERN_ERR "btrfs: commit super ret %d\n", ret);
 	}
 
+	btrfs_put_block_group_cache(fs_info);
+
 	kthread_stop(root->fs_info->transaction_kthread);
 	kthread_stop(root->fs_info->cleaner_kthread);
 
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 701ef5951e3f..1ea10731797a 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -105,7 +105,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
 		block_group->disk_cache_state = BTRFS_DC_CLEAR;
 	}
 
-	if (!btrfs_fs_closing(root->fs_info)) {
+	if (!block_group->iref) {
 		block_group->inode = igrab(inode);
 		block_group->iref = 1;
 	}
-- 
cgit v1.2.3


From c09544e07f8cdc455ed8615d4c067d694c33bd18 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 30 Aug 2011 10:19:10 -0400
Subject: Btrfs: handle enospc accounting for free space inodes

Since free space inodes now use normal checksumming we need to make sure to
account for their metadata use.  So reserve metadata space, and then if we fail
to write out the metadata we can just release it, otherwise it will be freed up
when the io completes.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c      | 18 +++++++++++++-----
 fs/btrfs/free-space-cache.c | 44 +++++++++++++++++++++++++++++---------------
 fs/btrfs/inode-map.c        |  6 ++++--
 fs/btrfs/inode.c            |  2 +-
 4 files changed, 47 insertions(+), 23 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1f1d3e8dcec9..ccdc4d12e8d4 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2755,16 +2755,20 @@ again:
 	num_pages *= 16;
 	num_pages *= PAGE_CACHE_SIZE;
 
-	ret = btrfs_check_data_free_space(inode, num_pages);
+	ret = btrfs_delalloc_reserve_space(inode, num_pages);
 	if (ret)
 		goto out_put;
 
 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
 					      num_pages, num_pages,
 					      &alloc_hint);
-	if (!ret)
+	if (!ret) {
 		dcs = BTRFS_DC_SETUP;
-	btrfs_free_reserved_data_space(inode, num_pages);
+		btrfs_free_reserved_data_space(inode, num_pages);
+	} else {
+		btrfs_delalloc_release_space(inode, num_pages);
+	}
+
 out_put:
 	iput(inode);
 out_free:
@@ -4002,9 +4006,13 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
 	u64 to_reserve = 0;
 	unsigned nr_extents = 0;
+	int flush = 1;
 	int ret;
 
-	if (btrfs_transaction_in_commit(root->fs_info))
+	if (btrfs_is_free_space_inode(root, inode))
+		flush = 0;
+
+	if (flush && btrfs_transaction_in_commit(root->fs_info))
 		schedule_timeout(1);
 
 	num_bytes = ALIGN(num_bytes, root->sectorsize);
@@ -4023,7 +4031,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
 	spin_unlock(&BTRFS_I(inode)->lock);
 
-	ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, 1);
+	ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, flush);
 	if (ret) {
 		u64 to_free = 0;
 		unsigned dropped;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 1ea10731797a..3bde17ff14c0 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -532,6 +532,19 @@ out:
 	return ret;
 }
 
+/**
+ * __btrfs_write_out_cache - write out cached info to an inode
+ * @root - the root the inode belongs to
+ * @ctl - the free space cache we are going to write out
+ * @block_group - the block_group for this cache if it belongs to a block_group
+ * @trans - the trans handle
+ * @path - the path to use
+ * @offset - the offset for the key we'll insert
+ *
+ * This function writes out a free space cache struct to disk for quick recovery
+ * on mount.  This will return 0 if it was successfull in writing the cache out,
+ * and -1 if it was not.
+ */
 int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 			    struct btrfs_free_space_ctl *ctl,
 			    struct btrfs_block_group_cache *block_group,
@@ -555,7 +568,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	int index = 0, num_pages = 0;
 	int entries = 0;
 	int bitmaps = 0;
-	int ret = -1;
+	int ret;
+	int err = -1;
 	bool next_page = false;
 	bool out_of_space = false;
 
@@ -563,7 +577,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 
 	node = rb_first(&ctl->free_space_offset);
 	if (!node)
-		return 0;
+		return -1;
 
 	if (!i_size_read(inode))
 		return -1;
@@ -767,7 +781,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 		unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
 				     i_size_read(inode) - 1, &cached_state,
 				     GFP_NOFS);
-		ret = 0;
 		goto out;
 	}
 
@@ -789,10 +802,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
 			     i_size_read(inode) - 1, &cached_state, GFP_NOFS);
 
-	if (ret) {
-		ret = 0;
+	if (ret)
 		goto out;
-	}
 
 	BTRFS_I(inode)->generation = trans->transid;
 
@@ -804,7 +815,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 
 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 	if (ret < 0) {
-		ret = -1;
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
 				 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
@@ -818,7 +828,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
 		    found_key.offset != offset) {
-			ret = -1;
 			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
 					 EXTENT_DIRTY | EXTENT_DELALLOC |
 					 EXTENT_DO_ACCOUNTING, 0, 0, NULL,
@@ -835,16 +844,15 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(path);
 
-	ret = 1;
-
+	err = 0;
 out:
 	kfree(pages);
-	if (ret != 1) {
+	if (err) {
 		invalidate_inode_pages2_range(inode->i_mapping, 0, index);
 		BTRFS_I(inode)->generation = 0;
 	}
 	btrfs_update_inode(trans, root, inode);
-	return ret;
+	return err;
 }
 
 int btrfs_write_out_cache(struct btrfs_root *root,
@@ -871,14 +879,16 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 
 	ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
 				      path, block_group->key.objectid);
-	if (ret < 0) {
+	if (ret) {
+		btrfs_delalloc_release_metadata(inode, inode->i_size);
 		spin_lock(&block_group->lock);
 		block_group->disk_cache_state = BTRFS_DC_ERROR;
 		spin_unlock(&block_group->lock);
 		ret = 0;
-
+#ifdef DEBUG
 		printk(KERN_ERR "btrfs: failed to write free space cace "
 		       "for block group %llu\n", block_group->key.objectid);
+#endif
 	}
 
 	iput(inode);
@@ -2662,9 +2672,13 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
 		return 0;
 
 	ret = __btrfs_write_out_cache(root, inode, ctl, NULL, trans, path, 0);
-	if (ret < 0)
+	if (ret) {
+		btrfs_delalloc_release_metadata(inode, inode->i_size);
+#ifdef DEBUG
 		printk(KERN_ERR "btrfs: failed to write free ino cache "
 		       "for root %llu\n", root->root_key.objectid);
+#endif
+	}
 
 	iput(inode);
 	return ret;
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index b4087e0fa871..53dcbdf446cd 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -465,14 +465,16 @@ again:
 	/* Just to make sure we have enough space */
 	prealloc += 8 * PAGE_CACHE_SIZE;
 
-	ret = btrfs_check_data_free_space(inode, prealloc);
+	ret = btrfs_delalloc_reserve_space(inode, prealloc);
 	if (ret)
 		goto out_put;
 
 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, prealloc,
 					      prealloc, prealloc, &alloc_hint);
-	if (ret)
+	if (ret) {
+		btrfs_delalloc_release_space(inode, prealloc);
 		goto out_put;
+	}
 	btrfs_free_reserved_data_space(inode, prealloc);
 
 out_put:
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 06ae5b173fd7..78b5ae59ac4f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1792,11 +1792,11 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	}
 	ret = 0;
 out:
+	btrfs_delalloc_release_metadata(inode, ordered_extent->len);
 	if (nolock) {
 		if (trans)
 			btrfs_end_transaction_nolock(trans, root);
 	} else {
-		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
 		if (trans)
 			btrfs_end_transaction(trans, root);
 	}
-- 
cgit v1.2.3


From 4c13d758b7e79c14a0026c1f783f0c79e339b7bb Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 30 Aug 2011 11:31:29 -0400
Subject: Btrfs: use the transactions block_rsv for the csum root

The alloc warnings everybody has been seeing is because we have been reserving
space for csums, but we weren't actually using that space.  So make
get_block_rsv() return the trans->block_rsv if we're modifying the csum root.
Also set the trans->block_rsv to NULL so that if we modify the csum root when
running delayed ref's that comes out of the global reserve like it's supposed
to.  With this patch I'm not seeing those alloc warnings anymore.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 15 +++++++++------
 fs/btrfs/transaction.c |  1 +
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ccdc4d12e8d4..53f6dbdab510 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3565,10 +3565,12 @@ out:
 static struct btrfs_block_rsv *get_block_rsv(struct btrfs_trans_handle *trans,
 					     struct btrfs_root *root)
 {
-	struct btrfs_block_rsv *block_rsv;
-	if (root->ref_cows)
+	struct btrfs_block_rsv *block_rsv = NULL;
+
+	if (root->ref_cows || root == root->fs_info->csum_root)
 		block_rsv = trans->block_rsv;
-	else
+
+	if (!block_rsv)
 		block_rsv = root->block_rsv;
 
 	if (!block_rsv)
@@ -3865,12 +3867,13 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root)
 {
+	struct btrfs_block_rsv *block_rsv;
+
 	if (!trans->bytes_reserved)
 		return;
 
-	BUG_ON(trans->block_rsv != &root->fs_info->trans_block_rsv);
-	btrfs_block_rsv_release(root, trans->block_rsv,
-				trans->bytes_reserved);
+	block_rsv = &root->fs_info->trans_block_rsv;
+	btrfs_block_rsv_release(root, block_rsv, trans->bytes_reserved);
 	trans->bytes_reserved = 0;
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a1d8c322c1ba..a770f4bd9d31 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -453,6 +453,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		return 0;
 	}
 
+	trans->block_rsv = NULL;
 	while (count < 4) {
 		unsigned long cur = trans->delayed_ref_updates;
 		trans->delayed_ref_updates = 0;
-- 
cgit v1.2.3


From d02c9955ded7fc56dd1edc987558b084ccb03eb4 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 30 Aug 2011 11:40:48 -0400
Subject: Btrfs: don't get the block_rsv in btrfs_free_tree_block

Since the durable block rsv stuff has been killed there is no need to get the
block_rsv in btrfs_free_tree_block anymore.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 53f6dbdab510..d236df790156 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4707,7 +4707,6 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 			   struct extent_buffer *buf,
 			   u64 parent, int last_ref)
 {
-	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_block_group_cache *cache = NULL;
 	int ret;
 
@@ -4722,10 +4721,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
 	if (!last_ref)
 		return;
 
-	block_rsv = get_block_rsv(trans, root);
 	cache = btrfs_lookup_block_group(root->fs_info, buf->start);
-	if (block_rsv->space_info != cache->space_info)
-		goto out;
 
 	if (btrfs_header_generation(buf) == trans->transid) {
 		if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
-- 
cgit v1.2.3


From 4a92b1b8d2810db4ea0c34616b94c0b3810fa027 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 30 Aug 2011 12:34:28 -0400
Subject: Btrfs: stop passing a trans handle all around the reservation code

The only thing that we need to have a trans handle for is in
reserve_metadata_bytes and thats to know how much flushing we can do.  So
instead of passing it around, just check current->journal_info for a
trans_handle so we know if we can commit a transaction to try and free up space
or not.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h            |  6 ++----
 fs/btrfs/extent-tree.c      | 45 +++++++++++++++++++++++----------------------
 fs/btrfs/free-space-cache.c |  4 +---
 fs/btrfs/inode.c            |  4 ++--
 fs/btrfs/relocation.c       | 15 +++++++--------
 fs/btrfs/transaction.c      |  8 ++++----
 6 files changed, 39 insertions(+), 43 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index caa73cd8c00a..a5faf8e33baa 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2237,12 +2237,10 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root);
 void btrfs_free_block_rsv(struct btrfs_root *root,
 			  struct btrfs_block_rsv *rsv);
-int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
 			u64 num_bytes);
-int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
+int btrfs_block_rsv_check(struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv,
 			  u64 min_reserved, int min_factor, int flush);
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index d236df790156..9bb71597aa54 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3404,29 +3404,34 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	return reclaimed >= to_reclaim;
 }
 
-/*
- * Retries tells us how many times we've called reserve_metadata_bytes.  The
- * idea is if this is the first call (retries == 0) then we will add to our
- * reserved count if we can't make the allocation in order to hold our place
- * while we go and try and free up space.  That way for retries > 1 we don't try
- * and add space, we just check to see if the amount of unused space is >= the
- * total space, meaning that our reservation is valid.
+/**
+ * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
+ * @root - the root we're allocating for
+ * @block_rsv - the block_rsv we're allocating for
+ * @orig_bytes - the number of bytes we want
+ * @flush - wether or not we can flush to make our reservation
  *
- * However if we don't intend to retry this reservation, pass -1 as retries so
- * that it short circuits this logic.
+ * This will reserve orgi_bytes number of bytes from the space info associated
+ * with the block_rsv.  If there is not enough space it will make an attempt to
+ * flush out space to make room.  It will do this by flushing delalloc if
+ * possible or committing the transaction.  If flush is 0 then no attempts to
+ * regain reservations will be made and this will fail if there is not enough
+ * space already.
  */
-static int reserve_metadata_bytes(struct btrfs_trans_handle *trans,
-				  struct btrfs_root *root,
+static int reserve_metadata_bytes(struct btrfs_root *root,
 				  struct btrfs_block_rsv *block_rsv,
 				  u64 orig_bytes, int flush)
 {
 	struct btrfs_space_info *space_info = block_rsv->space_info;
+	struct btrfs_trans_handle *trans;
 	u64 unused;
 	u64 num_bytes = orig_bytes;
 	int retries = 0;
 	int ret = 0;
 	bool committed = false;
 	bool flushing = false;
+
+	trans = (struct btrfs_trans_handle *)current->journal_info;
 again:
 	ret = 0;
 	spin_lock(&space_info->lock);
@@ -3689,8 +3694,7 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
 	kfree(rsv);
 }
 
-int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
-			struct btrfs_root *root,
+int btrfs_block_rsv_add(struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
 			u64 num_bytes)
 {
@@ -3699,7 +3703,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 	if (num_bytes == 0)
 		return 0;
 
-	ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, 1);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
 		return 0;
@@ -3708,8 +3712,7 @@ int btrfs_block_rsv_add(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
-int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
-			  struct btrfs_root *root,
+int btrfs_block_rsv_check(struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv,
 			  u64 min_reserved, int min_factor, int flush)
 {
@@ -3734,7 +3737,7 @@ int btrfs_block_rsv_check(struct btrfs_trans_handle *trans,
 	if (!ret)
 		return 0;
 
-	ret = reserve_metadata_bytes(trans, root, block_rsv, num_bytes, flush);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
 		return 0;
@@ -4034,7 +4037,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
 	spin_unlock(&BTRFS_I(inode)->lock);
 
-	ret = reserve_metadata_bytes(NULL, root, block_rsv, to_reserve, flush);
+	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
 	if (ret) {
 		u64 to_free = 0;
 		unsigned dropped;
@@ -5689,8 +5692,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	block_rsv = get_block_rsv(trans, root);
 
 	if (block_rsv->size == 0) {
-		ret = reserve_metadata_bytes(trans, root, block_rsv,
-					     blocksize, 0);
+		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
 		/*
 		 * If we couldn't reserve metadata bytes try and use some from
 		 * the global reserve.
@@ -5711,8 +5713,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 		return block_rsv;
 	if (ret) {
 		WARN_ON(1);
-		ret = reserve_metadata_bytes(trans, root, block_rsv, blocksize,
-					     0);
+		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
 		if (!ret) {
 			return block_rsv;
 		} else if (ret && block_rsv != global_rsv) {
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 3bde17ff14c0..ffc42ef44711 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -197,9 +197,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 
 	rsv = trans->block_rsv;
 	trans->block_rsv = root->orphan_block_rsv;
-	ret = btrfs_block_rsv_check(trans, root,
-				    root->orphan_block_rsv,
-				    0, 5, 0);
+	ret = btrfs_block_rsv_check(root, root->orphan_block_rsv, 0, 5, 0);
 	if (ret)
 		return ret;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 78b5ae59ac4f..2947e94947b5 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3577,7 +3577,7 @@ void btrfs_evict_inode(struct inode *inode)
 	 * doing the truncate.
 	 */
 	while (1) {
-		ret = btrfs_block_rsv_check(NULL, root, rsv, min_size, 0, 1);
+		ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1);
 		if (ret) {
 			printk(KERN_WARNING "Could not get space for a "
 			       "delete, will truncate on mount %d\n", ret);
@@ -6577,7 +6577,7 @@ static int btrfs_truncate(struct inode *inode)
 		btrfs_add_ordered_operation(trans, root, inode);
 
 	while (1) {
-		ret = btrfs_block_rsv_check(trans, root, rsv, min_size, 0, 1);
+		ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1);
 		if (ret) {
 			/*
 			 * This can only happen with the original transaction we
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index fd9ac66434b0..3ab67409f90f 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2041,8 +2041,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 		BUG_ON(IS_ERR(trans));
 		trans->block_rsv = rc->block_rsv;
 
-		ret = btrfs_block_rsv_check(trans, root, rc->block_rsv,
-					    min_reserved, 0, 0);
+		ret = btrfs_block_rsv_check(root, rc->block_rsv, min_reserved,
+					    0, 0);
 		if (ret) {
 			BUG_ON(ret != -EAGAIN);
 			ret = btrfs_commit_transaction(trans, root);
@@ -2152,8 +2152,7 @@ int prepare_to_merge(struct reloc_control *rc, int err)
 again:
 	if (!err) {
 		num_bytes = rc->merging_rsv_size;
-		ret = btrfs_block_rsv_add(NULL, root, rc->block_rsv,
-					  num_bytes);
+		ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
 		if (ret)
 			err = ret;
 	}
@@ -2427,7 +2426,7 @@ static int reserve_metadata_space(struct btrfs_trans_handle *trans,
 	num_bytes = calcu_metadata_size(rc, node, 1) * 2;
 
 	trans->block_rsv = rc->block_rsv;
-	ret = btrfs_block_rsv_add(trans, root, rc->block_rsv, num_bytes);
+	ret = btrfs_block_rsv_add(root, rc->block_rsv, num_bytes);
 	if (ret) {
 		if (ret == -EAGAIN)
 			rc->commit_transaction = 1;
@@ -3645,7 +3644,7 @@ int prepare_to_relocate(struct reloc_control *rc)
 	 * btrfs_init_reloc_root will use them when there
 	 * is no reservation in transaction handle.
 	 */
-	ret = btrfs_block_rsv_add(NULL, rc->extent_root, rc->block_rsv,
+	ret = btrfs_block_rsv_add(rc->extent_root, rc->block_rsv,
 				  rc->extent_root->nodesize * 256);
 	if (ret)
 		return ret;
@@ -3774,8 +3773,8 @@ restart:
 			}
 		}
 
-		ret = btrfs_block_rsv_check(trans, rc->extent_root,
-					    rc->block_rsv, 0, 5, 0);
+		ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 0,
+					    5, 0);
 		if (ret < 0) {
 			if (ret != -EAGAIN) {
 				err = ret;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a770f4bd9d31..8d6f4c78f73f 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -275,7 +275,7 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
 	 */
 	if (num_items > 0 && root != root->fs_info->chunk_root) {
 		num_bytes = btrfs_calc_trans_metadata_size(root, num_items);
-		ret = btrfs_block_rsv_add(NULL, root,
+		ret = btrfs_block_rsv_add(root,
 					  &root->fs_info->trans_block_rsv,
 					  num_bytes);
 		if (ret)
@@ -418,8 +418,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root)
 {
 	int ret;
-	ret = btrfs_block_rsv_check(trans, root,
-				    &root->fs_info->global_block_rsv, 0, 5, 0);
+	ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 0,
+				    5, 0);
 	return ret ? 1 : 0;
 }
 
@@ -914,7 +914,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	btrfs_reloc_pre_snapshot(trans, pending, &to_reserve);
 
 	if (to_reserve > 0) {
-		ret = btrfs_block_rsv_add(trans, root, &pending->block_rsv,
+		ret = btrfs_block_rsv_add(root, &pending->block_rsv,
 					  to_reserve);
 		if (ret) {
 			pending->error = ret;
-- 
cgit v1.2.3


From 9c8d86db9aee6f85866d480e0f9b37817264814c Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 19 Sep 2011 11:58:54 -0400
Subject: Btrfs: make sure to unset trans->block_rsv before running delayed
 refs

Checksums are charged in 2 different ways.  The first case is when we're writing
to the disk, we account for the new checksums with the delalloc block rsv.  In
order for this to work we check if we're allocating a block for the csum root
and if trans->block_rsv == the delalloc block rsv.  But when we're deleting the
csums because of cow, this is charged to the global block rsv, and is done when
we run the delayed refs.  So we need to make sure that trans->block_rsv == NULL
when running the delayed refs.  So set it to NULL and reset it in
should_end_transaction, and set it to NULL in commit_transaction.  This got rid
of the ridiculous amount of warnings I was seeing when trying to do a balance.
Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/transaction.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 8d6f4c78f73f..7debbf396ef3 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -427,17 +427,26 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root)
 {
 	struct btrfs_transaction *cur_trans = trans->transaction;
+	struct btrfs_block_rsv *rsv = trans->block_rsv;
 	int updates;
 
 	smp_mb();
 	if (cur_trans->blocked || cur_trans->delayed_refs.flushing)
 		return 1;
 
+	/*
+	 * We need to do this in case we're deleting csums so the global block
+	 * rsv get's used instead of the csum block rsv.
+	 */
+	trans->block_rsv = NULL;
+
 	updates = trans->delayed_ref_updates;
 	trans->delayed_ref_updates = 0;
 	if (updates)
 		btrfs_run_delayed_refs(trans, root, updates);
 
+	trans->block_rsv = rsv;
+
 	return should_end_transaction(trans, root);
 }
 
@@ -1167,6 +1176,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_run_ordered_operations(root, 0);
 
+	trans->block_rsv = NULL;
+
 	/* make a pass through all the delayed refs we have so far
 	 * any runnings procs may add more while we are here
 	 */
-- 
cgit v1.2.3


From 455757c322cc0a0f2a692c5625dd88aaf6a7b889 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 19 Sep 2011 12:26:24 -0400
Subject: Btrfs: delay iput when deleting a block group

I kept getting warnings from evict because we were calling
btrfs_start_transaction() with a transaction already started when doing a
balance.  This is because we remove a block group which requires a transaction,
and the put the last reference on the cache inode.  Instead of doing this we
need to delay the iput so it is done not within a transaction having started.
This gets rid of our warnings.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 9bb71597aa54..5498bdacd4c3 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7266,7 +7266,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 			spin_unlock(&block_group->lock);
 		}
 		/* One for our lookup ref */
-		iput(inode);
+		btrfs_add_delayed_iput(inode);
 	}
 
 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
-- 
cgit v1.2.3


From 3b16a4e3c355ee3c790473decfcf83d4faeb8ce0 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 21 Sep 2011 15:05:58 -0400
Subject: Btrfs: use the inode's mapping mask for allocating pages

Johannes pointed out we were allocating only kernel pages for doing writes,
which is kind of a big deal if you are on 32bit and have more than a gig of ram.
So fix our allocations to use the mapping's gfp but still clear __GFP_FS so we
don't re-enter.  Thanks,

Reported-by: Johannes Weiner <jweiner@redhat.com>
Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h            | 6 ++++++
 fs/btrfs/file.c             | 3 ++-
 fs/btrfs/free-space-cache.c | 6 ++++--
 fs/btrfs/inode.c            | 3 ++-
 fs/btrfs/ioctl.c            | 3 ++-
 fs/btrfs/relocation.c       | 3 ++-
 6 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index a5faf8e33baa..47dea7118e0e 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -30,6 +30,7 @@
 #include <linux/kobject.h>
 #include <trace/events/btrfs.h>
 #include <asm/kmap_types.h>
+#include <linux/pagemap.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -2117,6 +2118,11 @@ static inline bool btrfs_mixed_space_info(struct btrfs_space_info *space_info)
 		(space_info->flags & BTRFS_BLOCK_GROUP_DATA));
 }
 
+static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping)
+{
+	return mapping_gfp_mask(mapping) & ~__GFP_FS;
+}
+
 /* extent-tree.c */
 static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root,
 						 unsigned num_items)
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index de569af766fe..f2e928289600 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1069,6 +1069,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 	int i;
 	unsigned long index = pos >> PAGE_CACHE_SHIFT;
 	struct inode *inode = fdentry(file)->d_inode;
+	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
 	int err = 0;
 	int faili = 0;
 	u64 start_pos;
@@ -1080,7 +1081,7 @@ static noinline int prepare_pages(struct btrfs_root *root, struct file *file,
 again:
 	for (i = 0; i < num_pages; i++) {
 		pages[i] = find_or_create_page(inode->i_mapping, index + i,
-					       GFP_NOFS);
+					       mask);
 		if (!pages[i]) {
 			faili = i - 1;
 			err = -ENOMEM;
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index ffc42ef44711..0a8ccdbdd464 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -254,6 +254,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 	u64 num_bitmaps;
 	u64 generation;
 	pgoff_t index = 0;
+	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
 	int ret = 0;
 
 	INIT_LIST_HEAD(&bitmaps);
@@ -310,7 +311,7 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 		if (!num_entries && !num_bitmaps)
 			break;
 
-		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+		page = find_or_create_page(inode->i_mapping, index, mask);
 		if (!page)
 			goto free_cache;
 
@@ -563,6 +564,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	u64 start, end, len;
 	u64 bytes = 0;
 	u32 crc = ~(u32)0;
+	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
 	int index = 0, num_pages = 0;
 	int entries = 0;
 	int bitmaps = 0;
@@ -612,7 +614,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	 * know and don't freak out.
 	 */
 	while (index < num_pages) {
-		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+		page = find_or_create_page(inode->i_mapping, index, mask);
 		if (!page) {
 			int i;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 2947e94947b5..88e3956be57d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3282,6 +3282,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	struct page *page;
+	gfp_t mask = btrfs_alloc_write_mask(mapping);
 	int ret = 0;
 	u64 page_start;
 	u64 page_end;
@@ -3294,7 +3295,7 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
 
 	ret = -ENOMEM;
 again:
-	page = find_or_create_page(mapping, index, GFP_NOFS);
+	page = find_or_create_page(mapping, index, mask);
 	if (!page) {
 		btrfs_delalloc_release_space(inode, PAGE_CACHE_SIZE);
 		goto out;
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 538f65a79ec5..24fd75bb0f96 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -843,6 +843,7 @@ static int cluster_pages_for_defrag(struct inode *inode,
 	int i_done;
 	struct btrfs_ordered_extent *ordered;
 	struct extent_state *cached_state = NULL;
+	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
 
 	if (isize == 0)
 		return 0;
@@ -860,7 +861,7 @@ again:
 	for (i = 0; i < num_pages; i++) {
 		struct page *page;
 		page = find_or_create_page(inode->i_mapping,
-					    start_index + i, GFP_NOFS);
+					    start_index + i, mask);
 		if (!page)
 			break;
 
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 3ab67409f90f..7fa090fa0d39 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2921,6 +2921,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
 	unsigned long last_index;
 	struct page *page;
 	struct file_ra_state *ra;
+	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
 	int nr = 0;
 	int ret = 0;
 
@@ -2955,7 +2956,7 @@ static int relocate_file_extent_cluster(struct inode *inode,
 						  ra, NULL, index,
 						  last_index + 1 - index);
 			page = find_or_create_page(inode->i_mapping, index,
-						   GFP_NOFS);
+						   mask);
 			if (!page) {
 				btrfs_delalloc_release_metadata(inode,
 							PAGE_CACHE_SIZE);
-- 
cgit v1.2.3


From a8c9e5769718d47e87cce40c9b84cab421804797 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 21 Sep 2011 16:55:59 -0400
Subject: Btrfs: fix orphan cleanup regression

In fixing how we deal with bad inodes, we had a regression in the orphan cleanup
code, since it expects to get a bad inode back.  So fix it to deal with getting
-ESTALE back by deleting the orphan item manually and moving on.  Thanks,

Reported-by: Simon Kirby <sim@hostway.ca>
Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/inode.c | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 88e3956be57d..8005be176252 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2285,37 +2285,35 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		found_key.type = BTRFS_INODE_ITEM_KEY;
 		found_key.offset = 0;
 		inode = btrfs_iget(root->fs_info->sb, &found_key, root, NULL);
-		if (IS_ERR(inode)) {
-			ret = PTR_ERR(inode);
+		ret = PTR_RET(inode);
+		if (ret && ret != -ESTALE)
 			goto out;
-		}
-
-		/*
-		 * add this inode to the orphan list so btrfs_orphan_del does
-		 * the proper thing when we hit it
-		 */
-		spin_lock(&root->orphan_lock);
-		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
-		spin_unlock(&root->orphan_lock);
 
 		/*
-		 * if this is a bad inode, means we actually succeeded in
-		 * removing the inode, but not the orphan record, which means
-		 * we need to manually delete the orphan since iput will just
-		 * do a destroy_inode
+		 * Inode is already gone but the orphan item is still there,
+		 * kill the orphan item.
 		 */
-		if (is_bad_inode(inode)) {
-			trans = btrfs_start_transaction(root, 0);
+		if (ret == -ESTALE) {
+			trans = btrfs_start_transaction(root, 1);
 			if (IS_ERR(trans)) {
 				ret = PTR_ERR(trans);
 				goto out;
 			}
-			btrfs_orphan_del(trans, inode);
+			ret = btrfs_del_orphan_item(trans, root,
+						    found_key.objectid);
+			BUG_ON(ret);
 			btrfs_end_transaction(trans, root);
-			iput(inode);
 			continue;
 		}
 
+		/*
+		 * add this inode to the orphan list so btrfs_orphan_del does
+		 * the proper thing when we hit it
+		 */
+		spin_lock(&root->orphan_lock);
+		list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list);
+		spin_unlock(&root->orphan_lock);
+
 		/* if we have links, this was a truncate, lets do that */
 		if (inode->i_nlink) {
 			if (!S_ISREG(inode->i_mode)) {
-- 
cgit v1.2.3


From ef3be45722317f8c2fb0e861065df0c3830ff9ac Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 22 Sep 2011 14:30:02 -0400
Subject: Btrfs: check unused against how much space we actually want

There is a bug that may lead to early ENOSPC in our reservation code.  We've
been checking against num_bytes which may be above and beyond what we want to
actually reserve, which could give us a false ENOSPC.  Fix this by making sure
the unused space is above how much we want to reserve and not how much we're
trying to flush.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5498bdacd4c3..fd65f6bc676c 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3472,7 +3472,7 @@ again:
 	 */
 	if (unused <= space_info->total_bytes) {
 		unused = space_info->total_bytes - unused;
-		if (unused >= num_bytes) {
+		if (unused >= orig_bytes) {
 			space_info->bytes_may_use += orig_bytes;
 			ret = 0;
 		} else {
-- 
cgit v1.2.3


From 462d6fac8960a3ba797927adfcbd29d503eb16fd Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 26 Sep 2011 13:56:12 -0400
Subject: Btrfs: introduce convert_extent_bit

If I have a range where I know a certain bit is and I want to set it to another
bit the only option I have is to call set and then clear bit, which will result
in 2 tree searches.  This is inefficient, so introduce convert_extent_bit which
will go through and set the bit I want and clear the old bit I don't want.
Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent_io.c | 188 +++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/btrfs/extent_io.h |   2 +
 2 files changed, 190 insertions(+)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7d5e55632809..0ada0b700b44 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -894,6 +894,194 @@ search_again:
 	goto again;
 }
 
+/**
+ * convert_extent - convert all bits in a given range from one bit to another
+ * @tree:	the io tree to search
+ * @start:	the start offset in bytes
+ * @end:	the end offset in bytes (inclusive)
+ * @bits:	the bits to set in this range
+ * @clear_bits:	the bits to clear in this range
+ * @mask:	the allocation mask
+ *
+ * This will go through and set bits for the given range.  If any states exist
+ * already in this range they are set with the given bit and cleared of the
+ * clear_bits.  This is only meant to be used by things that are mergeable, ie
+ * converting from say DELALLOC to DIRTY.  This is not meant to be used with
+ * boundary bits like LOCK.
+ */
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		       int bits, int clear_bits, gfp_t mask)
+{
+	struct extent_state *state;
+	struct extent_state *prealloc = NULL;
+	struct rb_node *node;
+	int err = 0;
+	u64 last_start;
+	u64 last_end;
+
+again:
+	if (!prealloc && (mask & __GFP_WAIT)) {
+		prealloc = alloc_extent_state(mask);
+		if (!prealloc)
+			return -ENOMEM;
+	}
+
+	spin_lock(&tree->lock);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(tree, start);
+	if (!node) {
+		prealloc = alloc_extent_state_atomic(prealloc);
+		if (!prealloc)
+			return -ENOMEM;
+		err = insert_state(tree, prealloc, start, end, &bits);
+		prealloc = NULL;
+		BUG_ON(err == -EEXIST);
+		goto out;
+	}
+	state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
+	last_start = state->start;
+	last_end = state->end;
+
+	/*
+	 * | ---- desired range ---- |
+	 * | state |
+	 *
+	 * Just lock what we found and keep going
+	 */
+	if (state->start == start && state->end <= end) {
+		struct rb_node *next_node;
+
+		set_state_bits(tree, state, &bits);
+		clear_state_bit(tree, state, &clear_bits, 0);
+
+		merge_state(tree, state);
+		if (last_end == (u64)-1)
+			goto out;
+
+		start = last_end + 1;
+		next_node = rb_next(&state->rb_node);
+		if (next_node && start < end && prealloc && !need_resched()) {
+			state = rb_entry(next_node, struct extent_state,
+					 rb_node);
+			if (state->start == start)
+				goto hit_next;
+		}
+		goto search_again;
+	}
+
+	/*
+	 *     | ---- desired range ---- |
+	 * | state |
+	 *   or
+	 * | ------------- state -------------- |
+	 *
+	 * We need to split the extent we found, and may flip bits on
+	 * second half.
+	 *
+	 * If the extent we found extends past our
+	 * range, we just split and search again.  It'll get split
+	 * again the next time though.
+	 *
+	 * If the extent we found is inside our range, we set the
+	 * desired bit on it.
+	 */
+	if (state->start < start) {
+		prealloc = alloc_extent_state_atomic(prealloc);
+		if (!prealloc)
+			return -ENOMEM;
+		err = split_state(tree, state, prealloc, start);
+		BUG_ON(err == -EEXIST);
+		prealloc = NULL;
+		if (err)
+			goto out;
+		if (state->end <= end) {
+			set_state_bits(tree, state, &bits);
+			clear_state_bit(tree, state, &clear_bits, 0);
+			merge_state(tree, state);
+			if (last_end == (u64)-1)
+				goto out;
+			start = last_end + 1;
+		}
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *     | state | or               | state |
+	 *
+	 * There's a hole, we need to insert something in it and
+	 * ignore the extent we found.
+	 */
+	if (state->start > start) {
+		u64 this_end;
+		if (end < last_start)
+			this_end = end;
+		else
+			this_end = last_start - 1;
+
+		prealloc = alloc_extent_state_atomic(prealloc);
+		if (!prealloc)
+			return -ENOMEM;
+
+		/*
+		 * Avoid to free 'prealloc' if it can be merged with
+		 * the later extent.
+		 */
+		err = insert_state(tree, prealloc, start, this_end,
+				   &bits);
+		BUG_ON(err == -EEXIST);
+		if (err) {
+			free_extent_state(prealloc);
+			prealloc = NULL;
+			goto out;
+		}
+		prealloc = NULL;
+		start = this_end + 1;
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *                        | state |
+	 * We need to split the extent, and set the bit
+	 * on the first half
+	 */
+	if (state->start <= end && state->end > end) {
+		prealloc = alloc_extent_state_atomic(prealloc);
+		if (!prealloc)
+			return -ENOMEM;
+
+		err = split_state(tree, state, prealloc, end + 1);
+		BUG_ON(err == -EEXIST);
+
+		set_state_bits(tree, prealloc, &bits);
+		clear_state_bit(tree, prealloc, &clear_bits, 0);
+
+		merge_state(tree, prealloc);
+		prealloc = NULL;
+		goto out;
+	}
+
+	goto search_again;
+
+out:
+	spin_unlock(&tree->lock);
+	if (prealloc)
+		free_extent_state(prealloc);
+
+	return err;
+
+search_again:
+	if (start > end)
+		goto out;
+	spin_unlock(&tree->lock);
+	if (mask & __GFP_WAIT)
+		cond_resched();
+	goto again;
+}
+
 /* wrappers around set/clear extent bit */
 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 7b2f0c3e7929..cea445dcd806 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -214,6 +214,8 @@ int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask);
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		       int bits, int clear_bits, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 			struct extent_state **cached_state, gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
-- 
cgit v1.2.3


From 1728366efa5ebf48bd2ed544afa8700cd07ba822 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 26 Sep 2011 13:58:47 -0400
Subject: Btrfs: stop using write_one_page

While looking for a performance regression a user was complaining about, I
noticed that we had a regression with the varmail test of filebench.  This was
introduced by

0d10ee2e6deb5c8409ae65b970846344897d5e4e

which keeps us from calling writepages in writepage.  This is a correct change,
however it happens to help the varmail test because we write out in larger
chunks.  This is largly to do with how we write out dirty pages for each
transaction.  If you run filebench with

load varmail
set $dir=/mnt/btrfs-test
run 60

prior to this patch you would get ~1420 ops/second, but with the patch you get
~1200 ops/second.  This is a 16% decrease.  So since we know the range of dirty
pages we want to write out, don't write out in one page chunks, write out in
ranges.  So to do this we call filemap_fdatawrite_range() on the range of bytes.
Then we convert the DIRTY extents to NEED_WAIT extents.  When we then call
btrfs_wait_marked_extents() we only have to filemap_fdatawait_range() on that
range and clear the NEED_WAIT extents.  This doesn't get us back to our original
speeds, but I've been seeing ~1380 ops/second, which is a <5% regression as
opposed to a >15% regression.  That is acceptable given that the original commit
greatly reduces our latency to begin with.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent_io.h   |  1 +
 fs/btrfs/transaction.c | 86 +++++++++++---------------------------------------
 2 files changed, 20 insertions(+), 67 deletions(-)

diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index cea445dcd806..325a346369da 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -17,6 +17,7 @@
 #define EXTENT_NODATASUM (1 << 10)
 #define EXTENT_DO_ACCOUNTING (1 << 11)
 #define EXTENT_FIRST_DELALLOC (1 << 12)
+#define EXTENT_NEED_WAIT (1 << 13)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 #define EXTENT_CTLBITS (EXTENT_DO_ACCOUNTING | EXTENT_FIRST_DELALLOC)
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 7debbf396ef3..45655793a2c5 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -572,50 +572,21 @@ int btrfs_end_transaction_dmeta(struct btrfs_trans_handle *trans,
 int btrfs_write_marked_extents(struct btrfs_root *root,
 			       struct extent_io_tree *dirty_pages, int mark)
 {
-	int ret;
 	int err = 0;
 	int werr = 0;
-	struct page *page;
-	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
 	u64 start = 0;
 	u64 end;
-	unsigned long index;
 
-	while (1) {
-		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
-					    mark);
-		if (ret)
-			break;
-		while (start <= end) {
-			cond_resched();
-
-			index = start >> PAGE_CACHE_SHIFT;
-			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
-			page = find_get_page(btree_inode->i_mapping, index);
-			if (!page)
-				continue;
-
-			btree_lock_page_hook(page);
-			if (!page->mapping) {
-				unlock_page(page);
-				page_cache_release(page);
-				continue;
-			}
-
-			if (PageWriteback(page)) {
-				if (PageDirty(page))
-					wait_on_page_writeback(page);
-				else {
-					unlock_page(page);
-					page_cache_release(page);
-					continue;
-				}
-			}
-			err = write_one_page(page, 0);
-			if (err)
-				werr = err;
-			page_cache_release(page);
-		}
+	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
+				      mark)) {
+		convert_extent_bit(dirty_pages, start, end, EXTENT_NEED_WAIT, mark,
+				   GFP_NOFS);
+		err = filemap_fdatawrite_range(mapping, start, end);
+		if (err)
+			werr = err;
+		cond_resched();
+		start = end + 1;
 	}
 	if (err)
 		werr = err;
@@ -631,39 +602,20 @@ int btrfs_write_marked_extents(struct btrfs_root *root,
 int btrfs_wait_marked_extents(struct btrfs_root *root,
 			      struct extent_io_tree *dirty_pages, int mark)
 {
-	int ret;
 	int err = 0;
 	int werr = 0;
-	struct page *page;
-	struct inode *btree_inode = root->fs_info->btree_inode;
+	struct address_space *mapping = root->fs_info->btree_inode->i_mapping;
 	u64 start = 0;
 	u64 end;
-	unsigned long index;
 
-	while (1) {
-		ret = find_first_extent_bit(dirty_pages, start, &start, &end,
-					    mark);
-		if (ret)
-			break;
-
-		clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS);
-		while (start <= end) {
-			index = start >> PAGE_CACHE_SHIFT;
-			start = (u64)(index + 1) << PAGE_CACHE_SHIFT;
-			page = find_get_page(btree_inode->i_mapping, index);
-			if (!page)
-				continue;
-			if (PageDirty(page)) {
-				btree_lock_page_hook(page);
-				wait_on_page_writeback(page);
-				err = write_one_page(page, 0);
-				if (err)
-					werr = err;
-			}
-			wait_on_page_writeback(page);
-			page_cache_release(page);
-			cond_resched();
-		}
+	while (!find_first_extent_bit(dirty_pages, start, &start, &end,
+				      EXTENT_NEED_WAIT)) {
+		clear_extent_bits(dirty_pages, start, end, EXTENT_NEED_WAIT, GFP_NOFS);
+		err = filemap_fdatawait_range(mapping, start, end);
+		if (err)
+			werr = err;
+		cond_resched();
+		start = end + 1;
 	}
 	if (err)
 		werr = err;
-- 
cgit v1.2.3


From 726c35fa0edf1d9b8a88b73255532e73089aedda Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 26 Sep 2011 15:46:06 -0400
Subject: Btrfs: use the global reserve as a backup for deleting inodes

Xfstests 83 really stresses our ENOSPC since it uses a 100mb fs which ends up
with the mixed block group stuff.  Because of this we can run into a situation
where we don't have enough space to delete inodes, or even worse we can't free
the inodes when we next mount the fs which causes the orphan code to lose its
mind.  So if we fail to make our reservation, steal from the global reserve.
The global reserve will end up taking up the entire rest of the free space on
the fs in this worst case so there really is no other option.  With this patch
test 83 doesn't freak out.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/inode.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8005be176252..96fc9e342219 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3526,7 +3526,7 @@ void btrfs_evict_inode(struct inode *inode)
 {
 	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_block_rsv *rsv;
+	struct btrfs_block_rsv *rsv, *global_rsv;
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 	unsigned long nr;
 	int ret;
@@ -3561,6 +3561,7 @@ void btrfs_evict_inode(struct inode *inode)
 		goto no_delete;
 	}
 	rsv->size = min_size;
+	global_rsv = &root->fs_info->global_block_rsv;
 
 	btrfs_i_size_write(inode, 0);
 
@@ -3577,6 +3578,15 @@ void btrfs_evict_inode(struct inode *inode)
 	 */
 	while (1) {
 		ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1);
+
+		/*
+		 * Try and steal from the global reserve since we will
+		 * likely not use this space anyway, we want to try as
+		 * hard as possible to get this to work.
+		 */
+		if (ret)
+			ret = btrfs_block_rsv_migrate(global_rsv, rsv, min_size);
+
 		if (ret) {
 			printk(KERN_WARNING "Could not get space for a "
 			       "delete, will truncate on mount %d\n", ret);
-- 
cgit v1.2.3


From 8f6d7f4f45f18a5b669dbbf068c74b3d5be59dbf Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 26 Sep 2011 15:55:20 -0400
Subject: Btrfs: break out of orphan cleanup if we can't make progress

I noticed while running xfstests 83 that if we didn't have enough space to
delete our inode the orphan cleanup would just loop.  This is because it keeps
finding the same orphan item and keeps trying to kill it but can't because we
don't get an error back from iput for deleting the inode.  So keep track of the
last guy we tried to kill, if it's the same as the one we're trying to kill
currently we know we are having problems and can just error out.  I don't have a
way to test this so look hard and make sure it's right.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/inode.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 96fc9e342219..15adfb542502 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2230,6 +2230,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 	struct btrfs_key key, found_key;
 	struct btrfs_trans_handle *trans;
 	struct inode *inode;
+	u64 last_objectid = 0;
 	int ret = 0, nr_unlink = 0, nr_truncate = 0;
 
 	if (cmpxchg(&root->orphan_cleanup_state, 0, ORPHAN_CLEANUP_STARTED))
@@ -2281,6 +2282,16 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		 * crossing root thing.  we store the inode number in the
 		 * offset of the orphan item.
 		 */
+
+		if (found_key.offset == last_objectid) {
+			printk(KERN_ERR "btrfs: Error removing orphan entry, "
+			       "stopping orphan cleanup\n");
+			ret = -EINVAL;
+			goto out;
+		}
+
+		last_objectid = found_key.offset;
+
 		found_key.objectid = found_key.offset;
 		found_key.type = BTRFS_INODE_ITEM_KEY;
 		found_key.offset = 0;
-- 
cgit v1.2.3


From 2bf64758fd6290797a5ce97d4b9c698a4ed1cbad Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 26 Sep 2011 17:12:22 -0400
Subject: Btrfs: allow us to overcommit our enospc reservations

One of the things that kills us is the fact that our ENOSPC reservations are
horribly over the top in most normal cases.  There isn't too much that can be
done about this because when we are completely full we really need them to work
like this so we don't under reserve.  However if there is plenty of unallocated
chunks on the disk we can use that to gauge how much we can overcommit.  So this
patch adds chunk free space accounting so we always know how much unallocated
space we have.  Then if we fail to make a reservation within our allocated
space, check to see if we can overcommit.  In the normal flushing case (like
with delalloc metadata reservations) we'll take the free space and divide it by
2 if our metadata profile is setup for DUP or any of those, and then divide it
by 8 to make sure we don't overcommit too much.  Then if we're in a non-flushing
case (we really need this reservation now!) we only limit ourselves to half of
the free space.  This makes this fio test

[torrent]
filename=torrent-test
rw=randwrite
size=4g
ioengine=sync
directory=/mnt/btrfs-test

go from taking around 45 minutes to 10 seconds on my freshly formatted 3 TiB
file system.  This doesn't seem to break my other enospc tests, but could really
use some more testing as this is a super scary change.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h       |  4 ++++
 fs/btrfs/disk-io.c     |  2 ++
 fs/btrfs/extent-tree.c | 61 ++++++++++++++++++++++++++++++++++++++------------
 fs/btrfs/volumes.c     | 39 ++++++++++++++++++++++++++++----
 4 files changed, 88 insertions(+), 18 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 47dea7118e0e..1eafccb162ee 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -893,6 +893,10 @@ struct btrfs_fs_info {
 	spinlock_t block_group_cache_lock;
 	struct rb_root block_group_cache_tree;
 
+	/* keep track of unallocated space */
+	spinlock_t free_chunk_lock;
+	u64 free_chunk_space;
+
 	struct extent_io_tree freed_extents[2];
 	struct extent_io_tree *pinned_extents;
 
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 4965a0179b31..51372a521167 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1648,6 +1648,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	spin_lock_init(&fs_info->fs_roots_radix_lock);
 	spin_lock_init(&fs_info->delayed_iput_lock);
 	spin_lock_init(&fs_info->defrag_inodes_lock);
+	spin_lock_init(&fs_info->free_chunk_lock);
 	mutex_init(&fs_info->reloc_mutex);
 
 	init_completion(&fs_info->kobj_unregister);
@@ -1675,6 +1676,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	fs_info->metadata_ratio = 0;
 	fs_info->defrag_inodes = RB_ROOT;
 	fs_info->trans_no_join = 0;
+	fs_info->free_chunk_space = 0;
 
 	fs_info->thread_pool_size = min_t(unsigned long,
 					  num_online_cpus() + 2, 8);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fd65f6bc676c..25b69d0f9135 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3410,6 +3410,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
  * @block_rsv - the block_rsv we're allocating for
  * @orig_bytes - the number of bytes we want
  * @flush - wether or not we can flush to make our reservation
+ * @check - wether this is just to check if we have enough space or not
  *
  * This will reserve orgi_bytes number of bytes from the space info associated
  * with the block_rsv.  If there is not enough space it will make an attempt to
@@ -3420,11 +3421,11 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
  */
 static int reserve_metadata_bytes(struct btrfs_root *root,
 				  struct btrfs_block_rsv *block_rsv,
-				  u64 orig_bytes, int flush)
+				  u64 orig_bytes, int flush, int check)
 {
 	struct btrfs_space_info *space_info = block_rsv->space_info;
 	struct btrfs_trans_handle *trans;
-	u64 unused;
+	u64 used;
 	u64 num_bytes = orig_bytes;
 	int retries = 0;
 	int ret = 0;
@@ -3459,9 +3460,9 @@ again:
 	}
 
 	ret = -ENOSPC;
-	unused = space_info->bytes_used + space_info->bytes_reserved +
-		 space_info->bytes_pinned + space_info->bytes_readonly +
-		 space_info->bytes_may_use;
+	used = space_info->bytes_used + space_info->bytes_reserved +
+		space_info->bytes_pinned + space_info->bytes_readonly +
+		space_info->bytes_may_use;
 
 	/*
 	 * The idea here is that we've not already over-reserved the block group
@@ -3470,9 +3471,8 @@ again:
 	 * lets start flushing stuff first and then come back and try to make
 	 * our reservation.
 	 */
-	if (unused <= space_info->total_bytes) {
-		unused = space_info->total_bytes - unused;
-		if (unused >= orig_bytes) {
+	if (used <= space_info->total_bytes) {
+		if (used + orig_bytes <= space_info->total_bytes) {
 			space_info->bytes_may_use += orig_bytes;
 			ret = 0;
 		} else {
@@ -3489,10 +3489,43 @@ again:
 		 * amount plus the amount of bytes that we need for this
 		 * reservation.
 		 */
-		num_bytes = unused - space_info->total_bytes +
+		num_bytes = used - space_info->total_bytes +
 			(orig_bytes * (retries + 1));
 	}
 
+	if (ret && !check) {
+		u64 profile = btrfs_get_alloc_profile(root, 0);
+		u64 avail;
+
+		spin_lock(&root->fs_info->free_chunk_lock);
+		avail = root->fs_info->free_chunk_space;
+
+		/*
+		 * If we have dup, raid1 or raid10 then only half of the free
+		 * space is actually useable.
+		 */
+		if (profile & (BTRFS_BLOCK_GROUP_DUP |
+			       BTRFS_BLOCK_GROUP_RAID1 |
+			       BTRFS_BLOCK_GROUP_RAID10))
+			avail >>= 1;
+
+		/*
+		 * If we aren't flushing don't let us overcommit too much, say
+		 * 1/8th of the space.  If we can flush, let it overcommit up to
+		 * 1/2 of the space.
+		 */
+		if (flush)
+			avail >>= 3;
+		else
+			avail >>= 1;
+		 spin_unlock(&root->fs_info->free_chunk_lock);
+
+		if (used + orig_bytes < space_info->total_bytes + avail) {
+			space_info->bytes_may_use += orig_bytes;
+			ret = 0;
+		}
+	}
+
 	/*
 	 * Couldn't make our reservation, save our place so while we're trying
 	 * to reclaim space we can actually use it instead of somebody else
@@ -3703,7 +3736,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
 	if (num_bytes == 0)
 		return 0;
 
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1, 0);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
 		return 0;
@@ -3737,7 +3770,7 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
 	if (!ret)
 		return 0;
 
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush, !flush);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
 		return 0;
@@ -4037,7 +4070,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
 	spin_unlock(&BTRFS_I(inode)->lock);
 
-	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
+	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush, 0);
 	if (ret) {
 		u64 to_free = 0;
 		unsigned dropped;
@@ -5692,7 +5725,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	block_rsv = get_block_rsv(trans, root);
 
 	if (block_rsv->size == 0) {
-		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0);
 		/*
 		 * If we couldn't reserve metadata bytes try and use some from
 		 * the global reserve.
@@ -5713,7 +5746,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 		return block_rsv;
 	if (ret) {
 		WARN_ON(1);
-		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
+		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0);
 		if (!ret) {
 			return block_rsv;
 		} else if (ret && block_rsv != global_rsv) {
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index f2a4cc79da61..e138af710de2 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1013,8 +1013,13 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
 	}
 	BUG_ON(ret);
 
-	if (device->bytes_used > 0)
-		device->bytes_used -= btrfs_dev_extent_length(leaf, extent);
+	if (device->bytes_used > 0) {
+		u64 len = btrfs_dev_extent_length(leaf, extent);
+		device->bytes_used -= len;
+		spin_lock(&root->fs_info->free_chunk_lock);
+		root->fs_info->free_chunk_space += len;
+		spin_unlock(&root->fs_info->free_chunk_lock);
+	}
 	ret = btrfs_del_item(trans, root, path);
 
 out:
@@ -1356,6 +1361,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	if (ret)
 		goto error_undo;
 
+	spin_lock(&root->fs_info->free_chunk_lock);
+	root->fs_info->free_chunk_space = device->total_bytes -
+		device->bytes_used;
+	spin_unlock(&root->fs_info->free_chunk_lock);
+
 	device->in_fs_metadata = 0;
 	btrfs_scrub_cancel_dev(root, device);
 
@@ -1691,6 +1701,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 		root->fs_info->fs_devices->num_can_discard++;
 	root->fs_info->fs_devices->total_rw_bytes += device->total_bytes;
 
+	spin_lock(&root->fs_info->free_chunk_lock);
+	root->fs_info->free_chunk_space += device->total_bytes;
+	spin_unlock(&root->fs_info->free_chunk_lock);
+
 	if (!blk_queue_nonrot(bdev_get_queue(bdev)))
 		root->fs_info->fs_devices->rotating = 1;
 
@@ -2192,8 +2206,12 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	lock_chunks(root);
 
 	device->total_bytes = new_size;
-	if (device->writeable)
+	if (device->writeable) {
 		device->fs_devices->total_rw_bytes -= diff;
+		spin_lock(&root->fs_info->free_chunk_lock);
+		root->fs_info->free_chunk_space -= diff;
+		spin_unlock(&root->fs_info->free_chunk_lock);
+	}
 	unlock_chunks(root);
 
 again:
@@ -2257,6 +2275,9 @@ again:
 		device->total_bytes = old_size;
 		if (device->writeable)
 			device->fs_devices->total_rw_bytes += diff;
+		spin_lock(&root->fs_info->free_chunk_lock);
+		root->fs_info->free_chunk_space += diff;
+		spin_unlock(&root->fs_info->free_chunk_lock);
 		unlock_chunks(root);
 		goto done;
 	}
@@ -2615,6 +2636,11 @@ static int __finish_chunk_alloc(struct btrfs_trans_handle *trans,
 		index++;
 	}
 
+	spin_lock(&extent_root->fs_info->free_chunk_lock);
+	extent_root->fs_info->free_chunk_space -= (stripe_size *
+						   map->num_stripes);
+	spin_unlock(&extent_root->fs_info->free_chunk_lock);
+
 	index = 0;
 	stripe = &chunk->stripe;
 	while (index < map->num_stripes) {
@@ -3616,8 +3642,13 @@ static int read_one_dev(struct btrfs_root *root,
 	fill_device_from_item(leaf, dev_item, device);
 	device->dev_root = root->fs_info->dev_root;
 	device->in_fs_metadata = 1;
-	if (device->writeable)
+	if (device->writeable) {
 		device->fs_devices->total_rw_bytes += device->total_bytes;
+		spin_lock(&root->fs_info->free_chunk_lock);
+		root->fs_info->free_chunk_space += device->total_bytes -
+			device->bytes_used;
+		spin_unlock(&root->fs_info->free_chunk_lock);
+	}
 	ret = 0;
 	return ret;
 }
-- 
cgit v1.2.3


From e27425d614d68daa08f60735982a7c3a0230e855 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 27 Sep 2011 11:01:30 -0400
Subject: Btrfs: only inherit btrfs specific flags when creating files

Xfstests 79 was failing because we were inheriting the S_APPEND flag when we
weren't supposed to.  There isn't any specific documentation on this so I'm
taking the test as the standard of how things work, and having S_APPEND set on a
directory doesn't mean that S_APPEND gets inherited by its children according to
this test.  So only inherit btrfs specific things.  This will let us set
compress/nocompress on specific directories and everything in the directories
will inherit this flag, same with nodatacow.  With this patch test 79 passes.
Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ioctl.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 24fd75bb0f96..d2b53eb8a8c2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -117,7 +117,7 @@ void btrfs_update_iflags(struct inode *inode)
 /*
  * Inherit flags from the parent inode.
  *
- * Unlike extN we don't have any flags we don't want to inherit currently.
+ * Currently only the compression flags and the cow flags are inherited.
  */
 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
 {
@@ -128,12 +128,17 @@ void btrfs_inherit_iflags(struct inode *inode, struct inode *dir)
 
 	flags = BTRFS_I(dir)->flags;
 
-	if (S_ISREG(inode->i_mode))
-		flags &= ~BTRFS_INODE_DIRSYNC;
-	else if (!S_ISDIR(inode->i_mode))
-		flags &= (BTRFS_INODE_NODUMP | BTRFS_INODE_NOATIME);
+	if (flags & BTRFS_INODE_NOCOMPRESS) {
+		BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS;
+		BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS;
+	} else if (flags & BTRFS_INODE_COMPRESS) {
+		BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS;
+		BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS;
+	}
+
+	if (flags & BTRFS_INODE_NODATACOW)
+		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW;
 
-	BTRFS_I(inode)->flags = flags;
 	btrfs_update_iflags(inode);
 }
 
-- 
cgit v1.2.3


From 73bc187680f94bed498f8a669103cad290e41180 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Mon, 3 Oct 2011 14:07:49 -0400
Subject: Btrfs: introduce mount option no_space_cache

Some users have requested this and I've found I needed a way to disable cache
loading without actually clearing the cache, so introduce the no_space_cache
option.  Before we check the super blocks cache generation field and if it was
populated we always turned space caching on.  Now we check this and set the
space cache option on, and then parse the mount options so that if we want it
off it get's turned off.  Then we check the mount option all the places we do
the caching work instead of checking the super's cache generation.  This makes
things more consistent and lets us turn space caching off.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c |  9 ++++-----
 fs/btrfs/super.c       | 21 +++++++++++++++++----
 fs/btrfs/transaction.c |  2 +-
 3 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 25b69d0f9135..f9711a82fc54 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -481,7 +481,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 	 * we likely hold important locks.
 	 */
 	if (trans && (!trans->transaction->in_commit) &&
-	    (root && root != root->fs_info->tree_root)) {
+	    (root && root != root->fs_info->tree_root) &&
+	    btrfs_test_opt(root, SPACE_CACHE)) {
 		spin_lock(&cache->lock);
 		if (cache->cached != BTRFS_CACHE_NO) {
 			spin_unlock(&cache->lock);
@@ -4223,7 +4224,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 		spin_lock(&cache->space_info->lock);
 		spin_lock(&cache->lock);
 
-		if (btrfs_super_cache_generation(&info->super_copy) != 0 &&
+		if (btrfs_test_opt(root, SPACE_CACHE) &&
 		    cache->disk_cache_state < BTRFS_DC_CLEAR)
 			cache->disk_cache_state = BTRFS_DC_CLEAR;
 
@@ -7038,13 +7039,11 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	path->reada = 1;
 
 	cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
-	if (cache_gen != 0 &&
+	if (btrfs_test_opt(root, SPACE_CACHE) &&
 	    btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
 		need_clear = 1;
 	if (btrfs_test_opt(root, CLEAR_CACHE))
 		need_clear = 1;
-	if (!btrfs_test_opt(root, SPACE_CACHE) && cache_gen)
-		printk(KERN_INFO "btrfs: disk space caching is enabled\n");
 
 	while (1) {
 		ret = find_first_block_group(root, path, &key);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 934789f7fd33..266d1f35465d 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -164,7 +164,7 @@ enum {
 	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
 	Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
 	Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
-	Opt_inode_cache, Opt_err,
+	Opt_inode_cache, Opt_no_space_cache, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -197,6 +197,7 @@ static match_table_t tokens = {
 	{Opt_subvolrootid, "subvolrootid=%d"},
 	{Opt_defrag, "autodefrag"},
 	{Opt_inode_cache, "inode_cache"},
+	{Opt_no_space_cache, "no_space_cache"},
 	{Opt_err, NULL},
 };
 
@@ -208,14 +209,19 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 {
 	struct btrfs_fs_info *info = root->fs_info;
 	substring_t args[MAX_OPT_ARGS];
-	char *p, *num, *orig;
+	char *p, *num, *orig = NULL;
+	u64 cache_gen;
 	int intarg;
 	int ret = 0;
 	char *compress_type;
 	bool compress_force = false;
 
+	cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
+	if (cache_gen)
+		btrfs_set_opt(info->mount_opt, SPACE_CACHE);
+
 	if (!options)
-		return 0;
+		goto out;
 
 	/*
 	 * strsep changes the string, duplicate it because parse_options
@@ -362,9 +368,12 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			btrfs_set_opt(info->mount_opt, DISCARD);
 			break;
 		case Opt_space_cache:
-			printk(KERN_INFO "btrfs: enabling disk space caching\n");
 			btrfs_set_opt(info->mount_opt, SPACE_CACHE);
 			break;
+		case Opt_no_space_cache:
+			printk(KERN_INFO "btrfs: disabling disk space caching\n");
+			btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
+			break;
 		case Opt_inode_cache:
 			printk(KERN_INFO "btrfs: enabling inode map caching\n");
 			btrfs_set_opt(info->mount_opt, INODE_MAP_CACHE);
@@ -393,6 +402,8 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		}
 	}
 out:
+	if (!ret && btrfs_test_opt(root, SPACE_CACHE))
+		printk(KERN_INFO "btrfs: disk space caching is enabled\n");
 	kfree(orig);
 	return ret;
 }
@@ -687,6 +698,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs)
 		seq_puts(seq, ",noacl");
 	if (btrfs_test_opt(root, SPACE_CACHE))
 		seq_puts(seq, ",space_cache");
+	else
+		seq_puts(seq, ",no_space_cache");
 	if (btrfs_test_opt(root, CLEAR_CACHE))
 		seq_puts(seq, ",clear_cache");
 	if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 45655793a2c5..1e1a4816ccb0 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -1003,7 +1003,7 @@ static void update_super_roots(struct btrfs_root *root)
 	super->root = root_item->bytenr;
 	super->generation = root_item->generation;
 	super->root_level = root_item->level;
-	if (super->cache_generation != 0 || btrfs_test_opt(root, SPACE_CACHE))
+	if (btrfs_test_opt(root, SPACE_CACHE))
 		super->cache_generation = root_item->generation;
 }
 
-- 
cgit v1.2.3


From f75b130e9bb361850787e156c79311adb84f551e Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 5 Oct 2011 10:00:18 -0400
Subject: Btrfs: don't skip writing out a empty block groups cache

I noticed a slight bug where we will not bother writing out the block group
cache's space cache if it's space tree is empty.  Since it could have a cluster
or pinned extents that need to be written out this is just not a valid test.
Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/free-space-cache.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 0a8ccdbdd464..b81556ca75ea 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -575,10 +575,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 
 	INIT_LIST_HEAD(&bitmap_list);
 
-	node = rb_first(&ctl->free_space_offset);
-	if (!node)
-		return -1;
-
 	if (!i_size_read(inode))
 		return -1;
 
@@ -639,6 +635,12 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	if (block_group)
 		start = block_group->key.objectid;
 
+	node = rb_first(&ctl->free_space_offset);
+	if (!node && cluster) {
+		node = rb_first(&cluster->root);
+		cluster = NULL;
+	}
+
 	/* Write out the extent entries */
 	do {
 		struct btrfs_free_space_entry *entry;
-- 
cgit v1.2.3


From a67509c30079f4c5025fb19ea443fb2906c3a85e Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 5 Oct 2011 15:18:58 -0400
Subject: Btrfs: add a io_ctl struct and helpers for dealing with the space
 cache

In writing and reading the space cache we have one big loop that keeps track of
which page we are on and then a bunch of sizeable loops underneath this big loop
to try and read/write out properly.  Especially in the write case this makes
things hugely complicated and hard to follow, and makes our error checking and
recovery equally as complex.  So add a io_ctl struct with a bunch of helpers to
keep track of the pages we have, where we are, if we have enough space etc.
This unifies how we deal with the pages we're writing and keeps all the messy
tracking internal.  This allows us to kill the big loops in both the read and
write case and makes reviewing and chaning the write and read paths much
simpler.  I've run xfstests and stress.sh on this code and it survives.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/free-space-cache.c | 693 ++++++++++++++++++++++++--------------------
 1 file changed, 375 insertions(+), 318 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index b81556ca75ea..35bfc13c9d42 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -241,27 +241,275 @@ static int readahead_cache(struct inode *inode)
 	return 0;
 }
 
+struct io_ctl {
+	void *cur, *orig;
+	struct page *page;
+	struct page **pages;
+	struct btrfs_root *root;
+	unsigned long size;
+	int index;
+	int num_pages;
+};
+
+static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
+		       struct btrfs_root *root)
+{
+	memset(io_ctl, 0, sizeof(struct io_ctl));
+	io_ctl->num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
+		PAGE_CACHE_SHIFT;
+	io_ctl->pages = kzalloc(sizeof(struct page *) * io_ctl->num_pages,
+				GFP_NOFS);
+	if (!io_ctl->pages)
+		return -ENOMEM;
+	io_ctl->root = root;
+	return 0;
+}
+
+static void io_ctl_free(struct io_ctl *io_ctl)
+{
+	kfree(io_ctl->pages);
+}
+
+static void io_ctl_unmap_page(struct io_ctl *io_ctl)
+{
+	if (io_ctl->cur) {
+		kunmap(io_ctl->page);
+		io_ctl->cur = NULL;
+		io_ctl->orig = NULL;
+	}
+}
+
+static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
+{
+	WARN_ON(io_ctl->cur);
+	BUG_ON(io_ctl->index >= io_ctl->num_pages);
+	io_ctl->page = io_ctl->pages[io_ctl->index++];
+	io_ctl->cur = kmap(io_ctl->page);
+	io_ctl->orig = io_ctl->cur;
+	io_ctl->size = PAGE_CACHE_SIZE;
+	if (clear)
+		memset(io_ctl->cur, 0, PAGE_CACHE_SIZE);
+}
+
+static void io_ctl_drop_pages(struct io_ctl *io_ctl)
+{
+	int i;
+
+	io_ctl_unmap_page(io_ctl);
+
+	for (i = 0; i < io_ctl->num_pages; i++) {
+		ClearPageChecked(io_ctl->pages[i]);
+		unlock_page(io_ctl->pages[i]);
+		page_cache_release(io_ctl->pages[i]);
+	}
+}
+
+static int io_ctl_prepare_pages(struct io_ctl *io_ctl, struct inode *inode,
+				int uptodate)
+{
+	struct page *page;
+	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+	int i;
+
+	for (i = 0; i < io_ctl->num_pages; i++) {
+		page = find_or_create_page(inode->i_mapping, i, mask);
+		if (!page) {
+			io_ctl_drop_pages(io_ctl);
+			return -ENOMEM;
+		}
+		io_ctl->pages[i] = page;
+		if (uptodate && !PageUptodate(page)) {
+			btrfs_readpage(NULL, page);
+			lock_page(page);
+			if (!PageUptodate(page)) {
+				printk(KERN_ERR "btrfs: error reading free "
+				       "space cache\n");
+				io_ctl_drop_pages(io_ctl);
+				return -EIO;
+			}
+		}
+	}
+
+	return 0;
+}
+
+static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
+{
+	u64 *val;
+
+	io_ctl_map_page(io_ctl, 1);
+
+	/*
+	 * Skip the first 64bits to make sure theres a bogus crc for old
+	 * kernels
+	 */
+	io_ctl->cur += sizeof(u64);
+
+	val = io_ctl->cur;
+	*val = cpu_to_le64(generation);
+	io_ctl->cur += sizeof(u64);
+	io_ctl->size -= sizeof(u64) * 2;
+}
+
+static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
+{
+	u64 *gen;
+
+	io_ctl_map_page(io_ctl, 0);
+
+	/* Skip the bogus crc area */
+	io_ctl->cur += sizeof(u64);
+	gen = io_ctl->cur;
+	if (le64_to_cpu(*gen) != generation) {
+		printk_ratelimited(KERN_ERR "btrfs: space cache generation "
+				   "(%Lu) does not match inode (%Lu)\n", *gen,
+				   generation);
+		io_ctl_unmap_page(io_ctl);
+		return -EIO;
+	}
+	io_ctl->cur += sizeof(u64);
+	io_ctl->size -= sizeof(u64) * 2;
+	return 0;
+}
+
+static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
+			    void *bitmap)
+{
+	struct btrfs_free_space_entry *entry;
+
+	if (!io_ctl->cur)
+		return -ENOSPC;
+
+	entry = io_ctl->cur;
+	entry->offset = cpu_to_le64(offset);
+	entry->bytes = cpu_to_le64(bytes);
+	entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP :
+		BTRFS_FREE_SPACE_EXTENT;
+	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
+	io_ctl->size -= sizeof(struct btrfs_free_space_entry);
+
+	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
+		return 0;
+
+	/*
+	 * index == 1 means the current page is 0, we need to generate a bogus
+	 * crc for older kernels.
+	 */
+	if (io_ctl->index == 1) {
+		u32 *tmp;
+		u32 crc = ~(u32)0;
+
+		crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + sizeof(u64),
+				      crc, PAGE_CACHE_SIZE - sizeof(u64));
+		btrfs_csum_final(crc, (char *)&crc);
+		crc++;
+		tmp = io_ctl->orig;
+		*tmp = crc;
+	}
+	io_ctl_unmap_page(io_ctl);
+
+	/* No more pages to map */
+	if (io_ctl->index >= io_ctl->num_pages)
+		return 0;
+
+	/* map the next page */
+	io_ctl_map_page(io_ctl, 1);
+	return 0;
+}
+
+static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
+{
+	if (!io_ctl->cur)
+		return -ENOSPC;
+
+	/*
+	 * If we aren't at the start of the current page, unmap this one and
+	 * map the next one if there is any left.
+	 */
+	if (io_ctl->cur != io_ctl->orig) {
+		io_ctl_unmap_page(io_ctl);
+		if (io_ctl->index >= io_ctl->num_pages)
+			return -ENOSPC;
+		io_ctl_map_page(io_ctl, 0);
+	}
+
+	memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
+	io_ctl_unmap_page(io_ctl);
+	if (io_ctl->index < io_ctl->num_pages)
+		io_ctl_map_page(io_ctl, 0);
+	return 0;
+}
+
+static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
+{
+	io_ctl_unmap_page(io_ctl);
+
+	while (io_ctl->index < io_ctl->num_pages) {
+		io_ctl_map_page(io_ctl, 1);
+		io_ctl_unmap_page(io_ctl);
+	}
+}
+
+static u8 io_ctl_read_entry(struct io_ctl *io_ctl,
+			    struct btrfs_free_space *entry)
+{
+	struct btrfs_free_space_entry *e;
+	u8 type;
+
+	e = io_ctl->cur;
+	entry->offset = le64_to_cpu(e->offset);
+	entry->bytes = le64_to_cpu(e->bytes);
+	type = e->type;
+	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
+	io_ctl->size -= sizeof(struct btrfs_free_space_entry);
+
+	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
+		return type;
+
+	io_ctl_unmap_page(io_ctl);
+
+	if (io_ctl->index >= io_ctl->num_pages)
+		return type;
+
+	io_ctl_map_page(io_ctl, 0);
+	return type;
+}
+
+static void io_ctl_read_bitmap(struct io_ctl *io_ctl,
+			       struct btrfs_free_space *entry)
+{
+	BUG_ON(!io_ctl->cur);
+	if (io_ctl->cur != io_ctl->orig) {
+		io_ctl_unmap_page(io_ctl);
+		io_ctl_map_page(io_ctl, 0);
+	}
+	memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
+	io_ctl_unmap_page(io_ctl);
+	if (io_ctl->index < io_ctl->num_pages)
+		io_ctl_map_page(io_ctl, 0);
+}
+
 int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 			    struct btrfs_free_space_ctl *ctl,
 			    struct btrfs_path *path, u64 offset)
 {
 	struct btrfs_free_space_header *header;
 	struct extent_buffer *leaf;
-	struct page *page;
+	struct io_ctl io_ctl;
 	struct btrfs_key key;
+	struct btrfs_free_space *e, *n;
 	struct list_head bitmaps;
 	u64 num_entries;
 	u64 num_bitmaps;
 	u64 generation;
-	pgoff_t index = 0;
-	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
+	u8 type;
 	int ret = 0;
 
 	INIT_LIST_HEAD(&bitmaps);
 
 	/* Nothing in the space cache, goodbye */
 	if (!i_size_read(inode))
-		goto out;
+		return 0;
 
 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 	key.offset = offset;
@@ -269,11 +517,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
-		goto out;
+		return 0;
 	else if (ret > 0) {
 		btrfs_release_path(path);
-		ret = 0;
-		goto out;
+		return 0;
 	}
 
 	ret = -1;
@@ -291,170 +538,89 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 		       " not match free space cache generation (%llu)\n",
 		       (unsigned long long)BTRFS_I(inode)->generation,
 		       (unsigned long long)generation);
-		goto out;
+		return 0;
 	}
 
 	if (!num_entries)
-		goto out;
+		return 0;
 
+	io_ctl_init(&io_ctl, inode, root);
 	ret = readahead_cache(inode);
 	if (ret)
 		goto out;
 
-	while (1) {
-		struct btrfs_free_space_entry *entry;
-		struct btrfs_free_space *e;
-		void *addr;
-		unsigned long offset = 0;
-		int need_loop = 0;
+	ret = io_ctl_prepare_pages(&io_ctl, inode, 1);
+	if (ret)
+		goto out;
 
-		if (!num_entries && !num_bitmaps)
-			break;
+	ret = io_ctl_check_generation(&io_ctl, generation);
+	if (ret)
+		goto free_cache;
 
-		page = find_or_create_page(inode->i_mapping, index, mask);
-		if (!page)
+	while (num_entries) {
+		e = kmem_cache_zalloc(btrfs_free_space_cachep,
+				      GFP_NOFS);
+		if (!e)
 			goto free_cache;
 
-		if (!PageUptodate(page)) {
-			btrfs_readpage(NULL, page);
-			lock_page(page);
-			if (!PageUptodate(page)) {
-				unlock_page(page);
-				page_cache_release(page);
-				printk(KERN_ERR "btrfs: error reading free "
-				       "space cache\n");
-				goto free_cache;
-			}
+		type = io_ctl_read_entry(&io_ctl, e);
+		if (!e->bytes) {
+			kmem_cache_free(btrfs_free_space_cachep, e);
+			goto free_cache;
 		}
-		addr = kmap(page);
-
-		if (index == 0) {
-			u64 *gen;
 
-			/*
-			 * We put a bogus crc in the front of the first page in
-			 * case old kernels try to mount a fs with the new
-			 * format to make sure they discard the cache.
-			 */
-			addr += sizeof(u64);
-			offset += sizeof(u64);
-
-			gen = addr;
-			if (*gen != BTRFS_I(inode)->generation) {
-				printk_ratelimited(KERN_ERR "btrfs: space cache"
-					" generation (%llu) does not match "
-					"inode (%llu)\n",
-					(unsigned long long)*gen,
-					(unsigned long long)
-					BTRFS_I(inode)->generation);
-				kunmap(page);
-				unlock_page(page);
-				page_cache_release(page);
+		if (type == BTRFS_FREE_SPACE_EXTENT) {
+			spin_lock(&ctl->tree_lock);
+			ret = link_free_space(ctl, e);
+			spin_unlock(&ctl->tree_lock);
+			if (ret) {
+				printk(KERN_ERR "Duplicate entries in "
+				       "free space cache, dumping\n");
+				kmem_cache_free(btrfs_free_space_cachep, e);
 				goto free_cache;
 			}
-			addr += sizeof(u64);
-			offset += sizeof(u64);
-		}
-		entry = addr;
-
-		while (1) {
-			if (!num_entries)
-				break;
-
-			need_loop = 1;
-			e = kmem_cache_zalloc(btrfs_free_space_cachep,
-					      GFP_NOFS);
-			if (!e) {
-				kunmap(page);
-				unlock_page(page);
-				page_cache_release(page);
+		} else {
+			BUG_ON(!num_bitmaps);
+			num_bitmaps--;
+			e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+			if (!e->bitmap) {
+				kmem_cache_free(
+					btrfs_free_space_cachep, e);
 				goto free_cache;
 			}
-
-			e->offset = le64_to_cpu(entry->offset);
-			e->bytes = le64_to_cpu(entry->bytes);
-			if (!e->bytes) {
-				kunmap(page);
+			spin_lock(&ctl->tree_lock);
+			ret = link_free_space(ctl, e);
+			ctl->total_bitmaps++;
+			ctl->op->recalc_thresholds(ctl);
+			spin_unlock(&ctl->tree_lock);
+			if (ret) {
+				printk(KERN_ERR "Duplicate entries in "
+				       "free space cache, dumping\n");
 				kmem_cache_free(btrfs_free_space_cachep, e);
-				unlock_page(page);
-				page_cache_release(page);
 				goto free_cache;
 			}
-
-			if (entry->type == BTRFS_FREE_SPACE_EXTENT) {
-				spin_lock(&ctl->tree_lock);
-				ret = link_free_space(ctl, e);
-				spin_unlock(&ctl->tree_lock);
-				if (ret) {
-					printk(KERN_ERR "Duplicate entries in "
-					       "free space cache, dumping\n");
-					kunmap(page);
-					unlock_page(page);
-					page_cache_release(page);
-					goto free_cache;
-				}
-			} else {
-				e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
-				if (!e->bitmap) {
-					kunmap(page);
-					kmem_cache_free(
-						btrfs_free_space_cachep, e);
-					unlock_page(page);
-					page_cache_release(page);
-					goto free_cache;
-				}
-				spin_lock(&ctl->tree_lock);
-				ret = link_free_space(ctl, e);
-				ctl->total_bitmaps++;
-				ctl->op->recalc_thresholds(ctl);
-				spin_unlock(&ctl->tree_lock);
-				if (ret) {
-					printk(KERN_ERR "Duplicate entries in "
-					       "free space cache, dumping\n");
-					kunmap(page);
-					unlock_page(page);
-					page_cache_release(page);
-					goto free_cache;
-				}
-				list_add_tail(&e->list, &bitmaps);
-			}
-
-			num_entries--;
-			offset += sizeof(struct btrfs_free_space_entry);
-			if (offset + sizeof(struct btrfs_free_space_entry) >=
-			    PAGE_CACHE_SIZE)
-				break;
-			entry++;
+			list_add_tail(&e->list, &bitmaps);
 		}
 
-		/*
-		 * We read an entry out of this page, we need to move on to the
-		 * next page.
-		 */
-		if (need_loop) {
-			kunmap(page);
-			goto next;
-		}
+		num_entries--;
+	}
 
-		/*
-		 * We add the bitmaps at the end of the entries in order that
-		 * the bitmap entries are added to the cache.
-		 */
-		e = list_entry(bitmaps.next, struct btrfs_free_space, list);
+	/*
+	 * We add the bitmaps at the end of the entries in order that
+	 * the bitmap entries are added to the cache.
+	 */
+	list_for_each_entry_safe(e, n, &bitmaps, list) {
 		list_del_init(&e->list);
-		memcpy(e->bitmap, addr, PAGE_CACHE_SIZE);
-		kunmap(page);
-		num_bitmaps--;
-next:
-		unlock_page(page);
-		page_cache_release(page);
-		index++;
+		io_ctl_read_bitmap(&io_ctl, e);
 	}
 
+	io_ctl_drop_pages(&io_ctl);
 	ret = 1;
 out:
+	io_ctl_free(&io_ctl);
 	return ret;
 free_cache:
+	io_ctl_drop_pages(&io_ctl);
 	__btrfs_remove_free_space_cache(ctl);
 	goto out;
 }
@@ -554,40 +720,28 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	struct extent_buffer *leaf;
 	struct rb_node *node;
 	struct list_head *pos, *n;
-	struct page **pages;
-	struct page *page;
 	struct extent_state *cached_state = NULL;
 	struct btrfs_free_cluster *cluster = NULL;
 	struct extent_io_tree *unpin = NULL;
+	struct io_ctl io_ctl;
 	struct list_head bitmap_list;
 	struct btrfs_key key;
 	u64 start, end, len;
-	u64 bytes = 0;
-	u32 crc = ~(u32)0;
-	gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
-	int index = 0, num_pages = 0;
 	int entries = 0;
 	int bitmaps = 0;
 	int ret;
 	int err = -1;
-	bool next_page = false;
-	bool out_of_space = false;
 
 	INIT_LIST_HEAD(&bitmap_list);
 
 	if (!i_size_read(inode))
 		return -1;
 
-	num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >>
-		PAGE_CACHE_SHIFT;
-
 	filemap_write_and_wait(inode->i_mapping);
 	btrfs_wait_ordered_range(inode, inode->i_size &
 				 ~(root->sectorsize - 1), (u64)-1);
 
-	pages = kzalloc(sizeof(struct page *) * num_pages, GFP_NOFS);
-	if (!pages)
-		return -1;
+	io_ctl_init(&io_ctl, inode, root);
 
 	/* Get the cluster for this block_group if it exists */
 	if (block_group && !list_empty(&block_group->cluster_list))
@@ -601,30 +755,9 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	 */
 	unpin = root->fs_info->pinned_extents;
 
-	/*
-	 * Lock all pages first so we can lock the extent safely.
-	 *
-	 * NOTE: Because we hold the ref the entire time we're going to write to
-	 * the page find_get_page should never fail, so we don't do a check
-	 * after find_get_page at this point.  Just putting this here so people
-	 * know and don't freak out.
-	 */
-	while (index < num_pages) {
-		page = find_or_create_page(inode->i_mapping, index, mask);
-		if (!page) {
-			int i;
-
-			for (i = 0; i < num_pages; i++) {
-				unlock_page(pages[i]);
-				page_cache_release(pages[i]);
-			}
-			goto out;
-		}
-		pages[index] = page;
-		index++;
-	}
+	/* Lock all pages first so we can lock the extent safely. */
+	io_ctl_prepare_pages(&io_ctl, inode, 0);
 
-	index = 0;
 	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
 			 0, &cached_state, GFP_NOFS);
 
@@ -641,166 +774,78 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 		cluster = NULL;
 	}
 
-	/* Write out the extent entries */
-	do {
-		struct btrfs_free_space_entry *entry;
-		void *addr, *orig;
-		unsigned long offset = 0;
-
-		next_page = false;
+	io_ctl_set_generation(&io_ctl, trans->transid);
 
-		if (index >= num_pages) {
-			out_of_space = true;
-			break;
-		}
-
-		page = pages[index];
+	/* Write out the extent entries */
+	while (node) {
+		struct btrfs_free_space *e;
 
-		orig = addr = kmap(page);
-		if (index == 0) {
-			u64 *gen;
+		e = rb_entry(node, struct btrfs_free_space, offset_index);
+		entries++;
 
-			/*
-			 * We're going to put in a bogus crc for this page to
-			 * make sure that old kernels who aren't aware of this
-			 * format will be sure to discard the cache.
-			 */
-			addr += sizeof(u64);
-			offset += sizeof(u64);
+		ret = io_ctl_add_entry(&io_ctl, e->offset, e->bytes,
+				       e->bitmap);
+		if (ret)
+			goto out_nospc;
 
-			gen = addr;
-			*gen = trans->transid;
-			addr += sizeof(u64);
-			offset += sizeof(u64);
+		if (e->bitmap) {
+			list_add_tail(&e->list, &bitmap_list);
+			bitmaps++;
 		}
-		entry = addr;
-
-		memset(addr, 0, PAGE_CACHE_SIZE - offset);
-		while (node && !next_page) {
-			struct btrfs_free_space *e;
-
-			e = rb_entry(node, struct btrfs_free_space, offset_index);
-			entries++;
-
-			entry->offset = cpu_to_le64(e->offset);
-			entry->bytes = cpu_to_le64(e->bytes);
-			if (e->bitmap) {
-				entry->type = BTRFS_FREE_SPACE_BITMAP;
-				list_add_tail(&e->list, &bitmap_list);
-				bitmaps++;
-			} else {
-				entry->type = BTRFS_FREE_SPACE_EXTENT;
-			}
-			node = rb_next(node);
-			if (!node && cluster) {
-				node = rb_first(&cluster->root);
-				cluster = NULL;
-			}
-			offset += sizeof(struct btrfs_free_space_entry);
-			if (offset + sizeof(struct btrfs_free_space_entry) >=
-			    PAGE_CACHE_SIZE)
-				next_page = true;
-			entry++;
+		node = rb_next(node);
+		if (!node && cluster) {
+			node = rb_first(&cluster->root);
+			cluster = NULL;
 		}
+	}
 
-		/*
-		 * We want to add any pinned extents to our free space cache
-		 * so we don't leak the space
-		 */
-		while (block_group && !next_page &&
-		       (start < block_group->key.objectid +
-			block_group->key.offset)) {
-			ret = find_first_extent_bit(unpin, start, &start, &end,
-						    EXTENT_DIRTY);
-			if (ret) {
-				ret = 0;
-				break;
-			}
-
-			/* This pinned extent is out of our range */
-			if (start >= block_group->key.objectid +
-			    block_group->key.offset)
-				break;
-
-			len = block_group->key.objectid +
-				block_group->key.offset - start;
-			len = min(len, end + 1 - start);
-
-			entries++;
-			entry->offset = cpu_to_le64(start);
-			entry->bytes = cpu_to_le64(len);
-			entry->type = BTRFS_FREE_SPACE_EXTENT;
-
-			start = end + 1;
-			offset += sizeof(struct btrfs_free_space_entry);
-			if (offset + sizeof(struct btrfs_free_space_entry) >=
-			    PAGE_CACHE_SIZE)
-				next_page = true;
-			entry++;
+	/*
+	 * We want to add any pinned extents to our free space cache
+	 * so we don't leak the space
+	 */
+	while (block_group && (start < block_group->key.objectid +
+			       block_group->key.offset)) {
+		ret = find_first_extent_bit(unpin, start, &start, &end,
+					    EXTENT_DIRTY);
+		if (ret) {
+			ret = 0;
+			break;
 		}
 
-		/* Generate bogus crc value */
-		if (index == 0) {
-			u32 *tmp;
-			crc = btrfs_csum_data(root, orig + sizeof(u64), crc,
-					      PAGE_CACHE_SIZE - sizeof(u64));
-			btrfs_csum_final(crc, (char *)&crc);
-			crc++;
-			tmp = orig;
-			*tmp = crc;
-		}
+		/* This pinned extent is out of our range */
+		if (start >= block_group->key.objectid +
+		    block_group->key.offset)
+			break;
 
-		kunmap(page);
+		len = block_group->key.objectid +
+			block_group->key.offset - start;
+		len = min(len, end + 1 - start);
 
-		bytes += PAGE_CACHE_SIZE;
+		entries++;
+		ret = io_ctl_add_entry(&io_ctl, start, len, NULL);
+		if (ret)
+			goto out_nospc;
 
-		index++;
-	} while (node || next_page);
+		start = end + 1;
+	}
 
 	/* Write out the bitmaps */
 	list_for_each_safe(pos, n, &bitmap_list) {
-		void *addr;
 		struct btrfs_free_space *entry =
 			list_entry(pos, struct btrfs_free_space, list);
 
-		if (index >= num_pages) {
-			out_of_space = true;
-			break;
-		}
-		page = pages[index];
-
-		addr = kmap(page);
-		memcpy(addr, entry->bitmap, PAGE_CACHE_SIZE);
-		kunmap(page);
-		bytes += PAGE_CACHE_SIZE;
-
+		ret = io_ctl_add_bitmap(&io_ctl, entry->bitmap);
+		if (ret)
+			goto out_nospc;
 		list_del_init(&entry->list);
-		index++;
-	}
-
-	if (out_of_space) {
-		btrfs_drop_pages(pages, num_pages);
-		unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
-				     i_size_read(inode) - 1, &cached_state,
-				     GFP_NOFS);
-		goto out;
 	}
 
 	/* Zero out the rest of the pages just to make sure */
-	while (index < num_pages) {
-		void *addr;
+	io_ctl_zero_remaining_pages(&io_ctl);
 
-		page = pages[index];
-		addr = kmap(page);
-		memset(addr, 0, PAGE_CACHE_SIZE);
-		kunmap(page);
-		bytes += PAGE_CACHE_SIZE;
-		index++;
-	}
-
-	ret = btrfs_dirty_pages(root, inode, pages, num_pages, 0,
-					    bytes, &cached_state);
-	btrfs_drop_pages(pages, num_pages);
+	ret = btrfs_dirty_pages(root, inode, io_ctl.pages, io_ctl.num_pages,
+				0, i_size_read(inode), &cached_state);
+	io_ctl_drop_pages(&io_ctl);
 	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
 			     i_size_read(inode) - 1, &cached_state, GFP_NOFS);
 
@@ -817,7 +862,7 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 
 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 	if (ret < 0) {
-		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
+		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
 				 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
 		goto out;
@@ -830,7 +875,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
 		    found_key.offset != offset) {
-			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, bytes - 1,
+			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
+					 inode->i_size - 1,
 					 EXTENT_DIRTY | EXTENT_DELALLOC |
 					 EXTENT_DO_ACCOUNTING, 0, 0, NULL,
 					 GFP_NOFS);
@@ -848,13 +894,24 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 
 	err = 0;
 out:
-	kfree(pages);
+	io_ctl_free(&io_ctl);
 	if (err) {
-		invalidate_inode_pages2_range(inode->i_mapping, 0, index);
+		invalidate_inode_pages2(inode->i_mapping);
 		BTRFS_I(inode)->generation = 0;
 	}
 	btrfs_update_inode(trans, root, inode);
 	return err;
+
+out_nospc:
+	list_for_each_safe(pos, n, &bitmap_list) {
+		struct btrfs_free_space *entry =
+			list_entry(pos, struct btrfs_free_space, list);
+		list_del_init(&entry->list);
+	}
+	io_ctl_drop_pages(&io_ctl);
+	unlock_extent_cached(&BTRFS_I(inode)->io_tree, 0,
+			     i_size_read(inode) - 1, &cached_state, GFP_NOFS);
+	goto out;
 }
 
 int btrfs_write_out_cache(struct btrfs_root *root,
-- 
cgit v1.2.3


From 549b4fdb8f3c0708bbc0ee12ff955cd206c0f60c Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 5 Oct 2011 16:33:53 -0400
Subject: Btrfs: check the return value of filemap_write_and_wait in the space
 cache

We need to check the return value of filemap_write_and_wait in the space cache
writeout code.  Also don't set the inode's generation until we're sure nothing
else is going to fail.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/free-space-cache.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 35bfc13c9d42..abc924c9467c 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -852,9 +852,10 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	if (ret)
 		goto out;
 
-	BTRFS_I(inode)->generation = trans->transid;
 
-	filemap_write_and_wait(inode->i_mapping);
+	ret = filemap_write_and_wait(inode->i_mapping);
+	if (ret)
+		goto out;
 
 	key.objectid = BTRFS_FREE_SPACE_OBJECTID;
 	key.offset = offset;
@@ -884,6 +885,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 			goto out;
 		}
 	}
+
+	BTRFS_I(inode)->generation = trans->transid;
 	header = btrfs_item_ptr(leaf, path->slots[0],
 				struct btrfs_free_space_header);
 	btrfs_set_free_space_entries(leaf, header, entries);
-- 
cgit v1.2.3


From 9a82ca659d8bfd99afc0e89bbde2202322df5755 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 5 Oct 2011 16:35:28 -0400
Subject: Btrfs: take overflow into account in reserving space

My overcommit stuff can be a little racy when we're filling up the disk with
fs_mark and we overcommit into things that quickly get used up for data.  So use
num_bytes to see if we have enough available space so we're less likely to
overcommit ourselves out of the ability to make reservations.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f9711a82fc54..f95e55083bdb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3521,7 +3521,7 @@ again:
 			avail >>= 1;
 		 spin_unlock(&root->fs_info->free_chunk_lock);
 
-		if (used + orig_bytes < space_info->total_bytes + avail) {
+		if (used + num_bytes < space_info->total_bytes + avail) {
 			space_info->bytes_may_use += orig_bytes;
 			ret = 0;
 		}
-- 
cgit v1.2.3


From 5b0e95bf607ddd59b39f52d3d55e6581c817b530 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 6 Oct 2011 08:58:24 -0400
Subject: Btrfs: inline checksums into the disk free space cache

Yeah yeah I know this is how we used to do it and then I changed it, but damnit
I'm changing it back.  The fact is that writing out checksums will modify
metadata, which could cause us to dirty a block group we've already written out,
so we have to truncate it and all of it's checksums and re-write it which will
write new checksums which could dirty a blockg roup that has already been
written and you see where I'm going with this?  This can cause unmount or really
anything that depends on a transaction to commit to take it's sweet damned time
to happen.  So go back to the way it was, only this time we're specifically
setting NODATACOW because we can't go through the COW pathway anyway and we're
doing our own built-in cow'ing by truncating the free space cache.  The other
new thing is once we truncate the old cache and preallocate the new space, we
don't need to do that song and dance at all for the rest of the transaction, we
can just overwrite the existing space with the new cache if the block group
changes for whatever reason, and the NODATACOW will let us do this fine.  So
keep track of which transaction we last cleared our cache in and if we cleared
it in this transaction just say we're all setup and carry on.  This survives
xfstests and stress.sh.

The inode cache will continue to use the normal csum infrastructure since it
only gets written once and there will be no more modifications to the fs tree in
a transaction commit.

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h            |   1 +
 fs/btrfs/extent-tree.c      |  18 ++--
 fs/btrfs/free-space-cache.c | 211 ++++++++++++++++++++++++++++++++------------
 fs/btrfs/inode.c            |  10 +--
 4 files changed, 172 insertions(+), 68 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 1eafccb162ee..ea60897a9171 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -838,6 +838,7 @@ struct btrfs_block_group_cache {
 	u64 bytes_super;
 	u64 flags;
 	u64 sectorsize;
+	u64 cache_generation;
 	unsigned int ro:1;
 	unsigned int dirty:1;
 	unsigned int iref:1;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index f95e55083bdb..0abf70c984e9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2717,6 +2717,13 @@ again:
 		goto again;
 	}
 
+	/* We've already setup this transaction, go ahead and exit */
+	if (block_group->cache_generation == trans->transid &&
+	    i_size_read(inode)) {
+		dcs = BTRFS_DC_SETUP;
+		goto out_put;
+	}
+
 	/*
 	 * We want to set the generation to 0, that way if anything goes wrong
 	 * from here on out we know not to trust this cache when we load up next
@@ -2756,19 +2763,16 @@ again:
 	num_pages *= 16;
 	num_pages *= PAGE_CACHE_SIZE;
 
-	ret = btrfs_delalloc_reserve_space(inode, num_pages);
+	ret = btrfs_check_data_free_space(inode, num_pages);
 	if (ret)
 		goto out_put;
 
 	ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
 					      num_pages, num_pages,
 					      &alloc_hint);
-	if (!ret) {
+	if (!ret)
 		dcs = BTRFS_DC_SETUP;
-		btrfs_free_reserved_data_space(inode, num_pages);
-	} else {
-		btrfs_delalloc_release_space(inode, num_pages);
-	}
+	btrfs_free_reserved_data_space(inode, num_pages);
 
 out_put:
 	iput(inode);
@@ -2776,6 +2780,8 @@ out_free:
 	btrfs_release_path(path);
 out:
 	spin_lock(&block_group->lock);
+	if (!ret)
+		block_group->cache_generation = trans->transid;
 	block_group->disk_cache_state = dcs;
 	spin_unlock(&block_group->lock);
 
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index abc924c9467c..5d40c1ed8225 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -85,6 +85,7 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
 				      *block_group, struct btrfs_path *path)
 {
 	struct inode *inode = NULL;
+	u32 flags = BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
 
 	spin_lock(&block_group->lock);
 	if (block_group->inode)
@@ -99,9 +100,10 @@ struct inode *lookup_free_space_inode(struct btrfs_root *root,
 		return inode;
 
 	spin_lock(&block_group->lock);
-	if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) {
+	if (!((BTRFS_I(inode)->flags & flags) == flags)) {
 		printk(KERN_INFO "Old style space inode found, converting.\n");
-		BTRFS_I(inode)->flags &= ~BTRFS_INODE_NODATASUM;
+		BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM |
+			BTRFS_INODE_NODATACOW;
 		block_group->disk_cache_state = BTRFS_DC_CLEAR;
 	}
 
@@ -123,12 +125,17 @@ int __create_free_space_inode(struct btrfs_root *root,
 	struct btrfs_free_space_header *header;
 	struct btrfs_inode_item *inode_item;
 	struct extent_buffer *leaf;
+	u64 flags = BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC;
 	int ret;
 
 	ret = btrfs_insert_empty_inode(trans, root, path, ino);
 	if (ret)
 		return ret;
 
+	/* We inline crc's for the free disk space cache */
+	if (ino != BTRFS_FREE_INO_OBJECTID)
+		flags |= BTRFS_INODE_NODATASUM | BTRFS_INODE_NODATACOW;
+
 	leaf = path->nodes[0];
 	inode_item = btrfs_item_ptr(leaf, path->slots[0],
 				    struct btrfs_inode_item);
@@ -141,8 +148,7 @@ int __create_free_space_inode(struct btrfs_root *root,
 	btrfs_set_inode_uid(leaf, inode_item, 0);
 	btrfs_set_inode_gid(leaf, inode_item, 0);
 	btrfs_set_inode_mode(leaf, inode_item, S_IFREG | 0600);
-	btrfs_set_inode_flags(leaf, inode_item, BTRFS_INODE_NOCOMPRESS |
-			      BTRFS_INODE_PREALLOC);
+	btrfs_set_inode_flags(leaf, inode_item, flags);
 	btrfs_set_inode_nlink(leaf, inode_item, 1);
 	btrfs_set_inode_transid(leaf, inode_item, trans->transid);
 	btrfs_set_inode_block_group(leaf, inode_item, offset);
@@ -249,6 +255,7 @@ struct io_ctl {
 	unsigned long size;
 	int index;
 	int num_pages;
+	unsigned check_crcs:1;
 };
 
 static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
@@ -262,6 +269,8 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode,
 	if (!io_ctl->pages)
 		return -ENOMEM;
 	io_ctl->root = root;
+	if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID)
+		io_ctl->check_crcs = 1;
 	return 0;
 }
 
@@ -340,25 +349,39 @@ static void io_ctl_set_generation(struct io_ctl *io_ctl, u64 generation)
 	io_ctl_map_page(io_ctl, 1);
 
 	/*
-	 * Skip the first 64bits to make sure theres a bogus crc for old
-	 * kernels
+	 * Skip the csum areas.  If we don't check crcs then we just have a
+	 * 64bit chunk at the front of the first page.
 	 */
-	io_ctl->cur += sizeof(u64);
+	if (io_ctl->check_crcs) {
+		io_ctl->cur += (sizeof(u32) * io_ctl->num_pages);
+		io_ctl->size -= sizeof(u64) + (sizeof(u32) * io_ctl->num_pages);
+	} else {
+		io_ctl->cur += sizeof(u64);
+		io_ctl->size -= sizeof(u64) * 2;
+	}
 
 	val = io_ctl->cur;
 	*val = cpu_to_le64(generation);
 	io_ctl->cur += sizeof(u64);
-	io_ctl->size -= sizeof(u64) * 2;
 }
 
 static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
 {
 	u64 *gen;
 
-	io_ctl_map_page(io_ctl, 0);
+	/*
+	 * Skip the crc area.  If we don't check crcs then we just have a 64bit
+	 * chunk at the front of the first page.
+	 */
+	if (io_ctl->check_crcs) {
+		io_ctl->cur += sizeof(u32) * io_ctl->num_pages;
+		io_ctl->size -= sizeof(u64) +
+			(sizeof(u32) * io_ctl->num_pages);
+	} else {
+		io_ctl->cur += sizeof(u64);
+		io_ctl->size -= sizeof(u64) * 2;
+	}
 
-	/* Skip the bogus crc area */
-	io_ctl->cur += sizeof(u64);
 	gen = io_ctl->cur;
 	if (le64_to_cpu(*gen) != generation) {
 		printk_ratelimited(KERN_ERR "btrfs: space cache generation "
@@ -368,7 +391,63 @@ static int io_ctl_check_generation(struct io_ctl *io_ctl, u64 generation)
 		return -EIO;
 	}
 	io_ctl->cur += sizeof(u64);
-	io_ctl->size -= sizeof(u64) * 2;
+	return 0;
+}
+
+static void io_ctl_set_crc(struct io_ctl *io_ctl, int index)
+{
+	u32 *tmp;
+	u32 crc = ~(u32)0;
+	unsigned offset = 0;
+
+	if (!io_ctl->check_crcs) {
+		io_ctl_unmap_page(io_ctl);
+		return;
+	}
+
+	if (index == 0)
+		offset = sizeof(u32) * io_ctl->num_pages;;
+
+	crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
+			      PAGE_CACHE_SIZE - offset);
+	btrfs_csum_final(crc, (char *)&crc);
+	io_ctl_unmap_page(io_ctl);
+	tmp = kmap(io_ctl->pages[0]);
+	tmp += index;
+	*tmp = crc;
+	kunmap(io_ctl->pages[0]);
+}
+
+static int io_ctl_check_crc(struct io_ctl *io_ctl, int index)
+{
+	u32 *tmp, val;
+	u32 crc = ~(u32)0;
+	unsigned offset = 0;
+
+	if (!io_ctl->check_crcs) {
+		io_ctl_map_page(io_ctl, 0);
+		return 0;
+	}
+
+	if (index == 0)
+		offset = sizeof(u32) * io_ctl->num_pages;
+
+	tmp = kmap(io_ctl->pages[0]);
+	tmp += index;
+	val = *tmp;
+	kunmap(io_ctl->pages[0]);
+
+	io_ctl_map_page(io_ctl, 0);
+	crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + offset, crc,
+			      PAGE_CACHE_SIZE - offset);
+	btrfs_csum_final(crc, (char *)&crc);
+	if (val != crc) {
+		printk_ratelimited(KERN_ERR "btrfs: csum mismatch on free "
+				   "space cache\n");
+		io_ctl_unmap_page(io_ctl);
+		return -EIO;
+	}
+
 	return 0;
 }
 
@@ -391,22 +470,7 @@ static int io_ctl_add_entry(struct io_ctl *io_ctl, u64 offset, u64 bytes,
 	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
 		return 0;
 
-	/*
-	 * index == 1 means the current page is 0, we need to generate a bogus
-	 * crc for older kernels.
-	 */
-	if (io_ctl->index == 1) {
-		u32 *tmp;
-		u32 crc = ~(u32)0;
-
-		crc = btrfs_csum_data(io_ctl->root, io_ctl->orig + sizeof(u64),
-				      crc, PAGE_CACHE_SIZE - sizeof(u64));
-		btrfs_csum_final(crc, (char *)&crc);
-		crc++;
-		tmp = io_ctl->orig;
-		*tmp = crc;
-	}
-	io_ctl_unmap_page(io_ctl);
+	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 
 	/* No more pages to map */
 	if (io_ctl->index >= io_ctl->num_pages)
@@ -427,14 +491,14 @@ static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
 	 * map the next one if there is any left.
 	 */
 	if (io_ctl->cur != io_ctl->orig) {
-		io_ctl_unmap_page(io_ctl);
+		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 		if (io_ctl->index >= io_ctl->num_pages)
 			return -ENOSPC;
 		io_ctl_map_page(io_ctl, 0);
 	}
 
 	memcpy(io_ctl->cur, bitmap, PAGE_CACHE_SIZE);
-	io_ctl_unmap_page(io_ctl);
+	io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 	if (io_ctl->index < io_ctl->num_pages)
 		io_ctl_map_page(io_ctl, 0);
 	return 0;
@@ -442,51 +506,60 @@ static int io_ctl_add_bitmap(struct io_ctl *io_ctl, void *bitmap)
 
 static void io_ctl_zero_remaining_pages(struct io_ctl *io_ctl)
 {
-	io_ctl_unmap_page(io_ctl);
+	/*
+	 * If we're not on the boundary we know we've modified the page and we
+	 * need to crc the page.
+	 */
+	if (io_ctl->cur != io_ctl->orig)
+		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
+	else
+		io_ctl_unmap_page(io_ctl);
 
 	while (io_ctl->index < io_ctl->num_pages) {
 		io_ctl_map_page(io_ctl, 1);
-		io_ctl_unmap_page(io_ctl);
+		io_ctl_set_crc(io_ctl, io_ctl->index - 1);
 	}
 }
 
-static u8 io_ctl_read_entry(struct io_ctl *io_ctl,
-			    struct btrfs_free_space *entry)
+static int io_ctl_read_entry(struct io_ctl *io_ctl,
+			    struct btrfs_free_space *entry, u8 *type)
 {
 	struct btrfs_free_space_entry *e;
-	u8 type;
 
 	e = io_ctl->cur;
 	entry->offset = le64_to_cpu(e->offset);
 	entry->bytes = le64_to_cpu(e->bytes);
-	type = e->type;
+	*type = e->type;
 	io_ctl->cur += sizeof(struct btrfs_free_space_entry);
 	io_ctl->size -= sizeof(struct btrfs_free_space_entry);
 
 	if (io_ctl->size >= sizeof(struct btrfs_free_space_entry))
-		return type;
+		return 0;
 
 	io_ctl_unmap_page(io_ctl);
 
 	if (io_ctl->index >= io_ctl->num_pages)
-		return type;
+		return 0;
 
-	io_ctl_map_page(io_ctl, 0);
-	return type;
+	return io_ctl_check_crc(io_ctl, io_ctl->index);
 }
 
-static void io_ctl_read_bitmap(struct io_ctl *io_ctl,
-			       struct btrfs_free_space *entry)
+static int io_ctl_read_bitmap(struct io_ctl *io_ctl,
+			      struct btrfs_free_space *entry)
 {
-	BUG_ON(!io_ctl->cur);
-	if (io_ctl->cur != io_ctl->orig) {
+	int ret;
+
+	if (io_ctl->cur && io_ctl->cur != io_ctl->orig)
 		io_ctl_unmap_page(io_ctl);
-		io_ctl_map_page(io_ctl, 0);
-	}
+
+	ret = io_ctl_check_crc(io_ctl, io_ctl->index);
+	if (ret)
+		return ret;
+
 	memcpy(entry->bitmap, io_ctl->cur, PAGE_CACHE_SIZE);
 	io_ctl_unmap_page(io_ctl);
-	if (io_ctl->index < io_ctl->num_pages)
-		io_ctl_map_page(io_ctl, 0);
+
+	return 0;
 }
 
 int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
@@ -553,6 +626,10 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 	if (ret)
 		goto out;
 
+	ret = io_ctl_check_crc(&io_ctl, 0);
+	if (ret)
+		goto free_cache;
+
 	ret = io_ctl_check_generation(&io_ctl, generation);
 	if (ret)
 		goto free_cache;
@@ -563,7 +640,12 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 		if (!e)
 			goto free_cache;
 
-		type = io_ctl_read_entry(&io_ctl, e);
+		ret = io_ctl_read_entry(&io_ctl, e, &type);
+		if (ret) {
+			kmem_cache_free(btrfs_free_space_cachep, e);
+			goto free_cache;
+		}
+
 		if (!e->bytes) {
 			kmem_cache_free(btrfs_free_space_cachep, e);
 			goto free_cache;
@@ -611,7 +693,9 @@ int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 	 */
 	list_for_each_entry_safe(e, n, &bitmaps, list) {
 		list_del_init(&e->list);
-		io_ctl_read_bitmap(&io_ctl, e);
+		ret = io_ctl_read_bitmap(&io_ctl, e);
+		if (ret)
+			goto free_cache;
 	}
 
 	io_ctl_drop_pages(&io_ctl);
@@ -632,7 +716,7 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
 	struct btrfs_root *root = fs_info->tree_root;
 	struct inode *inode;
 	struct btrfs_path *path;
-	int ret;
+	int ret = 0;
 	bool matched;
 	u64 used = btrfs_block_group_used(&block_group->item);
 
@@ -664,6 +748,14 @@ int load_free_space_cache(struct btrfs_fs_info *fs_info,
 		return 0;
 	}
 
+	/* We may have converted the inode and made the cache invalid. */
+	spin_lock(&block_group->lock);
+	if (block_group->disk_cache_state != BTRFS_DC_WRITTEN) {
+		spin_unlock(&block_group->lock);
+		goto out;
+	}
+	spin_unlock(&block_group->lock);
+
 	ret = __load_free_space_cache(fs_info->tree_root, inode, ctl,
 				      path, block_group->key.objectid);
 	btrfs_free_path(path);
@@ -774,6 +866,13 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 		cluster = NULL;
 	}
 
+	/* Make sure we can fit our crcs into the first page */
+	if (io_ctl.check_crcs &&
+	    (io_ctl.num_pages * sizeof(u32)) >= PAGE_CACHE_SIZE) {
+		WARN_ON(1);
+		goto out_nospc;
+	}
+
 	io_ctl_set_generation(&io_ctl, trans->transid);
 
 	/* Write out the extent entries */
@@ -864,8 +963,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 	if (ret < 0) {
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1,
-				 EXTENT_DIRTY | EXTENT_DELALLOC |
-				 EXTENT_DO_ACCOUNTING, 0, 0, NULL, GFP_NOFS);
+				 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, NULL,
+				 GFP_NOFS);
 		goto out;
 	}
 	leaf = path->nodes[0];
@@ -878,9 +977,8 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 		    found_key.offset != offset) {
 			clear_extent_bit(&BTRFS_I(inode)->io_tree, 0,
 					 inode->i_size - 1,
-					 EXTENT_DIRTY | EXTENT_DELALLOC |
-					 EXTENT_DO_ACCOUNTING, 0, 0, NULL,
-					 GFP_NOFS);
+					 EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0,
+					 NULL, GFP_NOFS);
 			btrfs_release_path(path);
 			goto out;
 		}
@@ -942,7 +1040,6 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 	ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
 				      path, block_group->key.objectid);
 	if (ret) {
-		btrfs_delalloc_release_metadata(inode, inode->i_size);
 		spin_lock(&block_group->lock);
 		block_group->disk_cache_state = BTRFS_DC_ERROR;
 		spin_unlock(&block_group->lock);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 15adfb542502..246397d8478f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1792,12 +1792,12 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
 	}
 	ret = 0;
 out:
-	btrfs_delalloc_release_metadata(inode, ordered_extent->len);
-	if (nolock) {
-		if (trans)
+	if (root != root->fs_info->tree_root)
+		btrfs_delalloc_release_metadata(inode, ordered_extent->len);
+	if (trans) {
+		if (nolock)
 			btrfs_end_transaction_nolock(trans, root);
-	} else {
-		if (trans)
+		else
 			btrfs_end_transaction(trans, root);
 	}
 
-- 
cgit v1.2.3


From 4b91c14f913f649d4302b3677b85c4ce87a3d8e7 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 7 Oct 2011 11:55:34 -0400
Subject: Btrfs: wait for ordered extents if we didn't reclaim enough

I noticed recently that my overcommit patch was causing one of my enospc tests
to fail 25% of the time with early ENOSPC.  This is because my overcommit patch
was letting us go way over board, but it wasn't waiting long enough to let the
delalloc shrinker do it's job.  The problem is we just start writeback and wait
a little bit hoping we flush enough, but we only free up delalloc space by
having the writes complete all the way.  We do this by waiting for ordered
extents, which we do but only if we already free'd enough for the reservation,
which isn't right, we should flush ordered extents if we didn't reclaim enough
in case that will push us over the edge.  With this patch I've not seen a
failure in this enospc test after running it in a loop for an hour.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 0abf70c984e9..fc0de6880045 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3406,7 +3406,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 		}
 
 	}
-	if (reclaimed >= to_reclaim && !trans)
+	if (reclaimed < to_reclaim && !trans)
 		btrfs_wait_ordered_extents(root, 0, 0);
 	return reclaimed >= to_reclaim;
 }
-- 
cgit v1.2.3


From e70bea5fe0e3d6355fd95674eaff5aa5a32f0564 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 11 Oct 2011 14:18:24 -0400
Subject: Btrfs: fix the amount of space reserved for unlink

Our unlink reservations were a bit much, we were reserving 10 and I only count 8
possible items we're touching, so comment what we're reserving for and fix the
count value.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/inode.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 246397d8478f..1f013c5c36aa 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2758,7 +2758,16 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
 	u64 ino = btrfs_ino(inode);
 	u64 dir_ino = btrfs_ino(dir);
 
-	trans = btrfs_start_transaction(root, 10);
+	/*
+	 * 1 for the possible orphan item
+	 * 1 for the dir item
+	 * 1 for the dir index
+	 * 1 for the inode ref
+	 * 1 for the inode ref in the tree log
+	 * 2 for the dir entries in the log
+	 * 1 for the inode
+	 */
+	trans = btrfs_start_transaction(root, 8);
 	if (!IS_ERR(trans) || PTR_ERR(trans) != -ENOSPC)
 		return trans;
 
-- 
cgit v1.2.3


From ed3ee9f44ba55eb6acfbfc8caa881e0253710d2a Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 13 Oct 2011 13:09:22 -0400
Subject: Btrfs: fix regression in re-setting a large xattr

Recently I changed the xattr stuff to unconditionally set the xattr first in
case the xattr didn't exist yet.  This has introduced a regression when setting
an xattr that already exists with a large value.  If we find the key we are
looking for split_leaf will assume that we're extending that item.  The problem
is the size we pass down to btrfs_search_slot includes the size of the item
already, so if we have the largest xattr we can possibly have plus the size of
the xattr item plus the xattr item that btrfs_search_slot we'd overflow the
leaf.  Thankfully this is not what we're doing, but split_leaf doesn't know this
so it just returns EOVERFLOW.  So in the xattr code we need to check and see if
we got back EOVERFLOW and treat it like EEXIST since that's really what
happened.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/xattr.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 69565e5fc6a0..a76e41c04b71 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -127,6 +127,17 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
 again:
 	ret = btrfs_insert_xattr_item(trans, root, path, btrfs_ino(inode),
 				      name, name_len, value, size);
+	/*
+	 * If we're setting an xattr to a new value but the new value is say
+	 * exactly BTRFS_MAX_XATTR_SIZE, we could end up with EOVERFLOW getting
+	 * back from split_leaf.  This is because it thinks we'll be extending
+	 * the existing item size, but we're asking for enough space to add the
+	 * item itself.  So if we get EOVERFLOW just set ret to EEXIST and let
+	 * the rest of the function figure it out.
+	 */
+	if (ret == -EOVERFLOW)
+		ret = -EEXIST;
+
 	if (ret == -EEXIST) {
 		if (flags & XATTR_CREATE)
 			goto out;
-- 
cgit v1.2.3


From bbb495c2ed9d34894cb3d6d366866fc92a2e1adc Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 14 Oct 2011 13:37:45 -0400
Subject: Btrfs: don't check bytes_pinned to determine if we should commit the
 transaction

Before the only reason to commit the transaction to recover space in
reserve_metadata_bytes() was if there were enough pinned_bytes to satisfy our
reservation.  But now we have the delayed inode stuff which will hold it's
reservations until we commit the transaction.  So say we max out our reservation
by creating a bunch of files but don't have any pinned bytes we will ENOSPC out
early even though we could commit the transaction and get that space back.  So
now just unconditionally commit the transaction since currently there is no way
to know how much metadata space is being reserved by delayed inode stuff.
Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fc0de6880045..79365a40cb3a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3568,17 +3568,6 @@ again:
 		goto again;
 	}
 
-	/*
-	 * Not enough space to be reclaimed, don't bother committing the
-	 * transaction.
-	 */
-	spin_lock(&space_info->lock);
-	if (space_info->bytes_pinned < orig_bytes)
-		ret = -ENOSPC;
-	spin_unlock(&space_info->lock);
-	if (ret)
-		goto out;
-
 	ret = -EAGAIN;
 	if (trans)
 		goto out;
-- 
cgit v1.2.3


From f104d044376aadcee74605d66b8d9dc2e145782c Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 14 Oct 2011 13:56:58 -0400
Subject: Btrfs: wait for ordered extents if we're in trouble when shrinking
 delalloc

The only way we actually reclaim delalloc space is waiting for the IO to
completely finish.  Usually we kick off a bunch of IO and wait for a little bit
and hope we can make our reservation, and usually this works out pretty well.
With overcommit however we can get seriously underwater if we're filling up the
disk quickly, so we need to be able to force the delalloc shrinker to wait for
the ordered IO to finish to give us a better chance of actually reclaiming
enough space to get our reservation.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 79365a40cb3a..96cbc5104959 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3334,7 +3334,8 @@ out:
  * shrink metadata reservation for delalloc
  */
 static int shrink_delalloc(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, u64 to_reclaim, int sync)
+			   struct btrfs_root *root, u64 to_reclaim,
+			   bool wait_ordered)
 {
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_space_info *space_info;
@@ -3387,11 +3388,15 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 		if (trans && trans->transaction->blocked)
 			return -EAGAIN;
 
-		time_left = schedule_timeout_interruptible(1);
+		if (wait_ordered && !trans) {
+			btrfs_wait_ordered_extents(root, 0, 0);
+		} else {
+			time_left = schedule_timeout_interruptible(1);
 
-		/* We were interrupted, exit */
-		if (time_left)
-			break;
+			/* We were interrupted, exit */
+			if (time_left)
+				break;
+		}
 
 		/* we've kicked the IO a few times, if anything has been freed,
 		 * exit.  There is no sense in looping here for a long time
@@ -3406,8 +3411,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 		}
 
 	}
-	if (reclaimed < to_reclaim && !trans)
-		btrfs_wait_ordered_extents(root, 0, 0);
+
 	return reclaimed >= to_reclaim;
 }
 
@@ -3438,6 +3442,7 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
 	int ret = 0;
 	bool committed = false;
 	bool flushing = false;
+	bool wait_ordered = false;
 
 	trans = (struct btrfs_trans_handle *)current->journal_info;
 again:
@@ -3496,6 +3501,7 @@ again:
 		 * amount plus the amount of bytes that we need for this
 		 * reservation.
 		 */
+		wait_ordered = true;
 		num_bytes = used - space_info->total_bytes +
 			(orig_bytes * (retries + 1));
 	}
@@ -3530,6 +3536,8 @@ again:
 		if (used + num_bytes < space_info->total_bytes + avail) {
 			space_info->bytes_may_use += orig_bytes;
 			ret = 0;
+		} else {
+			wait_ordered = true;
 		}
 	}
 
@@ -3552,7 +3560,7 @@ again:
 	 * We do synchronous shrinking since we don't actually unreserve
 	 * metadata until after the IO is completed.
 	 */
-	ret = shrink_delalloc(trans, root, num_bytes, 1);
+	ret = shrink_delalloc(trans, root, num_bytes, wait_ordered);
 	if (ret < 0)
 		goto out;
 
@@ -3564,6 +3572,7 @@ again:
 	 * so go back around and try again.
 	 */
 	if (retries < 2) {
+		wait_ordered = true;
 		retries++;
 		goto again;
 	}
-- 
cgit v1.2.3


From 877da174301dde9062b915da4c8103048be49702 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 14 Oct 2011 14:02:10 -0400
Subject: Btrfs: allow shrink_delalloc flush the needed reclaimed pages

Currently we only allow a maximum of 2 megabytes of pages to be flushed at a
time.  This was ok before, but now we have overcommit which will screw us in a
heartbeat if we are quickly filling the disk.  So instead pick either 2
megabytes or the number of pages we need to reclaim to be safe again, which ever
is larger.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 96cbc5104959..424ae82855c8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3343,7 +3343,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	u64 max_reclaim;
 	u64 reclaimed = 0;
 	long time_left;
-	int nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
+	unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
 	int loops = 0;
 	unsigned long progress;
 
@@ -3366,7 +3366,8 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	}
 
 	max_reclaim = min(reserved, to_reclaim);
-
+	nr_pages = max_t(unsigned long, nr_pages,
+			 max_reclaim >> PAGE_CACHE_SHIFT);
 	while (loops < 1024) {
 		/* have the flusher threads jump in and do some IO */
 		smp_mb();
-- 
cgit v1.2.3


From b24e03db0df3e9164c9649db12fecc8c2d81b0d1 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 14 Oct 2011 14:40:17 -0400
Subject: Btrfs: release trans metadata bytes before flushing delayed refs

We started setting trans->block_rsv = NULL to allow the delayed refs flushing
stuff to use the right block_rsv and then just made
btrfs_trans_release_metadata() unconditionally use the trans block rsv.  The
problem with this is we need to reserve some space in the transaction and then
migrate it to the global block rsv, so we need to be able to free that out
properly.  So instead just move btrfs_trans_release_metadata() before the
delayed ref flushing and use trans->block_rsv for the freeing.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 5 +----
 fs/btrfs/transaction.c | 6 ++----
 2 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 424ae82855c8..eb4fe56b08bb 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3909,13 +3909,10 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root)
 {
-	struct btrfs_block_rsv *block_rsv;
-
 	if (!trans->bytes_reserved)
 		return;
 
-	block_rsv = &root->fs_info->trans_block_rsv;
-	btrfs_block_rsv_release(root, block_rsv, trans->bytes_reserved);
+	btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
 	trans->bytes_reserved = 0;
 }
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 1e1a4816ccb0..d064fa0a4a07 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -462,6 +462,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		return 0;
 	}
 
+	btrfs_trans_release_metadata(trans, root);
 	trans->block_rsv = NULL;
 	while (count < 4) {
 		unsigned long cur = trans->delayed_ref_updates;
@@ -483,8 +484,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
 		count++;
 	}
 
-	btrfs_trans_release_metadata(trans, root);
-
 	if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) &&
 	    should_end_transaction(trans, root)) {
 		trans->transaction->blocked = 1;
@@ -1128,6 +1127,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 
 	btrfs_run_ordered_operations(root, 0);
 
+	btrfs_trans_release_metadata(trans, root);
 	trans->block_rsv = NULL;
 
 	/* make a pass through all the delayed refs we have so far
@@ -1136,8 +1136,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	ret = btrfs_run_delayed_refs(trans, root, 0);
 	BUG_ON(ret);
 
-	btrfs_trans_release_metadata(trans, root);
-
 	cur_trans = trans->transaction;
 	/*
 	 * set the flushing flag so procs in this transaction have to
-- 
cgit v1.2.3


From 3880a1b46d87a6b030c31889875befc745d95dff Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 14 Oct 2011 14:46:51 -0400
Subject: Btrfs: reserve some space for an orphan item when unlinking

In __unlink_start_trans() if we don't have enough room for a reservation we will
check to see if the unlink will free up space.  If it does that's great, but we
will still could add an orphan item, so we need to reserve enough space to add
the orphan item.  Do this and migrate the space the global reserve so it all
works out right.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/inode.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1f013c5c36aa..b6b70bdd0992 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2790,7 +2790,8 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
 		return ERR_PTR(-ENOMEM);
 	}
 
-	trans = btrfs_start_transaction(root, 0);
+	/* 1 for the orphan item */
+	trans = btrfs_start_transaction(root, 1);
 	if (IS_ERR(trans)) {
 		btrfs_free_path(path);
 		root->fs_info->enospc_unlink = 0;
@@ -2895,6 +2896,12 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir,
 	err = 0;
 out:
 	btrfs_free_path(path);
+	/* Migrate the orphan reservation over */
+	if (!err)
+		err = btrfs_block_rsv_migrate(trans->block_rsv,
+				&root->fs_info->global_block_rsv,
+				btrfs_calc_trans_metadata_size(root, 1));
+
 	if (err) {
 		btrfs_end_transaction(trans, root);
 		root->fs_info->enospc_unlink = 0;
-- 
cgit v1.2.3


From 36ba022ac0b748dd543f43430b03198e899426c9 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 18 Oct 2011 12:15:48 -0400
Subject: Btrfs: seperate out btrfs_block_rsv_check out into 2 different
 functions

Currently btrfs_block_rsv_check does 2 things, it will either refill a block
reserve like in the truncate or refill case, or it will check to see if there is
enough space in the global reserve and possibly refill it.  However because of
overcommit we could be well overcommitting ourselves just to try and refill the
global reserve, when really we should just be committing the transaction.  So
breack this out into btrfs_block_rsv_refill and btrfs_block_rsv_check.  Refill
will try to reserve more metadata if it can and btrfs_block_rsv_check will not,
it will only tell you if the factor of the total space is still reserved.
Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/ctree.h            |  4 +++-
 fs/btrfs/extent-tree.c      | 41 +++++++++++++++++++++++++++--------------
 fs/btrfs/free-space-cache.c |  2 +-
 fs/btrfs/inode.c            |  4 ++--
 fs/btrfs/relocation.c       |  6 ++----
 fs/btrfs/transaction.c      |  4 ++--
 6 files changed, 37 insertions(+), 24 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index ea60897a9171..227620993bce 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2252,8 +2252,10 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
 			u64 num_bytes);
 int btrfs_block_rsv_check(struct btrfs_root *root,
+			  struct btrfs_block_rsv *block_rsv, int min_factor);
+int btrfs_block_rsv_refill(struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv,
-			  u64 min_reserved, int min_factor, int flush);
+			  u64 min_reserved);
 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
 			    struct btrfs_block_rsv *dst_rsv,
 			    u64 num_bytes);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index eb4fe56b08bb..a5f1421eeee9 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3422,7 +3422,6 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
  * @block_rsv - the block_rsv we're allocating for
  * @orig_bytes - the number of bytes we want
  * @flush - wether or not we can flush to make our reservation
- * @check - wether this is just to check if we have enough space or not
  *
  * This will reserve orgi_bytes number of bytes from the space info associated
  * with the block_rsv.  If there is not enough space it will make an attempt to
@@ -3433,7 +3432,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
  */
 static int reserve_metadata_bytes(struct btrfs_root *root,
 				  struct btrfs_block_rsv *block_rsv,
-				  u64 orig_bytes, int flush, int check)
+				  u64 orig_bytes, int flush)
 {
 	struct btrfs_space_info *space_info = block_rsv->space_info;
 	struct btrfs_trans_handle *trans;
@@ -3507,7 +3506,7 @@ again:
 			(orig_bytes * (retries + 1));
 	}
 
-	if (ret && !check) {
+	if (ret) {
 		u64 profile = btrfs_get_alloc_profile(root, 0);
 		u64 avail;
 
@@ -3742,7 +3741,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
 	if (num_bytes == 0)
 		return 0;
 
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1, 0);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 1);
 		return 0;
@@ -3752,8 +3751,7 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
 }
 
 int btrfs_block_rsv_check(struct btrfs_root *root,
-			  struct btrfs_block_rsv *block_rsv,
-			  u64 min_reserved, int min_factor, int flush)
+			  struct btrfs_block_rsv *block_rsv, int min_factor)
 {
 	u64 num_bytes = 0;
 	int ret = -ENOSPC;
@@ -3762,11 +3760,26 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
 		return 0;
 
 	spin_lock(&block_rsv->lock);
-	if (min_factor > 0)
-		num_bytes = div_factor(block_rsv->size, min_factor);
-	if (min_reserved > num_bytes)
-		num_bytes = min_reserved;
+	num_bytes = div_factor(block_rsv->size, min_factor);
+	if (block_rsv->reserved >= num_bytes)
+		ret = 0;
+	spin_unlock(&block_rsv->lock);
 
+	return ret;
+}
+
+int btrfs_block_rsv_refill(struct btrfs_root *root,
+			  struct btrfs_block_rsv *block_rsv,
+			  u64 min_reserved)
+{
+	u64 num_bytes = 0;
+	int ret = -ENOSPC;
+
+	if (!block_rsv)
+		return 0;
+
+	spin_lock(&block_rsv->lock);
+	num_bytes = min_reserved;
 	if (block_rsv->reserved >= num_bytes)
 		ret = 0;
 	else
@@ -3776,7 +3789,7 @@ int btrfs_block_rsv_check(struct btrfs_root *root,
 	if (!ret)
 		return 0;
 
-	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush, !flush);
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 1);
 	if (!ret) {
 		block_rsv_add_bytes(block_rsv, num_bytes, 0);
 		return 0;
@@ -4073,7 +4086,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
 	to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
 	spin_unlock(&BTRFS_I(inode)->lock);
 
-	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush, 0);
+	ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
 	if (ret) {
 		u64 to_free = 0;
 		unsigned dropped;
@@ -5728,7 +5741,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	block_rsv = get_block_rsv(trans, root);
 
 	if (block_rsv->size == 0) {
-		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0);
+		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
 		/*
 		 * If we couldn't reserve metadata bytes try and use some from
 		 * the global reserve.
@@ -5749,7 +5762,7 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 		return block_rsv;
 	if (ret) {
 		WARN_ON(1);
-		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0, 0);
+		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
 		if (!ret) {
 			return block_rsv;
 		} else if (ret && block_rsv != global_rsv) {
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 5d40c1ed8225..2fecfc3183ee 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -203,7 +203,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 
 	rsv = trans->block_rsv;
 	trans->block_rsv = root->orphan_block_rsv;
-	ret = btrfs_block_rsv_check(root, root->orphan_block_rsv, 0, 5, 0);
+	ret = btrfs_block_rsv_check(root, root->orphan_block_rsv, 5);
 	if (ret)
 		return ret;
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index b6b70bdd0992..f12747c9447b 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -3604,7 +3604,7 @@ void btrfs_evict_inode(struct inode *inode)
 	 * doing the truncate.
 	 */
 	while (1) {
-		ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1);
+		ret = btrfs_block_rsv_refill(root, rsv, min_size);
 
 		/*
 		 * Try and steal from the global reserve since we will
@@ -6613,7 +6613,7 @@ static int btrfs_truncate(struct inode *inode)
 		btrfs_add_ordered_operation(trans, root, inode);
 
 	while (1) {
-		ret = btrfs_block_rsv_check(root, rsv, min_size, 0, 1);
+		ret = btrfs_block_rsv_refill(root, rsv, min_size);
 		if (ret) {
 			/*
 			 * This can only happen with the original transaction we
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 7fa090fa0d39..10af6a0e0865 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -2041,8 +2041,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 		BUG_ON(IS_ERR(trans));
 		trans->block_rsv = rc->block_rsv;
 
-		ret = btrfs_block_rsv_check(root, rc->block_rsv, min_reserved,
-					    0, 0);
+		ret = btrfs_block_rsv_refill(root, rc->block_rsv, min_reserved);
 		if (ret) {
 			BUG_ON(ret != -EAGAIN);
 			ret = btrfs_commit_transaction(trans, root);
@@ -3774,8 +3773,7 @@ restart:
 			}
 		}
 
-		ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 0,
-					    5, 0);
+		ret = btrfs_block_rsv_check(rc->extent_root, rc->block_rsv, 5);
 		if (ret < 0) {
 			if (ret != -EAGAIN) {
 				err = ret;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index d064fa0a4a07..29bef63e23ba 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -418,8 +418,8 @@ static int should_end_transaction(struct btrfs_trans_handle *trans,
 				  struct btrfs_root *root)
 {
 	int ret;
-	ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 0,
-				    5, 0);
+
+	ret = btrfs_block_rsv_check(root, &root->fs_info->global_block_rsv, 5);
 	return ret ? 1 : 0;
 }
 
-- 
cgit v1.2.3


From 7e355b83efa80e5f5821591c13c17649594d82ac Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 18 Oct 2011 13:07:31 -0400
Subject: Btrfs: if we have a lot of pinned space, commit the transaction

Mitch kept hitting a panic because he was getting ENOSPC.  One of my previous
patches makes it so we are much better at not allocating new metadata chunks.
Unfortunately coupled with the overcommit patch this works us into a bit of a
problem if we are removing a bunch of space and end up chewing up all of our
space with pinned extents.  We can allocate chunks fine and overflow is ok, but
the only way to reclaim this space is to commit the transaction.  So if we go to
overcommit, first check and see how much pinned space we have.  If we have more
than 80% of the free space chewed up with pinned extents, just commit the
transaction, this will free up enough space for our reservation and we won't
have this problem anymore.  With this patch Mitch's test doesn't blow up
anymore.  Thanks,

Reported-and-tested-by: Mitch Harder <mitch.harder@sabayonlinux.org>
Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/extent-tree.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index a5f1421eeee9..4eb7d2ba38f8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3510,6 +3510,20 @@ again:
 		u64 profile = btrfs_get_alloc_profile(root, 0);
 		u64 avail;
 
+		/*
+		 * If we have a lot of space that's pinned, don't bother doing
+		 * the overcommit dance yet and just commit the transaction.
+		 */
+		avail = (space_info->total_bytes - space_info->bytes_used) * 8;
+		do_div(avail, 10);
+		if (space_info->bytes_pinned >= avail && flush && !trans &&
+		    !committed) {
+			space_info->flush = 1;
+			flushing = true;
+			spin_unlock(&space_info->lock);
+			goto commit;
+		}
+
 		spin_lock(&root->fs_info->free_chunk_lock);
 		avail = root->fs_info->free_chunk_space;
 
@@ -3581,6 +3595,7 @@ again:
 	if (trans)
 		goto out;
 
+commit:
 	ret = -ENOSPC;
 	if (committed)
 		goto out;
-- 
cgit v1.2.3


From 016fc6a63e465d5b94e4028f6d05d9703e195428 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 19 Oct 2011 10:31:37 -0400
Subject: Btrfs: don't flush the cache inode before writing it

I noticed we had a little bit of latency when writing out the space cache
inodes.  It's because we flush it before we write anything in case we have dirty
pages already there.  This doesn't matter though since we're just going to
overwrite the space, and there really shouldn't be any dirty pages anyway.  This
makes some of my tests run a little bit faster.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
---
 fs/btrfs/free-space-cache.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 2fecfc3183ee..de205d59b74b 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -829,10 +829,6 @@ int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	if (!i_size_read(inode))
 		return -1;
 
-	filemap_write_and_wait(inode->i_mapping);
-	btrfs_wait_ordered_range(inode, inode->i_size &
-				 ~(root->sectorsize - 1), (u64)-1);
-
 	io_ctl_init(&io_ctl, inode, root);
 
 	/* Get the cluster for this block_group if it exists */
-- 
cgit v1.2.3


From 84850e8d8a5ec7b9d3c47d224e9a10c9da52ff1b Mon Sep 17 00:00:00 2001
From: "Yan, Zheng" <zheng.z.yan@intel.com>
Date: Mon, 29 Aug 2011 09:25:53 +0800
Subject: btrfs: check file extent backref offset underflow

Offset field in data extent backref can underflow if clone range ioctl
is used. We can reliably detect the underflow because max file size is
limited to 2^63 and max data extent size is limited by block group size.

Signed-off-by: Zheng Yan  <zheng.z.yan@intel.com>
---
 fs/btrfs/relocation.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 10af6a0e0865..24d654ce7a06 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -3322,8 +3322,11 @@ static int find_data_references(struct reloc_control *rc,
 	}
 
 	key.objectid = ref_objectid;
-	key.offset = ref_offset;
 	key.type = BTRFS_EXTENT_DATA_KEY;
+	if (ref_offset > ((u64)-1 << 32))
+		key.offset = 0;
+	else
+		key.offset = ref_offset;
 
 	path->search_commit_root = 1;
 	path->skip_locking = 1;
-- 
cgit v1.2.3


From 60ccf82f5b6e26e10d41783464ca469c070c7d49 Mon Sep 17 00:00:00 2001
From: Diego Calleja <diegocg@gmail.com>
Date: Thu, 1 Sep 2011 16:33:57 +0200
Subject: btrfs: fix memory leak in btrfs_defrag_file

kmemleak found this:
unreferenced object 0xffff8801b64af968 (size 512):
  comm "btrfs-cleaner", pid 3317, jiffies 4306810886 (age 903.272s)
  hex dump (first 32 bytes):
    00 82 01 07 00 ea ff ff c0 83 01 07 00 ea ff ff  ................
    80 82 01 07 00 ea ff ff c0 87 01 07 00 ea ff ff  ................
  backtrace:
    [<ffffffff816875cc>] kmemleak_alloc+0x5c/0xc0
    [<ffffffff8114aec3>] kmem_cache_alloc_trace+0x163/0x240
    [<ffffffff8127a290>] btrfs_defrag_file+0xf0/0xb20
    [<ffffffff8125d9a5>] btrfs_run_defrag_inodes+0x165/0x210
    [<ffffffff812479d7>] cleaner_kthread+0x177/0x190
    [<ffffffff81075c7d>] kthread+0x8d/0xa0
    [<ffffffff816af5f4>] kernel_thread_helper+0x4/0x10
    [<ffffffffffffffff>] 0xffffffffffffffff

"pages" is not always freed. Fix it removing the unnecesary additional return.

Signed-off-by: Diego Calleja <diegocg@gmail.com>
---
 fs/btrfs/ioctl.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d2b53eb8a8c2..8ccc106f4e18 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1140,9 +1140,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		btrfs_set_super_incompat_flags(disk_super, features);
 	}
 
-	if (!file)
-		kfree(ra);
-	return defrag_count;
+	ret = defrag_count;
 
 out_ra:
 	if (!file)
-- 
cgit v1.2.3


From cbcc83265d929ac71553c1b5dafdb830171af947 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 2 Sep 2011 15:56:25 +0800
Subject: Btrfs: fix defragmentation regression

There's an off-by-one bug:

  # create a file with lots of 4K file extents
  # btrfs fi defrag /mnt/file
  # sync
  # filefrag -v /mnt/file
  Filesystem type is: 9123683e
  File size of /mnt/file is 1228800 (300 blocks, blocksize 4096)
   ext logical physical expected length flags
     0       0     3372              64
     1      64     3136     3435      1
     2      65     3436     3136     64
     3     129     3201     3499      1
     4     130     3500     3201     64
     5     194     3266     3563      1
     6     195     3564     3266     64
     7     259     3331     3627      1
     8     260     3628     3331     40 eof

After this patch:

  ...
  # filefrag -v /mnt/file
  Filesystem type is: 9123683e
  File size of /mnt/file is 1228800 (300 blocks, blocksize 4096)
   ext logical physical expected length flags
     0       0     3372             300 eof
  /mnt/file: 1 extent found

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 8ccc106f4e18..b39f7bf92704 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1087,7 +1087,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 
 		defrag_count += ret;
 		balance_dirty_pages_ratelimited_nr(inode->i_mapping, ret);
-		i += ret;
 
 		if (newer_than) {
 			if (newer_off == (u64)-1)
@@ -1107,7 +1106,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 				break;
 			}
 		} else {
-			i++;
+			if (ret > 0)
+				i += ret;
+			else
+				i++;
 		}
 	}
 
-- 
cgit v1.2.3


From 151a31b25e5c941bdd9fdefed650effca223c716 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 2 Sep 2011 15:56:39 +0800
Subject: Btrfs: use i_size_read() in btrfs_defrag_file()

Don't use inode->i_size directly, since we're not holding i_mutex.

This also fixes another bug, that i_size can change after it's checked
against 0 and then (i_size - 1) can be negative.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b39f7bf92704..323d77f09258 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -978,6 +978,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 	struct btrfs_super_block *disk_super;
 	struct file_ra_state *ra = NULL;
 	unsigned long last_index;
+	u64 isize = i_size_read(inode);
 	u64 features;
 	u64 last_len = 0;
 	u64 skip = 0;
@@ -1003,7 +1004,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			compress_type = range->compress_type;
 	}
 
-	if (inode->i_size == 0)
+	if (isize == 0)
 		return 0;
 
 	/*
@@ -1028,10 +1029,10 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 
 	/* find the last page to defrag */
 	if (range->start + range->len > range->start) {
-		last_index = min_t(u64, inode->i_size - 1,
+		last_index = min_t(u64, isize - 1,
 			 range->start + range->len - 1) >> PAGE_CACHE_SHIFT;
 	} else {
-		last_index = (inode->i_size - 1) >> PAGE_CACHE_SHIFT;
+		last_index = (isize - 1) >> PAGE_CACHE_SHIFT;
 	}
 
 	if (newer_than) {
-- 
cgit v1.2.3


From 5ca496604b5975d371bb669ee6c2394bcbea818f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 2 Sep 2011 15:56:55 +0800
Subject: Btrfs: fix wrong max_to_defrag in btrfs_defrag_file()

It's off-by-one, and thus we may skip the last page while defragmenting.

An example case:

  # create /mnt/file with 2 4K file extents
  # btrfs fi defrag /mnt/file
  # sync
  # filefrag /mnt/file
  /mnt/file: 2 extents found

So it's not defragmented.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 323d77f09258..f9026413bcf1 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1052,7 +1052,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		i = range->start >> PAGE_CACHE_SHIFT;
 	}
 	if (!max_to_defrag)
-		max_to_defrag = last_index - 1;
+		max_to_defrag = last_index;
 
 	while (i <= last_index && defrag_count < max_to_defrag) {
 		/*
-- 
cgit v1.2.3


From 83c8c9bde0add721f7509aa446455183b040b931 Mon Sep 17 00:00:00 2001
From: Jeff Liu <jeff.liu@oracle.com>
Date: Wed, 14 Sep 2011 14:11:21 +0800
Subject: btrfs: trivial fix, a potential memory leak in
 btrfs_parse_early_options()

Signed-off-by: Jie Liu <jeff.liu@oracle.com>
---
 fs/btrfs/super.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 266d1f35465d..09ce951666ea 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -419,7 +419,7 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 		u64 *subvol_rootid, struct btrfs_fs_devices **fs_devices)
 {
 	substring_t args[MAX_OPT_ARGS];
-	char *opts, *orig, *p;
+	char *device_name, *opts, *orig, *p;
 	int error = 0;
 	int intarg;
 
@@ -470,8 +470,14 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 			}
 			break;
 		case Opt_device:
-			error = btrfs_scan_one_device(match_strdup(&args[0]),
+			device_name = match_strdup(&args[0]);
+			if (!device_name) {
+				error = -ENOMEM;
+				goto out;
+			}
+			error = btrfs_scan_one_device(device_name,
 					flags, holder, fs_devices);
+			kfree(device_name);
 			if (error)
 				goto out;
 			break;
-- 
cgit v1.2.3


From 008873eafbc77deb1702aedece33756c58486c6a Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 2 Sep 2011 15:57:07 +0800
Subject: Btrfs: honor extent thresh during defragmentation

We won't defrag an extent, if it's bigger than the threshold we
specified and there's no small extent before it, but actually
the code doesn't work this way.

There are three bugs:

- When should_defrag_range() decides we should keep on defragmenting
  an extent, last_len is not incremented. (old bug)

- The length that passes to should_defrag_range() is not the length
  we're going to defrag. (new bug)

- We always defrag 256K bytes data, and a big extent can be part of
  this range. (new bug)

For a file with 4 extents:

        | 4K | 4K | 256K | 256K |

The result of defrag with (the default) 256K extent thresh should be:

        | 264K | 256K |

but with those bugs, we'll get:

        | 520K |

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/ioctl.c | 37 ++++++++++++++++++++++++++-----------
 1 file changed, 26 insertions(+), 11 deletions(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f9026413bcf1..d524b6697ad9 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -765,7 +765,7 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
 	int ret = 1;
 
 	/*
-	 * make sure that once we start defragging and extent, we keep on
+	 * make sure that once we start defragging an extent, we keep on
 	 * defragging it
 	 */
 	if (start < *defrag_end)
@@ -810,7 +810,6 @@ static int should_defrag_range(struct inode *inode, u64 start, u64 len,
 	 * extent will force at least part of that big extent to be defragged.
 	 */
 	if (ret) {
-		*last_len += len;
 		*defrag_end = extent_map_end(em);
 	} else {
 		*last_len = 0;
@@ -984,13 +983,14 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 	u64 skip = 0;
 	u64 defrag_end = 0;
 	u64 newer_off = range->start;
-	int newer_left = 0;
 	unsigned long i;
+	unsigned long ra_index = 0;
 	int ret;
 	int defrag_count = 0;
 	int compress_type = BTRFS_COMPRESS_ZLIB;
 	int extent_thresh = range->extent_thresh;
-	int newer_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+	int max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+	int cluster = max_cluster;
 	u64 new_align = ~((u64)128 * 1024 - 1);
 	struct page **pages = NULL;
 
@@ -1020,7 +1020,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		ra = &file->f_ra;
 	}
 
-	pages = kmalloc(sizeof(struct page *) * newer_cluster,
+	pages = kmalloc(sizeof(struct page *) * max_cluster,
 			GFP_NOFS);
 	if (!pages) {
 		ret = -ENOMEM;
@@ -1045,7 +1045,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			 * the extents in the file evenly spaced
 			 */
 			i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
-			newer_left = newer_cluster;
 		} else
 			goto out_ra;
 	} else {
@@ -1077,12 +1076,26 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			i = max(i + 1, next);
 			continue;
 		}
+
+		if (!newer_than) {
+			cluster = (PAGE_CACHE_ALIGN(defrag_end) >>
+				   PAGE_CACHE_SHIFT) - i;
+			cluster = min(cluster, max_cluster);
+		} else {
+			cluster = max_cluster;
+		}
+
 		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
 			BTRFS_I(inode)->force_compress = compress_type;
 
-		btrfs_force_ra(inode->i_mapping, ra, file, i, newer_cluster);
+		if (i + cluster > ra_index) {
+			ra_index = max(i, ra_index);
+			btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
+				       cluster);
+			ra_index += max_cluster;
+		}
 
-		ret = cluster_pages_for_defrag(inode, pages, i, newer_cluster);
+		ret = cluster_pages_for_defrag(inode, pages, i, cluster);
 		if (ret < 0)
 			goto out_ra;
 
@@ -1102,15 +1115,17 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			if (!ret) {
 				range->start = newer_off;
 				i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
-				newer_left = newer_cluster;
 			} else {
 				break;
 			}
 		} else {
-			if (ret > 0)
+			if (ret > 0) {
 				i += ret;
-			else
+				last_len += ret << PAGE_CACHE_SHIFT;
+			} else {
 				i++;
+				last_len = 0;
+			}
 		}
 	}
 
-- 
cgit v1.2.3


From f4c697e6406da5dd445eda8d923c53e1138793dd Mon Sep 17 00:00:00 2001
From: Lukas Czerner <lczerner@redhat.com>
Date: Mon, 5 Sep 2011 16:34:54 +0200
Subject: btrfs: return EINVAL if start > total_bytes in fitrim ioctl

We should retirn EINVAL if the start is beyond the end of the file
system in the btrfs_ioctl_fitrim(). Fix that by adding the appropriate
check for it.

Also in the btrfs_trim_fs() it is possible that len+start might overflow
if big values are passed. Fix it by decrementing the len so that start+len
is equal to the file system size in the worst case.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/btrfs/ioctl.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index d524b6697ad9..136a2f980e21 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -282,6 +282,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 	struct fstrim_range range;
 	u64 minlen = ULLONG_MAX;
 	u64 num_devices = 0;
+	u64 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -300,12 +301,15 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 		}
 	}
 	rcu_read_unlock();
+
 	if (!num_devices)
 		return -EOPNOTSUPP;
-
 	if (copy_from_user(&range, arg, sizeof(range)))
 		return -EFAULT;
+	if (range.start > total_bytes)
+		return -EINVAL;
 
+	range.len = min(range.len, total_bytes - range.start);
 	range.minlen = max(range.minlen, minlen);
 	ret = btrfs_trim_fs(root, &range);
 	if (ret < 0)
-- 
cgit v1.2.3


From a05a9bb18ae0abec0b513b5fde876c47905fa13e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 6 Sep 2011 16:55:34 +0800
Subject: Btrfs: fix array bound checking

Otherwise we can execced the array bound of path->slots[].

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/ctree.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 011cab3aca8d..0fe615e4ea38 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -902,9 +902,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
 
 	orig_ptr = btrfs_node_blockptr(mid, orig_slot);
 
-	if (level < BTRFS_MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1) {
 		parent = path->nodes[level + 1];
-	pslot = path->slots[level + 1];
+		pslot = path->slots[level + 1];
+	}
 
 	/*
 	 * deal with the case where there is only one pointer in the root
@@ -1107,9 +1108,10 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
 	mid = path->nodes[level];
 	WARN_ON(btrfs_header_generation(mid) != trans->transid);
 
-	if (level < BTRFS_MAX_LEVEL - 1)
+	if (level < BTRFS_MAX_LEVEL - 1) {
 		parent = path->nodes[level + 1];
-	pslot = path->slots[level + 1];
+		pslot = path->slots[level + 1];
+	}
 
 	if (!parent)
 		return 1;
-- 
cgit v1.2.3


From 560f7d75457f86a43970aa413e334e394082dce4 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 8 Sep 2011 10:22:01 +0800
Subject: Btrfs: remove BUG_ON() in compress_file_range()

It's not a big deal if we fail to allocate the array, and instead of
panic we can just give up compressing.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/inode.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index f12747c9447b..81d4f68f35c9 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -393,7 +393,10 @@ again:
 	     (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) {
 		WARN_ON(pages);
 		pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS);
-		BUG_ON(!pages);
+		if (!pages) {
+			/* just bail out to the uncompressed code */
+			goto cont;
+		}
 
 		if (BTRFS_I(inode)->force_compress)
 			compress_type = BTRFS_I(inode)->force_compress;
@@ -424,6 +427,7 @@ again:
 			will_compress = 1;
 		}
 	}
+cont:
 	if (start == 0) {
 		trans = btrfs_join_transaction(root);
 		BUG_ON(IS_ERR(trans));
-- 
cgit v1.2.3


From f0dd9592a1aa014b3a01aa2be7e795aae040d65b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 8 Sep 2011 10:26:51 +0800
Subject: Btrfs: fix direct-io vs nodatacow

To reproduce the bug:

  # mount -o nodatacow /dev/sda7 /mnt/
  # dd if=/dev/zero of=/mnt/tmp bs=4K count=1
  1+0 records in
  1+0 records out
  4096 bytes (4.1 kB) copied, 0.000136115 s, 30.1 MB/s
  # dd if=/dev/zero of=/mnt/tmp bs=4K count=1 conv=notrunc oflag=direct
  dd: writing `/mnt/tmp': Input/output error
  1+0 records in
  0+0 records out

btrfs_ordered_update_i_size() may return 1, but btrfs_endio_direct_write()
mistakenly takes it as an error.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
---
 fs/btrfs/inode.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 81d4f68f35c9..65474d95f26f 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5777,8 +5777,7 @@ again:
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) {
 		ret = btrfs_ordered_update_i_size(inode, 0, ordered);
 		if (!ret)
-			ret = btrfs_update_inode(trans, root, inode);
-		err = ret;
+			err = btrfs_update_inode(trans, root, inode);
 		goto out;
 	}
 
-- 
cgit v1.2.3


From fee187d9d9ddc382c81370a9a280391132dea2e1 Mon Sep 17 00:00:00 2001
From: Liu Bo <liubo2009@cn.fujitsu.com>
Date: Thu, 29 Sep 2011 15:55:28 +0800
Subject: Btrfs: do not set EXTENT_DIRTY along with EXTENT_DELALLOC

Signed-off-by: Liu Bo <liubo2009@cn.fujitsu.com>
---
 fs/btrfs/extent_io.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 0ada0b700b44..f284d4e5f447 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1107,7 +1107,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 			struct extent_state **cached_state, gfp_t mask)
 {
 	return set_extent_bit(tree, start, end,
-			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
+			      EXTENT_DELALLOC | EXTENT_UPTODATE,
 			      0, NULL, cached_state, mask);
 }
 
-- 
cgit v1.2.3


From 10b2f34d6e7fbe07f498cb2006272e9a561f5e60 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Sun, 2 Oct 2011 13:56:53 +0300
Subject: Btrfs: pass the correct root to lookup_free_space_inode()

Free space items are located in tree of tree roots, not in the extent
tree.  It didn't pop up because lookup_free_space_inode() grabs the
inode all the time instead of actually searching the tree.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/extent-tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 4eb7d2ba38f8..6cfcc9060c83 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -7312,7 +7312,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
-	inode = lookup_free_space_inode(root, block_group, path);
+	inode = lookup_free_space_inode(tree_root, block_group, path);
 	if (!IS_ERR(inode)) {
 		ret = btrfs_orphan_add(trans, inode);
 		BUG_ON(ret);
-- 
cgit v1.2.3


From cfbffc39ac89dbd5197cbeec2599a1128eb928f8 Mon Sep 17 00:00:00 2001
From: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
Date: Thu, 6 Oct 2011 13:37:08 +0900
Subject: Btrfs: fix return value of btrfs_get_acl()

In btrfs_get_acl(), when the second __btrfs_getxattr() call fails,
acl is not correctly set.
Therefore, a wrong value might return to the caller.

Signed-off-by: Tsutomu Itoh <t-itoh@jp.fujitsu.com>
---
 fs/btrfs/acl.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index eb159aaa5a11..89b156d85d63 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -59,22 +59,19 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 		if (!value)
 			return ERR_PTR(-ENOMEM);
 		size = __btrfs_getxattr(inode, name, value, size);
-		if (size > 0) {
-			acl = posix_acl_from_xattr(value, size);
-			if (IS_ERR(acl)) {
-				kfree(value);
-				return acl;
-			}
-			set_cached_acl(inode, type, acl);
-		}
-		kfree(value);
+	}
+	if (size > 0) {
+		acl = posix_acl_from_xattr(value, size);
 	} else if (size == -ENOENT || size == -ENODATA || size == 0) {
 		/* FIXME, who returns -ENOENT?  I think nobody */
 		acl = NULL;
-		set_cached_acl(inode, type, acl);
 	} else {
 		acl = ERR_PTR(-EIO);
 	}
+	kfree(value);
+
+	if (!IS_ERR(acl))
+		set_cached_acl(inode, type, acl);
 
 	return acl;
 }
-- 
cgit v1.2.3


From 60d2adbb1e7fee1cb4bc67f70bd0bd8ace7b6c3c Mon Sep 17 00:00:00 2001
From: Miao Xie <miaox@cn.fujitsu.com>
Date: Fri, 9 Sep 2011 17:34:35 +0800
Subject: Btrfs: fix race between multi-task space allocation and caching space

The task may fail to get free space though it is enough when multi-task
space allocation and caching space happen at the same time.

	Task1			Caching Thread		Task2
	------------------------------------------------------------------------
	find_free_extent
	  The space has not
	  be cached, and start
	  caching thread. And
	  wait for it.
				cache space, if
				the space is > 2MB
				wake up Task1
							find_free_extent
							  get all the space that
							  is cached.
	  try to allocate space,
	  but there is no space
	  now.
	trigger BUG_ON()

The message is following:
btrfs allocation failed flags 1, wanted 4096
space_info has 1040187392 free, is not full
space_info total=1082130432, used=4096, pinned=41938944, reserved=0, may_use=40828928, readonly=0
block group 12582912 has 8388608 bytes, 0 used 8388608 pinned 0 reserved
block group has cluster?: no
0 blocks of free space at or bigger than bytes is
block group 1103101952 has 1073741824 bytes, 4096 used 33550336 pinned 0 reserved
block group has cluster?: no
0 blocks of free space at or bigger than bytes is
------------[ cut here ]------------
kernel BUG at fs/btrfs/inode.c:835!
 [<ffffffffa031261b>] __extent_writepage+0x1bf/0x5ce [btrfs]
 [<ffffffff810cbcb8>] ? __set_page_dirty_nobuffers+0xfe/0x108
 [<ffffffffa02f8ada>] ? wait_current_trans+0x23/0xec [btrfs]
 [<ffffffff810c3fbf>] ? find_get_pages_tag+0x73/0xe2
 [<ffffffffa0312d12>] extent_write_cache_pages.clone.0+0x176/0x29a [btrfs]
 [<ffffffffa0312e74>] extent_writepages+0x3e/0x53 [btrfs]
 [<ffffffff8110ad2c>] ? do_sync_write+0xc6/0x103
 [<ffffffffa0302d6e>] ? btrfs_submit_direct+0x414/0x414 [btrfs]
 [<ffffffff811380fa>] ? fsnotify+0x236/0x266
 [<ffffffffa02fc930>] btrfs_writepages+0x22/0x24 [btrfs]
 [<ffffffff810cc215>] do_writepages+0x1c/0x25
 [<ffffffff810c4958>] __filemap_fdatawrite_range+0x4e/0x50
 [<ffffffff810c4982>] filemap_write_and_wait_range+0x28/0x51
 [<ffffffffa0306b2e>] btrfs_sync_file+0x7d/0x198 [btrfs]
 [<ffffffff8110aa26>] ? fsnotify_modify+0x5d/0x65
 [<ffffffff8112d150>] vfs_fsync_range+0x18/0x21
 [<ffffffff8112d170>] vfs_fsync+0x17/0x19
 [<ffffffff8112d316>] do_fsync+0x29/0x3e
 [<ffffffff8112d348>] sys_fsync+0xb/0xf
 [<ffffffff81468352>] system_call_fastpath+0x16/0x1b
[SNIP]
RIP  [<ffffffffa02fe08c>] cow_file_range+0x1c4/0x32b [btrfs]

We fix this bug by trying to allocate the space again if there are block groups
in caching.

Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
---
 fs/btrfs/extent-tree.c | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 6cfcc9060c83..cef355f1328a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4954,6 +4954,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
 	bool failed_cluster_refill = false;
 	bool failed_alloc = false;
 	bool use_cluster = true;
+	bool have_caching_bg = false;
 	u64 ideal_cache_percent = 0;
 	u64 ideal_cache_offset = 0;
 
@@ -5036,6 +5037,7 @@ ideal_cache:
 		}
 	}
 search:
+	have_caching_bg = false;
 	down_read(&space_info->groups_sem);
 	list_for_each_entry(block_group, &space_info->block_groups[index],
 			    list) {
@@ -5244,6 +5246,8 @@ refill_cluster:
 			failed_alloc = true;
 			goto have_block_group;
 		} else if (!offset) {
+			if (!cached)
+				have_caching_bg = true;
 			goto loop;
 		}
 checks:
@@ -5294,6 +5298,9 @@ loop:
 	}
 	up_read(&space_info->groups_sem);
 
+	if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
+		goto search;
+
 	if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
 		goto search;
 
-- 
cgit v1.2.3


From 068132bad1de70f85f5f6d12c36d64f8f7848d92 Mon Sep 17 00:00:00 2001
From: Daniel J Blueman <daniel.blueman@gmail.com>
Date: Thu, 23 Jun 2011 23:01:01 +0800
Subject: btrfs: fix oops on failure path

If lookup_extent_backref fails, path->nodes[0] reasonably could be
null along with other callers of btrfs_print_leaf, so ensure we have a
valid extent buffer before dereferencing.

Signed-off-by: Daniel J Blueman <daniel.blueman@gmail.com>
---
 fs/btrfs/print-tree.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index fb2605d998e9..f38e452486b8 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -158,8 +158,7 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
 void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 {
 	int i;
-	u32 type;
-	u32 nr = btrfs_header_nritems(l);
+	u32 type, nr;
 	struct btrfs_item *item;
 	struct btrfs_root_item *ri;
 	struct btrfs_dir_item *di;
@@ -172,6 +171,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 
+	if (!l)
+		return;
+
+	nr = btrfs_header_nritems(l);
+
 	printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n",
 		(unsigned long long)btrfs_header_bytenr(l), nr,
 		btrfs_leaf_free_space(root, l));
-- 
cgit v1.2.3


From 5f524444c351e145a5f7e28253594688a421bfe8 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 13 Oct 2011 00:20:43 +0300
Subject: Btrfs: fix a bug when opening seed devices

Initialize fs_info->bdev_holder a bit earlier to be able to pass a
correct holder id to blkdev_get() when opening seed devices with O_EXCL.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/super.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 09ce951666ea..29eecbb6ec3a 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -939,6 +939,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 
 		s->s_flags = flags | MS_NOSEC;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+		btrfs_sb(s)->fs_info->bdev_holder = fs_type;
 		error = btrfs_fill_super(s, fs_devices, data,
 					 flags & MS_SILENT ? 1 : 0);
 		if (error) {
@@ -946,7 +947,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 			return ERR_PTR(error);
 		}
 
-		btrfs_sb(s)->fs_info->bdev_holder = fs_type;
 		s->s_flags |= MS_ACTIVE;
 	}
 
-- 
cgit v1.2.3


From 20bcd64934e4eb8f3f90a0dca54fb0ac2edd7795 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Thu, 20 Oct 2011 00:06:20 +0300
Subject: Btrfs: close all bdevs on mount failure

Fix a bug introduced by 20b45077.  We have to return EINVAL on mount
failure, but doing that too early in the sequence leaves all of the
devices opened exclusively.  This also fixes an issue where under some
scenarios only a second mount -o degraded <devices> command would
succeed.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
---
 fs/btrfs/volumes.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e138af710de2..c6938b45e0fd 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -597,10 +597,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
 		set_blocksize(bdev, 4096);
 
 		bh = btrfs_read_dev_super(bdev);
-		if (!bh) {
-			ret = -EINVAL;
+		if (!bh)
 			goto error_close;
-		}
 
 		disk_super = (struct btrfs_super_block *)bh->b_data;
 		devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -655,7 +653,7 @@ error:
 		continue;
 	}
 	if (fs_devices->open_devices == 0) {
-		ret = -EIO;
+		ret = -EINVAL;
 		goto out;
 	}
 	fs_devices->seeding = seeding;
-- 
cgit v1.2.3


From f9d9ef62cd3ecbd6cbb7957a253c1e81f69d5586 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Thu, 29 Sep 2011 13:11:33 +0200
Subject: btrfs: do not allow mounting non-subvolumes via subvol option

There's a missing test whether the path passed to subvol=path option
during mount is a real subvolume, allowing any directory located in
default subovlume to be passed and accepted for mount.

(current btrfs progs prevent this early)
$ btrfs subvol snapshot . p1-snap
ERROR: '.' is not a subvolume

(with "is subvolume?" test bypassed)
$ btrfs subvol snapshot . p1-snap
Create a snapshot of '.' in './p1-snap'

$ btrfs subvol list -p .
ID 258 parent 5 top level 5 path subvol
ID 259 parent 5 top level 5 path subvol1
ID 260 parent 5 top level 5 path default-subvol1
ID 262 parent 5 top level 5 path p1/p1-snapshot
ID 263 parent 259 top level 5 path subvol1/subvol1-snap

The problem I see is that this makes a false impression of snapshotting the
given subvolume but in fact snapshots the default one: a user expects outcome
like ID 263 but in fact gets ID 262 .

This patch makes mount fail with EINVAL with a message in syslog.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/super.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 29eecbb6ec3a..5429b1fa0bfc 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -740,6 +740,16 @@ static int btrfs_set_super(struct super_block *s, void *data)
 	return set_anon_super(s, data);
 }
 
+/*
+ * subvolumes are identified by ino 256
+ */
+static inline int is_subvolume_inode(struct inode *inode)
+{
+	if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
+		return 1;
+	return 0;
+}
+
 /*
  * This will strip out the subvol=%s argument for an argument string and add
  * subvolid=0 to make sure we get the actual tree root for path walking to the
@@ -843,6 +853,15 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags,
 	if (error)
 		return ERR_PTR(error);
 
+	if (!is_subvolume_inode(path.dentry->d_inode)) {
+		path_put(&path);
+		mntput(mnt);
+		error = -EINVAL;
+		printk(KERN_ERR "btrfs: '%s' is not a valid subvolume\n",
+				subvol_name);
+		return ERR_PTR(-EINVAL);
+	}
+
 	/* Get a ref to the sb and the dentry we found and return it */
 	s = path.mnt->mnt_sb;
 	atomic_inc(&s->s_active);
-- 
cgit v1.2.3


From dff51cd1c60856c28f5d22a571294c2b70b6b322 Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Tue, 14 Jun 2011 12:52:17 +0200
Subject: btrfs: ratelimit WARN_ON in use_block_rsv

The WARN_ON under some circumstances heavily polute log and slow down
the machine. This is just a safety, as the warning should be fixed by
another patch, nevertheless, it still pops up during testing.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/extent-tree.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cef355f1328a..28c4809851a5 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -23,6 +23,7 @@
 #include <linux/rcupdate.h>
 #include <linux/kthread.h>
 #include <linux/slab.h>
+#include <linux/ratelimit.h>
 #include "compat.h"
 #include "hash.h"
 #include "ctree.h"
@@ -5783,7 +5784,13 @@ use_block_rsv(struct btrfs_trans_handle *trans,
 	if (!ret)
 		return block_rsv;
 	if (ret) {
-		WARN_ON(1);
+		static DEFINE_RATELIMIT_STATE(_rs,
+				DEFAULT_RATELIMIT_INTERVAL,
+				/*DEFAULT_RATELIMIT_BURST*/ 2);
+		if (__ratelimit(&_rs)) {
+			printk(KERN_DEBUG "btrfs: block rsv returned %d\n", ret);
+			WARN_ON(1);
+		}
 		ret = reserve_metadata_bytes(root, block_rsv, blocksize, 0);
 		if (!ret) {
 			return block_rsv;
-- 
cgit v1.2.3


From cd354ad613a393424f85333ceed6b15e07fb94ae Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 20 Oct 2011 15:45:37 -0400
Subject: Btrfs: don't wait as long for more batches during SSD log commit

When we're doing log commits, we try to wait for more writers to come in
and make the commit bigger.  This helps improve performance on rotating
disks, but on SSDs it adds latencies.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/tree-log.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 786639fca067..310ab22cfe58 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2013,10 +2013,10 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	/* wait for previous tree log sync to complete */
 	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
 		wait_log_commit(trans, root, root->log_transid - 1);
-
 	while (1) {
 		unsigned long batch = root->log_batch;
-		if (root->log_multiple_pids) {
+		/* when we're on an ssd, just kick the log commit out */
+		if (!btrfs_test_opt(root, SSD) && root->log_multiple_pids) {
 			mutex_unlock(&root->log_mutex);
 			schedule_timeout_uninterruptible(1);
 			mutex_lock(&root->log_mutex);
-- 
cgit v1.2.3


From 1eae31e918972bbeefc119d23c1d67674f49a301 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 14 Oct 2011 06:31:20 -0400
Subject: Btrfs: make sure btrfs_remove_free_space doesn't leak EAGAIN

btrfs_remove_free_space needs to make sure to set ret back to a
valid return value after setting it to EAGAIN, otherwise we return
it to the callers.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/free-space-cache.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index de205d59b74b..f49475dfa954 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -1868,6 +1868,7 @@ again:
 			ctl->total_bitmaps--;
 		}
 		kmem_cache_free(btrfs_free_space_cachep, info);
+		ret = 0;
 		goto out_lock;
 	}
 
@@ -1875,7 +1876,8 @@ again:
 		unlink_free_space(ctl, info);
 		info->offset += bytes;
 		info->bytes -= bytes;
-		link_free_space(ctl, info);
+		ret = link_free_space(ctl, info);
+		WARN_ON(ret);
 		goto out_lock;
 	}
 
-- 
cgit v1.2.3


From e688b7252f784c2479d559f9f70ca8354752c5e7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Mon, 31 Oct 2011 20:52:39 -0400
Subject: Btrfs: fix extent pinning bugs in the tree log

The tree log had two important bugs that could cause corruptions after a
crash.  Sometimes we were allowing tree log blocks to be reused after
the tree log was committed but before the transaction commit was done.

This allowed a future metadata write to overwrite the tree log data.  It
is fixed by adding a new variant of freeing reserved extents that always
pins them.  Credit goes to Stefan Behrens and Arne Jansen for many many
hours spent tracking this bug down.

During tree log replay, we do a pass through the tree log and pin all
the extents we find.  This makes sure the replay code won't go in and
use any of those blocks for new allocations during replay.  The problem
is the free space cache isn't honoring these pinned extents.  So the
allocator can end up handing them out, leading to all kinds of problems
during replay.

The fix here is to force any free space cache to load while we pin the
extents, and then to make sure we remove the pinned extents from the
free space rbtree.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
Reported-by: Stefan Behrens <sbehrens@giantdisaster.de>
---
 fs/btrfs/ctree.h       |  5 +++++
 fs/btrfs/extent-tree.c | 51 +++++++++++++++++++++++++++++++++++++++++++++++---
 fs/btrfs/tree-log.c    | 11 ++++++-----
 3 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 227620993bce..f63c9b3f6e08 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2156,6 +2156,9 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
 			     u64 num_bytes, u64 *refs, u64 *flags);
 int btrfs_pin_extent(struct btrfs_root *root,
 		     u64 bytenr, u64 num, int reserved);
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    u64 bytenr, u64 num_bytes);
 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root,
 			  u64 objectid, u64 offset, u64 bytenr);
@@ -2206,6 +2209,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
 		      u64 root_objectid, u64 owner, u64 offset);
 
 int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len);
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+				       u64 start, u64 len);
 int btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 28c4809851a5..cb7626646bba 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -4344,6 +4344,34 @@ int btrfs_pin_extent(struct btrfs_root *root,
 	return 0;
 }
 
+/*
+ * this function must be called within transaction
+ */
+int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
+				    struct btrfs_root *root,
+				    u64 bytenr, u64 num_bytes)
+{
+	struct btrfs_block_group_cache *cache;
+
+	cache = btrfs_lookup_block_group(root->fs_info, bytenr);
+	BUG_ON(!cache);
+
+	/*
+	 * pull in the free space cache (if any) so that our pin
+	 * removes the free space from the cache.  We have load_only set
+	 * to one because the slow code to read in the free extents does check
+	 * the pinned extents.
+	 */
+	cache_block_group(cache, trans, root, 1);
+
+	pin_down_extent(root, cache, bytenr, num_bytes, 0);
+
+	/* remove us from the free space cache (if we're there at all) */
+	btrfs_remove_free_space(cache, bytenr, num_bytes);
+	btrfs_put_block_group(cache);
+	return 0;
+}
+
 /**
  * btrfs_update_reserved_bytes - update the block_group and space info counters
  * @cache:	The cache we are manipulating
@@ -5487,7 +5515,8 @@ again:
 	return ret;
 }
 
-int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
+static int __btrfs_free_reserved_extent(struct btrfs_root *root,
+					u64 start, u64 len, int pin)
 {
 	struct btrfs_block_group_cache *cache;
 	int ret = 0;
@@ -5502,8 +5531,12 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 	if (btrfs_test_opt(root, DISCARD))
 		ret = btrfs_discard_extent(root, start, len, NULL);
 
-	btrfs_add_free_space(cache, start, len);
-	btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
+	if (pin)
+		pin_down_extent(root, cache, start, len, 1);
+	else {
+		btrfs_add_free_space(cache, start, len);
+		btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
+	}
 	btrfs_put_block_group(cache);
 
 	trace_btrfs_reserved_extent_free(root, start, len);
@@ -5511,6 +5544,18 @@ int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len)
 	return ret;
 }
 
+int btrfs_free_reserved_extent(struct btrfs_root *root,
+					u64 start, u64 len)
+{
+	return __btrfs_free_reserved_extent(root, start, len, 0);
+}
+
+int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
+				       u64 start, u64 len)
+{
+	return __btrfs_free_reserved_extent(root, start, len, 1);
+}
+
 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 				      struct btrfs_root *root,
 				      u64 parent, u64 root_objectid,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 310ab22cfe58..8ca1b6b83bd1 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -276,8 +276,9 @@ static int process_one_buffer(struct btrfs_root *log,
 			      struct walk_control *wc, u64 gen)
 {
 	if (wc->pin)
-		btrfs_pin_extent(log->fs_info->extent_root,
-				 eb->start, eb->len, 0);
+		btrfs_pin_extent_for_log_replay(wc->trans,
+						log->fs_info->extent_root,
+						eb->start, eb->len);
 
 	if (btrfs_buffer_uptodate(eb, gen)) {
 		if (wc->write)
@@ -1760,7 +1761,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
 
 				WARN_ON(root_owner !=
 					BTRFS_TREE_LOG_OBJECTID);
-				ret = btrfs_free_reserved_extent(root,
+				ret = btrfs_free_and_pin_reserved_extent(root,
 							 bytenr, blocksize);
 				BUG_ON(ret);
 			}
@@ -1828,7 +1829,7 @@ static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
 				btrfs_tree_unlock(next);
 
 				WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
-				ret = btrfs_free_reserved_extent(root,
+				ret = btrfs_free_and_pin_reserved_extent(root,
 						path->nodes[*level]->start,
 						path->nodes[*level]->len);
 				BUG_ON(ret);
@@ -1897,7 +1898,7 @@ static int walk_log_tree(struct btrfs_trans_handle *trans,
 
 			WARN_ON(log->root_key.objectid !=
 				BTRFS_TREE_LOG_OBJECTID);
-			ret = btrfs_free_reserved_extent(log, next->start,
+			ret = btrfs_free_and_pin_reserved_extent(log, next->start,
 							 next->len);
 			BUG_ON(ret);
 		}
-- 
cgit v1.2.3


From 01d658f2ca3c85c1ffb20b306e30d16197000ce7 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Tue, 1 Nov 2011 10:08:06 -0400
Subject: Btrfs: make sure to flush queued bios if write_cache_pages waits

write_cache_pages tries to build up a large bio to stuff down the pipe.
But if it needs to wait for a page lock, it needs to make sure and send
down any pending writes so we don't deadlock with anyone who has the
page lock and is waiting for writeback of things inside the bio.

Dave Sterba triggered this as a deadlock between the autodefrag code and
the extent write_cache_pages

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c   | 13 ++++++++++---
 fs/btrfs/disk-io.h   |  2 --
 fs/btrfs/extent_io.c | 14 ++++++++++----
 fs/btrfs/extent_io.h |  3 ++-
 4 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 51372a521167..6f58911ece0d 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2735,7 +2735,8 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
 	return ret;
 }
 
-int btree_lock_page_hook(struct page *page)
+static int btree_lock_page_hook(struct page *page, void *data,
+				void (*flush_fn)(void *))
 {
 	struct inode *inode = page->mapping->host;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -2752,7 +2753,10 @@ int btree_lock_page_hook(struct page *page)
 	if (!eb)
 		goto out;
 
-	btrfs_tree_lock(eb);
+	if (!btrfs_try_tree_write_lock(eb)) {
+		flush_fn(data);
+		btrfs_tree_lock(eb);
+	}
 	btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
 
 	if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
@@ -2767,7 +2771,10 @@ int btree_lock_page_hook(struct page *page)
 	btrfs_tree_unlock(eb);
 	free_extent_buffer(eb);
 out:
-	lock_page(page);
+	if (!trylock_page(page)) {
+		flush_fn(data);
+		lock_page(page);
+	}
 	return 0;
 }
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index bec3ea4bd67f..e678539c8519 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -83,8 +83,6 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info);
 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root);
-int btree_lock_page_hook(struct page *page);
-
 
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
 void btrfs_init_lockdep(void);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index f284d4e5f447..b40ba75f4483 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2613,10 +2613,16 @@ retry:
 			 * swizzled back from swapper_space to tmpfs file
 			 * mapping
 			 */
-			if (tree->ops && tree->ops->write_cache_pages_lock_hook)
-				tree->ops->write_cache_pages_lock_hook(page);
-			else
-				lock_page(page);
+			if (tree->ops &&
+			    tree->ops->write_cache_pages_lock_hook) {
+				tree->ops->write_cache_pages_lock_hook(page,
+							       data, flush_fn);
+			} else {
+				if (!trylock_page(page)) {
+					flush_fn(data);
+					lock_page(page);
+				}
+			}
 
 			if (unlikely(page->mapping != mapping)) {
 				unlock_page(page);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 325a346369da..cbd4824a7c94 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -86,7 +86,8 @@ struct extent_io_ops {
 				  struct extent_state *other);
 	void (*split_extent_hook)(struct inode *inode,
 				  struct extent_state *orig, u64 split);
-	int (*write_cache_pages_lock_hook)(struct page *page);
+	int (*write_cache_pages_lock_hook)(struct page *page, void *data,
+					   void (*flush_fn)(void *));
 };
 
 struct extent_io_tree {
-- 
cgit v1.2.3


From 5a77d76c243be18d854aa1b14d697525f60e169a Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Tue, 1 Nov 2011 14:32:23 -0400
Subject: Btrfs: release metadata from global reserve if we have to fallback
 for unlink

I fixed a problem where we weren't reserving space for an orphan item when we
had to fallback to using the global reserve for an unlink, but I introduced
another problem.  I was migrating the bytes from the transaction reserve to the
global reserve and then releasing from the global reserve in
btrfs_end_transaction().  The problem with this is that a migrate will jack up
the size for the destination, but leave the size alone for the source, with the
idea that you can do a release normally on the source and it all washes out, and
then you can do a release again on the destination and it works out right.  My
way was skipping the release on the trans_block_rsv which still had the jacked
up size from our original reservation.  So instead release manually from the
global reserve if this transaction was using it, and then set the
trans->block_rsv back to the trans_block_rsv so that btrfs_end_transaction
cleans everything up properly.  With this patch xfstest 83 doesn't emit warnings
about leaking space.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/inode.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 65474d95f26f..a8586e10953c 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -2904,7 +2904,7 @@ out:
 	if (!err)
 		err = btrfs_block_rsv_migrate(trans->block_rsv,
 				&root->fs_info->global_block_rsv,
-				btrfs_calc_trans_metadata_size(root, 1));
+				trans->bytes_reserved);
 
 	if (err) {
 		btrfs_end_transaction(trans, root);
@@ -2920,6 +2920,9 @@ static void __unlink_end_trans(struct btrfs_trans_handle *trans,
 			       struct btrfs_root *root)
 {
 	if (trans->block_rsv == &root->fs_info->global_block_rsv) {
+		btrfs_block_rsv_release(root, trans->block_rsv,
+					trans->bytes_reserved);
+		trans->block_rsv = &root->fs_info->trans_block_rsv;
 		BUG_ON(!root->fs_info->enospc_unlink);
 		root->fs_info->enospc_unlink = 0;
 	}
-- 
cgit v1.2.3


From c8174313a8102e874aaa321e2fc4c7c460a87151 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Wed, 2 Nov 2011 09:29:35 -0400
Subject: Btrfs: use the global reserve when truncating the free space cache
 inode

We no longer use the orphan block rsv for holding the reservation for truncating
the inode, so instead use the global block rsv and check to make sure it has
enough space for us to truncate the space.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/free-space-cache.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index f49475dfa954..7a15fcfb3e1f 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -198,14 +198,24 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct inode *inode)
 {
 	struct btrfs_block_rsv *rsv;
+	u64 needed_bytes;
 	loff_t oldsize;
 	int ret = 0;
 
 	rsv = trans->block_rsv;
-	trans->block_rsv = root->orphan_block_rsv;
-	ret = btrfs_block_rsv_check(root, root->orphan_block_rsv, 5);
-	if (ret)
-		return ret;
+	trans->block_rsv = &root->fs_info->global_block_rsv;
+
+	/* 1 for slack space, 1 for updating the inode */
+	needed_bytes = btrfs_calc_trunc_metadata_size(root, 1) +
+		btrfs_calc_trans_metadata_size(root, 1);
+
+	spin_lock(&trans->block_rsv->lock);
+	if (trans->block_rsv->reserved < needed_bytes) {
+		spin_unlock(&trans->block_rsv->lock);
+		trans->block_rsv = rsv;
+		return -ENOSPC;
+	}
+	spin_unlock(&trans->block_rsv->lock);
 
 	oldsize = i_size_read(inode);
 	btrfs_i_size_write(inode, 0);
@@ -218,13 +228,15 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 	ret = btrfs_truncate_inode_items(trans, root, inode,
 					 0, BTRFS_EXTENT_DATA_KEY);
 
-	trans->block_rsv = rsv;
 	if (ret) {
+		trans->block_rsv = rsv;
 		WARN_ON(1);
 		return ret;
 	}
 
 	ret = btrfs_update_inode(trans, root, inode);
+	trans->block_rsv = rsv;
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From 6c41761fc6efe1503103a1afe03a6635c0b5d4ec Mon Sep 17 00:00:00 2001
From: David Sterba <dsterba@suse.cz>
Date: Wed, 13 Apr 2011 15:41:04 +0200
Subject: btrfs: separate superblock items out of fs_info

fs_info has now ~9kb, more than fits into one page. This will cause
mount failure when memory is too fragmented. Top space consumers are
super block structures super_copy and super_for_commit, ~2.8kb each.
Allocate them dynamically. fs_info will be ~3.5kb. (measured on x86_64)

Add a wrapper for freeing fs_info and all of it's dynamically allocated
members.

Signed-off-by: David Sterba <dsterba@suse.cz>
---
 fs/btrfs/compression.c |  3 ++-
 fs/btrfs/ctree.h       | 16 ++++++++++++++--
 fs/btrfs/disk-io.c     | 33 ++++++++++-----------------------
 fs/btrfs/extent-tree.c | 16 ++++++++--------
 fs/btrfs/file-item.c   | 17 ++++++-----------
 fs/btrfs/inode.c       |  2 +-
 fs/btrfs/ioctl.c       |  8 ++++----
 fs/btrfs/scrub.c       |  2 +-
 fs/btrfs/super.c       | 19 +++++++++++++------
 fs/btrfs/transaction.c | 10 +++++-----
 fs/btrfs/tree-log.c    |  4 ++--
 fs/btrfs/volumes.c     | 24 ++++++++++++------------
 12 files changed, 78 insertions(+), 76 deletions(-)

diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 8ec5d86f1734..14f1c5a0b2d2 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -85,7 +85,8 @@ struct compressed_bio {
 static inline int compressed_bio_size(struct btrfs_root *root,
 				      unsigned long disk_size)
 {
-	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+
 	return sizeof(struct compressed_bio) +
 		((disk_size + root->sectorsize - 1) / root->sectorsize) *
 		csum_size;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f63c9b3f6e08..5181c53c1124 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -936,8 +936,8 @@ struct btrfs_fs_info {
 	wait_queue_head_t transaction_blocked_wait;
 	wait_queue_head_t async_submit_wait;
 
-	struct btrfs_super_block super_copy;
-	struct btrfs_super_block super_for_commit;
+	struct btrfs_super_block *super_copy;
+	struct btrfs_super_block *super_for_commit;
 	struct block_device *__bdev;
 	struct super_block *sb;
 	struct inode *btree_inode;
@@ -2387,6 +2387,18 @@ static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
 	smp_mb();
 	return fs_info->closing;
 }
+static inline void free_fs_info(struct btrfs_fs_info *fs_info)
+{
+	kfree(fs_info->delayed_root);
+	kfree(fs_info->extent_root);
+	kfree(fs_info->tree_root);
+	kfree(fs_info->chunk_root);
+	kfree(fs_info->dev_root);
+	kfree(fs_info->csum_root);
+	kfree(fs_info->super_copy);
+	kfree(fs_info->super_for_commit);
+	kfree(fs_info);
+}
 
 /* root-item.c */
 int btrfs_find_root_ref(struct btrfs_root *tree_root,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6f58911ece0d..761717c98278 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -256,8 +256,7 @@ void btrfs_csum_final(u32 crc, char *result)
 static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			   int verify)
 {
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 	char *result = NULL;
 	unsigned long len;
 	unsigned long cur_len;
@@ -1766,14 +1765,14 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 		goto fail_alloc;
 	}
 
-	memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy));
-	memcpy(&fs_info->super_for_commit, &fs_info->super_copy,
-	       sizeof(fs_info->super_for_commit));
+	memcpy(fs_info->super_copy, bh->b_data, sizeof(*fs_info->super_copy));
+	memcpy(fs_info->super_for_commit, fs_info->super_copy,
+	       sizeof(*fs_info->super_for_commit));
 	brelse(bh);
 
-	memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE);
+	memcpy(fs_info->fsid, fs_info->super_copy->fsid, BTRFS_FSID_SIZE);
 
-	disk_super = &fs_info->super_copy;
+	disk_super = fs_info->super_copy;
 	if (!btrfs_super_root(disk_super))
 		goto fail_alloc;
 
@@ -2152,7 +2151,6 @@ fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->delayed_workers);
 	btrfs_stop_workers(&fs_info->caching_workers);
 fail_alloc:
-	kfree(fs_info->delayed_root);
 fail_iput:
 	invalidate_inode_pages2(fs_info->btree_inode->i_mapping);
 	iput(fs_info->btree_inode);
@@ -2164,12 +2162,7 @@ fail_bdi:
 fail_srcu:
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 fail:
-	kfree(extent_root);
-	kfree(tree_root);
-	kfree(fs_info);
-	kfree(chunk_root);
-	kfree(dev_root);
-	kfree(csum_root);
+	free_fs_info(fs_info);
 	return ERR_PTR(err);
 }
 
@@ -2338,10 +2331,10 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 	int total_errors = 0;
 	u64 flags;
 
-	max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
+	max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
 	do_barriers = !btrfs_test_opt(root, NOBARRIER);
 
-	sb = &root->fs_info->super_for_commit;
+	sb = root->fs_info->super_for_commit;
 	dev_item = &sb->dev_item;
 
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
@@ -2603,7 +2596,6 @@ int close_ctree(struct btrfs_root *root)
 	del_fs_roots(fs_info);
 
 	iput(fs_info->btree_inode);
-	kfree(fs_info->delayed_root);
 
 	btrfs_stop_workers(&fs_info->generic_worker);
 	btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2624,12 +2616,7 @@ int close_ctree(struct btrfs_root *root)
 	bdi_destroy(&fs_info->bdi);
 	cleanup_srcu_struct(&fs_info->subvol_srcu);
 
-	kfree(fs_info->extent_root);
-	kfree(fs_info->tree_root);
-	kfree(fs_info->chunk_root);
-	kfree(fs_info->dev_root);
-	kfree(fs_info->csum_root);
-	kfree(fs_info);
+	free_fs_info(fs_info);
 
 	return 0;
 }
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index cb7626646bba..782eb3ea8edf 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3209,7 +3209,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
 	 * about 1% of the FS size.
 	 */
 	if (force == CHUNK_ALLOC_LIMITED) {
-		thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+		thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
 		thresh = max_t(u64, 64 * 1024 * 1024,
 			       div_factor_fine(thresh, 1));
 
@@ -3231,7 +3231,7 @@ static int should_alloc_chunk(struct btrfs_root *root,
 	if (num_allocated + alloc_bytes < div_factor(num_bytes, 8))
 		return 0;
 
-	thresh = btrfs_super_total_bytes(&root->fs_info->super_copy);
+	thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
 
 	/* 256MB or 5% of the FS */
 	thresh = max_t(u64, 256 * 1024 * 1024, div_factor_fine(thresh, 5));
@@ -3843,7 +3843,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
 	u64 num_bytes;
 	u64 meta_used;
 	u64 data_used;
-	int csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+	int csum_size = btrfs_super_csum_size(fs_info->super_copy);
 
 	sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
 	spin_lock(&sinfo->lock);
@@ -4222,12 +4222,12 @@ static int update_block_group(struct btrfs_trans_handle *trans,
 
 	/* block accounting for super block */
 	spin_lock(&info->delalloc_lock);
-	old_val = btrfs_super_bytes_used(&info->super_copy);
+	old_val = btrfs_super_bytes_used(info->super_copy);
 	if (alloc)
 		old_val += num_bytes;
 	else
 		old_val -= num_bytes;
-	btrfs_set_super_bytes_used(&info->super_copy, old_val);
+	btrfs_set_super_bytes_used(info->super_copy, old_val);
 	spin_unlock(&info->delalloc_lock);
 
 	while (total) {
@@ -7127,9 +7127,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		return -ENOMEM;
 	path->reada = 1;
 
-	cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
+	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
 	if (btrfs_test_opt(root, SPACE_CACHE) &&
-	    btrfs_super_generation(&root->fs_info->super_copy) != cache_gen)
+	    btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
 		need_clear = 1;
 	if (btrfs_test_opt(root, CLEAR_CACHE))
 		need_clear = 1;
@@ -7458,7 +7458,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
 	int mixed = 0;
 	int ret;
 
-	disk_super = &fs_info->super_copy;
+	disk_super = fs_info->super_copy;
 	if (!btrfs_super_root(disk_super))
 		return 1;
 
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a1cb7821becd..c7fb3a4247d3 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -91,8 +91,7 @@ struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 	struct btrfs_csum_item *item;
 	struct extent_buffer *leaf;
 	u64 csum_offset = 0;
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 	int csums_in_item;
 
 	file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
@@ -162,8 +161,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 	u64 item_last_offset = 0;
 	u64 disk_bytenr;
 	u32 diff;
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 	int ret;
 	struct btrfs_path *path;
 	struct btrfs_csum_item *item = NULL;
@@ -290,7 +288,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 	int ret;
 	size_t size;
 	u64 csum_end;
-	u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -492,8 +490,7 @@ static noinline int truncate_one_csum(struct btrfs_trans_handle *trans,
 				      u64 bytenr, u64 len)
 {
 	struct extent_buffer *leaf;
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 	u64 csum_end;
 	u64 end_byte = bytenr + len;
 	u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits;
@@ -549,8 +546,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
 	u64 csum_end;
 	struct extent_buffer *leaf;
 	int ret;
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 	int blocksize_bits = root->fs_info->sb->s_blocksize_bits;
 
 	root = root->fs_info->csum_root;
@@ -676,8 +672,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 	struct btrfs_sector_sum *sector_sum;
 	u32 nritems;
 	u32 ins_size;
-	u16 csum_size =
-		btrfs_super_csum_size(&root->fs_info->super_copy);
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 
 	path = btrfs_alloc_path();
 	if (!path)
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index a8586e10953c..b6b612e14ed7 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -824,7 +824,7 @@ static noinline int cow_file_range(struct inode *inode,
 	}
 
 	BUG_ON(disk_num_bytes >
-	       btrfs_super_total_bytes(&root->fs_info->super_copy));
+	       btrfs_super_total_bytes(root->fs_info->super_copy));
 
 	alloc_hint = get_extent_allocation_hint(inode, start, num_bytes);
 	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 33aae13cc74b..8f6e14279409 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -282,7 +282,7 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 	struct fstrim_range range;
 	u64 minlen = ULLONG_MAX;
 	u64 num_devices = 0;
-	u64 total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
+	u64 total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
 	int ret;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -1164,7 +1164,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		mutex_unlock(&inode->i_mutex);
 	}
 
-	disk_super = &root->fs_info->super_copy;
+	disk_super = root->fs_info->super_copy;
 	features = btrfs_super_incompat_flags(disk_super);
 	if (range->compress_type == BTRFS_COMPRESS_LZO) {
 		features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO;
@@ -2613,7 +2613,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 		return PTR_ERR(trans);
 	}
 
-	dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
+	dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
 	di = btrfs_lookup_dir_item(trans, root->fs_info->tree_root, path,
 				   dir_id, "default", 7, 1);
 	if (IS_ERR_OR_NULL(di)) {
@@ -2629,7 +2629,7 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_free_path(path);
 
-	disk_super = &root->fs_info->super_copy;
+	disk_super = root->fs_info->super_copy;
 	features = btrfs_super_incompat_flags(disk_super);
 	if (!(features & BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL)) {
 		features |= BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index a8d03d5efb5d..69a600f07763 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -182,7 +182,7 @@ struct scrub_dev *scrub_setup_dev(struct btrfs_device *dev)
 	sdev->curr = -1;
 	atomic_set(&sdev->in_flight, 0);
 	atomic_set(&sdev->cancel_req, 0);
-	sdev->csum_size = btrfs_super_csum_size(&fs_info->super_copy);
+	sdev->csum_size = btrfs_super_csum_size(fs_info->super_copy);
 	INIT_LIST_HEAD(&sdev->csum_list);
 
 	spin_lock_init(&sdev->list_lock);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 5429b1fa0bfc..f7e9de724ef2 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -216,7 +216,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 	char *compress_type;
 	bool compress_force = false;
 
-	cache_gen = btrfs_super_cache_generation(&root->fs_info->super_copy);
+	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
 	if (cache_gen)
 		btrfs_set_opt(info->mount_opt, SPACE_CACHE);
 
@@ -524,7 +524,7 @@ static struct dentry *get_default_root(struct super_block *sb,
 	 * will mount by default if we haven't been given a specific subvolume
 	 * to mount.
 	 */
-	dir_id = btrfs_super_root_dir(&root->fs_info->super_copy);
+	dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
 	di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
 	if (IS_ERR(di)) {
 		btrfs_free_path(path);
@@ -937,6 +937,13 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 	fs_info->fs_devices = fs_devices;
 	tree_root->fs_info = fs_info;
 
+	fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
+	fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS);
+	if (!fs_info->super_copy || !fs_info->super_for_commit) {
+		error = -ENOMEM;
+		goto error_close_devices;
+	}
+
 	bdev = fs_devices->latest_bdev;
 	s = sget(fs_type, btrfs_test_super, btrfs_set_super, tree_root);
 	if (IS_ERR(s)) {
@@ -951,7 +958,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 		}
 
 		btrfs_close_devices(fs_devices);
-		kfree(fs_info);
+		free_fs_info(fs_info);
 		kfree(tree_root);
 	} else {
 		char b[BDEVNAME_SIZE];
@@ -979,7 +986,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 
 error_close_devices:
 	btrfs_close_devices(fs_devices);
-	kfree(fs_info);
+	free_fs_info(fs_info);
 	kfree(tree_root);
 	return ERR_PTR(error);
 }
@@ -1005,7 +1012,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 		if (root->fs_info->fs_devices->rw_devices == 0)
 			return -EACCES;
 
-		if (btrfs_super_log_root(&root->fs_info->super_copy) != 0)
+		if (btrfs_super_log_root(root->fs_info->super_copy) != 0)
 			return -EINVAL;
 
 		ret = btrfs_cleanup_fs_roots(root->fs_info);
@@ -1171,7 +1178,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct btrfs_root *root = btrfs_sb(dentry->d_sb);
-	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+	struct btrfs_super_block *disk_super = root->fs_info->super_copy;
 	struct list_head *head = &root->fs_info->space_info;
 	struct btrfs_space_info *found;
 	u64 total_used = 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 29bef63e23ba..373c7ec1a026 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -991,7 +991,7 @@ static void update_super_roots(struct btrfs_root *root)
 	struct btrfs_root_item *root_item;
 	struct btrfs_super_block *super;
 
-	super = &root->fs_info->super_copy;
+	super = root->fs_info->super_copy;
 
 	root_item = &root->fs_info->chunk_root->root_item;
 	super->chunk_root = root_item->bytenr;
@@ -1301,12 +1301,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 	update_super_roots(root);
 
 	if (!root->fs_info->log_root_recovering) {
-		btrfs_set_super_log_root(&root->fs_info->super_copy, 0);
-		btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0);
+		btrfs_set_super_log_root(root->fs_info->super_copy, 0);
+		btrfs_set_super_log_root_level(root->fs_info->super_copy, 0);
 	}
 
-	memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy,
-	       sizeof(root->fs_info->super_copy));
+	memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy,
+	       sizeof(*root->fs_info->super_copy));
 
 	trans->transaction->blocked = 0;
 	spin_lock(&root->fs_info->trans_lock);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 8ca1b6b83bd1..f4d81c06d48f 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -2118,9 +2118,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
 	BUG_ON(ret);
 	btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark);
 
-	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
+	btrfs_set_super_log_root(root->fs_info->super_for_commit,
 				log_root_tree->node->start);
-	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
+	btrfs_set_super_log_root_level(root->fs_info->super_for_commit,
 				btrfs_header_level(log_root_tree->node));
 
 	log_root_tree->log_batch = 0;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index c6938b45e0fd..c3b45564048e 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1395,8 +1395,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	call_rcu(&device->rcu, free_device);
 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
-	num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1;
-	btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices);
+	num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
+	btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
 
 	if (cur_devices->open_devices == 0) {
 		struct btrfs_fs_devices *fs_devices;
@@ -1458,7 +1458,7 @@ static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans,
 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 	struct btrfs_fs_devices *old_devices;
 	struct btrfs_fs_devices *seed_devices;
-	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
+	struct btrfs_super_block *disk_super = root->fs_info->super_copy;
 	struct btrfs_device *device;
 	u64 super_flags;
 
@@ -1706,12 +1706,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if (!blk_queue_nonrot(bdev_get_queue(bdev)))
 		root->fs_info->fs_devices->rotating = 1;
 
-	total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
-	btrfs_set_super_total_bytes(&root->fs_info->super_copy,
+	total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy);
+	btrfs_set_super_total_bytes(root->fs_info->super_copy,
 				    total_bytes + device->total_bytes);
 
-	total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy);
-	btrfs_set_super_num_devices(&root->fs_info->super_copy,
+	total_bytes = btrfs_super_num_devices(root->fs_info->super_copy);
+	btrfs_set_super_num_devices(root->fs_info->super_copy,
 				    total_bytes + 1);
 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
@@ -1802,7 +1802,7 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans,
 		      struct btrfs_device *device, u64 new_size)
 {
 	struct btrfs_super_block *super_copy =
-		&device->dev_root->fs_info->super_copy;
+		device->dev_root->fs_info->super_copy;
 	u64 old_total = btrfs_super_total_bytes(super_copy);
 	u64 diff = new_size - device->total_bytes;
 
@@ -1861,7 +1861,7 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans,
 static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64
 			chunk_offset)
 {
-	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
 	struct btrfs_disk_key *disk_key;
 	struct btrfs_chunk *chunk;
 	u8 *ptr;
@@ -2187,7 +2187,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	bool retried = false;
 	struct extent_buffer *l;
 	struct btrfs_key key;
-	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
 	u64 old_total = btrfs_super_total_bytes(super_copy);
 	u64 old_size = device->total_bytes;
 	u64 diff = device->total_bytes - new_size;
@@ -2311,7 +2311,7 @@ static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
 			   struct btrfs_key *key,
 			   struct btrfs_chunk *chunk, int item_size)
 {
-	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
 	struct btrfs_disk_key disk_key;
 	u32 array_size;
 	u8 *ptr;
@@ -3653,7 +3653,7 @@ static int read_one_dev(struct btrfs_root *root,
 
 int btrfs_read_sys_array(struct btrfs_root *root)
 {
-	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	struct btrfs_super_block *super_copy = root->fs_info->super_copy;
 	struct extent_buffer *sb;
 	struct btrfs_disk_key *disk_key;
 	struct btrfs_chunk *chunk;
-- 
cgit v1.2.3


From af31f5e5b84b5bf2bcec464153a5130b170b2770 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Nov 2011 15:17:42 -0400
Subject: Btrfs: add a log of past tree roots

This takes some of the free space in the btrfs super block
to record information about most of the roots in the last four
commits.

It also adds a -o recovery to use the root history log when
we're not able to read the tree of tree roots, the extent
tree root, the device tree root or the csum root.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h   |  95 +++++++++++++++++
 fs/btrfs/disk-io.c | 291 ++++++++++++++++++++++++++++++++++++++++++++++++-----
 fs/btrfs/super.c   |   7 +-
 3 files changed, 369 insertions(+), 24 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 5181c53c1124..78f43d1102a0 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -360,6 +360,47 @@ struct btrfs_header {
 #define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
 #define BTRFS_LABEL_SIZE 256
 
+/*
+ * just in case we somehow lose the roots and are not able to mount,
+ * we store an array of the roots from previous transactions
+ * in the super.
+ */
+#define BTRFS_NUM_BACKUP_ROOTS 4
+struct btrfs_root_backup {
+	__le64 tree_root;
+	__le64 tree_root_gen;
+
+	__le64 chunk_root;
+	__le64 chunk_root_gen;
+
+	__le64 extent_root;
+	__le64 extent_root_gen;
+
+	__le64 fs_root;
+	__le64 fs_root_gen;
+
+	__le64 dev_root;
+	__le64 dev_root_gen;
+
+	__le64 csum_root;
+	__le64 csum_root_gen;
+
+	__le64 total_bytes;
+	__le64 bytes_used;
+	__le64 num_devices;
+	/* future */
+	__le64 unsed_64[4];
+
+	u8 tree_root_level;
+	u8 chunk_root_level;
+	u8 extent_root_level;
+	u8 fs_root_level;
+	u8 dev_root_level;
+	u8 csum_root_level;
+	/* future and to align */
+	u8 unused_8[10];
+} __attribute__ ((__packed__));
+
 /*
  * the super block basically lists the main trees of the FS
  * it currently lacks any block count etc etc
@@ -406,6 +447,7 @@ struct btrfs_super_block {
 	/* future expansion */
 	__le64 reserved[31];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
+	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
 } __attribute__ ((__packed__));
 
 /*
@@ -1113,6 +1155,9 @@ struct btrfs_fs_info {
 	u64 fs_state;
 
 	struct btrfs_delayed_root *delayed_root;
+
+	/* next backup root to be overwritten */
+	int backup_root_index;
 };
 
 /*
@@ -1357,6 +1402,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_ENOSPC_DEBUG	 (1 << 15)
 #define BTRFS_MOUNT_AUTO_DEFRAG		(1 << 16)
 #define BTRFS_MOUNT_INODE_MAP_CACHE	(1 << 17)
+#define BTRFS_MOUNT_RECOVERY		(1 << 18)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -1972,6 +2018,55 @@ static inline bool btrfs_root_readonly(struct btrfs_root *root)
 	return root->root_item.flags & BTRFS_ROOT_SUBVOL_RDONLY;
 }
 
+/* struct btrfs_root_backup */
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root, struct btrfs_root_backup,
+		   tree_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_gen, struct btrfs_root_backup,
+		   tree_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_tree_root_level, struct btrfs_root_backup,
+		   tree_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root, struct btrfs_root_backup,
+		   chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_gen, struct btrfs_root_backup,
+		   chunk_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_chunk_root_level, struct btrfs_root_backup,
+		   chunk_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root, struct btrfs_root_backup,
+		   extent_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_gen, struct btrfs_root_backup,
+		   extent_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_extent_root_level, struct btrfs_root_backup,
+		   extent_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root, struct btrfs_root_backup,
+		   fs_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_gen, struct btrfs_root_backup,
+		   fs_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_fs_root_level, struct btrfs_root_backup,
+		   fs_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root, struct btrfs_root_backup,
+		   dev_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_gen, struct btrfs_root_backup,
+		   dev_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_dev_root_level, struct btrfs_root_backup,
+		   dev_root_level, 8);
+
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root, struct btrfs_root_backup,
+		   csum_root, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_gen, struct btrfs_root_backup,
+		   csum_root_gen, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_csum_root_level, struct btrfs_root_backup,
+		   csum_root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(backup_total_bytes, struct btrfs_root_backup,
+		   total_bytes, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_bytes_used, struct btrfs_root_backup,
+		   bytes_used, 64);
+BTRFS_SETGET_STACK_FUNCS(backup_num_devices, struct btrfs_root_backup,
+		   num_devices, 64);
+
 /* struct btrfs_super_block */
 
 BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 761717c98278..a61f8a6cf219 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1134,10 +1134,12 @@ static int find_and_setup_root(struct btrfs_root *tree_root,
 
 	generation = btrfs_root_generation(&root->root_item);
 	blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item));
+	root->commit_root = NULL;
 	root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item),
 				     blocksize, generation);
 	if (!root->node || !btrfs_buffer_uptodate(root->node, generation)) {
 		free_extent_buffer(root->node);
+		root->node = NULL;
 		return -EIO;
 	}
 	root->commit_root = btrfs_root_node(root);
@@ -1576,6 +1578,228 @@ sleep:
 	return 0;
 }
 
+/*
+ * this will find the highest generation in the array of
+ * root backups.  The index of the highest array is returned,
+ * or -1 if we can't find anything.
+ *
+ * We check to make sure the array is valid by comparing the
+ * generation of the latest  root in the array with the generation
+ * in the super block.  If they don't match we pitch it.
+ */
+static int find_newest_super_backup(struct btrfs_fs_info *info, u64 newest_gen)
+{
+	u64 cur;
+	int newest_index = -1;
+	struct btrfs_root_backup *root_backup;
+	int i;
+
+	for (i = 0; i < BTRFS_NUM_BACKUP_ROOTS; i++) {
+		root_backup = info->super_copy->super_roots + i;
+		cur = btrfs_backup_tree_root_gen(root_backup);
+		if (cur == newest_gen)
+			newest_index = i;
+	}
+
+	/* check to see if we actually wrapped around */
+	if (newest_index == BTRFS_NUM_BACKUP_ROOTS - 1) {
+		root_backup = info->super_copy->super_roots;
+		cur = btrfs_backup_tree_root_gen(root_backup);
+		if (cur == newest_gen)
+			newest_index = 0;
+	}
+	return newest_index;
+}
+
+
+/*
+ * find the oldest backup so we know where to store new entries
+ * in the backup array.  This will set the backup_root_index
+ * field in the fs_info struct
+ */
+static void find_oldest_super_backup(struct btrfs_fs_info *info,
+				     u64 newest_gen)
+{
+	int newest_index = -1;
+
+	newest_index = find_newest_super_backup(info, newest_gen);
+	/* if there was garbage in there, just move along */
+	if (newest_index == -1) {
+		info->backup_root_index = 0;
+	} else {
+		info->backup_root_index = (newest_index + 1) % BTRFS_NUM_BACKUP_ROOTS;
+	}
+}
+
+/*
+ * copy all the root pointers into the super backup array.
+ * this will bump the backup pointer by one when it is
+ * done
+ */
+static void backup_super_roots(struct btrfs_fs_info *info)
+{
+	int next_backup;
+	struct btrfs_root_backup *root_backup;
+	int last_backup;
+
+	next_backup = info->backup_root_index;
+	last_backup = (next_backup + BTRFS_NUM_BACKUP_ROOTS - 1) %
+		BTRFS_NUM_BACKUP_ROOTS;
+
+	/*
+	 * just overwrite the last backup if we're at the same generation
+	 * this happens only at umount
+	 */
+	root_backup = info->super_for_commit->super_roots + last_backup;
+	if (btrfs_backup_tree_root_gen(root_backup) ==
+	    btrfs_header_generation(info->tree_root->node))
+		next_backup = last_backup;
+
+	root_backup = info->super_for_commit->super_roots + next_backup;
+
+	/*
+	 * make sure all of our padding and empty slots get zero filled
+	 * regardless of which ones we use today
+	 */
+	memset(root_backup, 0, sizeof(*root_backup));
+
+	info->backup_root_index = (next_backup + 1) % BTRFS_NUM_BACKUP_ROOTS;
+
+	btrfs_set_backup_tree_root(root_backup, info->tree_root->node->start);
+	btrfs_set_backup_tree_root_gen(root_backup,
+			       btrfs_header_generation(info->tree_root->node));
+
+	btrfs_set_backup_tree_root_level(root_backup,
+			       btrfs_header_level(info->tree_root->node));
+
+	btrfs_set_backup_chunk_root(root_backup, info->chunk_root->node->start);
+	btrfs_set_backup_chunk_root_gen(root_backup,
+			       btrfs_header_generation(info->chunk_root->node));
+	btrfs_set_backup_chunk_root_level(root_backup,
+			       btrfs_header_level(info->chunk_root->node));
+
+	btrfs_set_backup_extent_root(root_backup, info->extent_root->node->start);
+	btrfs_set_backup_extent_root_gen(root_backup,
+			       btrfs_header_generation(info->extent_root->node));
+	btrfs_set_backup_extent_root_level(root_backup,
+			       btrfs_header_level(info->extent_root->node));
+
+	btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start);
+	btrfs_set_backup_fs_root_gen(root_backup,
+			       btrfs_header_generation(info->fs_root->node));
+	btrfs_set_backup_fs_root_level(root_backup,
+			       btrfs_header_level(info->fs_root->node));
+
+	btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
+	btrfs_set_backup_dev_root_gen(root_backup,
+			       btrfs_header_generation(info->dev_root->node));
+	btrfs_set_backup_dev_root_level(root_backup,
+				       btrfs_header_level(info->dev_root->node));
+
+	btrfs_set_backup_csum_root(root_backup, info->csum_root->node->start);
+	btrfs_set_backup_csum_root_gen(root_backup,
+			       btrfs_header_generation(info->csum_root->node));
+	btrfs_set_backup_csum_root_level(root_backup,
+			       btrfs_header_level(info->csum_root->node));
+
+	btrfs_set_backup_total_bytes(root_backup,
+			     btrfs_super_total_bytes(info->super_copy));
+	btrfs_set_backup_bytes_used(root_backup,
+			     btrfs_super_bytes_used(info->super_copy));
+	btrfs_set_backup_num_devices(root_backup,
+			     btrfs_super_num_devices(info->super_copy));
+
+	/*
+	 * if we don't copy this out to the super_copy, it won't get remembered
+	 * for the next commit
+	 */
+	memcpy(&info->super_copy->super_roots,
+	       &info->super_for_commit->super_roots,
+	       sizeof(*root_backup) * BTRFS_NUM_BACKUP_ROOTS);
+}
+
+/*
+ * this copies info out of the root backup array and back into
+ * the in-memory super block.  It is meant to help iterate through
+ * the array, so you send it the number of backups you've already
+ * tried and the last backup index you used.
+ *
+ * this returns -1 when it has tried all the backups
+ */
+static noinline int next_root_backup(struct btrfs_fs_info *info,
+				     struct btrfs_super_block *super,
+				     int *num_backups_tried, int *backup_index)
+{
+	struct btrfs_root_backup *root_backup;
+	int newest = *backup_index;
+
+	if (*num_backups_tried == 0) {
+		u64 gen = btrfs_super_generation(super);
+
+		newest = find_newest_super_backup(info, gen);
+		if (newest == -1)
+			return -1;
+
+		*backup_index = newest;
+		*num_backups_tried = 1;
+	} else if (*num_backups_tried == BTRFS_NUM_BACKUP_ROOTS) {
+		/* we've tried all the backups, all done */
+		return -1;
+	} else {
+		/* jump to the next oldest backup */
+		newest = (*backup_index + BTRFS_NUM_BACKUP_ROOTS - 1) %
+			BTRFS_NUM_BACKUP_ROOTS;
+		*backup_index = newest;
+		*num_backups_tried += 1;
+	}
+	root_backup = super->super_roots + newest;
+
+	btrfs_set_super_generation(super,
+				   btrfs_backup_tree_root_gen(root_backup));
+	btrfs_set_super_root(super, btrfs_backup_tree_root(root_backup));
+	btrfs_set_super_root_level(super,
+				   btrfs_backup_tree_root_level(root_backup));
+	btrfs_set_super_bytes_used(super, btrfs_backup_bytes_used(root_backup));
+
+	/*
+	 * fixme: the total bytes and num_devices need to match or we should
+	 * need a fsck
+	 */
+	btrfs_set_super_total_bytes(super, btrfs_backup_total_bytes(root_backup));
+	btrfs_set_super_num_devices(super, btrfs_backup_num_devices(root_backup));
+	return 0;
+}
+
+/* helper to cleanup tree roots */
+static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
+{
+	free_extent_buffer(info->tree_root->node);
+	free_extent_buffer(info->tree_root->commit_root);
+	free_extent_buffer(info->dev_root->node);
+	free_extent_buffer(info->dev_root->commit_root);
+	free_extent_buffer(info->extent_root->node);
+	free_extent_buffer(info->extent_root->commit_root);
+	free_extent_buffer(info->csum_root->node);
+	free_extent_buffer(info->csum_root->commit_root);
+
+	info->tree_root->node = NULL;
+	info->tree_root->commit_root = NULL;
+	info->dev_root->node = NULL;
+	info->dev_root->commit_root = NULL;
+	info->extent_root->node = NULL;
+	info->extent_root->commit_root = NULL;
+	info->csum_root->node = NULL;
+	info->csum_root->commit_root = NULL;
+
+	if (chunk_root) {
+		free_extent_buffer(info->chunk_root->node);
+		free_extent_buffer(info->chunk_root->commit_root);
+		info->chunk_root->node = NULL;
+		info->chunk_root->commit_root = NULL;
+	}
+}
+
+
 struct btrfs_root *open_ctree(struct super_block *sb,
 			      struct btrfs_fs_devices *fs_devices,
 			      char *options)
@@ -1603,6 +1827,8 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	int ret;
 	int err = -EINVAL;
+	int num_backups_tried = 0;
+	int backup_index = 0;
 
 	struct btrfs_super_block *disk_super;
 
@@ -1781,6 +2007,13 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 
 	btrfs_check_super_valid(fs_info, sb->s_flags & MS_RDONLY);
 
+	/*
+	 * run through our array of backup supers and setup
+	 * our ring pointer to the oldest one
+	 */
+	generation = btrfs_super_generation(disk_super);
+	find_oldest_super_backup(fs_info, generation);
+
 	/*
 	 * In the long term, we'll store the compression type in the super
 	 * block, and it'll be used for per file compression control.
@@ -1938,7 +2171,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (!test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) {
 		printk(KERN_WARNING "btrfs: failed to read chunk root on %s\n",
 		       sb->s_id);
-		goto fail_chunk_root;
+		goto fail_tree_roots;
 	}
 	btrfs_set_root_node(&chunk_root->root_item, chunk_root->node);
 	chunk_root->commit_root = btrfs_root_node(chunk_root);
@@ -1953,11 +2186,12 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	if (ret) {
 		printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n",
 		       sb->s_id);
-		goto fail_chunk_root;
+		goto fail_tree_roots;
 	}
 
 	btrfs_close_extra_devices(fs_devices);
 
+retry_root_backup:
 	blocksize = btrfs_level_size(tree_root,
 				     btrfs_super_root_level(disk_super));
 	generation = btrfs_super_generation(disk_super);
@@ -1965,32 +2199,33 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super),
 					  blocksize, generation);
-	if (!tree_root->node)
-		goto fail_chunk_root;
-	if (!test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
+	if (!tree_root->node ||
+	    !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) {
 		printk(KERN_WARNING "btrfs: failed to read tree root on %s\n",
 		       sb->s_id);
-		goto fail_tree_root;
+
+		goto recovery_tree_root;
 	}
+
 	btrfs_set_root_node(&tree_root->root_item, tree_root->node);
 	tree_root->commit_root = btrfs_root_node(tree_root);
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
 	if (ret)
-		goto fail_tree_root;
+		goto recovery_tree_root;
 	extent_root->track_dirty = 1;
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_DEV_TREE_OBJECTID, dev_root);
 	if (ret)
-		goto fail_extent_root;
+		goto recovery_tree_root;
 	dev_root->track_dirty = 1;
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_CSUM_TREE_OBJECTID, csum_root);
 	if (ret)
-		goto fail_dev_root;
+		goto recovery_tree_root;
 
 	csum_root->track_dirty = 1;
 
@@ -2123,20 +2358,10 @@ fail_cleaner:
 
 fail_block_groups:
 	btrfs_free_block_groups(fs_info);
-	free_extent_buffer(csum_root->node);
-	free_extent_buffer(csum_root->commit_root);
-fail_dev_root:
-	free_extent_buffer(dev_root->node);
-	free_extent_buffer(dev_root->commit_root);
-fail_extent_root:
-	free_extent_buffer(extent_root->node);
-	free_extent_buffer(extent_root->commit_root);
-fail_tree_root:
-	free_extent_buffer(tree_root->node);
-	free_extent_buffer(tree_root->commit_root);
-fail_chunk_root:
-	free_extent_buffer(chunk_root->node);
-	free_extent_buffer(chunk_root->commit_root);
+
+fail_tree_roots:
+	free_root_pointers(fs_info, 1);
+
 fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->generic_worker);
 	btrfs_stop_workers(&fs_info->fixup_workers);
@@ -2164,6 +2389,25 @@ fail_srcu:
 fail:
 	free_fs_info(fs_info);
 	return ERR_PTR(err);
+
+recovery_tree_root:
+
+	if (!btrfs_test_opt(tree_root, RECOVERY))
+		goto fail_tree_roots;
+
+	free_root_pointers(fs_info, 0);
+
+	/* don't use the log in recovery mode, it won't be valid */
+	btrfs_set_super_log_root(disk_super, 0);
+
+	/* we can't trust the free space cache either */
+	btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
+
+	ret = next_root_backup(fs_info, fs_info->super_copy,
+			       &num_backups_tried, &backup_index);
+	if (ret == -1)
+		goto fail_block_groups;
+	goto retry_root_backup;
 }
 
 static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate)
@@ -2333,6 +2577,7 @@ int write_all_supers(struct btrfs_root *root, int max_mirrors)
 
 	max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
 	do_barriers = !btrfs_test_opt(root, NOBARRIER);
+	backup_super_roots(root->fs_info);
 
 	sb = root->fs_info->super_for_commit;
 	dev_item = &sb->dev_item;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f7e9de724ef2..57080dffdfc6 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -164,7 +164,7 @@ enum {
 	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
 	Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
 	Opt_enospc_debug, Opt_subvolrootid, Opt_defrag,
-	Opt_inode_cache, Opt_no_space_cache, Opt_err,
+	Opt_inode_cache, Opt_no_space_cache, Opt_recovery, Opt_err,
 };
 
 static match_table_t tokens = {
@@ -198,6 +198,7 @@ static match_table_t tokens = {
 	{Opt_defrag, "autodefrag"},
 	{Opt_inode_cache, "inode_cache"},
 	{Opt_no_space_cache, "no_space_cache"},
+	{Opt_recovery, "recovery"},
 	{Opt_err, NULL},
 };
 
@@ -392,6 +393,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			printk(KERN_INFO "btrfs: enabling auto defrag");
 			btrfs_set_opt(info->mount_opt, AUTO_DEFRAG);
 			break;
+		case Opt_recovery:
+			printk(KERN_INFO "btrfs: enabling auto recovery");
+			btrfs_set_opt(info->mount_opt, RECOVERY);
+			break;
 		case Opt_err:
 			printk(KERN_INFO "btrfs: unrecognized mount option "
 			       "'%s'\n", p);
-- 
cgit v1.2.3


From 6d668dda0caec537fbf28c4d91e6d18181af3cff Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 3 Nov 2011 22:54:25 -0400
Subject: Btrfs: make a delayed_block_rsv for the delayed item insertion

I've been hitting warnings in use_block_rsv when running the delayed insertion
stuff.  It's because we will readjust global block rsv based on what is in use,
which means we could end up discarding reservations that are for the delayed
insertion stuff.  So instead create a seperate block rsv for the delayed
insertion stuff.  This will also make it easier to debug problems with the
delayed insertion reservations since we will know that only the delayed
insertion code touches this block_rsv.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h         |  2 ++
 fs/btrfs/delayed-inode.c | 14 +++++++-------
 fs/btrfs/disk-io.c       |  1 +
 fs/btrfs/extent-tree.c   |  3 +++
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 78f43d1102a0..3002e5d4da0b 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -957,6 +957,8 @@ struct btrfs_fs_info {
 	struct btrfs_block_rsv trans_block_rsv;
 	/* block reservation for chunk tree */
 	struct btrfs_block_rsv chunk_block_rsv;
+	/* block reservation for delayed operations */
+	struct btrfs_block_rsv delayed_block_rsv;
 
 	struct btrfs_block_rsv empty_block_rsv;
 
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index b52c672f4c18..fc4026af7290 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -591,7 +591,7 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans,
 		return 0;
 
 	src_rsv = trans->block_rsv;
-	dst_rsv = &root->fs_info->global_block_rsv;
+	dst_rsv = &root->fs_info->delayed_block_rsv;
 
 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
@@ -609,7 +609,7 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
 	if (!item->bytes_reserved)
 		return;
 
-	rsv = &root->fs_info->global_block_rsv;
+	rsv = &root->fs_info->delayed_block_rsv;
 	btrfs_block_rsv_release(root, rsv,
 				item->bytes_reserved);
 }
@@ -628,7 +628,7 @@ static int btrfs_delayed_inode_reserve_metadata(
 		return 0;
 
 	src_rsv = trans->block_rsv;
-	dst_rsv = &root->fs_info->global_block_rsv;
+	dst_rsv = &root->fs_info->delayed_block_rsv;
 
 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
@@ -646,7 +646,7 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_root *root,
 	if (!node->bytes_reserved)
 		return;
 
-	rsv = &root->fs_info->global_block_rsv;
+	rsv = &root->fs_info->delayed_block_rsv;
 	btrfs_block_rsv_release(root, rsv,
 				node->bytes_reserved);
 	node->bytes_reserved = 0;
@@ -1026,7 +1026,7 @@ int btrfs_run_delayed_items(struct btrfs_trans_handle *trans,
 	path->leave_spinning = 1;
 
 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &root->fs_info->global_block_rsv;
+	trans->block_rsv = &root->fs_info->delayed_block_rsv;
 
 	delayed_root = btrfs_get_delayed_root(root);
 
@@ -1069,7 +1069,7 @@ static int __btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans,
 	path->leave_spinning = 1;
 
 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &node->root->fs_info->global_block_rsv;
+	trans->block_rsv = &node->root->fs_info->delayed_block_rsv;
 
 	ret = btrfs_insert_delayed_items(trans, path, node->root, node);
 	if (!ret)
@@ -1149,7 +1149,7 @@ static void btrfs_async_run_delayed_node_done(struct btrfs_work *work)
 		goto free_path;
 
 	block_rsv = trans->block_rsv;
-	trans->block_rsv = &root->fs_info->global_block_rsv;
+	trans->block_rsv = &root->fs_info->delayed_block_rsv;
 
 	ret = btrfs_insert_delayed_items(trans, path, root, delayed_node);
 	if (!ret)
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a61f8a6cf219..23b6776477b7 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1891,6 +1891,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
 	btrfs_init_block_rsv(&fs_info->trans_block_rsv);
 	btrfs_init_block_rsv(&fs_info->chunk_block_rsv);
 	btrfs_init_block_rsv(&fs_info->empty_block_rsv);
+	btrfs_init_block_rsv(&fs_info->delayed_block_rsv);
 	atomic_set(&fs_info->nr_async_submits, 0);
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->async_submit_draining, 0);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 782eb3ea8edf..01c1f08b976a 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3914,6 +3914,7 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
 	fs_info->delalloc_block_rsv.space_info = space_info;
 	fs_info->trans_block_rsv.space_info = space_info;
 	fs_info->empty_block_rsv.space_info = space_info;
+	fs_info->delayed_block_rsv.space_info = space_info;
 
 	fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
 	fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
@@ -3933,6 +3934,8 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
 	WARN_ON(fs_info->trans_block_rsv.reserved > 0);
 	WARN_ON(fs_info->chunk_block_rsv.size > 0);
 	WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
+	WARN_ON(fs_info->delayed_block_rsv.size > 0);
+	WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
 }
 
 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3


From 663350ac38c67ca388acea6e876dc6d668c232b0 Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Thu, 3 Nov 2011 22:54:25 -0400
Subject: Btrfs: be smarter about committing the transaction in
 reserve_metadata_bytes

Because of the overcommit stuff I had to make it so that we committed the
transaction all the time in reserve_metadata_bytes in case we had overcommitted
because of delayed items.  This was because previously we had no way of knowing
how much space was reserved for delayed items.  Now that we have the
delayed_block_rsv we can check it to see if committing the transaction would get
us anywhere.  This patch breaks out the committing logic into a helper function
that will check to see if committing the transaction would free enough space for
us to get anything done.  With this patch xfstests 83 goes from taking 445
seconds to taking 28 seconds on my box.  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent-tree.c | 86 +++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 67 insertions(+), 19 deletions(-)

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 01c1f08b976a..5b84205e7685 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3334,12 +3334,12 @@ out:
 /*
  * shrink metadata reservation for delalloc
  */
-static int shrink_delalloc(struct btrfs_trans_handle *trans,
-			   struct btrfs_root *root, u64 to_reclaim,
+static int shrink_delalloc(struct btrfs_root *root, u64 to_reclaim,
 			   bool wait_ordered)
 {
 	struct btrfs_block_rsv *block_rsv;
 	struct btrfs_space_info *space_info;
+	struct btrfs_trans_handle *trans;
 	u64 reserved;
 	u64 max_reclaim;
 	u64 reclaimed = 0;
@@ -3348,6 +3348,7 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	int loops = 0;
 	unsigned long progress;
 
+	trans = (struct btrfs_trans_handle *)current->journal_info;
 	block_rsv = &root->fs_info->delalloc_block_rsv;
 	space_info = block_rsv->space_info;
 
@@ -3417,6 +3418,60 @@ static int shrink_delalloc(struct btrfs_trans_handle *trans,
 	return reclaimed >= to_reclaim;
 }
 
+/**
+ * maybe_commit_transaction - possibly commit the transaction if its ok to
+ * @root - the root we're allocating for
+ * @bytes - the number of bytes we want to reserve
+ * @force - force the commit
+ *
+ * This will check to make sure that committing the transaction will actually
+ * get us somewhere and then commit the transaction if it does.  Otherwise it
+ * will return -ENOSPC.
+ */
+static int may_commit_transaction(struct btrfs_root *root,
+				  struct btrfs_space_info *space_info,
+				  u64 bytes, int force)
+{
+	struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
+	struct btrfs_trans_handle *trans;
+
+	trans = (struct btrfs_trans_handle *)current->journal_info;
+	if (trans)
+		return -EAGAIN;
+
+	if (force)
+		goto commit;
+
+	/* See if there is enough pinned space to make this reservation */
+	spin_lock(&space_info->lock);
+	if (space_info->bytes_pinned >= bytes) {
+		spin_unlock(&space_info->lock);
+		goto commit;
+	}
+	spin_unlock(&space_info->lock);
+
+	/*
+	 * See if there is some space in the delayed insertion reservation for
+	 * this reservation.
+	 */
+	if (space_info != delayed_rsv->space_info)
+		return -ENOSPC;
+
+	spin_lock(&delayed_rsv->lock);
+	if (delayed_rsv->size < bytes) {
+		spin_unlock(&delayed_rsv->lock);
+		return -ENOSPC;
+	}
+	spin_unlock(&delayed_rsv->lock);
+
+commit:
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans))
+		return -ENOSPC;
+
+	return btrfs_commit_transaction(trans, root);
+}
+
 /**
  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
  * @root - the root we're allocating for
@@ -3436,7 +3491,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
 				  u64 orig_bytes, int flush)
 {
 	struct btrfs_space_info *space_info = block_rsv->space_info;
-	struct btrfs_trans_handle *trans;
 	u64 used;
 	u64 num_bytes = orig_bytes;
 	int retries = 0;
@@ -3445,7 +3499,6 @@ static int reserve_metadata_bytes(struct btrfs_root *root,
 	bool flushing = false;
 	bool wait_ordered = false;
 
-	trans = (struct btrfs_trans_handle *)current->journal_info;
 again:
 	ret = 0;
 	spin_lock(&space_info->lock);
@@ -3461,7 +3514,7 @@ again:
 		 * deadlock since we are waiting for the flusher to finish, but
 		 * hold the current transaction open.
 		 */
-		if (trans)
+		if (current->journal_info)
 			return -EAGAIN;
 		ret = wait_event_interruptible(space_info->wait,
 					       !space_info->flush);
@@ -3517,12 +3570,16 @@ again:
 		 */
 		avail = (space_info->total_bytes - space_info->bytes_used) * 8;
 		do_div(avail, 10);
-		if (space_info->bytes_pinned >= avail && flush && !trans &&
-		    !committed) {
+		if (space_info->bytes_pinned >= avail && flush && !committed) {
 			space_info->flush = 1;
 			flushing = true;
 			spin_unlock(&space_info->lock);
-			goto commit;
+			ret = may_commit_transaction(root, space_info,
+						     orig_bytes, 1);
+			if (ret)
+				goto out;
+			committed = true;
+			goto again;
 		}
 
 		spin_lock(&root->fs_info->free_chunk_lock);
@@ -3575,7 +3632,7 @@ again:
 	 * We do synchronous shrinking since we don't actually unreserve
 	 * metadata until after the IO is completed.
 	 */
-	ret = shrink_delalloc(trans, root, num_bytes, wait_ordered);
+	ret = shrink_delalloc(root, num_bytes, wait_ordered);
 	if (ret < 0)
 		goto out;
 
@@ -3592,21 +3649,12 @@ again:
 		goto again;
 	}
 
-	ret = -EAGAIN;
-	if (trans)
-		goto out;
-
-commit:
 	ret = -ENOSPC;
 	if (committed)
 		goto out;
 
-	trans = btrfs_join_transaction(root);
-	if (IS_ERR(trans))
-		goto out;
-	ret = btrfs_commit_transaction(trans, root);
+	ret = may_commit_transaction(root, space_info, orig_bytes, 0);
 	if (!ret) {
-		trans = NULL;
 		committed = true;
 		goto again;
 	}
-- 
cgit v1.2.3


From bf0da8c183a15656eee63c54f334c3794320872a Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Fri, 4 Nov 2011 12:29:37 -0400
Subject: Btrfs: ClearPageError during writepage and clean_tree_block

Failure testing was tripping up over stale PageError bits in
metadata pages.  If we have an io error on a block, and later on
end up reusing it, nobody ever clears PageError on those pages.

During commit, we'll find PageError and think we had trouble writing
the block, which will lead to aborts and other problems.

This changes clean_tree_block and the btrfs writepage code to
clear the PageError bit.  In both cases we're either completely
done with the page or the page has good stuff and the error bit
is no longer valid.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/extent_io.c   | 4 ++++
 fs/btrfs/transaction.c | 7 ++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index b40ba75f4483..cc3c58970d4e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2334,6 +2334,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	trace___extent_writepage(page, inode, wbc);
 
 	WARN_ON(!PageLocked(page));
+
+	ClearPageError(page);
+
 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
 	if (page->index > end_index ||
 	   (page->index == end_index && !pg_offset)) {
@@ -3402,6 +3405,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 						PAGECACHE_TAG_DIRTY);
 		}
 		spin_unlock_irq(&page->mapping->tree_lock);
+		ClearPageError(page);
 		unlock_page(page);
 	}
 	return 0;
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 373c7ec1a026..29f782cc2cc9 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -634,7 +634,12 @@ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 
 	ret = btrfs_write_marked_extents(root, dirty_pages, mark);
 	ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark);
-	return ret || ret2;
+
+	if (ret)
+		return ret;
+	if (ret2)
+		return ret2;
+	return 0;
 }
 
 int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans,
-- 
cgit v1.2.3


From c06a0e120a4e381a1c291c1fce3c6155c5791cae Mon Sep 17 00:00:00 2001
From: Josef Bacik <josef@redhat.com>
Date: Fri, 4 Nov 2011 19:56:02 -0400
Subject: Btrfs: fix delayed insertion reservation

We all keep getting those stupid warnings from use_block_rsv when running
stress.sh, and it's because the delayed insertion stuff is being stupid.  It's
not the delayed insertion stuffs fault, it's all just stupid.  When marking an
inode dirty for oh say updating the time on it, we just do a
btrfs_join_transaction, which doesn't reserve any space.  This is stupid because
we're going to have to have space reserve to make this change, but we do it
because it's fast because chances are we're going to call it over and over again
and it doesn't matter.  Well thanks to the delayed insertion stuff this is
mostly the case, so we do actually need to make this reservation.  So if
trans->bytes_reserved is 0 then try to do a normal reservation.  If not return
ENOSPC which will make the btrfs_dirty_inode start a proper transaction which
will let it do the whole ENOSPC dance and reserve enough space for the delayed
insertion to steal the reservation from the transaction.

The other stupid thing we do is not reserve space for the inode when writing to
the thing.  Usually this is ok since we have to update the time so we'd have
already done all this work before we get to the endio stuff, so it doesn't
matter.  But this is stupid because we could write the data after the
transaction commits where we changed the mtime of the inode so we have to cow
all the way down to the inode anyway.  This used to be masked by the delalloc
reservation stuff, but because we delay the update it doesn't get masked in this
case.  So again the delayed insertion stuff bites us in the ass.  So if our
trans->block_rsv is delalloc, just steal the reservation from the delalloc
reserve.  Hopefully this won't bite us in the ass, but I've said that before.

With this patch stress.sh no longer spits out those stupid warnings (famous last
words).  Thanks,

Signed-off-by: Josef Bacik <josef@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/ctree.h         |  3 +++
 fs/btrfs/delayed-inode.c | 36 ++++++++++++++++++++++++++++--------
 fs/btrfs/extent-tree.c   | 18 ++++++++++++++++++
 3 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 3002e5d4da0b..6bb34fc1ff22 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2353,6 +2353,9 @@ void btrfs_free_block_rsv(struct btrfs_root *root,
 int btrfs_block_rsv_add(struct btrfs_root *root,
 			struct btrfs_block_rsv *block_rsv,
 			u64 num_bytes);
+int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
+				struct btrfs_block_rsv *block_rsv,
+				u64 num_bytes);
 int btrfs_block_rsv_check(struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv, int min_factor);
 int btrfs_block_rsv_refill(struct btrfs_root *root,
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index fc4026af7290..bbe8496d5339 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -624,13 +624,36 @@ static int btrfs_delayed_inode_reserve_metadata(
 	u64 num_bytes;
 	int ret;
 
-	if (!trans->bytes_reserved)
-		return 0;
-
 	src_rsv = trans->block_rsv;
 	dst_rsv = &root->fs_info->delayed_block_rsv;
 
 	num_bytes = btrfs_calc_trans_metadata_size(root, 1);
+
+	/*
+	 * btrfs_dirty_inode will update the inode under btrfs_join_transaction
+	 * which doesn't reserve space for speed.  This is a problem since we
+	 * still need to reserve space for this update, so try to reserve the
+	 * space.
+	 *
+	 * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
+	 * we're accounted for.
+	 */
+	if (!trans->bytes_reserved &&
+	    src_rsv != &root->fs_info->delalloc_block_rsv) {
+		ret = btrfs_block_rsv_add_noflush(root, dst_rsv, num_bytes);
+		/*
+		 * Since we're under a transaction reserve_metadata_bytes could
+		 * try to commit the transaction which will make it return
+		 * EAGAIN to make us stop the transaction we have, so return
+		 * ENOSPC instead so that btrfs_dirty_inode knows what to do.
+		 */
+		if (ret == -EAGAIN)
+			ret = -ENOSPC;
+		if (!ret)
+			node->bytes_reserved = num_bytes;
+		return ret;
+	}
+
 	ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes);
 	if (!ret)
 		node->bytes_reserved = num_bytes;
@@ -1686,11 +1709,8 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
 	}
 
 	ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
-	/*
-	 * we must reserve enough space when we start a new transaction,
-	 * so reserving metadata failure is impossible
-	 */
-	BUG_ON(ret);
+	if (ret)
+		goto release_node;
 
 	fill_stack_inode_item(trans, &delayed_node->inode_item, inode);
 	delayed_node->inode_dirty = 1;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 5b84205e7685..23e936c3de76 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -3814,6 +3814,24 @@ int btrfs_block_rsv_add(struct btrfs_root *root,
 	return ret;
 }
 
+int btrfs_block_rsv_add_noflush(struct btrfs_root *root,
+				struct btrfs_block_rsv *block_rsv,
+				u64 num_bytes)
+{
+	int ret;
+
+	if (num_bytes == 0)
+		return 0;
+
+	ret = reserve_metadata_bytes(root, block_rsv, num_bytes, 0);
+	if (!ret) {
+		block_rsv_add_bytes(block_rsv, num_bytes, 1);
+		return 0;
+	}
+
+	return ret;
+}
+
 int btrfs_block_rsv_check(struct btrfs_root *root,
 			  struct btrfs_block_rsv *block_rsv, int min_factor)
 {
-- 
cgit v1.2.3


From 740c3d226cbba6cd6a32adfb66809c94938f3e57 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Wed, 2 Nov 2011 15:48:34 -0400
Subject: Btrfs: fix the new inspection ioctls for 32 bit compat

The new ioctls to follow backrefs are not clean for 32/64 bit
compat.  This reworks them for u64s everywhere.  They are brand new, so
there are no problems with changing the interface now.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/backref.c |  8 ++++----
 fs/btrfs/ioctl.c   | 10 +++++-----
 fs/btrfs/ioctl.h   | 11 +++++------
 fs/btrfs/scrub.c   |  2 +-
 4 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 2351df0de450..8855aad3929c 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -676,14 +676,14 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
 	bytes_left = ipath->fspath->bytes_left > s_ptr ?
 					ipath->fspath->bytes_left - s_ptr : 0;
 
-	fspath_min = (char *)ipath->fspath->str + (i + 1) * s_ptr;
+	fspath_min = (char *)ipath->fspath->val + (i + 1) * s_ptr;
 	fspath = iref_to_path(ipath->fs_root, ipath->btrfs_path, iref, eb,
 				inum, fspath_min, bytes_left);
 	if (IS_ERR(fspath))
 		return PTR_ERR(fspath);
 
 	if (fspath > fspath_min) {
-		ipath->fspath->str[i] = fspath;
+		ipath->fspath->val[i] = (u64)fspath;
 		++ipath->fspath->elem_cnt;
 		ipath->fspath->bytes_left = fspath - fspath_min;
 	} else {
@@ -698,9 +698,9 @@ static int inode_to_path(u64 inum, struct btrfs_inode_ref *iref,
 /*
  * this dumps all file system paths to the inode into the ipath struct, provided
  * is has been created large enough. each path is zero-terminated and accessed
- * from ipath->fspath->str[i].
+ * from ipath->fspath->val[i].
  * when it returns, there are ipath->fspath->elem_cnt number of paths available
- * in ipath->fspath->str[]. when the allocated space wasn't sufficient, the
+ * in ipath->fspath->val[]. when the allocated space wasn't sufficient, the
  * number of missed paths in recored in ipath->fspath->elem_missed, otherwise,
  * it's zero. ipath->fspath->bytes_missing holds the number of bytes that would
  * have been needed to return all paths.
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index cc9893990341..4a34c472f126 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2895,7 +2895,7 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
 {
 	int ret = 0;
 	int i;
-	unsigned long rel_ptr;
+	u64 rel_ptr;
 	int size;
 	struct btrfs_ioctl_ino_path_args *ipa = NULL;
 	struct inode_fs_paths *ipath = NULL;
@@ -2930,11 +2930,11 @@ static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
 		goto out;
 
 	for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
-		rel_ptr = ipath->fspath->str[i] - (char *)ipath->fspath->str;
-		ipath->fspath->str[i] = (void *)rel_ptr;
+		rel_ptr = ipath->fspath->val[i] - (u64)ipath->fspath->val;
+		ipath->fspath->val[i] = rel_ptr;
 	}
 
-	ret = copy_to_user(ipa->fspath, ipath->fspath, size);
+	ret = copy_to_user((void *)ipa->fspath, (void *)ipath->fspath, size);
 	if (ret) {
 		ret = -EFAULT;
 		goto out;
@@ -3017,7 +3017,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 	if (ret < 0)
 		goto out;
 
-	ret = copy_to_user(loi->inodes, inodes, size);
+	ret = copy_to_user((void *)loi->inodes, (void *)inodes, size);
 	if (ret)
 		ret = -EFAULT;
 
diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h
index 2da30d4950e6..252ae9915de8 100644
--- a/fs/btrfs/ioctl.h
+++ b/fs/btrfs/ioctl.h
@@ -198,24 +198,23 @@ struct btrfs_data_container {
 	__u32	bytes_missing;	/* out -- additional bytes needed for result */
 	__u32	elem_cnt;	/* out */
 	__u32	elem_missed;	/* out */
-	union {
-		char	*str[0];	/* out */
-		__u64	val[0];		/* out */
-	};
+	__u64	val[0];		/* out */
 };
 
 struct btrfs_ioctl_ino_path_args {
 	__u64				inum;		/* in */
 	__u32				size;		/* in */
 	__u64				reserved[4];
-	struct btrfs_data_container	*fspath;	/* out */
+	/* struct btrfs_data_container	*fspath;	   out */
+	__u64				fspath;		/* out */
 };
 
 struct btrfs_ioctl_logical_ino_args {
 	__u64				logical;	/* in */
 	__u32				size;		/* in */
 	__u64				reserved[4];
-	struct btrfs_data_container	*inodes;	/* out */
+	/* struct btrfs_data_container	*inodes;	out   */
+	__u64				inodes;
 };
 
 #define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 94cd3a19e9c8..562dad10dee9 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -272,7 +272,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root, void *ctx)
 			swarn->logical, swarn->dev->name,
 			(unsigned long long)swarn->sector, root, inum, offset,
 			min(isize - offset, (u64)PAGE_SIZE), nlink,
-			ipath->fspath->str[i]);
+			(char *)ipath->fspath->val[i]);
 
 	free_ipath(ipath);
 	return 0;
-- 
cgit v1.2.3


From c674e04e1cd6049715e7b9446790f4b441e547c0 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Nov 2011 22:23:13 -0400
Subject: Btrfs: fix extent_buffer leak in the metadata IO error handling

The scrub readahead branch brought in a new error handling hook,
but it was leaking extent_buffer references.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 0eb1f0951251..40a62b980087 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -644,6 +644,7 @@ static int btree_io_failed_hook(struct bio *failed_bio,
 		clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags);
 		btree_readahead_hook(root, eb, eb->start, -EIO);
 	}
+	free_extent_buffer(eb);
 
 out:
 	return -EIO;	/* we fixed nothing */
-- 
cgit v1.2.3


From 306c8b68c82dfe6b7c9e5b61985760ad5d089205 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Thu, 3 Nov 2011 15:21:39 -0400
Subject: Btrfs: stop the readahead threads on failed mount

If we don't stop them, they linger around corrupting
memory by using pointers to freed things.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 40a62b980087..e532892431f4 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -2444,6 +2444,7 @@ fail_tree_roots:
 
 fail_sb_buffer:
 	btrfs_stop_workers(&fs_info->generic_worker);
+	btrfs_stop_workers(&fs_info->readahead_workers);
 	btrfs_stop_workers(&fs_info->fixup_workers);
 	btrfs_stop_workers(&fs_info->delalloc_workers);
 	btrfs_stop_workers(&fs_info->workers);
-- 
cgit v1.2.3


From 9510dc4c62252e96dca44a94948327c76774f25b Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 4 Nov 2011 09:41:02 -0400
Subject: Btrfs: stop leaking btrfs_bios on readahead

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/reada.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index cd857119ba8a..35eaf6893679 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -423,6 +423,7 @@ again:
 	}
 	spin_unlock(&fs_info->reada_lock);
 
+	kfree(multi);
 	return re;
 
 error:
@@ -447,6 +448,7 @@ error:
 		kref_put(&zone->refcnt, reada_zone_release);
 		spin_unlock(&fs_info->reada_lock);
 	}
+	kfree(multi);
 	kfree(re);
 	if (looped)
 		goto again;
-- 
cgit v1.2.3


From 21ca543efc12674fddb22ddf4ea4906427f4e982 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 4 Nov 2011 09:41:02 -0400
Subject: Btrfs: rename btrfs_bio multi -> bbio for consistency

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/reada.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c
index 35eaf6893679..2373b39a132b 100644
--- a/fs/btrfs/reada.c
+++ b/fs/btrfs/reada.c
@@ -247,7 +247,7 @@ int btree_readahead_hook(struct btrfs_root *root, struct extent_buffer *eb,
 
 static struct reada_zone *reada_find_zone(struct btrfs_fs_info *fs_info,
 					  struct btrfs_device *dev, u64 logical,
-					  struct btrfs_bio *multi)
+					  struct btrfs_bio *bbio)
 {
 	int ret;
 	int looped = 0;
@@ -297,11 +297,11 @@ again:
 	kref_init(&zone->refcnt);
 	zone->elems = 0;
 	zone->device = dev; /* our device always sits at index 0 */
-	for (i = 0; i < multi->num_stripes; ++i) {
+	for (i = 0; i < bbio->num_stripes; ++i) {
 		/* bounds have already been checked */
-		zone->devs[i] = multi->stripes[i].dev;
+		zone->devs[i] = bbio->stripes[i].dev;
 	}
-	zone->ndevs = multi->num_stripes;
+	zone->ndevs = bbio->num_stripes;
 
 	spin_lock(&fs_info->reada_lock);
 	ret = radix_tree_insert(&dev->reada_zones,
@@ -327,7 +327,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root,
 	struct reada_extent *re = NULL;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct btrfs_mapping_tree *map_tree = &fs_info->mapping_tree;
-	struct btrfs_bio *multi = NULL;
+	struct btrfs_bio *bbio = NULL;
 	struct btrfs_device *dev;
 	u32 blocksize;
 	u64 length;
@@ -361,21 +361,21 @@ again:
 	 * map block
 	 */
 	length = blocksize;
-	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &multi, 0);
-	if (ret || !multi || length < blocksize)
+	ret = btrfs_map_block(map_tree, REQ_WRITE, logical, &length, &bbio, 0);
+	if (ret || !bbio || length < blocksize)
 		goto error;
 
-	if (multi->num_stripes > MAX_MIRRORS) {
+	if (bbio->num_stripes > MAX_MIRRORS) {
 		printk(KERN_ERR "btrfs readahead: more than %d copies not "
 				"supported", MAX_MIRRORS);
 		goto error;
 	}
 
-	for (nzones = 0; nzones < multi->num_stripes; ++nzones) {
+	for (nzones = 0; nzones < bbio->num_stripes; ++nzones) {
 		struct reada_zone *zone;
 
-		dev = multi->stripes[nzones].dev;
-		zone = reada_find_zone(fs_info, dev, logical, multi);
+		dev = bbio->stripes[nzones].dev;
+		zone = reada_find_zone(fs_info, dev, logical, bbio);
 		if (!zone)
 			break;
 
@@ -407,11 +407,11 @@ again:
 		goto error;
 	}
 	for (i = 0; i < nzones; ++i) {
-		dev = multi->stripes[i].dev;
+		dev = bbio->stripes[i].dev;
 		ret = radix_tree_insert(&dev->reada_extents, index, re);
 		if (ret) {
 			while (--i >= 0) {
-				dev = multi->stripes[i].dev;
+				dev = bbio->stripes[i].dev;
 				BUG_ON(dev == NULL);
 				radix_tree_delete(&dev->reada_extents, index);
 			}
@@ -423,7 +423,7 @@ again:
 	}
 	spin_unlock(&fs_info->reada_lock);
 
-	kfree(multi);
+	kfree(bbio);
 	return re;
 
 error:
@@ -448,7 +448,7 @@ error:
 		kref_put(&zone->refcnt, reada_zone_release);
 		spin_unlock(&fs_info->reada_lock);
 	}
-	kfree(multi);
+	kfree(bbio);
 	kfree(re);
 	if (looped)
 		goto again;
-- 
cgit v1.2.3


From 56d2a48f81a1bde827c625b90221fade72fe9c61 Mon Sep 17 00:00:00 2001
From: Ilya Dryomov <idryomov@gmail.com>
Date: Fri, 4 Nov 2011 09:41:02 -0400
Subject: Btrfs: fix a potential btrfs_bio leak on scrub fixups

In case we were able to map less than we wanted (length < PAGE_SIZE
clause is true) btrfs_bio is still allocated and we have to free it.

Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/scrub.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 562dad10dee9..ed11d3866afd 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -655,6 +655,7 @@ static void scrub_fixup(struct scrub_bio *sbio, int ix)
 		       "scrub_fixup: btrfs_map_block failed us for %llu\n",
 		       (unsigned long long)logical);
 		WARN_ON(1);
+		kfree(bbio);
 		return;
 	}
 
-- 
cgit v1.2.3


From d43317dcd074818d4bd12ddd4184a29aff98907b Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sun, 6 Nov 2011 03:26:19 -0500
Subject: Btrfs: fix race during transaction joins

While we're allocating ram for a new transaction, we drop our spinlock.
When we get the lock back, we do check to see if a transaction started
while we slept, but we don't check to make sure it isn't blocked
because a commit has already started.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/transaction.c | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 29f782cc2cc9..960835eaf4da 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -55,6 +55,7 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
 	struct btrfs_transaction *cur_trans;
 
 	spin_lock(&root->fs_info->trans_lock);
+loop:
 	if (root->fs_info->trans_no_join) {
 		if (!nofail) {
 			spin_unlock(&root->fs_info->trans_lock);
@@ -75,16 +76,18 @@ static noinline int join_transaction(struct btrfs_root *root, int nofail)
 	cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, GFP_NOFS);
 	if (!cur_trans)
 		return -ENOMEM;
+
 	spin_lock(&root->fs_info->trans_lock);
 	if (root->fs_info->running_transaction) {
+		/*
+		 * someone started a transaction after we unlocked.  Make sure
+		 * to redo the trans_no_join checks above
+		 */
 		kmem_cache_free(btrfs_transaction_cachep, cur_trans);
 		cur_trans = root->fs_info->running_transaction;
-		atomic_inc(&cur_trans->use_count);
-		atomic_inc(&cur_trans->num_writers);
-		cur_trans->num_joined++;
-		spin_unlock(&root->fs_info->trans_lock);
-		return 0;
+		goto loop;
 	}
+
 	atomic_set(&cur_trans->num_writers, 1);
 	cur_trans->num_joined = 0;
 	init_waitqueue_head(&cur_trans->writer_wait);
-- 
cgit v1.2.3


From 7c7e82a77fe3d89ae50824aa7c897454675eb4c4 Mon Sep 17 00:00:00 2001
From: Chris Mason <chris.mason@oracle.com>
Date: Sun, 6 Nov 2011 18:50:56 -0500
Subject: Btrfs: check for a null fs root when writing to the backup root log

During log replay, can commit the transaction before the fs_root
pointers are setup, so we have to make sure they are not null before
trying to use them.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
---
 fs/btrfs/disk-io.c | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index e532892431f4..e53a5bb85670 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1754,11 +1754,18 @@ static void backup_super_roots(struct btrfs_fs_info *info)
 	btrfs_set_backup_extent_root_level(root_backup,
 			       btrfs_header_level(info->extent_root->node));
 
-	btrfs_set_backup_fs_root(root_backup, info->fs_root->node->start);
-	btrfs_set_backup_fs_root_gen(root_backup,
+	/*
+	 * we might commit during log recovery, which happens before we set
+	 * the fs_root.  Make sure it is valid before we fill it in.
+	 */
+	if (info->fs_root && info->fs_root->node) {
+		btrfs_set_backup_fs_root(root_backup,
+					 info->fs_root->node->start);
+		btrfs_set_backup_fs_root_gen(root_backup,
 			       btrfs_header_generation(info->fs_root->node));
-	btrfs_set_backup_fs_root_level(root_backup,
+		btrfs_set_backup_fs_root_level(root_backup,
 			       btrfs_header_level(info->fs_root->node));
+	}
 
 	btrfs_set_backup_dev_root(root_backup, info->dev_root->node->start);
 	btrfs_set_backup_dev_root_gen(root_backup,
-- 
cgit v1.2.3