From 44b56e0a1aed522a10051645e85d300e10926fd3 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:47:43 +0000
Subject: xfs: convert remaining direct references to m_perag

Convert the remaining direct lookups of the per ag structures to use
get/put accesses. Ensure that the loops across AGs and prior users
of the interface balance gets and puts correctly.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_mount.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs/xfs/xfs_mount.c')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index eb403b40e120..9055b60730d0 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -438,18 +438,20 @@ xfs_initialize_perag(
 			}
 
 			/* This ag is preferred for inodes */
-			pag = &mp->m_perag[index];
+			pag = xfs_perag_get(mp, index);
 			pag->pagi_inodeok = 1;
 			if (index < max_metadata)
 				pag->pagf_metadata = 1;
 			xfs_initialize_perag_icache(pag);
+			xfs_perag_put(pag);
 		}
 	} else {
 		/* Setup default behavior for smaller filesystems */
 		for (index = 0; index < agcount; index++) {
-			pag = &mp->m_perag[index];
+			pag = xfs_perag_get(mp, index);
 			pag->pagi_inodeok = 1;
 			xfs_initialize_perag_icache(pag);
+			xfs_perag_put(pag);
 		}
 	}
 	return index;
@@ -731,12 +733,13 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount)
 		error = xfs_ialloc_pagi_init(mp, NULL, index);
 		if (error)
 			return error;
-		pag = &mp->m_perag[index];
+		pag = xfs_perag_get(mp, index);
 		ifree += pag->pagi_freecount;
 		ialloc += pag->pagi_count;
 		bfree += pag->pagf_freeblks;
 		bfreelst += pag->pagf_flcount;
 		btree += pag->pagf_btreeblks;
+		xfs_perag_put(pag);
 	}
 	/*
 	 * Overwrite incore superblock counters with just-read data
-- 
cgit v1.2.3


From 1c1c6ebcf5284aee4910f3b906ac90c20e510c82 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:47:44 +0000
Subject: xfs: Replace per-ag array with a radix tree

The use of an array for the per-ag structures requires reallocation
of the array when growing the filesystem. This requires locking
access to the array to avoid use after free situations, and the
locking is difficult to get right. To avoid needing to reallocate an
array, change the per-ag structures to an allocated object per ag
and index them using a tree structure.

The AGs are always densely indexed (hence the use of an array), but
the number supported is 2^32 and lookups tend to be random and hence
indexing needs to scale. A simple choice is a radix tree - it works
well with this sort of index.  This change also removes another
large contiguous allocation from the mount/growfs path in XFS.

The growing process now needs to change to only initialise the new
AGs required for the extra space, and as such only needs to
exclusively lock the tree for inserts. The rest of the code only
needs to lock the tree while doing lookups, and hence this will
remove all the deadlocks that currently occur on the m_perag_lock as
it is now an innermost lock. The lock is also changed to a spinlock
from a read/write lock as the hold time is now extremely short.

To complete the picture, the per-ag structures will need to be
reference counted to ensure that we don't free/modify them while
they are still in use.  This will be done in subsequent patch.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_mount.c | 63 +++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 48 insertions(+), 15 deletions(-)

(limited to 'fs/xfs/xfs_mount.c')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 9055b60730d0..c04dd83cb57c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -209,13 +209,16 @@ STATIC void
 xfs_free_perag(
 	xfs_mount_t	*mp)
 {
-	if (mp->m_perag) {
-		int	agno;
+	xfs_agnumber_t	agno;
+	struct xfs_perag *pag;
 
-		for (agno = 0; agno < mp->m_maxagi; agno++)
-			if (mp->m_perag[agno].pagb_list)
-				kmem_free(mp->m_perag[agno].pagb_list);
-		kmem_free(mp->m_perag);
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		spin_lock(&mp->m_perag_lock);
+		pag = radix_tree_delete(&mp->m_perag_tree, agno);
+		spin_unlock(&mp->m_perag_lock);
+		ASSERT(pag);
+		kmem_free(pag->pagb_list);
+		kmem_free(pag);
 	}
 }
 
@@ -389,10 +392,11 @@ xfs_initialize_perag_icache(
 	}
 }
 
-xfs_agnumber_t
+int
 xfs_initialize_perag(
 	xfs_mount_t	*mp,
-	xfs_agnumber_t	agcount)
+	xfs_agnumber_t	agcount,
+	xfs_agnumber_t	*maxagi)
 {
 	xfs_agnumber_t	index, max_metadata;
 	xfs_perag_t	*pag;
@@ -405,6 +409,33 @@ xfs_initialize_perag(
 	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
 	ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino);
 
+	/*
+	 * Walk the current per-ag tree so we don't try to initialise AGs
+	 * that already exist (growfs case). Allocate and insert all the
+	 * AGs we don't find ready for initialisation.
+	 */
+	for (index = 0; index < agcount; index++) {
+		pag = xfs_perag_get(mp, index);
+		if (pag) {
+			xfs_perag_put(pag);
+			continue;
+		}
+		pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
+		if (!pag)
+			return -ENOMEM;
+		if (radix_tree_preload(GFP_NOFS))
+			return -ENOMEM;
+		spin_lock(&mp->m_perag_lock);
+		if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
+			BUG();
+			spin_unlock(&mp->m_perag_lock);
+			kmem_free(pag);
+			return -EEXIST;
+		}
+		spin_unlock(&mp->m_perag_lock);
+		radix_tree_preload_end();
+	}
+
 	/* Clear the mount flag if no inode can overflow 32 bits
 	 * on this filesystem, or if specifically requested..
 	 */
@@ -454,7 +485,9 @@ xfs_initialize_perag(
 			xfs_perag_put(pag);
 		}
 	}
-	return index;
+	if (maxagi)
+		*maxagi = index;
+	return 0;
 }
 
 void
@@ -1155,13 +1188,13 @@ xfs_mountfs(
 	/*
 	 * Allocate and initialize the per-ag data.
 	 */
-	init_rwsem(&mp->m_peraglock);
-	mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t),
-				  KM_MAYFAIL);
-	if (!mp->m_perag)
+	spin_lock_init(&mp->m_perag_lock);
+	INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS);
+	error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi);
+	if (error) {
+		cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error);
 		goto out_remove_uuid;
-
-	mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount);
+	}
 
 	if (!sbp->sb_logblocks) {
 		cmn_err(CE_WARN, "XFS: no log defined");
-- 
cgit v1.2.3


From aed3bb90abaf0b42e8c8747e192f7bb97f445279 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:47:45 +0000
Subject: xfs: Reference count per-ag structures

Reference count the per-ag structures to ensure that we keep get/put
pairs balanced. Assert that the reference counts are zero at unmount
time to catch leaks. In future, reference counts will enable us to
safely remove perag structures by allowing us to detect when they
are no longer in use.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_mount.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/xfs/xfs_mount.c')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index c04dd83cb57c..f241fec26070 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -215,6 +215,7 @@ xfs_free_perag(
 	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
 		spin_lock(&mp->m_perag_lock);
 		pag = radix_tree_delete(&mp->m_perag_tree, agno);
+		ASSERT(atomic_read(&pag->pag_ref) == 0);
 		spin_unlock(&mp->m_perag_lock);
 		ASSERT(pag);
 		kmem_free(pag->pagb_list);
-- 
cgit v1.2.3


From 0fa800fbd549736dfdc1d7761f87e33dc8cd973b Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:47:46 +0000
Subject: xfs: Add trace points for per-ag refcount debugging.

Uninline xfs_perag_{get,put} so that tracepoints can be inserted
into them to speed debugging of reference count problems.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_mount.c | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

(limited to 'fs/xfs/xfs_mount.c')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index f241fec26070..049dbc71c28e 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -200,6 +200,38 @@ xfs_uuid_unmount(
 }
 
 
+/*
+ * Reference counting access wrappers to the perag structures.
+ */
+struct xfs_perag *
+xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno)
+{
+	struct xfs_perag	*pag;
+	int			ref = 0;
+
+	spin_lock(&mp->m_perag_lock);
+	pag = radix_tree_lookup(&mp->m_perag_tree, agno);
+	if (pag) {
+		ASSERT(atomic_read(&pag->pag_ref) >= 0);
+		/* catch leaks in the positive direction during testing */
+		ASSERT(atomic_read(&pag->pag_ref) < 1000);
+		ref = atomic_inc_return(&pag->pag_ref);
+	}
+	spin_unlock(&mp->m_perag_lock);
+	trace_xfs_perag_get(mp, agno, ref, _RET_IP_);
+	return pag;
+}
+
+void
+xfs_perag_put(struct xfs_perag *pag)
+{
+	int	ref;
+
+	ASSERT(atomic_read(&pag->pag_ref) > 0);
+	ref = atomic_dec_return(&pag->pag_ref);
+	trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_);
+}
+
 /*
  * Free up the resources associated with a mount structure.  Assume that
  * the structure was initially zeroed, so we can tell which fields got
@@ -433,6 +465,8 @@ xfs_initialize_perag(
 			kmem_free(pag);
 			return -EEXIST;
 		}
+		pag->pag_agno = index;
+		pag->pag_mount = mp;
 		spin_unlock(&mp->m_perag_lock);
 		radix_tree_preload_end();
 	}
-- 
cgit v1.2.3


From 8b26c5825e023b1bccac7afd174ebe55b8905cb1 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:47:48 +0000
Subject: xfs: handle ENOMEM correctly during initialisation of perag
 structures

Add proper error handling in case an error occurs while initializing
new perag structures for a mount point.  The mount structure is
restored to its previous state by deleting and freeing any perag
structures added during the call.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_mount.c | 21 +++++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

(limited to 'fs/xfs/xfs_mount.c')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 049dbc71c28e..be643e588067 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -432,11 +432,13 @@ xfs_initialize_perag(
 	xfs_agnumber_t	*maxagi)
 {
 	xfs_agnumber_t	index, max_metadata;
+	xfs_agnumber_t	first_initialised = 0;
 	xfs_perag_t	*pag;
 	xfs_agino_t	agino;
 	xfs_ino_t	ino;
 	xfs_sb_t	*sbp = &mp->m_sb;
 	xfs_ino_t	max_inum = XFS_MAXINUMBER_32;
+	int		error = -ENOMEM;
 
 	/* Check to see if the filesystem can overflow 32 bit inodes */
 	agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0);
@@ -453,17 +455,20 @@ xfs_initialize_perag(
 			xfs_perag_put(pag);
 			continue;
 		}
+		if (!first_initialised)
+			first_initialised = index;
 		pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL);
 		if (!pag)
-			return -ENOMEM;
+			goto out_unwind;
 		if (radix_tree_preload(GFP_NOFS))
-			return -ENOMEM;
+			goto out_unwind;
 		spin_lock(&mp->m_perag_lock);
 		if (radix_tree_insert(&mp->m_perag_tree, index, pag)) {
 			BUG();
 			spin_unlock(&mp->m_perag_lock);
-			kmem_free(pag);
-			return -EEXIST;
+			radix_tree_preload_end();
+			error = -EEXIST;
+			goto out_unwind;
 		}
 		pag->pag_agno = index;
 		pag->pag_mount = mp;
@@ -523,6 +528,14 @@ xfs_initialize_perag(
 	if (maxagi)
 		*maxagi = index;
 	return 0;
+
+out_unwind:
+	kmem_free(pag);
+	for (; index > first_initialised; index--) {
+		pag = radix_tree_delete(&mp->m_perag_tree, index);
+		kmem_free(pag);
+	}
+	return error;
 }
 
 void
-- 
cgit v1.2.3


From e57336ff7fc7520bec7b3a7741043bdebaf622ea Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Mon, 11 Jan 2010 11:47:49 +0000
Subject: xfs: embed the pagb_list array in the perag structure

Now that the perag structure is allocated memory rather than held in
an array, we don't need to have the busy extent array external to
the structure. Embed it into the perag structure to avoid needing an
extra allocation when setting up.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_mount.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs/xfs/xfs_mount.c')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index be643e588067..0df5045abd3b 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -247,10 +247,9 @@ xfs_free_perag(
 	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
 		spin_lock(&mp->m_perag_lock);
 		pag = radix_tree_delete(&mp->m_perag_tree, agno);
+		ASSERT(pag);
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
 		spin_unlock(&mp->m_perag_lock);
-		ASSERT(pag);
-		kmem_free(pag->pagb_list);
 		kmem_free(pag);
 	}
 }
-- 
cgit v1.2.3


From 587aa0feb74ffe3239b5e26ff5d017ba9f5daec9 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Wed, 20 Jan 2010 12:04:53 +1100
Subject: xfs: rearrange xfs_mod_sb() to avoid array subscript warning

gcc warns of an array subscript out of bounds in xfs_mod_sb().
The code is written in such a way that if the array subscript is
out of bounds, then it will assert fail. Rearrange the code to
avoid the bounds check warning.

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_mount.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs/xfs/xfs_mount.c')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 0df5045abd3b..d95bd1809f3c 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1631,15 +1631,14 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields)
 	xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields);
 
 	/* find modified range */
+	f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
+	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
+	last = xfs_sb_info[f + 1].offset - 1;
 
 	f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields);
 	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
 	first = xfs_sb_info[f].offset;
 
-	f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields);
-	ASSERT((1LL << f) & XFS_SB_MOD_BITS);
-	last = xfs_sb_info[f + 1].offset - 1;
-
 	xfs_trans_log_buf(tp, bp, first, last);
 }
 
-- 
cgit v1.2.3


From 0cadda1c5f194f98a05d252ff4385d86d2ed0862 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 19 Jan 2010 09:56:44 +0000
Subject: xfs: remove duplicate buffer flags

Currently we define aliases for the buffer flags in various
namespaces, which only adds confusion.  Remove all but the XBF_
flags to clean this up a bit.

Note that we still abuse XFS_B_ASYNC/XBF_ASYNC for some non-buffer
uses, but I'll clean that up later.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <david@fromorbit.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_mount.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/xfs/xfs_mount.c')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d95bd1809f3c..bb0154047e85 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -665,7 +665,7 @@ xfs_readsb(xfs_mount_t *mp, int flags)
 	 * access to the superblock.
 	 */
 	sector_size = xfs_getsize_buftarg(mp->m_ddev_targp);
-	extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED;
+	extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED;
 
 	bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size),
 			  extra_flags);
@@ -1969,7 +1969,7 @@ xfs_getsb(
 
 	ASSERT(mp->m_sb_bp != NULL);
 	bp = mp->m_sb_bp;
-	if (flags & XFS_BUF_TRYLOCK) {
+	if (flags & XBF_TRYLOCK) {
 		if (!XFS_BUF_CPSEMA(bp)) {
 			return NULL;
 		}
-- 
cgit v1.2.3


From a14a348bff2f99471a28e5928eb6801224c053d8 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Tue, 19 Jan 2010 09:56:46 +0000
Subject: xfs: cleanup up xfs_log_force calling conventions

Remove the XFS_LOG_FORCE argument which was always set, and the
XFS_LOG_URGE define, which was never used.

Split xfs_log_force into a two helpers - xfs_log_force which forces
the whole log, and xfs_log_force_lsn which forces up to the
specified LSN.  The underlying implementations already were entirely
separate, as were the users.

Also re-indent the new _xfs_log_force/_xfs_log_force which
previously had a weird coding style.

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_mount.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs/xfs/xfs_mount.c')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index bb0154047e85..7f81ed72c875 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1455,7 +1455,7 @@ xfs_unmountfs(
 	 * push out the iclog we will never get that unlocked. hence we
 	 * need to force the log first.
 	 */
-	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	xfs_log_force(mp, XFS_LOG_SYNC);
 	xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC);
 
 	xfs_qm_unmount(mp);
@@ -1465,7 +1465,7 @@ xfs_unmountfs(
 	 * that nothing is pinned.  This is important because bflush()
 	 * will skip pinned buffers.
 	 */
-	xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
+	xfs_log_force(mp, XFS_LOG_SYNC);
 
 	xfs_binval(mp->m_ddev_targp);
 	if (mp->m_rtdev_targp) {
-- 
cgit v1.2.3


From d5db0f97fbbeff11c88dec1aaf1536a975afbaeb Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Fri, 5 Feb 2010 22:59:53 +0000
Subject: xfs: more reserved blocks fixups

This mangles the reserved blocks counts a little more.

1) add a helper function for the default reserved count
2) add helper functions to save/restore counts on ro/rw
3) save/restore reserved blocks on freeze/thaw
4) disallow changing reserved count while readonly

V2: changed field name to match Dave's changes

Signed-off-by: Eric Sandeen <sandeen@sandeen.net>
Signed-off-by: Alex Elder <aelder@sgi.com>
---
 fs/xfs/xfs_mount.c | 34 +++++++++++++++++++++++-----------
 1 file changed, 23 insertions(+), 11 deletions(-)

(limited to 'fs/xfs/xfs_mount.c')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 7f81ed72c875..5061149b2cc4 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1091,6 +1091,22 @@ xfs_mount_reset_sbqflags(
 	return xfs_trans_commit(tp, 0);
 }
 
+__uint64_t
+xfs_default_resblks(xfs_mount_t *mp)
+{
+	__uint64_t resblks;
+
+	/*
+	 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
+	 * This may drive us straight to ENOSPC on mount, but that implies
+	 * we were already there on the last unmount. Warn if this occurs.
+	 */
+	resblks = mp->m_sb.sb_dblocks;
+	do_div(resblks, 20);
+	resblks = min_t(__uint64_t, resblks, 1024);
+	return resblks;
+}
+
 /*
  * This function does the following on an initial mount of a file system:
  *	- reads the superblock from disk and init the mount struct
@@ -1401,18 +1417,14 @@ xfs_mountfs(
 	 * when at ENOSPC. This is needed for operations like create with
 	 * attr, unwritten extent conversion at ENOSPC, etc. Data allocations
 	 * are not allowed to use this reserved space.
-	 *
-	 * We default to 5% or 1024 fsbs of space reserved, whichever is smaller.
-	 * This may drive us straight to ENOSPC on mount, but that implies
-	 * we were already there on the last unmount. Warn if this occurs.
 	 */
-	resblks = mp->m_sb.sb_dblocks;
-	do_div(resblks, 20);
-	resblks = min_t(__uint64_t, resblks, 1024);
-	error = xfs_reserve_blocks(mp, &resblks, NULL);
-	if (error)
-		cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. "
-				"Continuing without a reserve pool.");
+	if (!(mp->m_flags & XFS_MOUNT_RDONLY)) {
+		resblks = xfs_default_resblks(mp);
+		error = xfs_reserve_blocks(mp, &resblks, NULL);
+		if (error)
+			cmn_err(CE_WARN, "XFS: Unable to allocate reserve "
+				"blocks. Continuing without a reserve pool.");
+	}
 
 	return 0;
 
-- 
cgit v1.2.3


From c854363e80b49dd04a4de18ebc379eb8c8806674 Mon Sep 17 00:00:00 2001
From: Dave Chinner <david@fromorbit.com>
Date: Sat, 6 Feb 2010 12:39:36 +1100
Subject: xfs: Use delayed write for inodes rather than async V2

We currently do background inode flush asynchronously, resulting in
inodes being written in whatever order the background writeback
issues them. Not only that, there are also blocking and non-blocking
asynchronous inode flushes, depending on where the flush comes from.

This patch completely removes asynchronous inode writeback. It
removes all the strange writeback modes and replaces them with
either a synchronous flush or a non-blocking delayed write flush.
That is, inode flushes will only issue IO directly if they are
synchronous, and background flushing may do nothing if the operation
would block (e.g. on a pinned inode or buffer lock).

Delayed write flushes will now result in the inode buffer sitting in
the delwri queue of the buffer cache to be flushed by either an AIL
push or by the xfsbufd timing out the buffer. This will allow
accumulation of dirty inode buffers in memory and allow optimisation
of inode cluster writeback at the xfsbufd level where we have much
greater queue depths than the block layer elevators. We will also
get adjacent inode cluster buffer IO merging for free when a later
patch in the series allows sorting of the delayed write buffers
before dispatch.

This effectively means that any inode that is written back by
background writeback will be seen as flush locked during AIL
pushing, and will result in the buffers being pushed from there.
This writeback path is currently non-optimal, but the next patch
in the series will fix that problem.

A side effect of this delayed write mechanism is that background
inode reclaim will no longer directly flush inodes, nor can it wait
on the flush lock. The result is that inode reclaim must leave the
inode in the reclaimable state until it is clean. Hence attempts to
reclaim a dirty inode in the background will simply skip the inode
until it is clean and this allows other mechanisms (i.e. xfsbufd) to
do more optimal writeback of the dirty buffers. As a result, the
inode reclaim code has been rewritten so that it no longer relies on
the ambiguous return values of xfs_iflush() to determine whether it
is safe to reclaim an inode.

Portions of this patch are derived from patches by Christoph
Hellwig.

Version 2:
- cleanup reclaim code as suggested by Christoph
- log background reclaim inode flush errors
- just pass sync flags to xfs_iflush

Signed-off-by: Dave Chinner <david@fromorbit.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
---
 fs/xfs/xfs_mount.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'fs/xfs/xfs_mount.c')

diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 5061149b2cc4..6afaaeb2950a 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1468,7 +1468,18 @@ xfs_unmountfs(
 	 * need to force the log first.
 	 */
 	xfs_log_force(mp, XFS_LOG_SYNC);
-	xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC);
+
+	/*
+	 * Do a delwri reclaim pass first so that as many dirty inodes are
+	 * queued up for IO as possible. Then flush the buffers before making
+	 * a synchronous path to catch all the remaining inodes are reclaimed.
+	 * This makes the reclaim process as quick as possible by avoiding
+	 * synchronous writeout and blocking on inodes already in the delwri
+	 * state as much as possible.
+	 */
+	xfs_reclaim_inodes(mp, 0);
+	XFS_bflush(mp->m_ddev_targp);
+	xfs_reclaim_inodes(mp, SYNC_WAIT);
 
 	xfs_qm_unmount(mp);
 
-- 
cgit v1.2.3