From 44b56e0a1aed522a10051645e85d300e10926fd3 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 11 Jan 2010 11:47:43 +0000 Subject: xfs: convert remaining direct references to m_perag Convert the remaining direct lookups of the per ag structures to use get/put accesses. Ensure that the loops across AGs and prior users of the interface balance gets and puts correctly. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_mount.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index eb403b40e120..9055b60730d0 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -438,18 +438,20 @@ xfs_initialize_perag( } /* This ag is preferred for inodes */ - pag = &mp->m_perag[index]; + pag = xfs_perag_get(mp, index); pag->pagi_inodeok = 1; if (index < max_metadata) pag->pagf_metadata = 1; xfs_initialize_perag_icache(pag); + xfs_perag_put(pag); } } else { /* Setup default behavior for smaller filesystems */ for (index = 0; index < agcount; index++) { - pag = &mp->m_perag[index]; + pag = xfs_perag_get(mp, index); pag->pagi_inodeok = 1; xfs_initialize_perag_icache(pag); + xfs_perag_put(pag); } } return index; @@ -731,12 +733,13 @@ xfs_initialize_perag_data(xfs_mount_t *mp, xfs_agnumber_t agcount) error = xfs_ialloc_pagi_init(mp, NULL, index); if (error) return error; - pag = &mp->m_perag[index]; + pag = xfs_perag_get(mp, index); ifree += pag->pagi_freecount; ialloc += pag->pagi_count; bfree += pag->pagf_freeblks; bfreelst += pag->pagf_flcount; btree += pag->pagf_btreeblks; + xfs_perag_put(pag); } /* * Overwrite incore superblock counters with just-read data -- cgit v1.2.3 From 1c1c6ebcf5284aee4910f3b906ac90c20e510c82 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 11 Jan 2010 11:47:44 +0000 Subject: xfs: Replace per-ag array with a radix tree The use of an array for the per-ag structures requires reallocation of the array when growing the filesystem. This requires locking access to the array to avoid use after free situations, and the locking is difficult to get right. To avoid needing to reallocate an array, change the per-ag structures to an allocated object per ag and index them using a tree structure. The AGs are always densely indexed (hence the use of an array), but the number supported is 2^32 and lookups tend to be random and hence indexing needs to scale. A simple choice is a radix tree - it works well with this sort of index. This change also removes another large contiguous allocation from the mount/growfs path in XFS. The growing process now needs to change to only initialise the new AGs required for the extra space, and as such only needs to exclusively lock the tree for inserts. The rest of the code only needs to lock the tree while doing lookups, and hence this will remove all the deadlocks that currently occur on the m_perag_lock as it is now an innermost lock. The lock is also changed to a spinlock from a read/write lock as the hold time is now extremely short. To complete the picture, the per-ag structures will need to be reference counted to ensure that we don't free/modify them while they are still in use. This will be done in subsequent patch. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_mount.c | 63 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 48 insertions(+), 15 deletions(-) (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 9055b60730d0..c04dd83cb57c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -209,13 +209,16 @@ STATIC void xfs_free_perag( xfs_mount_t *mp) { - if (mp->m_perag) { - int agno; + xfs_agnumber_t agno; + struct xfs_perag *pag; - for (agno = 0; agno < mp->m_maxagi; agno++) - if (mp->m_perag[agno].pagb_list) - kmem_free(mp->m_perag[agno].pagb_list); - kmem_free(mp->m_perag); + for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { + spin_lock(&mp->m_perag_lock); + pag = radix_tree_delete(&mp->m_perag_tree, agno); + spin_unlock(&mp->m_perag_lock); + ASSERT(pag); + kmem_free(pag->pagb_list); + kmem_free(pag); } } @@ -389,10 +392,11 @@ xfs_initialize_perag_icache( } } -xfs_agnumber_t +int xfs_initialize_perag( xfs_mount_t *mp, - xfs_agnumber_t agcount) + xfs_agnumber_t agcount, + xfs_agnumber_t *maxagi) { xfs_agnumber_t index, max_metadata; xfs_perag_t *pag; @@ -405,6 +409,33 @@ xfs_initialize_perag( agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); ino = XFS_AGINO_TO_INO(mp, agcount - 1, agino); + /* + * Walk the current per-ag tree so we don't try to initialise AGs + * that already exist (growfs case). Allocate and insert all the + * AGs we don't find ready for initialisation. + */ + for (index = 0; index < agcount; index++) { + pag = xfs_perag_get(mp, index); + if (pag) { + xfs_perag_put(pag); + continue; + } + pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); + if (!pag) + return -ENOMEM; + if (radix_tree_preload(GFP_NOFS)) + return -ENOMEM; + spin_lock(&mp->m_perag_lock); + if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { + BUG(); + spin_unlock(&mp->m_perag_lock); + kmem_free(pag); + return -EEXIST; + } + spin_unlock(&mp->m_perag_lock); + radix_tree_preload_end(); + } + /* Clear the mount flag if no inode can overflow 32 bits * on this filesystem, or if specifically requested.. */ @@ -454,7 +485,9 @@ xfs_initialize_perag( xfs_perag_put(pag); } } - return index; + if (maxagi) + *maxagi = index; + return 0; } void @@ -1155,13 +1188,13 @@ xfs_mountfs( /* * Allocate and initialize the per-ag data. */ - init_rwsem(&mp->m_peraglock); - mp->m_perag = kmem_zalloc(sbp->sb_agcount * sizeof(xfs_perag_t), - KM_MAYFAIL); - if (!mp->m_perag) + spin_lock_init(&mp->m_perag_lock); + INIT_RADIX_TREE(&mp->m_perag_tree, GFP_NOFS); + error = xfs_initialize_perag(mp, sbp->sb_agcount, &mp->m_maxagi); + if (error) { + cmn_err(CE_WARN, "XFS: Failed per-ag init: %d", error); goto out_remove_uuid; - - mp->m_maxagi = xfs_initialize_perag(mp, sbp->sb_agcount); + } if (!sbp->sb_logblocks) { cmn_err(CE_WARN, "XFS: no log defined"); -- cgit v1.2.3 From aed3bb90abaf0b42e8c8747e192f7bb97f445279 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 11 Jan 2010 11:47:45 +0000 Subject: xfs: Reference count per-ag structures Reference count the per-ag structures to ensure that we keep get/put pairs balanced. Assert that the reference counts are zero at unmount time to catch leaks. In future, reference counts will enable us to safely remove perag structures by allowing us to detect when they are no longer in use. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_mount.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index c04dd83cb57c..f241fec26070 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -215,6 +215,7 @@ xfs_free_perag( for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { spin_lock(&mp->m_perag_lock); pag = radix_tree_delete(&mp->m_perag_tree, agno); + ASSERT(atomic_read(&pag->pag_ref) == 0); spin_unlock(&mp->m_perag_lock); ASSERT(pag); kmem_free(pag->pagb_list); -- cgit v1.2.3 From 0fa800fbd549736dfdc1d7761f87e33dc8cd973b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 11 Jan 2010 11:47:46 +0000 Subject: xfs: Add trace points for per-ag refcount debugging. Uninline xfs_perag_{get,put} so that tracepoints can be inserted into them to speed debugging of reference count problems. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_mount.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index f241fec26070..049dbc71c28e 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -200,6 +200,38 @@ xfs_uuid_unmount( } +/* + * Reference counting access wrappers to the perag structures. + */ +struct xfs_perag * +xfs_perag_get(struct xfs_mount *mp, xfs_agnumber_t agno) +{ + struct xfs_perag *pag; + int ref = 0; + + spin_lock(&mp->m_perag_lock); + pag = radix_tree_lookup(&mp->m_perag_tree, agno); + if (pag) { + ASSERT(atomic_read(&pag->pag_ref) >= 0); + /* catch leaks in the positive direction during testing */ + ASSERT(atomic_read(&pag->pag_ref) < 1000); + ref = atomic_inc_return(&pag->pag_ref); + } + spin_unlock(&mp->m_perag_lock); + trace_xfs_perag_get(mp, agno, ref, _RET_IP_); + return pag; +} + +void +xfs_perag_put(struct xfs_perag *pag) +{ + int ref; + + ASSERT(atomic_read(&pag->pag_ref) > 0); + ref = atomic_dec_return(&pag->pag_ref); + trace_xfs_perag_put(pag->pag_mount, pag->pag_agno, ref, _RET_IP_); +} + /* * Free up the resources associated with a mount structure. Assume that * the structure was initially zeroed, so we can tell which fields got @@ -433,6 +465,8 @@ xfs_initialize_perag( kmem_free(pag); return -EEXIST; } + pag->pag_agno = index; + pag->pag_mount = mp; spin_unlock(&mp->m_perag_lock); radix_tree_preload_end(); } -- cgit v1.2.3 From 8b26c5825e023b1bccac7afd174ebe55b8905cb1 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 11 Jan 2010 11:47:48 +0000 Subject: xfs: handle ENOMEM correctly during initialisation of perag structures Add proper error handling in case an error occurs while initializing new perag structures for a mount point. The mount structure is restored to its previous state by deleting and freeing any perag structures added during the call. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_mount.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 049dbc71c28e..be643e588067 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -432,11 +432,13 @@ xfs_initialize_perag( xfs_agnumber_t *maxagi) { xfs_agnumber_t index, max_metadata; + xfs_agnumber_t first_initialised = 0; xfs_perag_t *pag; xfs_agino_t agino; xfs_ino_t ino; xfs_sb_t *sbp = &mp->m_sb; xfs_ino_t max_inum = XFS_MAXINUMBER_32; + int error = -ENOMEM; /* Check to see if the filesystem can overflow 32 bit inodes */ agino = XFS_OFFBNO_TO_AGINO(mp, sbp->sb_agblocks - 1, 0); @@ -453,17 +455,20 @@ xfs_initialize_perag( xfs_perag_put(pag); continue; } + if (!first_initialised) + first_initialised = index; pag = kmem_zalloc(sizeof(*pag), KM_MAYFAIL); if (!pag) - return -ENOMEM; + goto out_unwind; if (radix_tree_preload(GFP_NOFS)) - return -ENOMEM; + goto out_unwind; spin_lock(&mp->m_perag_lock); if (radix_tree_insert(&mp->m_perag_tree, index, pag)) { BUG(); spin_unlock(&mp->m_perag_lock); - kmem_free(pag); - return -EEXIST; + radix_tree_preload_end(); + error = -EEXIST; + goto out_unwind; } pag->pag_agno = index; pag->pag_mount = mp; @@ -523,6 +528,14 @@ xfs_initialize_perag( if (maxagi) *maxagi = index; return 0; + +out_unwind: + kmem_free(pag); + for (; index > first_initialised; index--) { + pag = radix_tree_delete(&mp->m_perag_tree, index); + kmem_free(pag); + } + return error; } void -- cgit v1.2.3 From e57336ff7fc7520bec7b3a7741043bdebaf622ea Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 11 Jan 2010 11:47:49 +0000 Subject: xfs: embed the pagb_list array in the perag structure Now that the perag structure is allocated memory rather than held in an array, we don't need to have the busy extent array external to the structure. Embed it into the perag structure to avoid needing an extra allocation when setting up. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_mount.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index be643e588067..0df5045abd3b 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -247,10 +247,9 @@ xfs_free_perag( for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) { spin_lock(&mp->m_perag_lock); pag = radix_tree_delete(&mp->m_perag_tree, agno); + ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); spin_unlock(&mp->m_perag_lock); - ASSERT(pag); - kmem_free(pag->pagb_list); kmem_free(pag); } } -- cgit v1.2.3 From 587aa0feb74ffe3239b5e26ff5d017ba9f5daec9 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 20 Jan 2010 12:04:53 +1100 Subject: xfs: rearrange xfs_mod_sb() to avoid array subscript warning gcc warns of an array subscript out of bounds in xfs_mod_sb(). The code is written in such a way that if the array subscript is out of bounds, then it will assert fail. Rearrange the code to avoid the bounds check warning. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_mount.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 0df5045abd3b..d95bd1809f3c 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1631,15 +1631,14 @@ xfs_mod_sb(xfs_trans_t *tp, __int64_t fields) xfs_sb_to_disk(XFS_BUF_TO_SBP(bp), &mp->m_sb, fields); /* find modified range */ + f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields); + ASSERT((1LL << f) & XFS_SB_MOD_BITS); + last = xfs_sb_info[f + 1].offset - 1; f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); ASSERT((1LL << f) & XFS_SB_MOD_BITS); first = xfs_sb_info[f].offset; - f = (xfs_sb_field_t)xfs_highbit64((__uint64_t)fields); - ASSERT((1LL << f) & XFS_SB_MOD_BITS); - last = xfs_sb_info[f + 1].offset - 1; - xfs_trans_log_buf(tp, bp, first, last); } -- cgit v1.2.3 From 0cadda1c5f194f98a05d252ff4385d86d2ed0862 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jan 2010 09:56:44 +0000 Subject: xfs: remove duplicate buffer flags Currently we define aliases for the buffer flags in various namespaces, which only adds confusion. Remove all but the XBF_ flags to clean this up a bit. Note that we still abuse XFS_B_ASYNC/XBF_ASYNC for some non-buffer uses, but I'll clean that up later. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Signed-off-by: Alex Elder --- fs/xfs/xfs_mount.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index d95bd1809f3c..bb0154047e85 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -665,7 +665,7 @@ xfs_readsb(xfs_mount_t *mp, int flags) * access to the superblock. */ sector_size = xfs_getsize_buftarg(mp->m_ddev_targp); - extra_flags = XFS_BUF_LOCK | XFS_BUF_MANAGE | XFS_BUF_MAPPED; + extra_flags = XBF_LOCK | XBF_FS_MANAGED | XBF_MAPPED; bp = xfs_buf_read(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), extra_flags); @@ -1969,7 +1969,7 @@ xfs_getsb( ASSERT(mp->m_sb_bp != NULL); bp = mp->m_sb_bp; - if (flags & XFS_BUF_TRYLOCK) { + if (flags & XBF_TRYLOCK) { if (!XFS_BUF_CPSEMA(bp)) { return NULL; } -- cgit v1.2.3 From a14a348bff2f99471a28e5928eb6801224c053d8 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 19 Jan 2010 09:56:46 +0000 Subject: xfs: cleanup up xfs_log_force calling conventions Remove the XFS_LOG_FORCE argument which was always set, and the XFS_LOG_URGE define, which was never used. Split xfs_log_force into a two helpers - xfs_log_force which forces the whole log, and xfs_log_force_lsn which forces up to the specified LSN. The underlying implementations already were entirely separate, as were the users. Also re-indent the new _xfs_log_force/_xfs_log_force which previously had a weird coding style. Signed-off-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_mount.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index bb0154047e85..7f81ed72c875 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1455,7 +1455,7 @@ xfs_unmountfs( * push out the iclog we will never get that unlocked. hence we * need to force the log first. */ - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); + xfs_log_force(mp, XFS_LOG_SYNC); xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC); xfs_qm_unmount(mp); @@ -1465,7 +1465,7 @@ xfs_unmountfs( * that nothing is pinned. This is important because bflush() * will skip pinned buffers. */ - xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC); + xfs_log_force(mp, XFS_LOG_SYNC); xfs_binval(mp->m_ddev_targp); if (mp->m_rtdev_targp) { -- cgit v1.2.3 From d5db0f97fbbeff11c88dec1aaf1536a975afbaeb Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 5 Feb 2010 22:59:53 +0000 Subject: xfs: more reserved blocks fixups This mangles the reserved blocks counts a little more. 1) add a helper function for the default reserved count 2) add helper functions to save/restore counts on ro/rw 3) save/restore reserved blocks on freeze/thaw 4) disallow changing reserved count while readonly V2: changed field name to match Dave's changes Signed-off-by: Eric Sandeen Signed-off-by: Alex Elder --- fs/xfs/xfs_mount.c | 34 +++++++++++++++++++++++----------- 1 file changed, 23 insertions(+), 11 deletions(-) (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 7f81ed72c875..5061149b2cc4 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1091,6 +1091,22 @@ xfs_mount_reset_sbqflags( return xfs_trans_commit(tp, 0); } +__uint64_t +xfs_default_resblks(xfs_mount_t *mp) +{ + __uint64_t resblks; + + /* + * We default to 5% or 1024 fsbs of space reserved, whichever is smaller. + * This may drive us straight to ENOSPC on mount, but that implies + * we were already there on the last unmount. Warn if this occurs. + */ + resblks = mp->m_sb.sb_dblocks; + do_div(resblks, 20); + resblks = min_t(__uint64_t, resblks, 1024); + return resblks; +} + /* * This function does the following on an initial mount of a file system: * - reads the superblock from disk and init the mount struct @@ -1401,18 +1417,14 @@ xfs_mountfs( * when at ENOSPC. This is needed for operations like create with * attr, unwritten extent conversion at ENOSPC, etc. Data allocations * are not allowed to use this reserved space. - * - * We default to 5% or 1024 fsbs of space reserved, whichever is smaller. - * This may drive us straight to ENOSPC on mount, but that implies - * we were already there on the last unmount. Warn if this occurs. */ - resblks = mp->m_sb.sb_dblocks; - do_div(resblks, 20); - resblks = min_t(__uint64_t, resblks, 1024); - error = xfs_reserve_blocks(mp, &resblks, NULL); - if (error) - cmn_err(CE_WARN, "XFS: Unable to allocate reserve blocks. " - "Continuing without a reserve pool."); + if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { + resblks = xfs_default_resblks(mp); + error = xfs_reserve_blocks(mp, &resblks, NULL); + if (error) + cmn_err(CE_WARN, "XFS: Unable to allocate reserve " + "blocks. Continuing without a reserve pool."); + } return 0; -- cgit v1.2.3 From c854363e80b49dd04a4de18ebc379eb8c8806674 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Sat, 6 Feb 2010 12:39:36 +1100 Subject: xfs: Use delayed write for inodes rather than async V2 We currently do background inode flush asynchronously, resulting in inodes being written in whatever order the background writeback issues them. Not only that, there are also blocking and non-blocking asynchronous inode flushes, depending on where the flush comes from. This patch completely removes asynchronous inode writeback. It removes all the strange writeback modes and replaces them with either a synchronous flush or a non-blocking delayed write flush. That is, inode flushes will only issue IO directly if they are synchronous, and background flushing may do nothing if the operation would block (e.g. on a pinned inode or buffer lock). Delayed write flushes will now result in the inode buffer sitting in the delwri queue of the buffer cache to be flushed by either an AIL push or by the xfsbufd timing out the buffer. This will allow accumulation of dirty inode buffers in memory and allow optimisation of inode cluster writeback at the xfsbufd level where we have much greater queue depths than the block layer elevators. We will also get adjacent inode cluster buffer IO merging for free when a later patch in the series allows sorting of the delayed write buffers before dispatch. This effectively means that any inode that is written back by background writeback will be seen as flush locked during AIL pushing, and will result in the buffers being pushed from there. This writeback path is currently non-optimal, but the next patch in the series will fix that problem. A side effect of this delayed write mechanism is that background inode reclaim will no longer directly flush inodes, nor can it wait on the flush lock. The result is that inode reclaim must leave the inode in the reclaimable state until it is clean. Hence attempts to reclaim a dirty inode in the background will simply skip the inode until it is clean and this allows other mechanisms (i.e. xfsbufd) to do more optimal writeback of the dirty buffers. As a result, the inode reclaim code has been rewritten so that it no longer relies on the ambiguous return values of xfs_iflush() to determine whether it is safe to reclaim an inode. Portions of this patch are derived from patches by Christoph Hellwig. Version 2: - cleanup reclaim code as suggested by Christoph - log background reclaim inode flush errors - just pass sync flags to xfs_iflush Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_mount.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs/xfs/xfs_mount.c') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 5061149b2cc4..6afaaeb2950a 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1468,7 +1468,18 @@ xfs_unmountfs( * need to force the log first. */ xfs_log_force(mp, XFS_LOG_SYNC); - xfs_reclaim_inodes(mp, XFS_IFLUSH_ASYNC); + + /* + * Do a delwri reclaim pass first so that as many dirty inodes are + * queued up for IO as possible. Then flush the buffers before making + * a synchronous path to catch all the remaining inodes are reclaimed. + * This makes the reclaim process as quick as possible by avoiding + * synchronous writeout and blocking on inodes already in the delwri + * state as much as possible. + */ + xfs_reclaim_inodes(mp, 0); + XFS_bflush(mp->m_ddev_targp); + xfs_reclaim_inodes(mp, SYNC_WAIT); xfs_qm_unmount(mp); -- cgit v1.2.3