From 6e6938b6d3130305a5960c86b1a9b21e58cf6144 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Sun, 6 Jun 2010 10:38:15 -0600
Subject: writeback: introduce .tagged_writepages for the WB_SYNC_NONE sync
 stage

sync(2) is performed in two stages: the WB_SYNC_NONE sync and the
WB_SYNC_ALL sync. Identify the first stage with .tagged_writepages and
do livelock prevention for it, too.

Jan's commit f446daaea9 ("mm: implement writeback livelock avoidance
using page tagging") is a partial fix in that it only fixed the
WB_SYNC_ALL phase livelock.

Although ext4 is tested to no longer livelock with commit f446daaea9,
it may be due to some "redirty_tail() after pages_skipped" effect which
is by no means a guarantee for _all_ the file systems.

Note that writeback_inodes_sb() is called not only by sync(); all of its
callers are treated the same because the other callers also need
livelock prevention.

Impact:  It changes the order in which pages/inodes are synced to disk.
Now in the WB_SYNC_NONE stage, it won't proceed to write the next inode
until finished with the current inode.

Acked-by: Jan Kara <jack@suse.cz>
CC: Dave Chinner <david@fromorbit.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/ext4/inode.c           |  4 ++--
 fs/fs-writeback.c         | 17 +++++++++--------
 include/linux/writeback.h |  1 +
 mm/page-writeback.c       |  4 ++--
 4 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a5763e3505ba..8558b6c3450a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2741,7 +2741,7 @@ static int write_cache_pages_da(struct address_space *mapping,
 	index = wbc->range_start >> PAGE_CACHE_SHIFT;
 	end = wbc->range_end >> PAGE_CACHE_SHIFT;
 
-	if (wbc->sync_mode == WB_SYNC_ALL)
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
 		tag = PAGECACHE_TAG_TOWRITE;
 	else
 		tag = PAGECACHE_TAG_DIRTY;
@@ -2973,7 +2973,7 @@ static int ext4_da_writepages(struct address_space *mapping,
 	}
 
 retry:
-	if (wbc->sync_mode == WB_SYNC_ALL)
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
 		tag_pages_for_writeback(mapping, index, end);
 
 	while (!ret && wbc->nr_to_write > 0) {
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 0f015a0468de..5ed2ce9a28d0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -36,6 +36,7 @@ struct wb_writeback_work {
 	long nr_pages;
 	struct super_block *sb;
 	enum writeback_sync_modes sync_mode;
+	unsigned int tagged_writepages:1;
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
 	unsigned int for_background:1;
@@ -650,6 +651,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 {
 	struct writeback_control wbc = {
 		.sync_mode		= work->sync_mode,
+		.tagged_writepages	= work->tagged_writepages,
 		.older_than_this	= NULL,
 		.for_kupdate		= work->for_kupdate,
 		.for_background		= work->for_background,
@@ -657,7 +659,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 	};
 	unsigned long oldest_jif;
 	long wrote = 0;
-	long write_chunk;
+	long write_chunk = MAX_WRITEBACK_PAGES;
 	struct inode *inode;
 
 	if (wbc.for_kupdate) {
@@ -683,9 +685,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 	 *                   (quickly) tag currently dirty pages
 	 *                   (maybe slowly) sync all tagged pages
 	 */
-	if (wbc.sync_mode == WB_SYNC_NONE)
-		write_chunk = MAX_WRITEBACK_PAGES;
-	else
+	if (wbc.sync_mode == WB_SYNC_ALL || wbc.tagged_writepages)
 		write_chunk = LONG_MAX;
 
 	wbc.wb_start = jiffies; /* livelock avoidance */
@@ -1188,10 +1188,11 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
-		.sb		= sb,
-		.sync_mode	= WB_SYNC_NONE,
-		.done		= &done,
-		.nr_pages	= nr,
+		.sb			= sb,
+		.sync_mode		= WB_SYNC_NONE,
+		.tagged_writepages	= 1,
+		.done			= &done,
+		.nr_pages		= nr,
 	};
 
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 17e7ccc322a5..3f6542ca6198 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -47,6 +47,7 @@ struct writeback_control {
 	unsigned encountered_congestion:1; /* An output: a queue is full */
 	unsigned for_kupdate:1;		/* A kupdate writeback */
 	unsigned for_background:1;	/* A background writeback */
+	unsigned tagged_writepages:1;	/* tag-and-write to avoid livelock */
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
 	unsigned more_io:1;		/* more io to be dispatched */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 31f698862420..955fe35d01e0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -892,12 +892,12 @@ int write_cache_pages(struct address_space *mapping,
 			range_whole = 1;
 		cycled = 1; /* ignore range_cyclic tests */
 	}
-	if (wbc->sync_mode == WB_SYNC_ALL)
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
 		tag = PAGECACHE_TAG_TOWRITE;
 	else
 		tag = PAGECACHE_TAG_DIRTY;
 retry:
-	if (wbc->sync_mode == WB_SYNC_ALL)
+	if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
 		tag_pages_for_writeback(mapping, index, end);
 	done_index = index;
 	while (!done && (index <= end)) {
-- 
cgit v1.2.3


From 94c3dcbb0b0cdfd82cedd21705424d8044edc42c Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 27 Apr 2011 19:05:21 -0600
Subject: writeback: update dirtied_when for synced inode to prevent livelock

Explicitly update .dirtied_when on synced inodes, so that they are no
longer considered for writeback in the next round.

It can prevent both of the following livelock schemes:

- while true; do echo data >> f; done
- while true; do touch f;        done (in theory)

The exact livelock condition is, during sync(1):

(1) no new inodes are dirtied
(2) an inode being actively dirtied

On (2), the inode will be tagged and synced with .nr_to_write=LONG_MAX.
When finished, it will be redirty_tail()ed because it's still dirty
and (.nr_to_write > 0). redirty_tail() won't update its ->dirtied_when
on condition (1). The sync work will then revisit it on the next
queue_io() and find it eligible again because its old ->dirtied_when
predates the sync work start time.

We'll do more aggressive "keep writeback as long as we wrote something"
logic in wb_writeback(). The "use LONG_MAX .nr_to_write" trick in commit
b9543dac5bbc ("writeback: avoid livelocking WB_SYNC_ALL writeback") will
no longer be enough to stop sync livelock.

Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5ed2ce9a28d0..fe190a8b0bc8 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -419,6 +419,15 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	spin_lock(&inode->i_lock);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & I_FREEING)) {
+		/*
+		 * Sync livelock prevention. Each inode is tagged and synced in
+		 * one shot. If still dirty, it will be redirty_tail()'ed below.
+		 * Update the dirty time to prevent enqueue and sync it again.
+		 */
+		if ((inode->i_state & I_DIRTY) &&
+		    (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages))
+			inode->dirtied_when = jiffies;
+
 		if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
 			/*
 			 * We didn't write back all the pages.  nfs_writepages()
-- 
cgit v1.2.3


From cb9bd1159c5fe8995e151fa7df10fa19f8c119cc Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 21 Jul 2010 22:50:57 -0600
Subject: writeback: introduce writeback_control.inodes_written

The flusher works on dirty inodes in batches, and may quit prematurely
if the batch of inodes happens to be metadata-only dirtied: in this case
wbc->nr_to_write won't be decreased at all, which stands for "no pages
written" but is also misinterpreted as "no progress".

So introduce writeback_control.inodes_written to count the inodes that
get cleaned from the VFS point of view.  A non-zero value means there is
some progress on writeback, in which case more writeback can be tried.

Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c         | 4 ++++
 include/linux/writeback.h | 1 +
 2 files changed, 5 insertions(+)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index fe190a8b0bc8..e4504299f4a5 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -464,6 +464,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			 * No need to add it back to the LRU.
 			 */
 			list_del_init(&inode->i_wb_list);
+			wbc->inodes_written++;
 		}
 	}
 	inode_sync_complete(inode);
@@ -725,6 +726,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wbc.more_io = 0;
 		wbc.nr_to_write = write_chunk;
 		wbc.pages_skipped = 0;
+		wbc.inodes_written = 0;
 
 		trace_wbc_writeback_start(&wbc, wb->bdi);
 		if (work->sb)
@@ -741,6 +743,8 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 */
 		if (wbc.nr_to_write <= 0)
 			continue;
+		if (wbc.inodes_written)
+			continue;
 		/*
 		 * Didn't write everything and we don't have more IO, bail
 		 */
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 3f6542ca6198..7df9026f7129 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -34,6 +34,7 @@ struct writeback_control {
 	long nr_to_write;		/* Write this many pages, and decrement
 					   this for each page written */
 	long pages_skipped;		/* Pages which were not written */
+	long inodes_written;		/* # of inodes written (at least) */
 
 	/*
 	 * For a_ops->writepages(): is start or end are non-zero then this is
-- 
cgit v1.2.3


From e6fb6da2e10682d477f2fdb749451d9fe5d168e8 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Thu, 22 Jul 2010 10:23:44 -0600
Subject: writeback: try more writeback as long as something was written

writeback_inodes_wb()/__writeback_inodes_sb() are not aggressive in that
they only populate possibly a subset of eligible inodes into b_io at
entrance time. When the queued set of inodes are all synced, they just
return, possibly with all queued inode pages written but still
wbc.nr_to_write > 0.

For kupdate and background writeback, there may be more eligible inodes
sitting in b_dirty when the current set of b_io inodes are completed. So
it is necessary to try another round of writeback as long as we made some
progress in this round. When there are no more eligible inodes, no more
inodes will be enqueued in queue_io(), hence nothing could/will be
synced and we may safely bail.

For example, imagine 100 inodes

        i0, i1, i2, ..., i90, i91, ..., i99

At queue_io() time, i90-i99 happen to be expired and moved to b_io for
IO. When finished successfully, if their total size is less than
MAX_WRITEBACK_PAGES, nr_to_write will be > 0. Then wb_writeback() will
quit the background work (w/o this patch) while it's still over
background threshold. This will be a fairly normal/frequent case I guess.

Now that we do tagged sync and update inode->dirtied_when after the sync,
this change won't livelock sync(1).  I actually tried to write 1 page
per 1ms with this command

	write-and-fsync -n10000 -S 1000 -c 4096 /fs/test

and do sync(1) at the same time. The sync completes quickly on ext4,
xfs, btrfs.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index e4504299f4a5..271cf2150ba0 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -739,22 +739,22 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wrote += write_chunk - wbc.nr_to_write;
 
 		/*
-		 * If we consumed everything, see if we have more
+		 * Did we write something? Try for more
+		 *
+		 * Dirty inodes are moved to b_io for writeback in batches.
+		 * The completion of the current batch does not necessarily
+		 * mean the overall work is done. So we keep looping as long
+		 * as made some progress on cleaning pages or inodes.
 		 */
-		if (wbc.nr_to_write <= 0)
+		if (wbc.nr_to_write < write_chunk)
 			continue;
 		if (wbc.inodes_written)
 			continue;
 		/*
-		 * Didn't write everything and we don't have more IO, bail
+		 * No more inodes for IO, bail
 		 */
 		if (!wbc.more_io)
 			break;
-		/*
-		 * Did we write something? Try for more
-		 */
-		if (wbc.nr_to_write < write_chunk)
-			continue;
 		/*
 		 * Nothing written. Wait for some inode to
 		 * become available for writeback. Otherwise
-- 
cgit v1.2.3


From ba9aa8399fda48510d80c2fed1afb8fedbe1bb41 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 21 Jul 2010 20:32:30 -0600
Subject: writeback: the kupdate expire timestamp should be a moving target

Dynamically compute the dirty expire timestamp at queue_io() time.

writeback_control.older_than_this used to be determined at entrance to
the kupdate writeback work. This _static_ timestamp may go stale if the
kupdate work runs on and on. The flusher may then get stuck with some
old busy inodes, never considering newly expired inodes thereafter.

This has two possible problems:

- It is unfair for a large dirty inode to delay (for a long time) the
  writeback of small dirty inodes.

- As time goes by, the large and busy dirty inode may contain only
  _freshly_ dirtied pages. Ignoring newly expired dirty inodes risks
  delaying the expired dirty pages to the end of LRU lists, triggering
  the evil pageout(). Nevertheless this patch merely addresses part
  of the problem.

v2: keep policy changes inside wb_writeback() and keep the
wbc.older_than_this visibility as suggested by Dave.

CC: Dave Chinner <david@fromorbit.com>
Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Itaru Kitayama <kitayama@cl.bb4u.ne.jp>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 271cf2150ba0..0adee7853b80 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -672,11 +672,6 @@ static long wb_writeback(struct bdi_writeback *wb,
 	long write_chunk = MAX_WRITEBACK_PAGES;
 	struct inode *inode;
 
-	if (wbc.for_kupdate) {
-		wbc.older_than_this = &oldest_jif;
-		oldest_jif = jiffies -
-				msecs_to_jiffies(dirty_expire_interval * 10);
-	}
 	if (!wbc.range_cyclic) {
 		wbc.range_start = 0;
 		wbc.range_end = LLONG_MAX;
@@ -723,6 +718,12 @@ static long wb_writeback(struct bdi_writeback *wb,
 		if (work->for_background && !over_bground_thresh())
 			break;
 
+		if (work->for_kupdate) {
+			oldest_jif = jiffies -
+				msecs_to_jiffies(dirty_expire_interval * 10);
+			wbc.older_than_this = &oldest_jif;
+		}
+
 		wbc.more_io = 0;
 		wbc.nr_to_write = write_chunk;
 		wbc.pages_skipped = 0;
-- 
cgit v1.2.3


From 424b351fe1901fc909fd0ca4f21dab58f24c1aac Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 21 Jul 2010 20:11:53 -0600
Subject: writeback: refill b_io iff empty

There is no point in carrying different refill policies between
for_kupdate and other types of work. Use a consistent "refill b_io iff
empty" policy which can guarantee fairness in an easy to understand way.

A b_io refill will setup a _fixed_ work set with all currently eligible
inodes and start a new round of walk through b_io. The "fixed" work set
means no new inodes will be added to the work set during the walk.
Only when a complete walk over b_io is done, new inodes that are
eligible at the time will be enqueued and the walk be started over.

This procedure provides fairness among the inodes because it guarantees
that each inode is synced once and only once in each round. So all
inodes will be free from starvation.

This change relies on wb_writeback() to keep retrying as long as we made
some progress on cleaning some pages and/or inodes. Without that ability,
the old logic on background works relies on aggressively queuing all
eligible inodes into b_io at every time. But that's not a guarantee.

The test script below completes slightly faster now:

             2.6.39-rc3	  2.6.39-rc3-dyn-expire+
------------------------------------------------
all elapsed     256.043      252.367
stddev           24.381       12.530

tar elapsed      30.097       28.808
dd  elapsed      13.214       11.782

	#!/bin/zsh

	cp /c/linux-2.6.38.3.tar.bz2 /dev/shm/

	umount /dev/sda7
	mkfs.xfs -f /dev/sda7
	mount /dev/sda7 /fs

	echo 3 > /proc/sys/vm/drop_caches

	tic=$(cat /proc/uptime|cut -d' ' -f2)

	cd /fs
	time tar jxf /dev/shm/linux-2.6.38.3.tar.bz2 &
	time dd if=/dev/zero of=/fs/zero bs=1M count=1000 &

	wait
	sync
	tac=$(cat /proc/uptime|cut -d' ' -f2)
	echo elapsed: $((tac - tic))

It maintains roughly the same small vs. large file writeout shares, and
offers large files better chances to be written in nice 4M chunks.

Analysis from Dave Chinner in great detail:

Let's say we have lots of inodes with 100 dirty pages being created,
and one large writeback going on. We expire 8 new inodes for every
1024 pages we write back.

With the old code, we do:

	b_more_io (large inode) -> b_io (1l)
	8 newly expired inodes -> b_io (1l, 8s)

	writeback  large inode 1024 pages -> b_more_io

	b_more_io (large inode) -> b_io (8s, 1l)
	8 newly expired inodes -> b_io (8s, 1l, 8s)

	writeback  8 small inodes 800 pages
		   1 large inode 224 pages -> b_more_io

	b_more_io (large inode) -> b_io (8s, 1l)
	8 newly expired inodes -> b_io (8s, 1l, 8s)
	.....

Your new code:

	b_more_io (large inode) -> b_io (1l)
	8 newly expired inodes -> b_io (1l, 8s)

	writeback  large inode 1024 pages -> b_more_io
	(b_io == 8s)
	writeback  8 small inodes 800 pages

	b_io empty: (1800 pages written)
		b_more_io (large inode) -> b_io (1l)
		14 newly expired inodes -> b_io (1l, 14s)

	writeback  large inode 1024 pages -> b_more_io
	(b_io == 14s)
	writeback  10 small inodes 1000 pages
		   1 small inode 24 pages -> b_more_io (1l, 1s(24))
	writeback  5 small inodes 500 pages
	b_io empty: (2548 pages written)
		b_more_io (large inode) -> b_io (1l, 1s(24))
		20 newly expired inodes -> b_io (1l, 1s(24), 20s)
	......

Rough progression of pages written at b_io refill:

Old code:

	total	large file	% of writeback
	1024	224		21.9% (fixed)

New code:
	total	large file	% of writeback
	1800	1024		~55%
	2550	1024		~40%
	3050	1024		~33%
	3500	1024		~29%
	3950	1024		~26%
	4250	1024		~24%
	4500	1024		~22.7%
	4700	1024		~21.7%
	4800	1024		~21.3%
	4800	1024		~21.3%
	(pretty much steady state from here)

Ok, so the steady state is reached with a similar percentage of
writeback to the large file as the existing code. Ok, that's good,
but providing some evidence that it doesn't change the share of
writeback to the large file should be in the commit message ;)

The other advantage to this is that we always write 1024 page chunks
to the large file, rather than smaller "whatever remains" chunks.

CC: Jan Kara <jack@suse.cz>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 0adee7853b80..664acdb2e7ef 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -589,7 +589,8 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
 	if (!wbc->wb_start)
 		wbc->wb_start = jiffies; /* livelock avoidance */
 	spin_lock(&inode_wb_list_lock);
-	if (!wbc->for_kupdate || list_empty(&wb->b_io))
+
+	if (list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
 
 	while (!list_empty(&wb->b_io)) {
@@ -616,7 +617,7 @@ static void __writeback_inodes_sb(struct super_block *sb,
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
 	spin_lock(&inode_wb_list_lock);
-	if (!wbc->for_kupdate || list_empty(&wb->b_io))
+	if (list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
 	writeback_sb_inodes(sb, wb, wbc, true);
 	spin_unlock(&inode_wb_list_lock);
-- 
cgit v1.2.3


From f758eeabeb96f878c860e8f110f94ec8820822a9 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@infradead.org>
Date: Thu, 21 Apr 2011 18:19:44 -0600
Subject: writeback: split inode_wb_list_lock into bdi_writeback.list_lock

Split the global inode_wb_list_lock into a per-bdi_writeback list_lock,
as it's currently the most contended lock in the system for metadata
heavy workloads.  It won't help for single-filesystem workloads for
which we'll need the I/O-less balance_dirty_pages, but at least we
can dedicate a cpu to spinning on each bdi now for larger systems.

Based on earlier patches from Nick Piggin and Dave Chinner.

It reduces lock contentions to 1/4 in this test case:
10 HDD JBOD, 100 dd on each disk, XFS, 6GB ram

lock_stat version 0.3
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
                              class name    con-bounces    contentions   waittime-min   waittime-max waittime-total    acq-bounces   acquisitions   holdtime-min   holdtime-max holdtime-total
-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
vanilla 2.6.39-rc3:
                      inode_wb_list_lock:         42590          44433           0.12         147.74      144127.35         252274         886792           0.08         121.34      917211.23
                      ------------------
                      inode_wb_list_lock              2          [<ffffffff81165da5>] bdev_inode_switch_bdi+0x29/0x85
                      inode_wb_list_lock             34          [<ffffffff8115bd0b>] inode_wb_list_del+0x22/0x49
                      inode_wb_list_lock          12893          [<ffffffff8115bb53>] __mark_inode_dirty+0x170/0x1d0
                      inode_wb_list_lock          10702          [<ffffffff8115afef>] writeback_single_inode+0x16d/0x20a
                      ------------------
                      inode_wb_list_lock              2          [<ffffffff81165da5>] bdev_inode_switch_bdi+0x29/0x85
                      inode_wb_list_lock             19          [<ffffffff8115bd0b>] inode_wb_list_del+0x22/0x49
                      inode_wb_list_lock           5550          [<ffffffff8115bb53>] __mark_inode_dirty+0x170/0x1d0
                      inode_wb_list_lock           8511          [<ffffffff8115b4ad>] writeback_sb_inodes+0x10f/0x157

2.6.39-rc3 + patch:
                &(&wb->list_lock)->rlock:         11383          11657           0.14         151.69       40429.51          90825         527918           0.11         145.90      556843.37
                ------------------------
                &(&wb->list_lock)->rlock             10          [<ffffffff8115b189>] inode_wb_list_del+0x5f/0x86
                &(&wb->list_lock)->rlock           1493          [<ffffffff8115b1ed>] writeback_inodes_wb+0x3d/0x150
                &(&wb->list_lock)->rlock           3652          [<ffffffff8115a8e9>] writeback_sb_inodes+0x123/0x16f
                &(&wb->list_lock)->rlock           1412          [<ffffffff8115a38e>] writeback_single_inode+0x17f/0x223
                ------------------------
                &(&wb->list_lock)->rlock              3          [<ffffffff8110b5af>] bdi_lock_two+0x46/0x4b
                &(&wb->list_lock)->rlock              6          [<ffffffff8115b189>] inode_wb_list_del+0x5f/0x86
                &(&wb->list_lock)->rlock           2061          [<ffffffff8115af97>] __mark_inode_dirty+0x173/0x1cf
                &(&wb->list_lock)->rlock           2629          [<ffffffff8115a8e9>] writeback_sb_inodes+0x123/0x16f

hughd@google.com: fix recursive lock when bdi_lock_two() is called with new the same as old
akpm@linux-foundation.org: cleanup bdev_inode_switch_bdi() comment

Signed-off-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/block_dev.c              | 16 +++++---
 fs/fs-writeback.c           | 97 +++++++++++++++++++++++----------------------
 fs/inode.c                  |  5 +--
 include/linux/backing-dev.h |  2 +
 include/linux/writeback.h   |  2 -
 mm/backing-dev.c            | 21 ++++++++--
 mm/filemap.c                |  6 +--
 mm/rmap.c                   |  4 +-
 8 files changed, 85 insertions(+), 68 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1a2421f908f0..3c9a03e51b62 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -44,24 +44,28 @@ inline struct block_device *I_BDEV(struct inode *inode)
 {
 	return &BDEV_I(inode)->bdev;
 }
-
 EXPORT_SYMBOL(I_BDEV);
 
 /*
- * move the inode from it's current bdi to the a new bdi. if the inode is dirty
- * we need to move it onto the dirty list of @dst so that the inode is always
- * on the right list.
+ * Move the inode from its current bdi to a new bdi. If the inode is dirty we
+ * need to move it onto the dirty list of @dst so that the inode is always on
+ * the right list.
  */
 static void bdev_inode_switch_bdi(struct inode *inode,
 			struct backing_dev_info *dst)
 {
-	spin_lock(&inode_wb_list_lock);
+	struct backing_dev_info *old = inode->i_data.backing_dev_info;
+
+	if (unlikely(dst == old))		/* deadlock avoidance */
+		return;
+	bdi_lock_two(&old->wb, &dst->wb);
 	spin_lock(&inode->i_lock);
 	inode->i_data.backing_dev_info = dst;
 	if (inode->i_state & I_DIRTY)
 		list_move(&inode->i_wb_list, &dst->wb.b_dirty);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_wb_list_lock);
+	spin_unlock(&old->wb.list_lock);
+	spin_unlock(&dst->wb.list_lock);
 }
 
 static sector_t max_block(struct block_device *bdev)
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 664acdb2e7ef..36a30917e0dc 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -181,12 +181,13 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi)
  */
 void inode_wb_list_del(struct inode *inode)
 {
-	spin_lock(&inode_wb_list_lock);
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
+
+	spin_lock(&bdi->wb.list_lock);
 	list_del_init(&inode->i_wb_list);
-	spin_unlock(&inode_wb_list_lock);
+	spin_unlock(&bdi->wb.list_lock);
 }
 
-
 /*
  * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
  * furthest end of its superblock's dirty-inode list.
@@ -196,11 +197,9 @@ void inode_wb_list_del(struct inode *inode)
  * the case then the inode must have been redirtied while it was being written
  * out and we don't reset its dirtied_when.
  */
-static void redirty_tail(struct inode *inode)
+static void redirty_tail(struct inode *inode, struct bdi_writeback *wb)
 {
-	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-
-	assert_spin_locked(&inode_wb_list_lock);
+	assert_spin_locked(&wb->list_lock);
 	if (!list_empty(&wb->b_dirty)) {
 		struct inode *tail;
 
@@ -214,11 +213,9 @@ static void redirty_tail(struct inode *inode)
 /*
  * requeue inode for re-scanning after bdi->b_io list is exhausted.
  */
-static void requeue_io(struct inode *inode)
+static void requeue_io(struct inode *inode, struct bdi_writeback *wb)
 {
-	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
-
-	assert_spin_locked(&inode_wb_list_lock);
+	assert_spin_locked(&wb->list_lock);
 	list_move(&inode->i_wb_list, &wb->b_more_io);
 }
 
@@ -226,7 +223,7 @@ static void inode_sync_complete(struct inode *inode)
 {
 	/*
 	 * Prevent speculative execution through
-	 * spin_unlock(&inode_wb_list_lock);
+	 * spin_unlock(&wb->list_lock);
 	 */
 
 	smp_mb();
@@ -302,7 +299,7 @@ static void move_expired_inodes(struct list_head *delaying_queue,
  */
 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 {
-	assert_spin_locked(&inode_wb_list_lock);
+	assert_spin_locked(&wb->list_lock);
 	list_splice_init(&wb->b_more_io, &wb->b_io);
 	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
 }
@@ -317,7 +314,8 @@ static int write_inode(struct inode *inode, struct writeback_control *wbc)
 /*
  * Wait for writeback on an inode to complete.
  */
-static void inode_wait_for_writeback(struct inode *inode)
+static void inode_wait_for_writeback(struct inode *inode,
+				     struct bdi_writeback *wb)
 {
 	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
 	wait_queue_head_t *wqh;
@@ -325,15 +323,15 @@ static void inode_wait_for_writeback(struct inode *inode)
 	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
 	while (inode->i_state & I_SYNC) {
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_wb_list_lock);
+		spin_unlock(&wb->list_lock);
 		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
-		spin_lock(&inode_wb_list_lock);
+		spin_lock(&wb->list_lock);
 		spin_lock(&inode->i_lock);
 	}
 }
 
 /*
- * Write out an inode's dirty pages.  Called under inode_wb_list_lock and
+ * Write out an inode's dirty pages.  Called under wb->list_lock and
  * inode->i_lock.  Either the caller has an active reference on the inode or
  * the inode has I_WILL_FREE set.
  *
@@ -344,13 +342,14 @@ static void inode_wait_for_writeback(struct inode *inode)
  * livelocks, etc.
  */
 static int
-writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
+writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
+		       struct writeback_control *wbc)
 {
 	struct address_space *mapping = inode->i_mapping;
 	unsigned dirty;
 	int ret;
 
-	assert_spin_locked(&inode_wb_list_lock);
+	assert_spin_locked(&wb->list_lock);
 	assert_spin_locked(&inode->i_lock);
 
 	if (!atomic_read(&inode->i_count))
@@ -368,14 +367,14 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 		 * completed a full scan of b_io.
 		 */
 		if (wbc->sync_mode != WB_SYNC_ALL) {
-			requeue_io(inode);
+			requeue_io(inode, wb);
 			return 0;
 		}
 
 		/*
 		 * It's a data-integrity sync.  We must wait.
 		 */
-		inode_wait_for_writeback(inode);
+		inode_wait_for_writeback(inode, wb);
 	}
 
 	BUG_ON(inode->i_state & I_SYNC);
@@ -384,7 +383,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	inode->i_state |= I_SYNC;
 	inode->i_state &= ~I_DIRTY_PAGES;
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_wb_list_lock);
+	spin_unlock(&wb->list_lock);
 
 	ret = do_writepages(mapping, wbc);
 
@@ -415,7 +414,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			ret = err;
 	}
 
-	spin_lock(&inode_wb_list_lock);
+	spin_lock(&wb->list_lock);
 	spin_lock(&inode->i_lock);
 	inode->i_state &= ~I_SYNC;
 	if (!(inode->i_state & I_FREEING)) {
@@ -438,7 +437,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 				/*
 				 * slice used up: queue for next turn
 				 */
-				requeue_io(inode);
+				requeue_io(inode, wb);
 			} else {
 				/*
 				 * Writeback blocked by something other than
@@ -447,7 +446,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 				 * retrying writeback of the dirty page/inode
 				 * that cannot be performed immediately.
 				 */
-				redirty_tail(inode);
+				redirty_tail(inode, wb);
 			}
 		} else if (inode->i_state & I_DIRTY) {
 			/*
@@ -456,7 +455,7 @@ writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 			 * submission or metadata updates after data IO
 			 * completion.
 			 */
-			redirty_tail(inode);
+			redirty_tail(inode, wb);
 		} else {
 			/*
 			 * The inode is clean.  At this point we either have
@@ -521,7 +520,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 				 * superblock, move all inodes not belonging
 				 * to it back onto the dirty list.
 				 */
-				redirty_tail(inode);
+				redirty_tail(inode, wb);
 				continue;
 			}
 
@@ -541,7 +540,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
 			spin_unlock(&inode->i_lock);
-			requeue_io(inode);
+			requeue_io(inode, wb);
 			continue;
 		}
 
@@ -557,19 +556,19 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 		__iget(inode);
 
 		pages_skipped = wbc->pages_skipped;
-		writeback_single_inode(inode, wbc);
+		writeback_single_inode(inode, wb, wbc);
 		if (wbc->pages_skipped != pages_skipped) {
 			/*
 			 * writeback is not making progress due to locked
 			 * buffers.  Skip this inode for now.
 			 */
-			redirty_tail(inode);
+			redirty_tail(inode, wb);
 		}
 		spin_unlock(&inode->i_lock);
-		spin_unlock(&inode_wb_list_lock);
+		spin_unlock(&wb->list_lock);
 		iput(inode);
 		cond_resched();
-		spin_lock(&inode_wb_list_lock);
+		spin_lock(&wb->list_lock);
 		if (wbc->nr_to_write <= 0) {
 			wbc->more_io = 1;
 			return 1;
@@ -588,7 +587,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
 
 	if (!wbc->wb_start)
 		wbc->wb_start = jiffies; /* livelock avoidance */
-	spin_lock(&inode_wb_list_lock);
+	spin_lock(&wb->list_lock);
 
 	if (list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
@@ -598,7 +597,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
 		struct super_block *sb = inode->i_sb;
 
 		if (!pin_sb_for_writeback(sb)) {
-			requeue_io(inode);
+			requeue_io(inode, wb);
 			continue;
 		}
 		ret = writeback_sb_inodes(sb, wb, wbc, false);
@@ -607,7 +606,7 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
 		if (ret)
 			break;
 	}
-	spin_unlock(&inode_wb_list_lock);
+	spin_unlock(&wb->list_lock);
 	/* Leave any unwritten inodes on b_io */
 }
 
@@ -616,11 +615,11 @@ static void __writeback_inodes_sb(struct super_block *sb,
 {
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-	spin_lock(&inode_wb_list_lock);
+	spin_lock(&wb->list_lock);
 	if (list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
 	writeback_sb_inodes(sb, wb, wbc, true);
-	spin_unlock(&inode_wb_list_lock);
+	spin_unlock(&wb->list_lock);
 }
 
 /*
@@ -762,15 +761,15 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * become available for writeback. Otherwise
 		 * we'll just busyloop.
 		 */
-		spin_lock(&inode_wb_list_lock);
+		spin_lock(&wb->list_lock);
 		if (!list_empty(&wb->b_more_io))  {
 			inode = wb_inode(wb->b_more_io.prev);
 			trace_wbc_writeback_wait(&wbc, wb->bdi);
 			spin_lock(&inode->i_lock);
-			inode_wait_for_writeback(inode);
+			inode_wait_for_writeback(inode, wb);
 			spin_unlock(&inode->i_lock);
 		}
-		spin_unlock(&inode_wb_list_lock);
+		spin_unlock(&wb->list_lock);
 	}
 
 	return wrote;
@@ -1104,10 +1103,10 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			}
 
 			spin_unlock(&inode->i_lock);
-			spin_lock(&inode_wb_list_lock);
+			spin_lock(&bdi->wb.list_lock);
 			inode->dirtied_when = jiffies;
 			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
-			spin_unlock(&inode_wb_list_lock);
+			spin_unlock(&bdi->wb.list_lock);
 
 			if (wakeup_bdi)
 				bdi_wakeup_thread_delayed(bdi);
@@ -1309,6 +1308,7 @@ EXPORT_SYMBOL(sync_inodes_sb);
  */
 int write_inode_now(struct inode *inode, int sync)
 {
+	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 	int ret;
 	struct writeback_control wbc = {
 		.nr_to_write = LONG_MAX,
@@ -1321,11 +1321,11 @@ int write_inode_now(struct inode *inode, int sync)
 		wbc.nr_to_write = 0;
 
 	might_sleep();
-	spin_lock(&inode_wb_list_lock);
+	spin_lock(&wb->list_lock);
 	spin_lock(&inode->i_lock);
-	ret = writeback_single_inode(inode, &wbc);
+	ret = writeback_single_inode(inode, wb, &wbc);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_wb_list_lock);
+	spin_unlock(&wb->list_lock);
 	if (sync)
 		inode_sync_wait(inode);
 	return ret;
@@ -1345,13 +1345,14 @@ EXPORT_SYMBOL(write_inode_now);
  */
 int sync_inode(struct inode *inode, struct writeback_control *wbc)
 {
+	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
 	int ret;
 
-	spin_lock(&inode_wb_list_lock);
+	spin_lock(&wb->list_lock);
 	spin_lock(&inode->i_lock);
-	ret = writeback_single_inode(inode, wbc);
+	ret = writeback_single_inode(inode, wb, wbc);
 	spin_unlock(&inode->i_lock);
-	spin_unlock(&inode_wb_list_lock);
+	spin_unlock(&wb->list_lock);
 	return ret;
 }
 EXPORT_SYMBOL(sync_inode);
diff --git a/fs/inode.c b/fs/inode.c
index 0f7e88a7803f..4be128cbc754 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -37,7 +37,7 @@
  *   inode_lru, inode->i_lru
  * inode_sb_list_lock protects:
  *   sb->s_inodes, inode->i_sb_list
- * inode_wb_list_lock protects:
+ * bdi->wb.list_lock protects:
  *   bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
  * inode_hash_lock protects:
  *   inode_hashtable, inode->i_hash
@@ -48,7 +48,7 @@
  *   inode->i_lock
  *     inode_lru_lock
  *
- * inode_wb_list_lock
+ * bdi->wb.list_lock
  *   inode->i_lock
  *
  * inode_hash_lock
@@ -68,7 +68,6 @@ static LIST_HEAD(inode_lru);
 static DEFINE_SPINLOCK(inode_lru_lock);
 
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_wb_list_lock);
 
 /*
  * iprune_sem provides exclusion between the icache shrinking and the
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 96f4094b706d..47feb2c4706a 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -57,6 +57,7 @@ struct bdi_writeback {
 	struct list_head b_dirty;	/* dirty inodes */
 	struct list_head b_io;		/* parked for writeback */
 	struct list_head b_more_io;	/* parked for more writeback */
+	spinlock_t list_lock;		/* protects the b_* lists */
 };
 
 struct backing_dev_info {
@@ -106,6 +107,7 @@ int bdi_writeback_thread(void *data);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 void bdi_arm_supers_timer(void);
 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
+void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2);
 
 extern spinlock_t bdi_lock;
 extern struct list_head bdi_list;
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 7df9026f7129..c2d957fb38d3 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -9,8 +9,6 @@
 
 struct backing_dev_info;
 
-extern spinlock_t inode_wb_list_lock;
-
 /*
  * fs/fs-writeback.c
  */
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f032e6e1e09a..5f6553ef1ba7 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -45,6 +45,17 @@ static struct timer_list sync_supers_timer;
 static int bdi_sync_supers(void *);
 static void sync_supers_timer_fn(unsigned long);
 
+void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
+{
+	if (wb1 < wb2) {
+		spin_lock(&wb1->list_lock);
+		spin_lock_nested(&wb2->list_lock, 1);
+	} else {
+		spin_lock(&wb2->list_lock);
+		spin_lock_nested(&wb1->list_lock, 1);
+	}
+}
+
 #ifdef CONFIG_DEBUG_FS
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
@@ -67,14 +78,14 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 	struct inode *inode;
 
 	nr_dirty = nr_io = nr_more_io = 0;
-	spin_lock(&inode_wb_list_lock);
+	spin_lock(&wb->list_lock);
 	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
 		nr_dirty++;
 	list_for_each_entry(inode, &wb->b_io, i_wb_list)
 		nr_io++;
 	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
 		nr_more_io++;
-	spin_unlock(&inode_wb_list_lock);
+	spin_unlock(&wb->list_lock);
 
 	global_dirty_limits(&background_thresh, &dirty_thresh);
 	bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
@@ -628,6 +639,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	INIT_LIST_HEAD(&wb->b_dirty);
 	INIT_LIST_HEAD(&wb->b_io);
 	INIT_LIST_HEAD(&wb->b_more_io);
+	spin_lock_init(&wb->list_lock);
 	setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
 }
 
@@ -676,11 +688,12 @@ void bdi_destroy(struct backing_dev_info *bdi)
 	if (bdi_has_dirty_io(bdi)) {
 		struct bdi_writeback *dst = &default_backing_dev_info.wb;
 
-		spin_lock(&inode_wb_list_lock);
+		bdi_lock_two(&bdi->wb, dst);
 		list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
 		list_splice(&bdi->wb.b_io, &dst->b_io);
 		list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
-		spin_unlock(&inode_wb_list_lock);
+		spin_unlock(&bdi->wb.list_lock);
+		spin_unlock(&dst->list_lock);
 	}
 
 	bdi_unregister(bdi);
diff --git a/mm/filemap.c b/mm/filemap.c
index d7b10578a64b..1e492c3dd6f8 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -81,7 +81,7 @@
  *  ->i_mutex
  *    ->i_alloc_sem             (various)
  *
- *  inode_wb_list_lock
+ *  bdi->wb.list_lock
  *    sb_lock			(fs/fs-writeback.c)
  *    ->mapping->tree_lock	(__sync_single_inode)
  *
@@ -99,9 +99,9 @@
  *    ->zone.lru_lock		(check_pte_range->isolate_lru_page)
  *    ->private_lock		(page_remove_rmap->set_page_dirty)
  *    ->tree_lock		(page_remove_rmap->set_page_dirty)
- *    inode_wb_list_lock	(page_remove_rmap->set_page_dirty)
+ *    bdi->wb.list_lock		(page_remove_rmap->set_page_dirty)
  *    ->inode->i_lock		(page_remove_rmap->set_page_dirty)
- *    inode_wb_list_lock	(zap_pte_range->set_page_dirty)
+ *    bdi->wb.list_lock		(zap_pte_range->set_page_dirty)
  *    ->inode->i_lock		(zap_pte_range->set_page_dirty)
  *    ->private_lock		(zap_pte_range->__set_page_dirty_buffers)
  *
diff --git a/mm/rmap.c b/mm/rmap.c
index 0eb463ea88dd..d04e36a7cc9f 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -32,11 +32,11 @@
  *               mmlist_lock (in mmput, drain_mmlist and others)
  *               mapping->private_lock (in __set_page_dirty_buffers)
  *               inode->i_lock (in set_page_dirty's __mark_inode_dirty)
- *               inode_wb_list_lock (in set_page_dirty's __mark_inode_dirty)
+ *               bdi->wb.list_lock (in set_page_dirty's __mark_inode_dirty)
  *                 sb_lock (within inode_lock in fs/fs-writeback.c)
  *                 mapping->tree_lock (widely used, in set_page_dirty,
  *                           in arch-dependent flush_dcache_mmap_lock,
- *                           within inode_wb_list_lock in __sync_single_inode)
+ *                           within bdi->wb.list_lock in __sync_single_inode)
  *
  * (code doesn't rely on that order so it could be switched around)
  * ->tasklist_lock
-- 
cgit v1.2.3


From e8dfc30582995ae12454cda517b17d6294175b07 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Thu, 21 Apr 2011 12:06:32 -0600
Subject: writeback: elevate queue_io() into wb_writeback()

Code refactor for more logical code layout.
No behavior change.

- remove the mis-named __writeback_inodes_sb()

- wb_writeback()/writeback_inodes_wb() will decide when to queue_io()
  before calling __writeback_inodes_wb()

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 36a30917e0dc..565b1fd15be6 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -580,17 +580,13 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 	return 1;
 }
 
-void writeback_inodes_wb(struct bdi_writeback *wb,
-		struct writeback_control *wbc)
+static void __writeback_inodes_wb(struct bdi_writeback *wb,
+				  struct writeback_control *wbc)
 {
 	int ret = 0;
 
 	if (!wbc->wb_start)
 		wbc->wb_start = jiffies; /* livelock avoidance */
-	spin_lock(&wb->list_lock);
-
-	if (list_empty(&wb->b_io))
-		queue_io(wb, wbc->older_than_this);
 
 	while (!list_empty(&wb->b_io)) {
 		struct inode *inode = wb_inode(wb->b_io.prev);
@@ -606,19 +602,16 @@ void writeback_inodes_wb(struct bdi_writeback *wb,
 		if (ret)
 			break;
 	}
-	spin_unlock(&wb->list_lock);
 	/* Leave any unwritten inodes on b_io */
 }
 
-static void __writeback_inodes_sb(struct super_block *sb,
-		struct bdi_writeback *wb, struct writeback_control *wbc)
+void writeback_inodes_wb(struct bdi_writeback *wb,
+		struct writeback_control *wbc)
 {
-	WARN_ON(!rwsem_is_locked(&sb->s_umount));
-
 	spin_lock(&wb->list_lock);
 	if (list_empty(&wb->b_io))
 		queue_io(wb, wbc->older_than_this);
-	writeback_sb_inodes(sb, wb, wbc, true);
+	__writeback_inodes_wb(wb, wbc);
 	spin_unlock(&wb->list_lock);
 }
 
@@ -685,7 +678,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 	 * The intended call sequence for WB_SYNC_ALL writeback is:
 	 *
 	 *      wb_writeback()
-	 *          __writeback_inodes_sb()     <== called only once
+	 *          writeback_sb_inodes()       <== called only once
 	 *              write_cache_pages()     <== called once for each inode
 	 *                   (quickly) tag currently dirty pages
 	 *                   (maybe slowly) sync all tagged pages
@@ -694,6 +687,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		write_chunk = LONG_MAX;
 
 	wbc.wb_start = jiffies; /* livelock avoidance */
+	spin_lock(&wb->list_lock);
 	for (;;) {
 		/*
 		 * Stop writeback when nr_pages has been consumed
@@ -730,10 +724,12 @@ static long wb_writeback(struct bdi_writeback *wb,
 		wbc.inodes_written = 0;
 
 		trace_wbc_writeback_start(&wbc, wb->bdi);
+		if (list_empty(&wb->b_io))
+			queue_io(wb, wbc.older_than_this);
 		if (work->sb)
-			__writeback_inodes_sb(work->sb, wb, &wbc);
+			writeback_sb_inodes(work->sb, wb, &wbc, true);
 		else
-			writeback_inodes_wb(wb, &wbc);
+			__writeback_inodes_wb(wb, &wbc);
 		trace_wbc_writeback_written(&wbc, wb->bdi);
 
 		work->nr_pages -= write_chunk - wbc.nr_to_write;
@@ -761,7 +757,6 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * become available for writeback. Otherwise
 		 * we'll just busyloop.
 		 */
-		spin_lock(&wb->list_lock);
 		if (!list_empty(&wb->b_more_io))  {
 			inode = wb_inode(wb->b_more_io.prev);
 			trace_wbc_writeback_wait(&wbc, wb->bdi);
@@ -769,8 +764,8 @@ static long wb_writeback(struct bdi_writeback *wb,
 			inode_wait_for_writeback(inode, wb);
 			spin_unlock(&inode->i_lock);
 		}
-		spin_unlock(&wb->list_lock);
 	}
+	spin_unlock(&wb->list_lock);
 
 	return wrote;
 }
-- 
cgit v1.2.3


From e185dda89d69cde142b48059413a03561f41f78a Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Sat, 23 Apr 2011 11:26:07 -0600
Subject: writeback: avoid extra sync work at enqueue time

This removes writeback_control.wb_start and does more straightforward
sync livelock prevention by setting .older_than_this to prevent extra
inodes from being enqueued in the first place.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c         | 16 +++-------------
 include/linux/writeback.h |  3 ---
 2 files changed, 3 insertions(+), 16 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 565b1fd15be6..d0553f33fb50 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -544,15 +544,6 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 			continue;
 		}
 
-		/*
-		 * Was this inode dirtied after sync_sb_inodes was called?
-		 * This keeps sync from extra jobs and livelock.
-		 */
-		if (inode_dirtied_after(inode, wbc->wb_start)) {
-			spin_unlock(&inode->i_lock);
-			return 1;
-		}
-
 		__iget(inode);
 
 		pages_skipped = wbc->pages_skipped;
@@ -585,9 +576,6 @@ static void __writeback_inodes_wb(struct bdi_writeback *wb,
 {
 	int ret = 0;
 
-	if (!wbc->wb_start)
-		wbc->wb_start = jiffies; /* livelock avoidance */
-
 	while (!list_empty(&wb->b_io)) {
 		struct inode *inode = wb_inode(wb->b_io.prev);
 		struct super_block *sb = inode->i_sb;
@@ -686,7 +674,9 @@ static long wb_writeback(struct bdi_writeback *wb,
 	if (wbc.sync_mode == WB_SYNC_ALL || wbc.tagged_writepages)
 		write_chunk = LONG_MAX;
 
-	wbc.wb_start = jiffies; /* livelock avoidance */
+	oldest_jif = jiffies;
+	wbc.older_than_this = &oldest_jif;
+
 	spin_lock(&wb->list_lock);
 	for (;;) {
 		/*
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index c2d957fb38d3..d8e96a480850 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -26,9 +26,6 @@ struct writeback_control {
 	enum writeback_sync_modes sync_mode;
 	unsigned long *older_than_this;	/* If !NULL, only write back inodes
 					   older than this */
-	unsigned long wb_start;         /* Time writeback_inodes_wb was
-					   called. This is needed to avoid
-					   extra jobs and livelock */
 	long nr_to_write;		/* Write this many pages, and decrement
 					   this for each page written */
 	long pages_skipped;		/* Pages which were not written */
-- 
cgit v1.2.3


From 6f7186562771ec9b629914df328048449ccddf4a Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 2 Mar 2011 17:14:34 -0600
Subject: writeback: add bdi_dirty_limit() kernel-doc

Clarify the bdi_dirty_limit() comment.

Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 mm/page-writeback.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 955fe35d01e0..b8be62381396 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -437,10 +437,17 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
 	*pdirty = dirty;
 }
 
-/*
+/**
  * bdi_dirty_limit - @bdi's share of dirty throttling threshold
+ * @bdi: the backing_dev_info to query
+ * @dirty: global dirty limit in pages
+ *
+ * Returns @bdi's dirty limit in pages. The term "dirty" in the context of
+ * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
+ * And the "limit" in the name is not seriously taken as hard limit in
+ * balance_dirty_pages().
  *
- * Allocate high/low dirty limits to fast/slow devices, in order to prevent
+ * It allocates high/low dirty limits to fast/slow devices, in order to prevent
  * - starving fast devices
  * - piling up dirty pages (that will take long time to sync) on slow devices
  *
-- 
cgit v1.2.3


From 3efaf0faba6793cd91298c76315e15de59c13ae0 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Thu, 16 Dec 2010 22:22:00 -0600
Subject: writeback: skip balance_dirty_pages() for in-memory fs

This avoids unnecessary checks and dirty throttling on tmpfs/ramfs.

Notes about the tmpfs/ramfs behavior changes:

As for 2.6.36 and older kernels, the tmpfs writes will sleep inside
balance_dirty_pages() as long as we are over the (dirty+background)/2
global throttle threshold.  This is because both the dirty pages and
threshold will be 0 for tmpfs/ramfs. Hence this test will always
evaluate to TRUE:

                dirty_exceeded =
                        (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh)
                        || (nr_reclaimable + nr_writeback >= dirty_thresh);

For 2.6.37, someone complained that the current logic does not allow the
users to set vm.dirty_ratio=0.  So commit 4cbec4c8b9 changed the test to

                dirty_exceeded =
                        (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
                        || (nr_reclaimable + nr_writeback > dirty_thresh);

So 2.6.37 will behave differently for tmpfs/ramfs: it will never get
throttled unless the global dirty threshold is exceeded (which is very
unlikely to happen; once it does, it will block many tasks).

I'd say that the 2.6.36 behavior is very bad for tmpfs/ramfs. It means
for a busy writing server, tmpfs write()s may get livelocked! The
"inadvertent" throttling can hardly bring help to any workload because
of its "either no throttling, or get throttled to death" property.

So based on 2.6.37, this patch won't bring more noticeable changes.

CC: Hugh Dickins <hughd@google.com>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 mm/page-writeback.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b8be62381396..b2529f8f8be0 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -244,13 +244,8 @@ void task_dirty_inc(struct task_struct *tsk)
 static void bdi_writeout_fraction(struct backing_dev_info *bdi,
 		long *numerator, long *denominator)
 {
-	if (bdi_cap_writeback_dirty(bdi)) {
-		prop_fraction_percpu(&vm_completions, &bdi->completions,
+	prop_fraction_percpu(&vm_completions, &bdi->completions,
 				numerator, denominator);
-	} else {
-		*numerator = 0;
-		*denominator = 1;
-	}
 }
 
 static inline void task_dirties_fraction(struct task_struct *tsk,
@@ -495,6 +490,9 @@ static void balance_dirty_pages(struct address_space *mapping,
 	bool dirty_exceeded = false;
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 
+	if (!bdi_cap_account_dirty(bdi))
+		return;
+
 	for (;;) {
 		struct writeback_control wbc = {
 			.sync_mode	= WB_SYNC_NONE,
-- 
cgit v1.2.3


From b7a2441f9966fe3e1be960a876ab52e6029ea005 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 21 Jul 2010 22:19:51 -0600
Subject: writeback: remove writeback_control.more_io

When wbc.more_io was first introduced, it indicated whether there was
at least one superblock whose s_more_io contained more IO work. Now with
the per-bdi writeback, it can be replaced with a simple b_more_io test.

Acked-by: Jan Kara <jack@suse.cz>
Acked-by: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Minchan Kim <minchan.kim@gmail.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c                | 9 ++-------
 include/linux/writeback.h        | 1 -
 include/trace/events/ext4.h      | 6 ++----
 include/trace/events/writeback.h | 5 +----
 4 files changed, 5 insertions(+), 16 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index d0553f33fb50..f43c479feee9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -560,12 +560,8 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 		iput(inode);
 		cond_resched();
 		spin_lock(&wb->list_lock);
-		if (wbc->nr_to_write <= 0) {
-			wbc->more_io = 1;
+		if (wbc->nr_to_write <= 0)
 			return 1;
-		}
-		if (!list_empty(&wb->b_more_io))
-			wbc->more_io = 1;
 	}
 	/* b_io is empty */
 	return 1;
@@ -708,7 +704,6 @@ static long wb_writeback(struct bdi_writeback *wb,
 			wbc.older_than_this = &oldest_jif;
 		}
 
-		wbc.more_io = 0;
 		wbc.nr_to_write = write_chunk;
 		wbc.pages_skipped = 0;
 		wbc.inodes_written = 0;
@@ -740,7 +735,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		/*
 		 * No more inodes for IO, bail
 		 */
-		if (!wbc.more_io)
+		if (list_empty(&wb->b_more_io))
 			break;
 		/*
 		 * Nothing written. Wait for some inode to
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index d8e96a480850..8797b20dd22b 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -46,7 +46,6 @@ struct writeback_control {
 	unsigned tagged_writepages:1;	/* tag-and-write to avoid livelock */
 	unsigned for_reclaim:1;		/* Invoked from the page allocator */
 	unsigned range_cyclic:1;	/* range_start is cyclic */
-	unsigned more_io:1;		/* more io to be dispatched */
 };
 
 /*
diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h
index e09592d2f916..b225d0d8c87f 100644
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -404,7 +404,6 @@ TRACE_EVENT(ext4_da_writepages_result,
 		__field(	int,	pages_written		)
 		__field(	long,	pages_skipped		)
 		__field(	int,	sync_mode		)
-		__field(	char,	more_io			)	
 		__field(       pgoff_t,	writeback_index		)
 	),
 
@@ -415,16 +414,15 @@ TRACE_EVENT(ext4_da_writepages_result,
 		__entry->pages_written	= pages_written;
 		__entry->pages_skipped	= wbc->pages_skipped;
 		__entry->sync_mode	= wbc->sync_mode;
-		__entry->more_io	= wbc->more_io;
 		__entry->writeback_index = inode->i_mapping->writeback_index;
 	),
 
 	TP_printk("dev %d,%d ino %lu ret %d pages_written %d pages_skipped %ld "
-		  " more_io %d sync_mode %d writeback_index %lu",
+		  "sync_mode %d writeback_index %lu",
 		  MAJOR(__entry->dev), MINOR(__entry->dev),
 		  (unsigned long) __entry->ino, __entry->ret,
 		  __entry->pages_written, __entry->pages_skipped,
-		  __entry->more_io, __entry->sync_mode,
+		  __entry->sync_mode,
 		  (unsigned long) __entry->writeback_index)
 );
 
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 4e249b927eaa..b2cfac5f3313 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -101,7 +101,6 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__field(int, for_background)
 		__field(int, for_reclaim)
 		__field(int, range_cyclic)
-		__field(int, more_io)
 		__field(unsigned long, older_than_this)
 		__field(long, range_start)
 		__field(long, range_end)
@@ -116,7 +115,6 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__entry->for_background	= wbc->for_background;
 		__entry->for_reclaim	= wbc->for_reclaim;
 		__entry->range_cyclic	= wbc->range_cyclic;
-		__entry->more_io	= wbc->more_io;
 		__entry->older_than_this = wbc->older_than_this ?
 						*wbc->older_than_this : 0;
 		__entry->range_start	= (long)wbc->range_start;
@@ -124,7 +122,7 @@ DECLARE_EVENT_CLASS(wbc_class,
 	),
 
 	TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
-		"bgrd=%d reclm=%d cyclic=%d more=%d older=0x%lx "
+		"bgrd=%d reclm=%d cyclic=%d older=0x%lx "
 		"start=0x%lx end=0x%lx",
 		__entry->name,
 		__entry->nr_to_write,
@@ -134,7 +132,6 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__entry->for_background,
 		__entry->for_reclaim,
 		__entry->range_cyclic,
-		__entry->more_io,
 		__entry->older_than_this,
 		__entry->range_start,
 		__entry->range_end)
-- 
cgit v1.2.3


From 846d5a091b0506b75489577cde27f39b37a192a4 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Thu, 5 May 2011 21:10:38 -0600
Subject: writeback: remove .nonblocking and .encountered_congestion

Remove two unused struct writeback_control fields:

	.encountered_congestion	(completely unused)
	.nonblocking		(never set, checked/shown in XFS, NFS and btrfs)

The .for_background check in nfs_write_inode() is also removed btw,
as .for_background implies WB_SYNC_NONE.

Reviewed-by: Jan Kara <jack@suse.cz>
Proposed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/nfs/write.c               | 3 +--
 fs/xfs/linux-2.6/xfs_aops.c  | 2 +-
 include/linux/writeback.h    | 2 --
 include/trace/events/btrfs.h | 6 ++----
 4 files changed, 4 insertions(+), 9 deletions(-)

diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index e268e3b23497..dd6a6cee39a7 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1564,8 +1564,7 @@ int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 		int status;
 		bool sync = true;
 
-		if (wbc->sync_mode == WB_SYNC_NONE || wbc->nonblocking ||
-		    wbc->for_background)
+		if (wbc->sync_mode == WB_SYNC_NONE)
 			sync = false;
 
 		status = pnfs_layoutcommit_inode(inode, sync);
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 79ce38be15a1..7559861481aa 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -970,7 +970,7 @@ xfs_vm_writepage(
 	offset = page_offset(page);
 	type = IO_OVERWRITE;
 
-	if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)
+	if (wbc->sync_mode == WB_SYNC_NONE)
 		nonblocking = 1;
 
 	do {
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 8797b20dd22b..2f1b512bd6e0 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -39,8 +39,6 @@ struct writeback_control {
 	loff_t range_start;
 	loff_t range_end;
 
-	unsigned nonblocking:1;		/* Don't get stuck on request queues */
-	unsigned encountered_congestion:1; /* An output: a queue is full */
 	unsigned for_kupdate:1;		/* A kupdate writeback */
 	unsigned for_background:1;	/* A background writeback */
 	unsigned tagged_writepages:1;	/* tag-and-write to avoid livelock */
diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
index 4114129f0794..b31702ac15be 100644
--- a/include/trace/events/btrfs.h
+++ b/include/trace/events/btrfs.h
@@ -284,7 +284,6 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
 		__field(	long,   pages_skipped		)
 		__field(	loff_t, range_start		)
 		__field(	loff_t, range_end		)
-		__field(	char,   nonblocking		)
 		__field(	char,   for_kupdate		)
 		__field(	char,   for_reclaim		)
 		__field(	char,   range_cyclic		)
@@ -299,7 +298,6 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
 		__entry->pages_skipped	= wbc->pages_skipped;
 		__entry->range_start	= wbc->range_start;
 		__entry->range_end	= wbc->range_end;
-		__entry->nonblocking	= wbc->nonblocking;
 		__entry->for_kupdate	= wbc->for_kupdate;
 		__entry->for_reclaim	= wbc->for_reclaim;
 		__entry->range_cyclic	= wbc->range_cyclic;
@@ -310,13 +308,13 @@ DECLARE_EVENT_CLASS(btrfs__writepage,
 
 	TP_printk("root = %llu(%s), ino = %lu, page_index = %lu, "
 		  "nr_to_write = %ld, pages_skipped = %ld, range_start = %llu, "
-		  "range_end = %llu, nonblocking = %d, for_kupdate = %d, "
+		  "range_end = %llu, for_kupdate = %d, "
 		  "for_reclaim = %d, range_cyclic = %d, writeback_index = %lu",
 		  show_root_type(__entry->root_objectid),
 		  (unsigned long)__entry->ino, __entry->index,
 		  __entry->nr_to_write, __entry->pages_skipped,
 		  __entry->range_start, __entry->range_end,
-		  __entry->nonblocking, __entry->for_kupdate,
+		  __entry->for_kupdate,
 		  __entry->for_reclaim, __entry->range_cyclic,
 		  (unsigned long)__entry->writeback_index)
 );
-- 
cgit v1.2.3


From 251d6a471c831e22880b3c146bb4556ddfb1dc82 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 1 Dec 2010 17:33:37 -0600
Subject: writeback: trace event writeback_single_inode

It is valuable to know how the dirty inodes are iterated and their IO size.

"writeback_single_inode: bdi 8:0: ino=134246746 state=I_DIRTY_SYNC|I_SYNC age=414 index=0 to_write=1024 wrote=0"

- "state" reflects inode->i_state at the end of writeback_single_inode()
- "index" reflects mapping->writeback_index after the ->writepages() call
- "to_write" is the wbc->nr_to_write at entrance of writeback_single_inode()
- "wrote" is the number of pages actually written

v2: add trace event writeback_single_inode_requeue as proposed by Dave.

CC: Dave Chinner <david@fromorbit.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c                |  4 +++
 include/trace/events/writeback.h | 70 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 74 insertions(+)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index f43c479feee9..5185fad48b62 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -346,6 +346,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 		       struct writeback_control *wbc)
 {
 	struct address_space *mapping = inode->i_mapping;
+	long nr_to_write = wbc->nr_to_write;
 	unsigned dirty;
 	int ret;
 
@@ -368,6 +369,8 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 		 */
 		if (wbc->sync_mode != WB_SYNC_ALL) {
 			requeue_io(inode, wb);
+			trace_writeback_single_inode_requeue(inode, wbc,
+							     nr_to_write);
 			return 0;
 		}
 
@@ -467,6 +470,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 		}
 	}
 	inode_sync_complete(inode);
+	trace_writeback_single_inode(inode, wbc, nr_to_write);
 	return ret;
 }
 
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index b2cfac5f3313..898277bc89b4 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -8,6 +8,19 @@
 #include <linux/device.h>
 #include <linux/writeback.h>
 
+#define show_inode_state(state)					\
+	__print_flags(state, "|",				\
+		{I_DIRTY_SYNC,		"I_DIRTY_SYNC"},	\
+		{I_DIRTY_DATASYNC,	"I_DIRTY_DATASYNC"},	\
+		{I_DIRTY_PAGES,		"I_DIRTY_PAGES"},	\
+		{I_NEW,			"I_NEW"},		\
+		{I_WILL_FREE,		"I_WILL_FREE"},		\
+		{I_FREEING,		"I_FREEING"},		\
+		{I_CLEAR,		"I_CLEAR"},		\
+		{I_SYNC,		"I_SYNC"},		\
+		{I_REFERENCED,		"I_REFERENCED"}		\
+	)
+
 struct wb_writeback_work;
 
 DECLARE_EVENT_CLASS(writeback_work_class,
@@ -184,6 +197,63 @@ DEFINE_EVENT(writeback_congest_waited_template, writeback_wait_iff_congested,
 	TP_ARGS(usec_timeout, usec_delayed)
 );
 
+DECLARE_EVENT_CLASS(writeback_single_inode_template,
+
+	TP_PROTO(struct inode *inode,
+		 struct writeback_control *wbc,
+		 unsigned long nr_to_write
+	),
+
+	TP_ARGS(inode, wbc, nr_to_write),
+
+	TP_STRUCT__entry(
+		__array(char, name, 32)
+		__field(unsigned long, ino)
+		__field(unsigned long, state)
+		__field(unsigned long, age)
+		__field(unsigned long, writeback_index)
+		__field(long, nr_to_write)
+		__field(unsigned long, wrote)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name,
+			dev_name(inode->i_mapping->backing_dev_info->dev), 32);
+		__entry->ino		= inode->i_ino;
+		__entry->state		= inode->i_state;
+		__entry->age		= (jiffies - inode->dirtied_when) *
+								1000 / HZ;
+		__entry->writeback_index = inode->i_mapping->writeback_index;
+		__entry->nr_to_write	= nr_to_write;
+		__entry->wrote		= nr_to_write - wbc->nr_to_write;
+	),
+
+	TP_printk("bdi %s: ino=%lu state=%s age=%lu "
+		  "index=%lu to_write=%ld wrote=%lu",
+		  __entry->name,
+		  __entry->ino,
+		  show_inode_state(__entry->state),
+		  __entry->age,
+		  __entry->writeback_index,
+		  __entry->nr_to_write,
+		  __entry->wrote
+	)
+);
+
+DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode_requeue,
+	TP_PROTO(struct inode *inode,
+		 struct writeback_control *wbc,
+		 unsigned long nr_to_write),
+	TP_ARGS(inode, wbc, nr_to_write)
+);
+
+DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
+	TP_PROTO(struct inode *inode,
+		 struct writeback_control *wbc,
+		 unsigned long nr_to_write),
+	TP_ARGS(inode, wbc, nr_to_write)
+);
+
 #endif /* _TRACE_WRITEBACK_H */
 
 /* This part must be outside protection */
-- 
cgit v1.2.3


From e84d0a4f8e39a73003a6ec9a11b07702745f4c1f Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Sat, 23 Apr 2011 12:27:27 -0600
Subject: writeback: trace event writeback_queue_io

Note that it adds a little overhead to account for the inodes moved/enqueued
from b_dirty to b_io. The "moved" accounting may later be used to
limit the number of inodes that can be moved in one shot, in order to
keep spinlock hold time under control.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c                | 14 ++++++++++----
 include/trace/events/writeback.h | 25 +++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5185fad48b62..6caa98247a5b 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -248,15 +248,16 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
 /*
  * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
  */
-static void move_expired_inodes(struct list_head *delaying_queue,
+static int move_expired_inodes(struct list_head *delaying_queue,
 			       struct list_head *dispatch_queue,
-				unsigned long *older_than_this)
+			       unsigned long *older_than_this)
 {
 	LIST_HEAD(tmp);
 	struct list_head *pos, *node;
 	struct super_block *sb = NULL;
 	struct inode *inode;
 	int do_sb_sort = 0;
+	int moved = 0;
 
 	while (!list_empty(delaying_queue)) {
 		inode = wb_inode(delaying_queue->prev);
@@ -267,12 +268,13 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 			do_sb_sort = 1;
 		sb = inode->i_sb;
 		list_move(&inode->i_wb_list, &tmp);
+		moved++;
 	}
 
 	/* just one sb in list, splice to dispatch_queue and we're done */
 	if (!do_sb_sort) {
 		list_splice(&tmp, dispatch_queue);
-		return;
+		goto out;
 	}
 
 	/* Move inodes from one superblock together */
@@ -284,6 +286,8 @@ static void move_expired_inodes(struct list_head *delaying_queue,
 				list_move(&inode->i_wb_list, dispatch_queue);
 		}
 	}
+out:
+	return moved;
 }
 
 /*
@@ -299,9 +303,11 @@ static void move_expired_inodes(struct list_head *delaying_queue,
  */
 static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
 {
+	int moved;
 	assert_spin_locked(&wb->list_lock);
 	list_splice_init(&wb->b_more_io, &wb->b_io);
-	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
+	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
+	trace_writeback_queue_io(wb, older_than_this, moved);
 }
 
 static int write_inode(struct inode *inode, struct writeback_control *wbc)
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 898277bc89b4..205d14919ef2 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -162,6 +162,31 @@ DEFINE_WBC_EVENT(wbc_balance_dirty_written);
 DEFINE_WBC_EVENT(wbc_balance_dirty_wait);
 DEFINE_WBC_EVENT(wbc_writepage);
 
+TRACE_EVENT(writeback_queue_io,
+	TP_PROTO(struct bdi_writeback *wb,
+		 unsigned long *older_than_this,
+		 int moved),
+	TP_ARGS(wb, older_than_this, moved),
+	TP_STRUCT__entry(
+		__array(char,		name, 32)
+		__field(unsigned long,	older)
+		__field(long,		age)
+		__field(int,		moved)
+	),
+	TP_fast_assign(
+		strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
+		__entry->older	= older_than_this ?  *older_than_this : 0;
+		__entry->age	= older_than_this ?
+				  (jiffies - *older_than_this) * 1000 / HZ : -1;
+		__entry->moved	= moved;
+	),
+	TP_printk("bdi %s: older=%lu age=%ld enqueue=%d",
+		__entry->name,
+		__entry->older,	/* older_than_this in jiffies */
+		__entry->age,	/* older_than_this in relative milliseconds */
+		__entry->moved)
+);
+
 DECLARE_EVENT_CLASS(writeback_congest_waited_template,
 
 	TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
-- 
cgit v1.2.3


From 36715cef0770b7e2547892b7c3197fc024274630 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Sat, 11 Jun 2011 17:53:57 -0600
Subject: writeback: skip tmpfs early in balance_dirty_pages_ratelimited_nr()

This helps prevent tmpfs dirtiers from skewing the per-cpu bdp_ratelimits.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 mm/page-writeback.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index b2529f8f8be0..1965d05a29cc 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -490,9 +490,6 @@ static void balance_dirty_pages(struct address_space *mapping,
 	bool dirty_exceeded = false;
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 
-	if (!bdi_cap_account_dirty(bdi))
-		return;
-
 	for (;;) {
 		struct writeback_control wbc = {
 			.sync_mode	= WB_SYNC_NONE,
@@ -631,9 +628,13 @@ static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0;
 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 					unsigned long nr_pages_dirtied)
 {
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	unsigned long ratelimit;
 	unsigned long *p;
 
+	if (!bdi_cap_account_dirty(bdi))
+		return;
+
 	ratelimit = ratelimit_pages;
 	if (mapping->backing_dev_info->dirty_exceeded)
 		ratelimit = 8;
-- 
cgit v1.2.3


From d46db3d58233be4be980eb1e42eebe7808bcabab Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 4 May 2011 19:54:37 -0600
Subject: writeback: make writeback_control.nr_to_write straight

Pass struct wb_writeback_work all the way down to writeback_sb_inodes(),
and initialize the struct writeback_control there.

struct writeback_control is basically designed to control writeback of a
single file, but we keep abusing it for writing multiple files in
writeback_sb_inodes() and its callers.

It immediately cleans things up, e.g. suddenly wbc.nr_to_write vs
work->nr_pages starts to make sense, and instead of saving and restoring
pages_skipped in writeback_sb_inodes() it can always start with a clean
zero value.

It also makes a neat IO pattern change: large dirty files are now
written in the full 4MB writeback chunk size, rather than whatever
quota remained in wbc->nr_to_write.

Acked-by: Jan Kara <jack@suse.cz>
Proposed-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/btrfs/extent_io.c             |   2 -
 fs/fs-writeback.c                | 196 ++++++++++++++++++++++-----------------
 include/linux/writeback.h        |   6 +-
 include/trace/events/writeback.h |  39 +++++---
 mm/backing-dev.c                 |  17 +---
 mm/page-writeback.c              |  17 +---
 6 files changed, 148 insertions(+), 129 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7055d11c1efd..561262d35689 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2551,7 +2551,6 @@ int extent_write_full_page(struct extent_io_tree *tree, struct page *page,
 	};
 	struct writeback_control wbc_writepages = {
 		.sync_mode	= wbc->sync_mode,
-		.older_than_this = NULL,
 		.nr_to_write	= 64,
 		.range_start	= page_offset(page) + PAGE_CACHE_SIZE,
 		.range_end	= (loff_t)-1,
@@ -2584,7 +2583,6 @@ int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode,
 	};
 	struct writeback_control wbc_writepages = {
 		.sync_mode	= mode,
-		.older_than_this = NULL,
 		.nr_to_write	= nr_pages * 2,
 		.range_start	= start,
 		.range_end	= end + 1,
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 6caa98247a5b..2c947da39f6e 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -29,12 +29,22 @@
 #include <linux/tracepoint.h>
 #include "internal.h"
 
+/*
+ * The maximum number of pages to writeout in a single bdi flush/kupdate
+ * operation.  We do this so we don't hold I_SYNC against an inode for
+ * enormous amounts of time, which would block a userspace task which has
+ * been forced to throttle against that inode.  Also, the code reevaluates
+ * the dirty each time it has written this many pages.
+ */
+#define MAX_WRITEBACK_PAGES     1024L
+
 /*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
 struct wb_writeback_work {
 	long nr_pages;
 	struct super_block *sb;
+	unsigned long *older_than_this;
 	enum writeback_sync_modes sync_mode;
 	unsigned int tagged_writepages:1;
 	unsigned int for_kupdate:1;
@@ -472,7 +482,6 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 			 * No need to add it back to the LRU.
 			 */
 			list_del_init(&inode->i_wb_list);
-			wbc->inodes_written++;
 		}
 	}
 	inode_sync_complete(inode);
@@ -506,6 +515,31 @@ static bool pin_sb_for_writeback(struct super_block *sb)
 	return false;
 }
 
+static long writeback_chunk_size(struct wb_writeback_work *work)
+{
+	long pages;
+
+	/*
+	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
+	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
+	 * here avoids calling into writeback_inodes_wb() more than once.
+	 *
+	 * The intended call sequence for WB_SYNC_ALL writeback is:
+	 *
+	 *      wb_writeback()
+	 *          writeback_sb_inodes()       <== called only once
+	 *              write_cache_pages()     <== called once for each inode
+	 *                   (quickly) tag currently dirty pages
+	 *                   (maybe slowly) sync all tagged pages
+	 */
+	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
+		pages = LONG_MAX;
+	else
+		pages = min(MAX_WRITEBACK_PAGES, work->nr_pages);
+
+	return pages;
+}
+
 /*
  * Write a portion of b_io inodes which belong to @sb.
  *
@@ -513,18 +547,30 @@ static bool pin_sb_for_writeback(struct super_block *sb)
  * inodes. Otherwise write only ones which go sequentially
  * in reverse order.
  *
- * Return 1, if the caller writeback routine should be
- * interrupted. Otherwise return 0.
+ * Return the number of pages and/or inodes written.
  */
-static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
-		struct writeback_control *wbc, bool only_this_sb)
+static long writeback_sb_inodes(struct super_block *sb,
+				struct bdi_writeback *wb,
+				struct wb_writeback_work *work)
 {
+	struct writeback_control wbc = {
+		.sync_mode		= work->sync_mode,
+		.tagged_writepages	= work->tagged_writepages,
+		.for_kupdate		= work->for_kupdate,
+		.for_background		= work->for_background,
+		.range_cyclic		= work->range_cyclic,
+		.range_start		= 0,
+		.range_end		= LLONG_MAX,
+	};
+	unsigned long start_time = jiffies;
+	long write_chunk;
+	long wrote = 0;  /* count both pages and inodes */
+
 	while (!list_empty(&wb->b_io)) {
-		long pages_skipped;
 		struct inode *inode = wb_inode(wb->b_io.prev);
 
 		if (inode->i_sb != sb) {
-			if (only_this_sb) {
+			if (work->sb) {
 				/*
 				 * We only want to write back data for this
 				 * superblock, move all inodes not belonging
@@ -539,7 +585,7 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 			 * Bounce back to the caller to unpin this and
 			 * pin the next superblock.
 			 */
-			return 0;
+			break;
 		}
 
 		/*
@@ -553,12 +599,18 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 			requeue_io(inode, wb);
 			continue;
 		}
-
 		__iget(inode);
+		write_chunk = writeback_chunk_size(work);
+		wbc.nr_to_write = write_chunk;
+		wbc.pages_skipped = 0;
+
+		writeback_single_inode(inode, wb, &wbc);
 
-		pages_skipped = wbc->pages_skipped;
-		writeback_single_inode(inode, wb, wbc);
-		if (wbc->pages_skipped != pages_skipped) {
+		work->nr_pages -= write_chunk - wbc.nr_to_write;
+		wrote += write_chunk - wbc.nr_to_write;
+		if (!(inode->i_state & I_DIRTY))
+			wrote++;
+		if (wbc.pages_skipped) {
 			/*
 			 * writeback is not making progress due to locked
 			 * buffers.  Skip this inode for now.
@@ -570,17 +622,25 @@ static int writeback_sb_inodes(struct super_block *sb, struct bdi_writeback *wb,
 		iput(inode);
 		cond_resched();
 		spin_lock(&wb->list_lock);
-		if (wbc->nr_to_write <= 0)
-			return 1;
+		/*
+		 * bail out to wb_writeback() often enough to check
+		 * background threshold and other termination conditions.
+		 */
+		if (wrote) {
+			if (time_is_before_jiffies(start_time + HZ / 10UL))
+				break;
+			if (work->nr_pages <= 0)
+				break;
+		}
 	}
-	/* b_io is empty */
-	return 1;
+	return wrote;
 }
 
-static void __writeback_inodes_wb(struct bdi_writeback *wb,
-				  struct writeback_control *wbc)
+static long __writeback_inodes_wb(struct bdi_writeback *wb,
+				  struct wb_writeback_work *work)
 {
-	int ret = 0;
+	unsigned long start_time = jiffies;
+	long wrote = 0;
 
 	while (!list_empty(&wb->b_io)) {
 		struct inode *inode = wb_inode(wb->b_io.prev);
@@ -590,33 +650,37 @@ static void __writeback_inodes_wb(struct bdi_writeback *wb,
 			requeue_io(inode, wb);
 			continue;
 		}
-		ret = writeback_sb_inodes(sb, wb, wbc, false);
+		wrote += writeback_sb_inodes(sb, wb, work);
 		drop_super(sb);
 
-		if (ret)
-			break;
+		/* refer to the same tests at the end of writeback_sb_inodes */
+		if (wrote) {
+			if (time_is_before_jiffies(start_time + HZ / 10UL))
+				break;
+			if (work->nr_pages <= 0)
+				break;
+		}
 	}
 	/* Leave any unwritten inodes on b_io */
+	return wrote;
 }
 
-void writeback_inodes_wb(struct bdi_writeback *wb,
-		struct writeback_control *wbc)
+long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages)
 {
+	struct wb_writeback_work work = {
+		.nr_pages	= nr_pages,
+		.sync_mode	= WB_SYNC_NONE,
+		.range_cyclic	= 1,
+	};
+
 	spin_lock(&wb->list_lock);
 	if (list_empty(&wb->b_io))
-		queue_io(wb, wbc->older_than_this);
-	__writeback_inodes_wb(wb, wbc);
+		queue_io(wb, NULL);
+	__writeback_inodes_wb(wb, &work);
 	spin_unlock(&wb->list_lock);
-}
 
-/*
- * The maximum number of pages to writeout in a single bdi flush/kupdate
- * operation.  We do this so we don't hold I_SYNC against an inode for
- * enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode.  Also, the code reevaluates
- * the dirty each time it has written this many pages.
- */
-#define MAX_WRITEBACK_PAGES     1024
+	return nr_pages - work.nr_pages;
+}
 
 static inline bool over_bground_thresh(void)
 {
@@ -646,42 +710,13 @@ static inline bool over_bground_thresh(void)
 static long wb_writeback(struct bdi_writeback *wb,
 			 struct wb_writeback_work *work)
 {
-	struct writeback_control wbc = {
-		.sync_mode		= work->sync_mode,
-		.tagged_writepages	= work->tagged_writepages,
-		.older_than_this	= NULL,
-		.for_kupdate		= work->for_kupdate,
-		.for_background		= work->for_background,
-		.range_cyclic		= work->range_cyclic,
-	};
+	long nr_pages = work->nr_pages;
 	unsigned long oldest_jif;
-	long wrote = 0;
-	long write_chunk = MAX_WRITEBACK_PAGES;
 	struct inode *inode;
-
-	if (!wbc.range_cyclic) {
-		wbc.range_start = 0;
-		wbc.range_end = LLONG_MAX;
-	}
-
-	/*
-	 * WB_SYNC_ALL mode does livelock avoidance by syncing dirty
-	 * inodes/pages in one big loop. Setting wbc.nr_to_write=LONG_MAX
-	 * here avoids calling into writeback_inodes_wb() more than once.
-	 *
-	 * The intended call sequence for WB_SYNC_ALL writeback is:
-	 *
-	 *      wb_writeback()
-	 *          writeback_sb_inodes()       <== called only once
-	 *              write_cache_pages()     <== called once for each inode
-	 *                   (quickly) tag currently dirty pages
-	 *                   (maybe slowly) sync all tagged pages
-	 */
-	if (wbc.sync_mode == WB_SYNC_ALL || wbc.tagged_writepages)
-		write_chunk = LONG_MAX;
+	long progress;
 
 	oldest_jif = jiffies;
-	wbc.older_than_this = &oldest_jif;
+	work->older_than_this = &oldest_jif;
 
 	spin_lock(&wb->list_lock);
 	for (;;) {
@@ -711,24 +746,17 @@ static long wb_writeback(struct bdi_writeback *wb,
 		if (work->for_kupdate) {
 			oldest_jif = jiffies -
 				msecs_to_jiffies(dirty_expire_interval * 10);
-			wbc.older_than_this = &oldest_jif;
+			work->older_than_this = &oldest_jif;
 		}
 
-		wbc.nr_to_write = write_chunk;
-		wbc.pages_skipped = 0;
-		wbc.inodes_written = 0;
-
-		trace_wbc_writeback_start(&wbc, wb->bdi);
+		trace_writeback_start(wb->bdi, work);
 		if (list_empty(&wb->b_io))
-			queue_io(wb, wbc.older_than_this);
+			queue_io(wb, work->older_than_this);
 		if (work->sb)
-			writeback_sb_inodes(work->sb, wb, &wbc, true);
+			progress = writeback_sb_inodes(work->sb, wb, work);
 		else
-			__writeback_inodes_wb(wb, &wbc);
-		trace_wbc_writeback_written(&wbc, wb->bdi);
-
-		work->nr_pages -= write_chunk - wbc.nr_to_write;
-		wrote += write_chunk - wbc.nr_to_write;
+			progress = __writeback_inodes_wb(wb, work);
+		trace_writeback_written(wb->bdi, work);
 
 		/*
 		 * Did we write something? Try for more
@@ -738,9 +766,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * mean the overall work is done. So we keep looping as long
 		 * as made some progress on cleaning pages or inodes.
 		 */
-		if (wbc.nr_to_write < write_chunk)
-			continue;
-		if (wbc.inodes_written)
+		if (progress)
 			continue;
 		/*
 		 * No more inodes for IO, bail
@@ -753,8 +779,8 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * we'll just busyloop.
 		 */
 		if (!list_empty(&wb->b_more_io))  {
+			trace_writeback_wait(wb->bdi, work);
 			inode = wb_inode(wb->b_more_io.prev);
-			trace_wbc_writeback_wait(&wbc, wb->bdi);
 			spin_lock(&inode->i_lock);
 			inode_wait_for_writeback(inode, wb);
 			spin_unlock(&inode->i_lock);
@@ -762,7 +788,7 @@ static long wb_writeback(struct bdi_writeback *wb,
 	}
 	spin_unlock(&wb->list_lock);
 
-	return wrote;
+	return nr_pages - work->nr_pages;
 }
 
 /*
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 2f1b512bd6e0..df1b7f18f100 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -24,12 +24,9 @@ enum writeback_sync_modes {
  */
 struct writeback_control {
 	enum writeback_sync_modes sync_mode;
-	unsigned long *older_than_this;	/* If !NULL, only write back inodes
-					   older than this */
 	long nr_to_write;		/* Write this many pages, and decrement
 					   this for each page written */
 	long pages_skipped;		/* Pages which were not written */
-	long inodes_written;		/* # of inodes written (at least) */
 
 	/*
 	 * For a_ops->writepages(): is start or end are non-zero then this is
@@ -56,8 +53,7 @@ void writeback_inodes_sb_nr(struct super_block *, unsigned long nr);
 int writeback_inodes_sb_if_idle(struct super_block *);
 int writeback_inodes_sb_nr_if_idle(struct super_block *, unsigned long nr);
 void sync_inodes_sb(struct super_block *);
-void writeback_inodes_wb(struct bdi_writeback *wb,
-		struct writeback_control *wbc);
+long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages);
 long wb_do_writeback(struct bdi_writeback *wb, int force_wait);
 void wakeup_flusher_threads(long nr_pages);
 
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 205d14919ef2..3e7662a0cfa3 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -62,6 +62,9 @@ DEFINE_EVENT(writeback_work_class, name, \
 DEFINE_WRITEBACK_WORK_EVENT(writeback_nothread);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
+DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
+DEFINE_WRITEBACK_WORK_EVENT(writeback_written);
+DEFINE_WRITEBACK_WORK_EVENT(writeback_wait);
 
 TRACE_EVENT(writeback_pages_written,
 	TP_PROTO(long pages_written),
@@ -101,6 +104,30 @@ DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
 DEFINE_WRITEBACK_EVENT(writeback_bdi_unregister);
 DEFINE_WRITEBACK_EVENT(writeback_thread_start);
 DEFINE_WRITEBACK_EVENT(writeback_thread_stop);
+DEFINE_WRITEBACK_EVENT(balance_dirty_start);
+DEFINE_WRITEBACK_EVENT(balance_dirty_wait);
+
+TRACE_EVENT(balance_dirty_written,
+
+	TP_PROTO(struct backing_dev_info *bdi, int written),
+
+	TP_ARGS(bdi, written),
+
+	TP_STRUCT__entry(
+		__array(char,	name, 32)
+		__field(int,	written)
+	),
+
+	TP_fast_assign(
+		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		__entry->written = written;
+	),
+
+	TP_printk("bdi %s written %d",
+		  __entry->name,
+		  __entry->written
+	)
+);
 
 DECLARE_EVENT_CLASS(wbc_class,
 	TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
@@ -114,7 +141,6 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__field(int, for_background)
 		__field(int, for_reclaim)
 		__field(int, range_cyclic)
-		__field(unsigned long, older_than_this)
 		__field(long, range_start)
 		__field(long, range_end)
 	),
@@ -128,14 +154,12 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__entry->for_background	= wbc->for_background;
 		__entry->for_reclaim	= wbc->for_reclaim;
 		__entry->range_cyclic	= wbc->range_cyclic;
-		__entry->older_than_this = wbc->older_than_this ?
-						*wbc->older_than_this : 0;
 		__entry->range_start	= (long)wbc->range_start;
 		__entry->range_end	= (long)wbc->range_end;
 	),
 
 	TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
-		"bgrd=%d reclm=%d cyclic=%d older=0x%lx "
+		"bgrd=%d reclm=%d cyclic=%d "
 		"start=0x%lx end=0x%lx",
 		__entry->name,
 		__entry->nr_to_write,
@@ -145,7 +169,6 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__entry->for_background,
 		__entry->for_reclaim,
 		__entry->range_cyclic,
-		__entry->older_than_this,
 		__entry->range_start,
 		__entry->range_end)
 )
@@ -154,12 +177,6 @@ DECLARE_EVENT_CLASS(wbc_class,
 DEFINE_EVENT(wbc_class, name, \
 	TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi), \
 	TP_ARGS(wbc, bdi))
-DEFINE_WBC_EVENT(wbc_writeback_start);
-DEFINE_WBC_EVENT(wbc_writeback_written);
-DEFINE_WBC_EVENT(wbc_writeback_wait);
-DEFINE_WBC_EVENT(wbc_balance_dirty_start);
-DEFINE_WBC_EVENT(wbc_balance_dirty_written);
-DEFINE_WBC_EVENT(wbc_balance_dirty_wait);
 DEFINE_WBC_EVENT(wbc_writepage);
 
 TRACE_EVENT(writeback_queue_io,
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 5f6553ef1ba7..7ba303be5e03 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -260,18 +260,6 @@ int bdi_has_dirty_io(struct backing_dev_info *bdi)
 	return wb_has_dirty_io(&bdi->wb);
 }
 
-static void bdi_flush_io(struct backing_dev_info *bdi)
-{
-	struct writeback_control wbc = {
-		.sync_mode		= WB_SYNC_NONE,
-		.older_than_this	= NULL,
-		.range_cyclic		= 1,
-		.nr_to_write		= 1024,
-	};
-
-	writeback_inodes_wb(&bdi->wb, &wbc);
-}
-
 /*
  * kupdated() used to do this. We cannot do it from the bdi_forker_thread()
  * or we risk deadlocking on ->s_umount. The longer term solution would be
@@ -457,9 +445,10 @@ static int bdi_forker_thread(void *ptr)
 			if (IS_ERR(task)) {
 				/*
 				 * If thread creation fails, force writeout of
-				 * the bdi from the thread.
+				 * the bdi from the thread. Hopefully 1024 is
+				 * large enough for efficient IO.
 				 */
-				bdi_flush_io(bdi);
+				writeback_inodes_wb(&bdi->wb, 1024);
 			} else {
 				/*
 				 * The spinlock makes sure we do not lose
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 1965d05a29cc..9d6ac2b6d942 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -491,13 +491,6 @@ static void balance_dirty_pages(struct address_space *mapping,
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 
 	for (;;) {
-		struct writeback_control wbc = {
-			.sync_mode	= WB_SYNC_NONE,
-			.older_than_this = NULL,
-			.nr_to_write	= write_chunk,
-			.range_cyclic	= 1,
-		};
-
 		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
 					global_page_state(NR_UNSTABLE_NFS);
 		nr_writeback = global_page_state(NR_WRITEBACK);
@@ -559,17 +552,17 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * threshold otherwise wait until the disk writes catch
 		 * up.
 		 */
-		trace_wbc_balance_dirty_start(&wbc, bdi);
+		trace_balance_dirty_start(bdi);
 		if (bdi_nr_reclaimable > bdi_thresh) {
-			writeback_inodes_wb(&bdi->wb, &wbc);
-			pages_written += write_chunk - wbc.nr_to_write;
-			trace_wbc_balance_dirty_written(&wbc, bdi);
+			pages_written += writeback_inodes_wb(&bdi->wb,
+							     write_chunk);
+			trace_balance_dirty_written(bdi, pages_written);
 			if (pages_written >= write_chunk)
 				break;		/* We've done our duty */
 		}
-		trace_wbc_balance_dirty_wait(&wbc, bdi);
 		__set_current_state(TASK_UNINTERRUPTIBLE);
 		io_schedule_timeout(pause);
+		trace_balance_dirty_wait(bdi);
 
 		/*
 		 * Increase the delay for each loop, up to our previous
-- 
cgit v1.2.3


From f7d2b1ecd0c714adefc7d3a942ef87beb828a763 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 8 Dec 2010 22:44:24 -0600
Subject: writeback: account per-bdi accumulated written pages

Introduce the BDI_WRITTEN counter. It will be used for estimating the
bdi's write bandwidth.

Peter Zijlstra <a.p.zijlstra@chello.nl>:
Move BDI_WRITTEN accounting into __bdi_writeout_inc().
This will cover and fix fuse, which only calls bdi_writeout_inc().

CC: Michael Rubin <mrubin@google.com>
Reviewed-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 include/linux/backing-dev.h |  1 +
 mm/backing-dev.c            | 10 ++++++++--
 mm/page-writeback.c         |  1 +
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 47feb2c4706a..469d56443c63 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -40,6 +40,7 @@ typedef int (congested_fn)(void *, int);
 enum bdi_stat_item {
 	BDI_RECLAIMABLE,
 	BDI_WRITEBACK,
+	BDI_WRITTEN,
 	NR_BDI_STAT_ITEMS
 };
 
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 7ba303be5e03..83f18a1d9d10 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -97,6 +97,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   "BdiDirtyThresh:   %8lu kB\n"
 		   "DirtyThresh:      %8lu kB\n"
 		   "BackgroundThresh: %8lu kB\n"
+		   "BdiWritten:       %8lu kB\n"
 		   "b_dirty:          %8lu\n"
 		   "b_io:             %8lu\n"
 		   "b_more_io:        %8lu\n"
@@ -104,8 +105,13 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   "state:            %8lx\n",
 		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
 		   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
-		   K(bdi_thresh), K(dirty_thresh),
-		   K(background_thresh), nr_dirty, nr_io, nr_more_io,
+		   K(bdi_thresh),
+		   K(dirty_thresh),
+		   K(background_thresh),
+		   (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
+		   nr_dirty,
+		   nr_io,
+		   nr_more_io,
 		   !list_empty(&bdi->bdi_list), bdi->state);
 #undef K
 
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 9d6ac2b6d942..8cd71376c63d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -219,6 +219,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
  */
 static inline void __bdi_writeout_inc(struct backing_dev_info *bdi)
 {
+	__inc_bdi_stat(bdi, BDI_WRITTEN);
 	__prop_inc_percpu_max(&vm_completions, &bdi->completions,
 			      bdi->max_prop_frac);
 }
-- 
cgit v1.2.3


From e98be2d599207c6b31e9bb340d52a231b2f3662d Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Sun, 29 Aug 2010 11:22:30 -0600
Subject: writeback: bdi write bandwidth estimation

The estimated value will start from 100MB/s and adapt to the real
bandwidth within seconds.

It tries to update the bandwidth only when disk is fully utilized.
Any inactive period of more than one second will be skipped.

The estimated bandwidth will reflect how fast the device can
write out when _fully utilized_, and won't drop to 0 when it goes idle.
The value will remain constant at disk idle time. At busy write time, if
not considering fluctuations, it will also remain high unless it is knocked
down by possible concurrent reads that compete for the disk time and
bandwidth with async writes.

The estimation is not done purely in the flusher because there is no
guarantee that write_cache_pages() returns in time to update the bandwidth.

The bdi->avg_write_bandwidth smoothing is very effective for filtering
out sudden spikes; however, it may be a little biased in the long term.

The overheads are low because the bdi bandwidth update only occurs at
200ms intervals.

The 200ms update interval is suitable, because it's not possible to get
the real instantaneous bandwidth at all, due to large fluctuations.

The NFS commits can be as large as seconds worth of data. One XFS
completion may be as large as half a second's worth of data if we are going
to increase the write chunk to half a second's worth of data. In ext4,
fluctuations with a time period of around 5 seconds are observed. And there
is another pattern of irregular periods of up to 20 seconds on SSD tests.

That's why we are not only doing the estimation at 200ms intervals, but
also averaging them over a period of 3 seconds and then going further to do
another level of smoothing in avg_write_bandwidth.

CC: Li Shaohua <shaohua.li@intel.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c           | 13 +++++++
 include/linux/backing-dev.h |  5 +++
 include/linux/writeback.h   |  3 ++
 mm/backing-dev.c            | 12 +++++++
 mm/page-writeback.c         | 87 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 120 insertions(+)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2c947da39f6e..5826992910e9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -692,6 +692,16 @@ static inline bool over_bground_thresh(void)
 		global_page_state(NR_UNSTABLE_NFS) > background_thresh);
 }
 
+/*
+ * Called under wb->list_lock. If there are multiple wb per bdi,
+ * only the flusher working on the first wb should do it.
+ */
+static void wb_update_bandwidth(struct bdi_writeback *wb,
+				unsigned long start_time)
+{
+	__bdi_update_bandwidth(wb->bdi, start_time);
+}
+
 /*
  * Explicit flushing or periodic writeback of "old" data.
  *
@@ -710,6 +720,7 @@ static inline bool over_bground_thresh(void)
 static long wb_writeback(struct bdi_writeback *wb,
 			 struct wb_writeback_work *work)
 {
+	unsigned long wb_start = jiffies;
 	long nr_pages = work->nr_pages;
 	unsigned long oldest_jif;
 	struct inode *inode;
@@ -758,6 +769,8 @@ static long wb_writeback(struct bdi_writeback *wb,
 			progress = __writeback_inodes_wb(wb, work);
 		trace_writeback_written(wb->bdi, work);
 
+		wb_update_bandwidth(wb, wb_start);
+
 		/*
 		 * Did we write something? Try for more
 		 *
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 469d56443c63..a008982e7c08 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -73,6 +73,11 @@ struct backing_dev_info {
 
 	struct percpu_counter bdi_stat[NR_BDI_STAT_ITEMS];
 
+	unsigned long bw_time_stamp;	/* last time write bw is updated */
+	unsigned long written_stamp;	/* pages written at bw_time_stamp */
+	unsigned long write_bandwidth;	/* the estimated write bandwidth */
+	unsigned long avg_write_bandwidth; /* further smoothed write bw */
+
 	struct prop_local_percpu completions;
 	int dirty_exceeded;
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index df1b7f18f100..66862f2d90c8 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -118,6 +118,9 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty);
 unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
 			       unsigned long dirty);
 
+void __bdi_update_bandwidth(struct backing_dev_info *bdi,
+			    unsigned long start_time);
+
 void page_writeback_init(void);
 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
 					unsigned long nr_pages_dirtied);
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 83f18a1d9d10..a76cdd160277 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -638,6 +638,11 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi);
 }
 
+/*
+ * Initial write bandwidth: 100 MB/s
+ */
+#define INIT_BW		(100 << (20 - PAGE_SHIFT))
+
 int bdi_init(struct backing_dev_info *bdi)
 {
 	int i, err;
@@ -660,6 +665,13 @@ int bdi_init(struct backing_dev_info *bdi)
 	}
 
 	bdi->dirty_exceeded = 0;
+
+	bdi->bw_time_stamp = jiffies;
+	bdi->written_stamp = 0;
+
+	bdi->write_bandwidth = INIT_BW;
+	bdi->avg_write_bandwidth = INIT_BW;
+
 	err = prop_local_init_percpu(&bdi->completions);
 
 	if (err) {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 8cd71376c63d..446bdf7b975b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,6 +36,11 @@
 #include <linux/pagevec.h>
 #include <trace/events/writeback.h>
 
+/*
+ * Estimate write bandwidth at 200ms intervals.
+ */
+#define BANDWIDTH_INTERVAL	max(HZ/5, 1)
+
 /*
  * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited
  * will look to see if it needs to force writeback or throttling.
@@ -471,6 +476,85 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
 	return bdi_dirty;
 }
 
+static void bdi_update_write_bandwidth(struct backing_dev_info *bdi,
+				       unsigned long elapsed,
+				       unsigned long written)
+{
+	const unsigned long period = roundup_pow_of_two(3 * HZ);
+	unsigned long avg = bdi->avg_write_bandwidth;
+	unsigned long old = bdi->write_bandwidth;
+	u64 bw;
+
+	/*
+	 * bw = written * HZ / elapsed
+	 *
+	 *                   bw * elapsed + write_bandwidth * (period - elapsed)
+	 * write_bandwidth = ---------------------------------------------------
+	 *                                          period
+	 */
+	bw = written - bdi->written_stamp;
+	bw *= HZ;
+	if (unlikely(elapsed > period)) {
+		do_div(bw, elapsed);
+		avg = bw;
+		goto out;
+	}
+	bw += (u64)bdi->write_bandwidth * (period - elapsed);
+	bw >>= ilog2(period);
+
+	/*
+	 * one more level of smoothing, for filtering out sudden spikes
+	 */
+	if (avg > old && old >= (unsigned long)bw)
+		avg -= (avg - old) >> 3;
+
+	if (avg < old && old <= (unsigned long)bw)
+		avg += (old - avg) >> 3;
+
+out:
+	bdi->write_bandwidth = bw;
+	bdi->avg_write_bandwidth = avg;
+}
+
+void __bdi_update_bandwidth(struct backing_dev_info *bdi,
+			    unsigned long start_time)
+{
+	unsigned long now = jiffies;
+	unsigned long elapsed = now - bdi->bw_time_stamp;
+	unsigned long written;
+
+	/*
+	 * rate-limit, only update once every 200ms.
+	 */
+	if (elapsed < BANDWIDTH_INTERVAL)
+		return;
+
+	written = percpu_counter_read(&bdi->bdi_stat[BDI_WRITTEN]);
+
+	/*
+	 * Skip quiet periods when disk bandwidth is under-utilized.
+	 * (at least 1s idle time between two flusher runs)
+	 */
+	if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
+		goto snapshot;
+
+	bdi_update_write_bandwidth(bdi, elapsed, written);
+
+snapshot:
+	bdi->written_stamp = written;
+	bdi->bw_time_stamp = now;
+}
+
+static void bdi_update_bandwidth(struct backing_dev_info *bdi,
+				 unsigned long start_time)
+{
+	if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
+		return;
+	spin_lock(&bdi->wb.list_lock);
+	__bdi_update_bandwidth(bdi, start_time);
+	spin_unlock(&bdi->wb.list_lock);
+}
+
 /*
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data.  It looks at the number of dirty pages in the machine and will force
@@ -490,6 +574,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 	unsigned long pause = 1;
 	bool dirty_exceeded = false;
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
+	unsigned long start_time = jiffies;
 
 	for (;;) {
 		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
@@ -544,6 +629,8 @@ static void balance_dirty_pages(struct address_space *mapping,
 		if (!bdi->dirty_exceeded)
 			bdi->dirty_exceeded = 1;
 
+		bdi_update_bandwidth(bdi, start_time);
+
 		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
 		 * Unstable writes are a feature of certain networked
 		 * filesystems (i.e. NFS) in which data may have been
-- 
cgit v1.2.3


From 00821b002df7da867bb2c15b4f83f3706371383f Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Sun, 29 Aug 2010 11:28:45 -0600
Subject: writeback: show bdi write bandwidth in debugfs

Add a "BdiWriteBandwidth" entry and indent others in /debug/bdi/*/stats.

Also increase the numeric field width to 10, to keep the possibly
huge BdiWritten number aligned, at least on desktop systems.

Impact: this could break user space tools if they are dumb enough to
depend on the number of white spaces.

CC: Theodore Ts'o <tytso@mit.edu>
CC: Jan Kara <jack@suse.cz>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 mm/backing-dev.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index a76cdd160277..ddd0345e2e6d 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -92,23 +92,25 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 
 #define K(x) ((x) << (PAGE_SHIFT - 10))
 	seq_printf(m,
-		   "BdiWriteback:     %8lu kB\n"
-		   "BdiReclaimable:   %8lu kB\n"
-		   "BdiDirtyThresh:   %8lu kB\n"
-		   "DirtyThresh:      %8lu kB\n"
-		   "BackgroundThresh: %8lu kB\n"
-		   "BdiWritten:       %8lu kB\n"
-		   "b_dirty:          %8lu\n"
-		   "b_io:             %8lu\n"
-		   "b_more_io:        %8lu\n"
-		   "bdi_list:         %8u\n"
-		   "state:            %8lx\n",
+		   "BdiWriteback:       %10lu kB\n"
+		   "BdiReclaimable:     %10lu kB\n"
+		   "BdiDirtyThresh:     %10lu kB\n"
+		   "DirtyThresh:        %10lu kB\n"
+		   "BackgroundThresh:   %10lu kB\n"
+		   "BdiWritten:         %10lu kB\n"
+		   "BdiWriteBandwidth:  %10lu kBps\n"
+		   "b_dirty:            %10lu\n"
+		   "b_io:               %10lu\n"
+		   "b_more_io:          %10lu\n"
+		   "bdi_list:           %10u\n"
+		   "state:              %10lx\n",
 		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
 		   (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)),
 		   K(bdi_thresh),
 		   K(dirty_thresh),
 		   K(background_thresh),
 		   (unsigned long) K(bdi_stat(bdi, BDI_WRITTEN)),
+		   (unsigned long) K(bdi->write_bandwidth),
 		   nr_dirty,
 		   nr_io,
 		   nr_more_io,
-- 
cgit v1.2.3


From 7762741e3af69720186802e945229b6a5afd5c49 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Sun, 12 Sep 2010 13:34:05 -0600
Subject: writeback: consolidate variable names in balance_dirty_pages()

Introduce

	nr_dirty = NR_FILE_DIRTY + NR_WRITEBACK + NR_UNSTABLE_NFS

in order to simplify many tests in the following patches.

balance_dirty_pages() will eventually care only about the dirty sums
besides nr_writeback.

Acked-by: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 mm/page-writeback.c | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 446bdf7b975b..5f3e1b46ace5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -565,8 +565,9 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi,
 static void balance_dirty_pages(struct address_space *mapping,
 				unsigned long write_chunk)
 {
-	long nr_reclaimable, bdi_nr_reclaimable;
-	long nr_writeback, bdi_nr_writeback;
+	unsigned long nr_reclaimable, bdi_nr_reclaimable;
+	unsigned long nr_dirty;  /* = file_dirty + writeback + unstable_nfs */
+	unsigned long bdi_dirty;
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
@@ -579,7 +580,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 	for (;;) {
 		nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
 					global_page_state(NR_UNSTABLE_NFS);
-		nr_writeback = global_page_state(NR_WRITEBACK);
+		nr_dirty = nr_reclaimable + global_page_state(NR_WRITEBACK);
 
 		global_dirty_limits(&background_thresh, &dirty_thresh);
 
@@ -588,8 +589,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * catch-up. This avoids (excessively) small writeouts
 		 * when the bdi limits are ramping up.
 		 */
-		if (nr_reclaimable + nr_writeback <=
-				(background_thresh + dirty_thresh) / 2)
+		if (nr_dirty <= (background_thresh + dirty_thresh) / 2)
 			break;
 
 		bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
@@ -607,10 +607,12 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 */
 		if (bdi_thresh < 2*bdi_stat_error(bdi)) {
 			bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
-			bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
+			bdi_dirty = bdi_nr_reclaimable +
+				    bdi_stat_sum(bdi, BDI_WRITEBACK);
 		} else {
 			bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
-			bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
+			bdi_dirty = bdi_nr_reclaimable +
+				    bdi_stat(bdi, BDI_WRITEBACK);
 		}
 
 		/*
@@ -619,9 +621,8 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * bdi or process from holding back light ones; The latter is
 		 * the last resort safeguard.
 		 */
-		dirty_exceeded =
-			(bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh)
-			|| (nr_reclaimable + nr_writeback > dirty_thresh);
+		dirty_exceeded = (bdi_dirty > bdi_thresh) ||
+				  (nr_dirty > dirty_thresh);
 
 		if (!dirty_exceeded)
 			break;
-- 
cgit v1.2.3


From c42843f2f0bbc9d716a32caf667d18fc2bf3bc4c Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Wed, 2 Mar 2011 15:54:09 -0600
Subject: writeback: introduce smoothed global dirty limit

The start of a heavy weight application (e.g. KVM) may instantly knock
down determine_dirtyable_memory() if swap is not enabled or is full.
global_dirty_limits() and bdi_dirty_limit() will in turn get global/bdi
dirty thresholds that are _much_ lower than the global/bdi dirty pages.

balance_dirty_pages() will then heavily throttle all dirtiers including
the light ones, until the dirty pages drop below the new dirty thresholds.
During this _deep_ dirty-exceeded state, the system may appear rather
unresponsive to the users.

About "deep" dirty-exceeded: task_dirty_limit() assigns 1/8 lower dirty
threshold to heavy dirtiers than light ones, and the dirty pages will
be throttled around the heavy dirtiers' dirty threshold and reasonably
below the light dirtiers' dirty threshold. In this state, only the heavy
dirtiers will be throttled and the dirty pages are carefully controlled
to not exceed the light dirtiers' dirty threshold. However if the
threshold itself suddenly drops below the number of dirty pages, the
light dirtiers will get heavily throttled.

So introduce global_dirty_limit for tracking the global dirty threshold
with policies

- follow downwards slowly
- follow up in one shot

global_dirty_limit can effectively mask out the impact of a sudden drop
of dirtyable memory. It will be used in the next patch for two new types
of dirty limits. Note that the new dirty limits are not going to avoid
throttling the light dirtiers, but could limit their sleep time to 200ms.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c         |  2 +-
 include/linux/writeback.h |  6 ++++
 mm/page-writeback.c       | 74 +++++++++++++++++++++++++++++++++++++++++++++--
 3 files changed, 79 insertions(+), 3 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 5826992910e9..227ff12257f3 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -699,7 +699,7 @@ static inline bool over_bground_thresh(void)
 static void wb_update_bandwidth(struct bdi_writeback *wb,
 				unsigned long start_time)
 {
-	__bdi_update_bandwidth(wb->bdi, start_time);
+	__bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, start_time);
 }
 
 /*
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 66862f2d90c8..e9d371b6053b 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -84,6 +84,8 @@ static inline void laptop_sync_completion(void) { }
 #endif
 void throttle_vm_writeout(gfp_t gfp_mask);
 
+extern unsigned long global_dirty_limit;
+
 /* These are exported to sysctl. */
 extern int dirty_background_ratio;
 extern unsigned long dirty_background_bytes;
@@ -119,6 +121,10 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi,
 			       unsigned long dirty);
 
 void __bdi_update_bandwidth(struct backing_dev_info *bdi,
+			    unsigned long thresh,
+			    unsigned long dirty,
+			    unsigned long bdi_thresh,
+			    unsigned long bdi_dirty,
 			    unsigned long start_time);
 
 void page_writeback_init(void);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 5f3e1b46ace5..da959952b9f5 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -116,6 +116,7 @@ EXPORT_SYMBOL(laptop_mode);
 
 /* End of sysctl-exported parameters */
 
+unsigned long global_dirty_limit;
 
 /*
  * Scale the writeback cache size proportional to the relative writeout speeds.
@@ -516,7 +517,67 @@ out:
 	bdi->avg_write_bandwidth = avg;
 }
 
+/*
+ * The global dirtyable memory and dirty threshold could be suddenly knocked
+ * down by a large amount (eg. on the startup of KVM in a swapless system).
+ * This may throw the system into deep dirty exceeded state and throttle
+ * heavy/light dirtiers alike. To retain good responsiveness, maintain
+ * global_dirty_limit for tracking slowly down to the knocked down dirty
+ * threshold.
+ */
+static void update_dirty_limit(unsigned long thresh, unsigned long dirty)
+{
+	unsigned long limit = global_dirty_limit;
+
+	/*
+	 * Follow up in one step.
+	 */
+	if (limit < thresh) {
+		limit = thresh;
+		goto update;
+	}
+
+	/*
+	 * Follow down slowly. Use the higher one as the target, because thresh
+	 * may drop below dirty. This is exactly the reason to introduce
+	 * global_dirty_limit which is guaranteed to lie above the dirty pages.
+	 */
+	thresh = max(thresh, dirty);
+	if (limit > thresh) {
+		limit -= (limit - thresh) >> 5;
+		goto update;
+	}
+	return;
+update:
+	global_dirty_limit = limit;
+}
+
+static void global_update_bandwidth(unsigned long thresh,
+				    unsigned long dirty,
+				    unsigned long now)
+{
+	static DEFINE_SPINLOCK(dirty_lock);
+	static unsigned long update_time;
+
+	/*
+	 * check locklessly first to optimize away locking for the most time
+	 */
+	if (time_before(now, update_time + BANDWIDTH_INTERVAL))
+		return;
+
+	spin_lock(&dirty_lock);
+	if (time_after_eq(now, update_time + BANDWIDTH_INTERVAL)) {
+		update_dirty_limit(thresh, dirty);
+		update_time = now;
+	}
+	spin_unlock(&dirty_lock);
+}
+
 void __bdi_update_bandwidth(struct backing_dev_info *bdi,
+			    unsigned long thresh,
+			    unsigned long dirty,
+			    unsigned long bdi_thresh,
+			    unsigned long bdi_dirty,
 			    unsigned long start_time)
 {
 	unsigned long now = jiffies;
@@ -538,6 +599,9 @@ void __bdi_update_bandwidth(struct backing_dev_info *bdi,
 	if (elapsed > HZ && time_before(bdi->bw_time_stamp, start_time))
 		goto snapshot;
 
+	if (thresh)
+		global_update_bandwidth(thresh, dirty, now);
+
 	bdi_update_write_bandwidth(bdi, elapsed, written);
 
 snapshot:
@@ -546,12 +610,17 @@ snapshot:
 }
 
 static void bdi_update_bandwidth(struct backing_dev_info *bdi,
+				 unsigned long thresh,
+				 unsigned long dirty,
+				 unsigned long bdi_thresh,
+				 unsigned long bdi_dirty,
 				 unsigned long start_time)
 {
 	if (time_is_after_eq_jiffies(bdi->bw_time_stamp + BANDWIDTH_INTERVAL))
 		return;
 	spin_lock(&bdi->wb.list_lock);
-	__bdi_update_bandwidth(bdi, start_time);
+	__bdi_update_bandwidth(bdi, thresh, dirty, bdi_thresh, bdi_dirty,
+			       start_time);
 	spin_unlock(&bdi->wb.list_lock);
 }
 
@@ -630,7 +699,8 @@ static void balance_dirty_pages(struct address_space *mapping,
 		if (!bdi->dirty_exceeded)
 			bdi->dirty_exceeded = 1;
 
-		bdi_update_bandwidth(bdi, start_time);
+		bdi_update_bandwidth(bdi, dirty_thresh, nr_dirty,
+				     bdi_thresh, bdi_dirty, start_time);
 
 		/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
 		 * Unstable writes are a feature of certain networked
-- 
cgit v1.2.3


From ffd1f609ab10532e8137b4b981fdf903ef4d0b32 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Sun, 19 Jun 2011 22:18:42 -0600
Subject: writeback: introduce max-pause and pass-good dirty limits

The max-pause limit helps to keep the sleep time inside
balance_dirty_pages() within MAX_PAUSE=200ms. The 200ms max sleep means
a per-task rate limit of 8pages/200ms=160KB/s when dirty exceeded, which
normally is enough to stop dirtiers from continuing to push the dirty
pages high, unless there is a sufficiently large number of slow dirtiers
(e.g. 500 tasks doing 160KB/s will still sum up to 80MB/s, exceeding the
write bandwidth of a slow disk and hence accumulating more and more dirty
pages).

The pass-good limit helps to let go of the good bdi's in the presence of
a blocked bdi (e.g. NFS server not responding) or a slow USB disk which
for some reason builds up a large number of initial dirty pages that
refuse to go away anytime soon.

For example, given two bdi's A and B and the initial state

	bdi_thresh_A = dirty_thresh / 2
	bdi_thresh_B = dirty_thresh / 2
	bdi_dirty_A  = dirty_thresh / 2
	bdi_dirty_B  = dirty_thresh / 2

Then A gets blocked; after a dozen seconds

	bdi_thresh_A = 0
	bdi_thresh_B = dirty_thresh
	bdi_dirty_A  = dirty_thresh / 2
	bdi_dirty_B  = dirty_thresh / 2

The (bdi_dirty_B < bdi_thresh_B) test is now useless and the dirty pages
will be effectively throttled by condition (nr_dirty < dirty_thresh).
This has two problems:
(1) we lose the protections for light dirtiers
(2) balance_dirty_pages() effectively becomes IO-less because the
    (bdi_nr_reclaimable > bdi_thresh) test won't be true. This is good
    for IO, but balance_dirty_pages() loses an important way to break
    out of the loop which leads to more spread out throttle delays.

DIRTY_PASSGOOD_AREA can eliminate the above issues. The only problem is,
DIRTY_PASSGOOD_AREA needs to be defined as 2 to fully cover the above
example while this patch uses the more conservative value 8 so as not to
surprise people with more dirty pages than expected.

The max-pause limit won't noticeably impact the speed at which dirty
pages are knocked down when there is a sudden drop of global/bdi dirty
thresholds, because the heavy dirtiers will be throttled below 160KB/s,
which is slow enough. It does help to avoid long dirty throttle delays
and especially will make light dirtiers more responsive.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 include/linux/writeback.h | 21 +++++++++++++++++++++
 mm/page-writeback.c       | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+)

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index e9d371b6053b..b625073b80c8 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -7,6 +7,27 @@
 #include <linux/sched.h>
 #include <linux/fs.h>
 
+/*
+ * The 1/16 region above the global dirty limit will be put to maximum pauses:
+ *
+ *	(limit, limit + limit/DIRTY_MAXPAUSE_AREA)
+ *
+ * The 1/16 region above the max-pause region, dirty exceeded bdi's will be put
+ * to loops:
+ *
+ *	(limit + limit/DIRTY_MAXPAUSE_AREA, limit + limit/DIRTY_PASSGOOD_AREA)
+ *
+ * Further beyond, all dirtier tasks will enter a loop waiting (possibly long
+ * time) for the dirty pages to drop, unless written enough pages.
+ *
+ * The global dirty threshold is normally equal to the global dirty limit,
+ * except when the system suddenly allocates a lot of anonymous memory and
+ * knocks down the global dirty threshold quickly, in which case the global
+ * dirty limit will follow down slowly to prevent livelocking all dirtier tasks.
+ */
+#define DIRTY_MAXPAUSE_AREA		16
+#define DIRTY_PASSGOOD_AREA		8
+
 struct backing_dev_info;
 
 /*
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index da959952b9f5..798842a22474 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,6 +36,11 @@
 #include <linux/pagevec.h>
 #include <trace/events/writeback.h>
 
+/*
+ * Sleep at most 200ms at a time in balance_dirty_pages().
+ */
+#define MAX_PAUSE		max(HZ/5, 1)
+
 /*
  * Estimate write bandwidth at 200ms intervals.
  */
@@ -399,6 +404,11 @@ unsigned long determine_dirtyable_memory(void)
 	return x + 1;	/* Ensure that we never return 0 */
 }
 
+static unsigned long hard_dirty_limit(unsigned long thresh)
+{
+	return max(thresh, global_dirty_limit);
+}
+
 /*
  * global_dirty_limits - background-writeback and dirty-throttling thresholds
  *
@@ -723,6 +733,29 @@ static void balance_dirty_pages(struct address_space *mapping,
 		io_schedule_timeout(pause);
 		trace_balance_dirty_wait(bdi);
 
+		dirty_thresh = hard_dirty_limit(dirty_thresh);
+		/*
+		 * max-pause area. If dirty exceeded but still within this
+		 * area, no need to sleep for more than 200ms: (a) 8 pages per
+		 * 200ms is typically more than enough to curb heavy dirtiers;
+		 * (b) the pause time limit makes the dirtiers more responsive.
+		 */
+		if (nr_dirty < dirty_thresh +
+			       dirty_thresh / DIRTY_MAXPAUSE_AREA &&
+		    time_after(jiffies, start_time + MAX_PAUSE))
+			break;
+		/*
+		 * pass-good area. When some bdi gets blocked (eg. NFS server
+		 * not responding), or write bandwidth dropped dramatically due
+		 * to concurrent reads, or dirty threshold suddenly dropped and
+		 * the dirty pages cannot be brought down anytime soon (eg. on
+		 * slow USB stick), at least let go of the good bdi's.
+		 */
+		if (nr_dirty < dirty_thresh +
+			       dirty_thresh / DIRTY_PASSGOOD_AREA &&
+		    bdi_dirty < bdi_thresh)
+			break;
+
 		/*
 		 * Increase the delay for each loop, up to our previous
 		 * default of taking a 100ms nap.
-- 
cgit v1.2.3


From e1cbe236013c82bcf9a156e98d7b47efb89d2674 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Mon, 6 Dec 2010 22:34:29 -0600
Subject: writeback: trace global_dirty_state

Add trace event global_dirty_state for showing the global dirty page
counts and thresholds at each global_dirty_limits() invocation.  This
will cover the callers throttle_vm_writeout(), over_bground_thresh()
and each balance_dirty_pages() loop.

Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 include/trace/events/writeback.h | 46 ++++++++++++++++++++++++++++++++++++++++
 mm/page-writeback.c              |  1 +
 2 files changed, 47 insertions(+)

diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index 3e7662a0cfa3..6bca4cc0063c 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -204,6 +204,52 @@ TRACE_EVENT(writeback_queue_io,
 		__entry->moved)
 );
 
+TRACE_EVENT(global_dirty_state,
+
+	TP_PROTO(unsigned long background_thresh,
+		 unsigned long dirty_thresh
+	),
+
+	TP_ARGS(background_thresh,
+		dirty_thresh
+	),
+
+	TP_STRUCT__entry(
+		__field(unsigned long,	nr_dirty)
+		__field(unsigned long,	nr_writeback)
+		__field(unsigned long,	nr_unstable)
+		__field(unsigned long,	background_thresh)
+		__field(unsigned long,	dirty_thresh)
+		__field(unsigned long,	dirty_limit)
+		__field(unsigned long,	nr_dirtied)
+		__field(unsigned long,	nr_written)
+	),
+
+	TP_fast_assign(
+		__entry->nr_dirty	= global_page_state(NR_FILE_DIRTY);
+		__entry->nr_writeback	= global_page_state(NR_WRITEBACK);
+		__entry->nr_unstable	= global_page_state(NR_UNSTABLE_NFS);
+		__entry->nr_dirtied	= global_page_state(NR_DIRTIED);
+		__entry->nr_written	= global_page_state(NR_WRITTEN);
+		__entry->background_thresh = background_thresh;
+		__entry->dirty_thresh	= dirty_thresh;
+		__entry->dirty_limit = global_dirty_limit;
+	),
+
+	TP_printk("dirty=%lu writeback=%lu unstable=%lu "
+		  "bg_thresh=%lu thresh=%lu limit=%lu "
+		  "dirtied=%lu written=%lu",
+		  __entry->nr_dirty,
+		  __entry->nr_writeback,
+		  __entry->nr_unstable,
+		  __entry->background_thresh,
+		  __entry->dirty_thresh,
+		  __entry->dirty_limit,
+		  __entry->nr_dirtied,
+		  __entry->nr_written
+	)
+);
+
 DECLARE_EVENT_CLASS(writeback_congest_waited_template,
 
 	TP_PROTO(unsigned int usec_timeout, unsigned int usec_delayed),
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 798842a22474..f9d9f5476d58 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -447,6 +447,7 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
 	}
 	*pbackground = background;
 	*pdirty = dirty;
+	trace_global_dirty_state(background, dirty);
 }
 
 /**
-- 
cgit v1.2.3


From 1a12d8bd7b2998be01ee55edb64e7473728abb9c Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Sun, 29 Aug 2010 13:28:09 -0600
Subject: writeback: scale IO chunk size up to half device bandwidth

Originally, MAX_WRITEBACK_PAGES was hard-coded to 1024 because of a
concern of not holding I_SYNC for too long.  (At least, that was the
comment previously.)  This doesn't make sense now because the only
time we wait for I_SYNC is if we are calling sync or fsync, and in
that case we need to write out all of the data anyway.  Previously
there may have been other code paths that waited on I_SYNC, but not
any more.					    -- Theodore Ts'o

So remove the MAX_WRITEBACK_PAGES constraint. The writeback pages
will adapt to as large as the storage device can write within 500ms.

XFS is observed to do IO completions in a batch, and the batch size is
equal to the write chunk size. To keep dirty pages from suddenly dropping
out of balance_dirty_pages()'s dirty control scope and creating large
fluctuations, the chunk size is also limited to half the control scope.

The balance_dirty_pages() control scope is

	[(background_thresh + dirty_thresh) / 2, dirty_thresh]

which is by default [15%, 20%] of global dirty pages, whose range size
is dirty_thresh / DIRTY_FULL_SCOPE.

The adaptive write chunk size will be rounded up to the next 4MB
boundary.

http://bugzilla.kernel.org/show_bug.cgi?id=13930

CC: Theodore Ts'o <tytso@mit.edu>
CC: Dave Chinner <david@fromorbit.com>
CC: Chris Mason <chris.mason@oracle.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c         | 23 ++++++++++-------------
 include/linux/writeback.h | 11 +++++++++++
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 227ff12257f3..50445cf0b83a 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -29,15 +29,6 @@
 #include <linux/tracepoint.h>
 #include "internal.h"
 
-/*
- * The maximum number of pages to writeout in a single bdi flush/kupdate
- * operation.  We do this so we don't hold I_SYNC against an inode for
- * enormous amounts of time, which would block a userspace task which has
- * been forced to throttle against that inode.  Also, the code reevaluates
- * the dirty each time it has written this many pages.
- */
-#define MAX_WRITEBACK_PAGES     1024L
-
 /*
  * Passed into wb_writeback(), essentially a subset of writeback_control
  */
@@ -515,7 +506,8 @@ static bool pin_sb_for_writeback(struct super_block *sb)
 	return false;
 }
 
-static long writeback_chunk_size(struct wb_writeback_work *work)
+static long writeback_chunk_size(struct backing_dev_info *bdi,
+				 struct wb_writeback_work *work)
 {
 	long pages;
 
@@ -534,8 +526,13 @@ static long writeback_chunk_size(struct wb_writeback_work *work)
 	 */
 	if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages)
 		pages = LONG_MAX;
-	else
-		pages = min(MAX_WRITEBACK_PAGES, work->nr_pages);
+	else {
+		pages = min(bdi->avg_write_bandwidth / 2,
+			    global_dirty_limit / DIRTY_SCOPE);
+		pages = min(pages, work->nr_pages);
+		pages = round_down(pages + MIN_WRITEBACK_PAGES,
+				   MIN_WRITEBACK_PAGES);
+	}
 
 	return pages;
 }
@@ -600,7 +597,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 			continue;
 		}
 		__iget(inode);
-		write_chunk = writeback_chunk_size(work);
+		write_chunk = writeback_chunk_size(wb->bdi, work);
 		wbc.nr_to_write = write_chunk;
 		wbc.pages_skipped = 0;
 
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index b625073b80c8..f1bfa12ea246 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -8,6 +8,10 @@
 #include <linux/fs.h>
 
 /*
+ * The 1/4 region under the global dirty thresh is for smooth dirty throttling:
+ *
+ *	(thresh - thresh/DIRTY_FULL_SCOPE, thresh)
+ *
  * The 1/16 region above the global dirty limit will be put to maximum pauses:
  *
  *	(limit, limit + limit/DIRTY_MAXPAUSE_AREA)
@@ -25,9 +29,16 @@
  * knocks down the global dirty threshold quickly, in which case the global
  * dirty limit will follow down slowly to prevent livelocking all dirtier tasks.
  */
+#define DIRTY_SCOPE		8
+#define DIRTY_FULL_SCOPE	(DIRTY_SCOPE / 2)
 #define DIRTY_MAXPAUSE_AREA		16
 #define DIRTY_PASSGOOD_AREA		8
 
+/*
+ * 4MB minimal write chunk size
+ */
+#define MIN_WRITEBACK_PAGES	(4096UL >> (PAGE_CACHE_SHIFT - 10))
+
 struct backing_dev_info;
 
 /*
-- 
cgit v1.2.3


From fcc5c22218a18509a7412bf074fc9a7a5d874a8a Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Mon, 11 Jul 2011 23:08:50 -0700
Subject: writeback: don't busy retry writeback on new/freeing inodes

Fix a system hang bug introduced by commit b7a2441f9966 ("writeback:
remove writeback_control.more_io") and e8dfc3058 ("writeback: elevate
queue_io() into wb_writeback()") easily reproducible with high memory
pressure and lots of file creation/deletions, for example, a kernel
build in limited memory.

The hang occurs when some inode is in the I_NEW, I_FREEING or I_WILL_FREE
state: the flusher will get stuck busy retrying that inode, never
releasing wb->list_lock. The lock in turn blocks all kinds of other
tasks when they are trying to grab it.

As put by Jan, it's a safe change regarding data integrity. I_FREEING or
I_WILL_FREE inodes are written back by iput_final() and it is reclaim
code that is responsible for eventually removing them. So writeback code
can safely ignore them. I_NEW inodes should move out of this state when
they are fully set up and in the writeback round following that, we will
consider them for writeback. So the change makes sense.

CC: Jan Kara <jack@suse.cz>
Reported-by: Hugh Dickins <hughd@google.com>
Tested-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 fs/fs-writeback.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 50445cf0b83a..6d49439ca31d 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -593,7 +593,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 		spin_lock(&inode->i_lock);
 		if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
 			spin_unlock(&inode->i_lock);
-			requeue_io(inode, wb);
+			redirty_tail(inode, wb);
 			continue;
 		}
 		__iget(inode);
-- 
cgit v1.2.3


From bcff25fc8aa47a13faff8b4b992589813f7b450a Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Fri, 1 Jul 2011 13:31:25 -0600
Subject: mm: properly reflect task dirty limits in dirty_exceeded logic

We set bdi->dirty_exceeded (and thus ratelimiting code starts to
call balance_dirty_pages() every 8 pages) when a per-bdi limit is
exceeded or global limit is exceeded. But per-bdi limit also depends
on the task. Thus different tasks reach the limit on that bdi at
different levels of dirty pages. The result is that with current code
bdi->dirty_exceeded ping-ponged between 1 and 0 depending on which task
just got into balance_dirty_pages().

We fix the issue by clearing bdi->dirty_exceeded only when the per-bdi amount
of dirty pages drops below the threshold (7/8 * bdi_dirty_limit) at which task
limits no longer have any influence.

Impact:  The end result is that dirty pages are kept more tightly under
control, with the average number slightly lower than before.  This
reduces the risk of throttling light dirtiers and hence makes the system
more responsive.  However, it may add overhead by enforcing
balance_dirty_pages() calls every 8 pages when there are 2+ heavy dirtiers.

CC: Andrew Morton <akpm@linux-foundation.org>
CC: Christoph Hellwig <hch@infradead.org>
CC: Dave Chinner <david@fromorbit.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
---
 mm/page-writeback.c | 26 ++++++++++++++++++++------
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index f9d9f5476d58..1d781803e629 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -281,12 +281,13 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
  * effectively curb the growth of dirty pages. Light dirtiers with high enough
  * dirty threshold may never get throttled.
  */
+#define TASK_LIMIT_FRACTION 8
 static unsigned long task_dirty_limit(struct task_struct *tsk,
 				       unsigned long bdi_dirty)
 {
 	long numerator, denominator;
 	unsigned long dirty = bdi_dirty;
-	u64 inv = dirty >> 3;
+	u64 inv = dirty / TASK_LIMIT_FRACTION;
 
 	task_dirties_fraction(tsk, &numerator, &denominator);
 	inv *= numerator;
@@ -297,6 +298,12 @@ static unsigned long task_dirty_limit(struct task_struct *tsk,
 	return max(dirty, bdi_dirty/2);
 }
 
+/* Minimum limit for any task */
+static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
+{
+	return bdi_dirty - bdi_dirty / TASK_LIMIT_FRACTION;
+}
+
 /*
  *
  */
@@ -651,9 +658,12 @@ static void balance_dirty_pages(struct address_space *mapping,
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
+	unsigned long task_bdi_thresh;
+	unsigned long min_task_bdi_thresh;
 	unsigned long pages_written = 0;
 	unsigned long pause = 1;
 	bool dirty_exceeded = false;
+	bool clear_dirty_exceeded = true;
 	struct backing_dev_info *bdi = mapping->backing_dev_info;
 	unsigned long start_time = jiffies;
 
@@ -673,7 +683,8 @@ static void balance_dirty_pages(struct address_space *mapping,
 			break;
 
 		bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
-		bdi_thresh = task_dirty_limit(current, bdi_thresh);
+		min_task_bdi_thresh = task_min_dirty_limit(bdi_thresh);
+		task_bdi_thresh = task_dirty_limit(current, bdi_thresh);
 
 		/*
 		 * In order to avoid the stacked BDI deadlock we need
@@ -685,7 +696,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * actually dirty; with m+n sitting in the percpu
 		 * deltas.
 		 */
-		if (bdi_thresh < 2*bdi_stat_error(bdi)) {
+		if (task_bdi_thresh < 2 * bdi_stat_error(bdi)) {
 			bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
 			bdi_dirty = bdi_nr_reclaimable +
 				    bdi_stat_sum(bdi, BDI_WRITEBACK);
@@ -701,8 +712,10 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * bdi or process from holding back light ones; The latter is
 		 * the last resort safeguard.
 		 */
-		dirty_exceeded = (bdi_dirty > bdi_thresh) ||
+		dirty_exceeded = (bdi_dirty > task_bdi_thresh) ||
 				  (nr_dirty > dirty_thresh);
+		clear_dirty_exceeded = (bdi_dirty <= min_task_bdi_thresh) &&
+					(nr_dirty <= dirty_thresh);
 
 		if (!dirty_exceeded)
 			break;
@@ -723,7 +736,7 @@ static void balance_dirty_pages(struct address_space *mapping,
 		 * up.
 		 */
 		trace_balance_dirty_start(bdi);
-		if (bdi_nr_reclaimable > bdi_thresh) {
+		if (bdi_nr_reclaimable > task_bdi_thresh) {
 			pages_written += writeback_inodes_wb(&bdi->wb,
 							     write_chunk);
 			trace_balance_dirty_written(bdi, pages_written);
@@ -766,7 +779,8 @@ static void balance_dirty_pages(struct address_space *mapping,
 			pause = HZ / 10;
 	}
 
-	if (!dirty_exceeded && bdi->dirty_exceeded)
+	/* Clear dirty_exceeded flag only when no task can exceed the limit */
+	if (clear_dirty_exceeded && bdi->dirty_exceeded)
 		bdi->dirty_exceeded = 0;
 
 	if (writeback_in_progress(bdi))
-- 
cgit v1.2.3