Merge branch 'for-4.3/blkcg' of git://git.kernel.dk/linux-block

Pull blk-cg updates from Jens Axboe: "A bit later in the cycle, but this has been in the block tree for a a while. This is basically four patchsets from Tejun, that improve our buffered cgroup writeback. It was dependent on the other cgroup changes, but they went in earlier in this cycle. Series 1 is set of 5 patches that has cgroup writeback updates: - bdi_writeback iteration fix which could lead to some wb's being skipped or repeated during e.g. sync under memory pressure. - Simplification of wb work wait mechanism. - Writeback tracepoints updated to report cgroup. Series 2 is is a set of updates for the CFQ cgroup writeback handling: cfq has always charged all async IOs to the root cgroup. It didn't have much choice as writeback didn't know about cgroups and there was no way to tell who to blame for a given writeback IO. writeback finally grew support for cgroups and now tags each writeback IO with the appropriate cgroup to charge it against. This patchset updates cfq so that it follows the blkcg each bio is tagged with. Async cfq_queues are now shared across cfq_group, which is per-cgroup, instead of per-request_queue cfq_data. This makes all IOs follow the weight based IO resource distribution implemented by cfq. - Switched from GFP_ATOMIC to GFP_NOWAIT as suggested by Jeff. - Other misc review points addressed, acks added and rebased. Series 3 is the blkcg policy cleanup patches: This patchset contains assorted cleanups for blkcg_policy methods and blk[c]g_policy_data handling. - alloc/free added for blkg_policy_data. exit dropped. - alloc/free added for blkcg_policy_data. - blk-throttle's async percpu allocation is replaced with direct allocation. - all methods now take blk[c]g_policy_data instead of blkcg_gq or blkcg. And finally, series 4 is a set of patches cleaning up the blkcg stats handling: blkcg's stats have always been somwhat of a mess. This patchset tries to improve the situation a bit. - The following patches added to consolidate blkcg entry point and blkg creation. This is in itself is an improvement and helps colllecting common stats on bio issue. - per-blkg stats now accounted on bio issue rather than request completion so that bio based and request based drivers can behave the same way. The issue was spotted by Vivek. - cfq-iosched implements custom recursive stats and blk-throttle implements custom per-cpu stats. This patchset make blkcg core support both by default. - cfq-iosched and blk-throttle keep track of the same stats multiple times. Unify them" * 'for-4.3/blkcg' of git://git.kernel.dk/linux-block: (45 commits) blkcg: use CGROUP_WEIGHT_* scale for io.weight on the unified hierarchy blkcg: s/CFQ_WEIGHT_*/CFQ_WEIGHT_LEGACY_*/ blkcg: implement interface for the unified hierarchy blkcg: misc preparations for unified hierarchy interface blkcg: separate out tg_conf_updated() from tg_set_conf() blkcg: move body parsing from blkg_conf_prep() to its callers blkcg: mark existing cftypes as legacy blkcg: rename subsystem name from blkio to io blkcg: refine error codes returned during blkcg configuration blkcg: remove unnecessary NULL checks from __cfqg_set_weight_device() blkcg: reduce stack usage of blkg_rwstat_recursive_sum() blkcg: remove cfqg_stats->sectors blkcg: move io_service_bytes and io_serviced stats into blkcg_gq blkcg: make blkg_[rw]stat_recursive_sum() to be able to index into blkcg_gq blkcg: make blkcg_[rw]stat per-cpu blkcg: add blkg_[rw]stat->aux_cnt and replace cfq_group->dead_stats with it blkcg: consolidate blkg creation in blkcg_bio_issue_check() blk-throttle: improve queue bypass handling blkcg: move root blkg lookup optimization from throtl_lookup_tg() to __blkg_lookup() blkcg: inline [__]blkg_lookup() ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2015-09-10 18:56:14 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> 2015-09-10 18:56:14 -0700
commit: b0a1ea51bda4c2bcdde460221e1772f3a4f8c44f (patch)
tree: 9684c11b72718cd7e96e5eb93298690269ecf447 /include
parent: 33e247c7e58d335d70ecb84fd869091e2e4b8dcb (diff)
parent: 69d7fde5909b614114343974cfc52cb8ff30b544 (diff)
5 files changed, 397 insertions, 155 deletions
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 0fe9df983ab7..5a5d79ee256f 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -286,7 +286,7 @@ static inline struct bdi_writeback *wb_find_current(struct backing_dev_info *bdi
 	 * %current's blkcg equals the effective blkcg of its memcg.  No
 	 * need to use the relatively expensive cgroup_get_e_css().
 	 */
-	if (likely(wb && wb->blkcg_css == task_css(current, blkio_cgrp_id)))
+	if (likely(wb && wb->blkcg_css == task_css(current, io_cgrp_id)))
 		return wb;
 	return NULL;
 }
@@ -402,7 +402,7 @@ static inline void unlocked_inode_to_wb_end(struct inode *inode, bool locked)
 }
 
 struct wb_iter {
-	int			start_blkcg_id;
+	int			start_memcg_id;
 	struct radix_tree_iter	tree_iter;
 	void			**slot;
 };
@@ -414,9 +414,9 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
 
 	WARN_ON_ONCE(!rcu_read_lock_held());
 
-	if (iter->start_blkcg_id >= 0) {
-		iter->slot = radix_tree_iter_init(titer, iter->start_blkcg_id);
-		iter->start_blkcg_id = -1;
+	if (iter->start_memcg_id >= 0) {
+		iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
+		iter->start_memcg_id = -1;
 	} else {
 		iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
 	}
@@ -430,30 +430,30 @@ static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
 
 static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
 						   struct backing_dev_info *bdi,
-						   int start_blkcg_id)
+						   int start_memcg_id)
 {
-	iter->start_blkcg_id = start_blkcg_id;
+	iter->start_memcg_id = start_memcg_id;
 
-	if (start_blkcg_id)
+	if (start_memcg_id)
 		return __wb_iter_next(iter, bdi);
 	else
 		return &bdi->wb;
 }
 
 /**
- * bdi_for_each_wb - walk all wb's of a bdi in ascending blkcg ID order
+ * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
  * @wb_cur: cursor struct bdi_writeback pointer
  * @bdi: bdi to walk wb's of
  * @iter: pointer to struct wb_iter to be used as iteration buffer
- * @start_blkcg_id: blkcg ID to start iteration from
+ * @start_memcg_id: memcg ID to start iteration from
  *
  * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
- * blkcg ID order starting from @start_blkcg_id.  @iter is struct wb_iter
+ * memcg ID order starting from @start_memcg_id.  @iter is struct wb_iter
  * to be used as temp storage during iteration.  rcu_read_lock() must be
  * held throughout iteration.
  */
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)		\
-	for ((wb_cur) = __wb_iter_init(iter, bdi, start_blkcg_id);	\
+#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id)		\
+	for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id);	\
 	     (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index a4cd1641e9e2..0a5cc7a1109b 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -14,12 +14,15 @@
  */
 
 #include <linux/cgroup.h>
-#include <linux/u64_stats_sync.h>
+#include <linux/percpu_counter.h>
 #include <linux/seq_file.h>
 #include <linux/radix-tree.h>
 #include <linux/blkdev.h>
 #include <linux/atomic.h>
 
+/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
+#define BLKG_STAT_CPU_BATCH	(INT_MAX / 2)
+
 /* Max limits for throttle policy */
 #define THROTL_IOPS_MAX		UINT_MAX
 
@@ -45,7 +48,7 @@ struct blkcg {
 	struct blkcg_gq			*blkg_hint;
 	struct hlist_head		blkg_list;
 
-	struct blkcg_policy_data	*pd[BLKCG_MAX_POLS];
+	struct blkcg_policy_data	*cpd[BLKCG_MAX_POLS];
 
 	struct list_head		all_blkcgs_node;
 #ifdef CONFIG_CGROUP_WRITEBACK
@@ -53,14 +56,19 @@ struct blkcg {
 #endif
 };
 
+/*
+ * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
+ * recursive.  Used to carry stats of dead children, and, for blkg_rwstat,
+ * to carry result values from read and sum operations.
+ */
 struct blkg_stat {
-	struct u64_stats_sync		syncp;
-	uint64_t			cnt;
+	struct percpu_counter		cpu_cnt;
+	atomic64_t			aux_cnt;
 };
 
 struct blkg_rwstat {
-	struct u64_stats_sync		syncp;
-	uint64_t			cnt[BLKG_RWSTAT_NR];
+	struct percpu_counter		cpu_cnt[BLKG_RWSTAT_NR];
+	atomic64_t			aux_cnt[BLKG_RWSTAT_NR];
 };
 
 /*
@@ -68,32 +76,28 @@ struct blkg_rwstat {
  * request_queue (q).  This is used by blkcg policies which need to track
  * information per blkcg - q pair.
  *
- * There can be multiple active blkcg policies and each has its private
- * data on each blkg, the size of which is determined by
- * blkcg_policy->pd_size.  blkcg core allocates and frees such areas
- * together with blkg and invokes pd_init/exit_fn() methods.
- *
- * Such private data must embed struct blkg_policy_data (pd) at the
- * beginning and pd_size can't be smaller than pd.
+ * There can be multiple active blkcg policies and each blkg:policy pair is
+ * represented by a blkg_policy_data which is allocated and freed by each
+ * policy's pd_alloc/free_fn() methods.  A policy can allocate private data
+ * area by allocating larger data structure which embeds blkg_policy_data
+ * at the beginning.
  */
 struct blkg_policy_data {
 	/* the blkg and policy id this per-policy data belongs to */
 	struct blkcg_gq			*blkg;
 	int				plid;
-
-	/* used during policy activation */
-	struct list_head		alloc_node;
 };
 
 /*
- * Policies that need to keep per-blkcg data which is independent
- * from any request_queue associated to it must specify its size
- * with the cpd_size field of the blkcg_policy structure and
- * embed a blkcg_policy_data in it.  cpd_init() is invoked to let
- * each policy handle per-blkcg data.
+ * Policies that need to keep per-blkcg data which is independent from any
+ * request_queue associated to it should implement cpd_alloc/free_fn()
+ * methods.  A policy can allocate private data area by allocating larger
+ * data structure which embeds blkcg_policy_data at the beginning.
+ * cpd_init() is invoked to let each policy handle per-blkcg data.
  */
 struct blkcg_policy_data {
-	/* the policy id this per-policy data belongs to */
+	/* the blkcg and policy id this per-policy data belongs to */
+	struct blkcg			*blkcg;
 	int				plid;
 };
 
@@ -123,40 +127,50 @@ struct blkcg_gq {
 	/* is this blkg online? protected by both blkcg and q locks */
 	bool				online;
 
+	struct blkg_rwstat		stat_bytes;
+	struct blkg_rwstat		stat_ios;
+
 	struct blkg_policy_data		*pd[BLKCG_MAX_POLS];
 
 	struct rcu_head			rcu_head;
 };
 
-typedef void (blkcg_pol_init_cpd_fn)(const struct blkcg *blkcg);
-typedef void (blkcg_pol_init_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_online_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_offline_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_exit_pd_fn)(struct blkcg_gq *blkg);
-typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkcg_gq *blkg);
+typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
+typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp, int node);
+typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
 
 struct blkcg_policy {
 	int				plid;
-	/* policy specific private data size */
-	size_t				pd_size;
-	/* policy specific per-blkcg data size */
-	size_t				cpd_size;
 	/* cgroup files for the policy */
-	struct cftype			*cftypes;
+	struct cftype			*dfl_cftypes;
+	struct cftype			*legacy_cftypes;
 
 	/* operations */
+	blkcg_pol_alloc_cpd_fn		*cpd_alloc_fn;
 	blkcg_pol_init_cpd_fn		*cpd_init_fn;
+	blkcg_pol_free_cpd_fn		*cpd_free_fn;
+	blkcg_pol_bind_cpd_fn		*cpd_bind_fn;
+
+	blkcg_pol_alloc_pd_fn		*pd_alloc_fn;
 	blkcg_pol_init_pd_fn		*pd_init_fn;
 	blkcg_pol_online_pd_fn		*pd_online_fn;
 	blkcg_pol_offline_pd_fn		*pd_offline_fn;
-	blkcg_pol_exit_pd_fn		*pd_exit_fn;
+	blkcg_pol_free_pd_fn		*pd_free_fn;
 	blkcg_pol_reset_pd_stats_fn	*pd_reset_stats_fn;
 };
 
 extern struct blkcg blkcg_root;
 extern struct cgroup_subsys_state * const blkcg_root_css;
 
-struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, struct request_queue *q);
+struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
+				      struct request_queue *q, bool update_hint);
 struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
 				    struct request_queue *q);
 int blkcg_init_queue(struct request_queue *q);
@@ -171,6 +185,7 @@ int blkcg_activate_policy(struct request_queue *q,
 void blkcg_deactivate_policy(struct request_queue *q,
 			     const struct blkcg_policy *pol);
 
+const char *blkg_dev_name(struct blkcg_gq *blkg);
 void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
 		       u64 (*prfill)(struct seq_file *,
 				     struct blkg_policy_data *, int),
@@ -182,19 +197,24 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off);
 u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
 		       int off);
+int blkg_print_stat_bytes(struct seq_file *sf, void *v);
+int blkg_print_stat_ios(struct seq_file *sf, void *v);
+int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v);
+int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v);
 
-u64 blkg_stat_recursive_sum(struct blkg_policy_data *pd, int off);
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkg_policy_data *pd,
-					     int off);
+u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
+			    struct blkcg_policy *pol, int off);
+struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
+					     struct blkcg_policy *pol, int off);
 
 struct blkg_conf_ctx {
 	struct gendisk			*disk;
 	struct blkcg_gq			*blkg;
-	u64				v;
+	char				*body;
 };
 
 int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
-		   const char *input, struct blkg_conf_ctx *ctx);
+		   char *input, struct blkg_conf_ctx *ctx);
 void blkg_conf_finish(struct blkg_conf_ctx *ctx);
 
 
@@ -205,7 +225,7 @@ static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
 
 static inline struct blkcg *task_blkcg(struct task_struct *tsk)
 {
-	return css_to_blkcg(task_css(tsk, blkio_cgrp_id));
+	return css_to_blkcg(task_css(tsk, io_cgrp_id));
 }
 
 static inline struct blkcg *bio_blkcg(struct bio *bio)
@@ -218,7 +238,7 @@ static inline struct blkcg *bio_blkcg(struct bio *bio)
 static inline struct cgroup_subsys_state *
 task_get_blkcg_css(struct task_struct *task)
 {
-	return task_get_css(task, blkio_cgrp_id);
+	return task_get_css(task, io_cgrp_id);
 }
 
 /**
@@ -233,6 +253,52 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
 }
 
 /**
+ * __blkg_lookup - internal version of blkg_lookup()
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ * @update_hint: whether to update lookup hint with the result or not
+ *
+ * This is internal version and shouldn't be used by policy
+ * implementations.  Looks up blkgs for the @blkcg - @q pair regardless of
+ * @q's bypass state.  If @update_hint is %true, the caller should be
+ * holding @q->queue_lock and lookup hint is updated on success.
+ */
+static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
+					     struct request_queue *q,
+					     bool update_hint)
+{
+	struct blkcg_gq *blkg;
+
+	if (blkcg == &blkcg_root)
+		return q->root_blkg;
+
+	blkg = rcu_dereference(blkcg->blkg_hint);
+	if (blkg && blkg->q == q)
+		return blkg;
+
+	return blkg_lookup_slowpath(blkcg, q, update_hint);
+}
+
+/**
+ * blkg_lookup - lookup blkg for the specified blkcg - q pair
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for the @blkcg - @q pair.  This function should be called
+ * under RCU read lock and is guaranteed to return %NULL if @q is bypassing
+ * - see blk_queue_bypass_start() for details.
+ */
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
+					   struct request_queue *q)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+
+	if (unlikely(blk_queue_bypass(q)))
+		return NULL;
+	return __blkg_lookup(blkcg, q, false);
+}
+
+/**
  * blkg_to_pdata - get policy private data
  * @blkg: blkg of interest
  * @pol: policy of interest
@@ -248,7 +314,7 @@ static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
 static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
 						     struct blkcg_policy *pol)
 {
-	return blkcg ? blkcg->pd[pol->plid] : NULL;
+	return blkcg ? blkcg->cpd[pol->plid] : NULL;
 }
 
 /**
@@ -262,6 +328,11 @@ static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
 	return pd ? pd->blkg : NULL;
 }
 
+static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
+{
+	return cpd ? cpd->blkcg : NULL;
+}
+
 /**
  * blkg_path - format cgroup path of blkg
  * @blkg: blkg of interest
@@ -309,9 +380,6 @@ static inline void blkg_put(struct blkcg_gq *blkg)
 		call_rcu(&blkg->rcu_head, __blkg_release_rcu);
 }
 
-struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg, struct request_queue *q,
-			       bool update_hint);
-
 /**
  * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
  * @d_blkg: loop cursor pointing to the current descendant
@@ -373,8 +441,8 @@ static inline struct request_list *blk_get_rl(struct request_queue *q,
 	 * or if either the blkcg or queue is going away.  Fall back to
 	 * root_rl in such cases.
 	 */
-	blkg = blkg_lookup_create(blkcg, q);
-	if (IS_ERR(blkg))
+	blkg = blkg_lookup(blkcg, q);
+	if (unlikely(!blkg))
 		goto root_rl;
 
 	blkg_get(blkg);
@@ -394,8 +462,7 @@ root_rl:
  */
 static inline void blk_put_rl(struct request_list *rl)
 {
-	/* root_rl may not have blkg set */
-	if (rl->blkg && rl->blkg->blkcg != &blkcg_root)
+	if (rl->blkg->blkcg != &blkcg_root)
 		blkg_put(rl->blkg);
 }
 
@@ -433,9 +500,21 @@ struct request_list *__blk_queue_next_rl(struct request_list *rl,
 #define blk_queue_for_each_rl(rl, q)	\
 	for ((rl) = &(q)->root_rl; (rl); (rl) = __blk_queue_next_rl((rl), (q)))
 
-static inline void blkg_stat_init(struct blkg_stat *stat)
+static inline int blkg_stat_init(struct blkg_stat *stat, gfp_t gfp)
 {
-	u64_stats_init(&stat->syncp);
+	int ret;
+
+	ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp);
+	if (ret)
+		return ret;
+
+	atomic64_set(&stat->aux_cnt, 0);
+	return 0;
+}
+
+static inline void blkg_stat_exit(struct blkg_stat *stat)
+{
+	percpu_counter_destroy(&stat->cpu_cnt);
 }
 
 /**
@@ -443,34 +522,21 @@ static inline void blkg_stat_init(struct blkg_stat *stat)
  * @stat: target blkg_stat
  * @val: value to add
  *
- * Add @val to @stat.  The caller is responsible for synchronizing calls to
- * this function.
+ * Add @val to @stat.  The caller must ensure that IRQ on the same CPU
+ * don't re-enter this function for the same counter.
  */
 static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
 {
-	u64_stats_update_begin(&stat->syncp);
-	stat->cnt += val;
-	u64_stats_update_end(&stat->syncp);
+	__percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
 }
 
 /**
  * blkg_stat_read - read the current value of a blkg_stat
  * @stat: blkg_stat to read
- *
- * Read the current value of @stat.  This function can be called without
- * synchroniztion and takes care of u64 atomicity.
  */
 static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
 {
-	unsigned int start;
-	uint64_t v;
-
-	do {
-		start = u64_stats_fetch_begin_irq(&stat->syncp);
-		v = stat->cnt;
-	} while (u64_stats_fetch_retry_irq(&stat->syncp, start));
-
-	return v;
+	return percpu_counter_sum_positive(&stat->cpu_cnt);
 }
 
 /**
@@ -479,24 +545,46 @@ static inline uint64_t blkg_stat_read(struct blkg_stat *stat)
  */
 static inline void blkg_stat_reset(struct blkg_stat *stat)
 {
-	stat->cnt = 0;
+	percpu_counter_set(&stat->cpu_cnt, 0);
+	atomic64_set(&stat->aux_cnt, 0);
 }
 
 /**
- * blkg_stat_merge - merge a blkg_stat into another
+ * blkg_stat_add_aux - add a blkg_stat into another's aux count
  * @to: the destination blkg_stat
  * @from: the source
  *
- * Add @from's count to @to.
+ * Add @from's count including the aux one to @to's aux count.
  */
-static inline void blkg_stat_merge(struct blkg_stat *to, struct blkg_stat *from)
+static inline void blkg_stat_add_aux(struct blkg_stat *to,
+				     struct blkg_stat *from)
 {
-	blkg_stat_add(to, blkg_stat_read(from));
+	atomic64_add(blkg_stat_read(from) + atomic64_read(&from->aux_cnt),
+		     &to->aux_cnt);
 }
 
-static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
+static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
 {
-	u64_stats_init(&rwstat->syncp);
+	int i, ret;
+
+	for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+		ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
+		if (ret) {
+			while (--i >= 0)
+				percpu_counter_destroy(&rwstat->cpu_cnt[i]);
+			return ret;
+		}
+		atomic64_set(&rwstat->aux_cnt[i], 0);
+	}
+	return 0;
+}
+
+static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
+{
+	int i;
+
+	for (i = 0; i < BLKG_RWSTAT_NR; i++)
+		percpu_counter_destroy(&rwstat->cpu_cnt[i]);
 }
 
 /**
@@ -511,39 +599,38 @@ static inline void blkg_rwstat_init(struct blkg_rwstat *rwstat)
 static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
 				   int rw, uint64_t val)
 {
-	u64_stats_update_begin(&rwstat->syncp);
+	struct percpu_counter *cnt;
 
 	if (rw & REQ_WRITE)
-		rwstat->cnt[BLKG_RWSTAT_WRITE] += val;
+		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
 	else
-		rwstat->cnt[BLKG_RWSTAT_READ] += val;
+		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
+
+	__percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
+
 	if (rw & REQ_SYNC)
-		rwstat->cnt[BLKG_RWSTAT_SYNC] += val;
+		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
 	else
-		rwstat->cnt[BLKG_RWSTAT_ASYNC] += val;
+		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
 
-	u64_stats_update_end(&rwstat->syncp);
+	__percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
 }
 
 /**
  * blkg_rwstat_read - read the current values of a blkg_rwstat
  * @rwstat: blkg_rwstat to read
  *
- * Read the current snapshot of @rwstat and return it as the return value.
- * This function can be called without synchronization and takes care of
- * u64 atomicity.
+ * Read the current snapshot of @rwstat and return it in the aux counts.
  */
 static inline struct blkg_rwstat blkg_rwstat_read(struct blkg_rwstat *rwstat)
 {
-	unsigned int start;
-	struct blkg_rwstat tmp;
-
-	do {
-		start = u64_stats_fetch_begin_irq(&rwstat->syncp);
-		tmp = *rwstat;
-	} while (u64_stats_fetch_retry_irq(&rwstat->syncp, start));
+	struct blkg_rwstat result;
+	int i;
 
-	return tmp;
+	for (i = 0; i < BLKG_RWSTAT_NR; i++)
+		atomic64_set(&result.aux_cnt[i],
+			     percpu_counter_sum_positive(&rwstat->cpu_cnt[i]));
+	return result;
 }
 
 /**
@@ -558,7 +645,8 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
 {
 	struct blkg_rwstat tmp = blkg_rwstat_read(rwstat);
 
-	return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
+	return atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
+		atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
 }
 
 /**
@@ -567,26 +655,71 @@ static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
  */
 static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
 {
-	memset(rwstat->cnt, 0, sizeof(rwstat->cnt));
+	int i;
+
+	for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+		percpu_counter_set(&rwstat->cpu_cnt[i], 0);
+		atomic64_set(&rwstat->aux_cnt[i], 0);
+	}
 }
 
 /**
- * blkg_rwstat_merge - merge a blkg_rwstat into another
+ * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count
  * @to: the destination blkg_rwstat
  * @from: the source
  *
- * Add @from's counts to @to.
+ * Add @from's count including the aux one to @to's aux count.
  */
-static inline void blkg_rwstat_merge(struct blkg_rwstat *to,
-				     struct blkg_rwstat *from)
+static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
+				       struct blkg_rwstat *from)
 {
 	struct blkg_rwstat v = blkg_rwstat_read(from);
 	int i;
 
-	u64_stats_update_begin(&to->syncp);
 	for (i = 0; i < BLKG_RWSTAT_NR; i++)
-		to->cnt[i] += v.cnt[i];
-	u64_stats_update_end(&to->syncp);
+		atomic64_add(atomic64_read(&v.aux_cnt[i]) +
+			     atomic64_read(&from->aux_cnt[i]),
+			     &to->aux_cnt[i]);
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING
+extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+			   struct bio *bio);
+#else
+static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
+				  struct bio *bio) { return false; }
+#endif
+
+static inline bool blkcg_bio_issue_check(struct request_queue *q,
+					 struct bio *bio)
+{
+	struct blkcg *blkcg;
+	struct blkcg_gq *blkg;
+	bool throtl = false;
+
+	rcu_read_lock();
+	blkcg = bio_blkcg(bio);
+
+	blkg = blkg_lookup(blkcg, q);
+	if (unlikely(!blkg)) {
+		spin_lock_irq(q->queue_lock);
+		blkg = blkg_lookup_create(blkcg, q);
+		if (IS_ERR(blkg))
+			blkg = NULL;
+		spin_unlock_irq(q->queue_lock);
+	}
+
+	throtl = blk_throtl_bio(q, blkg, bio);
+
+	if (!throtl) {
+		blkg = blkg ?: q->root_blkg;
+		blkg_rwstat_add(&blkg->stat_bytes, bio->bi_flags,
+				bio->bi_iter.bi_size);
+		blkg_rwstat_add(&blkg->stat_ios, bio->bi_flags, 1);
+	}
+
+	rcu_read_unlock();
+	return !throtl;
 }
 
 #else	/* CONFIG_BLK_CGROUP */
@@ -642,6 +775,9 @@ static inline void blk_put_rl(struct request_list *rl) { }
 static inline void blk_rq_set_rl(struct request *rq, struct request_list *rl) { }
 static inline struct request_list *blk_rq_rl(struct request *rq) { return &rq->q->root_rl; }
 
+static inline bool blkcg_bio_issue_check(struct request_queue *q,
+					 struct bio *bio) { return true; }
+
 #define blk_queue_for_each_rl(rl, q)	\
 	for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
 
diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h
index 1f36945fd23d..1a96fdaa33d5 100644
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -27,7 +27,7 @@ SUBSYS(cpuacct)
 #endif
 
 #if IS_ENABLED(CONFIG_BLK_CGROUP)
-SUBSYS(blkio)
+SUBSYS(io)
 #endif
 
 #if IS_ENABLED(CONFIG_MEMCG)
diff --git a/include/linux/kernfs.h b/include/linux/kernfs.h
index 123be25ea15a..5d4e9c4b821d 100644
--- a/include/linux/kernfs.h
+++ b/include/linux/kernfs.h
@@ -266,6 +266,7 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
 }
 
 int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen);
+size_t kernfs_path_len(struct kernfs_node *kn);
 char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
 				size_t buflen);
 void pr_cont_kernfs_name(struct kernfs_node *kn);
@@ -332,6 +333,9 @@ static inline bool kernfs_ns_enabled(struct kernfs_node *kn)
 static inline int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen)
 { return -ENOSYS; }
 
+static inline size_t kernfs_path_len(struct kernfs_node *kn)
+{ return 0; }
+
 static inline char * __must_check kernfs_path(struct kernfs_node *kn, char *buf,
 					      size_t buflen)
 { return NULL; }
diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h
index a7aa607a4c55..fff846b512e6 100644
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -131,6 +131,66 @@ DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode,
 	TP_ARGS(inode, flags)
 );
 
+#ifdef CREATE_TRACE_POINTS
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
+{
+	return kernfs_path_len(wb->memcg_css->cgroup->kn) + 1;
+}
+
+static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
+{
+	struct cgroup *cgrp = wb->memcg_css->cgroup;
+	char *path;
+
+	path = cgroup_path(cgrp, buf, kernfs_path_len(cgrp->kn) + 1);
+	WARN_ON_ONCE(path != buf);
+}
+
+static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
+{
+	if (wbc->wb)
+		return __trace_wb_cgroup_size(wbc->wb);
+	else
+		return 2;
+}
+
+static inline void __trace_wbc_assign_cgroup(char *buf,
+					     struct writeback_control *wbc)
+{
+	if (wbc->wb)
+		__trace_wb_assign_cgroup(buf, wbc->wb);
+	else
+		strcpy(buf, "/");
+}
+
+#else	/* CONFIG_CGROUP_WRITEBACK */
+
+static inline size_t __trace_wb_cgroup_size(struct bdi_writeback *wb)
+{
+	return 2;
+}
+
+static inline void __trace_wb_assign_cgroup(char *buf, struct bdi_writeback *wb)
+{
+	strcpy(buf, "/");
+}
+
+static inline size_t __trace_wbc_cgroup_size(struct writeback_control *wbc)
+{
+	return 2;
+}
+
+static inline void __trace_wbc_assign_cgroup(char *buf,
+					     struct writeback_control *wbc)
+{
+	strcpy(buf, "/");
+}
+
+#endif	/* CONFIG_CGROUP_WRITEBACK */
+#endif	/* CREATE_TRACE_POINTS */
+
 DECLARE_EVENT_CLASS(writeback_write_inode_template,
 
 	TP_PROTO(struct inode *inode, struct writeback_control *wbc),
@@ -141,6 +201,7 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
 		__array(char, name, 32)
 		__field(unsigned long, ino)
 		__field(int, sync_mode)
+		__dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
 	),
 
 	TP_fast_assign(
@@ -148,12 +209,14 @@ DECLARE_EVENT_CLASS(writeback_write_inode_template,
 			dev_name(inode_to_bdi(inode)->dev), 32);
 		__entry->ino		= inode->i_ino;
 		__entry->sync_mode	= wbc->sync_mode;
+		__trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
 	),
 
-	TP_printk("bdi %s: ino=%lu sync_mode=%d",
+	TP_printk("bdi %s: ino=%lu sync_mode=%d cgroup=%s",
 		__entry->name,
 		__entry->ino,
-		__entry->sync_mode
+		__entry->sync_mode,
+		__get_str(cgroup)
 	)
 );
 
@@ -172,8 +235,8 @@ DEFINE_EVENT(writeback_write_inode_template, writeback_write_inode,
 );
 
 DECLARE_EVENT_CLASS(writeback_work_class,
-	TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work),
-	TP_ARGS(bdi, work),
+	TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work),
+	TP_ARGS(wb, work),
 	TP_STRUCT__entry(
 		__array(char, name, 32)
 		__field(long, nr_pages)
@@ -183,10 +246,11 @@ DECLARE_EVENT_CLASS(writeback_work_class,
 		__field(int, range_cyclic)
 		__field(int, for_background)
 		__field(int, reason)
+		__dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
 	),
 	TP_fast_assign(
 		strncpy(__entry->name,
-			bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
+			wb->bdi->dev ? dev_name(wb->bdi->dev) : "(unknown)", 32);
 		__entry->nr_pages = work->nr_pages;
 		__entry->sb_dev = work->sb ? work->sb->s_dev : 0;
 		__entry->sync_mode = work->sync_mode;
@@ -194,9 +258,10 @@ DECLARE_EVENT_CLASS(writeback_work_class,
 		__entry->range_cyclic = work->range_cyclic;
 		__entry->for_background	= work->for_background;
 		__entry->reason = work->reason;
+		__trace_wb_assign_cgroup(__get_str(cgroup), wb);
 	),
 	TP_printk("bdi %s: sb_dev %d:%d nr_pages=%ld sync_mode=%d "
-		  "kupdate=%d range_cyclic=%d background=%d reason=%s",
+		  "kupdate=%d range_cyclic=%d background=%d reason=%s cgroup=%s",
 		  __entry->name,
 		  MAJOR(__entry->sb_dev), MINOR(__entry->sb_dev),
 		  __entry->nr_pages,
@@ -204,13 +269,14 @@ DECLARE_EVENT_CLASS(writeback_work_class,
 		  __entry->for_kupdate,
 		  __entry->range_cyclic,
 		  __entry->for_background,
-		  __print_symbolic(__entry->reason, WB_WORK_REASON)
+		  __print_symbolic(__entry->reason, WB_WORK_REASON),
+		  __get_str(cgroup)
 	)
 );
 #define DEFINE_WRITEBACK_WORK_EVENT(name) \
 DEFINE_EVENT(writeback_work_class, name, \
-	TP_PROTO(struct backing_dev_info *bdi, struct wb_writeback_work *work), \
-	TP_ARGS(bdi, work))
+	TP_PROTO(struct bdi_writeback *wb, struct wb_writeback_work *work), \
+	TP_ARGS(wb, work))
 DEFINE_WRITEBACK_WORK_EVENT(writeback_queue);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_exec);
 DEFINE_WRITEBACK_WORK_EVENT(writeback_start);
@@ -230,26 +296,42 @@ TRACE_EVENT(writeback_pages_written,
 );
 
 DECLARE_EVENT_CLASS(writeback_class,
-	TP_PROTO(struct backing_dev_info *bdi),
-	TP_ARGS(bdi),
+	TP_PROTO(struct bdi_writeback *wb),
+	TP_ARGS(wb),
 	TP_STRUCT__entry(
 		__array(char, name, 32)
+		__dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
 	),
 	TP_fast_assign(
-		strncpy(__entry->name, dev_name(bdi->dev), 32);
+		strncpy(__entry->name, dev_name(wb->bdi->dev), 32);
+		__trace_wb_assign_cgroup(__get_str(cgroup), wb);
 	),
-	TP_printk("bdi %s",
-		  __entry->name
+	TP_printk("bdi %s: cgroup=%s",
+		  __entry->name,
+		  __get_str(cgroup)
 	)
 );
 #define DEFINE_WRITEBACK_EVENT(name) \
 DEFINE_EVENT(writeback_class, name, \
-	TP_PROTO(struct backing_dev_info *bdi), \
-	TP_ARGS(bdi))
+	TP_PROTO(struct bdi_writeback *wb), \
+	TP_ARGS(wb))
 
 DEFINE_WRITEBACK_EVENT(writeback_nowork);
 DEFINE_WRITEBACK_EVENT(writeback_wake_background);
-DEFINE_WRITEBACK_EVENT(writeback_bdi_register);
+
+TRACE_EVENT(writeback_bdi_register,
+	TP_PROTO(struct backing_dev_info *bdi),
+	TP_ARGS(bdi),
+	TP_STRUCT__entry(
+		__array(char, name, 32)
+	),
+	TP_fast_assign(
+		strncpy(__entry->name, dev_name(bdi->dev), 32);
+	),
+	TP_printk("bdi %s",
+		__entry->name
+	)
+);
 
 DECLARE_EVENT_CLASS(wbc_class,
 	TP_PROTO(struct writeback_control *wbc, struct backing_dev_info *bdi),
@@ -265,6 +347,7 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__field(int, range_cyclic)
 		__field(long, range_start)
 		__field(long, range_end)
+		__dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
 	),
 
 	TP_fast_assign(
@@ -278,11 +361,12 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__entry->range_cyclic	= wbc->range_cyclic;
 		__entry->range_start	= (long)wbc->range_start;
 		__entry->range_end	= (long)wbc->range_end;
+		__trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
 	),
 
 	TP_printk("bdi %s: towrt=%ld skip=%ld mode=%d kupd=%d "
 		"bgrd=%d reclm=%d cyclic=%d "
-		"start=0x%lx end=0x%lx",
+		"start=0x%lx end=0x%lx cgroup=%s",
 		__entry->name,
 		__entry->nr_to_write,
 		__entry->pages_skipped,
@@ -292,7 +376,9 @@ DECLARE_EVENT_CLASS(wbc_class,
 		__entry->for_reclaim,
 		__entry->range_cyclic,
 		__entry->range_start,
-		__entry->range_end)
+		__entry->range_end,
+		__get_str(cgroup)
+	)
 )
 
 #define DEFINE_WBC_EVENT(name) \
@@ -312,6 +398,7 @@ TRACE_EVENT(writeback_queue_io,
 		__field(long,		age)
 		__field(int,		moved)
 		__field(int,		reason)
+		__dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
 	),
 	TP_fast_assign(
 		unsigned long *older_than_this = work->older_than_this;
@@ -321,13 +408,15 @@ TRACE_EVENT(writeback_queue_io,
 				  (jiffies - *older_than_this) * 1000 / HZ : -1;
 		__entry->moved	= moved;
 		__entry->reason	= work->reason;
+		__trace_wb_assign_cgroup(__get_str(cgroup), wb);
 	),
-	TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s",
+	TP_printk("bdi %s: older=%lu age=%ld enqueue=%d reason=%s cgroup=%s",
 		__entry->name,
 		__entry->older,	/* older_than_this in jiffies */
 		__entry->age,	/* older_than_this in relative milliseconds */
 		__entry->moved,
-		__print_symbolic(__entry->reason, WB_WORK_REASON)
+		__print_symbolic(__entry->reason, WB_WORK_REASON),
+		__get_str(cgroup)
 	)
 );
 
@@ -381,11 +470,11 @@ TRACE_EVENT(global_dirty_state,
 
 TRACE_EVENT(bdi_dirty_ratelimit,
 
-	TP_PROTO(struct backing_dev_info *bdi,
+	TP_PROTO(struct bdi_writeback *wb,
 		 unsigned long dirty_rate,
 		 unsigned long task_ratelimit),
 
-	TP_ARGS(bdi, dirty_rate, task_ratelimit),
+	TP_ARGS(wb, dirty_rate, task_ratelimit),
 
 	TP_STRUCT__entry(
 		__array(char,		bdi, 32)
@@ -395,36 +484,39 @@ TRACE_EVENT(bdi_dirty_ratelimit,
 		__field(unsigned long,	dirty_ratelimit)
 		__field(unsigned long,	task_ratelimit)
 		__field(unsigned long,	balanced_dirty_ratelimit)
+		__dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
 	),
 
 	TP_fast_assign(
-		strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
-		__entry->write_bw	= KBps(bdi->wb.write_bandwidth);
-		__entry->avg_write_bw	= KBps(bdi->wb.avg_write_bandwidth);
+		strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
+		__entry->write_bw	= KBps(wb->write_bandwidth);
+		__entry->avg_write_bw	= KBps(wb->avg_write_bandwidth);
 		__entry->dirty_rate	= KBps(dirty_rate);
-		__entry->dirty_ratelimit = KBps(bdi->wb.dirty_ratelimit);
+		__entry->dirty_ratelimit = KBps(wb->dirty_ratelimit);
 		__entry->task_ratelimit	= KBps(task_ratelimit);
 		__entry->balanced_dirty_ratelimit =
-					KBps(bdi->wb.balanced_dirty_ratelimit);
+					KBps(wb->balanced_dirty_ratelimit);
+		__trace_wb_assign_cgroup(__get_str(cgroup), wb);
 	),
 
 	TP_printk("bdi %s: "
 		  "write_bw=%lu awrite_bw=%lu dirty_rate=%lu "
 		  "dirty_ratelimit=%lu task_ratelimit=%lu "
-		  "balanced_dirty_ratelimit=%lu",
+		  "balanced_dirty_ratelimit=%lu cgroup=%s",
 		  __entry->bdi,
 		  __entry->write_bw,		/* write bandwidth */
 		  __entry->avg_write_bw,	/* avg write bandwidth */
 		  __entry->dirty_rate,		/* bdi dirty rate */
 		  __entry->dirty_ratelimit,	/* base ratelimit */
 		  __entry->task_ratelimit, /* ratelimit with position control */
-		  __entry->balanced_dirty_ratelimit /* the balanced ratelimit */
+		  __entry->balanced_dirty_ratelimit, /* the balanced ratelimit */
+		  __get_str(cgroup)
 	)
 );
 
 TRACE_EVENT(balance_dirty_pages,
 
-	TP_PROTO(struct backing_dev_info *bdi,
+	TP_PROTO(struct bdi_writeback *wb,
 		 unsigned long thresh,
 		 unsigned long bg_thresh,
 		 unsigned long dirty,
@@ -437,7 +529,7 @@ TRACE_EVENT(balance_dirty_pages,
 		 long pause,
 		 unsigned long start_time),
 
-	TP_ARGS(bdi, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
+	TP_ARGS(wb, thresh, bg_thresh, dirty, bdi_thresh, bdi_dirty,
 		dirty_ratelimit, task_ratelimit,
 		dirtied, period, pause, start_time),
 
@@ -456,11 +548,12 @@ TRACE_EVENT(balance_dirty_pages,
 		__field(	 long,	pause)
 		__field(unsigned long,	period)
 		__field(	 long,	think)
+		__dynamic_array(char, cgroup, __trace_wb_cgroup_size(wb))
 	),
 
 	TP_fast_assign(
 		unsigned long freerun = (thresh + bg_thresh) / 2;
-		strlcpy(__entry->bdi, dev_name(bdi->dev), 32);
+		strlcpy(__entry->bdi, dev_name(wb->bdi->dev), 32);
 
 		__entry->limit		= global_wb_domain.dirty_limit;
 		__entry->setpoint	= (global_wb_domain.dirty_limit +
@@ -478,6 +571,7 @@ TRACE_EVENT(balance_dirty_pages,
 		__entry->period		= period * 1000 / HZ;
 		__entry->pause		= pause * 1000 / HZ;
 		__entry->paused		= (jiffies - start_time) * 1000 / HZ;
+		__trace_wb_assign_cgroup(__get_str(cgroup), wb);
 	),
 
 
@@ -486,7 +580,7 @@ TRACE_EVENT(balance_dirty_pages,
 		  "bdi_setpoint=%lu bdi_dirty=%lu "
 		  "dirty_ratelimit=%lu task_ratelimit=%lu "
 		  "dirtied=%u dirtied_pause=%u "
-		  "paused=%lu pause=%ld period=%lu think=%ld",
+		  "paused=%lu pause=%ld period=%lu think=%ld cgroup=%s",
 		  __entry->bdi,
 		  __entry->limit,
 		  __entry->setpoint,
@@ -500,7 +594,8 @@ TRACE_EVENT(balance_dirty_pages,
 		  __entry->paused,	/* ms */
 		  __entry->pause,	/* ms */
 		  __entry->period,	/* ms */
-		  __entry->think	/* ms */
+		  __entry->think,	/* ms */
+		  __get_str(cgroup)
 	  )
 );
 
@@ -514,6 +609,8 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
 		__field(unsigned long, ino)
 		__field(unsigned long, state)
 		__field(unsigned long, dirtied_when)
+		__dynamic_array(char, cgroup,
+				__trace_wb_cgroup_size(inode_to_wb(inode)))
 	),
 
 	TP_fast_assign(
@@ -522,14 +619,16 @@ TRACE_EVENT(writeback_sb_inodes_requeue,
 		__entry->ino		= inode->i_ino;
 		__entry->state		= inode->i_state;
 		__entry->dirtied_when	= inode->dirtied_when;
+		__trace_wb_assign_cgroup(__get_str(cgroup), inode_to_wb(inode));
 	),
 
-	TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu",
+	TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu cgroup=%s",
 		  __entry->name,
 		  __entry->ino,
 		  show_inode_state(__entry->state),
 		  __entry->dirtied_when,
-		  (jiffies - __entry->dirtied_when) / HZ
+		  (jiffies - __entry->dirtied_when) / HZ,
+		  __get_str(cgroup)
 	)
 );
 
@@ -585,6 +684,7 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
 		__field(unsigned long, writeback_index)
 		__field(long, nr_to_write)
 		__field(unsigned long, wrote)
+		__dynamic_array(char, cgroup, __trace_wbc_cgroup_size(wbc))
 	),
 
 	TP_fast_assign(
@@ -596,10 +696,11 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
 		__entry->writeback_index = inode->i_mapping->writeback_index;
 		__entry->nr_to_write	= nr_to_write;
 		__entry->wrote		= nr_to_write - wbc->nr_to_write;
+		__trace_wbc_assign_cgroup(__get_str(cgroup), wbc);
 	),
 
 	TP_printk("bdi %s: ino=%lu state=%s dirtied_when=%lu age=%lu "
-		  "index=%lu to_write=%ld wrote=%lu",
+		  "index=%lu to_write=%ld wrote=%lu cgroup=%s",
 		  __entry->name,
 		  __entry->ino,
 		  show_inode_state(__entry->state),
@@ -607,7 +708,8 @@ DECLARE_EVENT_CLASS(writeback_single_inode_template,
 		  (jiffies - __entry->dirtied_when) / HZ,
 		  __entry->writeback_index,
 		  __entry->nr_to_write,
-		  __entry->wrote
+		  __entry->wrote,
+		  __get_str(cgroup)
 	)
 );
author	Linus Torvalds <torvalds@linux-foundation.org>	2015-09-10 18:56:14 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	2015-09-10 18:56:14 -0700
commit	b0a1ea51bda4c2bcdde460221e1772f3a4f8c44f (patch)
tree	9684c11b72718cd7e96e5eb93298690269ecf447 /include
parent	33e247c7e58d335d70ecb84fd869091e2e4b8dcb (diff)
parent	69d7fde5909b614114343974cfc52cb8ff30b544 (diff)