author    Sai Gurrappadi <sgurrappadi@nvidia.com>    2014-01-21 16:41:37 -0800
committer Diwakar Tundlam <dtundlam@nvidia.com>      2014-03-03 19:38:54 -0800
commit    3bfdbefc2cf0ecf2933250813c356b3d147e59e0 (patch)
tree      de830b9368a6ac8b3788fbb47054b7633771c8a0 /kernel
parent    33db1f7eb8e2d9eaad2dce65152cd5b2a4a27fae (diff)
sched: Force sleep on consecutive sched_yields
If a task sched_yields to itself continuously, force the task to sleep in sched_yield. This lowers the CPU load of the task, thereby lowering the CPU frequency and improving power.

Added a stat variable to track how many times we sleep due to these consecutive sched_yields. Also added sysctl knobs to control the number of consecutive sched_yields after which the sleep kicks in, and the duration of the sleep in us.

Bug 1424617

Change-Id: Ie92412b8b900365816e17237fcbd0aac6e9c94ce
Signed-off-by: Sai Gurrappadi <sgurrappadi@nvidia.com>
Reviewed-on: http://git-master/r/358455
Reviewed-by: Wen Yi <wyi@nvidia.com>
Reviewed-by: Peter Zu <pzu@nvidia.com>
GVS: Gerrit_Virtual_Submit
Reviewed-by: Diwakar Tundlam <dtundlam@nvidia.com>
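As a rough illustration of the workload this change targets (this program is not part of the patch), a single thread spinning on sched_yield() on an otherwise idle CPU keeps yielding back to itself; once its consecutive self-yield count reaches the threshold (default 4), the kernel injects a short sleep instead of rescheduling:

#include <sched.h>
#include <stdio.h>

int main(void)
{
	unsigned long i;

	/* Each iteration yields; with no other runnable task on this
	 * CPU the scheduler picks us again, so the per-task yield
	 * count grows and the injected sleep kicks in past the
	 * threshold. */
	for (i = 0; i < 1000000UL; i++)
		sched_yield();

	printf("completed %lu yields\n", i);
	return 0;
}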
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/sched/core.c    24
-rw-r--r--  kernel/sched/debug.c    1
-rw-r--r--  kernel/sched/sched.h    3
-rw-r--r--  kernel/sched/stats.c    5
-rw-r--r--  kernel/sysctl.c        14
5 files changed, 43 insertions, 4 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 62cb6b24ab46..825447720620 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -297,7 +297,18 @@ __read_mostly int scheduler_running;
*/
int sysctl_sched_rt_runtime = 950000;
+/*
+ * Number of sched_yield calls that result in a thread yielding
+ * to itself before a sleep is injected in its next sched_yield call
+ * Setting this to -1 will disable adding sleep in sched_yield
+ */
+const_debug int sysctl_sched_yield_sleep_threshold = 4;
+/*
+ * Sleep duration in us used when sched_yield_sleep_threshold
+ * is exceeded.
+ */
+const_debug unsigned int sysctl_sched_yield_sleep_duration = 50;
/*
* __task_rq_lock - lock the rq @p resides on.
@@ -3035,6 +3046,7 @@ need_resched:
if (likely(prev != next)) {
rq->nr_switches++;
rq->curr = next;
+ prev->yield_count = 0;
++*switch_count;
context_switch(rq, prev, next); /* unlocks the rq */
@@ -3046,8 +3058,10 @@ need_resched:
*/
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- } else
+ } else {
+ prev->yield_count++;
raw_spin_unlock_irq(&rq->lock);
+ }
post_schedule(rq);
@@ -4352,6 +4366,8 @@ SYSCALL_DEFINE0(sched_yield)
struct rq *rq = this_rq_lock();
schedstat_inc(rq, yld_count);
+ if (rq->curr->yield_count == sysctl_sched_yield_sleep_threshold)
+ schedstat_inc(rq, yield_sleep_count);
current->sched_class->yield_task(rq);
/*
@@ -4363,7 +4379,11 @@ SYSCALL_DEFINE0(sched_yield)
do_raw_spin_unlock(&rq->lock);
sched_preempt_enable_no_resched();
- schedule();
+ if (rq->curr->yield_count == sysctl_sched_yield_sleep_threshold)
+ usleep_range(sysctl_sched_yield_sleep_duration,
+ sysctl_sched_yield_sleep_duration + 5);
+ else
+ schedule();
return 0;
}
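The hunks above reset a per-task counter (prev->yield_count) whenever a real context switch happens and increment it when schedule() picks the same task again; sched_yield() then sleeps instead of rescheduling once the counter equals the threshold. A minimal userspace model of that decision, with hypothetical names standing in for the kernel state:

#include <unistd.h>

#define YIELD_SLEEP_THRESHOLD	4	/* sysctl_sched_yield_sleep_threshold */
#define YIELD_SLEEP_DURATION_US	50	/* sysctl_sched_yield_sleep_duration */

static int yield_count;	/* per-task (prev->yield_count) in the patch */

/* schedule() picked a different task: the streak is broken. */
static void on_context_switch(void)
{
	yield_count = 0;
}

/* schedule() picked the same task again: extend the streak. */
static void on_self_yield(void)
{
	yield_count++;
}

/* Tail of sys_sched_yield(): sleep instead of schedule() at the threshold. */
static void sched_yield_tail(void)
{
	if (yield_count == YIELD_SLEEP_THRESHOLD)
		usleep(YIELD_SLEEP_DURATION_US);	/* kernel uses usleep_range() */
	/* else: fall through to schedule() in the kernel */
}

int main(void)
{
	int i;

	/* Six consecutive self-yields: the sleep fires on the fifth. */
	for (i = 0; i < 6; i++) {
		on_self_yield();
		sched_yield_tail();
	}
	on_context_switch();
	return 0;
}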
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 75024a673520..068ad55aa641 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -302,6 +302,7 @@ do { \
#define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n);
P(yld_count);
+ P(yield_sleep_count);
P(sched_count);
P(sched_goidle);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 35bd8b7f3a87..0fc275c70d7d 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -511,6 +511,7 @@ struct rq {
/* sys_sched_yield() stats */
unsigned int yld_count;
+ unsigned int yield_sleep_count;
/* schedule() stats */
unsigned int sched_count;
@@ -1143,6 +1144,8 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
extern const_debug unsigned int sysctl_sched_time_avg;
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;
+extern const_debug unsigned int sysctl_sched_yield_sleep_duration;
+extern const_debug int sysctl_sched_yield_sleep_threshold;
static inline u64 sched_avg_period(void)
{
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
index da98af347e8b..dff505e53f7f 100644
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -35,12 +35,13 @@ static int show_schedstat(struct seq_file *seq, void *v)
/* runqueue-specific stats */
seq_printf(seq,
- "cpu%d %u 0 %u %u %u %u %llu %llu %lu",
+ "cpu%d %u 0 %u %u %u %u %llu %llu %lu %u",
cpu, rq->yld_count,
rq->sched_count, rq->sched_goidle,
rq->ttwu_count, rq->ttwu_local,
rq->rq_cpu_time,
- rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount);
+ rq->rq_sched_info.run_delay, rq->rq_sched_info.pcount,
+ rq->yield_sleep_count);
seq_printf(seq, "\n");
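With this change the per-cpu line in /proc/schedstat grows a trailing yield_sleep_count field. A short reader sketch (assuming exactly the field layout printed above; header and domain lines are skipped because the sscanf match fails on them):

#include <stdio.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/proc/schedstat", "r");

	if (!f)
		return 1;
	while (fgets(line, sizeof(line), f)) {
		unsigned int cpu, yld, zero, sched_cnt, goidle;
		unsigned int ttwu, ttwu_local, sleep_cnt;
		unsigned long long cpu_time, run_delay;
		unsigned long pcount;

		if (sscanf(line, "cpu%u %u %u %u %u %u %u %llu %llu %lu %u",
			   &cpu, &yld, &zero, &sched_cnt, &goidle, &ttwu,
			   &ttwu_local, &cpu_time, &run_delay, &pcount,
			   &sleep_cnt) == 11)
			printf("cpu%u yield_sleep_count=%u\n", cpu, sleep_cnt);
	}
	fclose(f);
	return 0;
}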
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2ba77228ff97..207454a598f4 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -312,6 +312,20 @@ static struct ctl_table kern_table[] = {
.extra1 = &min_wakeup_granularity_ns,
.extra2 = &max_wakeup_granularity_ns,
},
+ {
+ .procname = "sched_yield_sleep_threshold",
+ .data = &sysctl_sched_yield_sleep_threshold,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_yield_sleep_duration",
+ .data = &sysctl_sched_yield_sleep_duration,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#ifdef CONFIG_SMP
{
.procname = "sched_tunable_scaling",