From 01e3eb82278bf45221fc38b391bc5ee0f6a314d6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 Jan 2009 13:00:50 +0100
Subject: Revert "sched: improve preempt debugging"

This reverts commit 7317d7b87edb41a9135e30be1ec3f7ef817c53dd.

This has been reported (and bisected) by Alexey Zaytsev and
Kamalesh Babulal to produce annoying warnings during bootup
on both x86 and powerpc.

kernel_locked() is not a valid test in IRQ context (we update the
BKL's ->lock_depth and the preempt count separately and non-atomicalyy),
so we cannot put it into the generic preempt debugging checks which
can run in IRQ contexts too.

Reported-and-bisected-by: Alexey Zaytsev <alexey.zaytsev@gmail.com>
Reported-and-bisected-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8be2c13b50d0..3b630d882660 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4440,7 +4440,7 @@ void __kprobes sub_preempt_count(int val)
 	/*
 	 * Underflow?
 	 */
-       if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
+	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
 		return;
 	/*
 	 * Is the spinlock portion underflowing?
-- 
cgit v1.2.3


From 5add95d4f7cf08f6f62510f19576992912387501 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:08 +0100
Subject: [CVE-2009-0029] System call wrappers part 06

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/sched.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8be2c13b50d0..1a0fdfa5ddf9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5126,7 +5126,7 @@ int can_nice(const struct task_struct *p, const int nice)
  * sys_setpriority is a more generic, but much slower function that
  * does similar things.
  */
-asmlinkage long sys_nice(int increment)
+SYSCALL_DEFINE1(nice, int, increment)
 {
 	long nice, retval;
 
@@ -5433,8 +5433,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  * @policy: new policy.
  * @param: structure containing the new RT priority.
  */
-asmlinkage long
-sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
+		struct sched_param __user *, param)
 {
 	/* negative values for policy are not valid */
 	if (policy < 0)
@@ -5448,7 +5448,7 @@ sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  * @pid: the pid in question.
  * @param: structure containing the new RT priority.
  */
-asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
 {
 	return do_sched_setscheduler(pid, -1, param);
 }
@@ -5457,7 +5457,7 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
  * @pid: the pid in question.
  */
-asmlinkage long sys_sched_getscheduler(pid_t pid)
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
 {
 	struct task_struct *p;
 	int retval;
@@ -5482,7 +5482,7 @@ asmlinkage long sys_sched_getscheduler(pid_t pid)
  * @pid: the pid in question.
  * @param: structure containing the RT priority.
  */
-asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 {
 	struct sched_param lp;
 	struct task_struct *p;
@@ -5600,8 +5600,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  * @user_mask_ptr: user-space pointer to the new cpu mask
  */
-asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
-				      unsigned long __user *user_mask_ptr)
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
 {
 	cpumask_var_t new_mask;
 	int retval;
@@ -5648,8 +5648,8 @@ out_unlock:
  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  * @user_mask_ptr: user-space pointer to hold the current cpu mask
  */
-asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
-				      unsigned long __user *user_mask_ptr)
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
 {
 	int ret;
 	cpumask_var_t mask;
@@ -5678,7 +5678,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
  * This function yields the current CPU to other tasks. If there are no
  * other threads running on this CPU then this function will return.
  */
-asmlinkage long sys_sched_yield(void)
+SYSCALL_DEFINE0(sched_yield)
 {
 	struct rq *rq = this_rq_lock();
 
@@ -5819,7 +5819,7 @@ long __sched io_schedule_timeout(long timeout)
  * this syscall returns the maximum rt_priority that can be used
  * by a given scheduling class.
  */
-asmlinkage long sys_sched_get_priority_max(int policy)
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
 {
 	int ret = -EINVAL;
 
@@ -5844,7 +5844,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
  * this syscall returns the minimum rt_priority that can be used
  * by a given scheduling class.
  */
-asmlinkage long sys_sched_get_priority_min(int policy)
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
 {
 	int ret = -EINVAL;
 
-- 
cgit v1.2.3


From 754fe8d297bfae7b77f7ce866e2fb0c5fb186506 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:09 +0100
Subject: [CVE-2009-0029] System call wrappers part 07

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 1a0fdfa5ddf9..65c02037b052 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5869,8 +5869,8 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
  * this syscall writes the default timeslice value of a given process
  * into the user-space timespec buffer. A value of '0' means infinity.
  */
-asmlinkage
-long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
+SYSCALL_DEFINE4(sched_rr_get_interval, pid_t, pid,
+		struct timespec __user *, interval)
 {
 	struct task_struct *p;
 	unsigned int time_slice;
-- 
cgit v1.2.3


From 17da2bd90abf428523de0fb98f7075e00e3ed42e Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:10 +0100
Subject: [CVE-2009-0029] System call wrappers part 08

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 65c02037b052..eb1931eef587 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5869,7 +5869,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
  * this syscall writes the default timeslice value of a given process
  * into the user-space timespec buffer. A value of '0' means infinity.
  */
-SYSCALL_DEFINE4(sched_rr_get_interval, pid_t, pid,
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 		struct timespec __user *, interval)
 {
 	struct task_struct *p;
-- 
cgit v1.2.3


From 98a4826b99bc4bcc34c604b2fc4fcf4d771600ec Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 14 Jan 2009 10:56:32 +0100
Subject: sched: fix bandwidth validation for UID grouping

Impact: make rt-limit tunables work again

Mark Glines reported:

> I've got an issue on x86-64 where I can't configure the system to allow
> RT tasks for a non-root user.
>
> In 2.6.26.5, I was able to do the following to set things up nicely:
> echo 450000 >/sys/kernel/uids/0/cpu_rt_runtime
> echo 450000 >/sys/kernel/uids/1000/cpu_rt_runtime
>
> Seems like every value I try to echo into the /sys files returns EINVAL.

For UID grouping we initialize the root group with infinite bandwidth
which by default is actually more than the global limit, therefore the
bandwidth check always fails.

Because the root group is a phantom group (for UID grouping) we cannot
runtime adjust it, therefore we let it reflect the global bandwidth
settings.

Reported-by: Mark Glines <mark@glines.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3b630d882660..ed62d1cee05c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9050,6 +9050,13 @@ static int tg_schedulable(struct task_group *tg, void *data)
 		runtime = d->rt_runtime;
 	}
 
+#ifdef CONFIG_USER_SCHED
+	if (tg == &root_task_group) {
+		period = global_rt_period();
+		runtime = global_rt_runtime();
+	}
+#endif
+
 	/*
 	 * Cannot have more runtime than the period.
 	 */
-- 
cgit v1.2.3


From cce7ade803699463ecc62a065ca522004f7ccb3d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 15 Jan 2009 14:53:37 +0100
Subject: sched: SCHED_IDLE weight change

Increase the SCHED_IDLE weight from 2 to 3, this gives much more stable
vruntime numbers.

time advanced in 100ms:

 weight=2

 64765.988352
 67012.881408
 88501.412352

 weight=3

 35496.181411
 34130.971298
 35497.411573

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index ed62d1cee05c..6acfb3c2398b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1323,8 +1323,8 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
  * slice expiry etc.
  */
 
-#define WEIGHT_IDLEPRIO		2
-#define WMULT_IDLEPRIO		(1 << 31)
+#define WEIGHT_IDLEPRIO                3
+#define WMULT_IDLEPRIO         1431655765
 
 /*
  * Nice levels are multiplicative, with a gentle 10% change for every
-- 
cgit v1.2.3


From d942fb6c7d391baba3dddb566eb735fbf3df8528 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 26 Jan 2009 17:56:17 +0100
Subject: sched: fix sync wakeups

Pawel Dziekonski reported that the openssl benchmark and his
quantum chemistry application both show slowdowns due to the
scheduler under-parallelizing execution.

The reason are pipe wakeups still doing 'sync' wakeups which
overrides the normal buddy wakeup logic - even if waker and
wakee are loosely coupled.

Fix an inversion of logic in the buddy wakeup code.

Reported-by: Pawel Dziekonski <dzieko@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 52bbf1c842a8..770b1f9ebe14 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2266,6 +2266,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	if (!sched_feat(SYNC_WAKEUPS))
 		sync = 0;
 
+	if (!sync && (current->se.avg_overlap < sysctl_sched_migration_cost &&
+			    p->se.avg_overlap < sysctl_sched_migration_cost))
+		sync = 1;
+
 #ifdef CONFIG_SMP
 	if (sched_feat(LB_WAKEUP_UPDATE)) {
 		struct sched_domain *sd;
-- 
cgit v1.2.3


From 1596e29773eadd96b0a5fc6e736afa52394cafda Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 28 Jan 2009 14:51:38 +0100
Subject: sched: symmetric sync vs avg_overlap

Reinstate the weakening of the sync hint if set. This yields a more
symmetric usage of avg_overlap.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 770b1f9ebe14..242d0d47a70d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2266,9 +2266,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	if (!sched_feat(SYNC_WAKEUPS))
 		sync = 0;
 
-	if (!sync && (current->se.avg_overlap < sysctl_sched_migration_cost &&
-			    p->se.avg_overlap < sysctl_sched_migration_cost))
-		sync = 1;
+	if (!sync) {
+		if (current->se.avg_overlap < sysctl_sched_migration_cost &&
+			  p->se.avg_overlap < sysctl_sched_migration_cost)
+			sync = 1;
+	} else {
+		if (current->se.avg_overlap >= sysctl_sched_migration_cost ||
+			  p->se.avg_overlap >= sysctl_sched_migration_cost)
+			sync = 0;
+	}
 
 #ifdef CONFIG_SMP
 	if (sched_feat(LB_WAKEUP_UPDATE)) {
-- 
cgit v1.2.3


From 777c6c5f1f6e757ae49ecca2ed72d6b1f523c007 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Wed, 4 Feb 2009 15:12:14 -0800
Subject: wait: prevent exclusive waiter starvation

With exclusive waiters, every process woken up through the wait queue must
ensure that the next waiter down the line is woken when it has finished.

Interruptible waiters don't do that when aborting due to a signal.  And if
an aborting waiter is concurrently woken up through the waitqueue, noone
will ever wake up the next waiter.

This has been observed with __wait_on_bit_lock() used by
lock_page_killable(): the first contender on the queue was aborting when
the actual lock holder woke it up concurrently.  The aborted contender
didn't acquire the lock and therefor never did an unlock followed by
waking up the next waiter.

Add abort_exclusive_wait() which removes the process' wait descriptor from
the waitqueue, iff still queued, or wakes up the next waiter otherwise.
It does so under the waitqueue lock.  Racing with a wake up means the
aborting process is either already woken (removed from the queue) and will
wake up the next waiter, or it will remove itself from the queue and the
concurrent wake up will apply to the next waiter after it.

Use abort_exclusive_wait() in __wait_event_interruptible_exclusive() and
__wait_on_bit_lock() when they were interrupted by other means than a wake
up through the queue.

[akpm@linux-foundation.org: coding-style fixes]
Reported-by: Chris Mason <chris.mason@oracle.com>
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Mentored-by: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Matthew Wilcox <matthew@wil.cx>
Cc: Chuck Lever <cel@citi.umich.edu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: <stable@kernel.org>		["after some testing"]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 242d0d47a70d..8ee437a5ec1d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4697,8 +4697,8 @@ EXPORT_SYMBOL(default_wake_function);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-			     int nr_exclusive, int sync, void *key)
+void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+			int nr_exclusive, int sync, void *key)
 {
 	wait_queue_t *curr, *next;
 
-- 
cgit v1.2.3