summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2011-05-26 13:48:30 +0200
committerIngo Molnar <mingo@elte.hu>2011-05-26 13:48:39 +0200
commit1102c660dd35725a11c7ca9365c237f2f42f6b30 (patch)
treecd32d3053b30050182218e0d36b4aed7459c48de /kernel
parent6e9101aeec39961308176e0f59e73ac5d37d243a (diff)
parent4db70f73e56961b9bcdfd0c36c62847a18b7dbb5 (diff)
Merge branch 'linus' into perf/urgent
Merge reason: Linus applied an overlapping commit: 5f2e8e2b0bf0: kernel/watchdog.c: Use proper ANSI C prototypes So merge it in to make sure we can iterate the file without conflicts. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks2
-rw-r--r--kernel/auditsc.c27
-rw-r--r--kernel/capability.c4
-rw-r--r--kernel/compat.c55
-rw-r--r--kernel/cred.c6
-rw-r--r--kernel/exit.c110
-rw-r--r--kernel/fork.c42
-rw-r--r--kernel/hrtimer.c162
-rw-r--r--kernel/irq/proc.c54
-rw-r--r--kernel/kmod.c100
-rw-r--r--kernel/module.c4
-rw-r--r--kernel/mutex.c25
-rw-r--r--kernel/nsproxy.c42
-rw-r--r--kernel/pm_qos_params.c2
-rw-r--r--kernel/posix-cpu-timers.c4
-rw-r--r--kernel/posix-timers.c27
-rw-r--r--kernel/printk.c87
-rw-r--r--kernel/ptrace.c120
-rw-r--r--kernel/rcutiny.c1
-rw-r--r--kernel/rcutree.c1
-rw-r--r--kernel/sched.c41
-rw-r--r--kernel/sched_fair.c52
-rw-r--r--kernel/signal.c684
-rw-r--r--kernel/sys_ni.c9
-rw-r--r--kernel/sysctl.c8
-rw-r--r--kernel/time/alarmtimer.c16
-rw-r--r--kernel/time/clockevents.c2
-rw-r--r--kernel/time/clocksource.c2
-rw-r--r--kernel/time/tick-broadcast.c16
-rw-r--r--kernel/time/timekeeping.c17
-rw-r--r--kernel/utsname.c39
-rw-r--r--kernel/workqueue.c4
32 files changed, 1224 insertions, 541 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 88c92fb44618..5068e2a4e75f 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -199,4 +199,4 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
config MUTEX_SPIN_ON_OWNER
- def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES
+ def_bool SMP && !DEBUG_MUTEXES
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b33513a08beb..00d79df03e76 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -443,17 +443,25 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
/* Determine if any context name data matches a rule's watch data */
/* Compare a task_struct with an audit_rule. Return 1 on match, 0
- * otherwise. */
+ * otherwise.
+ *
+ * If task_creation is true, this is an explicit indication that we are
+ * filtering a task rule at task creation time. This and tsk == current are
+ * the only situations where tsk->cred may be accessed without an rcu read lock.
+ */
static int audit_filter_rules(struct task_struct *tsk,
struct audit_krule *rule,
struct audit_context *ctx,
struct audit_names *name,
- enum audit_state *state)
+ enum audit_state *state,
+ bool task_creation)
{
- const struct cred *cred = get_task_cred(tsk);
+ const struct cred *cred;
int i, j, need_sid = 1;
u32 sid;
+ cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
+
for (i = 0; i < rule->field_count; i++) {
struct audit_field *f = &rule->fields[i];
int result = 0;
@@ -637,10 +645,8 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
}
- if (!result) {
- put_cred(cred);
+ if (!result)
return 0;
- }
}
if (ctx) {
@@ -656,7 +662,6 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
}
- put_cred(cred);
return 1;
}
@@ -671,7 +676,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
rcu_read_lock();
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
- if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
+ if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
+ &state, true)) {
if (state == AUDIT_RECORD_CONTEXT)
*key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
rcu_read_unlock();
@@ -705,7 +711,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
list_for_each_entry_rcu(e, list, list) {
if ((e->rule.mask[word] & bit) == bit &&
audit_filter_rules(tsk, &e->rule, ctx, NULL,
- &state)) {
+ &state, false)) {
rcu_read_unlock();
ctx->current_state = state;
return state;
@@ -743,7 +749,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
list_for_each_entry_rcu(e, list, list) {
if ((e->rule.mask[word] & bit) == bit &&
- audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
+ audit_filter_rules(tsk, &e->rule, ctx, n,
+ &state, false)) {
rcu_read_unlock();
ctx->current_state = state;
return;
diff --git a/kernel/capability.c b/kernel/capability.c
index 32a80e08ff4b..283c529f8b1c 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -22,12 +22,8 @@
*/
const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
-const kernel_cap_t __cap_full_set = CAP_FULL_SET;
-const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
EXPORT_SYMBOL(__cap_empty_set);
-EXPORT_SYMBOL(__cap_full_set);
-EXPORT_SYMBOL(__cap_init_eff_set);
int file_caps_enabled = 1;
diff --git a/kernel/compat.c b/kernel/compat.c
index 38b1d2c1cbe8..fc9eb093acd5 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -293,6 +293,8 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
return compat_jiffies_to_clock_t(jiffies);
}
+#ifdef __ARCH_WANT_SYS_SIGPENDING
+
/*
* Assumption: old_sigset_t and compat_old_sigset_t are both
* types that can be passed to put_user()/get_user().
@@ -312,6 +314,10 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
return ret;
}
+#endif
+
+#ifdef __ARCH_WANT_SYS_SIGPROCMASK
+
asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
compat_old_sigset_t __user *oset)
{
@@ -333,6 +339,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
return ret;
}
+#endif
+
asmlinkage long compat_sys_setrlimit(unsigned int resource,
struct compat_rlimit __user *rlim)
{
@@ -890,10 +898,9 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
{
compat_sigset_t s32;
sigset_t s;
- int sig;
struct timespec t;
siginfo_t info;
- long ret, timeout = 0;
+ long ret;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
@@ -901,51 +908,19 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
return -EFAULT;
sigset_from_compat(&s, &s32);
- sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP));
- signotset(&s);
if (uts) {
- if (get_compat_timespec (&t, uts))
+ if (get_compat_timespec(&t, uts))
return -EFAULT;
- if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0
- || t.tv_sec < 0)
- return -EINVAL;
}
- spin_lock_irq(&current->sighand->siglock);
- sig = dequeue_signal(current, &s, &info);
- if (!sig) {
- timeout = MAX_SCHEDULE_TIMEOUT;
- if (uts)
- timeout = timespec_to_jiffies(&t)
- +(t.tv_sec || t.tv_nsec);
- if (timeout) {
- current->real_blocked = current->blocked;
- sigandsets(&current->blocked, &current->blocked, &s);
-
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- timeout = schedule_timeout_interruptible(timeout);
-
- spin_lock_irq(&current->sighand->siglock);
- sig = dequeue_signal(current, &s, &info);
- current->blocked = current->real_blocked;
- siginitset(&current->real_blocked, 0);
- recalc_sigpending();
- }
- }
- spin_unlock_irq(&current->sighand->siglock);
+ ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
- if (sig) {
- ret = sig;
- if (uinfo) {
- if (copy_siginfo_to_user32(uinfo, &info))
- ret = -EFAULT;
- }
- }else {
- ret = timeout?-EINTR:-EAGAIN;
+ if (ret > 0 && uinfo) {
+ if (copy_siginfo_to_user32(uinfo, &info))
+ ret = -EFAULT;
}
+
return ret;
}
diff --git a/kernel/cred.c b/kernel/cred.c
index 8093c16b84b1..e12c8af793f8 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -49,10 +49,10 @@ struct cred init_cred = {
.magic = CRED_MAGIC,
#endif
.securebits = SECUREBITS_DEFAULT,
- .cap_inheritable = CAP_INIT_INH_SET,
+ .cap_inheritable = CAP_EMPTY_SET,
.cap_permitted = CAP_FULL_SET,
- .cap_effective = CAP_INIT_EFF_SET,
- .cap_bset = CAP_INIT_BSET,
+ .cap_effective = CAP_FULL_SET,
+ .cap_bset = CAP_FULL_SET,
.user = INIT_USER,
.user_ns = &init_user_ns,
.group_info = &init_groups,
diff --git a/kernel/exit.c b/kernel/exit.c
index 8dd874181542..20a406471525 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1377,11 +1377,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
return NULL;
}
-/*
- * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
- * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
- * the lock and this task is uninteresting. If we return nonzero, we have
- * released the lock and the system call should return.
+/**
+ * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
+ * @wo: wait options
+ * @ptrace: is the wait for ptrace
+ * @p: task to wait for
+ *
+ * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
+ *
+ * CONTEXT:
+ * read_lock(&tasklist_lock), which is released if return value is
+ * non-zero. Also, grabs and releases @p->sighand->siglock.
+ *
+ * RETURNS:
+ * 0 if wait condition didn't exist and search for other wait conditions
+ * should continue. Non-zero return, -errno on failure and @p's pid on
+ * success, implies that tasklist_lock is released and wait condition
+ * search should terminate.
*/
static int wait_task_stopped(struct wait_opts *wo,
int ptrace, struct task_struct *p)
@@ -1397,6 +1409,9 @@ static int wait_task_stopped(struct wait_opts *wo,
if (!ptrace && !(wo->wo_flags & WUNTRACED))
return 0;
+ if (!task_stopped_code(p, ptrace))
+ return 0;
+
exit_code = 0;
spin_lock_irq(&p->sighand->siglock);
@@ -1538,33 +1553,84 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
return 0;
}
- if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+ /* dead body doesn't have much to contribute */
+ if (p->exit_state == EXIT_DEAD)
+ return 0;
+
+ /* slay zombie? */
+ if (p->exit_state == EXIT_ZOMBIE) {
+ /*
+ * A zombie ptracee is only visible to its ptracer.
+ * Notification and reaping will be cascaded to the real
+ * parent when the ptracer detaches.
+ */
+ if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+ /* it will become visible, clear notask_error */
+ wo->notask_error = 0;
+ return 0;
+ }
+
+ /* we don't reap group leaders with subthreads */
+ if (!delay_group_leader(p))
+ return wait_task_zombie(wo, p);
+
/*
- * This child is hidden by ptrace.
- * We aren't allowed to see it now, but eventually we will.
+ * Allow access to stopped/continued state via zombie by
+ * falling through. Clearing of notask_error is complex.
+ *
+ * When !@ptrace:
+ *
+ * If WEXITED is set, notask_error should naturally be
+ * cleared. If not, subset of WSTOPPED|WCONTINUED is set,
+ * so, if there are live subthreads, there are events to
+ * wait for. If all subthreads are dead, it's still safe
+ * to clear - this function will be called again in finite
+ * amount time once all the subthreads are released and
+ * will then return without clearing.
+ *
+ * When @ptrace:
+ *
+ * Stopped state is per-task and thus can't change once the
+ * target task dies. Only continued and exited can happen.
+ * Clear notask_error if WCONTINUED | WEXITED.
+ */
+ if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
+ wo->notask_error = 0;
+ } else {
+ /*
+ * If @p is ptraced by a task in its real parent's group,
+ * hide group stop/continued state when looking at @p as
+ * the real parent; otherwise, a single stop can be
+ * reported twice as group and ptrace stops.
+ *
+ * If a ptracer wants to distinguish the two events for its
+ * own children, it should create a separate process which
+ * takes the role of real parent.
+ */
+ if (likely(!ptrace) && task_ptrace(p) &&
+ same_thread_group(p->parent, p->real_parent))
+ return 0;
+
+ /*
+ * @p is alive and it's gonna stop, continue or exit, so
+ * there always is something to wait for.
*/
wo->notask_error = 0;
- return 0;
}
- if (p->exit_state == EXIT_DEAD)
- return 0;
-
/*
- * We don't reap group leaders with subthreads.
+ * Wait for stopped. Depending on @ptrace, different stopped state
+ * is used and the two don't interact with each other.
*/
- if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
- return wait_task_zombie(wo, p);
+ ret = wait_task_stopped(wo, ptrace, p);
+ if (ret)
+ return ret;
/*
- * It's stopped or running now, so it might
- * later continue, exit, or stop again.
+ * Wait for continued. There's only one continued state and the
+ * ptracer can consume it which can confuse the real parent. Don't
+ * use WCONTINUED from ptracer. You don't need or want it.
*/
- wo->notask_error = 0;
-
- if (task_stopped_code(p, ptrace))
- return wait_task_stopped(wo, ptrace, p);
-
return wait_task_continued(wo, p);
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 2b44d82b8237..8e7e135d0817 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -383,15 +383,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
if (tmp->vm_flags & VM_SHARED)
mapping->i_mmap_writable++;
- tmp->vm_truncate_count = mpnt->vm_truncate_count;
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
vma_prio_tree_add(tmp, mpnt);
flush_dcache_mmap_unlock(mapping);
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
/*
@@ -486,6 +485,20 @@ static void mm_init_aio(struct mm_struct *mm)
#endif
}
+int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (oldmm)
+ cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm));
+ else
+ memset(mm_cpumask(mm), 0, cpumask_size());
+#endif
+ return 0;
+}
+
static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
{
atomic_set(&mm->mm_users, 1);
@@ -522,10 +535,20 @@ struct mm_struct * mm_alloc(void)
struct mm_struct * mm;
mm = allocate_mm();
- if (mm) {
- memset(mm, 0, sizeof(*mm));
- mm = mm_init(mm, current);
+ if (!mm)
+ return NULL;
+
+ memset(mm, 0, sizeof(*mm));
+ mm = mm_init(mm, current);
+ if (!mm)
+ return NULL;
+
+ if (mm_init_cpumask(mm, NULL)) {
+ mm_free_pgd(mm);
+ free_mm(mm);
+ return NULL;
}
+
return mm;
}
@@ -537,6 +560,7 @@ struct mm_struct * mm_alloc(void)
void __mmdrop(struct mm_struct *mm)
{
BUG_ON(mm == &init_mm);
+ free_cpumask_var(mm->cpu_vm_mask_var);
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
@@ -691,6 +715,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
if (!mm_init(mm, tsk))
goto fail_nomem;
+ if (mm_init_cpumask(mm, oldmm))
+ goto fail_nocpumask;
+
if (init_new_context(tsk, mm))
goto fail_nocontext;
@@ -717,6 +744,9 @@ fail_nomem:
return NULL;
fail_nocontext:
+ free_cpumask_var(mm->cpu_vm_mask_var);
+
+fail_nocpumask:
/*
* If init_new_context() failed, we cannot use mmput() to free the mm
* because it calls destroy_context()
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index dbbbf7d43080..a9205e32a059 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -64,17 +64,20 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.clock_base =
{
{
- .index = CLOCK_REALTIME,
- .get_time = &ktime_get_real,
+ .index = HRTIMER_BASE_MONOTONIC,
+ .clockid = CLOCK_MONOTONIC,
+ .get_time = &ktime_get,
.resolution = KTIME_LOW_RES,
},
{
- .index = CLOCK_MONOTONIC,
- .get_time = &ktime_get,
+ .index = HRTIMER_BASE_REALTIME,
+ .clockid = CLOCK_REALTIME,
+ .get_time = &ktime_get_real,
.resolution = KTIME_LOW_RES,
},
{
- .index = CLOCK_BOOTTIME,
+ .index = HRTIMER_BASE_BOOTTIME,
+ .clockid = CLOCK_BOOTTIME,
.get_time = &ktime_get_boottime,
.resolution = KTIME_LOW_RES,
},
@@ -196,7 +199,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
struct hrtimer_cpu_base *new_cpu_base;
int this_cpu = smp_processor_id();
int cpu = hrtimer_get_target(this_cpu, pinned);
- int basenum = hrtimer_clockid_to_base(base->index);
+ int basenum = base->index;
again:
new_cpu_base = &per_cpu(hrtimer_bases, cpu);
@@ -621,66 +624,6 @@ static int hrtimer_reprogram(struct hrtimer *timer,
return res;
}
-
-/*
- * Retrigger next event is called after clock was set
- *
- * Called with interrupts disabled via on_each_cpu()
- */
-static void retrigger_next_event(void *arg)
-{
- struct hrtimer_cpu_base *base;
- struct timespec realtime_offset, wtm, sleep;
-
- if (!hrtimer_hres_active())
- return;
-
- get_xtime_and_monotonic_and_sleep_offset(&realtime_offset, &wtm,
- &sleep);
- set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
-
- base = &__get_cpu_var(hrtimer_bases);
-
- /* Adjust CLOCK_REALTIME offset */
- raw_spin_lock(&base->lock);
- base->clock_base[HRTIMER_BASE_REALTIME].offset =
- timespec_to_ktime(realtime_offset);
- base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
- timespec_to_ktime(sleep);
-
- hrtimer_force_reprogram(base, 0);
- raw_spin_unlock(&base->lock);
-}
-
-/*
- * Clock realtime was set
- *
- * Change the offset of the realtime clock vs. the monotonic
- * clock.
- *
- * We might have to reprogram the high resolution timer interrupt. On
- * SMP we call the architecture specific code to retrigger _all_ high
- * resolution timer interrupts. On UP we just disable interrupts and
- * call the high resolution interrupt code.
- */
-void clock_was_set(void)
-{
- /* Retrigger the CPU local events everywhere */
- on_each_cpu(retrigger_next_event, NULL, 1);
-}
-
-/*
- * During resume we might have to reprogram the high resolution timer
- * interrupt (on the local CPU):
- */
-void hres_timers_resume(void)
-{
- WARN_ONCE(!irqs_disabled(),
- KERN_INFO "hres_timers_resume() called with IRQs enabled!");
-
- retrigger_next_event(NULL);
-}
-
/*
* Initialize the high resolution related parts of cpu_base
*/
@@ -715,11 +658,39 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
}
/*
+ * Retrigger next event is called after clock was set
+ *
+ * Called with interrupts disabled via on_each_cpu()
+ */
+static void retrigger_next_event(void *arg)
+{
+ struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
+ struct timespec realtime_offset, xtim, wtm, sleep;
+
+ if (!hrtimer_hres_active())
+ return;
+
+ /* Optimized out for !HIGH_RES */
+ get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
+ set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
+
+ /* Adjust CLOCK_REALTIME offset */
+ raw_spin_lock(&base->lock);
+ base->clock_base[HRTIMER_BASE_REALTIME].offset =
+ timespec_to_ktime(realtime_offset);
+ base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
+ timespec_to_ktime(sleep);
+
+ hrtimer_force_reprogram(base, 0);
+ raw_spin_unlock(&base->lock);
+}
+
+/*
* Switch to high resolution mode
*/
static int hrtimer_switch_to_hres(void)
{
- int cpu = smp_processor_id();
+ int i, cpu = smp_processor_id();
struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
unsigned long flags;
@@ -735,9 +706,8 @@ static int hrtimer_switch_to_hres(void)
return 0;
}
base->hres_active = 1;
- base->clock_base[HRTIMER_BASE_REALTIME].resolution = KTIME_HIGH_RES;
- base->clock_base[HRTIMER_BASE_MONOTONIC].resolution = KTIME_HIGH_RES;
- base->clock_base[HRTIMER_BASE_BOOTTIME].resolution = KTIME_HIGH_RES;
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
+ base->clock_base[i].resolution = KTIME_HIGH_RES;
tick_setup_sched_timer();
@@ -761,9 +731,43 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
return 0;
}
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
+static inline void retrigger_next_event(void *arg) { }
#endif /* CONFIG_HIGH_RES_TIMERS */
+/*
+ * Clock realtime was set
+ *
+ * Change the offset of the realtime clock vs. the monotonic
+ * clock.
+ *
+ * We might have to reprogram the high resolution timer interrupt. On
+ * SMP we call the architecture specific code to retrigger _all_ high
+ * resolution timer interrupts. On UP we just disable interrupts and
+ * call the high resolution interrupt code.
+ */
+void clock_was_set(void)
+{
+#ifdef CONFIG_HIGH_RES_TIMERS
+ /* Retrigger the CPU local events everywhere */
+ on_each_cpu(retrigger_next_event, NULL, 1);
+#endif
+ timerfd_clock_was_set();
+}
+
+/*
+ * During resume we might have to reprogram the high resolution timer
+ * interrupt (on the local CPU):
+ */
+void hrtimers_resume(void)
+{
+ WARN_ONCE(!irqs_disabled(),
+ KERN_INFO "hrtimers_resume() called with IRQs enabled!");
+
+ retrigger_next_event(NULL);
+ timerfd_clock_was_set();
+}
+
static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
{
#ifdef CONFIG_TIMER_STATS
@@ -856,6 +860,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
debug_activate(timer);
timerqueue_add(&base->active, &timer->node);
+ base->cpu_base->active_bases |= 1 << base->index;
/*
* HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
@@ -897,6 +902,8 @@ static void __remove_hrtimer(struct hrtimer *timer,
#endif
}
timerqueue_del(&base->active, &timer->node);
+ if (!timerqueue_getnext(&base->active))
+ base->cpu_base->active_bases &= ~(1 << base->index);
out:
timer->state = newstate;
}
@@ -1234,7 +1241,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
void hrtimer_interrupt(struct clock_event_device *dev)
{
struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
- struct hrtimer_clock_base *base;
ktime_t expires_next, now, entry_time, delta;
int i, retries = 0;
@@ -1256,12 +1262,15 @@ retry:
*/
cpu_base->expires_next.tv64 = KTIME_MAX;
- base = cpu_base->clock_base;
-
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
- ktime_t basenow;
+ struct hrtimer_clock_base *base;
struct timerqueue_node *node;
+ ktime_t basenow;
+
+ if (!(cpu_base->active_bases & (1 << i)))
+ continue;
+ base = cpu_base->clock_base + i;
basenow = ktime_add(now, base->offset);
while ((node = timerqueue_getnext(&base->active))) {
@@ -1294,7 +1303,6 @@ retry:
__run_hrtimer(timer, &basenow);
}
- base++;
}
/*
@@ -1525,7 +1533,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
struct timespec __user *rmtp;
int ret = 0;
- hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
+ hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
HRTIMER_MODE_ABS);
hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
@@ -1577,7 +1585,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
restart = &current_thread_info()->restart_block;
restart->fn = hrtimer_nanosleep_restart;
- restart->nanosleep.index = t.timer.base->index;
+ restart->nanosleep.clockid = t.timer.base->clockid;
restart->nanosleep.rmtp = rmtp;
restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 834899f2500f..64e3df6ab1ef 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -19,7 +19,7 @@ static struct proc_dir_entry *root_irq_dir;
#ifdef CONFIG_SMP
-static int irq_affinity_proc_show(struct seq_file *m, void *v)
+static int show_irq_affinity(int type, struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long)m->private);
const struct cpumask *mask = desc->irq_data.affinity;
@@ -28,7 +28,10 @@ static int irq_affinity_proc_show(struct seq_file *m, void *v)
if (irqd_is_setaffinity_pending(&desc->irq_data))
mask = desc->pending_mask;
#endif
- seq_cpumask(m, mask);
+ if (type)
+ seq_cpumask_list(m, mask);
+ else
+ seq_cpumask(m, mask);
seq_putc(m, '\n');
return 0;
}
@@ -59,7 +62,18 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
#endif
int no_irq_affinity;
-static ssize_t irq_affinity_proc_write(struct file *file,
+static int irq_affinity_proc_show(struct seq_file *m, void *v)
+{
+ return show_irq_affinity(0, m, v);
+}
+
+static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
+{
+ return show_irq_affinity(1, m, v);
+}
+
+
+static ssize_t write_irq_affinity(int type, struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
@@ -72,7 +86,10 @@ static ssize_t irq_affinity_proc_write(struct file *file,
if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
- err = cpumask_parse_user(buffer, count, new_value);
+ if (type)
+ err = cpumask_parselist_user(buffer, count, new_value);
+ else
+ err = cpumask_parse_user(buffer, count, new_value);
if (err)
goto free_cpumask;
@@ -100,11 +117,28 @@ free_cpumask:
return err;
}
+static ssize_t irq_affinity_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *pos)
+{
+ return write_irq_affinity(0, file, buffer, count, pos);
+}
+
+static ssize_t irq_affinity_list_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *pos)
+{
+ return write_irq_affinity(1, file, buffer, count, pos);
+}
+
static int irq_affinity_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
}
+static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data);
+}
+
static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
@@ -125,6 +159,14 @@ static const struct file_operations irq_affinity_hint_proc_fops = {
.release = single_release,
};
+static const struct file_operations irq_affinity_list_proc_fops = {
+ .open = irq_affinity_list_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = irq_affinity_list_proc_write,
+};
+
static int default_affinity_show(struct seq_file *m, void *v)
{
seq_cpumask(m, irq_default_affinity);
@@ -289,6 +331,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
proc_create_data("affinity_hint", 0400, desc->dir,
&irq_affinity_hint_proc_fops, (void *)(long)irq);
+ /* create /proc/irq/<irq>/smp_affinity_list */
+ proc_create_data("smp_affinity_list", 0600, desc->dir,
+ &irq_affinity_list_proc_fops, (void *)(long)irq);
+
proc_create_data("node", 0444, desc->dir,
&irq_node_proc_fops, (void *)(long)irq);
#endif
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 5ae0ff38425f..ad6a81c58b44 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,6 +25,7 @@
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/completion.h>
+#include <linux/cred.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/workqueue.h>
@@ -43,6 +44,13 @@ extern int max_threads;
static struct workqueue_struct *khelper_wq;
+#define CAP_BSET (void *)1
+#define CAP_PI (void *)2
+
+static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
+static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
+static DEFINE_SPINLOCK(umh_sysctl_lock);
+
#ifdef CONFIG_MODULES
/*
@@ -132,6 +140,7 @@ EXPORT_SYMBOL(__request_module);
static int ____call_usermodehelper(void *data)
{
struct subprocess_info *sub_info = data;
+ struct cred *new;
int retval;
spin_lock_irq(&current->sighand->siglock);
@@ -153,6 +162,19 @@ static int ____call_usermodehelper(void *data)
goto fail;
}
+ retval = -ENOMEM;
+ new = prepare_kernel_cred(current);
+ if (!new)
+ goto fail;
+
+ spin_lock(&umh_sysctl_lock);
+ new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
+ new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
+ new->cap_inheritable);
+ spin_unlock(&umh_sysctl_lock);
+
+ commit_creds(new);
+
retval = kernel_execve(sub_info->path,
(const char *const *)sub_info->argv,
(const char *const *)sub_info->envp);
@@ -420,6 +442,84 @@ unlock:
}
EXPORT_SYMBOL(call_usermodehelper_exec);
+static int proc_cap_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
+ kernel_cap_t new_cap;
+ int err, i;
+
+ if (write && (!capable(CAP_SETPCAP) ||
+ !capable(CAP_SYS_MODULE)))
+ return -EPERM;
+
+ /*
+ * convert from the global kernel_cap_t to the ulong array to print to
+ * userspace if this is a read.
+ */
+ spin_lock(&umh_sysctl_lock);
+ for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
+ if (table->data == CAP_BSET)
+ cap_array[i] = usermodehelper_bset.cap[i];
+ else if (table->data == CAP_PI)
+ cap_array[i] = usermodehelper_inheritable.cap[i];
+ else
+ BUG();
+ }
+ spin_unlock(&umh_sysctl_lock);
+
+ t = *table;
+ t.data = &cap_array;
+
+ /*
+ * actually read or write and array of ulongs from userspace. Remember
+ * these are least significant 32 bits first
+ */
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ /*
+ * convert from the sysctl array of ulongs to the kernel_cap_t
+ * internal representation
+ */
+ for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
+ new_cap.cap[i] = cap_array[i];
+
+ /*
+ * Drop everything not in the new_cap (but don't add things)
+ */
+ spin_lock(&umh_sysctl_lock);
+ if (write) {
+ if (table->data == CAP_BSET)
+ usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
+ if (table->data == CAP_PI)
+ usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+ }
+ spin_unlock(&umh_sysctl_lock);
+
+ return 0;
+}
+
+struct ctl_table usermodehelper_table[] = {
+ {
+ .procname = "bset",
+ .data = CAP_BSET,
+ .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .mode = 0600,
+ .proc_handler = proc_cap_handler,
+ },
+ {
+ .procname = "inheritable",
+ .data = CAP_PI,
+ .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .mode = 0600,
+ .proc_handler = proc_cap_handler,
+ },
+ { }
+};
+
void __init usermodehelper_init(void)
{
khelper_wq = create_singlethread_workqueue("khelper");
diff --git a/kernel/module.c b/kernel/module.c
index 22879725678d..795bdc7f5c3f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2812,7 +2812,7 @@ static struct module *load_module(void __user *umod,
}
/* This has to be done once we're sure module name is unique. */
- if (!mod->taints)
+ if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
dynamic_debug_setup(info.debug, info.num_debug);
/* Find duplicate symbols */
@@ -2849,7 +2849,7 @@ static struct module *load_module(void __user *umod,
module_bug_cleanup(mod);
ddebug:
- if (!mod->taints)
+ if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
dynamic_debug_remove(info.debug);
unlock:
mutex_unlock(&module_mutex);
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 2c938e2337cd..d607ed5dd441 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -131,14 +131,14 @@ EXPORT_SYMBOL(mutex_unlock);
*/
static inline int __sched
__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
- unsigned long ip)
+ struct lockdep_map *nest_lock, unsigned long ip)
{
struct task_struct *task = current;
struct mutex_waiter waiter;
unsigned long flags;
preempt_disable();
- mutex_acquire(&lock->dep_map, subclass, 0, ip);
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
@@ -269,16 +269,25 @@ void __sched
mutex_lock_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_);
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_nested);
+void __sched
+_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
+{
+ might_sleep();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
+}
+
+EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
+
int __sched
mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
- return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_);
+ return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
@@ -287,7 +296,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
- subclass, _RET_IP_);
+ subclass, NULL, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -393,7 +402,7 @@ __mutex_lock_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_);
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
@@ -401,7 +410,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
- return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_);
+ return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
@@ -409,7 +418,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
- return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_);
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
}
#endif
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index a05d191ffdd9..5424e37673ed 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,6 +22,9 @@
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
+#include <linux/proc_fs.h>
+#include <linux/file.h>
+#include <linux/syscalls.h>
static struct kmem_cache *nsproxy_cachep;
@@ -233,6 +236,45 @@ void exit_task_namespaces(struct task_struct *p)
switch_task_namespaces(p, NULL);
}
+SYSCALL_DEFINE2(setns, int, fd, int, nstype)
+{
+ const struct proc_ns_operations *ops;
+ struct task_struct *tsk = current;
+ struct nsproxy *new_nsproxy;
+ struct proc_inode *ei;
+ struct file *file;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ file = proc_ns_fget(fd);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ err = -EINVAL;
+ ei = PROC_I(file->f_dentry->d_inode);
+ ops = ei->ns_ops;
+ if (nstype && (ops->type != nstype))
+ goto out;
+
+ new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
+ if (IS_ERR(new_nsproxy)) {
+ err = PTR_ERR(new_nsproxy);
+ goto out;
+ }
+
+ err = ops->install(new_nsproxy, ei->ns);
+ if (err) {
+ free_nsproxy(new_nsproxy);
+ goto out;
+ }
+ switch_task_namespaces(tsk, new_nsproxy);
+out:
+ fput(file);
+ return err;
+}
+
static int __init nsproxy_cache_init(void)
{
nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 0da058bff8eb..beb184689af9 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -385,7 +385,7 @@ static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
s32 value;
unsigned long flags;
struct pm_qos_object *o;
- struct pm_qos_request_list *pm_qos_req = filp->private_data;;
+ struct pm_qos_request_list *pm_qos_req = filp->private_data;
if (!pm_qos_req)
return -EINVAL;
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 0791b13df7bf..58f405b581e7 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1514,7 +1514,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
return -EFAULT;
restart_block->fn = posix_cpu_nsleep_restart;
- restart_block->nanosleep.index = which_clock;
+ restart_block->nanosleep.clockid = which_clock;
restart_block->nanosleep.rmtp = rmtp;
restart_block->nanosleep.expires = timespec_to_ns(rqtp);
}
@@ -1523,7 +1523,7 @@ static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
{
- clockid_t which_clock = restart_block->nanosleep.index;
+ clockid_t which_clock = restart_block->nanosleep.clockid;
struct timespec t;
struct itimerspec it;
int error;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index e5498d7405c3..4556182527f3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -491,6 +491,13 @@ static struct k_itimer * alloc_posix_timer(void)
return tmr;
}
+static void k_itimer_rcu_free(struct rcu_head *head)
+{
+ struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
+
+ kmem_cache_free(posix_timers_cache, tmr);
+}
+
#define IT_ID_SET 1
#define IT_ID_NOT_SET 0
static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
@@ -503,7 +510,7 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
}
put_pid(tmr->it_pid);
sigqueue_free(tmr->sigq);
- kmem_cache_free(posix_timers_cache, tmr);
+ call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
}
static struct k_clock *clockid_to_kclock(const clockid_t id)
@@ -631,22 +638,18 @@ out:
static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
{
struct k_itimer *timr;
- /*
- * Watch out here. We do a irqsave on the idr_lock and pass the
- * flags part over to the timer lock. Must not let interrupts in
- * while we are moving the lock.
- */
- spin_lock_irqsave(&idr_lock, *flags);
+
+ rcu_read_lock();
timr = idr_find(&posix_timers_id, (int)timer_id);
if (timr) {
- spin_lock(&timr->it_lock);
+ spin_lock_irqsave(&timr->it_lock, *flags);
if (timr->it_signal == current->signal) {
- spin_unlock(&idr_lock);
+ rcu_read_unlock();
return timr;
}
- spin_unlock(&timr->it_lock);
+ spin_unlock_irqrestore(&timr->it_lock, *flags);
}
- spin_unlock_irqrestore(&idr_lock, *flags);
+ rcu_read_unlock();
return NULL;
}
@@ -1056,7 +1059,7 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
*/
long clock_nanosleep_restart(struct restart_block *restart_block)
{
- clockid_t which_clock = restart_block->nanosleep.index;
+ clockid_t which_clock = restart_block->nanosleep.clockid;
struct k_clock *kc = clockid_to_kclock(which_clock);
if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
diff --git a/kernel/printk.c b/kernel/printk.c
index da8ca817eae3..35185392173f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -31,6 +31,7 @@
#include <linux/smp.h>
#include <linux/security.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/syscalls.h>
#include <linux/kexec.h>
#include <linux/kdb.h>
@@ -167,46 +168,74 @@ void log_buf_kexec_setup(void)
}
#endif
+/* requested log_buf_len from kernel cmdline */
+static unsigned long __initdata new_log_buf_len;
+
+/* save requested log_buf_len since it's too early to process it */
static int __init log_buf_len_setup(char *str)
{
unsigned size = memparse(str, &str);
- unsigned long flags;
if (size)
size = roundup_pow_of_two(size);
- if (size > log_buf_len) {
- unsigned start, dest_idx, offset;
- char *new_log_buf;
+ if (size > log_buf_len)
+ new_log_buf_len = size;
- new_log_buf = alloc_bootmem(size);
- if (!new_log_buf) {
- printk(KERN_WARNING "log_buf_len: allocation failed\n");
- goto out;
- }
+ return 0;
+}
+early_param("log_buf_len", log_buf_len_setup);
- spin_lock_irqsave(&logbuf_lock, flags);
- log_buf_len = size;
- log_buf = new_log_buf;
-
- offset = start = min(con_start, log_start);
- dest_idx = 0;
- while (start != log_end) {
- log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)];
- start++;
- dest_idx++;
- }
- log_start -= offset;
- con_start -= offset;
- log_end -= offset;
- spin_unlock_irqrestore(&logbuf_lock, flags);
+void __init setup_log_buf(int early)
+{
+ unsigned long flags;
+ unsigned start, dest_idx, offset;
+ char *new_log_buf;
+ int free;
+
+ if (!new_log_buf_len)
+ return;
+
+ if (early) {
+ unsigned long mem;
- printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len);
+ mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
+ if (mem == MEMBLOCK_ERROR)
+ return;
+ new_log_buf = __va(mem);
+ } else {
+ new_log_buf = alloc_bootmem_nopanic(new_log_buf_len);
}
-out:
- return 1;
-}
-__setup("log_buf_len=", log_buf_len_setup);
+ if (unlikely(!new_log_buf)) {
+ pr_err("log_buf_len: %ld bytes not available\n",
+ new_log_buf_len);
+ return;
+ }
+
+ spin_lock_irqsave(&logbuf_lock, flags);
+ log_buf_len = new_log_buf_len;
+ log_buf = new_log_buf;
+ new_log_buf_len = 0;
+ free = __LOG_BUF_LEN - log_end;
+
+ offset = start = min(con_start, log_start);
+ dest_idx = 0;
+ while (start != log_end) {
+ unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
+
+ log_buf[dest_idx] = __log_buf[log_idx_mask];
+ start++;
+ dest_idx++;
+ }
+ log_start -= offset;
+ con_start -= offset;
+ log_end -= offset;
+ spin_unlock_irqrestore(&logbuf_lock, flags);
+
+ pr_info("log_buf_len: %d\n", log_buf_len);
+ pr_info("early log buf free: %d(%d%%)\n",
+ free, (free * 100) / __LOG_BUF_LEN);
+}
#ifdef CONFIG_BOOT_PRINTK_DELAY
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index dc7ab65f3b36..2df115790cd9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -38,35 +38,33 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
child->parent = new_parent;
}
-/*
- * Turn a tracing stop into a normal stop now, since with no tracer there
- * would be no way to wake it up with SIGCONT or SIGKILL. If there was a
- * signal sent that would resume the child, but didn't because it was in
- * TASK_TRACED, resume it now.
- * Requires that irqs be disabled.
- */
-static void ptrace_untrace(struct task_struct *child)
-{
- spin_lock(&child->sighand->siglock);
- if (task_is_traced(child)) {
- /*
- * If the group stop is completed or in progress,
- * this thread was already counted as stopped.
- */
- if (child->signal->flags & SIGNAL_STOP_STOPPED ||
- child->signal->group_stop_count)
- __set_task_state(child, TASK_STOPPED);
- else
- signal_wake_up(child, 1);
- }
- spin_unlock(&child->sighand->siglock);
-}
-
-/*
- * unptrace a task: move it back to its original parent and
- * remove it from the ptrace list.
+/**
+ * __ptrace_unlink - unlink ptracee and restore its execution state
+ * @child: ptracee to be unlinked
*
- * Must be called with the tasklist lock write-held.
+ * Remove @child from the ptrace list, move it back to the original parent,
+ * and restore the execution state so that it conforms to the group stop
+ * state.
+ *
+ * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer
+ * exiting. For PTRACE_DETACH, unless the ptracee has been killed between
+ * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED.
+ * If the ptracer is exiting, the ptracee can be in any state.
+ *
+ * After detach, the ptracee should be in a state which conforms to the
+ * group stop. If the group is stopped or in the process of stopping, the
+ * ptracee should be put into TASK_STOPPED; otherwise, it should be woken
+ * up from TASK_TRACED.
+ *
+ * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED,
+ * it goes through TRACED -> RUNNING -> STOPPED transition which is similar
+ * to but in the opposite direction of what happens while attaching to a
+ * stopped task. However, in this direction, the intermediate RUNNING
+ * state is not hidden even from the current ptracer and if it immediately
+ * re-attaches and performs a WNOHANG wait(2), it may fail.
+ *
+ * CONTEXT:
+ * write_lock_irq(tasklist_lock)
*/
void __ptrace_unlink(struct task_struct *child)
{
@@ -76,8 +74,27 @@ void __ptrace_unlink(struct task_struct *child)
child->parent = child->real_parent;
list_del_init(&child->ptrace_entry);
- if (task_is_traced(child))
- ptrace_untrace(child);
+ spin_lock(&child->sighand->siglock);
+
+ /*
+ * Reinstate GROUP_STOP_PENDING if group stop is in effect and
+ * @child isn't dead.
+ */
+ if (!(child->flags & PF_EXITING) &&
+ (child->signal->flags & SIGNAL_STOP_STOPPED ||
+ child->signal->group_stop_count))
+ child->group_stop |= GROUP_STOP_PENDING;
+
+ /*
+ * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
+ * @child in the butt. Note that @resume should be used iff @child
+ * is in TASK_TRACED; otherwise, we might unduly disrupt
+ * TASK_KILLABLE sleeps.
+ */
+ if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child))
+ signal_wake_up(child, task_is_traced(child));
+
+ spin_unlock(&child->sighand->siglock);
}
/*
@@ -96,16 +113,14 @@ int ptrace_check_attach(struct task_struct *child, int kill)
*/
read_lock(&tasklist_lock);
if ((child->ptrace & PT_PTRACED) && child->parent == current) {
- ret = 0;
/*
* child->sighand can't be NULL, release_task()
* does ptrace_unlink() before __exit_signal().
*/
spin_lock_irq(&child->sighand->siglock);
- if (task_is_stopped(child))
- child->state = TASK_TRACED;
- else if (!task_is_traced(child) && !kill)
- ret = -ESRCH;
+ WARN_ON_ONCE(task_is_stopped(child));
+ if (task_is_traced(child) || kill)
+ ret = 0;
spin_unlock_irq(&child->sighand->siglock);
}
read_unlock(&tasklist_lock);
@@ -169,6 +184,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
static int ptrace_attach(struct task_struct *task)
{
+ bool wait_trap = false;
int retval;
audit_ptrace(task);
@@ -208,12 +224,42 @@ static int ptrace_attach(struct task_struct *task)
__ptrace_link(task, current);
send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
+ spin_lock(&task->sighand->siglock);
+
+ /*
+ * If the task is already STOPPED, set GROUP_STOP_PENDING and
+ * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
+ * will be cleared if the child completes the transition or any
+ * event which clears the group stop states happens. We'll wait
+ * for the transition to complete before returning from this
+ * function.
+ *
+ * This hides STOPPED -> RUNNING -> TRACED transition from the
+ * attaching thread but a different thread in the same group can
+ * still observe the transient RUNNING state. IOW, if another
+ * thread's WNOHANG wait(2) on the stopped tracee races against
+ * ATTACH, the wait(2) may fail due to the transient RUNNING.
+ *
+ * The following task_is_stopped() test is safe as both transitions
+ * in and out of STOPPED are protected by siglock.
+ */
+ if (task_is_stopped(task)) {
+ task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING;
+ signal_wake_up(task, 1);
+ wait_trap = true;
+ }
+
+ spin_unlock(&task->sighand->siglock);
+
retval = 0;
unlock_tasklist:
write_unlock_irq(&tasklist_lock);
unlock_creds:
mutex_unlock(&task->signal->cred_guard_mutex);
out:
+ if (wait_trap)
+ wait_event(current->signal->wait_chldexit,
+ !(task->group_stop & GROUP_STOP_TRAPPING));
return retval;
}
@@ -316,8 +362,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
if (child->ptrace) {
child->exit_code = data;
dead = __ptrace_detach(current, child);
- if (!child->exit_state)
- wake_up_state(child, TASK_TRACED | TASK_STOPPED);
}
write_unlock_irq(&tasklist_lock);
@@ -518,7 +562,7 @@ static int ptrace_resume(struct task_struct *child, long request,
}
child->exit_code = data;
- wake_up_process(child);
+ wake_up_state(child, __TASK_TRACED);
return 0;
}
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 421abfd3641d..7bbac7d0f5ab 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -35,6 +35,7 @@
#include <linux/init.h>
#include <linux/time.h>
#include <linux/cpu.h>
+#include <linux/prefetch.h>
/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
static struct task_struct *rcu_kthread_task;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e486f7c3ffb8..f07d2f03181a 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -49,6 +49,7 @@
#include <linux/kernel_stat.h>
#include <linux/wait.h>
#include <linux/kthread.h>
+#include <linux/prefetch.h>
#include "rcutree.h"
diff --git a/kernel/sched.c b/kernel/sched.c
index c62acf45d3b9..2d12893b8b0f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -293,7 +293,7 @@ static DEFINE_SPINLOCK(task_group_lock);
* limitation from this.)
*/
#define MIN_SHARES 2
-#define MAX_SHARES (1UL << 18)
+#define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION))
static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
#endif
@@ -1330,13 +1330,25 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
{
u64 tmp;
- tmp = (u64)delta_exec * weight;
+ /*
+ * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
+ * entities since MIN_SHARES = 2. Treat weight as 1 if less than
+ * 2^SCHED_LOAD_RESOLUTION.
+ */
+ if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
+ tmp = (u64)delta_exec * scale_load_down(weight);
+ else
+ tmp = (u64)delta_exec;
if (!lw->inv_weight) {
- if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
+ unsigned long w = scale_load_down(lw->weight);
+
+ if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
lw->inv_weight = 1;
+ else if (unlikely(!w))
+ lw->inv_weight = WMULT_CONST;
else
- lw->inv_weight = WMULT_CONST / lw->weight;
+ lw->inv_weight = WMULT_CONST / w;
}
/*
@@ -1778,17 +1790,20 @@ static void dec_nr_running(struct rq *rq)
static void set_load_weight(struct task_struct *p)
{
+ int prio = p->static_prio - MAX_RT_PRIO;
+ struct load_weight *load = &p->se.load;
+
/*
* SCHED_IDLE tasks get minimal weight:
*/
if (p->policy == SCHED_IDLE) {
- p->se.load.weight = WEIGHT_IDLEPRIO;
- p->se.load.inv_weight = WMULT_IDLEPRIO;
+ load->weight = scale_load(WEIGHT_IDLEPRIO);
+ load->inv_weight = WMULT_IDLEPRIO;
return;
}
- p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
- p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
+ load->weight = scale_load(prio_to_weight[prio]);
+ load->inv_weight = prio_to_wmult[prio];
}
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -2564,7 +2579,7 @@ static void ttwu_queue(struct task_struct *p, int cpu)
{
struct rq *rq = cpu_rq(cpu);
-#if defined(CONFIG_SMP) && defined(CONFIG_SCHED_TTWU_QUEUE)
+#if defined(CONFIG_SMP)
if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
ttwu_queue_remote(p, cpu);
return;
@@ -6527,7 +6542,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
- if (group->cpu_power != SCHED_LOAD_SCALE) {
+ if (group->cpu_power != SCHED_POWER_SCALE) {
printk(KERN_CONT " (cpu_power = %d)",
group->cpu_power);
}
@@ -7902,7 +7917,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_power = SCHED_LOAD_SCALE;
+ rq->cpu_power = SCHED_POWER_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
@@ -8806,14 +8821,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
u64 shareval)
{
- return sched_group_set_shares(cgroup_tg(cgrp), shareval);
+ return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
}
static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
struct task_group *tg = cgroup_tg(cgrp);
- return (u64) tg->shares;
+ return (u64) scale_load_down(tg->shares);
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 37f22626225e..e32a9b70ee9c 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1584,7 +1584,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
}
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power;
if (local_group) {
this_load = avg_load;
@@ -1722,7 +1722,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
nr_running += cpu_rq(i)->cfs.nr_running;
}
- capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+ capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
if (tmp->flags & SD_POWERSAVINGS_BALANCE)
nr_running /= 2;
@@ -2570,7 +2570,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
{
- return SCHED_LOAD_SCALE;
+ return SCHED_POWER_SCALE;
}
unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
@@ -2607,10 +2607,10 @@ unsigned long scale_rt_power(int cpu)
available = total - rq->rt_avg;
}
- if (unlikely((s64)total < SCHED_LOAD_SCALE))
- total = SCHED_LOAD_SCALE;
+ if (unlikely((s64)total < SCHED_POWER_SCALE))
+ total = SCHED_POWER_SCALE;
- total >>= SCHED_LOAD_SHIFT;
+ total >>= SCHED_POWER_SHIFT;
return div_u64(available, total);
}
@@ -2618,7 +2618,7 @@ unsigned long scale_rt_power(int cpu)
static void update_cpu_power(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
- unsigned long power = SCHED_LOAD_SCALE;
+ unsigned long power = SCHED_POWER_SCALE;
struct sched_group *sdg = sd->groups;
if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
@@ -2627,7 +2627,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
else
power *= default_scale_smt_power(sd, cpu);
- power >>= SCHED_LOAD_SHIFT;
+ power >>= SCHED_POWER_SHIFT;
}
sdg->cpu_power_orig = power;
@@ -2637,10 +2637,10 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
else
power *= default_scale_freq_power(sd, cpu);
- power >>= SCHED_LOAD_SHIFT;
+ power >>= SCHED_POWER_SHIFT;
power *= scale_rt_power(cpu);
- power >>= SCHED_LOAD_SHIFT;
+ power >>= SCHED_POWER_SHIFT;
if (!power)
power = 1;
@@ -2682,7 +2682,7 @@ static inline int
fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
{
/*
- * Only siblings can have significantly less than SCHED_LOAD_SCALE
+ * Only siblings can have significantly less than SCHED_POWER_SCALE
*/
if (!(sd->flags & SD_SHARE_CPUPOWER))
return 0;
@@ -2770,7 +2770,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
}
/* Adjust by relative CPU power of the group */
- sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power;
/*
* Consider the group unbalanced when the imbalance is larger
@@ -2787,7 +2787,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
sgs->group_imb = 1;
- sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+ sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power,
+ SCHED_POWER_SCALE);
if (!sgs->group_capacity)
sgs->group_capacity = fix_small_capacity(sd, group);
sgs->group_weight = group->group_weight;
@@ -2961,7 +2962,7 @@ static int check_asym_packing(struct sched_domain *sd,
return 0;
*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
- SCHED_LOAD_SCALE);
+ SCHED_POWER_SCALE);
return 1;
}
@@ -2990,7 +2991,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
cpu_avg_load_per_task(this_cpu);
scaled_busy_load_per_task = sds->busiest_load_per_task
- * SCHED_LOAD_SCALE;
+ * SCHED_POWER_SCALE;
scaled_busy_load_per_task /= sds->busiest->cpu_power;
if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
@@ -3009,10 +3010,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
min(sds->busiest_load_per_task, sds->max_load);
pwr_now += sds->this->cpu_power *
min(sds->this_load_per_task, sds->this_load);
- pwr_now /= SCHED_LOAD_SCALE;
+ pwr_now /= SCHED_POWER_SCALE;
/* Amount of load we'd subtract */
- tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+ tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
sds->busiest->cpu_power;
if (sds->max_load > tmp)
pwr_move += sds->busiest->cpu_power *
@@ -3020,15 +3021,15 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
/* Amount of load we'd add */
if (sds->max_load * sds->busiest->cpu_power <
- sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+ sds->busiest_load_per_task * SCHED_POWER_SCALE)
tmp = (sds->max_load * sds->busiest->cpu_power) /
sds->this->cpu_power;
else
- tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+ tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
sds->this->cpu_power;
pwr_move += sds->this->cpu_power *
min(sds->this_load_per_task, sds->this_load + tmp);
- pwr_move /= SCHED_LOAD_SCALE;
+ pwr_move /= SCHED_POWER_SCALE;
/* Move if we gain throughput */
if (pwr_move > pwr_now)
@@ -3070,7 +3071,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
load_above_capacity = (sds->busiest_nr_running -
sds->busiest_group_capacity);
- load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
+ load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
load_above_capacity /= sds->busiest->cpu_power;
}
@@ -3090,7 +3091,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
/* How much load to actually move to equalise the imbalance */
*imbalance = min(max_pull * sds->busiest->cpu_power,
(sds->avg_load - sds->this_load) * sds->this->cpu_power)
- / SCHED_LOAD_SCALE;
+ / SCHED_POWER_SCALE;
/*
* if *imbalance is less than the average load per runnable task
@@ -3159,7 +3160,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (!sds.busiest || sds.busiest_nr_running == 0)
goto out_balanced;
- sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
+ sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
/*
* If the busiest group is imbalanced the below checks don't
@@ -3238,7 +3239,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
for_each_cpu(i, sched_group_cpus(group)) {
unsigned long power = power_of(i);
- unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+ unsigned long capacity = DIV_ROUND_CLOSEST(power,
+ SCHED_POWER_SCALE);
unsigned long wl;
if (!capacity)
@@ -3263,7 +3265,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
* the load can be moved away from the cpu that is potentially
* running at a lower capacity.
*/
- wl = (wl * SCHED_LOAD_SCALE) / power;
+ wl = (wl * SCHED_POWER_SCALE) / power;
if (wl > max_load) {
max_load = wl;
diff --git a/kernel/signal.c b/kernel/signal.c
index 7165af5f1b11..86c32b884f8e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
static int recalc_sigpending_tsk(struct task_struct *t)
{
- if (t->signal->group_stop_count > 0 ||
+ if ((t->group_stop & GROUP_STOP_PENDING) ||
PENDING(&t->pending, &t->blocked) ||
PENDING(&t->signal->shared_pending, &t->blocked)) {
set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -223,6 +223,83 @@ static inline void print_dropped_signal(int sig)
current->comm, current->pid, sig);
}
+/**
+ * task_clear_group_stop_trapping - clear group stop trapping bit
+ * @task: target task
+ *
+ * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it
+ * and wake up the ptracer. Note that we don't need any further locking.
+ * @task->siglock guarantees that @task->parent points to the ptracer.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ */
+static void task_clear_group_stop_trapping(struct task_struct *task)
+{
+ if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) {
+ task->group_stop &= ~GROUP_STOP_TRAPPING;
+ __wake_up_sync_key(&task->parent->signal->wait_chldexit,
+ TASK_UNINTERRUPTIBLE, 1, task);
+ }
+}
+
+/**
+ * task_clear_group_stop_pending - clear pending group stop
+ * @task: target task
+ *
+ * Clear group stop states for @task.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ */
+void task_clear_group_stop_pending(struct task_struct *task)
+{
+ task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME |
+ GROUP_STOP_DEQUEUED);
+}
+
+/**
+ * task_participate_group_stop - participate in a group stop
+ * @task: task participating in a group stop
+ *
+ * @task has GROUP_STOP_PENDING set and is participating in a group stop.
+ * Group stop states are cleared and the group stop count is consumed if
+ * %GROUP_STOP_CONSUME was set. If the consumption completes the group
+ * stop, the appropriate %SIGNAL_* flags are set.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ *
+ * RETURNS:
+ * %true if group stop completion should be notified to the parent, %false
+ * otherwise.
+ */
+static bool task_participate_group_stop(struct task_struct *task)
+{
+ struct signal_struct *sig = task->signal;
+ bool consume = task->group_stop & GROUP_STOP_CONSUME;
+
+ WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING));
+
+ task_clear_group_stop_pending(task);
+
+ if (!consume)
+ return false;
+
+ if (!WARN_ON_ONCE(sig->group_stop_count == 0))
+ sig->group_stop_count--;
+
+ /*
+ * Tell the caller to notify completion iff we are entering into a
+ * fresh group stop. Read comment in do_signal_stop() for details.
+ */
+ if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) {
+ sig->flags = SIGNAL_STOP_STOPPED;
+ return true;
+ }
+ return false;
+}
+
/*
* allocate a new signal queue record
* - this may be called without locks if and only if t == current, otherwise an
@@ -527,7 +604,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
* is to alert stop-signal processing code when another
* processor has come along and cleared the flag.
*/
- tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
+ current->group_stop |= GROUP_STOP_DEQUEUED;
}
if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
/*
@@ -592,7 +669,7 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
if (sigisemptyset(&m))
return 0;
- signandsets(&s->signal, &s->signal, mask);
+ sigandnsets(&s->signal, &s->signal, mask);
list_for_each_entry_safe(q, n, &s->list, list) {
if (sigismember(mask, q->info.si_signo)) {
list_del_init(&q->list);
@@ -727,34 +804,14 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
} else if (sig == SIGCONT) {
unsigned int why;
/*
- * Remove all stop signals from all queues,
- * and wake all threads.
+ * Remove all stop signals from all queues, wake all threads.
*/
rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
t = p;
do {
- unsigned int state;
+ task_clear_group_stop_pending(t);
rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
- /*
- * If there is a handler for SIGCONT, we must make
- * sure that no thread returns to user mode before
- * we post the signal, in case it was the only
- * thread eligible to run the signal handler--then
- * it must not do anything between resuming and
- * running the handler. With the TIF_SIGPENDING
- * flag set, the thread will pause and acquire the
- * siglock that we hold now and until we've queued
- * the pending signal.
- *
- * Wake up the stopped thread _after_ setting
- * TIF_SIGPENDING
- */
- state = __TASK_STOPPED;
- if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) {
- set_tsk_thread_flag(t, TIF_SIGPENDING);
- state |= TASK_INTERRUPTIBLE;
- }
- wake_up_state(t, state);
+ wake_up_state(t, __TASK_STOPPED);
} while_each_thread(p, t);
/*
@@ -780,13 +837,6 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
signal->flags = why | SIGNAL_STOP_CONTINUED;
signal->group_stop_count = 0;
signal->group_exit_code = 0;
- } else {
- /*
- * We are not stopped, but there could be a stop
- * signal in the middle of being processed after
- * being removed from the queue. Clear that too.
- */
- signal->flags &= ~SIGNAL_STOP_DEQUEUED;
}
}
@@ -875,6 +925,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
signal->group_stop_count = 0;
t = p;
do {
+ task_clear_group_stop_pending(t);
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
} while_each_thread(p, t);
@@ -1109,6 +1160,7 @@ int zap_other_threads(struct task_struct *p)
p->signal->group_stop_count = 0;
while_each_thread(p, t) {
+ task_clear_group_stop_pending(t);
count++;
/* Don't bother with already dead threads */
@@ -1536,16 +1588,30 @@ int do_notify_parent(struct task_struct *tsk, int sig)
return ret;
}
-static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
+/**
+ * do_notify_parent_cldstop - notify parent of stopped/continued state change
+ * @tsk: task reporting the state change
+ * @for_ptracer: the notification is for ptracer
+ * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report
+ *
+ * Notify @tsk's parent that the stopped/continued state has changed. If
+ * @for_ptracer is %false, @tsk's group leader notifies to its real parent.
+ * If %true, @tsk reports to @tsk->parent which should be the ptracer.
+ *
+ * CONTEXT:
+ * Must be called with tasklist_lock at least read locked.
+ */
+static void do_notify_parent_cldstop(struct task_struct *tsk,
+ bool for_ptracer, int why)
{
struct siginfo info;
unsigned long flags;
struct task_struct *parent;
struct sighand_struct *sighand;
- if (task_ptrace(tsk))
+ if (for_ptracer) {
parent = tsk->parent;
- else {
+ } else {
tsk = tsk->group_leader;
parent = tsk->real_parent;
}
@@ -1621,6 +1687,15 @@ static int sigkill_pending(struct task_struct *tsk)
}
/*
+ * Test whether the target task of the usual cldstop notification - the
+ * real_parent of @child - is in the same group as the ptracer.
+ */
+static bool real_parent_is_ptracer(struct task_struct *child)
+{
+ return same_thread_group(child->parent, child->real_parent);
+}
+
+/*
* This must be called with current->sighand->siglock held.
*
* This should be the path for all ptrace stops.
@@ -1631,10 +1706,12 @@ static int sigkill_pending(struct task_struct *tsk)
* If we actually decide not to stop at all because the tracer
* is gone, we keep current->exit_code unless clear_code.
*/
-static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
+static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
__releases(&current->sighand->siglock)
__acquires(&current->sighand->siglock)
{
+ bool gstop_done = false;
+
if (arch_ptrace_stop_needed(exit_code, info)) {
/*
* The arch code has something special to do before a
@@ -1655,21 +1732,49 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
}
/*
- * If there is a group stop in progress,
- * we must participate in the bookkeeping.
+ * If @why is CLD_STOPPED, we're trapping to participate in a group
+ * stop. Do the bookkeeping. Note that if SIGCONT was delievered
+ * while siglock was released for the arch hook, PENDING could be
+ * clear now. We act as if SIGCONT is received after TASK_TRACED
+ * is entered - ignore it.
*/
- if (current->signal->group_stop_count > 0)
- --current->signal->group_stop_count;
+ if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING))
+ gstop_done = task_participate_group_stop(current);
current->last_siginfo = info;
current->exit_code = exit_code;
- /* Let the debugger run. */
- __set_current_state(TASK_TRACED);
+ /*
+ * TRACED should be visible before TRAPPING is cleared; otherwise,
+ * the tracer might fail do_wait().
+ */
+ set_current_state(TASK_TRACED);
+
+ /*
+ * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and
+ * transition to TASK_TRACED should be atomic with respect to
+ * siglock. This hsould be done after the arch hook as siglock is
+ * released and regrabbed across it.
+ */
+ task_clear_group_stop_trapping(current);
+
spin_unlock_irq(&current->sighand->siglock);
read_lock(&tasklist_lock);
if (may_ptrace_stop()) {
- do_notify_parent_cldstop(current, CLD_TRAPPED);
+ /*
+ * Notify parents of the stop.
+ *
+ * While ptraced, there are two parents - the ptracer and
+ * the real_parent of the group_leader. The ptracer should
+ * know about every stop while the real parent is only
+ * interested in the completion of group stop. The states
+ * for the two don't interact with each other. Notify
+ * separately unless they're gonna be duplicates.
+ */
+ do_notify_parent_cldstop(current, true, why);
+ if (gstop_done && !real_parent_is_ptracer(current))
+ do_notify_parent_cldstop(current, false, why);
+
/*
* Don't want to allow preemption here, because
* sys_ptrace() needs this task to be inactive.
@@ -1684,7 +1789,16 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
/*
* By the time we got the lock, our tracer went away.
* Don't drop the lock yet, another tracer may come.
+ *
+ * If @gstop_done, the ptracer went away between group stop
+ * completion and here. During detach, it would have set
+ * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED
+ * in do_signal_stop() on return, so notifying the real
+ * parent of the group stop completion is enough.
*/
+ if (gstop_done)
+ do_notify_parent_cldstop(current, false, why);
+
__set_current_state(TASK_RUNNING);
if (clear_code)
current->exit_code = 0;
@@ -1728,7 +1842,7 @@ void ptrace_notify(int exit_code)
/* Let the debugger run. */
spin_lock_irq(&current->sighand->siglock);
- ptrace_stop(exit_code, 1, &info);
+ ptrace_stop(exit_code, CLD_TRAPPED, 1, &info);
spin_unlock_irq(&current->sighand->siglock);
}
@@ -1741,66 +1855,115 @@ void ptrace_notify(int exit_code)
static int do_signal_stop(int signr)
{
struct signal_struct *sig = current->signal;
- int notify;
- if (!sig->group_stop_count) {
+ if (!(current->group_stop & GROUP_STOP_PENDING)) {
+ unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME;
struct task_struct *t;
- if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
+ /* signr will be recorded in task->group_stop for retries */
+ WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK);
+
+ if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) ||
unlikely(signal_group_exit(sig)))
return 0;
/*
- * There is no group stop already in progress.
- * We must initiate one now.
+ * There is no group stop already in progress. We must
+ * initiate one now.
+ *
+ * While ptraced, a task may be resumed while group stop is
+ * still in effect and then receive a stop signal and
+ * initiate another group stop. This deviates from the
+ * usual behavior as two consecutive stop signals can't
+ * cause two group stops when !ptraced. That is why we
+ * also check !task_is_stopped(t) below.
+ *
+ * The condition can be distinguished by testing whether
+ * SIGNAL_STOP_STOPPED is already set. Don't generate
+ * group_exit_code in such case.
+ *
+ * This is not necessary for SIGNAL_STOP_CONTINUED because
+ * an intervening stop signal is required to cause two
+ * continued events regardless of ptrace.
*/
- sig->group_exit_code = signr;
+ if (!(sig->flags & SIGNAL_STOP_STOPPED))
+ sig->group_exit_code = signr;
+ else
+ WARN_ON_ONCE(!task_ptrace(current));
+ current->group_stop &= ~GROUP_STOP_SIGMASK;
+ current->group_stop |= signr | gstop;
sig->group_stop_count = 1;
- for (t = next_thread(current); t != current; t = next_thread(t))
+ for (t = next_thread(current); t != current;
+ t = next_thread(t)) {
+ t->group_stop &= ~GROUP_STOP_SIGMASK;
/*
* Setting state to TASK_STOPPED for a group
* stop is always done with the siglock held,
* so this check has no races.
*/
- if (!(t->flags & PF_EXITING) &&
- !task_is_stopped_or_traced(t)) {
+ if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) {
+ t->group_stop |= signr | gstop;
sig->group_stop_count++;
signal_wake_up(t, 0);
}
+ }
}
- /*
- * If there are no other threads in the group, or if there is
- * a group stop in progress and we are the last to stop, report
- * to the parent. When ptraced, every thread reports itself.
- */
- notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0;
- notify = tracehook_notify_jctl(notify, CLD_STOPPED);
- /*
- * tracehook_notify_jctl() can drop and reacquire siglock, so
- * we keep ->group_stop_count != 0 before the call. If SIGCONT
- * or SIGKILL comes in between ->group_stop_count == 0.
- */
- if (sig->group_stop_count) {
- if (!--sig->group_stop_count)
- sig->flags = SIGNAL_STOP_STOPPED;
- current->exit_code = sig->group_exit_code;
+retry:
+ if (likely(!task_ptrace(current))) {
+ int notify = 0;
+
+ /*
+ * If there are no other threads in the group, or if there
+ * is a group stop in progress and we are the last to stop,
+ * report to the parent.
+ */
+ if (task_participate_group_stop(current))
+ notify = CLD_STOPPED;
+
__set_current_state(TASK_STOPPED);
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /*
+ * Notify the parent of the group stop completion. Because
+ * we're not holding either the siglock or tasklist_lock
+ * here, ptracer may attach inbetween; however, this is for
+ * group stop and should always be delivered to the real
+ * parent of the group leader. The new ptracer will get
+ * its notification when this task transitions into
+ * TASK_TRACED.
+ */
+ if (notify) {
+ read_lock(&tasklist_lock);
+ do_notify_parent_cldstop(current, false, notify);
+ read_unlock(&tasklist_lock);
+ }
+
+ /* Now we don't run again until woken by SIGCONT or SIGKILL */
+ schedule();
+
+ spin_lock_irq(&current->sighand->siglock);
+ } else {
+ ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK,
+ CLD_STOPPED, 0, NULL);
+ current->exit_code = 0;
}
- spin_unlock_irq(&current->sighand->siglock);
- if (notify) {
- read_lock(&tasklist_lock);
- do_notify_parent_cldstop(current, notify);
- read_unlock(&tasklist_lock);
+ /*
+ * GROUP_STOP_PENDING could be set if another group stop has
+ * started since being woken up or ptrace wants us to transit
+ * between TASK_STOPPED and TRACED. Retry group stop.
+ */
+ if (current->group_stop & GROUP_STOP_PENDING) {
+ WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK));
+ goto retry;
}
- /* Now we don't run again until woken by SIGCONT or SIGKILL */
- do {
- schedule();
- } while (try_to_freeze());
+ /* PTRACE_ATTACH might have raced with task killing, clear trapping */
+ task_clear_group_stop_trapping(current);
+
+ spin_unlock_irq(&current->sighand->siglock);
tracehook_finish_jctl();
- current->exit_code = 0;
return 1;
}
@@ -1814,7 +1977,7 @@ static int ptrace_signal(int signr, siginfo_t *info,
ptrace_signal_deliver(regs, cookie);
/* Let the debugger run. */
- ptrace_stop(signr, 0, info);
+ ptrace_stop(signr, CLD_TRAPPED, 0, info);
/* We're back. Did the debugger cancel the sig? */
signr = current->exit_code;
@@ -1869,18 +2032,36 @@ relock:
* the CLD_ si_code into SIGNAL_CLD_MASK bits.
*/
if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
- int why = (signal->flags & SIGNAL_STOP_CONTINUED)
- ? CLD_CONTINUED : CLD_STOPPED;
+ struct task_struct *leader;
+ int why;
+
+ if (signal->flags & SIGNAL_CLD_CONTINUED)
+ why = CLD_CONTINUED;
+ else
+ why = CLD_STOPPED;
+
signal->flags &= ~SIGNAL_CLD_MASK;
- why = tracehook_notify_jctl(why, CLD_CONTINUED);
spin_unlock_irq(&sighand->siglock);
- if (why) {
- read_lock(&tasklist_lock);
- do_notify_parent_cldstop(current->group_leader, why);
- read_unlock(&tasklist_lock);
- }
+ /*
+ * Notify the parent that we're continuing. This event is
+ * always per-process and doesn't make whole lot of sense
+ * for ptracers, who shouldn't consume the state via
+ * wait(2) either, but, for backward compatibility, notify
+ * the ptracer of the group leader too unless it's gonna be
+ * a duplicate.
+ */
+ read_lock(&tasklist_lock);
+
+ do_notify_parent_cldstop(current, false, why);
+
+ leader = current->group_leader;
+ if (task_ptrace(leader) && !real_parent_is_ptracer(leader))
+ do_notify_parent_cldstop(leader, true, why);
+
+ read_unlock(&tasklist_lock);
+
goto relock;
}
@@ -1897,8 +2078,8 @@ relock:
if (unlikely(signr != 0))
ka = return_ka;
else {
- if (unlikely(signal->group_stop_count > 0) &&
- do_signal_stop(0))
+ if (unlikely(current->group_stop &
+ GROUP_STOP_PENDING) && do_signal_stop(0))
goto relock;
signr = dequeue_signal(current, &current->blocked,
@@ -2017,10 +2198,42 @@ relock:
return signr;
}
+/*
+ * It could be that complete_signal() picked us to notify about the
+ * group-wide signal. Other threads should be notified now to take
+ * the shared signals in @which since we will not.
+ */
+static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
+{
+ sigset_t retarget;
+ struct task_struct *t;
+
+ sigandsets(&retarget, &tsk->signal->shared_pending.signal, which);
+ if (sigisemptyset(&retarget))
+ return;
+
+ t = tsk;
+ while_each_thread(tsk, t) {
+ if (t->flags & PF_EXITING)
+ continue;
+
+ if (!has_pending_signals(&retarget, &t->blocked))
+ continue;
+ /* Remove the signals this thread can handle. */
+ sigandsets(&retarget, &retarget, &t->blocked);
+
+ if (!signal_pending(t))
+ signal_wake_up(t, 0);
+
+ if (sigisemptyset(&retarget))
+ break;
+ }
+}
+
void exit_signals(struct task_struct *tsk)
{
int group_stop = 0;
- struct task_struct *t;
+ sigset_t unblocked;
if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
tsk->flags |= PF_EXITING;
@@ -2036,26 +2249,23 @@ void exit_signals(struct task_struct *tsk)
if (!signal_pending(tsk))
goto out;
- /*
- * It could be that __group_complete_signal() choose us to
- * notify about group-wide signal. Another thread should be
- * woken now to take the signal since we will not.
- */
- for (t = tsk; (t = next_thread(t)) != tsk; )
- if (!signal_pending(t) && !(t->flags & PF_EXITING))
- recalc_sigpending_and_wake(t);
+ unblocked = tsk->blocked;
+ signotset(&unblocked);
+ retarget_shared_pending(tsk, &unblocked);
- if (unlikely(tsk->signal->group_stop_count) &&
- !--tsk->signal->group_stop_count) {
- tsk->signal->flags = SIGNAL_STOP_STOPPED;
- group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
- }
+ if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) &&
+ task_participate_group_stop(tsk))
+ group_stop = CLD_STOPPED;
out:
spin_unlock_irq(&tsk->sighand->siglock);
+ /*
+ * If group stop has completed, deliver the notification. This
+ * should always go to the real parent of the group leader.
+ */
if (unlikely(group_stop)) {
read_lock(&tasklist_lock);
- do_notify_parent_cldstop(tsk, group_stop);
+ do_notify_parent_cldstop(tsk, false, group_stop);
read_unlock(&tasklist_lock);
}
}
@@ -2089,11 +2299,33 @@ long do_no_restart_syscall(struct restart_block *param)
return -EINTR;
}
-/*
- * We don't need to get the kernel lock - this is all local to this
- * particular thread.. (and that's good, because this is _heavily_
- * used by various programs)
+static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
+{
+ if (signal_pending(tsk) && !thread_group_empty(tsk)) {
+ sigset_t newblocked;
+ /* A set of now blocked but previously unblocked signals. */
+ sigandnsets(&newblocked, newset, &current->blocked);
+ retarget_shared_pending(tsk, &newblocked);
+ }
+ tsk->blocked = *newset;
+ recalc_sigpending();
+}
+
+/**
+ * set_current_blocked - change current->blocked mask
+ * @newset: new mask
+ *
+ * It is wrong to change ->blocked directly, this helper should be used
+ * to ensure the process can't miss a shared signal we are going to block.
*/
+void set_current_blocked(const sigset_t *newset)
+{
+ struct task_struct *tsk = current;
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ __set_task_blocked(tsk, newset);
+ spin_unlock_irq(&tsk->sighand->siglock);
+}
/*
* This is also useful for kernel threads that want to temporarily
@@ -2105,30 +2337,29 @@ long do_no_restart_syscall(struct restart_block *param)
*/
int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
{
- int error;
+ struct task_struct *tsk = current;
+ sigset_t newset;
- spin_lock_irq(&current->sighand->siglock);
+ /* Lockless, only current can change ->blocked, never from irq */
if (oldset)
- *oldset = current->blocked;
+ *oldset = tsk->blocked;
- error = 0;
switch (how) {
case SIG_BLOCK:
- sigorsets(&current->blocked, &current->blocked, set);
+ sigorsets(&newset, &tsk->blocked, set);
break;
case SIG_UNBLOCK:
- signandsets(&current->blocked, &current->blocked, set);
+ sigandnsets(&newset, &tsk->blocked, set);
break;
case SIG_SETMASK:
- current->blocked = *set;
+ newset = *set;
break;
default:
- error = -EINVAL;
+ return -EINVAL;
}
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- return error;
+ set_current_blocked(&newset);
+ return 0;
}
/**
@@ -2138,40 +2369,34 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
* @oset: previous value of signal mask if non-null
* @sigsetsize: size of sigset_t type
*/
-SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set,
+SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
sigset_t __user *, oset, size_t, sigsetsize)
{
- int error = -EINVAL;
sigset_t old_set, new_set;
+ int error;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
- goto out;
+ return -EINVAL;
- if (set) {
- error = -EFAULT;
- if (copy_from_user(&new_set, set, sizeof(*set)))
- goto out;
+ old_set = current->blocked;
+
+ if (nset) {
+ if (copy_from_user(&new_set, nset, sizeof(sigset_t)))
+ return -EFAULT;
sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
- error = sigprocmask(how, &new_set, &old_set);
+ error = sigprocmask(how, &new_set, NULL);
if (error)
- goto out;
- if (oset)
- goto set_old;
- } else if (oset) {
- spin_lock_irq(&current->sighand->siglock);
- old_set = current->blocked;
- spin_unlock_irq(&current->sighand->siglock);
+ return error;
+ }
- set_old:
- error = -EFAULT;
- if (copy_to_user(oset, &old_set, sizeof(*oset)))
- goto out;
+ if (oset) {
+ if (copy_to_user(oset, &old_set, sizeof(sigset_t)))
+ return -EFAULT;
}
- error = 0;
-out:
- return error;
+
+ return 0;
}
long do_sigpending(void __user *set, unsigned long sigsetsize)
@@ -2284,6 +2509,66 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
#endif
/**
+ * do_sigtimedwait - wait for queued signals specified in @which
+ * @which: queued signals to wait for
+ * @info: if non-null, the signal's siginfo is returned here
+ * @ts: upper bound on process time suspension
+ */
+int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
+ const struct timespec *ts)
+{
+ struct task_struct *tsk = current;
+ long timeout = MAX_SCHEDULE_TIMEOUT;
+ sigset_t mask = *which;
+ int sig;
+
+ if (ts) {
+ if (!timespec_valid(ts))
+ return -EINVAL;
+ timeout = timespec_to_jiffies(ts);
+ /*
+ * We can be close to the next tick, add another one
+ * to ensure we will wait at least the time asked for.
+ */
+ if (ts->tv_sec || ts->tv_nsec)
+ timeout++;
+ }
+
+ /*
+ * Invert the set of allowed signals to get those we want to block.
+ */
+ sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ signotset(&mask);
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ sig = dequeue_signal(tsk, &mask, info);
+ if (!sig && timeout) {
+ /*
+ * None ready, temporarily unblock those we're interested
+ * while we are sleeping in so that we'll be awakened when
+ * they arrive. Unblocking is always fine, we can avoid
+ * set_current_blocked().
+ */
+ tsk->real_blocked = tsk->blocked;
+ sigandsets(&tsk->blocked, &tsk->blocked, &mask);
+ recalc_sigpending();
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ timeout = schedule_timeout_interruptible(timeout);
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ __set_task_blocked(tsk, &tsk->real_blocked);
+ siginitset(&tsk->real_blocked, 0);
+ sig = dequeue_signal(tsk, &mask, info);
+ }
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ if (sig)
+ return sig;
+ return timeout ? -EINTR : -EAGAIN;
+}
+
+/**
* sys_rt_sigtimedwait - synchronously wait for queued signals specified
* in @uthese
* @uthese: queued signals to wait for
@@ -2295,11 +2580,10 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
siginfo_t __user *, uinfo, const struct timespec __user *, uts,
size_t, sigsetsize)
{
- int ret, sig;
sigset_t these;
struct timespec ts;
siginfo_t info;
- long timeout = 0;
+ int ret;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
@@ -2308,61 +2592,16 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
if (copy_from_user(&these, uthese, sizeof(these)))
return -EFAULT;
- /*
- * Invert the set of allowed signals to get those we
- * want to block.
- */
- sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP));
- signotset(&these);
-
if (uts) {
if (copy_from_user(&ts, uts, sizeof(ts)))
return -EFAULT;
- if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0
- || ts.tv_sec < 0)
- return -EINVAL;
}
- spin_lock_irq(&current->sighand->siglock);
- sig = dequeue_signal(current, &these, &info);
- if (!sig) {
- timeout = MAX_SCHEDULE_TIMEOUT;
- if (uts)
- timeout = (timespec_to_jiffies(&ts)
- + (ts.tv_sec || ts.tv_nsec));
-
- if (timeout) {
- /*
- * None ready -- temporarily unblock those we're
- * interested while we are sleeping in so that we'll
- * be awakened when they arrive.
- */
- current->real_blocked = current->blocked;
- sigandsets(&current->blocked, &current->blocked, &these);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- timeout = schedule_timeout_interruptible(timeout);
-
- spin_lock_irq(&current->sighand->siglock);
- sig = dequeue_signal(current, &these, &info);
- current->blocked = current->real_blocked;
- siginitset(&current->real_blocked, 0);
- recalc_sigpending();
- }
- }
- spin_unlock_irq(&current->sighand->siglock);
+ ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);
- if (sig) {
- ret = sig;
- if (uinfo) {
- if (copy_siginfo_to_user(uinfo, &info))
- ret = -EFAULT;
- }
- } else {
- ret = -EAGAIN;
- if (timeout)
- ret = -EINTR;
+ if (ret > 0 && uinfo) {
+ if (copy_siginfo_to_user(uinfo, &info))
+ ret = -EFAULT;
}
return ret;
@@ -2650,60 +2889,51 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
/**
* sys_sigprocmask - examine and change blocked signals
* @how: whether to add, remove, or set signals
- * @set: signals to add or remove (if non-null)
+ * @nset: signals to add or remove (if non-null)
* @oset: previous value of signal mask if non-null
*
* Some platforms have their own version with special arguments;
* others support only sys_rt_sigprocmask.
*/
-SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set,
+SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
old_sigset_t __user *, oset)
{
- int error;
old_sigset_t old_set, new_set;
+ sigset_t new_blocked;
- if (set) {
- error = -EFAULT;
- if (copy_from_user(&new_set, set, sizeof(*set)))
- goto out;
+ old_set = current->blocked.sig[0];
+
+ if (nset) {
+ if (copy_from_user(&new_set, nset, sizeof(*nset)))
+ return -EFAULT;
new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
- spin_lock_irq(&current->sighand->siglock);
- old_set = current->blocked.sig[0];
+ new_blocked = current->blocked;
- error = 0;
switch (how) {
- default:
- error = -EINVAL;
- break;
case SIG_BLOCK:
- sigaddsetmask(&current->blocked, new_set);
+ sigaddsetmask(&new_blocked, new_set);
break;
case SIG_UNBLOCK:
- sigdelsetmask(&current->blocked, new_set);
+ sigdelsetmask(&new_blocked, new_set);
break;
case SIG_SETMASK:
- current->blocked.sig[0] = new_set;
+ new_blocked.sig[0] = new_set;
break;
+ default:
+ return -EINVAL;
}
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- if (error)
- goto out;
- if (oset)
- goto set_old;
- } else if (oset) {
- old_set = current->blocked.sig[0];
- set_old:
- error = -EFAULT;
+ set_current_blocked(&new_blocked);
+ }
+
+ if (oset) {
if (copy_to_user(oset, &old_set, sizeof(*oset)))
- goto out;
+ return -EFAULT;
}
- error = 0;
-out:
- return error;
+
+ return 0;
}
#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
@@ -2793,8 +3023,10 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
SYSCALL_DEFINE0(pause)
{
- current->state = TASK_INTERRUPTIBLE;
- schedule();
+ while (!signal_pending(current)) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ }
return -ERESTARTNOHAND;
}
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 25cc41cd8f33..62cbc8877fef 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -46,7 +46,9 @@ cond_syscall(sys_getsockopt);
cond_syscall(compat_sys_getsockopt);
cond_syscall(sys_shutdown);
cond_syscall(sys_sendmsg);
+cond_syscall(sys_sendmmsg);
cond_syscall(compat_sys_sendmsg);
+cond_syscall(compat_sys_sendmmsg);
cond_syscall(sys_recvmsg);
cond_syscall(sys_recvmmsg);
cond_syscall(compat_sys_recvmsg);
@@ -69,15 +71,22 @@ cond_syscall(compat_sys_epoll_pwait);
cond_syscall(sys_semget);
cond_syscall(sys_semop);
cond_syscall(sys_semtimedop);
+cond_syscall(compat_sys_semtimedop);
cond_syscall(sys_semctl);
+cond_syscall(compat_sys_semctl);
cond_syscall(sys_msgget);
cond_syscall(sys_msgsnd);
+cond_syscall(compat_sys_msgsnd);
cond_syscall(sys_msgrcv);
+cond_syscall(compat_sys_msgrcv);
cond_syscall(sys_msgctl);
+cond_syscall(compat_sys_msgctl);
cond_syscall(sys_shmget);
cond_syscall(sys_shmat);
+cond_syscall(compat_sys_shmat);
cond_syscall(sys_shmdt);
cond_syscall(sys_shmctl);
+cond_syscall(compat_sys_shmctl);
cond_syscall(sys_mq_open);
cond_syscall(sys_mq_unlink);
cond_syscall(sys_mq_timedsend);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3dd0c46fa3bb..4fc92445a29c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -56,6 +56,7 @@
#include <linux/kprobes.h>
#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
+#include <linux/kmod.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -616,6 +617,11 @@ static struct ctl_table kern_table[] = {
.child = random_table,
},
{
+ .procname = "usermodehelper",
+ .mode = 0555,
+ .child = usermodehelper_table,
+ },
+ {
.procname = "overflowuid",
.data = &overflowuid,
.maxlen = sizeof(int),
@@ -1500,7 +1506,7 @@ static struct ctl_table fs_table[] = {
static struct ctl_table debug_table[] = {
#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
- defined(CONFIG_S390)
+ defined(CONFIG_S390) || defined(CONFIG_TILE)
{
.procname = "exception-trace",
.data = &show_unhandled_signals,
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 9265014cb4db..2d966244ea60 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -494,7 +494,7 @@ static int update_rmtp(ktime_t exp, enum alarmtimer_type type,
*/
static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
{
- enum alarmtimer_type type = restart->nanosleep.index;
+ enum alarmtimer_type type = restart->nanosleep.clockid;
ktime_t exp;
struct timespec __user *rmtp;
struct alarm alarm;
@@ -573,7 +573,7 @@ static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
restart = &current_thread_info()->restart_block;
restart->fn = alarm_timer_nsleep_restart;
- restart->nanosleep.index = type;
+ restart->nanosleep.clockid = type;
restart->nanosleep.expires = exp.tv64;
restart->nanosleep.rmtp = rmtp;
ret = -ERESTART_RESTARTBLOCK;
@@ -669,12 +669,20 @@ static int __init has_wakealarm(struct device *dev, void *name_ptr)
*/
static int __init alarmtimer_init_late(void)
{
+ struct device *dev;
char *str;
/* Find an rtc device and init the rtc_timer */
- class_find_device(rtc_class, NULL, &str, has_wakealarm);
- if (str)
+ dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
+ /* If we have a device then str is valid. See has_wakealarm() */
+ if (dev) {
rtcdev = rtc_class_open(str);
+ /*
+ * Drop the reference we got in class_find_device,
+ * rtc_open takes its own.
+ */
+ put_device(dev);
+ }
if (!rtcdev) {
printk(KERN_WARNING "No RTC device found, ALARM timers will"
" not wake from suspend");
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 22a9da9a9c96..c027d4f602f1 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -197,7 +197,7 @@ EXPORT_SYMBOL_GPL(clockevents_register_device);
static void clockevents_config(struct clock_event_device *dev,
u32 freq)
{
- unsigned long sec;
+ u64 sec;
if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
return;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index d9d5f8c885f6..1c95fd677328 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -639,7 +639,7 @@ static void clocksource_enqueue(struct clocksource *cs)
*/
void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
{
- unsigned long sec;
+ u64 sec;
/*
* Calc the maximum number of seconds which we can run before
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 723c7637e55a..c7218d132738 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -456,23 +456,27 @@ void tick_broadcast_oneshot_control(unsigned long reason)
unsigned long flags;
int cpu;
- raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
-
/*
* Periodic mode does not care about the enter/exit of power
* states
*/
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
- goto out;
+ return;
- bc = tick_broadcast_device.evtdev;
+ /*
+ * We are called with preemtion disabled from the depth of the
+ * idle code, so we can't be moved away.
+ */
cpu = smp_processor_id();
td = &per_cpu(tick_cpu_device, cpu);
dev = td->evtdev;
if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
- goto out;
+ return;
+
+ bc = tick_broadcast_device.evtdev;
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
@@ -489,8 +493,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)
tick_program_event(dev->next_event, 1);
}
}
-
-out:
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 8e6a05a5915a..342408cf68dd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -680,7 +680,7 @@ static void timekeeping_resume(void)
clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
/* Resume hrtimers */
- hres_timers_resume();
+ hrtimers_resume();
}
static int timekeeping_suspend(void)
@@ -1099,6 +1099,21 @@ void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
}
/**
+ * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
+ */
+ktime_t ktime_get_monotonic_offset(void)
+{
+ unsigned long seq;
+ struct timespec wtom;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ wtom = wall_to_monotonic;
+ } while (read_seqretry(&xtime_lock, seq));
+ return timespec_to_ktime(wtom);
+}
+
+/**
* xtime_update() - advances the timekeeping infrastructure
* @ticks: number of ticks, that have elapsed since the last call.
*
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 44646179eaba..bff131b9510a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -15,6 +15,7 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
+#include <linux/proc_fs.h>
static struct uts_namespace *create_uts_ns(void)
{
@@ -79,3 +80,41 @@ void free_uts_ns(struct kref *kref)
put_user_ns(ns->user_ns);
kfree(ns);
}
+
+static void *utsns_get(struct task_struct *task)
+{
+ struct uts_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ rcu_read_lock();
+ nsproxy = task_nsproxy(task);
+ if (nsproxy) {
+ ns = nsproxy->uts_ns;
+ get_uts_ns(ns);
+ }
+ rcu_read_unlock();
+
+ return ns;
+}
+
+static void utsns_put(void *ns)
+{
+ put_uts_ns(ns);
+}
+
+static int utsns_install(struct nsproxy *nsproxy, void *ns)
+{
+ get_uts_ns(ns);
+ put_uts_ns(nsproxy->uts_ns);
+ nsproxy->uts_ns = ns;
+ return 0;
+}
+
+const struct proc_ns_operations utsns_operations = {
+ .name = "uts",
+ .type = CLONE_NEWUTS,
+ .get = utsns_get,
+ .put = utsns_put,
+ .install = utsns_install,
+};
+
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index e3378e8d3a5c..0400553f0d04 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -2866,9 +2866,7 @@ static int alloc_cwqs(struct workqueue_struct *wq)
}
}
- /* just in case, make sure it's actually aligned
- * - this is affected by PERCPU() alignment in vmlinux.lds.S
- */
+ /* just in case, make sure it's actually aligned */
BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
return wq->cpu_wq.v ? 0 : -ENOMEM;
}