From bf26c018490c2fce7fe9b629083b96ce0e6ad019 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 7 Apr 2011 16:53:20 +0200 Subject: ptrace: Prepare to fix racy accesses on task breakpoints When a task is traced and is in a stopped state, the tracer may execute a ptrace request to examine the tracee state and get its task struct. Right after, the tracee can be killed and thus its breakpoints released. This can happen concurrently when the tracer is in the middle of reading or modifying these breakpoints, leading to dereferencing a freed pointer. Hence, to prepare the fix, create a generic breakpoint reference holding API. When a reference on the breakpoints of a task is held, the breakpoints won't be released until the last reference is dropped. After that, no more ptrace request on the task's breakpoints can be serviced for the tracer. Reported-by: Oleg Nesterov Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Will Deacon Cc: Prasad Cc: Paul Mundt Cc: v2.6.33.. Link: http://lkml.kernel.org/r/1302284067-7860-2-git-send-email-fweisbec@gmail.com --- kernel/exit.c | 2 +- kernel/ptrace.c | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index f5d2f63bae0b..8dd874181542 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1016,7 +1016,7 @@ NORET_TYPE void do_exit(long code) /* * FIXME: do that only when needed, using sched_exit tracepoint */ - flush_ptrace_hw_breakpoint(tsk); + ptrace_put_breakpoints(tsk); exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0fc1eed28d27..dc7ab65f3b36 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -22,6 +22,7 @@ #include #include #include +#include /* @@ -879,3 +880,19 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, return ret; } #endif /* CONFIG_COMPAT */ + +#ifdef CONFIG_HAVE_HW_BREAKPOINT +int ptrace_get_breakpoints(struct task_struct *tsk) +{ + if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt)) + return 0; + + return -1; +} + +void ptrace_put_breakpoints(struct task_struct *tsk) +{ + if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt)) + flush_ptrace_hw_breakpoint(tsk); +} +#endif /* CONFIG_HAVE_HW_BREAKPOINT */ -- cgit v1.2.3 From e05b2efb82596905ebfe88e8612ee81dec9b6592 Mon Sep 17 00:00:00 2001 From: john stultz Date: Wed, 4 May 2011 18:16:50 -0700 Subject: clocksource: Install completely before selecting Christian Hoffmann reported that the command line clocksource override with acpi_pm timer fails: Kernel command line: clocksource=acpi_pm hpet clockevent registered Switching to clocksource hpet Override clocksource acpi_pm is not HRT compatible. Cannot switch while in HRT/NOHZ mode. The watchdog code is what enables CLOCK_SOURCE_VALID_FOR_HRES, but we actually end up selecting the clocksource before we enqueue it into the watchdog list, so that's why we see the warning and fail to switch to acpi_pm timer as requested. That's particularly bad when we want to debug timekeeping related problems in early boot. Put the selection call last. Reported-by: Christian Hoffmann Signed-off-by: John Stultz Cc: stable@kernel.org # 32... Link: http://lkml.kernel.org/r/%3C1304558210.2943.24.camel%40work-vm%3E Signed-off-by: Thomas Gleixner --- kernel/time/clocksource.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 6519cf62d9cd..0e17c10f8a9d 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -685,8 +685,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) /* Add clocksource to the clcoksource list */ mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); - clocksource_select(); clocksource_enqueue_watchdog(cs); + clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; } @@ -706,8 +706,8 @@ int clocksource_register(struct clocksource *cs) mutex_lock(&clocksource_mutex); clocksource_enqueue(cs); - clocksource_select(); clocksource_enqueue_watchdog(cs); + clocksource_select(); mutex_unlock(&clocksource_mutex); return 0; } -- cgit v1.2.3 From a3a4a5acd3bd2f6f1e102e1f1b9d2e2bb320a7fd Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 5 May 2011 23:55:18 -0400 Subject: Regression: partial revert "tracing: Remove lock_depth from event entry" This partially reverts commit e6e1e2593592a8f6f6380496655d8c6f67431266. That commit changed the structure layout of the trace structure, which in turn broke PowerTOP (1.9x generation) quite badly. I appreciate not wanting to expose the variable in question, and PowerTOP was not using it, so I've replaced the variable with just a padding field - that way if in the future a new field is needed it can just use this padding field. Signed-off-by: Arjan van de Ven Signed-off-by: Linus Torvalds --- kernel/trace/trace.c | 1 + kernel/trace/trace_events.c | 1 + 2 files changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d38c16a06a6f..1cb49be7c7fb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1110,6 +1110,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, entry->preempt_count = pc & 0xff; entry->pid = (tsk) ? tsk->pid : 0; + entry->padding = 0; entry->flags = #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index e88f74fe1d4c..2fe110341359 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -116,6 +116,7 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); + __common_field(int, padding); return ret; } -- cgit v1.2.3 From 87186475a402391a1ca7d42a675c9b35a18dc348 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 May 2011 21:09:53 +0200 Subject: PM: Fix warning in pm_restrict_gfp_mask() during SNAPSHOT_S2RAM ioctl A warning is printed by pm_restrict_gfp_mask() while the SNAPSHOT_S2RAM ioctl is being executed after creating a hibernation image, because pm_restrict_gfp_mask() has been called once already before the image creation and suspend_devices_and_enter() calls it once again. This happens after commit 452aa6999e6703ffbddd7f6ea124d3 (mm/pm: force GFP_NOIO during suspend/hibernation and resume). To avoid this issue, move pm_restrict_gfp_mask() and pm_restore_gfp_mask() from suspend_devices_and_enter() to its caller in kernel/power/suspend.c. Reported-by: Alexandre Felipe Muller de Souza Signed-off-by: Rafael J. Wysocki Cc: stable@kernel.org --- kernel/power/suspend.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 8935369d503a..6275970b2189 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -216,7 +216,6 @@ int suspend_devices_and_enter(suspend_state_t state) goto Close; } suspend_console(); - pm_restrict_gfp_mask(); suspend_test_start(); error = dpm_suspend_start(PMSG_SUSPEND); if (error) { @@ -233,7 +232,6 @@ int suspend_devices_and_enter(suspend_state_t state) suspend_test_start(); dpm_resume_end(PMSG_RESUME); suspend_test_finish("resume devices"); - pm_restore_gfp_mask(); resume_console(); Close: if (suspend_ops->end) @@ -294,7 +292,9 @@ int enter_state(suspend_state_t state) goto Finish; pr_debug("PM: Entering %s sleep\n", pm_states[state]); + pm_restrict_gfp_mask(); error = suspend_devices_and_enter(state); + pm_restore_gfp_mask(); Finish: pr_debug("PM: Finishing wakeup.\n"); -- cgit v1.2.3 From 9744997a8a2280e67984d4bffd87221d24f3b6b1 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 May 2011 21:10:01 +0200 Subject: PM / Hibernate: Make snapshot_release() restore GFP mask If the process using the hibernate user space interface closes /dev/snapshot after creating a hibernation image without thawing tasks, snapshot_release() should call pm_restore_gfp_mask() to restore the GFP mask used before the creation of the image. Make that happen. Tested-by: Alexandre Felipe Muller de Souza Signed-off-by: Rafael J. Wysocki Cc: stable@kernel.org --- kernel/power/user.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index c36c3b9e8a84..6522be913ac1 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -135,8 +135,10 @@ static int snapshot_release(struct inode *inode, struct file *filp) free_basic_memory_bitmaps(); data = filp->private_data; free_all_swap_pages(data->swap); - if (data->frozen) + if (data->frozen) { + pm_restore_gfp_mask(); thaw_processes(); + } pm_notifier_call_chain(data->mode == O_RDONLY ? PM_POST_HIBERNATION : PM_POST_RESTORE); atomic_inc(&snapshot_device_available); -- cgit v1.2.3 From 36cb7035ea0c11ef2c7fa2bbe0cd181b23569b29 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 10 May 2011 21:10:13 +0200 Subject: PM / Hibernate: Fix ioctl SNAPSHOT_S2RAM The SNAPSHOT_S2RAM ioctl used for implementing the feature allowing one to suspend to RAM after creating a hibernation image is currently broken, because it doesn't clear the "ready" flag in the struct snapshot_data object handled by it. As a result, the SNAPSHOT_UNFREEZE doesn't work correctly after SNAPSHOT_S2RAM has returned and the user space hibernate task cannot thaw the other processes as appropriate. Make SNAPSHOT_S2RAM clear data->ready to fix this problem. Tested-by: Alexandre Felipe Muller de Souza Signed-off-by: Rafael J. Wysocki Cc: stable@kernel.org --- kernel/power/user.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/power/user.c b/kernel/power/user.c index 6522be913ac1..7d02d33be699 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -381,6 +381,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, * PM_HIBERNATION_PREPARE */ error = suspend_devices_and_enter(PM_SUSPEND_MEM); + data->ready = 0; break; case SNAPSHOT_PLATFORM_SUPPORT: -- cgit v1.2.3 From 47a150edc2ae734c0f4bf50aa19499e23b9a46f8 Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Fri, 13 May 2011 04:27:54 +0100 Subject: Cache user_ns in struct cred If !CONFIG_USERNS, have current_user_ns() defined to (&init_user_ns). Get rid of _current_user_ns. This requires nsown_capable() to be defined in capability.c rather than as static inline in capability.h, so do that. Request_key needs init_user_ns defined at current_user_ns if !CONFIG_USERNS, so forward-declare that in cred.h if !CONFIG_USERNS at current_user_ns() define. Compile-tested with and without CONFIG_USERNS. Signed-off-by: Serge E. Hallyn [ This makes a huge performance difference for acl_permission_check(), up to 30%. And that is one of the hottest kernel functions for loads that are pathname-lookup heavy. ] Signed-off-by: Linus Torvalds --- kernel/capability.c | 12 ++++++++++++ kernel/cred.c | 12 ++++++------ 2 files changed, 18 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/capability.c b/kernel/capability.c index bf0c734d0c12..32a80e08ff4b 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -399,3 +399,15 @@ bool task_ns_capable(struct task_struct *t, int cap) return ns_capable(task_cred_xxx(t, user)->user_ns, cap); } EXPORT_SYMBOL(task_ns_capable); + +/** + * nsown_capable - Check superior capability to one's own user_ns + * @cap: The capability in question + * + * Return true if the current task has the given superior capability + * targeted at its own user namespace. + */ +bool nsown_capable(int cap) +{ + return ns_capable(current_user_ns(), cap); +} diff --git a/kernel/cred.c b/kernel/cred.c index 5557b55048df..8093c16b84b1 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -54,6 +54,7 @@ struct cred init_cred = { .cap_effective = CAP_INIT_EFF_SET, .cap_bset = CAP_INIT_BSET, .user = INIT_USER, + .user_ns = &init_user_ns, .group_info = &init_groups, #ifdef CONFIG_KEYS .tgcred = &init_tgcred, @@ -410,6 +411,11 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) goto error_put; } + /* cache user_ns in cred. Doesn't need a refcount because it will + * stay pinned by cred->user + */ + new->user_ns = new->user->user_ns; + #ifdef CONFIG_KEYS /* new threads get their own thread keyrings if their parent already * had one */ @@ -741,12 +747,6 @@ int set_create_files_as(struct cred *new, struct inode *inode) } EXPORT_SYMBOL(set_create_files_as); -struct user_namespace *current_user_ns(void) -{ - return _current_user_ns(); -} -EXPORT_SYMBOL(current_user_ns); - #ifdef CONFIG_DEBUG_CREDENTIALS bool creds_are_invalid(const struct cred *cred) -- cgit v1.2.3 From 07f4beb0b5bbfaf36a64aa00d59e670ec578a95a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 May 2011 11:07:48 +0200 Subject: tick: Clear broadcast active bit when switching to oneshot The first cpu which switches from periodic to oneshot mode switches also the broadcast device into oneshot mode. The broadcast device serves as a backup for per cpu timers which stop in deeper C-states. To avoid starvation of the cpus which might be in idle and depend on broadcast mode it marks the other cpus as broadcast active and sets the brodcast expiry value of those cpus to the next tick. The oneshot mode broadcast bit for the other cpus is sticky and gets only cleared when those cpus exit idle. If a cpu was not idle while the bit got set in consequence the bit prevents that the broadcast device is armed on behalf of that cpu when it enters idle for the first time after it switched to oneshot mode. In most cases that goes unnoticed as one of the other cpus has usually a timer pending which keeps the broadcast device armed with a short timeout. Now if the only cpu which has a short timer active has the bit set then the broadcast device will not be armed on behalf of that cpu and will fire way after the expected timer expiry. In the case of Christians bug report it took ~145 seconds which is about half of the wrap around time of HPET (the limit for that device) due to the fact that all other cpus had no timers armed which expired before the 145 seconds timeframe. The solution is simply to clear the broadcast active bit unconditionally when a cpu switches to oneshot mode after the first cpu switched the broadcast device over. It's not idle at that point otherwise it would not be executing that code. [ I fundamentally hate that broadcast crap. Why the heck thought some folks that when going into deep idle it's a brilliant concept to switch off the last device which brings the cpu back from that state? ] Thanks to Christian for providing all the valuable debug information! Reported-and-tested-by: Christian Hoffmann Cc: John Stultz Link: http://lkml.kernel.org/r/%3Calpine.LFD.2.02.1105161105170.3078%40ionos%3E Cc: stable@kernel.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index da800ffa810c..723c7637e55a 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -522,10 +522,11 @@ static void tick_broadcast_init_next_event(struct cpumask *mask, */ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { + int cpu = smp_processor_id(); + /* Set it up only once ! */ if (bc->event_handler != tick_handle_oneshot_broadcast) { int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; - int cpu = smp_processor_id(); bc->event_handler = tick_handle_oneshot_broadcast; clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); @@ -551,6 +552,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) tick_broadcast_set_event(tick_next_period, 1); } else bc->next_event.tv64 = KTIME_MAX; + } else { + /* + * The first cpu which switches to oneshot mode sets + * the bit for all other cpus which are in the general + * (periodic) broadcast mask. So the bit is set and + * would prevent the first broadcast enter after this + * to program the bc device. + */ + tick_broadcast_clear_oneshot(cpu); } } -- cgit v1.2.3 From d410fa4ef99112386de5f218dd7df7b4fca910b4 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 19 May 2011 15:59:38 -0700 Subject: Create Documentation/security/, move LSM-, credentials-, and keys-related files from Documentation/ to Documentation/security/, add Documentation/security/00-INDEX, and update all occurrences of Documentation/ to Documentation/security/. --- kernel/cred.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index 8093c16b84b1..004e3679624d 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -1,4 +1,4 @@ -/* Task credentials management - see Documentation/credentials.txt +/* Task credentials management - see Documentation/security/credentials.txt * * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved. * Written by David Howells (dhowells@redhat.com) -- cgit v1.2.3