From 84bb5b645ec5a54744180a1edc5dc72adc862457 Mon Sep 17 00:00:00 2001 From: George McCollister Date: Tue, 18 Feb 2014 17:56:51 -0600 Subject: sched: Fix double normalization of vruntime commit 791c9e0292671a3bfa95286bb5c08129d8605618 upstream. dequeue_entity() is called when p->on_rq and sets se->on_rq = 0 which appears to guarentee that the !se->on_rq condition is met. If the task has done set_current_state(TASK_INTERRUPTIBLE) without schedule() the second condition will be met and vruntime will be incorrectly adjusted twice. In certain cases this can result in the task's vruntime never increasing past the vruntime of other tasks on the CFS' run queue, starving them of CPU time. This patch changes switched_from_fair() to use !p->on_rq instead of !se->on_rq. I'm able to cause a task with a priority of 120 to starve all other tasks with the same priority on an ARM platform running 3.2.51-rt72 PREEMPT RT by writing one character at time to a serial tty (16550 UART) in a tight loop. I'm also able to verify making this change corrects the problem on that platform and kernel version. Signed-off-by: George McCollister Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1392767811-28916-1-git-send-email-george.mccollister@gmail.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/sched/fair.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 305ef886219e..c7ab8eab5427 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5862,15 +5862,15 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p) struct cfs_rq *cfs_rq = cfs_rq_of(se); /* - * Ensure the task's vruntime is normalized, so that when its + * Ensure the task's vruntime is normalized, so that when it's * switched back to the fair class the enqueue_entity(.flags=0) will * do the right thing. * - * If it was on_rq, then the dequeue_entity(.flags=0) will already - * have normalized the vruntime, if it was !on_rq, then only when + * If it's on_rq, then the dequeue_entity(.flags=0) will already + * have normalized the vruntime, if it's !on_rq, then only when * the task is sleeping will it still have non-normalized vruntime. */ - if (!se->on_rq && p->state != TASK_RUNNING) { + if (!p->on_rq && p->state != TASK_RUNNING) { /* * Fix up our vruntime so that the current sleep doesn't * cause 'unlimited' sleep bonus. -- cgit v1.2.3 From 56f1c4124bd0c769591071916abc5358b8811c1a Mon Sep 17 00:00:00 2001 From: Chuansheng Liu Date: Mon, 24 Feb 2014 11:29:50 +0800 Subject: genirq: Remove racy waitqueue_active check commit c685689fd24d310343ac33942e9a54a974ae9c43 upstream. We hit one rare case below: T1 calling disable_irq(), but hanging at synchronize_irq() always; The corresponding irq thread is in sleeping state; And all CPUs are in idle state; After analysis, we found there is one possible scenerio which causes T1 is waiting there forever: CPU0 CPU1 synchronize_irq() wait_event() spin_lock() atomic_dec_and_test(&threads_active) insert the __wait into queue spin_unlock() if(waitqueue_active) atomic_read(&threads_active) wake_up() Here after inserted the __wait into queue on CPU0, and before test if queue is empty on CPU1, there is no barrier, it maybe cause it is not visible for CPU1 immediately, although CPU0 has updated the queue list. It is similar for CPU0 atomic_read() threads_active also. So we'd need one smp_mb() before waitqueue_active.that, but removing the waitqueue_active() check solves it as wel l and it makes things simple and clear. Signed-off-by: Chuansheng Liu Cc: Xiaoming Wang Link: http://lkml.kernel.org/r/1393212590-32543-1-git-send-email-chuansheng.liu@intel.com Signed-off-by: Thomas Gleixner Signed-off-by: Greg Kroah-Hartman --- kernel/irq/manage.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index dc4db3228dcd..9bd5c8a6c8ee 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -802,8 +802,7 @@ static irqreturn_t irq_thread_fn(struct irq_desc *desc, static void wake_threads_waitq(struct irq_desc *desc) { - if (atomic_dec_and_test(&desc->threads_active) && - waitqueue_active(&desc->wait_for_threads)) + if (atomic_dec_and_test(&desc->threads_active)) wake_up(&desc->wait_for_threads); } -- cgit v1.2.3 From 4bdd401e8b7384a685606f2254e634805580a2aa Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 27 Feb 2014 18:19:36 +0800 Subject: cpuset: fix a race condition in __cpuset_node_allowed_softwall() commit 99afb0fd5f05aac467ffa85c36778fec4396209b upstream. It's not safe to access task's cpuset after releasing task_lock(). Holding callback_mutex won't help. Signed-off-by: Li Zefan Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- kernel/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d313870dcd02..d9dd521ddd6b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2422,9 +2422,9 @@ int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) task_lock(current); cs = nearest_hardwall_ancestor(task_cs(current)); + allowed = node_isset(node, cs->mems_allowed); task_unlock(current); - allowed = node_isset(node, cs->mems_allowed); mutex_unlock(&callback_mutex); return allowed; } -- cgit v1.2.3 From d6a6d1f38ce55aa5a7d8aab972176660b19fd7ab Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 26 Feb 2014 13:37:38 -0500 Subject: tracing: Do not add event files for modules that fail tracepoints commit 45ab2813d40d88fc575e753c38478de242d03f88 upstream. If a module fails to add its tracepoints due to module tainting, do not create the module event infrastructure in the debugfs directory. As the events will not work and worse yet, they will silently fail, making the user wonder why the events they enable do not display anything. Having a warning on module load and the events not visible to the users will make the cause of the problem much clearer. Link: http://lkml.kernel.org/r/20140227154923.265882695@goodmis.org Fixes: 6d723736e472 "tracing/events: add support for modules to TRACE_EVENT" Acked-by: Mathieu Desnoyers Cc: Rusty Russell Signed-off-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_events.c | 10 ++++++++++ kernel/tracepoint.c | 7 ++++++- 2 files changed, 16 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 3d18aadef493..2f4b185bfc23 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1860,6 +1860,16 @@ static void trace_module_add_events(struct module *mod) struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call **call, **start, **end; + if (!mod->num_trace_events) + return; + + /* Don't add infrastructure for mods without tracepoints */ + if (trace_module_has_bad_taint(mod)) { + pr_err("%s: module has bad taint, not creating trace events\n", + mod->name); + return; + } + start = mod->trace_events; end = mod->trace_events + mod->num_trace_events; diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 29f26540e9c9..031cc5655a51 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -631,6 +631,11 @@ void tracepoint_iter_reset(struct tracepoint_iter *iter) EXPORT_SYMBOL_GPL(tracepoint_iter_reset); #ifdef CONFIG_MODULES +bool trace_module_has_bad_taint(struct module *mod) +{ + return mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP)); +} + static int tracepoint_module_coming(struct module *mod) { struct tp_module *tp_mod, *iter; @@ -641,7 +646,7 @@ static int tracepoint_module_coming(struct module *mod) * module headers (for forced load), to make sure we don't cause a crash. * Staging and out-of-tree GPL modules are fine. */ - if (mod->taints & ~((1 << TAINT_OOT_MODULE) | (1 << TAINT_CRAP))) + if (trace_module_has_bad_taint(mod)) return 0; mutex_lock(&tracepoints_mutex); tp_mod = kmalloc(sizeof(struct tp_module), GFP_KERNEL); -- cgit v1.2.3 From a191212af8f4895d6a40c9d53fa84e9ae575ecd0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 26 Jun 2013 12:17:32 +0200 Subject: tick: Make oneshot broadcast robust vs. CPU offlining commit c9b5a266b103af873abb9ac03bc3d067702c8f4b upstream. In periodic mode we remove offline cpus from the broadcast propagation mask. In oneshot mode we fail to do so. This was not a problem so far, but the recent changes to the broadcast propagation introduced a constellation which can result in a NULL pointer dereference. What happens is: CPU0 CPU1 idle() arch_idle() tick_broadcast_oneshot_control(OFF); set cpu1 in tick_broadcast_force_mask if (cpu_offline()) arch_cpu_dead() cpu_dead_cleanup(cpu1) cpu1 tickdevice pointer = NULL broadcast interrupt dereference cpu1 tickdevice pointer -> OOPS We dereference the pointer because cpu1 is still set in tick_broadcast_force_mask and tick_do_broadcast() expects a valid cpumask and therefor lacks any further checks. Remove the cpu from the tick_broadcast_force_mask before we set the tick device pointer to NULL. Also add a sanity check to the oneshot broadcast function, so we can detect such issues w/o crashing the machine. Reported-by: Prarit Bhargava Cc: athorlton@sgi.com Cc: CAI Qian Link: http://lkml.kernel.org/r/alpine.DEB.2.02.1306261303260.4013@ionos.tec.linutronix.de Signed-off-by: Thomas Gleixner Signed-off-by: Preeti U Murthy Signed-off-by: Greg Kroah-Hartman --- kernel/time/tick-broadcast.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index f681da32a2ff..19ee339a1d0d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -593,6 +593,13 @@ again: cpumask_or(tmpmask, tmpmask, tick_broadcast_force_mask); cpumask_clear(tick_broadcast_force_mask); + /* + * Sanity check. Catch the case where we try to broadcast to + * offline cpus. + */ + if (WARN_ON_ONCE(!cpumask_subset(tmpmask, cpu_online_mask))) + cpumask_and(tmpmask, tmpmask, cpu_online_mask); + /* * Wakeup the cpus which have an expired event. */ @@ -834,10 +841,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup) raw_spin_lock_irqsave(&tick_broadcast_lock, flags); /* - * Clear the broadcast mask flag for the dead cpu, but do not - * stop the broadcast device! + * Clear the broadcast masks for the dead cpu, but do not stop + * the broadcast device! */ cpumask_clear_cpu(cpu, tick_broadcast_oneshot_mask); + cpumask_clear_cpu(cpu, tick_broadcast_pending_mask); + cpumask_clear_cpu(cpu, tick_broadcast_force_mask); raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags); } -- cgit v1.2.3 From a1c10a94ff3c76b83a9e2899659ff27877fce23f Mon Sep 17 00:00:00 2001 From: Vaibhav Nagarnaik Date: Thu, 13 Feb 2014 19:51:48 -0800 Subject: tracing: Fix array size mismatch in format string commit 87291347c49dc40aa339f587b209618201c2e527 upstream. In event format strings, the array size is reported in two locations. One in array subscript and then via the "size:" attribute. The values reported there have a mismatch. For e.g., in sched:sched_switch the prev_comm and next_comm character arrays have subscript values as [32] where as the actual field size is 16. name: sched_switch ID: 301 format: field:unsigned short common_type; offset:0; size:2; signed:0; field:unsigned char common_flags; offset:2; size:1; signed:0; field:unsigned char common_preempt_count; offset:3; size:1;signed:0; field:int common_pid; offset:4; size:4; signed:1; field:char prev_comm[32]; offset:8; size:16; signed:1; field:pid_t prev_pid; offset:24; size:4; signed:1; field:int prev_prio; offset:28; size:4; signed:1; field:long prev_state; offset:32; size:8; signed:1; field:char next_comm[32]; offset:40; size:16; signed:1; field:pid_t next_pid; offset:56; size:4; signed:1; field:int next_prio; offset:60; size:4; signed:1; After bisection, the following commit was blamed: 92edca0 tracing: Use direct field, type and system names This commit removes the duplication of strings for field->name and field->type assuming that all the strings passed in __trace_define_field() are immutable. This is not true for arrays, where the type string is created in event_storage variable and field->type for all array fields points to event_storage. Use __stringify() to create a string constant for the type string. Also, get rid of event_storage and event_storage_mutex that are not needed anymore. also, an added benefit is that this reduces the overhead of events a bit more: text data bss dec hex filename 8424787 2036472 1302528 11763787 b3804b vmlinux 8420814 2036408 1302528 11759750 b37086 vmlinux.patched Link: http://lkml.kernel.org/r/1392349908-29685-1-git-send-email-vnagarnaik@google.com Cc: Laurent Chavey Signed-off-by: Vaibhav Nagarnaik Signed-off-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_events.c | 6 ------ kernel/trace/trace_export.c | 7 ++----- 2 files changed, 2 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2f4b185bfc23..001b349af939 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -27,12 +27,6 @@ DEFINE_MUTEX(event_mutex); -DEFINE_MUTEX(event_storage_mutex); -EXPORT_SYMBOL_GPL(event_storage_mutex); - -char event_storage[EVENT_STORAGE_SIZE]; -EXPORT_SYMBOL_GPL(event_storage); - LIST_HEAD(ftrace_events); static LIST_HEAD(ftrace_common_fields); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d21a74670088..d7d0b50b1b70 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -95,15 +95,12 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef __array #define __array(type, item, len) \ do { \ + char *type_str = #type"["__stringify(len)"]"; \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - mutex_lock(&event_storage_mutex); \ - snprintf(event_storage, sizeof(event_storage), \ - "%s[%d]", #type, len); \ - ret = trace_define_field(event_call, event_storage, #item, \ + ret = trace_define_field(event_call, type_str, #item, \ offsetof(typeof(field), item), \ sizeof(field.item), \ is_signed_type(type), filter_type); \ - mutex_unlock(&event_storage_mutex); \ if (ret) \ return ret; \ } while (0); -- cgit v1.2.3 From ccdb5fa37f4d9a80cd0a9170e5bcc7c9510d8c1b Mon Sep 17 00:00:00 2001 From: Gerald Schaefer Date: Fri, 24 May 2013 18:07:49 +0200 Subject: sched/autogroup: Fix race with task_groups list commit 41261b6a832ea0e788627f6a8707854423f9ff49 upstream. In autogroup_create(), a tg is allocated and added to the task_groups list. If CONFIG_RT_GROUP_SCHED is set, this tg is then modified while on the list, without locking. This can race with someone walking the list, like __enable_runtime() during CPU unplug, and result in a use-after-free bug. To fix this, move sched_online_group(), which adds the tg to the list, to the end of the autogroup_create() function after the modification. Signed-off-by: Gerald Schaefer Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1369411669-46971-2-git-send-email-gerald.schaefer@de.ibm.com Signed-off-by: Ingo Molnar Signed-off-by: Preeti U Murthy Signed-off-by: Greg Kroah-Hartman --- kernel/sched/auto_group.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 64de5f8b0c9e..4a073539c58e 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -77,8 +77,6 @@ static inline struct autogroup *autogroup_create(void) if (IS_ERR(tg)) goto out_free; - sched_online_group(tg, &root_task_group); - kref_init(&ag->kref); init_rwsem(&ag->lock); ag->id = atomic_inc_return(&autogroup_seq_nr); @@ -98,6 +96,7 @@ static inline struct autogroup *autogroup_create(void) #endif tg->autogroup = ag; + sched_online_group(tg, &root_task_group); return ag; out_free: -- cgit v1.2.3 From f26c70a452dc0507bf7d3d2c3158ee7808e14f1c Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Sun, 2 Mar 2014 13:09:47 +0100 Subject: futex: Allow architectures to skip futex_atomic_cmpxchg_inatomic() test commit 03b8c7b623c80af264c4c8d6111e5c6289933666 upstream. If an architecture has futex_atomic_cmpxchg_inatomic() implemented and there is no runtime check necessary, allow to skip the test within futex_init(). This allows to get rid of some code which would always give the same result, and also allows the compiler to optimize a couple of if statements away. Signed-off-by: Heiko Carstens Cc: Finn Thain Cc: Geert Uytterhoeven Link: http://lkml.kernel.org/r/20140302120947.GA3641@osiris Signed-off-by: Thomas Gleixner [geert: Backported to v3.10..v3.13] Signed-off-by: Geert Uytterhoeven Signed-off-by: Greg Kroah-Hartman --- kernel/futex.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/futex.c b/kernel/futex.c index a283b3041072..3bc18bf48d0c 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -67,7 +67,9 @@ #include "rtmutex_common.h" +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG int __read_mostly futex_cmpxchg_enabled; +#endif #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) @@ -2729,10 +2731,10 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } -static int __init futex_init(void) +static void __init futex_detect_cmpxchg(void) { +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG u32 curval; - int i; /* * This will fail and we want it. Some arch implementations do @@ -2746,6 +2748,14 @@ static int __init futex_init(void) */ if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT) futex_cmpxchg_enabled = 1; +#endif +} + +static int __init futex_init(void) +{ + int i; + + futex_detect_cmpxchg(); for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { plist_head_init(&futex_queues[i].chain); -- cgit v1.2.3 From 98afe6dfdef0ef9df6e21cdd9d977bfc6147b0a9 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 14 Apr 2014 16:58:55 -0400 Subject: user namespace: fix incorrect memory barriers commit e79323bd87808fdfbc68ce6c5371bd224d9672ee upstream. smp_read_barrier_depends() can be used if there is data dependency between the readers - i.e. if the read operation after the barrier uses address that was obtained from the read operation before the barrier. In this file, there is only control dependency, no data dependecy, so the use of smp_read_barrier_depends() is incorrect. The code could fail in the following way: * the cpu predicts that idx < entries is true and starts executing the body of the for loop * the cpu fetches map->extent[0].first and map->extent[0].count * the cpu fetches map->nr_extents * the cpu verifies that idx < extents is true, so it commits the instructions in the body of the for loop The problem is that in this scenario, the cpu read map->extent[0].first and map->nr_extents in the wrong order. We need a full read memory barrier to prevent it. Signed-off-by: Mikulas Patocka Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/user_namespace.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9064b919a406..9bea1d7dd21f 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -148,7 +148,7 @@ static u32 map_id_range_down(struct uid_gid_map *map, u32 id, u32 count) /* Find the matching extent */ extents = map->nr_extents; - smp_read_barrier_depends(); + smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; @@ -172,7 +172,7 @@ static u32 map_id_down(struct uid_gid_map *map, u32 id) /* Find the matching extent */ extents = map->nr_extents; - smp_read_barrier_depends(); + smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].first; last = first + map->extent[idx].count - 1; @@ -195,7 +195,7 @@ static u32 map_id_up(struct uid_gid_map *map, u32 id) /* Find the matching extent */ extents = map->nr_extents; - smp_read_barrier_depends(); + smp_rmb(); for (idx = 0; idx < extents; idx++) { first = map->extent[idx].lower_first; last = first + map->extent[idx].count - 1; @@ -611,9 +611,8 @@ static ssize_t map_write(struct file *file, const char __user *buf, * were written before the count of the extents. * * To achieve this smp_wmb() is used on guarantee the write - * order and smp_read_barrier_depends() is guaranteed that we - * don't have crazy architectures returning stale data. - * + * order and smp_rmb() is guaranteed that we don't have crazy + * architectures returning stale data. */ mutex_lock(&id_map_mutex); -- cgit v1.2.3 From a15c36b2de489e2388819b85ccb933b747a18b70 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 2 Apr 2014 17:45:05 +0200 Subject: pid_namespace: pidns_get() should check task_active_pid_ns() != NULL commit d23082257d83e4bc89727d5aedee197e907999d2 upstream. pidns_get()->get_pid_ns() can hit ns == NULL. This task_struct can't go away, but task_active_pid_ns(task) is NULL if release_task(task) was already called. Alternatively we could change get_pid_ns(ns) to check ns != NULL, but it seems that other callers are fine. Signed-off-by: Oleg Nesterov Cc: Eric W. Biederman ebiederm@xmission.com> Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/pid_namespace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 6917e8edb48e..e32703d5e0ab 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -312,7 +312,9 @@ static void *pidns_get(struct task_struct *task) struct pid_namespace *ns; rcu_read_lock(); - ns = get_pid_ns(task_active_pid_ns(task)); + ns = task_active_pid_ns(task); + if (ns) + get_pid_ns(ns); rcu_read_unlock(); return ns; -- cgit v1.2.3 From a756e3d4c71d3de28373076da2552260741f3d0d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:41 -0700 Subject: wait: fix reparent_leader() vs EXIT_DEAD->EXIT_ZOMBIE race commit dfccbb5e49a621c1b21a62527d61fc4305617aca upstream. wait_task_zombie() first does EXIT_ZOMBIE->EXIT_DEAD transition and drops tasklist_lock. If this task is not the natural child and it is traced, we change its state back to EXIT_ZOMBIE for ->real_parent. The last transition is racy, this is even documented in 50b8d257486a "ptrace: partially fix the do_wait(WEXITED) vs EXIT_DEAD->EXIT_ZOMBIE race". wait_consider_task() tries to detect this transition and clear ->notask_error but we can't rely on ptrace_reparented(), debugger can exit and do ptrace_unlink() before its sub-thread sets EXIT_ZOMBIE. And there is another problem which were missed before: this transition can also race with reparent_leader() which doesn't reset >exit_signal if EXIT_DEAD, assuming that this task must be reaped by someone else. So the tracee can be re-parented with ->exit_signal != SIGCHLD, and if /sbin/init doesn't use __WALL it becomes unreapable. Change reparent_leader() to update ->exit_signal even if EXIT_DEAD. Note: this is the simple temporary hack for -stable, it doesn't try to solve all problems, it will be reverted by the next changes. Signed-off-by: Oleg Nesterov Reported-by: Jan Kratochvil Reported-by: Michal Schmidt Tested-by: Michal Schmidt Cc: Al Viro Cc: Lennart Poettering Cc: Roland McGrath Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/exit.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 7bb73f9d09db..80495a1ed651 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -570,9 +570,6 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, struct list_head *dead) { list_move_tail(&p->sibling, &p->real_parent->children); - - if (p->exit_state == EXIT_DEAD) - return; /* * If this is a threaded reparent there is no need to * notify anyone anything has happened. @@ -580,9 +577,19 @@ static void reparent_leader(struct task_struct *father, struct task_struct *p, if (same_thread_group(p->real_parent, father)) return; - /* We don't want people slaying init. */ + /* + * We don't want people slaying init. + * + * Note: we do this even if it is EXIT_DEAD, wait_task_zombie() + * can change ->exit_state to EXIT_ZOMBIE. If this is the final + * state, do_notify_parent() was already called and ->exit_signal + * doesn't matter. + */ p->exit_signal = SIGCHLD; + if (p->exit_state == EXIT_DEAD) + return; + /* If it has exited notify the new parent about this child's death. */ if (!p->ptrace && p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) { -- cgit v1.2.3 From 1673978155f56ce8e1bcabacb9ee09464b5de728 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 7 Apr 2014 15:38:29 -0700 Subject: exit: call disassociate_ctty() before exit_task_namespaces() commit c39df5fa37b0623589508c95515b4aa1531c524e upstream. Commit 8aac62706ada ("move exit_task_namespaces() outside of exit_notify()") breaks pppd and the exiting service crashes the kernel: BUG: unable to handle kernel NULL pointer dereference at 0000000000000028 IP: ppp_register_channel+0x13/0x20 [ppp_generic] Call Trace: ppp_asynctty_open+0x12b/0x170 [ppp_async] tty_ldisc_open.isra.2+0x27/0x60 tty_ldisc_hangup+0x1e3/0x220 __tty_hangup+0x2c4/0x440 disassociate_ctty+0x61/0x270 do_exit+0x7f2/0xa50 ppp_register_channel() needs ->net_ns and current->nsproxy == NULL. Move disassociate_ctty() before exit_task_namespaces(), it doesn't make sense to delay it after perf_event_exit_task() or cgroup_exit(). This also allows to use task_work_add() inside the (nontrivial) code paths in disassociate_ctty(). Investigated by Peter Hurley. Signed-off-by: Oleg Nesterov Reported-by: Sree Harsha Totakura Cc: Peter Hurley Cc: Sree Harsha Totakura Cc: "Eric W. Biederman" Cc: Jeff Dike Cc: Ingo Molnar Cc: Andrey Vagin Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/exit.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 80495a1ed651..6682b2ea5b11 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -801,6 +801,8 @@ void do_exit(long code) exit_shm(tsk); exit_files(tsk); exit_fs(tsk); + if (group_dead) + disassociate_ctty(1); exit_task_namespaces(tsk); exit_task_work(tsk); check_stack_usage(); @@ -816,13 +818,9 @@ void do_exit(long code) cgroup_exit(tsk, 1); - if (group_dead) - disassociate_ctty(1); - module_put(task_thread_info(tsk)->exec_domain->module); proc_exit_connector(tsk); - /* * FIXME: do that only when needed, using sched_exit tracepoint */ -- cgit v1.2.3 From 82c647cbfc1f77afbaa199f8fe14a186eaa1bfe1 Mon Sep 17 00:00:00 2001 From: Liu Hua Date: Mon, 7 Apr 2014 15:38:57 -0700 Subject: hung_task: check the value of "sysctl_hung_task_timeout_sec" commit 80df28476505ed4e6701c3448c63c9229a50c655 upstream. As sysctl_hung_task_timeout_sec is unsigned long, when this value is larger then LONG_MAX/HZ, the function schedule_timeout_interruptible in watchdog will return immediately without sleep and with print : schedule_timeout: wrong timeout value ffffffffffffff83 and then the funtion watchdog will call schedule_timeout_interruptible again and again. The screen will be filled with "schedule_timeout: wrong timeout value ffffffffffffff83" This patch does some check and correction in sysctl, to let the function schedule_timeout_interruptible allways get the valid parameter. Signed-off-by: Liu Hua Tested-by: Satoru Takeuchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/sysctl.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 9edcf456e0fc..ed6c01626acd 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -144,6 +144,11 @@ static int min_percpu_pagelist_fract = 8; static int ngroups_max = NGROUPS_MAX; static const int cap_last_cap = CAP_LAST_CAP; +/*this is needed for proc_doulongvec_minmax of sysctl_hung_task_timeout_secs */ +#ifdef CONFIG_DETECT_HUNG_TASK +static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); +#endif + #ifdef CONFIG_INOTIFY_USER #include #endif @@ -966,6 +971,7 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned long), .mode = 0644, .proc_handler = proc_dohung_task_timeout_secs, + .extra2 = &hung_task_timeout_max, }, { .procname = "hung_task_warnings", -- cgit v1.2.3