summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile6
-rw-r--r--kernel/audit.c3
-rw-r--r--kernel/audit_watch.c2
-rw-r--r--kernel/bpf/syscall.c4
-rw-r--r--kernel/cgroup.c4
-rw-r--r--kernel/cpu_pm.c4
-rw-r--r--kernel/debug/debug_core.c27
-rw-r--r--kernel/debug/kdb/kdb_io.c8
-rw-r--r--kernel/debug/kdb/kdb_private.h2
-rw-r--r--kernel/elfcore.c25
-rw-r--r--kernel/events/core.c27
-rw-r--r--kernel/events/internal.h2
-rw-r--r--kernel/events/uprobes.c16
-rw-r--r--kernel/exit.c30
-rw-r--r--kernel/fork.c50
-rw-r--r--kernel/futex.c1167
-rw-r--r--kernel/futex_compat.c201
-rw-r--r--kernel/gcov/fs.c2
-rw-r--r--kernel/gcov/gcc_4_7.c4
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/irq/migration.c26
-rw-r--r--kernel/kexec_file.c4
-rw-r--r--kernel/kmod.c9
-rw-r--r--kernel/kprobes.c48
-rw-r--r--kernel/locking/lockdep_proc.c2
-rw-r--r--kernel/locking/rtmutex-debug.c9
-rw-r--r--kernel/locking/rtmutex-debug.h3
-rw-r--r--kernel/locking/rtmutex.c292
-rw-r--r--kernel/locking/rtmutex.h2
-rw-r--r--kernel/locking/rtmutex_common.h14
-rw-r--r--kernel/module.c27
-rw-r--r--kernel/padata.c88
-rw-r--r--kernel/power/hibernate.c11
-rw-r--r--kernel/printk/printk.c3
-rw-r--r--kernel/profile.c21
-rw-r--r--kernel/ptrace.c18
-rw-r--r--kernel/reboot.c28
-rw-r--r--kernel/sched/core.c8
-rw-r--r--kernel/sched/fair.c43
-rw-r--r--kernel/sched/rt.c1
-rw-r--r--kernel/sys.c11
-rw-r--r--kernel/sysctl.c2
-rw-r--r--kernel/time/posix-clock.c31
-rw-r--r--kernel/time/timer.c1
-rw-r--r--kernel/trace/blktrace.c212
-rw-r--r--kernel/trace/bpf_trace.c4
-rw-r--r--kernel/trace/ftrace.c28
-rw-r--r--kernel/trace/ring_buffer.c94
-rw-r--r--kernel/trace/trace.c88
-rw-r--r--kernel/trace/trace.h68
-rw-r--r--kernel/trace/trace_clock.c44
-rw-r--r--kernel/trace/trace_entries.h2
-rw-r--r--kernel/trace/trace_events.c5
-rw-r--r--kernel/trace/trace_events_trigger.c21
-rw-r--r--kernel/trace/trace_functions.c2
-rw-r--r--kernel/trace/trace_selftest.c9
-rw-r--r--kernel/tracepoint.c80
-rw-r--r--kernel/workqueue.c35
58 files changed, 1903 insertions, 1079 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 53abf008ecb3..8b73d57804f2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -36,9 +36,6 @@ obj-$(CONFIG_PROFILING) += profile.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
obj-$(CONFIG_FUTEX) += futex.o
-ifeq ($(CONFIG_COMPAT),y)
-obj-$(CONFIG_FUTEX) += futex_compat.o
-endif
obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
obj-$(CONFIG_SMP) += smp.o
ifneq ($(CONFIG_SMP),y)
@@ -80,9 +77,6 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
obj-$(CONFIG_LATENCYTOP) += latencytop.o
-obj-$(CONFIG_BINFMT_ELF) += elfcore.o
-obj-$(CONFIG_COMPAT_BINFMT_ELF) += elfcore.o
-obj-$(CONFIG_BINFMT_ELF_FDPIC) += elfcore.o
obj-$(CONFIG_FUNCTION_TRACER) += trace/
obj-$(CONFIG_TRACING) += trace/
obj-$(CONFIG_TRACE_CLOCK) += trace/
diff --git a/kernel/audit.c b/kernel/audit.c
index 84c445db5fe1..b685672def35 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -919,6 +919,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
case AUDIT_FIRST_USER_MSG2 ... AUDIT_LAST_USER_MSG2:
if (!audit_enabled && msg_type != AUDIT_USER_AVC)
return 0;
+ /* exit early if there isn't at least one character to print */
+ if (data_len < 2)
+ return -EINVAL;
err = audit_filter_user(msg_type);
if (err == 1) { /* match or error */
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index f45a9a5d3e47..af453f3c2b3d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -316,8 +316,6 @@ static void audit_update_watch(struct audit_parent *parent,
if (oentry->rule.exe)
audit_remove_mark(oentry->rule.exe);
- audit_watch_log_rule_change(r, owatch, "updated_rules");
-
call_rcu(&oentry->rcu, audit_free_rule_rcu);
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index fd3fd8d17ef5..01431ef8cf07 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -152,7 +152,7 @@ static int map_create(union bpf_attr *attr)
err = bpf_map_charge_memlock(map);
if (err)
- goto free_map;
+ goto free_map_nouncharge;
err = bpf_map_new_fd(map);
if (err < 0)
@@ -162,6 +162,8 @@ static int map_create(union bpf_attr *attr)
return err;
free_map:
+ bpf_map_uncharge_memlock(map);
+free_map_nouncharge:
map->ops->map_free(map);
return err;
}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7a7c535f8a2f..1f5e7dcbfd40 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3310,6 +3310,10 @@ static int cgroup_rename(struct kernfs_node *kn, struct kernfs_node *new_parent,
struct cgroup *cgrp = kn->priv;
int ret;
+ /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */
+ if (strchr(new_name_str, '\n'))
+ return -EINVAL;
+
if (kernfs_type(kn) != KERNFS_DIR)
return -ENOTDIR;
if (kn->parent != new_parent)
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
index 009cc9a17d95..f1042d639eee 100644
--- a/kernel/cpu_pm.c
+++ b/kernel/cpu_pm.c
@@ -97,7 +97,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_unregister_notifier);
*/
int cpu_pm_enter(void)
{
- int nr_calls;
+ int nr_calls = 0;
int ret = 0;
read_lock(&cpu_pm_notifier_lock);
@@ -156,7 +156,7 @@ EXPORT_SYMBOL_GPL(cpu_pm_exit);
*/
int cpu_cluster_pm_enter(void)
{
- int nr_calls;
+ int nr_calls = 0;
int ret = 0;
read_lock(&cpu_pm_notifier_lock);
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 79517e5549f1..bc791cec58e6 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -94,14 +94,6 @@ int dbg_switch_cpu;
/* Use kdb or gdbserver mode */
int dbg_kdb_mode = 1;
-static int __init opt_kgdb_con(char *str)
-{
- kgdb_use_con = 1;
- return 0;
-}
-
-early_param("kgdbcon", opt_kgdb_con);
-
module_param(kgdb_use_con, int, 0644);
module_param(kgdbreboot, int, 0644);
@@ -443,6 +435,7 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
if (exception_level > 1) {
dump_stack();
+ kgdb_io_module_registered = false;
panic("Recursive entry to debugger");
}
@@ -487,6 +480,7 @@ static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
arch_kgdb_ops.disable_hw_break(regs);
acquirelock:
+ rcu_read_lock();
/*
* Interrupts will be restored by the 'trap return' code, except when
* single stepping.
@@ -541,6 +535,7 @@ return_normal:
atomic_dec(&slaves_in_kgdb);
dbg_touch_watchdogs();
local_irq_restore(flags);
+ rcu_read_unlock();
return 0;
}
cpu_relax();
@@ -559,6 +554,7 @@ return_normal:
raw_spin_unlock(&dbg_master_lock);
dbg_touch_watchdogs();
local_irq_restore(flags);
+ rcu_read_unlock();
goto acquirelock;
}
@@ -676,6 +672,7 @@ kgdb_restore:
raw_spin_unlock(&dbg_master_lock);
dbg_touch_watchdogs();
local_irq_restore(flags);
+ rcu_read_unlock();
return kgdb_info[cpu].ret_state;
}
@@ -806,6 +803,20 @@ static struct console kgdbcons = {
.index = -1,
};
+static int __init opt_kgdb_con(char *str)
+{
+ kgdb_use_con = 1;
+
+ if (kgdb_io_module_registered && !kgdb_con_registered) {
+ register_console(&kgdbcons);
+ kgdb_con_registered = 1;
+ }
+
+ return 0;
+}
+
+early_param("kgdbcon", opt_kgdb_con);
+
#ifdef CONFIG_MAGIC_SYSRQ
static void sysrq_handle_dbg(int key)
{
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index cc892a9e109d..ae39b014b7d6 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -683,12 +683,16 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap)
size_avail = sizeof(kdb_buffer) - len;
goto kdb_print_out;
}
- if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH)
+ if (kdb_grepping_flag >= KDB_GREPPING_FLAG_SEARCH) {
/*
* This was a interactive search (using '/' at more
- * prompt) and it has completed. Clear the flag.
+ * prompt) and it has completed. Replace the \0 with
+ * its original value to ensure multi-line strings
+ * are handled properly, and return to normal mode.
*/
+ *cphold = replaced_byte;
kdb_grepping_flag = 0;
+ }
/*
* at this point the string is a full line and
* should be printed, up to the null.
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index 533e04e75a9c..f51b762d6886 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -234,7 +234,7 @@ extern struct task_struct *kdb_curr_task(int);
#define kdb_do_each_thread(g, p) do_each_thread(g, p)
#define kdb_while_each_thread(g, p) while_each_thread(g, p)
-#define GFP_KDB (in_interrupt() ? GFP_ATOMIC : GFP_KERNEL)
+#define GFP_KDB (in_dbg_master() ? GFP_ATOMIC : GFP_KERNEL)
extern void *debug_kmalloc(size_t size, gfp_t flags);
extern void debug_kfree(void *);
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
deleted file mode 100644
index a2b29b9bdfcb..000000000000
--- a/kernel/elfcore.c
+++ /dev/null
@@ -1,25 +0,0 @@
-#include <linux/elf.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/binfmts.h>
-#include <linux/elfcore.h>
-
-Elf_Half __weak elf_core_extra_phdrs(void)
-{
- return 0;
-}
-
-int __weak elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset)
-{
- return 1;
-}
-
-int __weak elf_core_write_extra_data(struct coredump_params *cprm)
-{
- return 1;
-}
-
-size_t __weak elf_core_extra_data_size(void)
-{
- return 0;
-}
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 203384a71fee..ee75563b724f 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -3497,7 +3497,9 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
ctx = &cpuctx->ctx;
get_ctx(ctx);
+ raw_spin_lock_irqsave(&ctx->lock, flags);
++ctx->pin_count;
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
return ctx;
}
@@ -3940,7 +3942,9 @@ EXPORT_SYMBOL_GPL(perf_event_read_value);
static int __perf_read_group_add(struct perf_event *leader,
u64 read_format, u64 *values)
{
+ struct perf_event_context *ctx = leader->ctx;
struct perf_event *sub;
+ unsigned long flags;
int n = 1; /* skip @nr */
int ret;
@@ -3970,12 +3974,15 @@ static int __perf_read_group_add(struct perf_event *leader,
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(leader);
+ raw_spin_lock_irqsave(&ctx->lock, flags);
+
list_for_each_entry(sub, &leader->sibling_list, group_entry) {
values[n++] += perf_event_count(sub);
if (read_format & PERF_FORMAT_ID)
values[n++] = primary_event_id(sub);
}
+ raw_spin_unlock_irqrestore(&ctx->lock, flags);
return 0;
}
@@ -4659,11 +4666,11 @@ static void perf_mmap_open(struct vm_area_struct *vma)
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
-
struct ring_buffer *rb = ring_buffer_get(event);
struct user_struct *mmap_user = rb->mmap_user;
int mmap_locked = rb->mmap_locked;
unsigned long size = perf_data_size(rb);
+ bool detach_rest = false;
if (event->pmu->event_unmapped)
event->pmu->event_unmapped(event);
@@ -4682,7 +4689,8 @@ static void perf_mmap_close(struct vm_area_struct *vma)
mutex_unlock(&event->mmap_mutex);
}
- atomic_dec(&rb->mmap_count);
+ if (atomic_dec_and_test(&rb->mmap_count))
+ detach_rest = true;
if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
goto out_put;
@@ -4691,7 +4699,7 @@ static void perf_mmap_close(struct vm_area_struct *vma)
mutex_unlock(&event->mmap_mutex);
/* If there's still other mmap()s of this buffer, we're done. */
- if (atomic_read(&rb->mmap_count))
+ if (!detach_rest)
goto out_put;
/*
@@ -5810,10 +5818,17 @@ static void perf_event_task_output(struct perf_event *event,
goto out;
task_event->event_id.pid = perf_event_pid(event, task);
- task_event->event_id.ppid = perf_event_pid(event, current);
-
task_event->event_id.tid = perf_event_tid(event, task);
- task_event->event_id.ptid = perf_event_tid(event, current);
+
+ if (task_event->event_id.header.type == PERF_RECORD_EXIT) {
+ task_event->event_id.ppid = perf_event_pid(event,
+ task->real_parent);
+ task_event->event_id.ptid = perf_event_pid(event,
+ task->real_parent);
+ } else { /* PERF_RECORD_FORK */
+ task_event->event_id.ppid = perf_event_pid(event, current);
+ task_event->event_id.ptid = perf_event_tid(event, current);
+ }
task_event->event_id.time = perf_event_clock(event);
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 2bbad9c1274c..8baa3121e7a6 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -193,7 +193,7 @@ static inline int get_recursion_context(int *recursion)
rctx = 3;
else if (in_irq())
rctx = 2;
- else if (in_softirq())
+ else if (in_serving_softirq())
rctx = 1;
else
rctx = 0;
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 8cad3cd92e23..d937fbbc3642 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -602,10 +602,6 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
if (ret)
goto out;
- /* uprobe_write_opcode() assumes we don't cross page boundary */
- BUG_ON((uprobe->offset & ~PAGE_MASK) +
- UPROBE_SWBP_INSN_SIZE > PAGE_SIZE);
-
smp_wmb(); /* pairs with the smp_rmb() in handle_swbp() */
set_bit(UPROBE_COPY_INSN, &uprobe->flags);
@@ -884,6 +880,13 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
if (offset > i_size_read(inode))
return -EINVAL;
+ /*
+ * This ensures that copy_from_page() and copy_to_page()
+ * can't cross page boundary.
+ */
+ if (!IS_ALIGNED(offset, UPROBE_SWBP_INSN_SIZE))
+ return -EINVAL;
+
retry:
uprobe = alloc_uprobe(inode, offset);
if (!uprobe)
@@ -1692,6 +1695,9 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
uprobe_opcode_t opcode;
int result;
+ if (WARN_ON_ONCE(!IS_ALIGNED(vaddr, UPROBE_SWBP_INSN_SIZE)))
+ return -EINVAL;
+
pagefault_disable();
result = __copy_from_user_inatomic(&opcode, (void __user*)vaddr,
sizeof(opcode));
@@ -1869,7 +1875,7 @@ static void handle_swbp(struct pt_regs *regs)
if (!uprobe) {
if (is_swbp > 0) {
/* No matching uprobe; signal SIGTRAP. */
- send_sig(SIGTRAP, current, 0);
+ force_sig(SIGTRAP, current);
} else {
/*
* Either we raced with uprobe_unregister() or we can't
diff --git a/kernel/exit.c b/kernel/exit.c
index 03f6722302b5..8d3c268fb1b8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -389,7 +389,7 @@ static void exit_mm(struct task_struct *tsk)
struct mm_struct *mm = tsk->mm;
struct core_state *core_state;
- mm_release(tsk, mm);
+ exit_mm_release(tsk, mm);
if (!mm)
return;
sync_mm_rss(mm);
@@ -408,7 +408,10 @@ static void exit_mm(struct task_struct *tsk)
up_read(&mm->mmap_sem);
self.task = tsk;
- self.next = xchg(&core_state->dumper.next, &self);
+ if (self.task->flags & PF_SIGNALED)
+ self.next = xchg(&core_state->dumper.next, &self);
+ else
+ self.task = NULL;
/*
* Implies mb(), the result of xchg() must be visible
* to core_state->dumper.
@@ -692,27 +695,12 @@ void do_exit(long code)
*/
if (unlikely(tsk->flags & PF_EXITING)) {
pr_alert("Fixing recursive fault but reboot is needed!\n");
- /*
- * We can do this unlocked here. The futex code uses
- * this flag just to verify whether the pi state
- * cleanup has been done or not. In the worst case it
- * loops once more. We pretend that the cleanup was
- * done as there is no way to return. Either the
- * OWNER_DIED bit is set by now or we push the blocked
- * task into the wait for ever nirwana as well.
- */
- tsk->flags |= PF_EXITPIDONE;
+ futex_exit_recursive(tsk);
set_current_state(TASK_UNINTERRUPTIBLE);
schedule();
}
exit_signals(tsk); /* sets PF_EXITING */
- /*
- * tsk->flags are checked in the futex code to protect against
- * an exiting task cleaning up the robust pi futexes.
- */
- smp_mb();
- raw_spin_unlock_wait(&tsk->pi_lock);
if (unlikely(in_atomic())) {
pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -790,12 +778,6 @@ void do_exit(long code)
* Make sure we are holding no locks:
*/
debug_check_no_locks_held();
- /*
- * We can do this unlocked here. The futex code uses this flag
- * just to verify whether the pi state cleanup has been done
- * or not. In the worst case it loops once more.
- */
- tsk->flags |= PF_EXITPIDONE;
if (tsk->io_context)
exit_io_context(tsk);
diff --git a/kernel/fork.c b/kernel/fork.c
index a6dc6b3f6a01..2bd4c38efa09 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -887,24 +887,8 @@ static int wait_for_vfork_done(struct task_struct *child,
* restoring the old one. . .
* Eric Biederman 10 January 1998
*/
-void mm_release(struct task_struct *tsk, struct mm_struct *mm)
+static void mm_release(struct task_struct *tsk, struct mm_struct *mm)
{
- /* Get rid of any futexes when releasing the mm */
-#ifdef CONFIG_FUTEX
- if (unlikely(tsk->robust_list)) {
- exit_robust_list(tsk);
- tsk->robust_list = NULL;
- }
-#ifdef CONFIG_COMPAT
- if (unlikely(tsk->compat_robust_list)) {
- compat_exit_robust_list(tsk);
- tsk->compat_robust_list = NULL;
- }
-#endif
- if (unlikely(!list_empty(&tsk->pi_state_list)))
- exit_pi_state_list(tsk);
-#endif
-
uprobe_free_utask(tsk);
/* Get rid of any cached register state */
@@ -937,6 +921,18 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
complete_vfork_done(tsk);
}
+void exit_mm_release(struct task_struct *tsk, struct mm_struct *mm)
+{
+ futex_exit_release(tsk);
+ mm_release(tsk, mm);
+}
+
+void exec_mm_release(struct task_struct *tsk, struct mm_struct *mm)
+{
+ futex_exec_release(tsk);
+ mm_release(tsk, mm);
+}
+
/*
* Allocate a new mm structure and copy contents from the
* mm structure of the passed in task structure.
@@ -1511,14 +1507,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#ifdef CONFIG_BLOCK
p->plug = NULL;
#endif
-#ifdef CONFIG_FUTEX
- p->robust_list = NULL;
-#ifdef CONFIG_COMPAT
- p->compat_robust_list = NULL;
-#endif
- INIT_LIST_HEAD(&p->pi_state_list);
- p->pi_state_cache = NULL;
-#endif
+ futex_init_task(p);
+
/*
* sigaltstack should be cleared when sharing the same VM
*/
@@ -1539,14 +1529,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
/* ok, now we should be set up.. */
p->pid = pid_nr(pid);
if (clone_flags & CLONE_THREAD) {
- p->exit_signal = -1;
p->group_leader = current->group_leader;
p->tgid = current->tgid;
} else {
- if (clone_flags & CLONE_PARENT)
- p->exit_signal = current->group_leader->exit_signal;
- else
- p->exit_signal = (clone_flags & CSIGNAL);
p->group_leader = p;
p->tgid = p->pid;
}
@@ -1591,9 +1576,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
p->real_parent = current->real_parent;
p->parent_exec_id = current->parent_exec_id;
+ if (clone_flags & CLONE_THREAD)
+ p->exit_signal = -1;
+ else
+ p->exit_signal = current->group_leader->exit_signal;
} else {
p->real_parent = current;
p->parent_exec_id = current->self_exec_id;
+ p->exit_signal = (clone_flags & CSIGNAL);
}
spin_lock(&current->sighand->siglock);
diff --git a/kernel/futex.c b/kernel/futex.c
index e50b67674ba2..6d47b7dc1cfb 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -44,6 +44,7 @@
* along with this program; if not, write to the Free Software
* Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
+#include <linux/compat.h>
#include <linux/slab.h>
#include <linux/poll.h>
#include <linux/fs.h>
@@ -171,8 +172,10 @@
* double_lock_hb() and double_unlock_hb(), respectively.
*/
-#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
-int __read_mostly futex_cmpxchg_enabled;
+#ifdef CONFIG_HAVE_FUTEX_CMPXCHG
+#define futex_cmpxchg_enabled 1
+#else
+static int __read_mostly futex_cmpxchg_enabled;
#endif
/*
@@ -328,6 +331,12 @@ static inline bool should_fail_futex(bool fshared)
}
#endif /* CONFIG_FAIL_FUTEX */
+#ifdef CONFIG_COMPAT
+static void compat_exit_robust_list(struct task_struct *curr);
+#else
+static inline void compat_exit_robust_list(struct task_struct *curr) { }
+#endif
+
static inline void futex_get_mm(union futex_key *key)
{
atomic_inc(&key->private.mm->mm_count);
@@ -816,7 +825,7 @@ static int refill_pi_state_cache(void)
return 0;
}
-static struct futex_pi_state * alloc_pi_state(void)
+static struct futex_pi_state *alloc_pi_state(void)
{
struct futex_pi_state *pi_state = current->pi_state_cache;
@@ -826,10 +835,41 @@ static struct futex_pi_state * alloc_pi_state(void)
return pi_state;
}
+static void pi_state_update_owner(struct futex_pi_state *pi_state,
+ struct task_struct *new_owner)
+{
+ struct task_struct *old_owner = pi_state->owner;
+
+ lockdep_assert_held(&pi_state->pi_mutex.wait_lock);
+
+ if (old_owner) {
+ raw_spin_lock(&old_owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ raw_spin_unlock(&old_owner->pi_lock);
+ }
+
+ if (new_owner) {
+ raw_spin_lock(&new_owner->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &new_owner->pi_state_list);
+ pi_state->owner = new_owner;
+ raw_spin_unlock(&new_owner->pi_lock);
+ }
+}
+
+static void get_pi_state(struct futex_pi_state *pi_state)
+{
+ WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
+}
+
/*
+ * Drops a reference to the pi_state object and frees or caches it
+ * when the last reference is gone.
+ *
* Must be called with the hb lock held.
*/
-static void free_pi_state(struct futex_pi_state *pi_state)
+static void put_pi_state(struct futex_pi_state *pi_state)
{
if (!pi_state)
return;
@@ -842,11 +882,10 @@ static void free_pi_state(struct futex_pi_state *pi_state)
* and has cleaned up the pi_state already
*/
if (pi_state->owner) {
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
- list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
-
- rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ pi_state_update_owner(pi_state, NULL);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ rt_mutex_proxy_unlock(&pi_state->pi_mutex);
}
if (current->pi_state_cache)
@@ -867,7 +906,7 @@ static void free_pi_state(struct futex_pi_state *pi_state)
* Look up the task based on what TID userspace gave us.
* We dont trust it.
*/
-static struct task_struct * futex_find_get_task(pid_t pid)
+static struct task_struct *futex_find_get_task(pid_t pid)
{
struct task_struct *p;
@@ -886,7 +925,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
* Kernel cleans up PI-state, but userspace is likely hosed.
* (Robust-futex cleanup is separate and might save the day for userspace.)
*/
-void exit_pi_state_list(struct task_struct *curr)
+static void exit_pi_state_list(struct task_struct *curr)
{
struct list_head *next, *head = &curr->pi_state_list;
struct futex_pi_state *pi_state;
@@ -927,10 +966,12 @@ void exit_pi_state_list(struct task_struct *curr)
pi_state->owner = NULL;
raw_spin_unlock_irq(&curr->pi_lock);
- rt_mutex_unlock(&pi_state->pi_mutex);
-
+ get_pi_state(pi_state);
spin_unlock(&hb->lock);
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+
raw_spin_lock_irq(&curr->pi_lock);
}
raw_spin_unlock_irq(&curr->pi_lock);
@@ -983,7 +1024,41 @@ void exit_pi_state_list(struct task_struct *curr)
* FUTEX_OWNER_DIED bit. See [4]
*
* [10] There is no transient state which leaves owner and user space
- * TID out of sync.
+ * TID out of sync. Except one error case where the kernel is denied
+ * write access to the user address, see fixup_pi_state_owner().
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ * hb -> futex_q, relation
+ * futex_q -> pi_state, relation
+ *
+ * (cannot be raw because hb can contain arbitrary amount
+ * of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ * {uval, pi_state}
+ *
+ * (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ * p->pi_state_list -> pi_state->list, relation
+ *
+ * pi_state->refcount:
+ *
+ * pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ * hb->lock
+ * pi_mutex->wait_lock
+ * p->pi_lock
+ *
*/
/*
@@ -991,10 +1066,12 @@ void exit_pi_state_list(struct task_struct *curr)
* the pi_state against the user space value. If correct, attach to
* it.
*/
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
struct futex_pi_state **ps)
{
pid_t pid = uval & FUTEX_TID_MASK;
+ int ret, uval2;
/*
* Userspace might have messed up non-PI and PI futexes [3]
@@ -1002,9 +1079,39 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
if (unlikely(!pi_state))
return -EINVAL;
+ /*
+ * We get here with hb->lock held, and having found a
+ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+ * which in turn means that futex_lock_pi() still has a reference on
+ * our pi_state.
+ *
+ * The waiter holding a reference on @pi_state also protects against
+ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
+ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
+ * free pi_state before we can take a reference ourselves.
+ */
WARN_ON(!atomic_read(&pi_state->refcount));
/*
+ * Now that we have a pi_state, we can acquire wait_lock
+ * and do the state validation.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Since {uval, pi_state} is serialized by wait_lock, and our current
+ * uval was read without holding it, it can have changed. Verify it
+ * still is what we expect it to be, otherwise retry the entire
+ * operation.
+ */
+ if (get_futex_value_locked(&uval2, uaddr))
+ goto out_efault;
+
+ if (uval != uval2)
+ goto out_eagain;
+
+ /*
* Handle the owner died case:
*/
if (uval & FUTEX_OWNER_DIED) {
@@ -1019,11 +1126,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* is not 0. Inconsistent state. [5]
*/
if (pid)
- return -EINVAL;
+ goto out_einval;
/*
* Take a ref on the state and return success. [4]
*/
- goto out_state;
+ goto out_attach;
}
/*
@@ -1035,14 +1142,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* Take a ref on the state and return success. [6]
*/
if (!pid)
- goto out_state;
+ goto out_attach;
} else {
/*
* If the owner died bit is not set, then the pi_state
* must have an owner. [7]
*/
if (!pi_state->owner)
- return -EINVAL;
+ goto out_einval;
}
/*
@@ -1051,19 +1158,124 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* user space TID. [9/10]
*/
if (pid != task_pid_vnr(pi_state->owner))
- return -EINVAL;
-out_state:
- atomic_inc(&pi_state->refcount);
+ goto out_einval;
+
+out_attach:
+ get_pi_state(pi_state);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
*ps = pi_state;
return 0;
+
+out_einval:
+ ret = -EINVAL;
+ goto out_error;
+
+out_eagain:
+ ret = -EAGAIN;
+ goto out_error;
+
+out_efault:
+ ret = -EFAULT;
+ goto out_error;
+
+out_error:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
+}
+
+/**
+ * wait_for_owner_exiting - Block until the owner has exited
+ * @exiting: Pointer to the exiting task
+ *
+ * Caller must hold a refcount on @exiting.
+ */
+static void wait_for_owner_exiting(int ret, struct task_struct *exiting)
+{
+ if (ret != -EBUSY) {
+ WARN_ON_ONCE(exiting);
+ return;
+ }
+
+ if (WARN_ON_ONCE(ret == -EBUSY && !exiting))
+ return;
+
+ mutex_lock(&exiting->futex_exit_mutex);
+ /*
+ * No point in doing state checking here. If the waiter got here
+ * while the task was in exec()->exec_futex_release() then it can
+ * have any FUTEX_STATE_* value when the waiter has acquired the
+ * mutex. OK, if running, EXITING or DEAD if it reached exit()
+ * already. Highly unlikely and not a problem. Just one more round
+ * through the futex maze.
+ */
+ mutex_unlock(&exiting->futex_exit_mutex);
+
+ put_task_struct(exiting);
+}
+
+static int handle_exit_race(u32 __user *uaddr, u32 uval,
+ struct task_struct *tsk)
+{
+ u32 uval2;
+
+ /*
+ * If the futex exit state is not yet FUTEX_STATE_DEAD, tell the
+ * caller that the alleged owner is busy.
+ */
+ if (tsk && tsk->futex_state != FUTEX_STATE_DEAD)
+ return -EBUSY;
+
+ /*
+ * Reread the user space value to handle the following situation:
+ *
+ * CPU0 CPU1
+ *
+ * sys_exit() sys_futex()
+ * do_exit() futex_lock_pi()
+ * futex_lock_pi_atomic()
+ * exit_signals(tsk) No waiters:
+ * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
+ * mm_release(tsk) Set waiter bit
+ * exit_robust_list(tsk) { *uaddr = 0x80000PID;
+ * Set owner died attach_to_pi_owner() {
+ * *uaddr = 0xC0000000; tsk = get_task(PID);
+ * } if (!tsk->flags & PF_EXITING) {
+ * ... attach();
+ * tsk->futex_state = } else {
+ * FUTEX_STATE_DEAD; if (tsk->futex_state !=
+ * FUTEX_STATE_DEAD)
+ * return -EAGAIN;
+ * return -ESRCH; <--- FAIL
+ * }
+ *
+ * Returning ESRCH unconditionally is wrong here because the
+ * user space value has been changed by the exiting task.
+ *
+ * The same logic applies to the case where the exiting task is
+ * already gone.
+ */
+ if (get_futex_value_locked(&uval2, uaddr))
+ return -EFAULT;
+
+ /* If the user space value has changed, try again. */
+ if (uval2 != uval)
+ return -EAGAIN;
+
+ /*
+ * The exiting task did not have a robust list, the robust list was
+ * corrupted or the user space value in *uaddr is simply bogus.
+ * Give up and tell user space.
+ */
+ return -ESRCH;
}
/*
* Lookup the task for the TID provided from user space and attach to
* it after doing proper sanity checks.
*/
-static int attach_to_pi_owner(u32 uval, union futex_key *key,
- struct futex_pi_state **ps)
+static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
+ struct futex_pi_state **ps,
+ struct task_struct **exiting)
{
pid_t pid = uval & FUTEX_TID_MASK;
struct futex_pi_state *pi_state;
@@ -1072,12 +1284,15 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
/*
* We are the first waiter - try to look up the real owner and attach
* the new pi_state to it, but bail out when TID = 0 [1]
+ *
+ * The !pid check is paranoid. None of the call sites should end up
+ * with pid == 0, but better safe than sorry. Let the caller retry
*/
if (!pid)
- return -ESRCH;
+ return -EAGAIN;
p = futex_find_get_task(pid);
if (!p)
- return -ESRCH;
+ return handle_exit_race(uaddr, uval, NULL);
if (unlikely(p->flags & PF_KTHREAD)) {
put_task_struct(p);
@@ -1085,27 +1300,41 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
}
/*
- * We need to look at the task state flags to figure out,
- * whether the task is exiting. To protect against the do_exit
- * change of the task flags, we do this protected by
- * p->pi_lock:
+ * We need to look at the task state to figure out, whether the
+ * task is exiting. To protect against the change of the task state
+ * in futex_exit_release(), we do this protected by p->pi_lock:
*/
raw_spin_lock_irq(&p->pi_lock);
- if (unlikely(p->flags & PF_EXITING)) {
+ if (unlikely(p->futex_state != FUTEX_STATE_OK)) {
/*
- * The task is on the way out. When PF_EXITPIDONE is
- * set, we know that the task has finished the
- * cleanup:
+ * The task is on the way out. When the futex state is
+ * FUTEX_STATE_DEAD, we know that the task has finished
+ * the cleanup:
*/
- int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+ int ret = handle_exit_race(uaddr, uval, p);
raw_spin_unlock_irq(&p->pi_lock);
- put_task_struct(p);
+ /*
+ * If the owner task is between FUTEX_STATE_EXITING and
+ * FUTEX_STATE_DEAD then store the task pointer and keep
+ * the reference on the task struct. The calling code will
+ * drop all locks, wait for the task to reach
+ * FUTEX_STATE_DEAD and then drop the refcount. This is
+ * required to prevent a live lock when the current task
+ * preempted the exiting task between the two states.
+ */
+ if (ret == -EBUSY)
+ *exiting = p;
+ else
+ put_task_struct(p);
return ret;
}
/*
* No existing pi state. First waiter. [2]
+ *
+ * This creates pi_state, we have hb->lock held, this means nothing can
+ * observe this state, wait_lock is irrelevant.
*/
pi_state = alloc_pi_state();
@@ -1130,8 +1359,10 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
return 0;
}
-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
- union futex_key *key, struct futex_pi_state **ps)
+static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_hash_bucket *hb,
+ union futex_key *key, struct futex_pi_state **ps,
+ struct task_struct **exiting)
{
struct futex_q *match = futex_top_waiter(hb, key);
@@ -1140,13 +1371,13 @@ static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
* attach to the pi_state when the validation succeeds.
*/
if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ return attach_to_pi_state(uaddr, uval, match->pi_state, ps);
/*
* We are the first waiter - try to look up the owner based on
* @uval and attach to it.
*/
- return attach_to_pi_owner(uval, key, ps);
+ return attach_to_pi_owner(uaddr, uval, key, ps, exiting);
}
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
@@ -1159,7 +1390,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
- /*If user space value changed, let the caller retry */
+ /* If user space value changed, let the caller retry */
return curval != uval ? -EAGAIN : 0;
}
@@ -1172,6 +1403,8 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
* lookup
* @task: the task to perform the atomic lock work for. This will
* be "current" except in the case of requeue pi.
+ * @exiting: Pointer to store the task pointer of the owner task
+ * which is in the middle of exiting
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
* Return:
@@ -1180,11 +1413,17 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
* <0 - error
*
* The hb->lock and futex_key refs shall be held by the caller.
+ *
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
*/
static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
union futex_key *key,
struct futex_pi_state **ps,
- struct task_struct *task, int set_waiters)
+ struct task_struct *task,
+ struct task_struct **exiting,
+ int set_waiters)
{
u32 uval, newval, vpid = task_pid_vnr(task);
struct futex_q *match;
@@ -1215,7 +1454,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
*/
match = futex_top_waiter(hb, key);
if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ return attach_to_pi_state(uaddr, uval, match->pi_state, ps);
/*
* No waiter and user TID is 0. We are here because the
@@ -1254,7 +1493,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
* attach to the owner. If that fails, no harm done, we only
* set the FUTEX_WAITERS bit in the user space variable.
*/
- return attach_to_pi_owner(uval, key, ps);
+ return attach_to_pi_owner(uaddr, newval, key, ps, exiting);
}
/**
@@ -1305,41 +1544,35 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
q->lock_ptr = NULL;
}
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
- struct futex_hash_bucket *hb)
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
- struct task_struct *new_owner;
- struct futex_pi_state *pi_state = this->pi_state;
u32 uninitialized_var(curval), newval;
+ struct task_struct *new_owner;
+ bool deboost = false;
WAKE_Q(wake_q);
- bool deboost;
int ret = 0;
- if (!pi_state)
- return -EINVAL;
-
- /*
- * If current does not own the pi_state then the futex is
- * inconsistent and user space fiddled with the futex value.
- */
- if (pi_state->owner != current)
- return -EINVAL;
-
- raw_spin_lock(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+ if (WARN_ON_ONCE(!new_owner)) {
+ /*
+ * As per the comment in futex_unlock_pi() this should not happen.
+ *
+ * When this happens, give up our locks and try again, giving
+ * the futex_lock_pi() instance time to complete, either by
+ * waiting on the rtmutex or removing itself from the futex
+ * queue.
+ */
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
/*
- * It is possible that the next waiter (the one that brought
- * this owner to the kernel) timed out and is no longer
- * waiting on the lock.
- */
- if (!new_owner)
- new_owner = this->task;
-
- /*
- * We pass it to the next owner. The WAITERS bit is always
- * kept enabled while there is PI state around. We cleanup the
- * owner died bit, because we are the owner.
+ * We pass it to the next owner. The WAITERS bit is always kept
+ * enabled while there is PI state around. We cleanup the owner
+ * died bit, because we are the owner.
*/
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
@@ -1348,6 +1581,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
ret = -EFAULT;
+
} else if (curval != uval) {
/*
* If a unconditional UNLOCK_PI operation (user space did not
@@ -1360,38 +1594,26 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
else
ret = -EINVAL;
}
- if (ret) {
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
- return ret;
- }
-
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
- WARN_ON(list_empty(&pi_state->list));
- list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
-
- raw_spin_lock_irq(&new_owner->pi_lock);
- WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &new_owner->pi_state_list);
- pi_state->owner = new_owner;
- raw_spin_unlock_irq(&new_owner->pi_lock);
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ if (!ret) {
+ /*
+ * This is a point of no return; once we modified the uval
+ * there is no going back and subsequent operations must
+ * not fail.
+ */
+ pi_state_update_owner(pi_state, new_owner);
+ deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+ }
- deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+out_unlock:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- /*
- * First unlock HB so the waiter does not spin on it once he got woken
- * up. Second wake up the waiter before the priority is adjusted. If we
- * deboost first (and lose our higher priority), then the task might get
- * scheduled away before the wake up can take place.
- */
- spin_unlock(&hb->lock);
- wake_up_q(&wake_q);
- if (deboost)
+ if (deboost) {
+ wake_up_q(&wake_q);
rt_mutex_adjust_prio(current);
+ }
- return 0;
+ return ret;
}
/*
@@ -1680,6 +1902,8 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* @key1: the from futex key
* @key2: the to futex key
* @ps: address to store the pi_state pointer
+ * @exiting: Pointer to store the task pointer of the owner task
+ * which is in the middle of exiting
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
* Try and get the lock on behalf of the top waiter if we can do it atomically.
@@ -1687,16 +1911,20 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
* hb1 and hb2 must be held by the caller.
*
+ * @exiting is only set when the return value is -EBUSY. If so, this holds
+ * a refcount on the exiting task on return and the caller needs to drop it
+ * after waiting for the exit to complete.
+ *
* Return:
* 0 - failed to acquire the lock atomically;
* >0 - acquired the lock, return value is vpid of the top_waiter
* <0 - error
*/
-static int futex_proxy_trylock_atomic(u32 __user *pifutex,
- struct futex_hash_bucket *hb1,
- struct futex_hash_bucket *hb2,
- union futex_key *key1, union futex_key *key2,
- struct futex_pi_state **ps, int set_waiters)
+static int
+futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1,
+ struct futex_hash_bucket *hb2, union futex_key *key1,
+ union futex_key *key2, struct futex_pi_state **ps,
+ struct task_struct **exiting, int set_waiters)
{
struct futex_q *top_waiter = NULL;
u32 curval;
@@ -1733,7 +1961,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
*/
vpid = task_pid_vnr(top_waiter->task);
ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
- set_waiters);
+ exiting, set_waiters);
if (ret == 1) {
requeue_pi_wake_futex(top_waiter, key2, hb2);
return vpid;
@@ -1853,6 +2081,8 @@ retry_private:
}
if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
+ struct task_struct *exiting = NULL;
+
/*
* Attempt to acquire uaddr2 and wake the top waiter. If we
* intend to requeue waiters, force setting the FUTEX_WAITERS
@@ -1860,7 +2090,8 @@ retry_private:
* faults rather in the requeue loop below.
*/
ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
- &key2, &pi_state, nr_requeue);
+ &key2, &pi_state,
+ &exiting, nr_requeue);
/*
* At this point the top_waiter has either taken uaddr2 or is
@@ -1884,14 +2115,15 @@ retry_private:
* rereading and handing potential crap to
* lookup_pi_state.
*/
- ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
+ ret = lookup_pi_state(uaddr2, ret, hb2, &key2,
+ &pi_state, &exiting);
}
switch (ret) {
case 0:
break;
case -EFAULT:
- free_pi_state(pi_state);
+ put_pi_state(pi_state);
pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
@@ -1901,19 +2133,26 @@ retry_private:
if (!ret)
goto retry;
goto out;
+ case -EBUSY:
case -EAGAIN:
/*
* Two reasons for this:
- * - Owner is exiting and we just wait for the
+ * - EBUSY: Owner is exiting and we just wait for the
* exit to complete.
- * - The user space value changed.
+ * - EAGAIN: The user space value changed.
*/
- free_pi_state(pi_state);
+ put_pi_state(pi_state);
pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
put_futex_key(&key1);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
cond_resched();
goto retry;
default:
@@ -1964,7 +2203,7 @@ retry_private:
*/
if (requeue_pi) {
/* Prepare the waiter to take the rt_mutex. */
- atomic_inc(&pi_state->refcount);
+ get_pi_state(pi_state);
this->pi_state = pi_state;
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,
@@ -1977,7 +2216,7 @@ retry_private:
} else if (ret) {
/* -EDEADLK */
this->pi_state = NULL;
- free_pi_state(pi_state);
+ put_pi_state(pi_state);
goto out_unlock;
}
}
@@ -1986,7 +2225,7 @@ retry_private:
}
out_unlock:
- free_pi_state(pi_state);
+ put_pi_state(pi_state);
double_unlock_hb(hb1, hb2);
wake_up_q(&wake_q);
hb_waiters_dec(hb2);
@@ -2040,20 +2279,7 @@ queue_unlock(struct futex_hash_bucket *hb)
hb_waiters_dec(hb);
}
-/**
- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
- * @q: The futex_q to enqueue
- * @hb: The destination hash bucket
- *
- * The hb->lock must be held by the caller, and is released here. A call to
- * queue_me() is typically paired with exactly one call to unqueue_me(). The
- * exceptions involve the PI related operations, which may use unqueue_me_pi()
- * or nothing if the unqueue is done as part of the wake process and the unqueue
- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
- * an example).
- */
-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
- __releases(&hb->lock)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
int prio;
@@ -2070,6 +2296,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
plist_node_init(&q->list, prio);
plist_add(&q->list, &hb->chain);
q->task = current;
+}
+
+/**
+ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+ * @q: The futex_q to enqueue
+ * @hb: The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * queue_me() is typically paired with exactly one call to unqueue_me(). The
+ * exceptions involve the PI related operations, which may use unqueue_me_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ */
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
+{
+ __queue_me(q, hb);
spin_unlock(&hb->lock);
}
@@ -2139,53 +2383,97 @@ static void unqueue_me_pi(struct futex_q *q)
__unqueue_futex(q);
BUG_ON(!q->pi_state);
- free_pi_state(q->pi_state);
+ put_pi_state(q->pi_state);
q->pi_state = NULL;
spin_unlock(q->lock_ptr);
}
-/*
- * Fixup the pi_state owner with the new owner.
- *
- * Must be called with hash bucket lock held and mm->sem held for non
- * private futexes.
- */
-static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *newowner)
+static int __fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+ struct task_struct *argowner)
{
- u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
- struct task_struct *oldowner = pi_state->owner;
- u32 uval, uninitialized_var(curval), newval;
- int ret;
-
- /* Owner died? */
- if (!pi_state->owner)
- newtid |= FUTEX_OWNER_DIED;
+ struct task_struct *oldowner, *newowner;
+ u32 uval, curval, newval, newtid;
+ int err = 0;
+ oldowner = pi_state->owner;
/*
- * We are here either because we stole the rtmutex from the
- * previous highest priority waiter or we are the highest priority
- * waiter but failed to get the rtmutex the first time.
- * We have to replace the newowner TID in the user space variable.
+ * We are here because either:
+ *
+ * - we stole the lock and pi_state->owner needs updating to reflect
+ * that (@argowner == current),
+ *
+ * or:
+ *
+ * - someone stole our lock and we need to fix things to point to the
+ * new owner (@argowner == NULL).
+ *
+ * Either way, we have to replace the TID in the user space variable.
* This must be atomic as we have to preserve the owner died bit here.
*
* Note: We write the user space value _before_ changing the pi_state
* because we can fault here. Imagine swapped out pages or a fork
* that marked all the anonymous memory readonly for cow.
*
- * Modifying pi_state _before_ the user space value would
- * leave the pi_state in an inconsistent state when we fault
- * here, because we need to drop the hash bucket lock to
- * handle the fault. This might be observed in the PID check
- * in lookup_pi_state.
+ * Modifying pi_state _before_ the user space value would leave the
+ * pi_state in an inconsistent state when we fault here, because we
+ * need to drop the locks to handle the fault. This might be observed
+ * in the PID check in lookup_pi_state.
*/
retry:
+ if (!argowner) {
+ if (oldowner != current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ return 0;
+ }
+
+ if (__rt_mutex_futex_trylock(&pi_state->pi_mutex)) {
+ /* We got the lock. pi_state is correct. Tell caller */
+ return 1;
+ }
+
+ /*
+ * The trylock just failed, so either there is an owner or
+ * there is a higher priority waiter than this one.
+ */
+ newowner = rt_mutex_owner(&pi_state->pi_mutex);
+ /*
+ * If the higher priority waiter has not yet taken over the
+ * rtmutex then newowner is NULL. We can't return here with
+ * that state because it's inconsistent vs. the user space
+ * state. So drop the locks and try again. It's a valid
+ * situation and not any different from the other retry
+ * conditions.
+ */
+ if (unlikely(!newowner)) {
+ err = -EAGAIN;
+ goto handle_fault;
+ }
+ } else {
+ WARN_ON_ONCE(argowner != current);
+ if (oldowner == current) {
+ /*
+ * We raced against a concurrent self; things are
+ * already fixed up. Nothing to do.
+ */
+ return 1;
+ }
+ newowner = argowner;
+ }
+
+ newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+ /* Owner died? */
+ if (!pi_state->owner)
+ newtid |= FUTEX_OWNER_DIED;
+
if (get_futex_value_locked(&uval, uaddr))
goto handle_fault;
- while (1) {
+ for (;;) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
@@ -2199,48 +2487,75 @@ retry:
* We fixed up user space. Now we need to fix the pi_state
* itself.
*/
- if (pi_state->owner != NULL) {
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
- WARN_ON(list_empty(&pi_state->list));
- list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
- }
+ pi_state_update_owner(pi_state, newowner);
- pi_state->owner = newowner;
-
- raw_spin_lock_irq(&newowner->pi_lock);
- WARN_ON(!list_empty(&pi_state->list));
- list_add(&pi_state->list, &newowner->pi_state_list);
- raw_spin_unlock_irq(&newowner->pi_lock);
- return 0;
+ return argowner == current;
/*
- * To handle the page fault we need to drop the hash bucket
- * lock here. That gives the other task (either the highest priority
- * waiter itself or the task which stole the rtmutex) the
- * chance to try the fixup of the pi_state. So once we are
- * back from handling the fault we need to check the pi_state
- * after reacquiring the hash bucket lock and before trying to
- * do another fixup. When the fixup has been done already we
- * simply return.
+ * To handle the page fault we need to drop the locks here. That gives
+ * the other task (either the highest priority waiter itself or the
+ * task which stole the rtmutex) the chance to try the fixup of the
+ * pi_state. So once we are back from handling the fault we need to
+ * check the pi_state after reacquiring the locks and before trying to
+ * do another fixup. When the fixup has been done already we simply
+ * return.
+ *
+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+ * drop hb->lock since the caller owns the hb -> futex_q relation.
+ * Dropping the pi_mutex->wait_lock requires the state revalidate.
*/
handle_fault:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(q->lock_ptr);
- ret = fault_in_user_writeable(uaddr);
+ err = fault_in_user_writeable(uaddr);
spin_lock(q->lock_ptr);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
/*
* Check if someone else fixed it for us:
*/
if (pi_state->owner != oldowner)
- return 0;
+ return argowner == current;
- if (ret)
- return ret;
+ /* Retry if err was -EAGAIN or the fault in succeeded */
+ if (!err)
+ goto retry;
- goto retry;
+ /*
+ * fault_in_user_writeable() failed so user state is immutable. At
+ * best we can make the kernel state consistent but user state will
+ * be most likely hosed and any subsequent unlock operation will be
+ * rejected due to PI futex rule [10].
+ *
+ * Ensure that the rtmutex owner is also the pi_state owner despite
+ * the user space value claiming something different. There is no
+ * point in unlocking the rtmutex if current is the owner as it
+ * would need to wait until the next waiter has taken the rtmutex
+ * to guarantee consistent state. Keep it simple. Userspace asked
+ * for this wreckaged state.
+ *
+ * The rtmutex has an owner - either current or some other
+ * task. See the EAGAIN loop above.
+ */
+ pi_state_update_owner(pi_state, rt_mutex_owner(&pi_state->pi_mutex));
+
+ return err;
+}
+
+static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
+ struct task_struct *argowner)
+{
+ struct futex_pi_state *pi_state = q->pi_state;
+ int ret;
+
+ lockdep_assert_held(q->lock_ptr);
+
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ ret = __fixup_pi_state_owner(uaddr, q, argowner);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
}
static long futex_wait_restart(struct restart_block *restart);
@@ -2262,60 +2577,39 @@ static long futex_wait_restart(struct restart_block *restart);
*/
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
- struct task_struct *owner;
- int ret = 0;
-
if (locked) {
/*
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case:
+ *
+ * Speculative pi_state->owner read (we don't hold wait_lock);
+ * since we own the lock pi_state->owner == current is the
+ * stable state, anything else needs more attention.
*/
if (q->pi_state->owner != current)
- ret = fixup_pi_state_owner(uaddr, q, current);
- goto out;
+ return fixup_pi_state_owner(uaddr, q, current);
+ return 1;
}
/*
- * Catch the rare case, where the lock was released when we were on the
- * way back before we locked the hash bucket.
+ * If we didn't get the lock; check if anybody stole it from us. In
+ * that case, we need to fix up the uval to point to them instead of
+ * us, otherwise bad things happen. [10]
+ *
+ * Another speculative read; pi_state->owner == current is unstable
+ * but needs our attention.
*/
- if (q->pi_state->owner == current) {
- /*
- * Try to get the rt_mutex now. This might fail as some other
- * task acquired the rt_mutex after we removed ourself from the
- * rt_mutex waiters list.
- */
- if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
- locked = 1;
- goto out;
- }
-
- /*
- * pi_state is incorrect, some other task did a lock steal and
- * we returned due to timeout or signal without taking the
- * rt_mutex. Too late.
- */
- raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
- owner = rt_mutex_owner(&q->pi_state->pi_mutex);
- if (!owner)
- owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
- raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
- ret = fixup_pi_state_owner(uaddr, q, owner);
- goto out;
- }
+ if (q->pi_state->owner == current)
+ return fixup_pi_state_owner(uaddr, q, NULL);
/*
* Paranoia check. If we did not take the lock, then we should not be
- * the owner of the rt_mutex.
+ * the owner of the rt_mutex. Warn and establish consistent state.
*/
- if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
- printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
- "pi-state %p\n", ret,
- q->pi_state->pi_mutex.owner,
- q->pi_state->owner);
+ if (WARN_ON_ONCE(rt_mutex_owner(&q->pi_state->pi_mutex) == current))
+ return fixup_pi_state_owner(uaddr, q, current);
-out:
- return ret ? ret : locked;
+ return 0;
}
/**
@@ -2536,6 +2830,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
+ struct task_struct *exiting = NULL;
+ struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
int res, ret;
@@ -2559,7 +2855,8 @@ retry:
retry_private:
hb = queue_lock(&q);
- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current,
+ &exiting, 0);
if (unlikely(ret)) {
/*
* Atomic work succeeded and we got the lock,
@@ -2572,15 +2869,22 @@ retry_private:
goto out_unlock_put_key;
case -EFAULT:
goto uaddr_faulted;
+ case -EBUSY:
case -EAGAIN:
/*
* Two reasons for this:
- * - Task is exiting and we just wait for the
+ * - EBUSY: Task is exiting and we just wait for the
* exit to complete.
- * - The user space value changed.
+ * - EAGAIN: The user space value changed.
*/
queue_unlock(hb);
put_futex_key(&q.key);
+ /*
+ * Handle the case where the owner is in the middle of
+ * exiting. Wait for the exit to complete otherwise
+ * this task might loop forever, aka. live lock.
+ */
+ wait_for_owner_exiting(ret, exiting);
cond_resched();
goto retry;
default:
@@ -2588,25 +2892,52 @@ retry_private:
}
}
+ WARN_ON(!q.pi_state);
+
/*
* Only actually queue now that the atomic ops are done:
*/
- queue_me(&q, hb);
+ __queue_me(&q, hb);
- WARN_ON(!q.pi_state);
- /*
- * Block on the PI mutex:
- */
- if (!trylock) {
- ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
- } else {
- ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+ if (trylock) {
+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
/* Fixup the trylock return value: */
ret = ret ? 0 : -EWOULDBLOCK;
+ goto no_block;
}
+ /*
+ * We must add ourselves to the rt_mutex waitlist while holding hb->lock
+ * such that the hb and rt_mutex wait lists match.
+ */
+ rt_mutex_init_waiter(&rt_waiter);
+ ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+
+ goto no_block;
+ }
+
+ spin_unlock(q.lock_ptr);
+
+ if (unlikely(to))
+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+
+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+
spin_lock(q.lock_ptr);
/*
+ * If we failed to acquire the lock (signal/timeout), we must
+ * first acquire the hb->lock before removing the lock from the
+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
+ * wait lists consistent.
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;
+
+no_block:
+ /*
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
*/
@@ -2618,13 +2949,6 @@ retry_private:
if (res)
ret = (res < 0) ? res : 0;
- /*
- * If fixup_owner() faulted and was unable to handle the fault, unlock
- * it and return the fault to userspace.
- */
- if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
- rt_mutex_unlock(&q.pi_state->pi_mutex);
-
/* Unqueue and drop the lock */
unqueue_me_pi(&q);
@@ -2636,8 +2960,10 @@ out_unlock_put_key:
out_put_key:
put_futex_key(&q.key);
out:
- if (to)
+ if (to) {
+ hrtimer_cancel(&to->timer);
destroy_hrtimer_on_stack(&to->timer);
+ }
return ret != -EINTR ? ret : -ERESTARTNOINTR;
uaddr_faulted:
@@ -2690,10 +3016,39 @@ retry:
*/
match = futex_top_waiter(hb, &key);
if (match) {
- ret = wake_futex_pi(uaddr, uval, match, hb);
+ struct futex_pi_state *pi_state = match->pi_state;
+
+ ret = -EINVAL;
+ if (!pi_state)
+ goto out_unlock;
+
+ /*
+ * If current does not own the pi_state then the futex is
+ * inconsistent and user space fiddled with the futex value.
+ */
+ if (pi_state->owner != current)
+ goto out_unlock;
+
+ get_pi_state(pi_state);
+ /*
+ * Since modifying the wait_list is done while holding both
+ * hb->lock and wait_lock, holding either is sufficient to
+ * observe it.
+ *
+ * By taking wait_lock while still holding hb->lock, we ensure
+ * there is no point where we hold neither; and therefore
+ * wake_futex_pi() must observe a state consistent with what we
+ * observed.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+
+ ret = wake_futex_pi(uaddr, uval, pi_state);
+
+ put_pi_state(pi_state);
+
/*
- * In case of success wake_futex_pi dropped the hash
- * bucket lock.
+ * Success, we're done! No tricky corner cases.
*/
if (!ret)
goto out_putkey;
@@ -2708,7 +3063,6 @@ retry:
* setting the FUTEX_WAITERS bit. Try again.
*/
if (ret == -EAGAIN) {
- spin_unlock(&hb->lock);
put_futex_key(&key);
goto retry;
}
@@ -2716,7 +3070,7 @@ retry:
* wake_futex_pi has detected invalid state. Tell user
* space.
*/
- goto out_unlock;
+ goto out_putkey;
}
/*
@@ -2726,8 +3080,10 @@ retry:
* preserve the WAITERS bit not the OWNER_DIED one. We are the
* owner.
*/
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
+ spin_unlock(&hb->lock);
goto pi_faulted;
+ }
/*
* If uval has changed, let user space handle it.
@@ -2741,7 +3097,6 @@ out_putkey:
return ret;
pi_faulted:
- spin_unlock(&hb->lock);
put_futex_key(&key);
ret = fault_in_user_writeable(uaddr);
@@ -2871,10 +3226,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* The waiter is allocated on our stack, manipulated by the requeue
* code while we sleep on uaddr.
*/
- debug_rt_mutex_init_waiter(&rt_waiter);
- RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
- RB_CLEAR_NODE(&rt_waiter.tree_entry);
- rt_waiter.task = NULL;
+ rt_mutex_init_waiter(&rt_waiter);
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
@@ -2929,14 +3281,17 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
- if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
- rt_mutex_unlock(&q.pi_state->pi_mutex);
/*
* Drop the reference to the pi state which
* the requeue_pi() code acquired for us.
*/
- free_pi_state(q.pi_state);
+ put_pi_state(q.pi_state);
spin_unlock(q.lock_ptr);
+ /*
+ * Adjust the return value. It's either -EFAULT or
+ * success (1) but the caller expects 0 for success.
+ */
+ ret = ret < 0 ? ret : 0;
}
} else {
struct rt_mutex *pi_mutex;
@@ -2967,14 +3322,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (res)
ret = (res < 0) ? res : 0;
- /*
- * If fixup_pi_state_owner() faulted and was unable to handle
- * the fault, unlock the rt_mutex and return the fault to
- * userspace.
- */
- if (ret && rt_mutex_owner(pi_mutex) == current)
- rt_mutex_unlock(pi_mutex);
-
/* Unqueue and drop the lock. */
unqueue_me_pi(&q);
}
@@ -3088,7 +3435,7 @@ err_unlock:
* Process a futex-list entry, check whether it's owned by the
* dying task, and do notification if so:
*/
-int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
+static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
{
u32 uval, uninitialized_var(nval), mval;
@@ -3163,7 +3510,7 @@ static inline int fetch_robust_entry(struct robust_list __user **entry,
*
* We silently return on any sign of list-walking problem.
*/
-void exit_robust_list(struct task_struct *curr)
+static void exit_robust_list(struct task_struct *curr)
{
struct robust_list_head __user *head = curr->robust_list;
struct robust_list __user *entry, *next_entry, *pending;
@@ -3226,6 +3573,114 @@ void exit_robust_list(struct task_struct *curr)
curr, pip);
}
+static void futex_cleanup(struct task_struct *tsk)
+{
+ if (unlikely(tsk->robust_list)) {
+ exit_robust_list(tsk);
+ tsk->robust_list = NULL;
+ }
+
+#ifdef CONFIG_COMPAT
+ if (unlikely(tsk->compat_robust_list)) {
+ compat_exit_robust_list(tsk);
+ tsk->compat_robust_list = NULL;
+ }
+#endif
+
+ if (unlikely(!list_empty(&tsk->pi_state_list)))
+ exit_pi_state_list(tsk);
+}
+
+/**
+ * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD
+ * @tsk: task to set the state on
+ *
+ * Set the futex exit state of the task lockless. The futex waiter code
+ * observes that state when a task is exiting and loops until the task has
+ * actually finished the futex cleanup. The worst case for this is that the
+ * waiter runs through the wait loop until the state becomes visible.
+ *
+ * This is called from the recursive fault handling path in do_exit().
+ *
+ * This is best effort. Either the futex exit code has run already or
+ * not. If the OWNER_DIED bit has been set on the futex then the waiter can
+ * take it over. If not, the problem is pushed back to user space. If the
+ * futex exit code did not run yet, then an already queued waiter might
+ * block forever, but there is nothing which can be done about that.
+ */
+void futex_exit_recursive(struct task_struct *tsk)
+{
+ /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */
+ if (tsk->futex_state == FUTEX_STATE_EXITING)
+ mutex_unlock(&tsk->futex_exit_mutex);
+ tsk->futex_state = FUTEX_STATE_DEAD;
+}
+
+static void futex_cleanup_begin(struct task_struct *tsk)
+{
+ /*
+ * Prevent various race issues against a concurrent incoming waiter
+ * including live locks by forcing the waiter to block on
+ * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in
+ * attach_to_pi_owner().
+ */
+ mutex_lock(&tsk->futex_exit_mutex);
+
+ /*
+ * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock.
+ *
+ * This ensures that all subsequent checks of tsk->futex_state in
+ * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with
+ * tsk->pi_lock held.
+ *
+ * It guarantees also that a pi_state which was queued right before
+ * the state change under tsk->pi_lock by a concurrent waiter must
+ * be observed in exit_pi_state_list().
+ */
+ raw_spin_lock_irq(&tsk->pi_lock);
+ tsk->futex_state = FUTEX_STATE_EXITING;
+ raw_spin_unlock_irq(&tsk->pi_lock);
+}
+
+static void futex_cleanup_end(struct task_struct *tsk, int state)
+{
+ /*
+ * Lockless store. The only side effect is that an observer might
+ * take another loop until it becomes visible.
+ */
+ tsk->futex_state = state;
+ /*
+ * Drop the exit protection. This unblocks waiters which observed
+ * FUTEX_STATE_EXITING to reevaluate the state.
+ */
+ mutex_unlock(&tsk->futex_exit_mutex);
+}
+
+void futex_exec_release(struct task_struct *tsk)
+{
+ /*
+ * The state handling is done for consistency, but in the case of
+ * exec() there is no way to prevent futher damage as the PID stays
+ * the same. But for the unlikely and arguably buggy case that a
+ * futex is held on exec(), this provides at least as much state
+ * consistency protection which is possible.
+ */
+ futex_cleanup_begin(tsk);
+ futex_cleanup(tsk);
+ /*
+ * Reset the state to FUTEX_STATE_OK. The task is alive and about
+ * exec a new binary.
+ */
+ futex_cleanup_end(tsk, FUTEX_STATE_OK);
+}
+
+void futex_exit_release(struct task_struct *tsk)
+{
+ futex_cleanup_begin(tsk);
+ futex_cleanup(tsk);
+ futex_cleanup_end(tsk, FUTEX_STATE_DEAD);
+}
+
long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
u32 __user *uaddr2, u32 val2, u32 val3)
{
@@ -3318,6 +3773,192 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
+#ifdef CONFIG_COMPAT
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int
+compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+ compat_uptr_t __user *head, unsigned int *pi)
+{
+ if (get_user(*uentry, head))
+ return -EFAULT;
+
+ *entry = compat_ptr((*uentry) & ~1);
+ *pi = (unsigned int)(*uentry) & 1;
+
+ return 0;
+}
+
+static void __user *futex_uaddr(struct robust_list __user *entry,
+ compat_long_t futex_offset)
+{
+ compat_uptr_t base = ptr_to_compat(entry);
+ void __user *uaddr = compat_ptr(base + futex_offset);
+
+ return uaddr;
+}
+
+/*
+ * Walk curr->robust_list (very carefully, it's a userspace list!)
+ * and mark any locks found there dead, and notify any waiters.
+ *
+ * We silently return on any sign of list-walking problem.
+ */
+void compat_exit_robust_list(struct task_struct *curr)
+{
+ struct compat_robust_list_head __user *head = curr->compat_robust_list;
+ struct robust_list __user *entry, *next_entry, *pending;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
+ unsigned int uninitialized_var(next_pi);
+ compat_uptr_t uentry, next_uentry, upending;
+ compat_long_t futex_offset;
+ int rc;
+
+ if (!futex_cmpxchg_enabled)
+ return;
+
+ /*
+ * Fetch the list head (which was registered earlier, via
+ * sys_set_robust_list()):
+ */
+ if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
+ return;
+ /*
+ * Fetch the relative futex offset:
+ */
+ if (get_user(futex_offset, &head->futex_offset))
+ return;
+ /*
+ * Fetch any possibly pending lock-add first, and handle it
+ * if it exists:
+ */
+ if (compat_fetch_robust_entry(&upending, &pending,
+ &head->list_op_pending, &pip))
+ return;
+
+ next_entry = NULL; /* avoid warning with gcc */
+ while (entry != (struct robust_list __user *) &head->list) {
+ /*
+ * Fetch the next entry in the list before calling
+ * handle_futex_death:
+ */
+ rc = compat_fetch_robust_entry(&next_uentry, &next_entry,
+ (compat_uptr_t __user *)&entry->next, &next_pi);
+ /*
+ * A pending lock might already be on the list, so
+ * dont process it twice:
+ */
+ if (entry != pending) {
+ void __user *uaddr = futex_uaddr(entry, futex_offset);
+
+ if (handle_futex_death(uaddr, curr, pi))
+ return;
+ }
+ if (rc)
+ return;
+ uentry = next_uentry;
+ entry = next_entry;
+ pi = next_pi;
+ /*
+ * Avoid excessively long or circular lists:
+ */
+ if (!--limit)
+ break;
+
+ cond_resched();
+ }
+ if (pending) {
+ void __user *uaddr = futex_uaddr(pending, futex_offset);
+
+ handle_futex_death(uaddr, curr, pip);
+ }
+}
+
+COMPAT_SYSCALL_DEFINE2(set_robust_list,
+ struct compat_robust_list_head __user *, head,
+ compat_size_t, len)
+{
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+
+ if (unlikely(len != sizeof(*head)))
+ return -EINVAL;
+
+ current->compat_robust_list = head;
+
+ return 0;
+}
+
+COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
+ compat_uptr_t __user *, head_ptr,
+ compat_size_t __user *, len_ptr)
+{
+ struct compat_robust_list_head __user *head;
+ unsigned long ret;
+ struct task_struct *p;
+
+ if (!futex_cmpxchg_enabled)
+ return -ENOSYS;
+
+ rcu_read_lock();
+
+ ret = -ESRCH;
+ if (!pid)
+ p = current;
+ else {
+ p = find_task_by_vpid(pid);
+ if (!p)
+ goto err_unlock;
+ }
+
+ ret = -EPERM;
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
+ goto err_unlock;
+
+ head = p->compat_robust_list;
+ rcu_read_unlock();
+
+ if (put_user(sizeof(*head), len_ptr))
+ return -EFAULT;
+ return put_user(ptr_to_compat(head), head_ptr);
+
+err_unlock:
+ rcu_read_unlock();
+
+ return ret;
+}
+
+COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+ struct compat_timespec __user *, utime, u32 __user *, uaddr2,
+ u32, val3)
+{
+ struct timespec ts;
+ ktime_t t, *tp = NULL;
+ int val2 = 0;
+ int cmd = op & FUTEX_CMD_MASK;
+
+ if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
+ cmd == FUTEX_WAIT_BITSET ||
+ cmd == FUTEX_WAIT_REQUEUE_PI)) {
+ if (compat_get_timespec(&ts, utime))
+ return -EFAULT;
+ if (!timespec_valid(&ts))
+ return -EINVAL;
+
+ t = timespec_to_ktime(ts);
+ if (cmd == FUTEX_WAIT)
+ t = ktime_add_safe(ktime_get(), t);
+ tp = &t;
+ }
+ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
+ cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
+ val2 = (int) (unsigned long) utime;
+
+ return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
+}
+#endif /* CONFIG_COMPAT */
+
static void __init futex_detect_cmpxchg(void)
{
#ifndef CONFIG_HAVE_FUTEX_CMPXCHG
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
deleted file mode 100644
index 4ae3232e7a28..000000000000
--- a/kernel/futex_compat.c
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * linux/kernel/futex_compat.c
- *
- * Futex compatibililty routines.
- *
- * Copyright 2006, Red Hat, Inc., Ingo Molnar
- */
-
-#include <linux/linkage.h>
-#include <linux/compat.h>
-#include <linux/nsproxy.h>
-#include <linux/futex.h>
-#include <linux/ptrace.h>
-#include <linux/syscalls.h>
-
-#include <asm/uaccess.h>
-
-
-/*
- * Fetch a robust-list pointer. Bit 0 signals PI futexes:
- */
-static inline int
-fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
- compat_uptr_t __user *head, unsigned int *pi)
-{
- if (get_user(*uentry, head))
- return -EFAULT;
-
- *entry = compat_ptr((*uentry) & ~1);
- *pi = (unsigned int)(*uentry) & 1;
-
- return 0;
-}
-
-static void __user *futex_uaddr(struct robust_list __user *entry,
- compat_long_t futex_offset)
-{
- compat_uptr_t base = ptr_to_compat(entry);
- void __user *uaddr = compat_ptr(base + futex_offset);
-
- return uaddr;
-}
-
-/*
- * Walk curr->robust_list (very carefully, it's a userspace list!)
- * and mark any locks found there dead, and notify any waiters.
- *
- * We silently return on any sign of list-walking problem.
- */
-void compat_exit_robust_list(struct task_struct *curr)
-{
- struct compat_robust_list_head __user *head = curr->compat_robust_list;
- struct robust_list __user *entry, *next_entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
- unsigned int uninitialized_var(next_pi);
- compat_uptr_t uentry, next_uentry, upending;
- compat_long_t futex_offset;
- int rc;
-
- if (!futex_cmpxchg_enabled)
- return;
-
- /*
- * Fetch the list head (which was registered earlier, via
- * sys_set_robust_list()):
- */
- if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
- return;
- /*
- * Fetch the relative futex offset:
- */
- if (get_user(futex_offset, &head->futex_offset))
- return;
- /*
- * Fetch any possibly pending lock-add first, and handle it
- * if it exists:
- */
- if (fetch_robust_entry(&upending, &pending,
- &head->list_op_pending, &pip))
- return;
-
- next_entry = NULL; /* avoid warning with gcc */
- while (entry != (struct robust_list __user *) &head->list) {
- /*
- * Fetch the next entry in the list before calling
- * handle_futex_death:
- */
- rc = fetch_robust_entry(&next_uentry, &next_entry,
- (compat_uptr_t __user *)&entry->next, &next_pi);
- /*
- * A pending lock might already be on the list, so
- * dont process it twice:
- */
- if (entry != pending) {
- void __user *uaddr = futex_uaddr(entry, futex_offset);
-
- if (handle_futex_death(uaddr, curr, pi))
- return;
- }
- if (rc)
- return;
- uentry = next_uentry;
- entry = next_entry;
- pi = next_pi;
- /*
- * Avoid excessively long or circular lists:
- */
- if (!--limit)
- break;
-
- cond_resched();
- }
- if (pending) {
- void __user *uaddr = futex_uaddr(pending, futex_offset);
-
- handle_futex_death(uaddr, curr, pip);
- }
-}
-
-COMPAT_SYSCALL_DEFINE2(set_robust_list,
- struct compat_robust_list_head __user *, head,
- compat_size_t, len)
-{
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
- if (unlikely(len != sizeof(*head)))
- return -EINVAL;
-
- current->compat_robust_list = head;
-
- return 0;
-}
-
-COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid,
- compat_uptr_t __user *, head_ptr,
- compat_size_t __user *, len_ptr)
-{
- struct compat_robust_list_head __user *head;
- unsigned long ret;
- struct task_struct *p;
-
- if (!futex_cmpxchg_enabled)
- return -ENOSYS;
-
- rcu_read_lock();
-
- ret = -ESRCH;
- if (!pid)
- p = current;
- else {
- p = find_task_by_vpid(pid);
- if (!p)
- goto err_unlock;
- }
-
- ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
- goto err_unlock;
-
- head = p->compat_robust_list;
- rcu_read_unlock();
-
- if (put_user(sizeof(*head), len_ptr))
- return -EFAULT;
- return put_user(ptr_to_compat(head), head_ptr);
-
-err_unlock:
- rcu_read_unlock();
-
- return ret;
-}
-
-COMPAT_SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
- struct compat_timespec __user *, utime, u32 __user *, uaddr2,
- u32, val3)
-{
- struct timespec ts;
- ktime_t t, *tp = NULL;
- int val2 = 0;
- int cmd = op & FUTEX_CMD_MASK;
-
- if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
- cmd == FUTEX_WAIT_BITSET ||
- cmd == FUTEX_WAIT_REQUEUE_PI)) {
- if (compat_get_timespec(&ts, utime))
- return -EFAULT;
- if (!timespec_valid(&ts))
- return -EINVAL;
-
- t = timespec_to_ktime(ts);
- if (cmd == FUTEX_WAIT)
- t = ktime_add_safe(ktime_get(), t);
- tp = &t;
- }
- if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
- cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
- val2 = (int) (unsigned long) utime;
-
- return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
-}
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index edf67c493a8e..e473f6a1f6ca 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -108,9 +108,9 @@ static void *gcov_seq_next(struct seq_file *seq, void *data, loff_t *pos)
{
struct gcov_iterator *iter = data;
+ (*pos)++;
if (gcov_iter_next(iter))
return NULL;
- (*pos)++;
return iter;
}
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 46a18e72bce6..6d5ef6220afe 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,9 @@
#include <linux/vmalloc.h>
#include "gcov.h"
-#if (__GNUC__ >= 7)
+#if (__GNUC__ >= 10)
+#define GCOV_COUNTERS 8
+#elif (__GNUC__ >= 7)
#define GCOV_COUNTERS 9
#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
#define GCOV_COUNTERS 10
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 14aaaa61e905..f5bb63cbb6b4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -872,11 +872,15 @@ irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
irqreturn_t ret;
local_bh_disable();
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT_BASE))
+ local_irq_disable();
ret = action->thread_fn(action->irq, action->dev_id);
if (ret == IRQ_HANDLED)
atomic_inc(&desc->threads_handled);
irq_finalize_oneshot(desc, action);
+ if (!IS_ENABLED(CONFIG_PREEMPT_RT_BASE))
+ local_irq_enable();
local_bh_enable();
return ret;
}
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 37ddb7bda651..ec7c7eda0774 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -7,17 +7,18 @@
void irq_move_masked_irq(struct irq_data *idata)
{
struct irq_desc *desc = irq_data_to_desc(idata);
- struct irq_chip *chip = desc->irq_data.chip;
+ struct irq_data *data = &desc->irq_data;
+ struct irq_chip *chip = data->chip;
- if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
+ if (likely(!irqd_is_setaffinity_pending(data)))
return;
- irqd_clr_move_pending(&desc->irq_data);
+ irqd_clr_move_pending(data);
/*
* Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
*/
- if (irqd_is_per_cpu(&desc->irq_data)) {
+ if (irqd_is_per_cpu(data)) {
WARN_ON(1);
return;
}
@@ -42,9 +43,20 @@ void irq_move_masked_irq(struct irq_data *idata)
* For correct operation this depends on the caller
* masking the irqs.
*/
- if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids)
- irq_do_set_affinity(&desc->irq_data, desc->pending_mask, false);
-
+ if (cpumask_any_and(desc->pending_mask, cpu_online_mask) < nr_cpu_ids) {
+ int ret;
+
+ ret = irq_do_set_affinity(data, desc->pending_mask, false);
+ /*
+ * If the there is a cleanup pending in the underlying
+ * vector management, reschedule the move for the next
+ * interrupt. Leave desc->pending_mask intact.
+ */
+ if (ret == -EBUSY) {
+ irqd_set_move_pending(data);
+ return;
+ }
+ }
cpumask_clear(desc->pending_mask);
}
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
index 6030efd4a188..1210cd6bcaa6 100644
--- a/kernel/kexec_file.c
+++ b/kernel/kexec_file.c
@@ -575,8 +575,10 @@ static int kexec_calculate_store_digests(struct kimage *image)
sha_region_sz = KEXEC_SEGMENT_MAX * sizeof(struct kexec_sha_region);
sha_regions = vzalloc(sha_region_sz);
- if (!sha_regions)
+ if (!sha_regions) {
+ ret = -ENOMEM;
goto out_free_desc;
+ }
desc->tfm = tfm;
desc->flags = 0;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index e4e5e98002fe..3f3bbae4cec3 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -28,6 +28,7 @@
#include <linux/cred.h>
#include <linux/file.h>
#include <linux/fdtable.h>
+#include <linux/fs_struct.h>
#include <linux/workqueue.h>
#include <linux/security.h>
#include <linux/mount.h>
@@ -223,6 +224,14 @@ static int call_usermodehelper_exec_async(void *data)
spin_unlock_irq(&current->sighand->siglock);
/*
+ * Initial kernel threads share ther FS with init, in order to
+ * get the init root directory. But we've now created a new
+ * thread that is going to execve a user process and has its own
+ * 'struct fs_struct'. Reset umask to the default.
+ */
+ current->fs->umask = 0022;
+
+ /*
* Our parent (unbound workqueue) runs with elevated scheduling
* priority. Avoid propagating that into the userspace child.
*/
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index f59f49bc2a5d..90f46c8aa900 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -561,11 +561,12 @@ static void kprobe_optimizer(struct work_struct *work)
do_free_cleaned_kprobes();
mutex_unlock(&module_mutex);
- mutex_unlock(&kprobe_mutex);
/* Step 5: Kick optimizer again if needed */
if (!list_empty(&optimizing_list) || !list_empty(&unoptimizing_list))
kick_kprobe_optimizer();
+
+ mutex_unlock(&kprobe_mutex);
}
/* Wait for completing optimization and unoptimization */
@@ -1149,6 +1150,26 @@ __releases(hlist_lock)
}
NOKPROBE_SYMBOL(kretprobe_table_unlock);
+struct kprobe kprobe_busy = {
+ .addr = (void *) get_kprobe,
+};
+
+void kprobe_busy_begin(void)
+{
+ struct kprobe_ctlblk *kcb;
+
+ preempt_disable();
+ __this_cpu_write(current_kprobe, &kprobe_busy);
+ kcb = get_kprobe_ctlblk();
+ kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+}
+
+void kprobe_busy_end(void)
+{
+ __this_cpu_write(current_kprobe, NULL);
+ preempt_enable();
+}
+
/*
* This function is called from finish_task_switch when task tk becomes dead,
* so that we can recycle any function-return probe instances associated
@@ -1166,6 +1187,8 @@ void kprobe_flush_task(struct task_struct *tk)
/* Early boot. kretprobe_table_locks not yet initialized. */
return;
+ kprobe_busy_begin();
+
INIT_HLIST_HEAD(&empty_rp);
hash = hash_ptr(tk, KPROBE_HASH_BITS);
head = &kretprobe_inst_table[hash];
@@ -1179,6 +1202,8 @@ void kprobe_flush_task(struct task_struct *tk)
hlist_del(&ri->hlist);
kfree(ri);
}
+
+ kprobe_busy_end();
}
NOKPROBE_SYMBOL(kprobe_flush_task);
@@ -1859,6 +1884,10 @@ int register_kretprobe(struct kretprobe *rp)
int i;
void *addr;
+ /* If only rp->kp.addr is specified, check reregistering kprobes */
+ if (rp->kp.addr && check_kprobe_rereg(&rp->kp))
+ return -EINVAL;
+
if (kretprobe_blacklist_size) {
addr = kprobe_addr(&rp->kp);
if (IS_ERR(addr))
@@ -1987,6 +2016,9 @@ static void kill_kprobe(struct kprobe *p)
{
struct kprobe *kp;
+ if (WARN_ON_ONCE(kprobe_gone(p)))
+ return;
+
p->flags |= KPROBE_FLAG_GONE;
if (kprobe_aggrprobe(p)) {
/*
@@ -2004,6 +2036,14 @@ static void kill_kprobe(struct kprobe *p)
* the original probed function (which will be freed soon) any more.
*/
arch_remove_kprobe(p);
+
+ /*
+ * The module is going away. We should disarm the kprobe which
+ * is using ftrace, because ftrace framework is still available at
+ * MODULE_STATE_GOING notification.
+ */
+ if (kprobe_ftrace(p) && !kprobe_disabled(p) && !kprobes_all_disarmed)
+ disarm_kprobe_ftrace(p);
}
/* Disable one kprobe */
@@ -2122,7 +2162,10 @@ static int kprobes_module_callback(struct notifier_block *nb,
mutex_lock(&kprobe_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
- hlist_for_each_entry_rcu(p, head, hlist)
+ hlist_for_each_entry_rcu(p, head, hlist) {
+ if (kprobe_gone(p))
+ continue;
+
if (within_module_init((unsigned long)p->addr, mod) ||
(checkcore &&
within_module_core((unsigned long)p->addr, mod))) {
@@ -2133,6 +2176,7 @@ static int kprobes_module_callback(struct notifier_block *nb,
*/
kill_kprobe(p);
}
+ }
}
mutex_unlock(&kprobe_mutex);
return NOTIFY_DONE;
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
index 35b34eccdd10..9484f934aa34 100644
--- a/kernel/locking/lockdep_proc.c
+++ b/kernel/locking/lockdep_proc.c
@@ -423,7 +423,7 @@ static void seq_lock_time(struct seq_file *m, struct lock_time *lt)
seq_time(m, lt->min);
seq_time(m, lt->max);
seq_time(m, lt->total);
- seq_time(m, lt->nr ? div_s64(lt->total, lt->nr) : 0);
+ seq_time(m, lt->nr ? div64_u64(lt->total, lt->nr) : 0);
}
static void seq_stats(struct seq_file *m, struct lock_stat_data *data)
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
index 62b6cee8ea7f..0613c4b1d059 100644
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -173,12 +173,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
lock->name = name;
}
-void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
-{
-}
-
-void rt_mutex_deadlock_account_unlock(struct task_struct *task)
-{
-}
-
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
index d0519c3432b6..b585af9a1b50 100644
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -9,9 +9,6 @@
* This file contains macros used solely by rtmutex.c. Debug version.
*/
-extern void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
-extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index dd173df9ee5e..532986d82179 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -163,13 +163,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
* 2) Drop lock->wait_lock
* 3) Try to unlock the lock with cmpxchg
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
struct task_struct *owner = rt_mutex_owner(lock);
clear_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/*
* If a new waiter comes in between the unlock and the cmpxchg
* we have two situations:
@@ -211,11 +212,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
/*
* Simple slow path only version: lock->owner is protected by lock->wait_lock.
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
lock->owner = NULL;
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return true;
}
#endif
@@ -497,7 +499,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
int ret = 0, depth = 0;
struct rt_mutex *lock;
bool detect_deadlock;
- unsigned long flags;
bool requeue = true;
detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
@@ -540,7 +541,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* [1] Task cannot go away as we did a get_task() before !
*/
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock_irq(&task->pi_lock);
/*
* [2] Get the waiter on which @task is blocked on.
@@ -624,7 +625,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* operations.
*/
if (!raw_spin_trylock(&lock->wait_lock)) {
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock_irq(&task->pi_lock);
cpu_relax();
goto retry;
}
@@ -655,7 +656,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* No requeue[7] here. Just release @task [8]
*/
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
put_task_struct(task);
/*
@@ -663,14 +664,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* If there is no owner of the lock, end of chain.
*/
if (!rt_mutex_owner(lock)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
/* [10] Grab the next task, i.e. owner of @lock */
task = rt_mutex_owner(lock);
get_task_struct(task);
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
/*
* No requeue [11] here. We just do deadlock detection.
@@ -685,8 +686,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
top_waiter = rt_mutex_top_waiter(lock);
/* [13] Drop locks */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&task->pi_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
/* If owner is not blocked, end of chain. */
if (!next_lock)
@@ -707,7 +708,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
rt_mutex_enqueue(lock, waiter);
/* [8] Release the task */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
put_task_struct(task);
/*
@@ -725,14 +726,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
wake_up_process(rt_mutex_top_waiter(lock)->task);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
/* [10] Grab the next task, i.e. the owner of @lock */
task = rt_mutex_owner(lock);
get_task_struct(task);
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
/* [11] requeue the pi waiters if necessary */
if (waiter == rt_mutex_top_waiter(lock)) {
@@ -786,8 +787,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
top_waiter = rt_mutex_top_waiter(lock);
/* [13] Drop the locks */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&task->pi_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
/*
* Make the actual exit decisions [12], based on the stored
@@ -810,7 +811,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto again;
out_unlock_pi:
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock_irq(&task->pi_lock);
out_put_task:
put_task_struct(task);
@@ -820,7 +821,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* Try to take an rt-mutex
*
- * Must be called with lock->wait_lock held.
+ * Must be called with lock->wait_lock held and interrupts disabled
*
* @lock: The lock to be acquired.
* @task: The task which wants to acquire the lock
@@ -830,8 +831,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
struct rt_mutex_waiter *waiter)
{
- unsigned long flags;
-
/*
* Before testing whether we can acquire @lock, we set the
* RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
@@ -916,7 +915,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* case, but conditionals are more expensive than a redundant
* store.
*/
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
task->pi_blocked_on = NULL;
/*
* Finish the lock acquisition. @task is the new owner. If
@@ -925,7 +924,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
*/
if (rt_mutex_has_waiters(lock))
rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
takeit:
/* We got the lock. */
@@ -937,8 +936,6 @@ takeit:
*/
rt_mutex_set_owner(lock, task);
- rt_mutex_deadlock_account_lock(lock, task);
-
return 1;
}
@@ -947,7 +944,7 @@ takeit:
*
* Prepare waiter and propagate pi chain
*
- * This must be called with lock->wait_lock held.
+ * This must be called with lock->wait_lock held and interrupts disabled
*/
static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
@@ -958,7 +955,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *top_waiter = waiter;
struct rt_mutex *next_lock;
int chain_walk = 0, res;
- unsigned long flags;
/*
* Early deadlock detection. We really don't want the task to
@@ -972,7 +968,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
if (owner == task)
return -EDEADLK;
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
__rt_mutex_adjust_prio(task);
waiter->task = task;
waiter->lock = lock;
@@ -985,12 +981,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
task->pi_blocked_on = waiter;
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
if (!owner)
return 0;
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock(&owner->pi_lock);
if (waiter == rt_mutex_top_waiter(lock)) {
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
@@ -1005,7 +1001,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ raw_spin_unlock(&owner->pi_lock);
/*
* Even if full deadlock detection is on, if the owner is not
* blocked itself, we can avoid finding this out in the chain
@@ -1021,12 +1017,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
*/
get_task_struct(owner);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
next_lock, waiter, task);
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
return res;
}
@@ -1035,15 +1031,14 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
* Remove the top waiter from the current tasks pi waiter tree and
* queue it up.
*
- * Called with lock->wait_lock held.
+ * Called with lock->wait_lock held and interrupts disabled.
*/
static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
struct rt_mutex *lock)
{
struct rt_mutex_waiter *waiter;
- unsigned long flags;
- raw_spin_lock_irqsave(&current->pi_lock, flags);
+ raw_spin_lock(&current->pi_lock);
waiter = rt_mutex_top_waiter(lock);
@@ -1065,7 +1060,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
*/
lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
- raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ raw_spin_unlock(&current->pi_lock);
wake_q_add(wake_q, waiter->task);
}
@@ -1073,7 +1068,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
/*
* Remove a waiter from a lock and give up
*
- * Must be called with lock->wait_lock held and
+ * Must be called with lock->wait_lock held and interrupts disabled. I must
* have just failed to try_to_take_rt_mutex().
*/
static void remove_waiter(struct rt_mutex *lock,
@@ -1082,12 +1077,11 @@ static void remove_waiter(struct rt_mutex *lock,
bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex *next_lock;
- unsigned long flags;
- raw_spin_lock_irqsave(&current->pi_lock, flags);
+ raw_spin_lock(&current->pi_lock);
rt_mutex_dequeue(lock, waiter);
current->pi_blocked_on = NULL;
- raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ raw_spin_unlock(&current->pi_lock);
/*
* Only update priority if the waiter was the highest priority
@@ -1096,7 +1090,7 @@ static void remove_waiter(struct rt_mutex *lock,
if (!owner || !is_top_waiter)
return;
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock(&owner->pi_lock);
rt_mutex_dequeue_pi(owner, waiter);
@@ -1108,7 +1102,7 @@ static void remove_waiter(struct rt_mutex *lock,
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ raw_spin_unlock(&owner->pi_lock);
/*
* Don't walk the chain, if the owner task is not blocked
@@ -1120,12 +1114,12 @@ static void remove_waiter(struct rt_mutex *lock,
/* gets dropped in rt_mutex_adjust_prio_chain()! */
get_task_struct(owner);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
next_lock, NULL, current);
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
}
/*
@@ -1157,15 +1151,23 @@ void rt_mutex_adjust_pi(struct task_struct *task)
next_lock, NULL, task);
}
+void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
+{
+ debug_rt_mutex_init_waiter(waiter);
+ RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ RB_CLEAR_NODE(&waiter->tree_entry);
+ waiter->task = NULL;
+}
+
/**
* __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
* @lock: the rt_mutex to take
* @state: the state the task should block in (TASK_INTERRUPTIBLE
- * or TASK_UNINTERRUPTIBLE)
+ * or TASK_UNINTERRUPTIBLE)
* @timeout: the pre-initialized and started timer, or NULL for none
* @waiter: the pre-initialized rt_mutex_waiter
*
- * lock->wait_lock must be held by the caller.
+ * Must be called with lock->wait_lock held and interrupts disabled
*/
static int __sched
__rt_mutex_slowlock(struct rt_mutex *lock, int state,
@@ -1193,13 +1195,13 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
break;
}
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
debug_rt_mutex_print_deadlock(waiter);
schedule();
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
set_current_state(state);
}
@@ -1236,17 +1238,24 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
enum rtmutex_chainwalk chwalk)
{
struct rt_mutex_waiter waiter;
+ unsigned long flags;
int ret = 0;
- debug_rt_mutex_init_waiter(&waiter);
- RB_CLEAR_NODE(&waiter.pi_tree_entry);
- RB_CLEAR_NODE(&waiter.tree_entry);
+ rt_mutex_init_waiter(&waiter);
- raw_spin_lock(&lock->wait_lock);
+ /*
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
+ * be called in early boot if the cmpxchg() fast path is disabled
+ * (debug, no architecture support). In this case we will acquire the
+ * rtmutex with lock->wait_lock held. But we cannot unconditionally
+ * enable interrupts in that early boot case. So we need to use the
+ * irqsave/restore variants.
+ */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
/* Try to acquire the lock again: */
if (try_to_take_rt_mutex(lock, current, NULL)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return 0;
}
@@ -1275,7 +1284,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
*/
fixup_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/* Remove pending timer: */
if (unlikely(timeout))
@@ -1286,11 +1295,25 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
return ret;
}
+static inline int __rt_mutex_slowtrylock(struct rt_mutex *lock)
+{
+ int ret = try_to_take_rt_mutex(lock, current, NULL);
+
+ /*
+ * try_to_take_rt_mutex() sets the lock waiters bit
+ * unconditionally. Clean this up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
+ return ret;
+}
+
/*
* Slow path try-lock function:
*/
static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
{
+ unsigned long flags;
int ret;
/*
@@ -1302,20 +1325,14 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
return 0;
/*
- * The mutex has currently no owner. Lock the wait lock and
- * try to acquire the lock.
+ * The mutex has currently no owner. Lock the wait lock and try to
+ * acquire the lock. We use irqsave here to support early boot calls.
*/
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
- ret = try_to_take_rt_mutex(lock, current, NULL);
-
- /*
- * try_to_take_rt_mutex() sets the lock waiters bit
- * unconditionally. Clean this up.
- */
- fixup_rt_mutex_waiters(lock);
+ ret = __rt_mutex_slowtrylock(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return ret;
}
@@ -1327,11 +1344,12 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
struct wake_q_head *wake_q)
{
- raw_spin_lock(&lock->wait_lock);
+ unsigned long flags;
- debug_rt_mutex_unlock(lock);
+ /* irqsave required to support early boot calls */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
- rt_mutex_deadlock_account_unlock(current);
+ debug_rt_mutex_unlock(lock);
/*
* We must be careful here if the fast path is enabled. If we
@@ -1366,10 +1384,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
*/
while (!rt_mutex_has_waiters(lock)) {
/* Drops lock->wait_lock ! */
- if (unlock_rt_mutex_safe(lock) == true)
+ if (unlock_rt_mutex_safe(lock, flags) == true)
return false;
/* Relock the rtmutex and try again */
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
}
/*
@@ -1380,7 +1398,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
*/
mark_wakeup_next_waiter(wake_q, lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/* check PI boosting */
return true;
@@ -1398,11 +1416,10 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
enum rtmutex_chainwalk chwalk))
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 0;
- } else
- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
+
+ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
}
static inline int
@@ -1414,21 +1431,19 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
enum rtmutex_chainwalk chwalk))
{
if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
- likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 0;
- } else
- return slowfn(lock, state, timeout, chwalk);
+
+ return slowfn(lock, state, timeout, chwalk);
}
static inline int
rt_mutex_fasttrylock(struct rt_mutex *lock,
int (*slowfn)(struct rt_mutex *lock))
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 1;
- }
+
return slowfn(lock);
}
@@ -1438,19 +1453,18 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
struct wake_q_head *wqh))
{
WAKE_Q(wake_q);
+ bool deboost;
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
- rt_mutex_deadlock_account_unlock(current);
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return;
- } else {
- bool deboost = slowfn(lock, &wake_q);
+ deboost = slowfn(lock, &wake_q);
- wake_up_q(&wake_q);
+ wake_up_q(&wake_q);
- /* Undo pi boosting if necessary: */
- if (deboost)
- rt_mutex_adjust_prio(current);
- }
+ /* Undo pi boosting if necessary: */
+ if (deboost)
+ rt_mutex_adjust_prio(current);
}
/**
@@ -1484,16 +1498,16 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
/*
- * Futex variant with full deadlock detection.
+ * Futex variant, must not use fastpath.
*/
-int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *timeout)
+int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
{
- might_sleep();
+ return rt_mutex_slowtrylock(lock);
+}
- return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
- RT_MUTEX_FULL_CHAINWALK,
- rt_mutex_slowlock);
+int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock)
+{
+ return __rt_mutex_slowtrylock(lock);
}
/**
@@ -1552,20 +1566,38 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
/**
- * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
- * @lock: the rt_mutex to be unlocked
- *
- * Returns: true/false indicating whether priority adjustment is
- * required or not.
+ * Futex variant, that since futex variants do not use the fast-path, can be
+ * simple and will not need to retry.
*/
-bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh)
+bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wake_q)
{
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
- rt_mutex_deadlock_account_unlock(current);
- return false;
+ lockdep_assert_held(&lock->wait_lock);
+
+ debug_rt_mutex_unlock(lock);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ return false; /* done */
+ }
+
+ mark_wakeup_next_waiter(wake_q, lock);
+ return true; /* deboost and wakeups */
+}
+
+void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+{
+ WAKE_Q(wake_q);
+ bool deboost;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ deboost = __rt_mutex_futex_unlock(lock, &wake_q);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ if (deboost) {
+ wake_up_q(&wake_q);
+ rt_mutex_adjust_prio(current);
}
- return rt_mutex_slowunlock(lock, wqh);
}
/**
@@ -1622,7 +1654,6 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
__rt_mutex_init(lock, NULL);
debug_rt_mutex_proxy_lock(lock, proxy_owner);
rt_mutex_set_owner(lock, proxy_owner);
- rt_mutex_deadlock_account_lock(lock, proxy_owner);
}
/**
@@ -1633,12 +1664,10 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
* No locking. Caller has to do serializing itself
* Special API call for PI-futex support
*/
-void rt_mutex_proxy_unlock(struct rt_mutex *lock,
- struct task_struct *proxy_owner)
+void rt_mutex_proxy_unlock(struct rt_mutex *lock)
{
debug_rt_mutex_proxy_unlock(lock);
rt_mutex_set_owner(lock, NULL);
- rt_mutex_deadlock_account_unlock(proxy_owner);
}
/**
@@ -1660,10 +1689,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
{
int ret;
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
if (try_to_take_rt_mutex(lock, task, NULL)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 1;
}
@@ -1684,7 +1713,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
if (unlikely(ret))
remove_waiter(lock, waiter);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
debug_rt_mutex_print_deadlock(waiter);
@@ -1734,20 +1763,16 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
{
int ret;
- raw_spin_lock(&lock->wait_lock);
-
- set_current_state(TASK_INTERRUPTIBLE);
-
+ raw_spin_lock_irq(&lock->wait_lock);
/* sleep on the mutex */
+ set_current_state(TASK_INTERRUPTIBLE);
ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
-
/*
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
* have to fix that up.
*/
fixup_rt_mutex_waiters(lock);
-
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return ret;
}
@@ -1778,14 +1803,31 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
raw_spin_lock_irq(&lock->wait_lock);
/*
+ * Do an unconditional try-lock, this deals with the lock stealing
+ * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
+ * sets a NULL owner.
+ *
+ * We're not interested in the return value, because the subsequent
+ * test on rt_mutex_owner() will infer that. If the trylock succeeded,
+ * we will own the lock and it will have removed the waiter. If we
+ * failed the trylock, we're still not owner and we need to remove
+ * ourselves.
+ */
+ try_to_take_rt_mutex(lock, current, waiter);
+ /*
* Unless we're the owner; we're still enqueued on the wait_list.
* So check if we became owner, if not, take us off the wait_list.
*/
if (rt_mutex_owner(lock) != current) {
remove_waiter(lock, waiter);
- fixup_rt_mutex_waiters(lock);
cleanup = true;
}
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
+
raw_spin_unlock_irq(&lock->wait_lock);
return cleanup;
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
index c4060584c407..6607802efa8b 100644
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -11,8 +11,6 @@
*/
#define rt_mutex_deadlock_check(l) (0)
-#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
-#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
#define debug_rt_mutex_init_waiter(w) do { } while (0)
#define debug_rt_mutex_free_waiter(w) do { } while (0)
#define debug_rt_mutex_lock(l) do { } while (0)
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 6f8f68edb700..97c048c494f0 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -101,8 +101,8 @@ enum rtmutex_chainwalk {
extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner);
-extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
- struct task_struct *proxy_owner);
+extern void rt_mutex_proxy_unlock(struct rt_mutex *lock);
+extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task);
@@ -111,9 +111,13 @@ extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter);
extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter);
-extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
-extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh);
+extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+extern int __rt_mutex_futex_trylock(struct rt_mutex *l);
+
+extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
+extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wqh);
+
extern void rt_mutex_adjust_prio(struct task_struct *task);
#ifdef CONFIG_DEBUG_RT_MUTEXES
diff --git a/kernel/module.c b/kernel/module.c
index 2f695b6e1a3e..d84f5e38456f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1779,7 +1779,6 @@ static int mod_sysfs_init(struct module *mod)
if (err)
mod_kobject_put(mod);
- /* delay uevent until full sysfs population */
out:
return err;
}
@@ -1813,7 +1812,6 @@ static int mod_sysfs_setup(struct module *mod,
add_sect_attrs(mod, info);
add_notes_attrs(mod, info);
- kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
return 0;
out_unreg_param:
@@ -2109,6 +2107,21 @@ static int verify_export_symbols(struct module *mod)
return 0;
}
+static bool ignore_undef_symbol(Elf_Half emachine, const char *name)
+{
+ /*
+ * On x86, PIC code and Clang non-PIC code may have call foo@PLT. GNU as
+ * before 2.37 produces an unreferenced _GLOBAL_OFFSET_TABLE_ on x86-64.
+ * i386 has a similar problem but may not deserve a fix.
+ *
+ * If we ever have to ignore many symbols, consider refactoring the code to
+ * only warn if referenced by a relocation.
+ */
+ if (emachine == EM_386 || emachine == EM_X86_64)
+ return !strcmp(name, "_GLOBAL_OFFSET_TABLE_");
+ return false;
+}
+
/* Change all symbols so that st_value encodes the pointer directly. */
static int simplify_symbols(struct module *mod, const struct load_info *info)
{
@@ -2150,8 +2163,10 @@ static int simplify_symbols(struct module *mod, const struct load_info *info)
break;
}
- /* Ok if weak. */
- if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
+ /* Ok if weak or ignored. */
+ if (!ksym &&
+ (ELF_ST_BIND(sym[i].st_info) == STB_WEAK ||
+ ignore_undef_symbol(info->hdr->e_machine, name)))
break;
pr_warn("%s: Unknown symbol %s (err %li)\n",
@@ -3301,6 +3316,9 @@ static noinline int do_init_module(struct module *mod)
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_LIVE, mod);
+ /* Delay uevent until module has finished its init routine */
+ kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD);
+
/*
* We need to finish all async code before the module init sequence
* is done. This has potential to deadlock. For example, a newly
@@ -3589,6 +3607,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
return do_init_module(mod);
bug_cleanup:
+ mod->state = MODULE_STATE_GOING;
/* module_bug_cleanup needs module_mutex protection */
mutex_lock(&module_mutex);
module_bug_cleanup(mod);
diff --git a/kernel/padata.c b/kernel/padata.c
index ae036af3f012..c50975f43b34 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -65,15 +65,11 @@ static int padata_cpu_hash(struct parallel_data *pd)
static void padata_parallel_worker(struct work_struct *parallel_work)
{
struct padata_parallel_queue *pqueue;
- struct parallel_data *pd;
- struct padata_instance *pinst;
LIST_HEAD(local_list);
local_bh_disable();
pqueue = container_of(parallel_work,
struct padata_parallel_queue, work);
- pd = pqueue->pd;
- pinst = pd->pinst;
spin_lock(&pqueue->parallel.lock);
list_replace_init(&pqueue->parallel.list, &local_list);
@@ -136,6 +132,7 @@ int padata_do_parallel(struct padata_instance *pinst,
padata->cb_cpu = cb_cpu;
target_cpu = padata_cpu_hash(pd);
+ padata->cpu = target_cpu;
queue = per_cpu_ptr(pd->pqueue, target_cpu);
spin_lock(&queue->parallel.lock);
@@ -159,8 +156,6 @@ EXPORT_SYMBOL(padata_do_parallel);
* A pointer to the control struct of the next object that needs
* serialization, if present in one of the percpu reorder queues.
*
- * NULL, if all percpu reorder queues are empty.
- *
* -EINPROGRESS, if the next object that needs serialization will
* be parallel processed by another cpu and is not yet present in
* the cpu's reorder queue.
@@ -170,25 +165,12 @@ EXPORT_SYMBOL(padata_do_parallel);
*/
static struct padata_priv *padata_get_next(struct parallel_data *pd)
{
- int cpu, num_cpus;
- unsigned int next_nr, next_index;
struct padata_parallel_queue *next_queue;
struct padata_priv *padata;
struct padata_list *reorder;
+ int cpu = pd->cpu;
- num_cpus = cpumask_weight(pd->cpumask.pcpu);
-
- /*
- * Calculate the percpu reorder queue and the sequence
- * number of the next object.
- */
- next_nr = pd->processed;
- next_index = next_nr % num_cpus;
- cpu = padata_index_to_cpu(pd, next_index);
next_queue = per_cpu_ptr(pd->pqueue, cpu);
-
- padata = NULL;
-
reorder = &next_queue->reorder;
spin_lock(&reorder->lock);
@@ -199,7 +181,8 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
list_del_init(&padata->list);
atomic_dec(&pd->reorder_objects);
- pd->processed++;
+ pd->cpu = cpumask_next_wrap(cpu, pd->cpumask.pcpu, -1,
+ false);
spin_unlock(&reorder->lock);
goto out;
@@ -222,6 +205,7 @@ static void padata_reorder(struct parallel_data *pd)
struct padata_priv *padata;
struct padata_serial_queue *squeue;
struct padata_instance *pinst = pd->pinst;
+ struct padata_parallel_queue *next_queue;
/*
* We need to ensure that only one cpu can work on dequeueing of
@@ -240,12 +224,11 @@ static void padata_reorder(struct parallel_data *pd)
padata = padata_get_next(pd);
/*
- * All reorder queues are empty, or the next object that needs
- * serialization is parallel processed by another cpu and is
- * still on it's way to the cpu's reorder queue, nothing to
- * do for now.
+ * If the next object that needs serialization is parallel
+ * processed by another cpu and is still on it's way to the
+ * cpu's reorder queue, nothing to do for now.
*/
- if (!padata || PTR_ERR(padata) == -EINPROGRESS)
+ if (PTR_ERR(padata) == -EINPROGRESS)
break;
/*
@@ -254,7 +237,6 @@ static void padata_reorder(struct parallel_data *pd)
* so exit immediately.
*/
if (PTR_ERR(padata) == -ENODATA) {
- del_timer(&pd->timer);
spin_unlock_bh(&pd->lock);
return;
}
@@ -273,28 +255,27 @@ static void padata_reorder(struct parallel_data *pd)
/*
* The next object that needs serialization might have arrived to
- * the reorder queues in the meantime, we will be called again
- * from the timer function if no one else cares for it.
+ * the reorder queues in the meantime.
*
- * Ensure reorder_objects is read after pd->lock is dropped so we see
- * an increment from another task in padata_do_serial. Pairs with
+ * Ensure reorder queue is read after pd->lock is dropped so we see
+ * new objects from another task in padata_do_serial. Pairs with
* smp_mb__after_atomic in padata_do_serial.
*/
smp_mb();
- if (atomic_read(&pd->reorder_objects)
- && !(pinst->flags & PADATA_RESET))
- mod_timer(&pd->timer, jiffies + HZ);
- else
- del_timer(&pd->timer);
- return;
+ next_queue = per_cpu_ptr(pd->pqueue, pd->cpu);
+ if (!list_empty(&next_queue->reorder.list))
+ queue_work(pinst->wq, &pd->reorder_work);
}
-static void padata_reorder_timer(unsigned long arg)
+static void invoke_padata_reorder(struct work_struct *work)
{
- struct parallel_data *pd = (struct parallel_data *)arg;
+ struct parallel_data *pd;
+ local_bh_disable();
+ pd = container_of(work, struct parallel_data, reorder_work);
padata_reorder(pd);
+ local_bh_enable();
}
static void padata_serial_worker(struct work_struct *serial_work)
@@ -341,29 +322,22 @@ static void padata_serial_worker(struct work_struct *serial_work)
*/
void padata_do_serial(struct padata_priv *padata)
{
- int cpu;
- struct padata_parallel_queue *pqueue;
- struct parallel_data *pd;
-
- pd = padata->pd;
-
- cpu = get_cpu();
- pqueue = per_cpu_ptr(pd->pqueue, cpu);
+ struct parallel_data *pd = padata->pd;
+ struct padata_parallel_queue *pqueue = per_cpu_ptr(pd->pqueue,
+ padata->cpu);
spin_lock(&pqueue->reorder.lock);
- atomic_inc(&pd->reorder_objects);
list_add_tail(&padata->list, &pqueue->reorder.list);
+ atomic_inc(&pd->reorder_objects);
spin_unlock(&pqueue->reorder.lock);
/*
- * Ensure the atomic_inc of reorder_objects above is ordered correctly
+ * Ensure the addition to the reorder list is ordered correctly
* with the trylock of pd->lock in padata_reorder. Pairs with smp_mb
* in padata_reorder.
*/
smp_mb__after_atomic();
- put_cpu();
-
padata_reorder(pd);
}
EXPORT_SYMBOL(padata_do_serial);
@@ -412,9 +386,14 @@ static void padata_init_pqueues(struct parallel_data *pd)
struct padata_parallel_queue *pqueue;
cpu_index = 0;
- for_each_cpu(cpu, pd->cpumask.pcpu) {
+ for_each_possible_cpu(cpu) {
pqueue = per_cpu_ptr(pd->pqueue, cpu);
- pqueue->pd = pd;
+
+ if (!cpumask_test_cpu(cpu, pd->cpumask.pcpu)) {
+ pqueue->cpu_index = -1;
+ continue;
+ }
+
pqueue->cpu_index = cpu_index;
cpu_index++;
@@ -448,12 +427,13 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
padata_init_pqueues(pd);
padata_init_squeues(pd);
- setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
atomic_set(&pd->seq_nr, -1);
atomic_set(&pd->reorder_objects, 0);
atomic_set(&pd->refcnt, 1);
pd->pinst = pinst;
spin_lock_init(&pd->lock);
+ pd->cpu = cpumask_first(pd->cpumask.pcpu);
+ INIT_WORK(&pd->reorder_work, invoke_padata_reorder);
return pd;
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 3124cebaec31..7d73b30c55cc 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -779,17 +779,6 @@ static int software_resume(void)
/* Check if the device is there */
swsusp_resume_device = name_to_dev_t(resume_file);
-
- /*
- * name_to_dev_t is ineffective to verify parition if resume_file is in
- * integer format. (e.g. major:minor)
- */
- if (isdigit(resume_file[0]) && resume_wait) {
- int partno;
- while (!get_gendisk(swsusp_resume_device, &partno))
- msleep(10);
- }
-
if (!swsusp_resume_device) {
/*
* Some device discovery might still be in progress; we need
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index e53a976ca28e..b55dfb3e801f 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -2032,6 +2032,9 @@ static int __init console_setup(char *str)
char *s, *options, *brl_options = NULL;
int idx;
+ if (str[0] == 0)
+ return 1;
+
if (_braille_console_setup(&str, &brl_options))
return 1;
diff --git a/kernel/profile.c b/kernel/profile.c
index 9cd8e18e6f18..927a0345e259 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -38,7 +38,8 @@ struct profile_hit {
#define NR_PROFILE_GRP (NR_PROFILE_HIT/PROFILE_GRPSZ)
static atomic_t *prof_buffer;
-static unsigned long prof_len, prof_shift;
+static unsigned long prof_len;
+static unsigned short int prof_shift;
int prof_on __read_mostly;
EXPORT_SYMBOL_GPL(prof_on);
@@ -63,8 +64,8 @@ int profile_setup(char *str)
if (str[strlen(sleepstr)] == ',')
str += strlen(sleepstr) + 1;
if (get_option(&str, &par))
- prof_shift = par;
- pr_info("kernel sleep profiling enabled (shift: %ld)\n",
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
+ pr_info("kernel sleep profiling enabled (shift: %u)\n",
prof_shift);
#else
pr_warn("kernel sleep profiling requires CONFIG_SCHEDSTATS\n");
@@ -74,21 +75,21 @@ int profile_setup(char *str)
if (str[strlen(schedstr)] == ',')
str += strlen(schedstr) + 1;
if (get_option(&str, &par))
- prof_shift = par;
- pr_info("kernel schedule profiling enabled (shift: %ld)\n",
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
+ pr_info("kernel schedule profiling enabled (shift: %u)\n",
prof_shift);
} else if (!strncmp(str, kvmstr, strlen(kvmstr))) {
prof_on = KVM_PROFILING;
if (str[strlen(kvmstr)] == ',')
str += strlen(kvmstr) + 1;
if (get_option(&str, &par))
- prof_shift = par;
- pr_info("kernel KVM profiling enabled (shift: %ld)\n",
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
+ pr_info("kernel KVM profiling enabled (shift: %u)\n",
prof_shift);
} else if (get_option(&str, &par)) {
- prof_shift = par;
+ prof_shift = clamp(par, 0, BITS_PER_LONG - 1);
prof_on = CPU_PROFILING;
- pr_info("kernel profiling enabled (shift: %ld)\n",
+ pr_info("kernel profiling enabled (shift: %u)\n",
prof_shift);
}
return 1;
@@ -475,7 +476,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
unsigned long p = *ppos;
ssize_t read;
char *pnt;
- unsigned int sample_step = 1 << prof_shift;
+ unsigned long sample_step = 1UL << prof_shift;
profile_flip_buffers();
if (p >= (prof_len+1)*sizeof(unsigned int))
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index da8c358930fb..5a1d8cc7ef4e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -129,6 +129,21 @@ void __ptrace_unlink(struct task_struct *child)
spin_unlock(&child->sighand->siglock);
}
+static bool looks_like_a_spurious_pid(struct task_struct *task)
+{
+ if (task->exit_code != ((PTRACE_EVENT_EXEC << 8) | SIGTRAP))
+ return false;
+
+ if (task_pid_vnr(task) == task->ptrace_message)
+ return false;
+ /*
+ * The tracee changed its pid but the PTRACE_EVENT_EXEC event
+ * was not wait()'ed, most probably debugger targets the old
+ * leader which was destroyed in de_thread().
+ */
+ return true;
+}
+
/* Ensure that nothing can wake it up, even SIGKILL */
static bool ptrace_freeze_traced(struct task_struct *task)
{
@@ -139,7 +154,8 @@ static bool ptrace_freeze_traced(struct task_struct *task)
return ret;
spin_lock_irq(&task->sighand->siglock);
- if (task_is_traced(task) && !__fatal_signal_pending(task)) {
+ if (task_is_traced(task) && !looks_like_a_spurious_pid(task) &&
+ !__fatal_signal_pending(task)) {
task->state = __TASK_TRACED;
ret = true;
}
diff --git a/kernel/reboot.c b/kernel/reboot.c
index bd30a973fe94..2946ed1d99d4 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -512,22 +512,22 @@ static int __init reboot_setup(char *str)
break;
case 's':
- {
- int rc;
-
- if (isdigit(*(str+1))) {
- rc = kstrtoint(str+1, 0, &reboot_cpu);
- if (rc)
- return rc;
- } else if (str[1] == 'm' && str[2] == 'p' &&
- isdigit(*(str+3))) {
- rc = kstrtoint(str+3, 0, &reboot_cpu);
- if (rc)
- return rc;
- } else
+ if (isdigit(*(str+1)))
+ reboot_cpu = simple_strtoul(str+1, NULL, 0);
+ else if (str[1] == 'm' && str[2] == 'p' &&
+ isdigit(*(str+3)))
+ reboot_cpu = simple_strtoul(str+3, NULL, 0);
+ else
reboot_mode = REBOOT_SOFT;
+ if (reboot_cpu >= num_possible_cpus()) {
+ pr_err("Ignoring the CPU number in reboot= option. "
+ "CPU %d exceeds possible cpu number %d\n",
+ reboot_cpu, num_possible_cpus());
+ reboot_cpu = 0;
+ break;
+ }
break;
- }
+
case 'g':
reboot_mode = REBOOT_GPIO;
break;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d81bcc6362ff..4a0a754f24c8 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3439,7 +3439,8 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (dl_prio(prio)) {
struct task_struct *pi_task = rt_mutex_get_top_task(p);
if (!dl_prio(p->normal_prio) ||
- (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
+ (pi_task && dl_prio(pi_task->prio) &&
+ dl_entity_preempt(&pi_task->dl, &p->dl))) {
p->dl.dl_boosted = 1;
enqueue_flag |= ENQUEUE_REPLENISH;
} else
@@ -8265,8 +8266,9 @@ int sched_rr_handler(struct ctl_table *table, int write,
/* make sure that internally we keep jiffies */
/* also, writing zero resets timeslice to default */
if (!ret && write) {
- sched_rr_timeslice = sched_rr_timeslice <= 0 ?
- RR_TIMESLICE : msecs_to_jiffies(sched_rr_timeslice);
+ sched_rr_timeslice =
+ sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
+ msecs_to_jiffies(sysctl_sched_rr_timeslice);
}
mutex_unlock(&mutex);
return ret;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b42d2b8b283e..e00f17070cb2 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2325,7 +2325,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
/*
* We don't care about NUMA placement if we don't have memory.
*/
- if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
+ if ((curr->flags & (PF_EXITING | PF_KTHREAD)) || work->next != work)
return;
/*
@@ -2394,28 +2394,22 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
-static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
- long tg_weight;
+ long tg_weight, load, shares;
/*
- * Use this CPU's real-time load instead of the last load contribution
- * as the updating of the contribution is delayed, and we will use the
- * the real-time load to calc the share. See update_tg_load_avg().
+ * This really should be: cfs_rq->avg.load_avg, but instead we use
+ * cfs_rq->load.weight, which is its upper bound. This helps ramp up
+ * the shares for small weight interactive tasks.
*/
- tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight -= cfs_rq->tg_load_avg_contrib;
- tg_weight += cfs_rq->load.weight;
+ load = scale_load_down(cfs_rq->load.weight);
- return tg_weight;
-}
-
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
-{
- long tg_weight, load, shares;
+ tg_weight = atomic_long_read(&tg->load_avg);
- tg_weight = calc_tg_weight(tg, cfs_rq);
- load = cfs_rq->load.weight;
+ /* Ensure tg_weight >= load */
+ tg_weight -= cfs_rq->tg_load_avg_contrib;
+ tg_weight += load;
shares = (tg->shares * load);
if (tg_weight)
@@ -2434,6 +2428,7 @@ static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
return tg->shares;
}
# endif /* CONFIG_SMP */
+
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
unsigned long weight)
{
@@ -3857,7 +3852,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
{
struct hrtimer *refresh_timer = &cfs_b->period_timer;
- u64 remaining;
+ s64 remaining;
/* if the call-back is running a quota refresh is already occurring */
if (hrtimer_callback_running(refresh_timer))
@@ -3865,7 +3860,7 @@ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
/* is a quota refresh about to occur? */
remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
- if (remaining < min_expire)
+ if (remaining < (s64)min_expire)
return 1;
return 0;
@@ -5944,7 +5939,15 @@ static int detach_tasks(struct lb_env *env)
if (!can_migrate_task(p, env))
goto next;
- load = task_h_load(p);
+ /*
+ * Depending of the number of CPUs and tasks and the
+ * cgroup hierarchy, task_h_load() can return a null
+ * value. Make sure that env->imbalance decreases
+ * otherwise detach_tasks() will stop only after
+ * detaching up to loop_max tasks.
+ */
+ load = max_t(unsigned long, task_h_load(p), 1);
+
if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
goto next;
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 801b4ec40702..5ee5740635f3 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -9,6 +9,7 @@
#include <linux/irq_work.h>
int sched_rr_timeslice = RR_TIMESLICE;
+int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
diff --git a/kernel/sys.c b/kernel/sys.c
index 1855f1bf113e..ee8d83885367 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1183,11 +1183,13 @@ SYSCALL_DEFINE1(uname, struct old_utsname __user *, name)
SYSCALL_DEFINE1(olduname, struct oldold_utsname __user *, name)
{
- struct oldold_utsname tmp = {};
+ struct oldold_utsname tmp;
if (!name)
return -EFAULT;
+ memset(&tmp, 0, sizeof(tmp));
+
down_read(&uts_sem);
memcpy(&tmp.sysname, &utsname()->sysname, __OLD_UTS_LEN);
memcpy(&tmp.nodename, &utsname()->nodename, __OLD_UTS_LEN);
@@ -1773,13 +1775,6 @@ static int validate_prctl_map(struct prctl_mm_map *prctl_map)
error = -EINVAL;
/*
- * @brk should be after @end_data in traditional maps.
- */
- if (prctl_map->start_brk <= prctl_map->end_data ||
- prctl_map->brk <= prctl_map->end_data)
- goto out;
-
- /*
* Neither we should allow to override limits if they set.
*/
if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c2dddd335d06..ecbb1b764a82 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -412,7 +412,7 @@ static struct ctl_table kern_table[] = {
},
{
.procname = "sched_rr_timeslice_ms",
- .data = &sched_rr_timeslice,
+ .data = &sysctl_sched_rr_timeslice,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = sched_rr_handler,
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
index e24008c098c6..45a0a26023d4 100644
--- a/kernel/time/posix-clock.c
+++ b/kernel/time/posix-clock.c
@@ -25,8 +25,6 @@
#include <linux/syscalls.h>
#include <linux/uaccess.h>
-static void delete_clock(struct kref *kref);
-
/*
* Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
*/
@@ -168,7 +166,7 @@ static int posix_clock_open(struct inode *inode, struct file *fp)
err = 0;
if (!err) {
- kref_get(&clk->kref);
+ get_device(clk->dev);
fp->private_data = clk;
}
out:
@@ -184,7 +182,7 @@ static int posix_clock_release(struct inode *inode, struct file *fp)
if (clk->ops.release)
err = clk->ops.release(clk);
- kref_put(&clk->kref, delete_clock);
+ put_device(clk->dev);
fp->private_data = NULL;
@@ -206,38 +204,35 @@ static const struct file_operations posix_clock_file_operations = {
#endif
};
-int posix_clock_register(struct posix_clock *clk, dev_t devid)
+int posix_clock_register(struct posix_clock *clk, struct device *dev)
{
int err;
- kref_init(&clk->kref);
init_rwsem(&clk->rwsem);
cdev_init(&clk->cdev, &posix_clock_file_operations);
+ err = cdev_device_add(&clk->cdev, dev);
+ if (err) {
+ pr_err("%s unable to add device %d:%d\n",
+ dev_name(dev), MAJOR(dev->devt), MINOR(dev->devt));
+ return err;
+ }
clk->cdev.owner = clk->ops.owner;
- err = cdev_add(&clk->cdev, devid, 1);
+ clk->dev = dev;
- return err;
+ return 0;
}
EXPORT_SYMBOL_GPL(posix_clock_register);
-static void delete_clock(struct kref *kref)
-{
- struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
-
- if (clk->release)
- clk->release(clk);
-}
-
void posix_clock_unregister(struct posix_clock *clk)
{
- cdev_del(&clk->cdev);
+ cdev_device_del(&clk->cdev, clk->dev);
down_write(&clk->rwsem);
clk->zombie = true;
up_write(&clk->rwsem);
- kref_put(&clk->kref, delete_clock);
+ put_device(clk->dev);
}
EXPORT_SYMBOL_GPL(posix_clock_unregister);
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index 3d7588a2e97c..6ca409a46030 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -42,6 +42,7 @@
#include <linux/sched/sysctl.h>
#include <linux/slab.h>
#include <linux/compat.h>
+#include <linux/random.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 210b8e726a97..c142e100840e 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -15,6 +15,9 @@
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
*/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blktrace_api.h>
@@ -319,11 +322,12 @@ static void put_probe_ref(void)
static void blk_trace_cleanup(struct blk_trace *bt)
{
+ synchronize_rcu();
blk_trace_free(bt);
put_probe_ref();
}
-int blk_trace_remove(struct request_queue *q)
+static int __blk_trace_remove(struct request_queue *q)
{
struct blk_trace *bt;
@@ -336,6 +340,17 @@ int blk_trace_remove(struct request_queue *q)
return 0;
}
+
+int blk_trace_remove(struct request_queue *q)
+{
+ int ret;
+
+ mutex_lock(&q->blk_trace_mutex);
+ ret = __blk_trace_remove(q);
+ mutex_unlock(&q->blk_trace_mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(blk_trace_remove);
static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
@@ -469,6 +484,16 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
*/
strreplace(buts->name, '/', '_');
+ /*
+ * bdev can be NULL, as with scsi-generic, this is a helpful as
+ * we can be.
+ */
+ if (q->blk_trace) {
+ pr_warn("Concurrent blktraces are not allowed on %s\n",
+ buts->name);
+ return -EBUSY;
+ }
+
bt = kzalloc(sizeof(*bt), GFP_KERNEL);
if (!bt)
return -ENOMEM;
@@ -546,9 +571,8 @@ err:
return ret;
}
-int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
- struct block_device *bdev,
- char __user *arg)
+static int __blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev, char __user *arg)
{
struct blk_user_trace_setup buts;
int ret;
@@ -562,11 +586,24 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
return ret;
if (copy_to_user(arg, &buts, sizeof(buts))) {
- blk_trace_remove(q);
+ __blk_trace_remove(q);
return -EFAULT;
}
return 0;
}
+
+int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+ struct block_device *bdev,
+ char __user *arg)
+{
+ int ret;
+
+ mutex_lock(&q->blk_trace_mutex);
+ ret = __blk_trace_setup(q, name, dev, bdev, arg);
+ mutex_unlock(&q->blk_trace_mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(blk_trace_setup);
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
@@ -595,7 +632,7 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
return ret;
if (copy_to_user(arg, &buts.name, ARRAY_SIZE(buts.name))) {
- blk_trace_remove(q);
+ __blk_trace_remove(q);
return -EFAULT;
}
@@ -603,11 +640,13 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name,
}
#endif
-int blk_trace_startstop(struct request_queue *q, int start)
+static int __blk_trace_startstop(struct request_queue *q, int start)
{
int ret;
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ bt = rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex));
if (bt == NULL)
return -EINVAL;
@@ -642,8 +681,25 @@ int blk_trace_startstop(struct request_queue *q, int start)
return ret;
}
+
+int blk_trace_startstop(struct request_queue *q, int start)
+{
+ int ret;
+
+ mutex_lock(&q->blk_trace_mutex);
+ ret = __blk_trace_startstop(q, start);
+ mutex_unlock(&q->blk_trace_mutex);
+
+ return ret;
+}
EXPORT_SYMBOL_GPL(blk_trace_startstop);
+/*
+ * When reading or writing the blktrace sysfs files, the references to the
+ * opened sysfs or device files should prevent the underlying block device
+ * from being removed. So no further delete protection is really needed.
+ */
+
/**
* blk_trace_ioctl: - handle the ioctls associated with tracing
* @bdev: the block device
@@ -661,12 +717,12 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
if (!q)
return -ENXIO;
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&q->blk_trace_mutex);
switch (cmd) {
case BLKTRACESETUP:
bdevname(bdev, b);
- ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
+ ret = __blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
break;
#if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
case BLKTRACESETUP32:
@@ -677,17 +733,17 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
case BLKTRACESTART:
start = 1;
case BLKTRACESTOP:
- ret = blk_trace_startstop(q, start);
+ ret = __blk_trace_startstop(q, start);
break;
case BLKTRACETEARDOWN:
- ret = blk_trace_remove(q);
+ ret = __blk_trace_remove(q);
break;
default:
ret = -ENOTTY;
break;
}
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&q->blk_trace_mutex);
return ret;
}
@@ -698,10 +754,14 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
**/
void blk_trace_shutdown(struct request_queue *q)
{
- if (q->blk_trace) {
- blk_trace_startstop(q, 0);
- blk_trace_remove(q);
+ mutex_lock(&q->blk_trace_mutex);
+ if (rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex))) {
+ __blk_trace_startstop(q, 0);
+ __blk_trace_remove(q);
}
+
+ mutex_unlock(&q->blk_trace_mutex);
}
/*
@@ -722,10 +782,14 @@ void blk_trace_shutdown(struct request_queue *q)
static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
unsigned int nr_bytes, u32 what)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
what |= BLK_TC_ACT(BLK_TC_PC);
@@ -736,6 +800,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
__blk_add_trace(bt, blk_rq_pos(rq), nr_bytes,
rq->cmd_flags, what, rq->errors, 0, NULL);
}
+ rcu_read_unlock();
}
static void blk_add_trace_rq_abort(void *ignore,
@@ -785,13 +850,18 @@ static void blk_add_trace_rq_complete(void *ignore,
static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
u32 what, int error)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio->bi_rw, what, error, 0, NULL);
+ rcu_read_unlock();
}
static void blk_add_trace_bio_bounce(void *ignore,
@@ -836,10 +906,13 @@ static void blk_add_trace_getrq(void *ignore,
if (bio)
blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
else {
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt)
__blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
+ rcu_read_unlock();
}
}
@@ -851,27 +924,35 @@ static void blk_add_trace_sleeprq(void *ignore,
if (bio)
blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
else {
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt)
__blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
0, 0, NULL);
+ rcu_read_unlock();
}
}
static void blk_add_trace_plug(void *ignore, struct request_queue *q)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt)
__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+ rcu_read_unlock();
}
static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
unsigned int depth, bool explicit)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt) {
__be64 rpdu = cpu_to_be64(depth);
u32 what;
@@ -883,14 +964,17 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
__blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
}
+ rcu_read_unlock();
}
static void blk_add_trace_split(void *ignore,
struct request_queue *q, struct bio *bio,
unsigned int pdu)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
if (bt) {
__be64 rpdu = cpu_to_be64(pdu);
@@ -898,6 +982,7 @@ static void blk_add_trace_split(void *ignore,
bio->bi_iter.bi_size, bio->bi_rw, BLK_TA_SPLIT,
bio->bi_error, sizeof(rpdu), &rpdu);
}
+ rcu_read_unlock();
}
/**
@@ -917,11 +1002,15 @@ static void blk_add_trace_bio_remap(void *ignore,
struct request_queue *q, struct bio *bio,
dev_t dev, sector_t from)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
struct blk_io_trace_remap r;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
r.device_from = cpu_to_be32(dev);
r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
@@ -930,6 +1019,7 @@ static void blk_add_trace_bio_remap(void *ignore,
__blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size,
bio->bi_rw, BLK_TA_REMAP, bio->bi_error,
sizeof(r), &r);
+ rcu_read_unlock();
}
/**
@@ -950,11 +1040,15 @@ static void blk_add_trace_rq_remap(void *ignore,
struct request *rq, dev_t dev,
sector_t from)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
struct blk_io_trace_remap r;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
r.device_from = cpu_to_be32(dev);
r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
@@ -963,6 +1057,7 @@ static void blk_add_trace_rq_remap(void *ignore,
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
sizeof(r), &r);
+ rcu_read_unlock();
}
/**
@@ -980,10 +1075,14 @@ void blk_add_driver_data(struct request_queue *q,
struct request *rq,
void *data, size_t len)
{
- struct blk_trace *bt = q->blk_trace;
+ struct blk_trace *bt;
- if (likely(!bt))
+ rcu_read_lock();
+ bt = rcu_dereference(q->blk_trace);
+ if (likely(!bt)) {
+ rcu_read_unlock();
return;
+ }
if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
__blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
@@ -991,6 +1090,7 @@ void blk_add_driver_data(struct request_queue *q,
else
__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
BLK_TA_DRV_DATA, rq->errors, len, data);
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(blk_add_driver_data);
@@ -1481,7 +1581,16 @@ static int blk_trace_remove_queue(struct request_queue *q)
if (bt == NULL)
return -EINVAL;
+ if (bt->trace_state == Blktrace_running) {
+ bt->trace_state = Blktrace_stopped;
+ spin_lock_irq(&running_trace_lock);
+ list_del_init(&bt->running_list);
+ spin_unlock_irq(&running_trace_lock);
+ relay_flush(bt->rchan);
+ }
+
put_probe_ref();
+ synchronize_rcu();
blk_trace_free(bt);
return 0;
}
@@ -1642,6 +1751,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
struct hd_struct *p = dev_to_part(dev);
struct request_queue *q;
struct block_device *bdev;
+ struct blk_trace *bt;
ssize_t ret = -ENXIO;
bdev = bdget(part_devt(p));
@@ -1652,26 +1762,28 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
if (q == NULL)
goto out_bdput;
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&q->blk_trace_mutex);
+ bt = rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex));
if (attr == &dev_attr_enable) {
- ret = sprintf(buf, "%u\n", !!q->blk_trace);
+ ret = sprintf(buf, "%u\n", !!bt);
goto out_unlock_bdev;
}
- if (q->blk_trace == NULL)
+ if (bt == NULL)
ret = sprintf(buf, "disabled\n");
else if (attr == &dev_attr_act_mask)
- ret = blk_trace_mask2str(buf, q->blk_trace->act_mask);
+ ret = blk_trace_mask2str(buf, bt->act_mask);
else if (attr == &dev_attr_pid)
- ret = sprintf(buf, "%u\n", q->blk_trace->pid);
+ ret = sprintf(buf, "%u\n", bt->pid);
else if (attr == &dev_attr_start_lba)
- ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
+ ret = sprintf(buf, "%llu\n", bt->start_lba);
else if (attr == &dev_attr_end_lba)
- ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
+ ret = sprintf(buf, "%llu\n", bt->end_lba);
out_unlock_bdev:
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&q->blk_trace_mutex);
out_bdput:
bdput(bdev);
out:
@@ -1685,6 +1797,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
struct block_device *bdev;
struct request_queue *q;
struct hd_struct *p;
+ struct blk_trace *bt;
u64 value;
ssize_t ret = -EINVAL;
@@ -1713,10 +1826,12 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
if (q == NULL)
goto out_bdput;
- mutex_lock(&bdev->bd_mutex);
+ mutex_lock(&q->blk_trace_mutex);
+ bt = rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex));
if (attr == &dev_attr_enable) {
- if (!!value == !!q->blk_trace) {
+ if (!!value == !!bt) {
ret = 0;
goto out_unlock_bdev;
}
@@ -1728,22 +1843,25 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
}
ret = 0;
- if (q->blk_trace == NULL)
+ if (bt == NULL) {
ret = blk_trace_setup_queue(q, bdev);
+ bt = rcu_dereference_protected(q->blk_trace,
+ lockdep_is_held(&q->blk_trace_mutex));
+ }
if (ret == 0) {
if (attr == &dev_attr_act_mask)
- q->blk_trace->act_mask = value;
+ bt->act_mask = value;
else if (attr == &dev_attr_pid)
- q->blk_trace->pid = value;
+ bt->pid = value;
else if (attr == &dev_attr_start_lba)
- q->blk_trace->start_lba = value;
+ bt->start_lba = value;
else if (attr == &dev_attr_end_lba)
- q->blk_trace->end_lba = value;
+ bt->end_lba = value;
}
out_unlock_bdev:
- mutex_unlock(&bdev->bd_mutex);
+ mutex_unlock(&q->blk_trace_mutex);
out_bdput:
bdput(bdev);
out:
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 3dd40c736067..a71bdad638d5 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -206,6 +206,10 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
event->pmu->count)
return -EINVAL;
+ if (unlikely(event->attr.type != PERF_TYPE_HARDWARE &&
+ event->attr.type != PERF_TYPE_RAW))
+ return -EINVAL;
+
/*
* we don't know if the function is run successfully by the
* return value. It can be judged in other places, such as
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index e4c6f89b6b11..c5484723abda 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1943,12 +1943,18 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops,
static void print_ip_ins(const char *fmt, unsigned char *p)
{
+ char ins[MCOUNT_INSN_SIZE];
int i;
+ if (probe_kernel_read(ins, p, MCOUNT_INSN_SIZE)) {
+ printk(KERN_CONT "%s[FAULT] %px\n", fmt, p);
+ return;
+ }
+
printk(KERN_CONT "%s", fmt);
for (i = 0; i < MCOUNT_INSN_SIZE; i++)
- printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
+ printk(KERN_CONT "%s%02x", i ? ":" : "", ins[i]);
}
static struct ftrace_ops *
@@ -2823,8 +2829,11 @@ static int referenced_filters(struct dyn_ftrace *rec)
int cnt = 0;
for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) {
- if (ops_references_rec(ops, rec))
- cnt++;
+ if (ops_references_rec(ops, rec)) {
+ cnt++;
+ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS)
+ rec->flags |= FTRACE_FL_REGS;
+ }
}
return cnt;
@@ -2874,7 +2883,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
p = &pg->records[i];
if (test)
cnt += referenced_filters(p);
- p->flags = cnt;
+ p->flags += cnt;
/*
* Do the initial record conversion from mcount jump
@@ -4398,8 +4407,11 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
parser = &iter->parser;
if (trace_parser_loaded(parser)) {
+ int enable = !(iter->flags & FTRACE_ITER_NOTRACE);
+
parser->buffer[parser->idx] = 0;
- ftrace_match_records(iter->hash, parser->buffer, parser->idx);
+ ftrace_process_regex(iter->hash, parser->buffer,
+ parser->idx, enable);
}
trace_parser_put(parser);
@@ -5173,7 +5185,7 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op;
int bit;
- bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
+ bit = trace_test_and_set_recursion(TRACE_LIST_START);
if (bit < 0)
return;
@@ -5234,7 +5246,7 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
{
int bit;
- bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
+ bit = trace_test_and_set_recursion(TRACE_LIST_START);
if (bit < 0)
return;
@@ -5705,7 +5717,6 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
}
if (t->ret_stack == NULL) {
- atomic_set(&t->tracing_graph_pause, 0);
atomic_set(&t->trace_overrun, 0);
t->curr_ret_stack = -1;
/* Make sure the tasks see the -1 first: */
@@ -5917,7 +5928,6 @@ static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
static void
graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
{
- atomic_set(&t->tracing_graph_pause, 0);
atomic_set(&t->trace_overrun, 0);
t->ftrace_timestamp = 0;
/* make curr_ret_stack visible before we add the ret_stack */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1cf2402c6922..19b30ff90cc4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -416,14 +416,16 @@ struct rb_event_info {
/*
* Used for which event context the event is in.
- * NMI = 0
- * IRQ = 1
- * SOFTIRQ = 2
- * NORMAL = 3
+ * TRANSITION = 0
+ * NMI = 1
+ * IRQ = 2
+ * SOFTIRQ = 3
+ * NORMAL = 4
*
* See trace_recursive_lock() comment below for more details.
*/
enum {
+ RB_CTX_TRANSITION,
RB_CTX_NMI,
RB_CTX_IRQ,
RB_CTX_SOFTIRQ,
@@ -1659,18 +1661,18 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
{
struct ring_buffer_per_cpu *cpu_buffer;
unsigned long nr_pages;
- int cpu, err = 0;
+ int cpu, err;
/*
* Always succeed at resizing a non-existent buffer:
*/
if (!buffer)
- return size;
+ return 0;
/* Make sure the requested buffer exists */
if (cpu_id != RING_BUFFER_ALL_CPUS &&
!cpumask_test_cpu(cpu_id, buffer->cpumask))
- return size;
+ return 0;
nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
@@ -1810,7 +1812,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
}
mutex_unlock(&buffer->mutex);
- return size;
+ return 0;
out_err:
for_each_buffer_cpu(buffer, cpu) {
@@ -2585,10 +2587,10 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
* a bit of overhead in something as critical as function tracing,
* we use a bitmask trick.
*
- * bit 0 = NMI context
- * bit 1 = IRQ context
- * bit 2 = SoftIRQ context
- * bit 3 = normal context.
+ * bit 1 = NMI context
+ * bit 2 = IRQ context
+ * bit 3 = SoftIRQ context
+ * bit 4 = normal context.
*
* This works because this is the order of contexts that can
* preempt other contexts. A SoftIRQ never preempts an IRQ
@@ -2611,6 +2613,30 @@ rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
* The least significant bit can be cleared this way, and it
* just so happens that it is the same bit corresponding to
* the current context.
+ *
+ * Now the TRANSITION bit breaks the above slightly. The TRANSITION bit
+ * is set when a recursion is detected at the current context, and if
+ * the TRANSITION bit is already set, it will fail the recursion.
+ * This is needed because there's a lag between the changing of
+ * interrupt context and updating the preempt count. In this case,
+ * a false positive will be found. To handle this, one extra recursion
+ * is allowed, and this is done by the TRANSITION bit. If the TRANSITION
+ * bit is already set, then it is considered a recursion and the function
+ * ends. Otherwise, the TRANSITION bit is set, and that bit is returned.
+ *
+ * On the trace_recursive_unlock(), the TRANSITION bit will be the first
+ * to be cleared. Even if it wasn't the context that set it. That is,
+ * if an interrupt comes in while NORMAL bit is set and the ring buffer
+ * is called before preempt_count() is updated, since the check will
+ * be on the NORMAL bit, the TRANSITION bit will then be set. If an
+ * NMI then comes in, it will set the NMI bit, but when the NMI code
+ * does the trace_recursive_unlock() it will clear the TRANSTION bit
+ * and leave the NMI bit set. But this is fine, because the interrupt
+ * code that set the TRANSITION bit will then clear the NMI bit when it
+ * calls trace_recursive_unlock(). If another NMI comes in, it will
+ * set the TRANSITION bit and continue.
+ *
+ * Note: The TRANSITION bit only handles a single transition between context.
*/
static __always_inline int
@@ -2629,8 +2655,16 @@ trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
} else
bit = RB_CTX_NORMAL;
- if (unlikely(val & (1 << bit)))
- return 1;
+ if (unlikely(val & (1 << bit))) {
+ /*
+ * It is possible that this was called by transitioning
+ * between interrupt context, and preempt_count() has not
+ * been updated yet. In this case, use the TRANSITION bit.
+ */
+ bit = RB_CTX_TRANSITION;
+ if (val & (1 << bit))
+ return 1;
+ }
val |= (1 << bit);
cpu_buffer->current_context = val;
@@ -3052,10 +3086,30 @@ static bool rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
if (unlikely(!head))
return true;
- return reader->read == rb_page_commit(reader) &&
- (commit == reader ||
- (commit == head &&
- head->read == rb_page_commit(commit)));
+ /* Reader should exhaust content in reader page */
+ if (reader->read != rb_page_commit(reader))
+ return false;
+
+ /*
+ * If writers are committing on the reader page, knowing all
+ * committed content has been read, the ring buffer is empty.
+ */
+ if (commit == reader)
+ return true;
+
+ /*
+ * If writers are committing on a page other than reader page
+ * and head page, there should always be content to read.
+ */
+ if (commit != head)
+ return false;
+
+ /*
+ * Writers are committing on the head page, we just need
+ * to care about there're committed data, and the reader will
+ * swap reader page with head page when it is to read data.
+ */
+ return rb_page_commit(commit) == 0;
}
/**
@@ -4260,6 +4314,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return;
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
atomic_inc(&buffer->resize_disabled);
atomic_inc(&cpu_buffer->record_disabled);
@@ -4283,6 +4339,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
atomic_dec(&cpu_buffer->record_disabled);
atomic_dec(&buffer->resize_disabled);
+
+ mutex_unlock(&buffer->mutex);
}
EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 06efd18bf3e3..bc8b1fdbf1bb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -26,6 +26,7 @@
#include <linux/linkage.h>
#include <linux/uaccess.h>
#include <linux/kprobes.h>
+#include <linux/vmalloc.h>
#include <linux/ftrace.h>
#include <linux/module.h>
#include <linux/percpu.h>
@@ -1368,9 +1369,6 @@ struct saved_cmdlines_buffer {
};
static struct saved_cmdlines_buffer *savedcmd;
-/* temporary disable recording */
-static atomic_t trace_record_cmdline_disabled __read_mostly;
-
static inline char *get_saved_cmdlines(int idx)
{
return &savedcmd->saved_cmdlines[idx * TASK_COMM_LEN];
@@ -1561,10 +1559,13 @@ void trace_stop_cmdline_recording(void);
static int trace_save_cmdline(struct task_struct *tsk)
{
- unsigned pid, idx;
+ unsigned tpid, idx;
- if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
- return 0;
+ /* treat recording of idle task as a success */
+ if (!tsk->pid)
+ return 1;
+
+ tpid = tsk->pid & (PID_MAX_DEFAULT - 1);
/*
* It's not the end of the world if we don't get
@@ -1575,26 +1576,15 @@ static int trace_save_cmdline(struct task_struct *tsk)
if (!arch_spin_trylock(&trace_cmdline_lock))
return 0;
- idx = savedcmd->map_pid_to_cmdline[tsk->pid];
+ idx = savedcmd->map_pid_to_cmdline[tpid];
if (idx == NO_CMDLINE_MAP) {
idx = (savedcmd->cmdline_idx + 1) % savedcmd->cmdline_num;
- /*
- * Check whether the cmdline buffer at idx has a pid
- * mapped. We are going to overwrite that entry so we
- * need to clear the map_pid_to_cmdline. Otherwise we
- * would read the new comm for the old pid.
- */
- pid = savedcmd->map_cmdline_to_pid[idx];
- if (pid != NO_CMDLINE_MAP)
- savedcmd->map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
-
- savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
- savedcmd->map_pid_to_cmdline[tsk->pid] = idx;
-
+ savedcmd->map_pid_to_cmdline[tpid] = idx;
savedcmd->cmdline_idx = idx;
}
+ savedcmd->map_cmdline_to_pid[idx] = tsk->pid;
set_cmdline(idx, tsk->comm);
arch_spin_unlock(&trace_cmdline_lock);
@@ -1605,6 +1595,7 @@ static int trace_save_cmdline(struct task_struct *tsk)
static void __trace_find_cmdline(int pid, char comm[])
{
unsigned map;
+ int tpid;
if (!pid) {
strcpy(comm, "<idle>");
@@ -1616,16 +1607,16 @@ static void __trace_find_cmdline(int pid, char comm[])
return;
}
- if (pid > PID_MAX_DEFAULT) {
- strcpy(comm, "<...>");
- return;
+ tpid = pid & (PID_MAX_DEFAULT - 1);
+ map = savedcmd->map_pid_to_cmdline[tpid];
+ if (map != NO_CMDLINE_MAP) {
+ tpid = savedcmd->map_cmdline_to_pid[map];
+ if (tpid == pid) {
+ strlcpy(comm, get_saved_cmdlines(map), TASK_COMM_LEN);
+ return;
+ }
}
-
- map = savedcmd->map_pid_to_cmdline[pid];
- if (map != NO_CMDLINE_MAP)
- strcpy(comm, get_saved_cmdlines(map));
- else
- strcpy(comm, "<...>");
+ strcpy(comm, "<...>");
}
void trace_find_cmdline(int pid, char comm[])
@@ -1641,9 +1632,6 @@ void trace_find_cmdline(int pid, char comm[])
void tracing_record_cmdline(struct task_struct *tsk)
{
- if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
- return;
-
if (!__this_cpu_read(trace_cmdline_save))
return;
@@ -1706,7 +1694,7 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
__buffer_unlock_commit(buffer, event);
ftrace_trace_stack(tr, buffer, flags, 6, pc, NULL);
- ftrace_trace_userstack(buffer, flags, pc);
+ ftrace_trace_userstack(tr, buffer, flags, pc);
}
EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
@@ -1768,7 +1756,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr,
* two. They are that meaningful.
*/
ftrace_trace_stack(tr, buffer, flags, regs ? 0 : 4, pc, regs);
- ftrace_trace_userstack(buffer, flags, pc);
+ ftrace_trace_userstack(tr, buffer, flags, pc);
}
EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs);
@@ -1867,7 +1855,8 @@ static void __ftrace_trace_stack(struct ring_buffer *buffer,
size *= sizeof(unsigned long);
event = trace_buffer_lock_reserve(buffer, TRACE_STACK,
- sizeof(*entry) + size, flags, pc);
+ (sizeof(*entry) - sizeof(entry->caller)) + size,
+ flags, pc);
if (!event)
goto out;
entry = ring_buffer_event_data(event);
@@ -1941,14 +1930,15 @@ void trace_dump_stack(int skip)
static DEFINE_PER_CPU(int, user_stack_count);
void
-ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc)
+ftrace_trace_userstack(struct trace_array *tr,
+ struct ring_buffer *buffer, unsigned long flags, int pc)
{
struct trace_event_call *call = &event_user_stack;
struct ring_buffer_event *event;
struct userstack_entry *entry;
struct stack_trace trace;
- if (!(global_trace.trace_flags & TRACE_ITER_USERSTACKTRACE))
+ if (!(tr->trace_flags & TRACE_ITER_USERSTACKTRACE))
return;
/*
@@ -2271,6 +2261,9 @@ int trace_array_printk(struct trace_array *tr,
if (!(global_trace.trace_flags & TRACE_ITER_PRINTK))
return 0;
+ if (!tr)
+ return -ENOENT;
+
va_start(ap, fmt);
ret = trace_array_vprintk(tr, ip, fmt, ap);
va_end(ap);
@@ -2501,9 +2494,6 @@ static void *s_start(struct seq_file *m, loff_t *pos)
return ERR_PTR(-EBUSY);
#endif
- if (!iter->snapshot)
- atomic_inc(&trace_record_cmdline_disabled);
-
if (*pos != iter->pos) {
iter->ent = NULL;
iter->cpu = 0;
@@ -2546,9 +2536,6 @@ static void s_stop(struct seq_file *m, void *p)
return;
#endif
- if (!iter->snapshot)
- atomic_dec(&trace_record_cmdline_disabled);
-
trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
}
@@ -6621,6 +6608,19 @@ static int allocate_trace_buffers(struct trace_array *tr, int size)
*/
allocate_snapshot = false;
#endif
+
+ /*
+ * Because of some magic with the way alloc_percpu() works on
+ * x86_64, we need to synchronize the pgd of all the tables,
+ * otherwise the trace events that happen in x86_64 page fault
+ * handlers can't cope with accessing the chance that a
+ * alloc_percpu()'d memory might be touched in the page fault trace
+ * event. Oh, and we need to audit all other alloc_percpu() and vmalloc()
+ * calls in tracing, because something might get triggered within a
+ * page fault trace event!
+ */
+ vmalloc_sync_mappings();
+
return 0;
}
@@ -7260,7 +7260,7 @@ __init static int tracer_alloc_buffers(void)
goto out_free_buffer_mask;
/* Only allocate trace_printk buffers if a trace_printk exists */
- if (__stop___trace_bprintk_fmt != __start___trace_bprintk_fmt)
+ if (&__stop___trace_bprintk_fmt != &__start___trace_bprintk_fmt)
/* Must be called before global_trace.buffer is allocated */
trace_printk_init_buffers();
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 12a82a7ad5a6..d8032be31405 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -431,23 +431,8 @@ struct tracer {
* When function tracing occurs, the following steps are made:
* If arch does not support a ftrace feature:
* call internal function (uses INTERNAL bits) which calls...
- * If callback is registered to the "global" list, the list
- * function is called and recursion checks the GLOBAL bits.
- * then this function calls...
* The function callback, which can use the FTRACE bits to
* check for recursion.
- *
- * Now if the arch does not suppport a feature, and it calls
- * the global list function which calls the ftrace callback
- * all three of these steps will do a recursion protection.
- * There's no reason to do one if the previous caller already
- * did. The recursion that we are protecting against will
- * go through the same steps again.
- *
- * To prevent the multiple recursion checks, if a recursion
- * bit is set that is higher than the MAX bit of the current
- * check, then we know that the check was made by the previous
- * caller, and we can skip the current check.
*/
enum {
TRACE_BUFFER_BIT,
@@ -460,12 +445,14 @@ enum {
TRACE_FTRACE_NMI_BIT,
TRACE_FTRACE_IRQ_BIT,
TRACE_FTRACE_SIRQ_BIT,
+ TRACE_FTRACE_TRANSITION_BIT,
- /* INTERNAL_BITs must be greater than FTRACE_BITs */
+ /* Internal use recursion bits */
TRACE_INTERNAL_BIT,
TRACE_INTERNAL_NMI_BIT,
TRACE_INTERNAL_IRQ_BIT,
TRACE_INTERNAL_SIRQ_BIT,
+ TRACE_INTERNAL_TRANSITION_BIT,
TRACE_CONTROL_BIT,
@@ -487,12 +474,18 @@ enum {
#define TRACE_CONTEXT_BITS 4
#define TRACE_FTRACE_START TRACE_FTRACE_BIT
-#define TRACE_FTRACE_MAX ((1 << (TRACE_FTRACE_START + TRACE_CONTEXT_BITS)) - 1)
#define TRACE_LIST_START TRACE_INTERNAL_BIT
-#define TRACE_LIST_MAX ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
-#define TRACE_CONTEXT_MASK TRACE_LIST_MAX
+#define TRACE_CONTEXT_MASK ((1 << (TRACE_LIST_START + TRACE_CONTEXT_BITS)) - 1)
+
+enum {
+ TRACE_CTX_NMI,
+ TRACE_CTX_IRQ,
+ TRACE_CTX_SOFTIRQ,
+ TRACE_CTX_NORMAL,
+ TRACE_CTX_TRANSITION,
+};
static __always_inline int trace_get_context_bit(void)
{
@@ -500,30 +493,36 @@ static __always_inline int trace_get_context_bit(void)
if (in_interrupt()) {
if (in_nmi())
- bit = 0;
+ bit = TRACE_CTX_NMI;
else if (in_irq())
- bit = 1;
+ bit = TRACE_CTX_IRQ;
else
- bit = 2;
+ bit = TRACE_CTX_SOFTIRQ;
} else
- bit = 3;
+ bit = TRACE_CTX_NORMAL;
return bit;
}
-static __always_inline int trace_test_and_set_recursion(int start, int max)
+static __always_inline int trace_test_and_set_recursion(int start)
{
unsigned int val = current->trace_recursion;
int bit;
- /* A previous recursion check was made */
- if ((val & TRACE_CONTEXT_MASK) > max)
- return 0;
-
bit = trace_get_context_bit() + start;
- if (unlikely(val & (1 << bit)))
- return -1;
+ if (unlikely(val & (1 << bit))) {
+ /*
+ * It could be that preempt_count has not been updated during
+ * a switch between contexts. Allow for a single recursion.
+ */
+ bit = start + TRACE_CTX_TRANSITION;
+ if (trace_recursion_test(bit))
+ return -1;
+ trace_recursion_set(bit);
+ barrier();
+ return bit;
+ }
val |= 1 << bit;
current->trace_recursion = val;
@@ -536,9 +535,6 @@ static __always_inline void trace_clear_recursion(int bit)
{
unsigned int val = current->trace_recursion;
- if (!bit)
- return;
-
bit = 1 << bit;
val &= ~bit;
@@ -636,13 +632,15 @@ void update_max_tr_single(struct trace_array *tr,
#endif /* CONFIG_TRACER_MAX_TRACE */
#ifdef CONFIG_STACKTRACE
-void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags,
+void ftrace_trace_userstack(struct trace_array *tr,
+ struct ring_buffer *buffer, unsigned long flags,
int pc);
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc);
#else
-static inline void ftrace_trace_userstack(struct ring_buffer *buffer,
+static inline void ftrace_trace_userstack(struct trace_array *tr,
+ struct ring_buffer *buffer,
unsigned long flags, int pc)
{
}
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 0f06532a755b..b70233a9563f 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -93,33 +93,49 @@ u64 notrace trace_clock_global(void)
{
unsigned long flags;
int this_cpu;
- u64 now;
+ u64 now, prev_time;
local_irq_save(flags);
this_cpu = raw_smp_processor_id();
- now = sched_clock_cpu(this_cpu);
+
/*
- * If in an NMI context then dont risk lockups and return the
- * cpu_clock() time:
+ * The global clock "guarantees" that the events are ordered
+ * between CPUs. But if two events on two different CPUS call
+ * trace_clock_global at roughly the same time, it really does
+ * not matter which one gets the earlier time. Just make sure
+ * that the same CPU will always show a monotonic clock.
+ *
+ * Use a read memory barrier to get the latest written
+ * time that was recorded.
*/
- if (unlikely(in_nmi()))
- goto out;
+ smp_rmb();
+ prev_time = READ_ONCE(trace_clock_struct.prev_time);
+ now = sched_clock_cpu(this_cpu);
- arch_spin_lock(&trace_clock_struct.lock);
+ /* Make sure that now is always greater than or equal to prev_time */
+ if ((s64)(now - prev_time) < 0)
+ now = prev_time;
/*
- * TODO: if this happens often then maybe we should reset
- * my_scd->clock to prev_time+1, to make sure
- * we start ticking with the local clock from now on?
+ * If in an NMI context then dont risk lockups and simply return
+ * the current time.
*/
- if ((s64)(now - trace_clock_struct.prev_time) < 0)
- now = trace_clock_struct.prev_time + 1;
+ if (unlikely(in_nmi()))
+ goto out;
- trace_clock_struct.prev_time = now;
+ /* Tracing can cause strange recursion, always use a try lock */
+ if (arch_spin_trylock(&trace_clock_struct.lock)) {
+ /* Reread prev_time in case it was already updated */
+ prev_time = READ_ONCE(trace_clock_struct.prev_time);
+ if ((s64)(now - prev_time) < 0)
+ now = prev_time;
- arch_spin_unlock(&trace_clock_struct.lock);
+ trace_clock_struct.prev_time = now;
+ /* The unlock acts as the wmb for the above rmb */
+ arch_spin_unlock(&trace_clock_struct.lock);
+ }
out:
local_irq_restore(flags);
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index ee7b94a4810a..246db27dbdc9 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -178,7 +178,7 @@ FTRACE_ENTRY(kernel_stack, stack_entry,
F_STRUCT(
__field( int, size )
- __dynamic_array(unsigned long, caller )
+ __array( unsigned long, caller, FTRACE_STACK_ENTRIES )
),
F_printk("\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n\t=> (" IP_FMT ")\n"
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index bd4c0bb61ad7..b89e00c748f1 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -755,6 +755,8 @@ static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
char *event = NULL, *sub = NULL, *match;
int ret;
+ if (!tr)
+ return -ENOENT;
/*
* The buf format can be <subsystem>:<event-name>
* *:<event-name> means any event by that name.
@@ -1081,7 +1083,8 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
mutex_lock(&event_mutex);
list_for_each_entry(file, &tr->events, list) {
call = file->event_call;
- if (!trace_event_name(call) || !call->class || !call->class->reg)
+ if ((call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) ||
+ !trace_event_name(call) || !call->class || !call->class->reg)
continue;
if (system && strcmp(call->class->system, system->name) != 0)
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index 78346aba6980..94fca4d687ad 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -204,11 +204,17 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file)
static int trigger_process_regex(struct trace_event_file *file, char *buff)
{
- char *command, *next = buff;
+ char *command, *next;
struct event_command *p;
int ret = -EINVAL;
+ next = buff = skip_spaces(buff);
command = strsep(&next, ": \t");
+ if (next) {
+ next = skip_spaces(next);
+ if (!*next)
+ next = NULL;
+ }
command = (command[0] != '!') ? command : command + 1;
mutex_lock(&trigger_cmd_mutex);
@@ -615,8 +621,14 @@ event_trigger_callback(struct event_command *cmd_ops,
int ret;
/* separate the trigger from the filter (t:n [if filter]) */
- if (param && isdigit(param[0]))
+ if (param && isdigit(param[0])) {
trigger = strsep(&param, " \t");
+ if (param) {
+ param = skip_spaces(param);
+ if (!*param)
+ param = NULL;
+ }
+ }
trigger_ops = cmd_ops->get_trigger_ops(cmd, trigger);
@@ -1185,6 +1197,11 @@ event_enable_trigger_func(struct event_command *cmd_ops,
trigger = strsep(&param, " \t");
if (!trigger)
return -EINVAL;
+ if (param) {
+ param = skip_spaces(param);
+ if (!*param)
+ param = NULL;
+ }
system = strsep(&trigger, ":");
if (!trigger)
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index fcd41a166405..7adbfcf555fd 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -137,7 +137,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
pc = preempt_count();
preempt_disable_notrace();
- bit = trace_test_and_set_recursion(TRACE_FTRACE_START, TRACE_FTRACE_MAX);
+ bit = trace_test_and_set_recursion(TRACE_FTRACE_START);
if (bit < 0)
goto out;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index ca70d11b8aa7..f444f57f1338 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -490,8 +490,13 @@ trace_selftest_function_recursion(void)
unregister_ftrace_function(&test_rec_probe);
ret = -1;
- if (trace_selftest_recursion_cnt != 1) {
- pr_cont("*callback not called once (%d)* ",
+ /*
+ * Recursion allows for transitions between context,
+ * and may call the callback twice.
+ */
+ if (trace_selftest_recursion_cnt != 1 &&
+ trace_selftest_recursion_cnt != 2) {
+ pr_cont("*callback not called once (or twice) (%d)* ",
trace_selftest_recursion_cnt);
goto out;
}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index eda85bbf1c2e..a1f9be703002 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -59,6 +59,12 @@ struct tp_probes {
struct tracepoint_func probes[0];
};
+/* Called in removal of a func but failed to allocate a new tp_funcs */
+static void tp_stub_func(void)
+{
+ return;
+}
+
static inline void *allocate_probes(int count)
{
struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func)
@@ -97,6 +103,7 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
{
struct tracepoint_func *old, *new;
int nr_probes = 0;
+ int stub_funcs = 0;
int pos = -1;
if (WARN_ON(!tp_func->func))
@@ -113,14 +120,34 @@ func_add(struct tracepoint_func **funcs, struct tracepoint_func *tp_func,
if (old[nr_probes].func == tp_func->func &&
old[nr_probes].data == tp_func->data)
return ERR_PTR(-EEXIST);
+ if (old[nr_probes].func == tp_stub_func)
+ stub_funcs++;
}
}
- /* + 2 : one for new probe, one for NULL func */
- new = allocate_probes(nr_probes + 2);
+ /* + 2 : one for new probe, one for NULL func - stub functions */
+ new = allocate_probes(nr_probes + 2 - stub_funcs);
if (new == NULL)
return ERR_PTR(-ENOMEM);
if (old) {
- if (pos < 0) {
+ if (stub_funcs) {
+ /* Need to copy one at a time to remove stubs */
+ int probes = 0;
+
+ pos = -1;
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
+ if (old[nr_probes].func == tp_stub_func)
+ continue;
+ if (pos < 0 && old[nr_probes].prio < prio)
+ pos = probes++;
+ new[probes++] = old[nr_probes];
+ }
+ nr_probes = probes;
+ if (pos < 0)
+ pos = probes;
+ else
+ nr_probes--; /* Account for insertion */
+
+ } else if (pos < 0) {
pos = nr_probes;
memcpy(new, old, nr_probes * sizeof(struct tracepoint_func));
} else {
@@ -154,8 +181,9 @@ static void *func_remove(struct tracepoint_func **funcs,
/* (N -> M), (N > 1, M >= 0) probes */
if (tp_func->func) {
for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
- if (old[nr_probes].func == tp_func->func &&
- old[nr_probes].data == tp_func->data)
+ if ((old[nr_probes].func == tp_func->func &&
+ old[nr_probes].data == tp_func->data) ||
+ old[nr_probes].func == tp_stub_func)
nr_del++;
}
}
@@ -174,14 +202,32 @@ static void *func_remove(struct tracepoint_func **funcs,
/* N -> M, (N > 1, M > 0) */
/* + 1 for NULL */
new = allocate_probes(nr_probes - nr_del + 1);
- if (new == NULL)
- return ERR_PTR(-ENOMEM);
- for (i = 0; old[i].func; i++)
- if (old[i].func != tp_func->func
- || old[i].data != tp_func->data)
- new[j++] = old[i];
- new[nr_probes - nr_del].func = NULL;
- *funcs = new;
+ if (new) {
+ for (i = 0; old[i].func; i++)
+ if ((old[i].func != tp_func->func
+ || old[i].data != tp_func->data)
+ && old[i].func != tp_stub_func)
+ new[j++] = old[i];
+ new[nr_probes - nr_del].func = NULL;
+ *funcs = new;
+ } else {
+ /*
+ * Failed to allocate, replace the old function
+ * with calls to tp_stub_func.
+ */
+ for (i = 0; old[i].func; i++)
+ if (old[i].func == tp_func->func &&
+ old[i].data == tp_func->data) {
+ old[i].func = tp_stub_func;
+ /* Set the prio to the next event. */
+ if (old[i + 1].func)
+ old[i].prio =
+ old[i + 1].prio;
+ else
+ old[i].prio = -1;
+ }
+ *funcs = old;
+ }
}
debug_print_probes(*funcs);
return old;
@@ -234,10 +280,12 @@ static int tracepoint_remove_func(struct tracepoint *tp,
tp_funcs = rcu_dereference_protected(tp->funcs,
lockdep_is_held(&tracepoints_mutex));
old = func_remove(&tp_funcs, func);
- if (IS_ERR(old)) {
- WARN_ON_ONCE(PTR_ERR(old) != -ENOMEM);
+ if (WARN_ON_ONCE(IS_ERR(old)))
return PTR_ERR(old);
- }
+
+ if (tp_funcs == old)
+ /* Failed allocating new tp_funcs, replaced func with stub */
+ return 0;
if (!tp_funcs) {
/* Removed last function */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3fb2d45c0b42..b7eed05ea987 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1351,7 +1351,6 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
*/
WARN_ON_ONCE(!irqs_disabled());
- debug_work_activate(work);
/* if draining, only works from the same workqueue are allowed */
if (unlikely(wq->flags & __WQ_DRAINING) &&
@@ -1430,6 +1429,7 @@ retry:
worklist = &pwq->delayed_works;
}
+ debug_work_activate(work);
insert_work(pwq, work, worklist, work_flags);
spin_unlock(&pwq->pool->lock);
@@ -3309,15 +3309,21 @@ static void pwq_unbound_release_workfn(struct work_struct *work)
unbound_release_work);
struct workqueue_struct *wq = pwq->wq;
struct worker_pool *pool = pwq->pool;
- bool is_last;
+ bool is_last = false;
- if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
- return;
+ /*
+ * when @pwq is not linked, it doesn't hold any reference to the
+ * @wq, and @wq is invalid to access.
+ */
+ if (!list_empty(&pwq->pwqs_node)) {
+ if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
+ return;
- mutex_lock(&wq->mutex);
- list_del_rcu(&pwq->pwqs_node);
- is_last = list_empty(&wq->pwqs);
- mutex_unlock(&wq->mutex);
+ mutex_lock(&wq->mutex);
+ list_del_rcu(&pwq->pwqs_node);
+ is_last = list_empty(&wq->pwqs);
+ mutex_unlock(&wq->mutex);
+ }
mutex_lock(&wq_pool_mutex);
put_unbound_pool(pool);
@@ -3361,17 +3367,24 @@ static void pwq_adjust_max_active(struct pool_workqueue *pwq)
* is updated and visible.
*/
if (!freezable || !workqueue_freezing) {
+ bool kick = false;
+
pwq->max_active = wq->saved_max_active;
while (!list_empty(&pwq->delayed_works) &&
- pwq->nr_active < pwq->max_active)
+ pwq->nr_active < pwq->max_active) {
pwq_activate_first_delayed(pwq);
+ kick = true;
+ }
/*
* Need to kick a worker after thawed or an unbound wq's
- * max_active is bumped. It's a slow path. Do it always.
+ * max_active is bumped. In realtime scenarios, always kicking a
+ * worker will cause interference on the isolated cpu cores, so
+ * let's kick iff work items were activated.
*/
- wake_up_worker(pwq->pool);
+ if (kick)
+ wake_up_worker(pwq->pool);
} else {
pwq->max_active = 0;
}