summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorDeepak Nibade <dnibade@nvidia.com>2013-08-14 16:23:39 +0530
committerDeepak Nibade <dnibade@nvidia.com>2013-08-14 16:23:39 +0530
commit9282699d7fd7954f11d59254e568e5d4bfbbe71a (patch)
tree4d945260bc8eb63b0db0423ad55ad7021eb5d4ac /kernel
parent933d6b11dbd7fda89ac094321d0cd9992afb5592 (diff)
parent67e6589a34ea5360b00869aaaec4a844c29cf713 (diff)
Merge branch 'linux-3.4.57' into rel-17
Bug 1348440 Change-Id: If25c49f027dc2a69642f7ed4733e965962b2a5a2 Signed-off-by: Deepak Nibade <dnibade@nvidia.com>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit_tree.c2
-rw-r--r--kernel/cgroup.c2
-rw-r--r--kernel/cpu.c55
-rw-r--r--kernel/events/core.c291
-rw-r--r--kernel/events/hw_breakpoint.c4
-rw-r--r--kernel/events/internal.h4
-rw-r--r--kernel/hrtimer.c37
-rw-r--r--kernel/irq/manage.c6
-rw-r--r--kernel/kmod.c5
-rw-r--r--kernel/panic.c8
-rw-r--r--kernel/sched/clock.c26
-rw-r--r--kernel/sched/core.c6
-rw-r--r--kernel/sched/fair.c2
-rw-r--r--kernel/signal.c5
-rw-r--r--kernel/sys.c30
-rw-r--r--kernel/time/tick-broadcast.c17
-rw-r--r--kernel/time/tick-common.c1
-rw-r--r--kernel/time/tick-sched.c8
-rw-r--r--kernel/timer.c10
-rw-r--r--kernel/trace/Kconfig24
-rw-r--r--kernel/trace/ftrace.c54
-rw-r--r--kernel/trace/trace.c127
-rw-r--r--kernel/trace/trace.h7
-rw-r--r--kernel/trace/trace_events_filter.c4
-rw-r--r--kernel/trace/trace_irqsoff.c19
-rw-r--r--kernel/trace/trace_sched_wakeup.c18
-rw-r--r--kernel/trace/trace_selftest.c9
-rw-r--r--kernel/trace/trace_stack.c78
-rw-r--r--kernel/trace/trace_stat.c2
-rw-r--r--kernel/trace/trace_syscalls.c21
30 files changed, 617 insertions, 265 deletions
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 31fdc480b5c6..0caf1f8de0fb 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -608,9 +608,9 @@ void audit_trim_trees(void)
}
spin_unlock(&hash_lock);
trim_marked(tree);
- put_tree(tree);
drop_collected_mounts(root_mnt);
skip_it:
+ put_tree(tree);
mutex_lock(&audit_filter_mutex);
}
list_del(&cursor);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ad447684aa47..9e4e81bc0d3f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2050,7 +2050,7 @@ static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
if (!group)
return -ENOMEM;
/* pre-allocate to guarantee space while iterating in rcu read-side. */
- retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+ retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL);
if (retval)
goto out_free_group_list;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index acf5d8047458..ae08fe970529 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -125,6 +125,27 @@ static void cpu_hotplug_done(void)
mutex_unlock(&cpu_hotplug.lock);
}
+/*
+ * Wait for currently running CPU hotplug operations to complete (if any) and
+ * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
+ * the 'cpu_hotplug_disabled' flag. The same lock is also acquired by the
+ * hotplug path before performing hotplug operations. So acquiring that lock
+ * guarantees mutual exclusion from any currently running hotplug operations.
+ */
+void cpu_hotplug_disable(void)
+{
+ cpu_maps_update_begin();
+ cpu_hotplug_disabled = 1;
+ cpu_maps_update_done();
+}
+
+void cpu_hotplug_enable(void)
+{
+ cpu_maps_update_begin();
+ cpu_hotplug_disabled = 0;
+ cpu_maps_update_done();
+}
+
#else /* #if CONFIG_HOTPLUG_CPU */
static void cpu_hotplug_begin(void) {}
static void cpu_hotplug_done(void) {}
@@ -486,36 +507,6 @@ static int __init alloc_frozen_cpus(void)
core_initcall(alloc_frozen_cpus);
/*
- * Prevent regular CPU hotplug from racing with the freezer, by disabling CPU
- * hotplug when tasks are about to be frozen. Also, don't allow the freezer
- * to continue until any currently running CPU hotplug operation gets
- * completed.
- * To modify the 'cpu_hotplug_disabled' flag, we need to acquire the
- * 'cpu_add_remove_lock'. And this same lock is also taken by the regular
- * CPU hotplug path and released only after it is complete. Thus, we
- * (and hence the freezer) will block here until any currently running CPU
- * hotplug operation gets completed.
- */
-void cpu_hotplug_disable_before_freeze(void)
-{
- cpu_maps_update_begin();
- cpu_hotplug_disabled = 1;
- cpu_maps_update_done();
-}
-
-
-/*
- * When tasks have been thawed, re-enable regular CPU hotplug (which had been
- * disabled while beginning to freeze tasks).
- */
-void cpu_hotplug_enable_after_thaw(void)
-{
- cpu_maps_update_begin();
- cpu_hotplug_disabled = 0;
- cpu_maps_update_done();
-}
-
-/*
* When callbacks for CPU hotplug notifications are being executed, we must
* ensure that the state of the system with respect to the tasks being frozen
* or not, as reported by the notification, remains unchanged *throughout the
@@ -534,12 +525,12 @@ cpu_hotplug_pm_callback(struct notifier_block *nb,
case PM_SUSPEND_PREPARE:
case PM_HIBERNATION_PREPARE:
- cpu_hotplug_disable_before_freeze();
+ cpu_hotplug_disable();
break;
case PM_POST_SUSPEND:
case PM_POST_HIBERNATION:
- cpu_hotplug_enable_after_thaw();
+ cpu_hotplug_enable();
break;
default:
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 839a24f1ddeb..8e810bab9ca8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -193,9 +193,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
static void update_context_time(struct perf_event_context *ctx);
static u64 perf_event_time(struct perf_event *event);
-static void ring_buffer_attach(struct perf_event *event,
- struct ring_buffer *rb);
-
void __weak perf_event_print_debug(void) { }
extern __weak const char *perf_pmu_name(void)
@@ -253,9 +250,9 @@ perf_cgroup_match(struct perf_event *event)
return !event->cgrp || event->cgrp == cpuctx->cgrp;
}
-static inline void perf_get_cgroup(struct perf_event *event)
+static inline bool perf_tryget_cgroup(struct perf_event *event)
{
- css_get(&event->cgrp->css);
+ return css_tryget(&event->cgrp->css);
}
static inline void perf_put_cgroup(struct perf_event *event)
@@ -484,7 +481,11 @@ static inline int perf_cgroup_connect(int fd, struct perf_event *event,
event->cgrp = cgrp;
/* must be done before we fput() the file */
- perf_get_cgroup(event);
+ if (!perf_tryget_cgroup(event)) {
+ event->cgrp = NULL;
+ ret = -ENOENT;
+ goto out;
+ }
/*
* all events in a group must monitor
@@ -722,8 +723,18 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
{
struct perf_event_context *ctx;
- rcu_read_lock();
retry:
+ /*
+ * One of the few rules of preemptible RCU is that one cannot do
+ * rcu_read_unlock() while holding a scheduler (or nested) lock when
+ * part of the read side critical section was preemptible -- see
+ * rcu_read_unlock_special().
+ *
+ * Since ctx->lock nests under rq->lock we must ensure the entire read
+ * side critical section is non-preemptible.
+ */
+ preempt_disable();
+ rcu_read_lock();
ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
if (ctx) {
/*
@@ -739,6 +750,8 @@ retry:
raw_spin_lock_irqsave(&ctx->lock, *flags);
if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
raw_spin_unlock_irqrestore(&ctx->lock, *flags);
+ rcu_read_unlock();
+ preempt_enable();
goto retry;
}
@@ -748,6 +761,7 @@ retry:
}
}
rcu_read_unlock();
+ preempt_enable();
return ctx;
}
@@ -901,6 +915,15 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
}
/*
+ * Initialize event state based on the perf_event_attr::disabled.
+ */
+static inline void perf_event__state_init(struct perf_event *event)
+{
+ event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
+ PERF_EVENT_STATE_INACTIVE;
+}
+
+/*
* Called at perf_event creation and when events are attached/detached from a
* group.
*/
@@ -1705,7 +1728,16 @@ static int __perf_event_enable(void *info)
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
int err;
- if (WARN_ON_ONCE(!ctx->is_active))
+ /*
+ * There's a time window between 'ctx->is_active' check
+ * in perf_event_enable function and this place having:
+ * - IRQs on
+ * - ctx->lock unlocked
+ *
+ * where the task could be killed and 'ctx' deactivated
+ * by perf_event_exit_task.
+ */
+ if (!ctx->is_active)
return -EINVAL;
raw_spin_lock(&ctx->lock);
@@ -2849,6 +2881,7 @@ static void free_event_rcu(struct rcu_head *head)
}
static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
static void free_event(struct perf_event *event)
{
@@ -2873,15 +2906,30 @@ static void free_event(struct perf_event *event)
if (has_branch_stack(event)) {
static_key_slow_dec_deferred(&perf_sched_events);
/* is system-wide event */
- if (!(event->attach_state & PERF_ATTACH_TASK))
+ if (!(event->attach_state & PERF_ATTACH_TASK)) {
atomic_dec(&per_cpu(perf_branch_stack_events,
event->cpu));
+ }
}
}
if (event->rb) {
- ring_buffer_put(event->rb);
- event->rb = NULL;
+ struct ring_buffer *rb;
+
+ /*
+ * Can happen when we close an event with re-directed output.
+ *
+ * Since we have a 0 refcount, perf_mmap_close() will skip
+ * over us; possibly making our ring_buffer_put() the last.
+ */
+ mutex_lock(&event->mmap_mutex);
+ rb = event->rb;
+ if (rb) {
+ rcu_assign_pointer(event->rb, NULL);
+ ring_buffer_detach(event, rb);
+ ring_buffer_put(rb); /* could be last */
+ }
+ mutex_unlock(&event->mmap_mutex);
}
if (is_cgroup_event(event))
@@ -3119,30 +3167,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
unsigned int events = POLL_HUP;
/*
- * Race between perf_event_set_output() and perf_poll(): perf_poll()
- * grabs the rb reference but perf_event_set_output() overrides it.
- * Here is the timeline for two threads T1, T2:
- * t0: T1, rb = rcu_dereference(event->rb)
- * t1: T2, old_rb = event->rb
- * t2: T2, event->rb = new rb
- * t3: T2, ring_buffer_detach(old_rb)
- * t4: T1, ring_buffer_attach(rb1)
- * t5: T1, poll_wait(event->waitq)
- *
- * To avoid this problem, we grab mmap_mutex in perf_poll()
- * thereby ensuring that the assignment of the new ring buffer
- * and the detachment of the old buffer appear atomic to perf_poll()
+ * Pin the event->rb by taking event->mmap_mutex; otherwise
+ * perf_event_set_output() can swizzle our rb and make us miss wakeups.
*/
mutex_lock(&event->mmap_mutex);
-
- rcu_read_lock();
- rb = rcu_dereference(event->rb);
- if (rb) {
- ring_buffer_attach(event, rb);
+ rb = event->rb;
+ if (rb)
events = atomic_xchg(&rb->poll, 0);
- }
- rcu_read_unlock();
-
mutex_unlock(&event->mmap_mutex);
poll_wait(file, &event->waitq, wait);
@@ -3459,16 +3490,12 @@ static void ring_buffer_attach(struct perf_event *event,
return;
spin_lock_irqsave(&rb->event_lock, flags);
- if (!list_empty(&event->rb_entry))
- goto unlock;
-
- list_add(&event->rb_entry, &rb->event_list);
-unlock:
+ if (list_empty(&event->rb_entry))
+ list_add(&event->rb_entry, &rb->event_list);
spin_unlock_irqrestore(&rb->event_lock, flags);
}
-static void ring_buffer_detach(struct perf_event *event,
- struct ring_buffer *rb)
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
{
unsigned long flags;
@@ -3487,13 +3514,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
rcu_read_lock();
rb = rcu_dereference(event->rb);
- if (!rb)
- goto unlock;
-
- list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
- wake_up_all(&event->waitq);
-
-unlock:
+ if (rb) {
+ list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+ wake_up_all(&event->waitq);
+ }
rcu_read_unlock();
}
@@ -3522,18 +3546,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
static void ring_buffer_put(struct ring_buffer *rb)
{
- struct perf_event *event, *n;
- unsigned long flags;
-
if (!atomic_dec_and_test(&rb->refcount))
return;
- spin_lock_irqsave(&rb->event_lock, flags);
- list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
- list_del_init(&event->rb_entry);
- wake_up_all(&event->waitq);
- }
- spin_unlock_irqrestore(&rb->event_lock, flags);
+ WARN_ON_ONCE(!list_empty(&rb->event_list));
call_rcu(&rb->rcu_head, rb_free_rcu);
}
@@ -3543,26 +3559,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)
struct perf_event *event = vma->vm_file->private_data;
atomic_inc(&event->mmap_count);
+ atomic_inc(&event->rb->mmap_count);
}
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
static void perf_mmap_close(struct vm_area_struct *vma)
{
struct perf_event *event = vma->vm_file->private_data;
- if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
- unsigned long size = perf_data_size(event->rb);
- struct user_struct *user = event->mmap_user;
- struct ring_buffer *rb = event->rb;
+ struct ring_buffer *rb = event->rb;
+ struct user_struct *mmap_user = rb->mmap_user;
+ int mmap_locked = rb->mmap_locked;
+ unsigned long size = perf_data_size(rb);
+
+ atomic_dec(&rb->mmap_count);
+
+ if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+ return;
+
+ /* Detach current event from the buffer. */
+ rcu_assign_pointer(event->rb, NULL);
+ ring_buffer_detach(event, rb);
+ mutex_unlock(&event->mmap_mutex);
- atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
- vma->vm_mm->pinned_vm -= event->mmap_locked;
- rcu_assign_pointer(event->rb, NULL);
- ring_buffer_detach(event, rb);
+ /* If there's still other mmap()s of this buffer, we're done. */
+ if (atomic_read(&rb->mmap_count)) {
+ ring_buffer_put(rb); /* can't be last */
+ return;
+ }
+
+ /*
+ * No other mmap()s, detach from all other events that might redirect
+ * into the now unreachable buffer. Somewhat complicated by the
+ * fact that rb::event_lock otherwise nests inside mmap_mutex.
+ */
+again:
+ rcu_read_lock();
+ list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+ if (!atomic_long_inc_not_zero(&event->refcount)) {
+ /*
+ * This event is en-route to free_event() which will
+ * detach it and remove it from the list.
+ */
+ continue;
+ }
+ rcu_read_unlock();
+
+ mutex_lock(&event->mmap_mutex);
+ /*
+ * Check we didn't race with perf_event_set_output() which can
+ * swizzle the rb from under us while we were waiting to
+ * acquire mmap_mutex.
+ *
+ * If we find a different rb; ignore this event, a next
+ * iteration will no longer find it on the list. We have to
+ * still restart the iteration to make sure we're not now
+ * iterating the wrong list.
+ */
+ if (event->rb == rb) {
+ rcu_assign_pointer(event->rb, NULL);
+ ring_buffer_detach(event, rb);
+ ring_buffer_put(rb); /* can't be last, we still have one */
+ }
mutex_unlock(&event->mmap_mutex);
+ put_event(event);
- ring_buffer_put(rb);
- free_uid(user);
+ /*
+ * Restart the iteration; either we're on the wrong list or
+ * destroyed its integrity by doing a deletion.
+ */
+ goto again;
}
+ rcu_read_unlock();
+
+ /*
+ * It could be there's still a few 0-ref events on the list; they'll
+ * get cleaned up by free_event() -- they'll also still have their
+ * ref on the rb and will free it whenever they are done with it.
+ *
+ * Aside from that, this buffer is 'fully' detached and unmapped,
+ * undo the VM accounting.
+ */
+
+ atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+ vma->vm_mm->pinned_vm -= mmap_locked;
+ free_uid(mmap_user);
+
+ ring_buffer_put(rb); /* could be last */
}
static const struct vm_operations_struct perf_mmap_vmops = {
@@ -3612,12 +3702,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
return -EINVAL;
WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
mutex_lock(&event->mmap_mutex);
if (event->rb) {
- if (event->rb->nr_pages == nr_pages)
- atomic_inc(&event->rb->refcount);
- else
+ if (event->rb->nr_pages != nr_pages) {
ret = -EINVAL;
+ goto unlock;
+ }
+
+ if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+ /*
+ * Raced against perf_mmap_close() through
+ * perf_event_set_output(). Try again, hope for better
+ * luck.
+ */
+ mutex_unlock(&event->mmap_mutex);
+ goto again;
+ }
+
goto unlock;
}
@@ -3658,12 +3760,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
ret = -ENOMEM;
goto unlock;
}
- rcu_assign_pointer(event->rb, rb);
+
+ atomic_set(&rb->mmap_count, 1);
+ rb->mmap_locked = extra;
+ rb->mmap_user = get_current_user();
atomic_long_add(user_extra, &user->locked_vm);
- event->mmap_locked = extra;
- event->mmap_user = get_current_user();
- vma->vm_mm->pinned_vm += event->mmap_locked;
+ vma->vm_mm->pinned_vm += extra;
+
+ ring_buffer_attach(event, rb);
+ rcu_assign_pointer(event->rb, rb);
perf_event_update_userpage(event);
@@ -3672,7 +3778,11 @@ unlock:
atomic_inc(&event->mmap_count);
mutex_unlock(&event->mmap_mutex);
- vma->vm_flags |= VM_RESERVED;
+ /*
+ * Since pinned accounting is per vm we cannot allow fork() to copy our
+ * vma.
+ */
+ vma->vm_flags |= VM_DONTCOPY | VM_RESERVED;
vma->vm_ops = &perf_mmap_vmops;
return ret;
@@ -5961,8 +6071,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
event->overflow_handler = overflow_handler;
event->overflow_handler_context = context;
- if (attr->disabled)
- event->state = PERF_EVENT_STATE_OFF;
+ perf_event__state_init(event);
pmu = NULL;
@@ -6161,6 +6270,8 @@ set:
if (atomic_read(&event->mmap_count))
goto unlock;
+ old_rb = event->rb;
+
if (output_event) {
/* get the rb we want to redirect to */
rb = ring_buffer_get(output_event);
@@ -6168,16 +6279,28 @@ set:
goto unlock;
}
- old_rb = event->rb;
- rcu_assign_pointer(event->rb, rb);
if (old_rb)
ring_buffer_detach(event, old_rb);
+
+ if (rb)
+ ring_buffer_attach(event, rb);
+
+ rcu_assign_pointer(event->rb, rb);
+
+ if (old_rb) {
+ ring_buffer_put(old_rb);
+ /*
+ * Since we detached before setting the new rb, so that we
+ * could attach the new rb, we could have missed a wakeup.
+ * Provide it now.
+ */
+ wake_up_all(&event->waitq);
+ }
+
ret = 0;
unlock:
mutex_unlock(&event->mmap_mutex);
- if (old_rb)
- ring_buffer_put(old_rb);
out:
return ret;
}
@@ -6370,9 +6493,17 @@ SYSCALL_DEFINE5(perf_event_open,
mutex_lock(&gctx->mutex);
perf_remove_from_context(group_leader);
+
+ /*
+ * Removing from the context ends up with disabled
+ * event. What we want here is event in the initial
+ * startup state, ready to be add into new context.
+ */
+ perf_event__state_init(group_leader);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
perf_remove_from_context(sibling);
+ perf_event__state_init(sibling);
put_ctx(gctx);
}
mutex_unlock(&gctx->mutex);
@@ -6844,7 +6975,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
* child.
*/
- child_ctx = alloc_perf_context(event->pmu, child);
+ child_ctx = alloc_perf_context(parent_ctx->pmu, child);
if (!child_ctx)
return -ENOMEM;
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index bb38c4d3ee12..fc8bfcf3a3da 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -147,7 +147,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
return;
}
- for_each_online_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
unsigned int nr;
nr = per_cpu(nr_cpu_bp_pinned[type], cpu);
@@ -233,7 +233,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
if (cpu >= 0) {
toggle_bp_task_slot(bp, cpu, enable, type, weight);
} else {
- for_each_online_cpu(cpu)
+ for_each_possible_cpu(cpu)
toggle_bp_task_slot(bp, cpu, enable, type, weight);
}
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index b0b107f90afc..b400e6429c02 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -30,6 +30,10 @@ struct ring_buffer {
spinlock_t event_lock;
struct list_head event_list;
+ atomic_t mmap_count;
+ unsigned long mmap_locked;
+ struct user_struct *mmap_user;
+
struct perf_event_mmap_page *user_page;
void *data_pages[0];
};
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cdd5607c0ceb..a57ef25867e3 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -61,6 +61,7 @@
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
.clock_base =
{
{
@@ -297,6 +298,10 @@ ktime_t ktime_sub_ns(const ktime_t kt, u64 nsec)
} else {
unsigned long rem = do_div(nsec, NSEC_PER_SEC);
+ /* Make sure nsec fits into long */
+ if (unlikely(nsec > KTIME_SEC_MAX))
+ return (ktime_t){ .tv64 = KTIME_MAX };
+
tmp = ktime_set((long)nsec, rem);
}
@@ -702,17 +707,20 @@ static int hrtimer_switch_to_hres(void)
return 1;
}
+static void clock_was_set_work(struct work_struct *work)
+{
+ clock_was_set();
+}
+
+static DECLARE_WORK(hrtimer_work, clock_was_set_work);
+
/*
- * Called from timekeeping code to reprogramm the hrtimer interrupt
- * device. If called from the timer interrupt context we defer it to
- * softirq context.
+ * Called from timekeeping and resume code to reprogramm the hrtimer
+ * interrupt device on all cpus.
*/
void clock_was_set_delayed(void)
{
- struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
-
- cpu_base->clock_was_set = 1;
- __raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+ schedule_work(&hrtimer_work);
}
#else
@@ -761,8 +769,10 @@ void hrtimers_resume(void)
WARN_ONCE(!irqs_disabled(),
KERN_INFO "hrtimers_resume() called with IRQs enabled!");
+ /* Retrigger on the local CPU */
retrigger_next_event(NULL);
- timerfd_clock_was_set();
+ /* And schedule a retrigger for all others */
+ clock_was_set_delayed();
}
static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
@@ -1307,6 +1317,8 @@ retry:
expires = ktime_sub(hrtimer_get_expires(timer),
base->offset);
+ if (expires.tv64 < 0)
+ expires.tv64 = KTIME_MAX;
if (expires.tv64 < expires_next.tv64)
expires_next = expires;
break;
@@ -1409,13 +1421,6 @@ void hrtimer_peek_ahead_timers(void)
static void run_hrtimer_softirq(struct softirq_action *h)
{
- struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
-
- if (cpu_base->clock_was_set) {
- cpu_base->clock_was_set = 0;
- clock_was_set();
- }
-
hrtimer_peek_ahead_timers();
}
@@ -1640,8 +1645,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;
- raw_spin_lock_init(&cpu_base->lock);
-
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
cpu_base->clock_base[i].cpu_base = cpu_base;
timerqueue_init_head(&cpu_base->clock_base[i].active);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7684920f4f66..86a500d7ea59 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -546,9 +546,9 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
return 0;
if (irq_settings_can_request(desc)) {
- if (desc->action)
- if (irqflags & desc->action->flags & IRQF_SHARED)
- canrequest =1;
+ if (!desc->action ||
+ irqflags & desc->action->flags & IRQF_SHARED)
+ canrequest = 1;
}
irq_put_desc_unlock(desc, flags);
return canrequest;
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 05698a7415fe..f2490e1dd51f 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -541,6 +541,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
int retval = 0;
helper_lock();
+ if (!sub_info->path) {
+ retval = -EINVAL;
+ goto out;
+ }
+
if (sub_info->path[0] == '\0')
goto out;
diff --git a/kernel/panic.c b/kernel/panic.c
index 90fd443165df..9ce61102c01e 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -81,6 +81,14 @@ void panic(const char *fmt, ...)
int state = 0;
/*
+ * Disable local interrupts. This will prevent panic_smp_self_stop
+ * from deadlocking the first cpu that invokes the panic, since
+ * there is nothing to prevent an interrupt handler (that runs
+ * after the panic_lock is acquired) from invoking panic again.
+ */
+ local_irq_disable();
+
+ /*
* It's possible to come here directly from a panic-assertion and
* not have preempt disabled. Some functions called from here want
* preempt to be disabled. No point enabling it later though...
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c685e31492df..c3ae1446461c 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
u64 this_clock, remote_clock;
u64 *ptr, old_val, val;
+#if BITS_PER_LONG != 64
+again:
+ /*
+ * Careful here: The local and the remote clock values need to
+ * be read out atomic as we need to compare the values and
+ * then update either the local or the remote side. So the
+ * cmpxchg64 below only protects one readout.
+ *
+ * We must reread via sched_clock_local() in the retry case on
+ * 32bit as an NMI could use sched_clock_local() via the
+ * tracer and hit between the readout of
+ * the low32bit and the high 32bit portion.
+ */
+ this_clock = sched_clock_local(my_scd);
+ /*
+ * We must enforce atomic readout on 32bit, otherwise the
+ * update on the remote cpu can hit inbetween the readout of
+ * the low32bit and the high 32bit portion.
+ */
+ remote_clock = cmpxchg64(&scd->clock, 0, 0);
+#else
+ /*
+ * On 64bit the read of [my]scd->clock is atomic versus the
+ * update, so we can avoid the above 32bit dance.
+ */
sched_clock_local(my_scd);
again:
this_clock = my_scd->clock;
remote_clock = scd->clock;
+#endif
/*
* Use the opportunity that we have both locks
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9b20f3e88bd0..99d701ee206c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1653,8 +1653,10 @@ static void try_to_wake_up_local(struct task_struct *p)
{
struct rq *rq = task_rq(p);
- BUG_ON(rq != this_rq());
- BUG_ON(p == current);
+ if (WARN_ON_ONCE(rq != this_rq()) ||
+ WARN_ON_ONCE(p == current))
+ return;
+
lockdep_assert_held(&rq->lock);
if (!raw_spin_trylock(&p->pi_lock)) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e9553640c1c3..da4512f578bf 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5511,7 +5511,7 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
* idle runqueue:
*/
if (rq->cfs.load.weight)
- rr_interval = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
+ rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
return rr_interval;
}
diff --git a/kernel/signal.c b/kernel/signal.c
index 563b7711d234..ee44fc784d73 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -482,6 +482,9 @@ flush_signal_handlers(struct task_struct *t, int force_default)
if (force_default || ka->sa.sa_handler != SIG_IGN)
ka->sa.sa_handler = SIG_DFL;
ka->sa.sa_flags = 0;
+#ifdef __ARCH_HAS_SA_RESTORER
+ ka->sa.sa_restorer = NULL;
+#endif
sigemptyset(&ka->sa.sa_mask);
ka++;
}
@@ -2864,7 +2867,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
- struct siginfo info;
+ struct siginfo info = {};
info.si_signo = sig;
info.si_errno = 0;
diff --git a/kernel/sys.c b/kernel/sys.c
index b0003db7fea0..3449d262ea44 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -320,7 +320,6 @@ void kernel_restart_prepare(char *cmd)
system_state = SYSTEM_RESTART;
usermodehelper_disable();
device_shutdown();
- syscore_shutdown();
}
/**
@@ -354,6 +353,29 @@ int unregister_reboot_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_reboot_notifier);
+/* Add backwards compatibility for stable trees. */
+#ifndef PF_NO_SETAFFINITY
+#define PF_NO_SETAFFINITY PF_THREAD_BOUND
+#endif
+
+static void migrate_to_reboot_cpu(void)
+{
+ /* The boot cpu is always logical cpu 0 */
+ int cpu = 0;
+
+ cpu_hotplug_disable();
+
+ /* Make certain the cpu I'm about to reboot on is online */
+ if (!cpu_online(cpu))
+ cpu = cpumask_first(cpu_online_mask);
+
+ /* Prevent races with other tasks migrating this task */
+ current->flags |= PF_NO_SETAFFINITY;
+
+ /* Make certain I only run on the appropriate processor */
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
+}
+
/**
* kernel_restart - reboot the system
* @cmd: pointer to buffer containing command to execute for restart
@@ -365,7 +387,8 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
- disable_nonboot_cpus();
+ migrate_to_reboot_cpu();
+ syscore_shutdown();
if (!cmd)
printk(KERN_EMERG "Restarting system.\n");
else
@@ -391,6 +414,7 @@ static void kernel_shutdown_prepare(enum system_states state)
void kernel_halt(void)
{
kernel_shutdown_prepare(SYSTEM_HALT);
+ migrate_to_reboot_cpu();
syscore_shutdown();
printk(KERN_EMERG "System halted.\n");
kmsg_dump(KMSG_DUMP_HALT);
@@ -409,7 +433,7 @@ void kernel_power_off(void)
kernel_shutdown_prepare(SYSTEM_POWER_OFF);
if (pm_power_off_prepare)
pm_power_off_prepare();
- disable_nonboot_cpus();
+ migrate_to_reboot_cpu();
syscore_shutdown();
printk(KERN_EMERG "Power down.\n");
kmsg_dump(KMSG_DUMP_POWEROFF);
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f113755695e2..f8961bf97362 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -66,12 +66,17 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
*/
int tick_check_broadcast_device(struct clock_event_device *dev)
{
- if ((tick_broadcast_device.evtdev &&
+ struct clock_event_device *cur = tick_broadcast_device.evtdev;
+
+ if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
+ (tick_broadcast_device.evtdev &&
tick_broadcast_device.evtdev->rating >= dev->rating) ||
(dev->features & CLOCK_EVT_FEAT_C3STOP))
return 0;
clockevents_exchange_device(tick_broadcast_device.evtdev, dev);
+ if (cur)
+ cur->event_handler = clockevents_handle_noop;
tick_broadcast_device.evtdev = dev;
if (!cpumask_empty(tick_get_broadcast_mask()))
tick_broadcast_start_periodic(dev);
@@ -395,7 +400,15 @@ void tick_check_oneshot_broadcast(int cpu)
if (cpumask_test_cpu(cpu, to_cpumask(tick_broadcast_oneshot_mask))) {
struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
- clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
+ /*
+ * We might be in the middle of switching over from
+ * periodic to oneshot. If the CPU has not yet
+ * switched over, leave the device alone.
+ */
+ if (td->mode == TICKDEV_MODE_ONESHOT) {
+ clockevents_set_mode(td->evtdev,
+ CLOCK_EVT_MODE_ONESHOT);
+ }
}
}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index da6c9ecad4e4..ead79bce1258 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -323,6 +323,7 @@ static void tick_shutdown(unsigned int *cpup)
*/
dev->mode = CLOCK_EVT_MODE_UNUSED;
clockevents_exchange_device(dev, NULL);
+ dev->event_handler = clockevents_handle_noop;
td->evtdev = NULL;
}
raw_spin_unlock_irqrestore(&tick_device_lock, flags);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index cd0e5a7d1632..2196248dabba 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -145,7 +145,6 @@ static void tick_nohz_update_jiffies(ktime_t now)
tick_do_update_jiffies64(now);
local_irq_restore(flags);
- calc_load_exit_idle();
touch_softlockup_watchdog();
}
@@ -497,12 +496,17 @@ void tick_nohz_idle_enter(void)
*/
void tick_nohz_irq_exit(void)
{
+ unsigned long flags;
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
if (!ts->inidle)
return;
+ local_irq_save(flags);
+
tick_nohz_stop_sched_tick(ts);
+
+ local_irq_restore(flags);
}
/**
@@ -862,7 +866,7 @@ void tick_cancel_sched_timer(int cpu)
hrtimer_cancel(&ts->sched_timer);
# endif
- ts->nohz_mode = NOHZ_MODE_INACTIVE;
+ memset(ts, 0, sizeof(*ts));
}
#endif
diff --git a/kernel/timer.c b/kernel/timer.c
index 6dfdb72828ff..7e0a770be489 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -145,9 +145,11 @@ static unsigned long round_jiffies_common(unsigned long j, int cpu,
/* now that we have rounded, subtract the extra skew again */
j -= cpu * 3;
- if (j <= jiffies) /* rounding ate our timeout entirely; */
- return original;
- return j;
+ /*
+ * Make sure j is still in the future. Otherwise return the
+ * unmodified value.
+ */
+ return time_is_after_jiffies(j) ? j : original;
}
/**
@@ -1680,12 +1682,12 @@ static int __cpuinit init_timers_cpu(int cpu)
boot_done = 1;
base = &boot_tvec_bases;
}
+ spin_lock_init(&base->lock);
tvec_base_done[cpu] = 1;
} else {
base = per_cpu(tvec_bases, cpu);
}
- spin_lock_init(&base->lock);
for (j = 0; j < TVN_SIZE; j++) {
INIT_LIST_HEAD(base->tv5.vec + j);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e2a3f7207cca..eddb2876f7b9 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -386,24 +386,28 @@ config KPROBE_EVENT
If you want to use perf tools, this option is strongly recommended.
config DYNAMIC_FTRACE
- bool "enable/disable ftrace tracepoints dynamically"
+ bool "enable/disable function tracing dynamically"
depends on FUNCTION_TRACER
depends on HAVE_DYNAMIC_FTRACE
default y
help
- This option will modify all the calls to ftrace dynamically
- (will patch them out of the binary image and replace them
- with a No-Op instruction) as they are called. A table is
- created to dynamically enable them again.
+ This option will modify all the calls to function tracing
+ dynamically (will patch them out of the binary image and
+ replace them with a No-Op instruction) on boot up. During
+ compile time, a table is made of all the locations that ftrace
+ can function trace, and this table is linked into the kernel
+ image. When this is enabled, functions can be individually
+ enabled, and the functions not enabled will not affect
+ performance of the system.
+
+ See the files in /sys/kernel/debug/tracing:
+ available_filter_functions
+ set_ftrace_filter
+ set_ftrace_notrace
This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
otherwise has native performance as long as no tracing is active.
- The changes to the code are done by a kernel thread that
- wakes up once a second and checks to see if any ftrace calls
- were made. If so, it runs stop_machine (stops all CPUS)
- and modifies the code to jump over the call to ftrace.
-
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4a86e6404085..e101cf9acc06 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -624,7 +624,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
- for (i = 0; i < pages; i++) {
+ for (i = 1; i < pages; i++) {
pg->next = (void *)get_zeroed_page(GFP_KERNEL);
if (!pg->next)
goto out_free;
@@ -642,7 +642,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
free_page(tmp);
}
- free_page((unsigned long)stat->pages);
stat->pages = NULL;
stat->start = NULL;
@@ -1000,6 +999,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
static struct pid * const ftrace_swapper_pid = &init_struct_pid;
+loff_t
+ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
+{
+ loff_t ret;
+
+ if (file->f_mode & FMODE_READ)
+ ret = seq_lseek(file, offset, whence);
+ else
+ file->f_pos = ret = 1;
+
+ return ret;
+}
+
#ifdef CONFIG_DYNAMIC_FTRACE
#ifndef CONFIG_FTRACE_MCOUNT_RECORD
@@ -2542,7 +2554,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
* routine, you can use ftrace_filter_write() for the write
* routine if @flag has FTRACE_ITER_FILTER set, or
* ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
- * ftrace_regex_lseek() should be used as the lseek routine, and
+ * ftrace_filter_lseek() should be used as the lseek routine, and
* release must call ftrace_regex_release().
*/
int
@@ -2626,19 +2638,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
inode, file);
}
-loff_t
-ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
-{
- loff_t ret;
-
- if (file->f_mode & FMODE_READ)
- ret = seq_lseek(file, offset, origin);
- else
- file->f_pos = ret = 1;
-
- return ret;
-}
-
static int ftrace_match(char *str, char *regex, int len, int type)
{
int matched = 0;
@@ -3034,8 +3033,8 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
continue;
}
- hlist_del(&entry->node);
- call_rcu(&entry->rcu, ftrace_free_entry_rcu);
+ hlist_del_rcu(&entry->node);
+ call_rcu_sched(&entry->rcu, ftrace_free_entry_rcu);
}
}
__disable_ftrace_function_probe();
@@ -3446,7 +3445,7 @@ static const struct file_operations ftrace_filter_fops = {
.open = ftrace_filter_open,
.read = seq_read,
.write = ftrace_filter_write,
- .llseek = ftrace_regex_lseek,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_regex_release,
};
@@ -3454,7 +3453,7 @@ static const struct file_operations ftrace_notrace_fops = {
.open = ftrace_notrace_open,
.read = seq_read,
.write = ftrace_notrace_write,
- .llseek = ftrace_regex_lseek,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_regex_release,
};
@@ -3612,7 +3611,8 @@ out:
if (fail)
return -EINVAL;
- ftrace_graph_filter_enabled = 1;
+ ftrace_graph_filter_enabled = !!(*idx);
+
return 0;
}
@@ -3659,8 +3659,8 @@ static const struct file_operations ftrace_graph_fops = {
.open = ftrace_graph_open,
.read = seq_read,
.write = ftrace_graph_write,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_graph_release,
- .llseek = seq_lseek,
};
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
@@ -4261,7 +4261,7 @@ static const struct file_operations ftrace_pid_fops = {
.open = ftrace_pid_open,
.write = ftrace_pid_write,
.read = seq_read,
- .llseek = seq_lseek,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_pid_release,
};
@@ -4381,12 +4381,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
ftrace_startup_sysctl();
/* we are starting ftrace again */
- if (ftrace_ops_list != &ftrace_list_end) {
- if (ftrace_ops_list->next == &ftrace_list_end)
- ftrace_trace_function = ftrace_ops_list->func;
- else
- ftrace_trace_function = ftrace_ops_list_func;
- }
+ if (ftrace_ops_list != &ftrace_list_end)
+ update_ftrace_function();
} else {
/* stopping ftrace calls (just send to ftrace_stub) */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 55e4d4c5313d..13cd224c7e81 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -687,7 +687,15 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
memcpy(max_data->comm, tsk->comm, TASK_COMM_LEN);
max_data->pid = tsk->pid;
- max_data->uid = task_uid(tsk);
+ /*
+ * If tsk == current, then use current_uid(), as that does not use
+ * RCU. The irq tracer can be called out of RCU scope.
+ */
+ if (tsk == current)
+ max_data->uid = current_uid();
+ else
+ max_data->uid = task_uid(tsk);
+
max_data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
max_data->policy = tsk->policy;
max_data->rt_priority = tsk->rt_priority;
@@ -708,7 +716,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
void
update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
- struct ring_buffer *buf = tr->buffer;
+ struct ring_buffer *buf;
if (trace_stop_count)
return;
@@ -720,6 +728,7 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
}
arch_spin_lock(&ftrace_max_lock);
+ buf = tr->buffer;
tr->buffer = max_tr.buffer;
max_tr.buffer = buf;
@@ -2742,11 +2751,25 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
return -EINVAL;
}
-static void set_tracer_flags(unsigned int mask, int enabled)
+/* Some tracers require overwrite to stay enabled */
+int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
+{
+ if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set)
+ return -1;
+
+ return 0;
+}
+
+int set_tracer_flag(unsigned int mask, int enabled)
{
/* do nothing if flag is already set */
if (!!(trace_flags & mask) == !!enabled)
- return;
+ return 0;
+
+ /* Give the tracer a chance to approve the change */
+ if (current_trace->flag_changed)
+ if (current_trace->flag_changed(current_trace, mask, !!enabled))
+ return -EINVAL;
if (enabled)
trace_flags |= mask;
@@ -2758,6 +2781,8 @@ static void set_tracer_flags(unsigned int mask, int enabled)
if (mask == TRACE_ITER_OVERWRITE)
ring_buffer_change_overwrite(global_trace.buffer, enabled);
+
+ return 0;
}
static ssize_t
@@ -2767,7 +2792,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
char buf[64];
char *cmp;
int neg = 0;
- int ret;
+ int ret = -ENODEV;
int i;
if (cnt >= sizeof(buf))
@@ -2784,21 +2809,23 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
cmp += 2;
}
+ mutex_lock(&trace_types_lock);
+
for (i = 0; trace_options[i]; i++) {
if (strcmp(cmp, trace_options[i]) == 0) {
- set_tracer_flags(1 << i, !neg);
+ ret = set_tracer_flag(1 << i, !neg);
break;
}
}
/* If no option could be set, test the specific tracer options */
- if (!trace_options[i]) {
- mutex_lock(&trace_types_lock);
+ if (!trace_options[i])
ret = set_tracer_option(current_trace, cmp, neg);
- mutex_unlock(&trace_types_lock);
- if (ret)
- return ret;
- }
+
+ mutex_unlock(&trace_types_lock);
+
+ if (ret < 0)
+ return ret;
*ppos += cnt;
@@ -3122,6 +3149,9 @@ static int tracing_set_tracer(const char *buf)
goto out;
trace_branch_disable();
+
+ current_trace->enabled = false;
+
if (current_trace && current_trace->reset)
current_trace->reset(tr);
if (current_trace && current_trace->use_max_tr) {
@@ -3151,6 +3181,7 @@ static int tracing_set_tracer(const char *buf)
goto out;
}
+ current_trace->enabled = true;
trace_branch_enable(tr);
out:
mutex_unlock(&trace_types_lock);
@@ -4485,7 +4516,13 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (val != 0 && val != 1)
return -EINVAL;
- set_tracer_flags(1 << index, val);
+
+ mutex_lock(&trace_types_lock);
+ ret = set_tracer_flag(1 << index, val);
+ mutex_unlock(&trace_types_lock);
+
+ if (ret < 0)
+ return ret;
*ppos += cnt;
@@ -4688,6 +4725,8 @@ static __init int tracer_init_debugfs(void)
trace_access_lock_init();
d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
trace_create_file("tracing_enabled", 0644, d_tracer,
&global_trace, &tracing_ctrl_fops);
@@ -4824,36 +4863,32 @@ void trace_init_global_iter(struct trace_iterator *iter)
iter->cpu_file = TRACE_PIPE_ALL_CPU;
}
-static void
-__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
+void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
{
- static arch_spinlock_t ftrace_dump_lock =
- (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
+ static atomic_t dump_running;
unsigned int old_userobj;
- static int dump_ran;
unsigned long flags;
int cnt = 0, cpu;
- /* only one dump */
- local_irq_save(flags);
- arch_spin_lock(&ftrace_dump_lock);
- if (dump_ran)
- goto out;
-
- dump_ran = 1;
+ /* Only allow one dump user at a time. */
+ if (atomic_inc_return(&dump_running) != 1) {
+ atomic_dec(&dump_running);
+ return;
+ }
+ /*
+ * Always turn off tracing when we dump.
+ * We don't need to show trace output of what happens
+ * between multiple crashes.
+ *
+ * If the user does a sysrq-z, then they can re-enable
+ * tracing with echo 1 > tracing_on.
+ */
tracing_off();
- /* Did function tracer already get disabled? */
- if (ftrace_is_dead()) {
- printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
- printk("# MAY BE MISSING FUNCTION EVENTS\n");
- }
-
- if (disable_tracing)
- ftrace_kill();
+ local_irq_save(flags);
trace_init_global_iter(&iter);
@@ -4886,6 +4921,12 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
printk(KERN_TRACE "Dumping ftrace buffer:\n");
+ /* Did function tracer already get disabled? */
+ if (ftrace_is_dead()) {
+ printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
+ printk("# MAY BE MISSING FUNCTION EVENTS\n");
+ }
+
/*
* We need to stop all tracing on all CPUS to read the
* the next buffer. This is a bit expensive, but is
@@ -4925,26 +4966,14 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
printk(KERN_TRACE "---------------------------------\n");
out_enable:
- /* Re-enable tracing if requested */
- if (!disable_tracing) {
- trace_flags |= old_userobj;
+ trace_flags |= old_userobj;
- for_each_tracing_cpu(cpu) {
- atomic_dec(&iter.tr->data[cpu]->disabled);
- }
- tracing_on();
+ for_each_tracing_cpu(cpu) {
+ atomic_dec(&iter.tr->data[cpu]->disabled);
}
-
- out:
- arch_spin_unlock(&ftrace_dump_lock);
+ atomic_dec(&dump_running);
local_irq_restore(flags);
}
-
-/* By default: disable tracing after the dump */
-void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
-{
- __ftrace_dump(true, oops_dump_mode);
-}
EXPORT_SYMBOL_GPL(ftrace_dump);
__init static int tracer_alloc_buffers(void)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f95d65da6db8..7360674ea049 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -278,10 +278,14 @@ struct tracer {
enum print_line_t (*print_line)(struct trace_iterator *iter);
/* If you handled the flag setting, return 0 */
int (*set_flag)(u32 old_flags, u32 bit, int set);
+ /* Return 0 if OK with change, else return non-zero */
+ int (*flag_changed)(struct tracer *tracer,
+ u32 mask, int set);
struct tracer *next;
struct tracer_flags *flags;
int print_max;
int use_max_tr;
+ bool enabled;
};
@@ -826,6 +830,9 @@ extern struct list_head ftrace_events;
extern const char *__start___trace_bprintk_fmt[];
extern const char *__stop___trace_bprintk_fmt[];
+int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
+int set_tracer_flag(unsigned int mask, int enabled);
+
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
extern struct ftrace_event_call \
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 006d5c9b2615..4d7cd25aeece 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -777,7 +777,11 @@ static int filter_set_pred(struct event_filter *filter,
static void __free_preds(struct event_filter *filter)
{
+ int i;
+
if (filter->preds) {
+ for (i = 0; i < filter->n_preds; i++)
+ kfree(filter->preds[i].ops);
kfree(filter->preds);
filter->preds = NULL;
}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 99d20e920368..8dd139a3f6c7 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -32,7 +32,7 @@ enum {
static int trace_type __read_mostly;
-static int save_lat_flag;
+static int save_flags;
static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
static int start_irqsoff_tracer(struct trace_array *tr, int graph);
@@ -557,8 +557,11 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
static void __irqsoff_tracer_init(struct trace_array *tr)
{
- save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
- trace_flags |= TRACE_ITER_LATENCY_FMT;
+ save_flags = trace_flags;
+
+ /* non overwrite screws up the latency tracers */
+ set_tracer_flag(TRACE_ITER_OVERWRITE, 1);
+ set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1);
tracing_max_latency = 0;
irqsoff_trace = tr;
@@ -572,10 +575,13 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
static void irqsoff_tracer_reset(struct trace_array *tr)
{
+ int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
+ int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
+
stop_irqsoff_tracer(tr, is_graph());
- if (!save_lat_flag)
- trace_flags &= ~TRACE_ITER_LATENCY_FMT;
+ set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag);
+ set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag);
}
static void irqsoff_tracer_start(struct trace_array *tr)
@@ -608,6 +614,7 @@ static struct tracer irqsoff_tracer __read_mostly =
.print_line = irqsoff_print_line,
.flags = &tracer_flags,
.set_flag = irqsoff_set_flag,
+ .flag_changed = trace_keep_overwrite,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_irqsoff,
#endif
@@ -641,6 +648,7 @@ static struct tracer preemptoff_tracer __read_mostly =
.print_line = irqsoff_print_line,
.flags = &tracer_flags,
.set_flag = irqsoff_set_flag,
+ .flag_changed = trace_keep_overwrite,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptoff,
#endif
@@ -676,6 +684,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
.print_line = irqsoff_print_line,
.flags = &tracer_flags,
.set_flag = irqsoff_set_flag,
+ .flag_changed = trace_keep_overwrite,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptirqsoff,
#endif
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index ff791ea48b57..9eadedc9b4a8 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -36,7 +36,7 @@ static void __wakeup_reset(struct trace_array *tr);
static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
static void wakeup_graph_return(struct ftrace_graph_ret *trace);
-static int save_lat_flag;
+static int save_flags;
#define TRACE_DISPLAY_GRAPH 1
@@ -539,8 +539,11 @@ static void stop_wakeup_tracer(struct trace_array *tr)
static int __wakeup_tracer_init(struct trace_array *tr)
{
- save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
- trace_flags |= TRACE_ITER_LATENCY_FMT;
+ save_flags = trace_flags;
+
+ /* non overwrite screws up the latency tracers */
+ set_tracer_flag(TRACE_ITER_OVERWRITE, 1);
+ set_tracer_flag(TRACE_ITER_LATENCY_FMT, 1);
tracing_max_latency = 0;
wakeup_trace = tr;
@@ -562,12 +565,15 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)
static void wakeup_tracer_reset(struct trace_array *tr)
{
+ int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
+ int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
+
stop_wakeup_tracer(tr);
/* make sure we put back any tasks we are tracing */
wakeup_reset(tr);
- if (!save_lat_flag)
- trace_flags &= ~TRACE_ITER_LATENCY_FMT;
+ set_tracer_flag(TRACE_ITER_LATENCY_FMT, lat_flag);
+ set_tracer_flag(TRACE_ITER_OVERWRITE, overwrite_flag);
}
static void wakeup_tracer_start(struct trace_array *tr)
@@ -593,6 +599,7 @@ static struct tracer wakeup_tracer __read_mostly =
.print_line = wakeup_print_line,
.flags = &tracer_flags,
.set_flag = wakeup_set_flag,
+ .flag_changed = trace_keep_overwrite,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
@@ -614,6 +621,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
.print_line = wakeup_print_line,
.flags = &tracer_flags,
.set_flag = wakeup_set_flag,
+ .flag_changed = trace_keep_overwrite,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 288541f977fb..09fd98afd266 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -461,8 +461,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
/* Maximum number of functions to trace before diagnosing a hang */
#define GRAPH_MAX_FUNC_TEST 100000000
-static void
-__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
static unsigned int graph_hang_thresh;
/* Wrap the real function entry probe to avoid possible hanging */
@@ -472,8 +470,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
ftrace_graph_stop();
printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
- if (ftrace_dump_on_oops)
- __ftrace_dump(false, DUMP_ALL);
+ if (ftrace_dump_on_oops) {
+ ftrace_dump(DUMP_ALL);
+ /* ftrace_dump() disables tracing */
+ tracing_on();
+ }
return 0;
}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index d4545f49242e..8298997c2486 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -20,13 +20,24 @@
#define STACK_TRACE_ENTRIES 500
+#ifdef CC_USING_FENTRY
+# define fentry 1
+#else
+# define fentry 0
+#endif
+
static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
{ [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
+/*
+ * Reserve one entry for the passed in ip. This will allow
+ * us to remove most or all of the stack size overhead
+ * added by the stack tracer itself.
+ */
static struct stack_trace max_stack_trace = {
- .max_entries = STACK_TRACE_ENTRIES,
- .entries = stack_dump_trace,
+ .max_entries = STACK_TRACE_ENTRIES - 1,
+ .entries = &stack_dump_trace[1],
};
static unsigned long max_stack_size;
@@ -40,25 +51,34 @@ static DEFINE_MUTEX(stack_sysctl_mutex);
int stack_tracer_enabled;
static int last_stack_tracer_enabled;
-static inline void check_stack(void)
+static inline void
+check_stack(unsigned long ip, unsigned long *stack)
{
unsigned long this_size, flags;
unsigned long *p, *top, *start;
+ static int tracer_frame;
+ int frame_size = ACCESS_ONCE(tracer_frame);
int i;
- this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1);
+ this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
this_size = THREAD_SIZE - this_size;
+ /* Remove the frame of the tracer */
+ this_size -= frame_size;
if (this_size <= max_stack_size)
return;
/* we do not handle interrupt stacks yet */
- if (!object_is_on_stack(&this_size))
+ if (!object_is_on_stack(stack))
return;
local_irq_save(flags);
arch_spin_lock(&max_stack_lock);
+ /* In case another CPU set the tracer_frame on us */
+ if (unlikely(!frame_size))
+ this_size -= tracer_frame;
+
/* a race could have already updated it */
if (this_size <= max_stack_size)
goto out;
@@ -71,10 +91,18 @@ static inline void check_stack(void)
save_stack_trace(&max_stack_trace);
/*
+ * Add the passed in ip from the function tracer.
+ * Searching for this on the stack will skip over
+ * most of the overhead from the stack tracer itself.
+ */
+ stack_dump_trace[0] = ip;
+ max_stack_trace.nr_entries++;
+
+ /*
* Now find where in the stack these are.
*/
i = 0;
- start = &this_size;
+ start = stack;
top = (unsigned long *)
(((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
@@ -98,6 +126,18 @@ static inline void check_stack(void)
found = 1;
/* Start the search from here */
start = p + 1;
+ /*
+ * We do not want to show the overhead
+ * of the stack tracer stack in the
+ * max stack. If we haven't figured
+ * out what that is, then figure it out
+ * now.
+ */
+ if (unlikely(!tracer_frame) && i == 1) {
+ tracer_frame = (p - stack) *
+ sizeof(unsigned long);
+ max_stack_size -= tracer_frame;
+ }
}
}
@@ -113,6 +153,7 @@ static inline void check_stack(void)
static void
stack_trace_call(unsigned long ip, unsigned long parent_ip)
{
+ unsigned long stack;
int cpu;
if (unlikely(!ftrace_enabled || stack_trace_disabled))
@@ -125,7 +166,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
if (per_cpu(trace_active, cpu)++ != 0)
goto out;
- check_stack();
+ /*
+ * When fentry is used, the traced function does not get
+ * its stack frame set up, and we lose the parent.
+ * The ip is pretty useless because the function tracer
+ * was called before that function set up its stack frame.
+ * In this case, we use the parent ip.
+ *
+ * By adding the return address of either the parent ip
+ * or the current ip we can disregard most of the stack usage
+ * caused by the stack tracer itself.
+ *
+ * The function tracer always reports the address of where the
+ * mcount call was, but the stack will hold the return address.
+ */
+ if (fentry)
+ ip = parent_ip;
+ else
+ ip += MCOUNT_INSN_SIZE;
+
+ check_stack(ip, &stack);
out:
per_cpu(trace_active, cpu)--;
@@ -324,7 +384,7 @@ static const struct file_operations stack_trace_filter_fops = {
.open = stack_trace_filter_open,
.read = seq_read,
.write = ftrace_filter_write,
- .llseek = ftrace_regex_lseek,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_regex_release,
};
@@ -373,6 +433,8 @@ static __init int stack_trace_init(void)
struct dentry *d_tracer;
d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
trace_create_file("stack_max_size", 0644, d_tracer,
&max_stack_size, &stack_max_size_fops);
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 96cffb269e73..847f88a6194b 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -307,6 +307,8 @@ static int tracing_stat_init(void)
struct dentry *d_tracing;
d_tracing = tracing_init_dentry();
+ if (!d_tracing)
+ return 0;
stat_dir = debugfs_create_dir("trace_stat", d_tracing);
if (!stat_dir)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 96fc73369099..c9ce09addacd 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -305,6 +305,8 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
struct ring_buffer *buffer;
int size;
int syscall_nr;
+ unsigned long irq_flags;
+ int pc;
syscall_nr = syscall_get_nr(current, regs);
if (syscall_nr < 0)
@@ -318,8 +320,11 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
+ local_save_flags(irq_flags);
+ pc = preempt_count();
+
event = trace_current_buffer_lock_reserve(&buffer,
- sys_data->enter_event->event.type, size, 0, 0);
+ sys_data->enter_event->event.type, size, irq_flags, pc);
if (!event)
return;
@@ -329,7 +334,8 @@ void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
if (!filter_current_check_discard(buffer, sys_data->enter_event,
entry, event))
- trace_current_buffer_unlock_commit(buffer, event, 0, 0);
+ trace_current_buffer_unlock_commit(buffer, event,
+ irq_flags, pc);
}
void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
@@ -339,6 +345,8 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
struct ring_buffer_event *event;
struct ring_buffer *buffer;
int syscall_nr;
+ unsigned long irq_flags;
+ int pc;
syscall_nr = syscall_get_nr(current, regs);
if (syscall_nr < 0)
@@ -350,8 +358,12 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
if (!sys_data)
return;
+ local_save_flags(irq_flags);
+ pc = preempt_count();
+
event = trace_current_buffer_lock_reserve(&buffer,
- sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
+ sys_data->exit_event->event.type, sizeof(*entry),
+ irq_flags, pc);
if (!event)
return;
@@ -361,7 +373,8 @@ void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
if (!filter_current_check_discard(buffer, sys_data->exit_event,
entry, event))
- trace_current_buffer_unlock_commit(buffer, event, 0, 0);
+ trace_current_buffer_unlock_commit(buffer, event,
+ irq_flags, pc);
}
int reg_event_syscall_enter(struct ftrace_event_call *call)