From 86397dc3ccfc0e17b7550d05eaf15fe91f6498dd Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 17 Aug 2010 13:53:06 +0800 Subject: tracing: Clean up seqfile code for format file Remove the nasty hack that marks a pointer's LSB to distinguish common fields from event fields. Replace it with a more sane approach. Signed-off-by: Li Zefan LKML-Reference: <4C6A23C2.9020606@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 55 +++++++++++++++------------------------------ 1 file changed, 18 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4c758f146328..398c0e8b332c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -600,21 +600,29 @@ out: enum { FORMAT_HEADER = 1, - FORMAT_PRINTFMT = 2, + FORMAT_FIELD_SEPERATOR = 2, + FORMAT_PRINTFMT = 3, }; static void *f_next(struct seq_file *m, void *v, loff_t *pos) { struct ftrace_event_call *call = m->private; struct ftrace_event_field *field; - struct list_head *head; + struct list_head *common_head = &ftrace_common_fields; + struct list_head *head = trace_get_fields(call); (*pos)++; switch ((unsigned long)v) { case FORMAT_HEADER: - head = &ftrace_common_fields; + if (unlikely(list_empty(common_head))) + return NULL; + + field = list_entry(common_head->prev, + struct ftrace_event_field, link); + return field; + case FORMAT_FIELD_SEPERATOR: if (unlikely(list_empty(head))) return NULL; @@ -626,31 +634,10 @@ static void *f_next(struct seq_file *m, void *v, loff_t *pos) return NULL; } - head = trace_get_fields(call); - - /* - * To separate common fields from event fields, the - * LSB is set on the first event field. Clear it in case. - */ - v = (void *)((unsigned long)v & ~1L); - field = v; - /* - * If this is a common field, and at the end of the list, then - * continue with main list. - */ - if (field->link.prev == &ftrace_common_fields) { - if (unlikely(list_empty(head))) - return NULL; - field = list_entry(head->prev, struct ftrace_event_field, link); - /* Set the LSB to notify f_show to print an extra newline */ - field = (struct ftrace_event_field *) - ((unsigned long)field | 1); - return field; - } - - /* If we are done tell f_show to print the format */ - if (field->link.prev == head) + if (field->link.prev == common_head) + return (void *)FORMAT_FIELD_SEPERATOR; + else if (field->link.prev == head) return (void *)FORMAT_PRINTFMT; field = list_entry(field->link.prev, struct ftrace_event_field, link); @@ -688,22 +675,16 @@ static int f_show(struct seq_file *m, void *v) seq_printf(m, "format:\n"); return 0; + case FORMAT_FIELD_SEPERATOR: + seq_putc(m, '\n'); + return 0; + case FORMAT_PRINTFMT: seq_printf(m, "\nprint fmt: %s\n", call->print_fmt); return 0; } - /* - * To separate common fields from event fields, the - * LSB is set on the first event field. Clear it and - * print a newline if it is set. - */ - if ((unsigned long)v & 1) { - seq_putc(m, '\n'); - v = (void *)((unsigned long)v & ~1L); - } - field = v; /* -- cgit v1.2.3 From 56962b4449af34070bb1994621ef4f0265eed4d8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 30 Jun 2010 23:03:51 +0200 Subject: perf: Generalize some arch callchain code - Most archs use one callchain buffer per cpu, except x86 that needs to deal with NMIs. Provide a default perf_callchain_buffer() implementation that x86 overrides. 
- Centralize all the kernel/user regs handling and invoke new arch handlers from there: perf_callchain_user() / perf_callchain_kernel() That avoid all the user_mode(), current->mm checks and so... - Invert some parameters in perf_callchain_*() helpers: entry to the left, regs to the right, following the traditional (dst, src). Signed-off-by: Frederic Weisbecker Acked-by: Paul Mackerras Tested-by: Will Deacon Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: David Miller Cc: Paul Mundt Cc: Borislav Petkov --- kernel/perf_event.c | 40 ++++++++++++++++++++++++++++++++++++++-- 1 file changed, 38 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c772a3d4000d..02efde6c8798 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2937,13 +2937,49 @@ void perf_event_do_pending(void) __perf_pending_run(); } +DEFINE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); + /* * Callchain support -- arch specific */ -__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +__weak struct perf_callchain_entry *perf_callchain_buffer(void) { - return NULL; + return &__get_cpu_var(perf_callchain_entry); +} + +__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +__weak void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + struct perf_callchain_entry *entry; + + entry = perf_callchain_buffer(); + if (!entry) + return NULL; + + entry->nr = 0; + + if (!user_mode(regs)) { + perf_callchain_kernel(entry, regs); + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) + perf_callchain_user(entry, regs); + + return entry; } -- cgit v1.2.3 From f72c1a931e311bb7780fee19e41a89ac42cab50e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 1 Jul 2010 02:31:21 +0200 Subject: perf: Factorize callchain context handling Store the kernel and user contexts from the generic layer instead of archs, this gathers some repetitive code. Signed-off-by: Frederic Weisbecker Acked-by: Paul Mackerras Tested-by: Will Deacon Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Stephane Eranian Cc: David Miller Cc: Paul Mundt Cc: Borislav Petkov --- kernel/perf_event.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 02efde6c8798..615d024894cf 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2969,6 +2969,7 @@ static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) entry->nr = 0; if (!user_mode(regs)) { + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_kernel(entry, regs); if (current->mm) regs = task_pt_regs(current); @@ -2976,8 +2977,10 @@ static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) regs = NULL; } - if (regs) + if (regs) { + perf_callchain_store(entry, PERF_CONTEXT_USER); perf_callchain_user(entry, regs); + } return entry; } -- cgit v1.2.3 From 927c7a9e92c4f69097a6e9e086d11fc2f8a5b40b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 1 Jul 2010 16:20:36 +0200 Subject: perf: Fix race in callchains Now that software events don't have interrupt disabled anymore in the event path, callchains can nest on any context. So seperating nmi and others contexts in two buffers has become racy. 
Fix this by providing one buffer per nesting level. Given the size of the callchain entries (2040 bytes * 4), we now need to allocate them dynamically. v2: Fixed put_callchain_entry call after recursion. Fix the type of the recursion, it must be an array. v3: Use a manual pr cpu allocation (temporary solution until NMIs can safely access vmalloc'ed memory). Do a better separation between callchain reference tracking and allocation. Make the "put" path lockless for non-release cases. v4: Protect the callchain buffers with rcu. v5: Do the cpu buffers allocations node affine. Signed-off-by: Frederic Weisbecker Tested-by: Will Deacon Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Stephane Eranian Cc: Paul Mundt Cc: David Miller Cc: Borislav Petkov --- kernel/perf_event.c | 298 ++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 229 insertions(+), 69 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 615d024894cf..75ab8a2df6b2 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1763,6 +1763,216 @@ static u64 perf_event_read(struct perf_event *event) return perf_event_count(event); } +/* + * Callchain support + */ + +struct callchain_cpus_entries { + struct rcu_head rcu_head; + struct perf_callchain_entry *cpu_entries[0]; +}; + +static DEFINE_PER_CPU(int, callchain_recursion[4]); +static atomic_t nr_callchain_events; +static DEFINE_MUTEX(callchain_mutex); +struct callchain_cpus_entries *callchain_cpus_entries; + + +__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +__weak void perf_callchain_user(struct perf_callchain_entry *entry, + struct pt_regs *regs) +{ +} + +static void release_callchain_buffers_rcu(struct rcu_head *head) +{ + struct callchain_cpus_entries *entries; + int cpu; + + entries = container_of(head, struct callchain_cpus_entries, rcu_head); + + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + + kfree(entries); +} + +static void release_callchain_buffers(void) +{ + struct callchain_cpus_entries *entries; + + entries = callchain_cpus_entries; + rcu_assign_pointer(callchain_cpus_entries, NULL); + call_rcu(&entries->rcu_head, release_callchain_buffers_rcu); +} + +static int alloc_callchain_buffers(void) +{ + int cpu; + int size; + struct callchain_cpus_entries *entries; + + /* + * We can't use the percpu allocation API for data that can be + * accessed from NMI. Use a temporary manual per cpu allocation + * until that gets sorted out. 
+ */ + size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) * + num_possible_cpus(); + + entries = kzalloc(size, GFP_KERNEL); + if (!entries) + return -ENOMEM; + + size = sizeof(struct perf_callchain_entry) * 4; + + for_each_possible_cpu(cpu) { + entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, + cpu_to_node(cpu)); + if (!entries->cpu_entries[cpu]) + goto fail; + } + + rcu_assign_pointer(callchain_cpus_entries, entries); + + return 0; + +fail: + for_each_possible_cpu(cpu) + kfree(entries->cpu_entries[cpu]); + kfree(entries); + + return -ENOMEM; +} + +static int get_callchain_buffers(void) +{ + int err = 0; + int count; + + mutex_lock(&callchain_mutex); + + count = atomic_inc_return(&nr_callchain_events); + if (WARN_ON_ONCE(count < 1)) { + err = -EINVAL; + goto exit; + } + + if (count > 1) { + /* If the allocation failed, give up */ + if (!callchain_cpus_entries) + err = -ENOMEM; + goto exit; + } + + err = alloc_callchain_buffers(); + if (err) + release_callchain_buffers(); +exit: + mutex_unlock(&callchain_mutex); + + return err; +} + +static void put_callchain_buffers(void) +{ + if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) { + release_callchain_buffers(); + mutex_unlock(&callchain_mutex); + } +} + +static int get_recursion_context(int *recursion) +{ + int rctx; + + if (in_nmi()) + rctx = 3; + else if (in_irq()) + rctx = 2; + else if (in_softirq()) + rctx = 1; + else + rctx = 0; + + if (recursion[rctx]) + return -1; + + recursion[rctx]++; + barrier(); + + return rctx; +} + +static inline void put_recursion_context(int *recursion, int rctx) +{ + barrier(); + recursion[rctx]--; +} + +static struct perf_callchain_entry *get_callchain_entry(int *rctx) +{ + int cpu; + struct callchain_cpus_entries *entries; + + *rctx = get_recursion_context(__get_cpu_var(callchain_recursion)); + if (*rctx == -1) + return NULL; + + entries = rcu_dereference(callchain_cpus_entries); + if (!entries) + return NULL; + + cpu = smp_processor_id(); + + return &entries->cpu_entries[cpu][*rctx]; +} + +static void +put_callchain_entry(int rctx) +{ + put_recursion_context(__get_cpu_var(callchain_recursion), rctx); +} + +static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + int rctx; + struct perf_callchain_entry *entry; + + + entry = get_callchain_entry(&rctx); + if (rctx == -1) + return NULL; + + if (!entry) + goto exit_put; + + entry->nr = 0; + + if (!user_mode(regs)) { + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + perf_callchain_kernel(entry, regs); + if (current->mm) + regs = task_pt_regs(current); + else + regs = NULL; + } + + if (regs) { + perf_callchain_store(entry, PERF_CONTEXT_USER); + perf_callchain_user(entry, regs); + } + +exit_put: + put_callchain_entry(rctx); + + return entry; +} + /* * Initialize the perf_event context in a task_struct: */ @@ -1895,6 +2105,8 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_comm_events); if (event->attr.task) atomic_dec(&nr_task_events); + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); } if (event->buffer) { @@ -2937,55 +3149,6 @@ void perf_event_do_pending(void) __perf_pending_run(); } -DEFINE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry); - -/* - * Callchain support -- arch specific - */ - -__weak struct perf_callchain_entry *perf_callchain_buffer(void) -{ - return &__get_cpu_var(perf_callchain_entry); -} - -__weak void perf_callchain_kernel(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -__weak void 
perf_callchain_user(struct perf_callchain_entry *entry, - struct pt_regs *regs) -{ -} - -static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) -{ - struct perf_callchain_entry *entry; - - entry = perf_callchain_buffer(); - if (!entry) - return NULL; - - entry->nr = 0; - - if (!user_mode(regs)) { - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); - perf_callchain_kernel(entry, regs); - if (current->mm) - regs = task_pt_regs(current); - else - regs = NULL; - } - - if (regs) { - perf_callchain_store(entry, PERF_CONTEXT_USER); - perf_callchain_user(entry, regs); - } - - return entry; -} - - /* * We assume there is only KVM supporting the callbacks. * Later on, we might change it to a list if there is @@ -3480,14 +3643,20 @@ static void perf_event_output(struct perf_event *event, int nmi, struct perf_output_handle handle; struct perf_event_header header; + /* protect the callchain buffers */ + rcu_read_lock(); + perf_prepare_sample(&header, data, event, regs); if (perf_output_begin(&handle, event, header.size, nmi, 1)) - return; + goto exit; perf_output_sample(&handle, &header, data, event); perf_output_end(&handle); + +exit: + rcu_read_unlock(); } /* @@ -4243,32 +4412,16 @@ end: int perf_swevent_get_recursion_context(void) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - int rctx; - if (in_nmi()) - rctx = 3; - else if (in_irq()) - rctx = 2; - else if (in_softirq()) - rctx = 1; - else - rctx = 0; - - if (cpuctx->recursion[rctx]) - return -1; - - cpuctx->recursion[rctx]++; - barrier(); - - return rctx; + return get_recursion_context(cpuctx->recursion); } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); void inline perf_swevent_put_recursion_context(int rctx) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - barrier(); - cpuctx->recursion[rctx]--; + + put_recursion_context(cpuctx->recursion, rctx); } void __perf_sw_event(u32 event_id, u64 nr, int nmi, @@ -4968,6 +5121,13 @@ done: atomic_inc(&nr_comm_events); if (event->attr.task) atomic_inc(&nr_task_events); + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) { + err = get_callchain_buffers(); + if (err) { + free_event(event); + return ERR_PTR(err); + } + } } return event; -- cgit v1.2.3 From 7ae07ea3a48d30689ee037cb136bc21f0b37d8ae Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 14 Aug 2010 20:45:13 +0200 Subject: perf: Humanize the number of contexts Instead of hardcoding the number of contexts for the recursions barriers, define a cpp constant to make the code more self-explanatory. 
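For reference, the constant referred to here is assumed to be defined along these lines in include/linux/perf_event.h, with one slot per context level distinguished by get_recursion_context() above (task, softirq, hardirq, NMI):

    #define PERF_NR_CONTEXTS	4
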
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Stephane Eranian --- kernel/perf_event.c | 4 ++-- kernel/trace/trace_event_perf.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 75ab8a2df6b2..f416aef242c3 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1772,7 +1772,7 @@ struct callchain_cpus_entries { struct perf_callchain_entry *cpu_entries[0]; }; -static DEFINE_PER_CPU(int, callchain_recursion[4]); +static DEFINE_PER_CPU(int, callchain_recursion[PERF_NR_CONTEXTS]); static atomic_t nr_callchain_events; static DEFINE_MUTEX(callchain_mutex); struct callchain_cpus_entries *callchain_cpus_entries; @@ -1828,7 +1828,7 @@ static int alloc_callchain_buffers(void) if (!entries) return -ENOMEM; - size = sizeof(struct perf_callchain_entry) * 4; + size = sizeof(struct perf_callchain_entry) * PERF_NR_CONTEXTS; for_each_possible_cpu(cpu) { entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL, diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 000e6e85b445..db2eae2efcf2 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -9,7 +9,7 @@ #include #include "trace.h" -static char *perf_trace_buf[4]; +static char *perf_trace_buf[PERF_NR_CONTEXTS]; /* * Force it to be aligned to unsigned long to avoid misaligned accesses @@ -45,7 +45,7 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, char *buf; int i; - for (i = 0; i < 4; i++) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { buf = (char *)alloc_percpu(perf_trace_t); if (!buf) goto fail; @@ -65,7 +65,7 @@ fail: if (!total_ref_count) { int i; - for (i = 0; i < 4; i++) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { free_percpu(perf_trace_buf[i]); perf_trace_buf[i] = NULL; } @@ -140,7 +140,7 @@ void perf_trace_destroy(struct perf_event *p_event) tp_event->perf_events = NULL; if (!--total_ref_count) { - for (i = 0; i < 4; i++) { + for (i = 0; i < PERF_NR_CONTEXTS; i++) { free_percpu(perf_trace_buf[i]); perf_trace_buf[i] = NULL; } -- cgit v1.2.3 From 6016ee13db518ab1cd0cbf43fc2ad5712021e338 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 11 Aug 2010 12:47:59 +0900 Subject: perf, tracing: add missing __percpu markups ftrace_event_call->perf_events, perf_trace_buf, fgraph_data->cpu_data and some local variables are percpu pointers missing __percpu markups. Add them. 
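The markup only matters to sparse, but it stops percpu pointers from being mixed up with plain pointers unnoticed. A minimal sketch of the intended usage, with hypothetical variable names:

    struct hlist_head __percpu *pcpu_list;	/* lives in the percpu address space */
    struct hlist_head *head;			/* plain pointer */

    pcpu_list = alloc_percpu(struct hlist_head);
    if (!pcpu_list)
            return -ENOMEM;

    /* convert to a real pointer for the local CPU before dereferencing */
    head = this_cpu_ptr(pcpu_list);
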
Signed-off-by: Namhyung Kim Acked-by: Tejun Heo Cc: Steven Rostedt Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras Cc: Stephane Eranian LKML-Reference: <1281498479-28551-1-git-send-email-namhyung@gmail.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_event_perf.c | 15 ++++++++------- kernel/trace/trace_functions_graph.c | 2 +- 2 files changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index db2eae2efcf2..92f5477a006a 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -9,7 +9,7 @@ #include #include "trace.h" -static char *perf_trace_buf[PERF_NR_CONTEXTS]; +static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS]; /* * Force it to be aligned to unsigned long to avoid misaligned accesses @@ -24,7 +24,7 @@ static int total_ref_count; static int perf_trace_event_init(struct ftrace_event_call *tp_event, struct perf_event *p_event) { - struct hlist_head *list; + struct hlist_head __percpu *list; int ret = -ENOMEM; int cpu; @@ -42,11 +42,11 @@ static int perf_trace_event_init(struct ftrace_event_call *tp_event, tp_event->perf_events = list; if (!total_ref_count) { - char *buf; + char __percpu *buf; int i; for (i = 0; i < PERF_NR_CONTEXTS; i++) { - buf = (char *)alloc_percpu(perf_trace_t); + buf = (char __percpu *)alloc_percpu(perf_trace_t); if (!buf) goto fail; @@ -102,13 +102,14 @@ int perf_trace_init(struct perf_event *p_event) int perf_trace_enable(struct perf_event *p_event) { struct ftrace_event_call *tp_event = p_event->tp_event; + struct hlist_head __percpu *pcpu_list; struct hlist_head *list; - list = tp_event->perf_events; - if (WARN_ON_ONCE(!list)) + pcpu_list = tp_event->perf_events; + if (WARN_ON_ONCE(!pcpu_list)) return -EINVAL; - list = this_cpu_ptr(list); + list = this_cpu_ptr(pcpu_list); hlist_add_head_rcu(&p_event->hlist_entry, list); return 0; diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 6bff23625781..fcb5a542cd21 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -23,7 +23,7 @@ struct fgraph_cpu_data { }; struct fgraph_data { - struct fgraph_cpu_data *cpu_data; + struct fgraph_cpu_data __percpu *cpu_data; /* Place to preserve last processed entry. */ struct ftrace_graph_ent_entry ent; -- cgit v1.2.3 From eac243355a99d6b9d41bbeba4fc83e7f735485f9 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 31 Aug 2010 23:00:08 -0400 Subject: lockup_detector: Convert cpu notifier to return encapsulate errno value By the commit e6bde73b07edeb703d4c89c1daabc09c303de11f ("cpu-hotplug: return better errno on cpu hotplug failure"), the cpu notifier can return encapsulate errno value, resulting in more meaningful error codes for CPU hotplug failures. This converts the cpu notifier to return encapsulate errno value for the lockup_detector as well. 
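Instead of collapsing every failure into NOTIFY_BAD, the callback now encodes the real errno into its return value so callers can recover it. A short sketch of the pattern, mirroring the hunks below:

    err = watchdog_prepare_cpu(hotcpu);		/* e.g. -ENOMEM */
    return notifier_from_errno(err);		/* NOTIFY_OK when err == 0 */

    /* caller side */
    err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
    WARN_ON(notifier_to_errno(err));		/* back to -ENOMEM, or 0 */
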
Signed-off-by: Akinobu Mita Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: fweisbec@gmail.com LKML-Reference: <1283310009-22168-3-git-send-email-dzickus@redhat.com> Signed-off-by: Don Zickus Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 0d53c8e853b1..501cb6edca4b 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -371,7 +371,7 @@ static int watchdog_nmi_enable(int cpu) } printk(KERN_ERR "NMI watchdog failed to create perf event on cpu%i: %p\n", cpu, event); - return -1; + return PTR_ERR(event); /* success path */ out_save: @@ -415,17 +415,19 @@ static int watchdog_prepare_cpu(int cpu) static int watchdog_enable(int cpu) { struct task_struct *p = per_cpu(softlockup_watchdog, cpu); + int err; /* enable the perf event */ - if (watchdog_nmi_enable(cpu) != 0) - return -1; + err = watchdog_nmi_enable(cpu); + if (err) + return err; /* create the watchdog thread */ if (!p) { p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu); if (IS_ERR(p)) { printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu); - return -1; + return PTR_ERR(p); } kthread_bind(p, cpu); per_cpu(watchdog_touch_ts, cpu) = 0; @@ -519,17 +521,16 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { int hotcpu = (unsigned long)hcpu; + int err = 0; switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - if (watchdog_prepare_cpu(hotcpu)) - return NOTIFY_BAD; + err = watchdog_prepare_cpu(hotcpu); break; case CPU_ONLINE: case CPU_ONLINE_FROZEN: - if (watchdog_enable(hotcpu)) - return NOTIFY_BAD; + err = watchdog_enable(hotcpu); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: @@ -542,7 +543,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) break; #endif /* CONFIG_HOTPLUG_CPU */ } - return NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block __cpuinitdata cpu_nfb = { @@ -558,7 +559,7 @@ static int __init spawn_watchdog_task(void) return 0; err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - WARN_ON(err == NOTIFY_BAD); + WARN_ON(notifier_to_errno(err)); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); -- cgit v1.2.3 From 14416c35b6b9975c9593d7ecc8382d1ecaa0b598 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 31 Aug 2010 23:00:09 -0400 Subject: lockup_detector: Remove unused panic_notifier The panic notifer in lockup_detector just set did_panic to 1. But did_panic is not used anywhere so we can just remove it. 
Signed-off-by: Akinobu Mita Cc: peterz@infradead.org Cc: gorcunov@gmail.com Cc: fweisbec@gmail.com LKML-Reference: <1283310009-22168-4-git-send-email-dzickus@redhat.com> Signed-off-by: Don Zickus Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 501cb6edca4b..5b1ee4f4ca0d 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -43,7 +43,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static DEFINE_PER_CPU(struct perf_event *, watchdog_ev); #endif -static int __read_mostly did_panic; static int __initdata no_watchdog; @@ -180,18 +179,6 @@ static int is_softlockup(unsigned long touch_ts) return 0; } -static int -watchdog_panic(struct notifier_block *this, unsigned long event, void *ptr) -{ - did_panic = 1; - - return NOTIFY_DONE; -} - -static struct notifier_block panic_block = { - .notifier_call = watchdog_panic, -}; - #ifdef CONFIG_HARDLOCKUP_DETECTOR static struct perf_event_attr wd_hw_attr = { .type = PERF_TYPE_HARDWARE, @@ -564,8 +551,6 @@ static int __init spawn_watchdog_task(void) cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); - atomic_notifier_chain_register(&panic_notifier_list, &panic_block); - return 0; } early_initcall(spawn_watchdog_task); -- cgit v1.2.3 From f6195aa09e618d712f52bf4fa33b5293820eb93d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 1 Sep 2010 12:23:12 -0400 Subject: ring-buffer: Place duplicate expression into a single function While discussing the strictness of the 80 character limit on the Kernel Summit Discussion mailing list, I showed examples that I broke that limit slightly with some algorithms. In discussing with John Linville, what looked better, I realized that two of the 80 char breaking culprits were an identical expression. As a clean up, this patch moves the identical expression into its own helper function and that is used instead. As a side effect, the offending code is now under the 80 character limit. :-) This clean up code also changes the expression from (A - B) - C to A - (B + C) This makes the code look a little nicer too. Cc: John W. Linville Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 19cccc3c3028..ef27017caa56 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2606,6 +2606,19 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); +/* + * The total entries in the ring buffer is the running counter + * of entries entered into the ring buffer, minus the sum of + * the entries read from the ring buffer and the number of + * entries that were overwritten. 
+ */ +static inline unsigned long +rb_num_of_entries(struct ring_buffer_per_cpu *cpu_buffer) +{ + return local_read(&cpu_buffer->entries) - + (local_read(&cpu_buffer->overrun) + cpu_buffer->read); +} + /** * ring_buffer_entries_cpu - get the number of entries in a cpu buffer * @buffer: The ring buffer @@ -2614,16 +2627,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) { struct ring_buffer_per_cpu *cpu_buffer; - unsigned long ret; if (!cpumask_test_cpu(cpu, buffer->cpumask)) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = (local_read(&cpu_buffer->entries) - local_read(&cpu_buffer->overrun)) - - cpu_buffer->read; - return ret; + return rb_num_of_entries(cpu_buffer); } EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu); @@ -2684,8 +2694,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - entries += (local_read(&cpu_buffer->entries) - - local_read(&cpu_buffer->overrun)) - cpu_buffer->read; + entries += rb_num_of_entries(cpu_buffer); } return entries; -- cgit v1.2.3 From 51b0fe39549a04858001922919ab355dee9bdfcf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Jun 2010 13:35:57 +0200 Subject: perf: Deconstify struct pmu sed -ie 's/const struct pmu\>/struct pmu/g' `git grep -l "const struct pmu\>"` Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Will Deacon Cc: Paul Mundt Cc: Frederic Weisbecker Cc: Cyrill Gorcunov Cc: Lin Ming Cc: Yanmin Cc: Deng-Cheng Zhu Cc: David Miller Cc: Michael Cree LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 2d74f31220ad..fb46fd13f31f 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -75,7 +75,7 @@ static DEFINE_SPINLOCK(perf_resource_lock); /* * Architecture provided APIs - weak aliases: */ -extern __weak const struct pmu *hw_perf_event_init(struct perf_event *event) +extern __weak struct pmu *hw_perf_event_init(struct perf_event *event) { return NULL; } @@ -691,7 +691,7 @@ group_sched_in(struct perf_event *group_event, struct perf_event_context *ctx) { struct perf_event *event, *partial_group = NULL; - const struct pmu *pmu = group_event->pmu; + struct pmu *pmu = group_event->pmu; bool txn = false; if (group_event->state == PERF_EVENT_STATE_OFF) @@ -4501,7 +4501,7 @@ static int perf_swevent_int(struct perf_event *event) return 0; } -static const struct pmu perf_ops_generic = { +static struct pmu perf_ops_generic = { .enable = perf_swevent_enable, .disable = perf_swevent_disable, .start = perf_swevent_int, @@ -4614,7 +4614,7 @@ static void cpu_clock_perf_event_read(struct perf_event *event) cpu_clock_perf_event_update(event); } -static const struct pmu perf_ops_cpu_clock = { +static struct pmu perf_ops_cpu_clock = { .enable = cpu_clock_perf_event_enable, .disable = cpu_clock_perf_event_disable, .read = cpu_clock_perf_event_read, @@ -4671,7 +4671,7 @@ static void task_clock_perf_event_read(struct perf_event *event) task_clock_perf_event_update(event, time); } -static const struct pmu perf_ops_task_clock = { +static struct pmu perf_ops_task_clock = { .enable = task_clock_perf_event_enable, .disable = task_clock_perf_event_disable, .read = task_clock_perf_event_read, @@ -4785,7 +4785,7 @@ static int 
swevent_hlist_get(struct perf_event *event) #ifdef CONFIG_EVENT_TRACING -static const struct pmu perf_ops_tracepoint = { +static struct pmu perf_ops_tracepoint = { .enable = perf_trace_enable, .disable = perf_trace_disable, .start = perf_swevent_int, @@ -4849,7 +4849,7 @@ static void tp_perf_event_destroy(struct perf_event *event) perf_trace_destroy(event); } -static const struct pmu *tp_perf_event_init(struct perf_event *event) +static struct pmu *tp_perf_event_init(struct perf_event *event) { int err; @@ -4896,7 +4896,7 @@ static void perf_event_free_filter(struct perf_event *event) #else -static const struct pmu *tp_perf_event_init(struct perf_event *event) +static struct pmu *tp_perf_event_init(struct perf_event *event) { return NULL; } @@ -4918,7 +4918,7 @@ static void bp_perf_event_destroy(struct perf_event *event) release_bp_slot(event); } -static const struct pmu *bp_perf_event_init(struct perf_event *bp) +static struct pmu *bp_perf_event_init(struct perf_event *bp) { int err; @@ -4942,7 +4942,7 @@ void perf_bp_event(struct perf_event *bp, void *data) perf_swevent_add(bp, 1, 1, &sample, regs); } #else -static const struct pmu *bp_perf_event_init(struct perf_event *bp) +static struct pmu *bp_perf_event_init(struct perf_event *bp) { return NULL; } @@ -4964,9 +4964,9 @@ static void sw_perf_event_destroy(struct perf_event *event) swevent_hlist_put(event); } -static const struct pmu *sw_perf_event_init(struct perf_event *event) +static struct pmu *sw_perf_event_init(struct perf_event *event) { - const struct pmu *pmu = NULL; + struct pmu *pmu = NULL; u64 event_id = event->attr.config; /* @@ -5028,7 +5028,7 @@ perf_event_alloc(struct perf_event_attr *attr, perf_overflow_handler_t overflow_handler, gfp_t gfpflags) { - const struct pmu *pmu; + struct pmu *pmu; struct perf_event *event; struct hw_perf_event *hwc; long err; -- cgit v1.2.3 From b0a873ebbf87bf38bf70b5e39a7cadc96099fa13 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Jun 2010 13:35:08 +0200 Subject: perf: Register PMU implementations Simple registration interface for struct pmu, this provides the infrastructure for removing all the weak functions. 
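With this, each event class becomes a self-contained struct pmu whose ->event_init() either claims the event or returns -ENOENT so the core tries the next pmu on the list. A hedged sketch of what a pmu provider is expected to do; the my_pmu_* names are illustrative, modelled on the hw_breakpoint conversion in the diff below:

    static int my_pmu_event_init(struct perf_event *event)
    {
            /* not ours: let perf_init_event() keep looking */
            if (event->attr.type != PERF_TYPE_RAW)
                    return -ENOENT;

            /* ours: any other error aborts event creation */
            return my_pmu_hw_setup(event);
    }

    static struct pmu my_pmu = {
            .event_init	= my_pmu_event_init,
            .enable		= my_pmu_enable,
            .disable	= my_pmu_disable,
            .read		= my_pmu_read,
    };

    static int __init my_pmu_init(void)
    {
            return perf_pmu_register(&my_pmu);
    }
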
Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Will Deacon Cc: Paul Mundt Cc: Frederic Weisbecker Cc: Cyrill Gorcunov Cc: Lin Ming Cc: Yanmin Cc: Deng-Cheng Zhu Cc: David Miller Cc: Michael Cree LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/hw_breakpoint.c | 35 ++- kernel/perf_event.c | 588 ++++++++++++++++++++++++------------------------- 2 files changed, 320 insertions(+), 303 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index d71a987fd2bf..e9c5cfa1fd20 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -565,6 +565,34 @@ static struct notifier_block hw_breakpoint_exceptions_nb = { .priority = 0x7fffffff }; +static void bp_perf_event_destroy(struct perf_event *event) +{ + release_bp_slot(event); +} + +static int hw_breakpoint_event_init(struct perf_event *bp) +{ + int err; + + if (bp->attr.type != PERF_TYPE_BREAKPOINT) + return -ENOENT; + + err = register_perf_hw_breakpoint(bp); + if (err) + return err; + + bp->destroy = bp_perf_event_destroy; + + return 0; +} + +static struct pmu perf_breakpoint = { + .event_init = hw_breakpoint_event_init, + .enable = arch_install_hw_breakpoint, + .disable = arch_uninstall_hw_breakpoint, + .read = hw_breakpoint_pmu_read, +}; + static int __init init_hw_breakpoint(void) { unsigned int **task_bp_pinned; @@ -586,6 +614,8 @@ static int __init init_hw_breakpoint(void) constraints_initialized = 1; + perf_pmu_register(&perf_breakpoint); + return register_die_notifier(&hw_breakpoint_exceptions_nb); err_alloc: @@ -601,8 +631,3 @@ static int __init init_hw_breakpoint(void) core_initcall(init_hw_breakpoint); -struct pmu perf_ops_bp = { - .enable = arch_install_hw_breakpoint, - .disable = arch_uninstall_hw_breakpoint, - .read = hw_breakpoint_pmu_read, -}; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index fb46fd13f31f..288ce43de57c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -31,7 +31,6 @@ #include #include #include -#include #include @@ -72,14 +71,6 @@ static atomic64_t perf_event_id; */ static DEFINE_SPINLOCK(perf_resource_lock); -/* - * Architecture provided APIs - weak aliases: - */ -extern __weak struct pmu *hw_perf_event_init(struct perf_event *event) -{ - return NULL; -} - void __weak hw_perf_disable(void) { barrier(); } void __weak hw_perf_enable(void) { barrier(); } @@ -4501,182 +4492,6 @@ static int perf_swevent_int(struct perf_event *event) return 0; } -static struct pmu perf_ops_generic = { - .enable = perf_swevent_enable, - .disable = perf_swevent_disable, - .start = perf_swevent_int, - .stop = perf_swevent_void, - .read = perf_swevent_read, - .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ -}; - -/* - * hrtimer based swevent callback - */ - -static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) -{ - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct pt_regs *regs; - struct perf_event *event; - u64 period; - - event = container_of(hrtimer, struct perf_event, hw.hrtimer); - event->pmu->read(event); - - perf_sample_data_init(&data, 0); - data.period = event->hw.last_period; - regs = get_irq_regs(); - - if (regs && !perf_exclude_event(event, regs)) { - if (!(event->attr.exclude_idle && current->pid == 0)) - if (perf_event_overflow(event, 0, &data, regs)) - ret = HRTIMER_NORESTART; - } - - period = max_t(u64, 10000, event->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - - return ret; -} - -static void 
perf_swevent_start_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swevent_hrtimer; - if (hwc->sample_period) { - u64 period; - - if (hwc->remaining) { - if (hwc->remaining < 0) - period = 10000; - else - period = hwc->remaining; - hwc->remaining = 0; - } else { - period = max_t(u64, 10000, hwc->sample_period); - } - __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); - } -} - -static void perf_swevent_cancel_hrtimer(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - - if (hwc->sample_period) { - ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); - hwc->remaining = ktime_to_ns(remaining); - - hrtimer_cancel(&hwc->hrtimer); - } -} - -/* - * Software event: cpu wall time clock - */ - -static void cpu_clock_perf_event_update(struct perf_event *event) -{ - int cpu = raw_smp_processor_id(); - s64 prev; - u64 now; - - now = cpu_clock(cpu); - prev = local64_xchg(&event->hw.prev_count, now); - local64_add(now - prev, &event->count); -} - -static int cpu_clock_perf_event_enable(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - int cpu = raw_smp_processor_id(); - - local64_set(&hwc->prev_count, cpu_clock(cpu)); - perf_swevent_start_hrtimer(event); - - return 0; -} - -static void cpu_clock_perf_event_disable(struct perf_event *event) -{ - perf_swevent_cancel_hrtimer(event); - cpu_clock_perf_event_update(event); -} - -static void cpu_clock_perf_event_read(struct perf_event *event) -{ - cpu_clock_perf_event_update(event); -} - -static struct pmu perf_ops_cpu_clock = { - .enable = cpu_clock_perf_event_enable, - .disable = cpu_clock_perf_event_disable, - .read = cpu_clock_perf_event_read, -}; - -/* - * Software event: task time clock - */ - -static void task_clock_perf_event_update(struct perf_event *event, u64 now) -{ - u64 prev; - s64 delta; - - prev = local64_xchg(&event->hw.prev_count, now); - delta = now - prev; - local64_add(delta, &event->count); -} - -static int task_clock_perf_event_enable(struct perf_event *event) -{ - struct hw_perf_event *hwc = &event->hw; - u64 now; - - now = event->ctx->time; - - local64_set(&hwc->prev_count, now); - - perf_swevent_start_hrtimer(event); - - return 0; -} - -static void task_clock_perf_event_disable(struct perf_event *event) -{ - perf_swevent_cancel_hrtimer(event); - task_clock_perf_event_update(event, event->ctx->time); - -} - -static void task_clock_perf_event_read(struct perf_event *event) -{ - u64 time; - - if (!in_nmi()) { - update_context_time(event->ctx); - time = event->ctx->time; - } else { - u64 now = perf_clock(); - u64 delta = now - event->ctx->timestamp; - time = event->ctx->time + delta; - } - - task_clock_perf_event_update(event, time); -} - -static struct pmu perf_ops_task_clock = { - .enable = task_clock_perf_event_enable, - .disable = task_clock_perf_event_disable, - .read = task_clock_perf_event_read, -}; - /* Deref the hlist from the update side */ static inline struct swevent_hlist * swevent_hlist_deref(struct perf_cpu_context *cpuctx) @@ -4783,17 +4598,63 @@ static int swevent_hlist_get(struct perf_event *event) return err; } -#ifdef CONFIG_EVENT_TRACING +atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; -static struct pmu perf_ops_tracepoint = { - .enable = perf_trace_enable, - .disable = perf_trace_disable, +static void sw_perf_event_destroy(struct perf_event *event) +{ + u64 event_id = event->attr.config; + + 
WARN_ON(event->parent); + + atomic_dec(&perf_swevent_enabled[event_id]); + swevent_hlist_put(event); +} + +static int perf_swevent_init(struct perf_event *event) +{ + int event_id = event->attr.config; + + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + switch (event_id) { + case PERF_COUNT_SW_CPU_CLOCK: + case PERF_COUNT_SW_TASK_CLOCK: + return -ENOENT; + + default: + break; + } + + if (event_id > PERF_COUNT_SW_MAX) + return -ENOENT; + + if (!event->parent) { + int err; + + err = swevent_hlist_get(event); + if (err) + return err; + + atomic_inc(&perf_swevent_enabled[event_id]); + event->destroy = sw_perf_event_destroy; + } + + return 0; +} + +static struct pmu perf_swevent = { + .event_init = perf_swevent_init, + .enable = perf_swevent_enable, + .disable = perf_swevent_disable, .start = perf_swevent_int, .stop = perf_swevent_void, .read = perf_swevent_read, - .unthrottle = perf_swevent_void, + .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ }; +#ifdef CONFIG_EVENT_TRACING + static int perf_tp_filter_match(struct perf_event *event, struct perf_sample_data *data) { @@ -4849,10 +4710,13 @@ static void tp_perf_event_destroy(struct perf_event *event) perf_trace_destroy(event); } -static struct pmu *tp_perf_event_init(struct perf_event *event) +static int perf_tp_event_init(struct perf_event *event) { int err; + if (event->attr.type != PERF_TYPE_TRACEPOINT) + return -ENOENT; + /* * Raw tracepoint data is a severe data leak, only allow root to * have these. @@ -4860,15 +4724,30 @@ static struct pmu *tp_perf_event_init(struct perf_event *event) if ((event->attr.sample_type & PERF_SAMPLE_RAW) && perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN)) - return ERR_PTR(-EPERM); + return -EPERM; err = perf_trace_init(event); if (err) - return NULL; + return err; event->destroy = tp_perf_event_destroy; - return &perf_ops_tracepoint; + return 0; +} + +static struct pmu perf_tracepoint = { + .event_init = perf_tp_event_init, + .enable = perf_trace_enable, + .disable = perf_trace_disable, + .start = perf_swevent_int, + .stop = perf_swevent_void, + .read = perf_swevent_read, + .unthrottle = perf_swevent_void, +}; + +static inline void perf_tp_register(void) +{ + perf_pmu_register(&perf_tracepoint); } static int perf_event_set_filter(struct perf_event *event, void __user *arg) @@ -4896,9 +4775,8 @@ static void perf_event_free_filter(struct perf_event *event) #else -static struct pmu *tp_perf_event_init(struct perf_event *event) +static inline void perf_tp_register(void) { - return NULL; } static int perf_event_set_filter(struct perf_event *event, void __user *arg) @@ -4913,105 +4791,247 @@ static void perf_event_free_filter(struct perf_event *event) #endif /* CONFIG_EVENT_TRACING */ #ifdef CONFIG_HAVE_HW_BREAKPOINT -static void bp_perf_event_destroy(struct perf_event *event) +void perf_bp_event(struct perf_event *bp, void *data) { - release_bp_slot(event); + struct perf_sample_data sample; + struct pt_regs *regs = data; + + perf_sample_data_init(&sample, bp->attr.bp_addr); + + if (!perf_exclude_event(bp, regs)) + perf_swevent_add(bp, 1, 1, &sample, regs); } +#endif + +/* + * hrtimer based swevent callback + */ -static struct pmu *bp_perf_event_init(struct perf_event *bp) +static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) { - int err; + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct pt_regs *regs; + struct perf_event *event; + u64 period; - err = register_perf_hw_breakpoint(bp); - if (err) - return 
ERR_PTR(err); + event = container_of(hrtimer, struct perf_event, hw.hrtimer); + event->pmu->read(event); + + perf_sample_data_init(&data, 0); + data.period = event->hw.last_period; + regs = get_irq_regs(); + + if (regs && !perf_exclude_event(event, regs)) { + if (!(event->attr.exclude_idle && current->pid == 0)) + if (perf_event_overflow(event, 0, &data, regs)) + ret = HRTIMER_NORESTART; + } - bp->destroy = bp_perf_event_destroy; + period = max_t(u64, 10000, event->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - return &perf_ops_bp; + return ret; } -void perf_bp_event(struct perf_event *bp, void *data) +static void perf_swevent_start_hrtimer(struct perf_event *event) { - struct perf_sample_data sample; - struct pt_regs *regs = data; + struct hw_perf_event *hwc = &event->hw; - perf_sample_data_init(&sample, bp->attr.bp_addr); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swevent_hrtimer; + if (hwc->sample_period) { + u64 period; - if (!perf_exclude_event(bp, regs)) - perf_swevent_add(bp, 1, 1, &sample, regs); + if (hwc->remaining) { + if (hwc->remaining < 0) + period = 10000; + else + period = hwc->remaining; + hwc->remaining = 0; + } else { + period = max_t(u64, 10000, hwc->sample_period); + } + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(period), 0, + HRTIMER_MODE_REL, 0); + } } -#else -static struct pmu *bp_perf_event_init(struct perf_event *bp) + +static void perf_swevent_cancel_hrtimer(struct perf_event *event) { - return NULL; + struct hw_perf_event *hwc = &event->hw; + + if (hwc->sample_period) { + ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); + hwc->remaining = ktime_to_ns(remaining); + + hrtimer_cancel(&hwc->hrtimer); + } } -void perf_bp_event(struct perf_event *bp, void *regs) +/* + * Software event: cpu wall time clock + */ + +static void cpu_clock_event_update(struct perf_event *event) { + int cpu = raw_smp_processor_id(); + s64 prev; + u64 now; + + now = cpu_clock(cpu); + prev = local64_xchg(&event->hw.prev_count, now); + local64_add(now - prev, &event->count); } -#endif -atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX]; +static int cpu_clock_event_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int cpu = raw_smp_processor_id(); -static void sw_perf_event_destroy(struct perf_event *event) + local64_set(&hwc->prev_count, cpu_clock(cpu)); + perf_swevent_start_hrtimer(event); + + return 0; +} + +static void cpu_clock_event_disable(struct perf_event *event) { - u64 event_id = event->attr.config; + perf_swevent_cancel_hrtimer(event); + cpu_clock_event_update(event); +} - WARN_ON(event->parent); +static void cpu_clock_event_read(struct perf_event *event) +{ + cpu_clock_event_update(event); +} - atomic_dec(&perf_swevent_enabled[event_id]); - swevent_hlist_put(event); +static int cpu_clock_event_init(struct perf_event *event) +{ + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK) + return -ENOENT; + + return 0; } -static struct pmu *sw_perf_event_init(struct perf_event *event) +static struct pmu perf_cpu_clock = { + .event_init = cpu_clock_event_init, + .enable = cpu_clock_event_enable, + .disable = cpu_clock_event_disable, + .read = cpu_clock_event_read, +}; + +/* + * Software event: task time clock + */ + +static void task_clock_event_update(struct perf_event *event, u64 now) { - struct pmu *pmu = NULL; - u64 event_id = event->attr.config; + u64 prev; + s64 delta; - /* - * Software events 
(currently) can't in general distinguish - * between user, kernel and hypervisor events. - * However, context switches and cpu migrations are considered - * to be kernel events, and page faults are never hypervisor - * events. - */ - switch (event_id) { - case PERF_COUNT_SW_CPU_CLOCK: - pmu = &perf_ops_cpu_clock; + prev = local64_xchg(&event->hw.prev_count, now); + delta = now - prev; + local64_add(delta, &event->count); +} - break; - case PERF_COUNT_SW_TASK_CLOCK: - /* - * If the user instantiates this as a per-cpu event, - * use the cpu_clock event instead. - */ - if (event->ctx->task) - pmu = &perf_ops_task_clock; - else - pmu = &perf_ops_cpu_clock; +static int task_clock_event_enable(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 now; - break; - case PERF_COUNT_SW_PAGE_FAULTS: - case PERF_COUNT_SW_PAGE_FAULTS_MIN: - case PERF_COUNT_SW_PAGE_FAULTS_MAJ: - case PERF_COUNT_SW_CONTEXT_SWITCHES: - case PERF_COUNT_SW_CPU_MIGRATIONS: - case PERF_COUNT_SW_ALIGNMENT_FAULTS: - case PERF_COUNT_SW_EMULATION_FAULTS: - if (!event->parent) { - int err; - - err = swevent_hlist_get(event); - if (err) - return ERR_PTR(err); + now = event->ctx->time; - atomic_inc(&perf_swevent_enabled[event_id]); - event->destroy = sw_perf_event_destroy; + local64_set(&hwc->prev_count, now); + + perf_swevent_start_hrtimer(event); + + return 0; +} + +static void task_clock_event_disable(struct perf_event *event) +{ + perf_swevent_cancel_hrtimer(event); + task_clock_event_update(event, event->ctx->time); + +} + +static void task_clock_event_read(struct perf_event *event) +{ + u64 time; + + if (!in_nmi()) { + update_context_time(event->ctx); + time = event->ctx->time; + } else { + u64 now = perf_clock(); + u64 delta = now - event->ctx->timestamp; + time = event->ctx->time + delta; + } + + task_clock_event_update(event, time); +} + +static int task_clock_event_init(struct perf_event *event) +{ + if (event->attr.type != PERF_TYPE_SOFTWARE) + return -ENOENT; + + if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK) + return -ENOENT; + + return 0; +} + +static struct pmu perf_task_clock = { + .event_init = task_clock_event_init, + .enable = task_clock_event_enable, + .disable = task_clock_event_disable, + .read = task_clock_event_read, +}; + +static LIST_HEAD(pmus); +static DEFINE_MUTEX(pmus_lock); +static struct srcu_struct pmus_srcu; + +int perf_pmu_register(struct pmu *pmu) +{ + mutex_lock(&pmus_lock); + list_add_rcu(&pmu->entry, &pmus); + mutex_unlock(&pmus_lock); + + return 0; +} + +void perf_pmu_unregister(struct pmu *pmu) +{ + mutex_lock(&pmus_lock); + list_del_rcu(&pmu->entry); + mutex_unlock(&pmus_lock); + + synchronize_srcu(&pmus_srcu); +} + +struct pmu *perf_init_event(struct perf_event *event) +{ + struct pmu *pmu = NULL; + int idx; + + idx = srcu_read_lock(&pmus_srcu); + list_for_each_entry_rcu(pmu, &pmus, entry) { + int ret = pmu->event_init(event); + if (!ret) + break; + if (ret != -ENOENT) { + pmu = ERR_PTR(ret); + break; } - pmu = &perf_ops_generic; - break; } + srcu_read_unlock(&pmus_srcu, idx); return pmu; } @@ -5092,29 +5112,8 @@ perf_event_alloc(struct perf_event_attr *attr, if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP)) goto done; - switch (attr->type) { - case PERF_TYPE_RAW: - case PERF_TYPE_HARDWARE: - case PERF_TYPE_HW_CACHE: - pmu = hw_perf_event_init(event); - break; - - case PERF_TYPE_SOFTWARE: - pmu = sw_perf_event_init(event); - break; - - case PERF_TYPE_TRACEPOINT: - pmu = tp_perf_event_init(event); - break; + pmu = perf_init_event(event); - case 
PERF_TYPE_BREAKPOINT: - pmu = bp_perf_event_init(event); - break; - - - default: - break; - } done: err = 0; if (!pmu) @@ -5979,22 +5978,15 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) return NOTIFY_OK; } -/* - * This has to have a higher priority than migration_notifier in sched.c. - */ -static struct notifier_block __cpuinitdata perf_cpu_nb = { - .notifier_call = perf_cpu_notify, - .priority = 20, -}; - void __init perf_event_init(void) { perf_event_init_all_cpus(); - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, - (void *)(long)smp_processor_id()); - perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_ONLINE, - (void *)(long)smp_processor_id()); - register_cpu_notifier(&perf_cpu_nb); + init_srcu_struct(&pmus_srcu); + perf_pmu_register(&perf_swevent); + perf_pmu_register(&perf_cpu_clock); + perf_pmu_register(&perf_task_clock); + perf_tp_register(); + perf_cpu_notifier(perf_cpu_notify); } static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, -- cgit v1.2.3 From 9ed6060d286b1eb55974d09080f442f809408c42 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Jun 2010 17:36:35 +0200 Subject: perf: Unindent labels Fixup random annoying style bits. Signed-off-by: Peter Zijlstra Cc: paulus LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 43 ++++++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 288ce43de57c..149ca18371b7 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -147,7 +147,7 @@ perf_lock_task_context(struct task_struct *task, unsigned long *flags) struct perf_event_context *ctx; rcu_read_lock(); - retry: +retry: ctx = rcu_dereference(task->perf_event_ctxp); if (ctx) { /* @@ -619,7 +619,7 @@ void perf_event_disable(struct perf_event *event) return; } - retry: +retry: task_oncpu_function_call(task, __perf_event_disable, event); raw_spin_lock_irq(&ctx->lock); @@ -849,7 +849,7 @@ static void __perf_install_in_context(void *info) if (!err && !ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; - unlock: +unlock: perf_enable(); raw_spin_unlock(&ctx->lock); @@ -922,10 +922,12 @@ static void __perf_event_mark_enabled(struct perf_event *event, event->state = PERF_EVENT_STATE_INACTIVE; event->tstamp_enabled = ctx->time - event->total_time_enabled; - list_for_each_entry(sub, &event->sibling_list, group_entry) - if (sub->state >= PERF_EVENT_STATE_INACTIVE) + list_for_each_entry(sub, &event->sibling_list, group_entry) { + if (sub->state >= PERF_EVENT_STATE_INACTIVE) { sub->tstamp_enabled = ctx->time - sub->total_time_enabled; + } + } } /* @@ -991,7 +993,7 @@ static void __perf_event_enable(void *info) } } - unlock: +unlock: raw_spin_unlock(&ctx->lock); } @@ -1032,7 +1034,7 @@ void perf_event_enable(struct perf_event *event) if (event->state == PERF_EVENT_STATE_ERROR) event->state = PERF_EVENT_STATE_OFF; - retry: +retry: raw_spin_unlock_irq(&ctx->lock); task_oncpu_function_call(task, __perf_event_enable, event); @@ -1052,7 +1054,7 @@ void perf_event_enable(struct perf_event *event) if (event->state == PERF_EVENT_STATE_OFF) __perf_event_mark_enabled(event, ctx); - out: +out: raw_spin_unlock_irq(&ctx->lock); } @@ -1092,17 +1094,19 @@ static void ctx_sched_out(struct perf_event_context *ctx, if (!ctx->nr_active) goto out_enable; - if (event_type & EVENT_PINNED) + if (event_type & EVENT_PINNED) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, 
cpuctx, ctx); + } - if (event_type & EVENT_FLEXIBLE) + if (event_type & EVENT_FLEXIBLE) { list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); + } out_enable: perf_enable(); - out: +out: raw_spin_unlock(&ctx->lock); } @@ -1341,9 +1345,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx, if (event->cpu != -1 && event->cpu != smp_processor_id()) continue; - if (group_can_go_on(event, cpuctx, can_add_hw)) + if (group_can_go_on(event, cpuctx, can_add_hw)) { if (group_sched_in(event, cpuctx, ctx)) can_add_hw = 0; + } } } @@ -1373,7 +1378,7 @@ ctx_sched_in(struct perf_event_context *ctx, ctx_flexible_sched_in(ctx, cpuctx); perf_enable(); - out: +out: raw_spin_unlock(&ctx->lock); } @@ -1714,7 +1719,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) raw_spin_unlock(&ctx->lock); perf_event_task_sched_in(task); - out: +out: local_irq_restore(flags); } @@ -2053,7 +2058,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto errout; - retry: +retry: ctx = perf_lock_task_context(task, &flags); if (ctx) { unclone_ctx(ctx); @@ -2081,7 +2086,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) put_task_struct(task); return ctx; - errout: +errout: put_task_struct(task); return ERR_PTR(err); } @@ -3264,7 +3269,7 @@ again: if (handle->wakeup != local_read(&buffer->wakeup)) perf_output_wakeup(handle); - out: +out: preempt_enable(); } @@ -4562,7 +4567,7 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) rcu_assign_pointer(cpuctx->swevent_hlist, hlist); } cpuctx->hlist_refcount++; - exit: +exit: mutex_unlock(&cpuctx->hlist_mutex); return err; @@ -4587,7 +4592,7 @@ static int swevent_hlist_get(struct perf_event *event) put_online_cpus(); return 0; - fail: +fail: for_each_possible_cpu(cpu) { if (cpu == failed_cpu) break; -- cgit v1.2.3 From 24cd7f54a0d47e1d5b3de29e2456bfbd2d8447b7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 11 Jun 2010 17:32:03 +0200 Subject: perf: Reduce perf_disable() usage Since the current perf_disable() usage is only an optimization, remove it for now. This eases the removal of the __weak hw_perf_enable() interface. Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Will Deacon Cc: Paul Mundt Cc: Frederic Weisbecker Cc: Cyrill Gorcunov Cc: Lin Ming Cc: Yanmin Cc: Deng-Cheng Zhu Cc: David Miller Cc: Michael Cree LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 37 +------------------------------------ 1 file changed, 1 insertion(+), 36 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 149ca18371b7..9a98ce953561 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -478,11 +478,6 @@ static void __perf_event_remove_from_context(void *info) return; raw_spin_lock(&ctx->lock); - /* - * Protect the list operation against NMI by disabling the - * events on a global level. - */ - perf_disable(); event_sched_out(event, cpuctx, ctx); @@ -498,7 +493,6 @@ static void __perf_event_remove_from_context(void *info) perf_max_events - perf_reserved_percpu); } - perf_enable(); raw_spin_unlock(&ctx->lock); } @@ -803,12 +797,6 @@ static void __perf_install_in_context(void *info) ctx->is_active = 1; update_context_time(ctx); - /* - * Protect the list operation against NMI by disabling the - * events on a global level. NOP for non NMI based events. 
- */ - perf_disable(); - add_event_to_ctx(event, ctx); if (event->cpu != -1 && event->cpu != smp_processor_id()) @@ -850,8 +838,6 @@ static void __perf_install_in_context(void *info) cpuctx->max_pertask--; unlock: - perf_enable(); - raw_spin_unlock(&ctx->lock); } @@ -972,12 +958,10 @@ static void __perf_event_enable(void *info) if (!group_can_go_on(event, cpuctx, 1)) { err = -EEXIST; } else { - perf_disable(); if (event == leader) err = group_sched_in(event, cpuctx, ctx); else err = event_sched_in(event, cpuctx, ctx); - perf_enable(); } if (err) { @@ -1090,9 +1074,8 @@ static void ctx_sched_out(struct perf_event_context *ctx, goto out; update_context_time(ctx); - perf_disable(); if (!ctx->nr_active) - goto out_enable; + goto out; if (event_type & EVENT_PINNED) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) @@ -1103,9 +1086,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } - - out_enable: - perf_enable(); out: raw_spin_unlock(&ctx->lock); } @@ -1364,8 +1344,6 @@ ctx_sched_in(struct perf_event_context *ctx, ctx->timestamp = perf_clock(); - perf_disable(); - /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. @@ -1377,7 +1355,6 @@ ctx_sched_in(struct perf_event_context *ctx, if (event_type & EVENT_FLEXIBLE) ctx_flexible_sched_in(ctx, cpuctx); - perf_enable(); out: raw_spin_unlock(&ctx->lock); } @@ -1425,8 +1402,6 @@ void perf_event_task_sched_in(struct task_struct *task) if (cpuctx->task_ctx == ctx) return; - perf_disable(); - /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -1439,8 +1414,6 @@ void perf_event_task_sched_in(struct task_struct *task) ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); cpuctx->task_ctx = ctx; - - perf_enable(); } #define MAX_INTERRUPTS (~0ULL) @@ -1555,11 +1528,9 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { - perf_disable(); perf_event_stop(event); local64_set(&hwc->period_left, 0); perf_event_start(event); - perf_enable(); } } @@ -1588,15 +1559,12 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) */ if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(event, 1); - perf_disable(); event->pmu->unthrottle(event); - perf_enable(); } if (!event->attr.freq || !event->attr.sample_freq) continue; - perf_disable(); event->pmu->read(event); now = local64_read(&event->count); delta = now - hwc->freq_count_stamp; @@ -1604,7 +1572,6 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) if (delta > 0) perf_adjust_period(event, TICK_NSEC, delta); - perf_enable(); } raw_spin_unlock(&ctx->lock); } @@ -1647,7 +1614,6 @@ void perf_event_task_tick(struct task_struct *curr) if (!rotate) return; - perf_disable(); cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (ctx) task_ctx_sched_out(ctx, EVENT_FLEXIBLE); @@ -1659,7 +1625,6 @@ void perf_event_task_tick(struct task_struct *curr) cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) task_ctx_sched_in(curr, EVENT_FLEXIBLE); - perf_enable(); } static int event_enable_on_exec(struct perf_event *event, -- cgit v1.2.3 From 33696fc0d141bbbcb12f75b69608ea83282e3117 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Jun 2010 08:49:00 +0200 Subject: perf: Per PMU disable Changes perf_disable() into perf_pmu_disable(). 
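In caller terms: instead of the single global perf_disable()/perf_enable() pair backed by the __weak hw_perf_disable()/hw_perf_enable() hooks, disabling now nests per PMU through a per-cpu count. A minimal caller-side sketch (illustrative only; the helpers themselves are in the hunk below):

        perf_pmu_disable(event->pmu);   /* outermost call invokes pmu->pmu_disable() */
        /* ... reprogram counters on this PMU ... */
        perf_pmu_enable(event->pmu);    /* last matching call invokes pmu->pmu_enable() */
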
Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Will Deacon Cc: Paul Mundt Cc: Frederic Weisbecker Cc: Cyrill Gorcunov Cc: Lin Ming Cc: Yanmin Cc: Deng-Cheng Zhu Cc: David Miller Cc: Michael Cree LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 31 +++++++++++++++++++------------ 1 file changed, 19 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 9a98ce953561..5ed0c06765bb 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -71,23 +71,20 @@ static atomic64_t perf_event_id; */ static DEFINE_SPINLOCK(perf_resource_lock); -void __weak hw_perf_disable(void) { barrier(); } -void __weak hw_perf_enable(void) { barrier(); } - void __weak perf_event_print_debug(void) { } -static DEFINE_PER_CPU(int, perf_disable_count); - -void perf_disable(void) +void perf_pmu_disable(struct pmu *pmu) { - if (!__get_cpu_var(perf_disable_count)++) - hw_perf_disable(); + int *count = this_cpu_ptr(pmu->pmu_disable_count); + if (!(*count)++) + pmu->pmu_disable(pmu); } -void perf_enable(void) +void perf_pmu_enable(struct pmu *pmu) { - if (!--__get_cpu_var(perf_disable_count)) - hw_perf_enable(); + int *count = this_cpu_ptr(pmu->pmu_disable_count); + if (!--(*count)) + pmu->pmu_enable(pmu); } static void get_ctx(struct perf_event_context *ctx) @@ -4970,11 +4967,19 @@ static struct srcu_struct pmus_srcu; int perf_pmu_register(struct pmu *pmu) { + int ret; + mutex_lock(&pmus_lock); + ret = -ENOMEM; + pmu->pmu_disable_count = alloc_percpu(int); + if (!pmu->pmu_disable_count) + goto unlock; list_add_rcu(&pmu->entry, &pmus); + ret = 0; +unlock: mutex_unlock(&pmus_lock); - return 0; + return ret; } void perf_pmu_unregister(struct pmu *pmu) @@ -4984,6 +4989,8 @@ void perf_pmu_unregister(struct pmu *pmu) mutex_unlock(&pmus_lock); synchronize_srcu(&pmus_srcu); + + free_percpu(pmu->pmu_disable_count); } struct pmu *perf_init_event(struct perf_event *event) -- cgit v1.2.3 From ad5133b7030d04ce7701aa7cbe98f561347c79c2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Jun 2010 12:22:39 +0200 Subject: perf: Default PMU ops Provide default implementations for the pmu txn methods, this allows us to remove some conditional code. 
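The payoff is on the caller side: with stubs guaranteed to exist, group scheduling can drive a transaction unconditionally. Condensed from the group_sched_in() hunk below (sibling scheduling and the error path simplified):

        pmu->start_txn(pmu);            /* real method, disable-based default, or nop */
        if (event_sched_in(group_event, cpuctx, ctx)) {
                pmu->cancel_txn(pmu);
                return -EAGAIN;
        }
        /* ... schedule the sibling events ... */
        if (!pmu->commit_txn(pmu))
                return 0;
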
Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Will Deacon Cc: Paul Mundt Cc: Frederic Weisbecker Cc: Cyrill Gorcunov Cc: Lin Ming Cc: Yanmin Cc: Deng-Cheng Zhu Cc: David Miller Cc: Michael Cree LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 64 +++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 52 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 5ed0c06765bb..8ef4ba3bcb1f 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -674,21 +674,14 @@ group_sched_in(struct perf_event *group_event, { struct perf_event *event, *partial_group = NULL; struct pmu *pmu = group_event->pmu; - bool txn = false; if (group_event->state == PERF_EVENT_STATE_OFF) return 0; - /* Check if group transaction availabe */ - if (pmu->start_txn) - txn = true; - - if (txn) - pmu->start_txn(pmu); + pmu->start_txn(pmu); if (event_sched_in(group_event, cpuctx, ctx)) { - if (txn) - pmu->cancel_txn(pmu); + pmu->cancel_txn(pmu); return -EAGAIN; } @@ -702,7 +695,7 @@ group_sched_in(struct perf_event *group_event, } } - if (!txn || !pmu->commit_txn(pmu)) + if (!pmu->commit_txn(pmu)) return 0; group_error: @@ -717,8 +710,7 @@ group_error: } event_sched_out(group_event, cpuctx, ctx); - if (txn) - pmu->cancel_txn(pmu); + pmu->cancel_txn(pmu); return -EAGAIN; } @@ -4965,6 +4957,31 @@ static LIST_HEAD(pmus); static DEFINE_MUTEX(pmus_lock); static struct srcu_struct pmus_srcu; +static void perf_pmu_nop_void(struct pmu *pmu) +{ +} + +static int perf_pmu_nop_int(struct pmu *pmu) +{ + return 0; +} + +static void perf_pmu_start_txn(struct pmu *pmu) +{ + perf_pmu_disable(pmu); +} + +static int perf_pmu_commit_txn(struct pmu *pmu) +{ + perf_pmu_enable(pmu); + return 0; +} + +static void perf_pmu_cancel_txn(struct pmu *pmu) +{ + perf_pmu_enable(pmu); +} + int perf_pmu_register(struct pmu *pmu) { int ret; @@ -4974,6 +4991,29 @@ int perf_pmu_register(struct pmu *pmu) pmu->pmu_disable_count = alloc_percpu(int); if (!pmu->pmu_disable_count) goto unlock; + + if (!pmu->start_txn) { + if (pmu->pmu_enable) { + /* + * If we have pmu_enable/pmu_disable calls, install + * transaction stubs that use that to try and batch + * hardware accesses. + */ + pmu->start_txn = perf_pmu_start_txn; + pmu->commit_txn = perf_pmu_commit_txn; + pmu->cancel_txn = perf_pmu_cancel_txn; + } else { + pmu->start_txn = perf_pmu_nop_void; + pmu->commit_txn = perf_pmu_nop_int; + pmu->cancel_txn = perf_pmu_nop_void; + } + } + + if (!pmu->pmu_enable) { + pmu->pmu_enable = perf_pmu_nop_void; + pmu->pmu_disable = perf_pmu_nop_void; + } + list_add_rcu(&pmu->entry, &pmus); ret = 0; unlock: -- cgit v1.2.3 From fa407f35e0298d841e4088f95a7f9cf6e725c6d5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 24 Jun 2010 12:35:12 +0200 Subject: perf: Shrink hw_perf_event Use hw_perf_event::period_left instead of hw_perf_event::remaining and win back 8 bytes. 
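Concretely, the hrtimer-based software events now park the leftover interval in period_left when the timer is cancelled and pick it up again on restart, rather than carrying a separate 'remaining' field. A two-line sketch of the idea (the full change is in the hunk below):

        local64_set(&hwc->period_left, ktime_to_ns(remaining));   /* on cancel */
        s64 period = local64_read(&hwc->period_left);             /* on (re)start */
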
Signed-off-by: Peter Zijlstra Cc: paulus Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 8ef4ba3bcb1f..1a6cdbf0d091 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4800,14 +4800,13 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swevent_hrtimer; if (hwc->sample_period) { - u64 period; + s64 period = local64_read(&hwc->period_left); - if (hwc->remaining) { - if (hwc->remaining < 0) + if (period) { + if (period < 0) period = 10000; - else - period = hwc->remaining; - hwc->remaining = 0; + + local64_set(&hwc->period_left, 0); } else { period = max_t(u64, 10000, hwc->sample_period); } @@ -4823,7 +4822,7 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) if (hwc->sample_period) { ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer); - hwc->remaining = ktime_to_ns(remaining); + local64_set(&hwc->period_left, ktime_to_ns(remaining)); hrtimer_cancel(&hwc->hrtimer); } -- cgit v1.2.3 From a4eaf7f14675cb512d69f0c928055e73d0c6d252 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 16 Jun 2010 14:37:10 +0200 Subject: perf: Rework the PMU methods Replace pmu::{enable,disable,start,stop,unthrottle} with pmu::{add,del,start,stop}, all of which take a flags argument. The new interface extends the capability to stop a counter while keeping it scheduled on the PMU. We replace the throttled state with the generic stopped state. This also allows us to efficiently stop/start counters over certain code paths (like IRQ handlers). It also allows scheduling a counter without it starting, allowing for a generic frozen state (useful for rotating stopped counters). 
The stopped state is implemented in two different ways, depending on how the architecture implemented the throttled state: 1) We disable the counter: a) the pmu has per-counter enable bits, we flip that b) we program a NOP event, preserving the counter state 2) We store the counter state and ignore all read/overflow events Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Will Deacon Cc: Paul Mundt Cc: Frederic Weisbecker Cc: Cyrill Gorcunov Cc: Lin Ming Cc: Yanmin Cc: Deng-Cheng Zhu Cc: David Miller Cc: Michael Cree LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/hw_breakpoint.c | 29 ++++++++- kernel/perf_event.c | 140 +++++++++++++++++++++------------------- kernel/trace/trace_event_perf.c | 7 +- 3 files changed, 104 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index e9c5cfa1fd20..6f150095cafe 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -586,10 +586,35 @@ static int hw_breakpoint_event_init(struct perf_event *bp) return 0; } +static int hw_breakpoint_add(struct perf_event *bp, int flags) +{ + if (!(flags & PERF_EF_START)) + bp->hw.state = PERF_HES_STOPPED; + + return arch_install_hw_breakpoint(bp); +} + +static void hw_breakpoint_del(struct perf_event *bp, int flags) +{ + arch_uninstall_hw_breakpoint(bp); +} + +static void hw_breakpoint_start(struct perf_event *bp, int flags) +{ + bp->hw.state = 0; +} + +static void hw_breakpoint_stop(struct perf_event *bp, int flags) +{ + bp->hw.state = PERF_HES_STOPPED; +} + static struct pmu perf_breakpoint = { .event_init = hw_breakpoint_event_init, - .enable = arch_install_hw_breakpoint, - .disable = arch_uninstall_hw_breakpoint, + .add = hw_breakpoint_add, + .del = hw_breakpoint_del, + .start = hw_breakpoint_start, + .stop = hw_breakpoint_stop, .read = hw_breakpoint_pmu_read, }; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 1a6cdbf0d091..3bace4fd0355 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -424,7 +424,7 @@ event_sched_out(struct perf_event *event, event->state = PERF_EVENT_STATE_OFF; } event->tstamp_stopped = ctx->time; - event->pmu->disable(event); + event->pmu->del(event, 0); event->oncpu = -1; if (!is_software_event(event)) @@ -649,7 +649,7 @@ event_sched_in(struct perf_event *event, */ smp_wmb(); - if (event->pmu->enable(event)) { + if (event->pmu->add(event, PERF_EF_START)) { event->state = PERF_EVENT_STATE_INACTIVE; event->oncpu = -1; return -EAGAIN; @@ -1482,22 +1482,6 @@ do { \ return div64_u64(dividend, divisor); } -static void perf_event_stop(struct perf_event *event) -{ - if (!event->pmu->stop) - return event->pmu->disable(event); - - return event->pmu->stop(event); -} - -static int perf_event_start(struct perf_event *event) -{ - if (!event->pmu->start) - return event->pmu->enable(event); - - return event->pmu->start(event); -} - static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; @@ -1517,9 +1501,9 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) hwc->sample_period = sample_period; if (local64_read(&hwc->period_left) > 8*sample_period) { - perf_event_stop(event); + event->pmu->stop(event, PERF_EF_UPDATE); local64_set(&hwc->period_left, 0); - perf_event_start(event); + event->pmu->start(event, PERF_EF_RELOAD); } } @@ -1548,7 +1532,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) */ if (interrupts == MAX_INTERRUPTS) { perf_log_throttle(event, 1); - 
event->pmu->unthrottle(event); + event->pmu->start(event, 0); } if (!event->attr.freq || !event->attr.sample_freq) @@ -2506,6 +2490,9 @@ int perf_event_task_disable(void) static int perf_event_index(struct perf_event *event) { + if (event->hw.state & PERF_HES_STOPPED) + return 0; + if (event->state != PERF_EVENT_STATE_ACTIVE) return 0; @@ -4120,8 +4107,6 @@ static int __perf_event_overflow(struct perf_event *event, int nmi, struct hw_perf_event *hwc = &event->hw; int ret = 0; - throttle = (throttle && event->pmu->unthrottle != NULL); - if (!throttle) { hwc->interrupts++; } else { @@ -4246,7 +4231,7 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, } } -static void perf_swevent_add(struct perf_event *event, u64 nr, +static void perf_swevent_event(struct perf_event *event, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) { @@ -4272,6 +4257,9 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { + if (event->hw.state & PERF_HES_STOPPED) + return 0; + if (regs) { if (event->attr.exclude_user && user_mode(regs)) return 1; @@ -4371,7 +4359,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_swevent_match(event, type, event_id, data, regs)) - perf_swevent_add(event, nr, nmi, data, regs); + perf_swevent_event(event, nr, nmi, data, regs); } end: rcu_read_unlock(); @@ -4415,7 +4403,7 @@ static void perf_swevent_read(struct perf_event *event) { } -static int perf_swevent_enable(struct perf_event *event) +static int perf_swevent_add(struct perf_event *event, int flags) { struct hw_perf_event *hwc = &event->hw; struct perf_cpu_context *cpuctx; @@ -4428,6 +4416,8 @@ static int perf_swevent_enable(struct perf_event *event) perf_swevent_set_period(event); } + hwc->state = !(flags & PERF_EF_START); + head = find_swevent_head(cpuctx, event); if (WARN_ON_ONCE(!head)) return -EINVAL; @@ -4437,18 +4427,19 @@ static int perf_swevent_enable(struct perf_event *event) return 0; } -static void perf_swevent_disable(struct perf_event *event) +static void perf_swevent_del(struct perf_event *event, int flags) { hlist_del_rcu(&event->hlist_entry); } -static void perf_swevent_void(struct perf_event *event) +static void perf_swevent_start(struct perf_event *event, int flags) { + event->hw.state = 0; } -static int perf_swevent_int(struct perf_event *event) +static void perf_swevent_stop(struct perf_event *event, int flags) { - return 0; + event->hw.state = PERF_HES_STOPPED; } /* Deref the hlist from the update side */ @@ -4604,12 +4595,11 @@ static int perf_swevent_init(struct perf_event *event) static struct pmu perf_swevent = { .event_init = perf_swevent_init, - .enable = perf_swevent_enable, - .disable = perf_swevent_disable, - .start = perf_swevent_int, - .stop = perf_swevent_void, + .add = perf_swevent_add, + .del = perf_swevent_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, .read = perf_swevent_read, - .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ }; #ifdef CONFIG_EVENT_TRACING @@ -4657,7 +4647,7 @@ void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, hlist_for_each_entry_rcu(event, node, head, hlist_entry) { if (perf_tp_event_match(event, &data, regs)) - perf_swevent_add(event, count, 1, &data, regs); + perf_swevent_event(event, count, 1, &data, regs); } perf_swevent_put_recursion_context(rctx); @@ -4696,12 +4686,11 @@ static int 
perf_tp_event_init(struct perf_event *event) static struct pmu perf_tracepoint = { .event_init = perf_tp_event_init, - .enable = perf_trace_enable, - .disable = perf_trace_disable, - .start = perf_swevent_int, - .stop = perf_swevent_void, + .add = perf_trace_add, + .del = perf_trace_del, + .start = perf_swevent_start, + .stop = perf_swevent_stop, .read = perf_swevent_read, - .unthrottle = perf_swevent_void, }; static inline void perf_tp_register(void) @@ -4757,8 +4746,8 @@ void perf_bp_event(struct perf_event *bp, void *data) perf_sample_data_init(&sample, bp->attr.bp_addr); - if (!perf_exclude_event(bp, regs)) - perf_swevent_add(bp, 1, 1, &sample, regs); + if (!bp->hw.state && !perf_exclude_event(bp, regs)) + perf_swevent_event(bp, 1, 1, &sample, regs); } #endif @@ -4834,32 +4823,39 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event) static void cpu_clock_event_update(struct perf_event *event) { - int cpu = raw_smp_processor_id(); s64 prev; u64 now; - now = cpu_clock(cpu); + now = local_clock(); prev = local64_xchg(&event->hw.prev_count, now); local64_add(now - prev, &event->count); } -static int cpu_clock_event_enable(struct perf_event *event) +static void cpu_clock_event_start(struct perf_event *event, int flags) { - struct hw_perf_event *hwc = &event->hw; - int cpu = raw_smp_processor_id(); - - local64_set(&hwc->prev_count, cpu_clock(cpu)); + local64_set(&event->hw.prev_count, local_clock()); perf_swevent_start_hrtimer(event); - - return 0; } -static void cpu_clock_event_disable(struct perf_event *event) +static void cpu_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); cpu_clock_event_update(event); } +static int cpu_clock_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + cpu_clock_event_start(event, flags); + + return 0; +} + +static void cpu_clock_event_del(struct perf_event *event, int flags) +{ + cpu_clock_event_stop(event, flags); +} + static void cpu_clock_event_read(struct perf_event *event) { cpu_clock_event_update(event); @@ -4878,8 +4874,10 @@ static int cpu_clock_event_init(struct perf_event *event) static struct pmu perf_cpu_clock = { .event_init = cpu_clock_event_init, - .enable = cpu_clock_event_enable, - .disable = cpu_clock_event_disable, + .add = cpu_clock_event_add, + .del = cpu_clock_event_del, + .start = cpu_clock_event_start, + .stop = cpu_clock_event_stop, .read = cpu_clock_event_read, }; @@ -4897,25 +4895,29 @@ static void task_clock_event_update(struct perf_event *event, u64 now) local64_add(delta, &event->count); } -static int task_clock_event_enable(struct perf_event *event) +static void task_clock_event_start(struct perf_event *event, int flags) { - struct hw_perf_event *hwc = &event->hw; - u64 now; - - now = event->ctx->time; - - local64_set(&hwc->prev_count, now); - + local64_set(&event->hw.prev_count, event->ctx->time); perf_swevent_start_hrtimer(event); - - return 0; } -static void task_clock_event_disable(struct perf_event *event) +static void task_clock_event_stop(struct perf_event *event, int flags) { perf_swevent_cancel_hrtimer(event); task_clock_event_update(event, event->ctx->time); +} + +static int task_clock_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + task_clock_event_start(event, flags); + return 0; +} + +static void task_clock_event_del(struct perf_event *event, int flags) +{ + task_clock_event_stop(event, PERF_EF_UPDATE); } static void task_clock_event_read(struct perf_event *event) @@ -4947,8 +4949,10 @@ static int 
task_clock_event_init(struct perf_event *event) static struct pmu perf_task_clock = { .event_init = task_clock_event_init, - .enable = task_clock_event_enable, - .disable = task_clock_event_disable, + .add = task_clock_event_add, + .del = task_clock_event_del, + .start = task_clock_event_start, + .stop = task_clock_event_stop, .read = task_clock_event_read, }; diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index f3bbcd1c90c8..39c059ca670e 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -101,7 +101,7 @@ int perf_trace_init(struct perf_event *p_event) return ret; } -int perf_trace_enable(struct perf_event *p_event) +int perf_trace_add(struct perf_event *p_event, int flags) { struct ftrace_event_call *tp_event = p_event->tp_event; struct hlist_head __percpu *pcpu_list; @@ -111,13 +111,16 @@ int perf_trace_enable(struct perf_event *p_event) if (WARN_ON_ONCE(!pcpu_list)) return -EINVAL; + if (!(flags & PERF_EF_START)) + p_event->hw.state = PERF_HES_STOPPED; + list = this_cpu_ptr(pcpu_list); hlist_add_head_rcu(&p_event->hlist_entry, list); return 0; } -void perf_trace_disable(struct perf_event *p_event) +void perf_trace_del(struct perf_event *p_event, int flags) { hlist_del_rcu(&p_event->hlist_entry); } -- cgit v1.2.3 From 15ac9a395a753cb28c674e7ea80386ffdff21785 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Sep 2010 15:51:45 +0200 Subject: perf: Remove the sysfs bits Neither the overcommit nor the reservation sysfs parameter were actually working, remove them as they'll only get in the way. Signed-off-by: Peter Zijlstra Cc: paulus LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 124 ---------------------------------------------------- 1 file changed, 124 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3bace4fd0355..8462e69409ae 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -39,10 +39,6 @@ */ static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); -int perf_max_events __read_mostly = 1; -static int perf_reserved_percpu __read_mostly; -static int perf_overcommit __read_mostly = 1; - static atomic_t nr_events __read_mostly; static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; @@ -66,11 +62,6 @@ int sysctl_perf_event_sample_rate __read_mostly = 100000; static atomic64_t perf_event_id; -/* - * Lock for (sysadmin-configurable) event reservations: - */ -static DEFINE_SPINLOCK(perf_resource_lock); - void __weak perf_event_print_debug(void) { } void perf_pmu_disable(struct pmu *pmu) @@ -480,16 +471,6 @@ static void __perf_event_remove_from_context(void *info) list_del_event(event, ctx); - if (!ctx->task) { - /* - * Allow more per task events with respect to the - * reservation: - */ - cpuctx->max_pertask = - min(perf_max_events - ctx->nr_events, - perf_max_events - perf_reserved_percpu); - } - raw_spin_unlock(&ctx->lock); } @@ -823,9 +804,6 @@ static void __perf_install_in_context(void *info) } } - if (!err && !ctx->task && cpuctx->max_pertask) - cpuctx->max_pertask--; - unlock: raw_spin_unlock(&ctx->lock); } @@ -5930,10 +5908,6 @@ static void __cpuinit perf_event_init_cpu(int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); - spin_lock(&perf_resource_lock); - cpuctx->max_pertask = perf_max_events - perf_reserved_percpu; - spin_unlock(&perf_resource_lock); - mutex_lock(&cpuctx->hlist_mutex); if (cpuctx->hlist_refcount > 0) { struct swevent_hlist *hlist; @@ -6008,101 +5982,3 @@ void __init 
perf_event_init(void) perf_tp_register(); perf_cpu_notifier(perf_cpu_notify); } - -static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - char *buf) -{ - return sprintf(buf, "%d\n", perf_reserved_percpu); -} - -static ssize_t -perf_set_reserve_percpu(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - const char *buf, - size_t count) -{ - struct perf_cpu_context *cpuctx; - unsigned long val; - int err, cpu, mpt; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > perf_max_events) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_reserved_percpu = val; - for_each_online_cpu(cpu) { - cpuctx = &per_cpu(perf_cpu_context, cpu); - raw_spin_lock_irq(&cpuctx->ctx.lock); - mpt = min(perf_max_events - cpuctx->ctx.nr_events, - perf_max_events - perf_reserved_percpu); - cpuctx->max_pertask = mpt; - raw_spin_unlock_irq(&cpuctx->ctx.lock); - } - spin_unlock(&perf_resource_lock); - - return count; -} - -static ssize_t perf_show_overcommit(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - char *buf) -{ - return sprintf(buf, "%d\n", perf_overcommit); -} - -static ssize_t -perf_set_overcommit(struct sysdev_class *class, - struct sysdev_class_attribute *attr, - const char *buf, size_t count) -{ - unsigned long val; - int err; - - err = strict_strtoul(buf, 10, &val); - if (err) - return err; - if (val > 1) - return -EINVAL; - - spin_lock(&perf_resource_lock); - perf_overcommit = val; - spin_unlock(&perf_resource_lock); - - return count; -} - -static SYSDEV_CLASS_ATTR( - reserve_percpu, - 0644, - perf_show_reserve_percpu, - perf_set_reserve_percpu - ); - -static SYSDEV_CLASS_ATTR( - overcommit, - 0644, - perf_show_overcommit, - perf_set_overcommit - ); - -static struct attribute *perfclass_attrs[] = { - &attr_reserve_percpu.attr, - &attr_overcommit.attr, - NULL -}; - -static struct attribute_group perfclass_attr_group = { - .attrs = perfclass_attrs, - .name = "perf_events", -}; - -static int __init perf_event_sysfs_init(void) -{ - return sysfs_create_group(&cpu_sysdev_class.kset.kobj, - &perfclass_attr_group); -} -device_initcall(perf_event_sysfs_init); -- cgit v1.2.3 From c3f00c70276d8ae82578c8b773e2db657f69a478 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 18 Aug 2010 14:37:15 +0200 Subject: perf: Separate find_get_context() from event initialization Separate find_get_context() from the event allocation and initialization so that we may make find_get_context() depend on the event pmu in a later patch. 
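A condensed sketch of the new ordering in the syscall path (error handling simplified; see the hunks below for the real goto-based cleanup):

        event = perf_event_alloc(&attr, cpu, group_leader, NULL, NULL);
        if (IS_ERR(event))
                return PTR_ERR(event);

        ctx = find_get_context(pid, cpu);   /* free to depend on event->pmu in a later patch */
        if (IS_ERR(ctx)) {
                free_event(event);
                return PTR_ERR(ctx);
        }
        /* event->ctx is now assigned at install time rather than at allocation */
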
Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Frederic Weisbecker Cc: Lin Ming Cc: Yanmin LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 73 +++++++++++++++++++++++++---------------------------- 1 file changed, 35 insertions(+), 38 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 8462e69409ae..a3c86a8335c4 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -827,6 +827,8 @@ perf_install_in_context(struct perf_event_context *ctx, { struct task_struct *task = ctx->task; + event->ctx = ctx; + if (!task) { /* * Per cpu events are installed via an smp call and @@ -5038,20 +5040,17 @@ struct pmu *perf_init_event(struct perf_event *event) * Allocate and initialize a event structure */ static struct perf_event * -perf_event_alloc(struct perf_event_attr *attr, - int cpu, - struct perf_event_context *ctx, +perf_event_alloc(struct perf_event_attr *attr, int cpu, struct perf_event *group_leader, struct perf_event *parent_event, - perf_overflow_handler_t overflow_handler, - gfp_t gfpflags) + perf_overflow_handler_t overflow_handler) { struct pmu *pmu; struct perf_event *event; struct hw_perf_event *hwc; long err; - event = kzalloc(sizeof(*event), gfpflags); + event = kzalloc(sizeof(*event), GFP_KERNEL); if (!event) return ERR_PTR(-ENOMEM); @@ -5076,7 +5075,6 @@ perf_event_alloc(struct perf_event_attr *attr, event->attr = *attr; event->group_leader = group_leader; event->pmu = NULL; - event->ctx = ctx; event->oncpu = -1; event->parent = parent_event; @@ -5321,20 +5319,26 @@ SYSCALL_DEFINE5(perf_event_open, if (event_fd < 0) return event_fd; + event = perf_event_alloc(&attr, cpu, group_leader, NULL, NULL); + if (IS_ERR(event)) { + err = PTR_ERR(event); + goto err_fd; + } + /* * Get the target context (task or percpu): */ ctx = find_get_context(pid, cpu); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); - goto err_fd; + goto err_alloc; } if (group_fd != -1) { group_leader = perf_fget_light(group_fd, &fput_needed); if (IS_ERR(group_leader)) { err = PTR_ERR(group_leader); - goto err_put_context; + goto err_context; } group_file = group_leader->filp; if (flags & PERF_FLAG_FD_OUTPUT) @@ -5354,37 +5358,30 @@ SYSCALL_DEFINE5(perf_event_open, * becoming part of another group-sibling): */ if (group_leader->group_leader != group_leader) - goto err_put_context; + goto err_context; /* * Do not allow to attach to a group in a different * task or CPU context: */ if (group_leader->ctx != ctx) - goto err_put_context; + goto err_context; /* * Only a group leader can be exclusive or pinned */ if (attr.exclusive || attr.pinned) - goto err_put_context; - } - - event = perf_event_alloc(&attr, cpu, ctx, group_leader, - NULL, NULL, GFP_KERNEL); - if (IS_ERR(event)) { - err = PTR_ERR(event); - goto err_put_context; + goto err_context; } if (output_event) { err = perf_event_set_output(event, output_event); if (err) - goto err_free_put_context; + goto err_context; } event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); if (IS_ERR(event_file)) { err = PTR_ERR(event_file); - goto err_free_put_context; + goto err_context; } event->filp = event_file; @@ -5410,11 +5407,11 @@ SYSCALL_DEFINE5(perf_event_open, fd_install(event_fd, event_file); return event_fd; -err_free_put_context: - free_event(event); -err_put_context: +err_context: fput_light(group_file, fput_needed); put_ctx(ctx); +err_alloc: + free_event(event); err_fd: put_unused_fd(event_fd); return err; @@ -5432,25 +5429,24 @@ 
perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, pid_t pid, perf_overflow_handler_t overflow_handler) { - struct perf_event *event; struct perf_event_context *ctx; + struct perf_event *event; int err; /* * Get the target context (task or percpu): */ + event = perf_event_alloc(attr, cpu, NULL, NULL, overflow_handler); + if (IS_ERR(event)) { + err = PTR_ERR(event); + goto err; + } + ctx = find_get_context(pid, cpu); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); - goto err_exit; - } - - event = perf_event_alloc(attr, cpu, ctx, NULL, - NULL, overflow_handler, GFP_KERNEL); - if (IS_ERR(event)) { - err = PTR_ERR(event); - goto err_put_context; + goto err_free; } event->filp = NULL; @@ -5468,9 +5464,9 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, return event; - err_put_context: - put_ctx(ctx); - err_exit: +err_free: + free_event(event); +err: return ERR_PTR(err); } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); @@ -5498,9 +5494,9 @@ inherit_event(struct perf_event *parent_event, parent_event = parent_event->parent; child_event = perf_event_alloc(&parent_event->attr, - parent_event->cpu, child_ctx, + parent_event->cpu, group_leader, parent_event, - NULL, GFP_KERNEL); + NULL); if (IS_ERR(child_event)) return child_event; get_ctx(child_ctx); @@ -5525,6 +5521,7 @@ inherit_event(struct perf_event *parent_event, local64_set(&hwc->period_left, sample_period); } + child_event->ctx = child_ctx; child_event->overflow_handler = parent_event->overflow_handler; /* -- cgit v1.2.3 From b28ab83c595e767f2028276b7398d17f2253cec0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Sep 2010 14:48:15 +0200 Subject: perf: Remove the swevent hash-table from the cpu context Separate the swevent hash-table from the cpu_context bits in preparation for per pmu cpu contexts. This keeps the swevent hash a global entity. Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Frederic Weisbecker Cc: Lin Ming Cc: Yanmin LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 104 +++++++++++++++++++++++++++++----------------------- 1 file changed, 58 insertions(+), 46 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a3c86a8335c4..2c47ed6c4f26 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4154,6 +4154,17 @@ int perf_event_overflow(struct perf_event *event, int nmi, * Generic software event infrastructure */ +struct swevent_htable { + struct swevent_hlist *swevent_hlist; + struct mutex hlist_mutex; + int hlist_refcount; + + /* Recursion avoidance in each contexts */ + int recursion[PERF_NR_CONTEXTS]; +}; + +static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); + /* * We directly increment event->count and keep a second value in * event->hw.period_left to count intervals. 
This period event @@ -4286,11 +4297,11 @@ __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) /* For the read side: events when they trigger */ static inline struct hlist_head * -find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) +find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id) { struct swevent_hlist *hlist; - hlist = rcu_dereference(ctx->swevent_hlist); + hlist = rcu_dereference(swhash->swevent_hlist); if (!hlist) return NULL; @@ -4299,7 +4310,7 @@ find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) /* For the event head insertion and removal in the hlist */ static inline struct hlist_head * -find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) +find_swevent_head(struct swevent_htable *swhash, struct perf_event *event) { struct swevent_hlist *hlist; u32 event_id = event->attr.config; @@ -4310,7 +4321,7 @@ find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) * and release. Which makes the protected version suitable here. * The context lock guarantees that. */ - hlist = rcu_dereference_protected(ctx->swevent_hlist, + hlist = rcu_dereference_protected(swhash->swevent_hlist, lockdep_is_held(&event->ctx->lock)); if (!hlist) return NULL; @@ -4323,17 +4334,13 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, struct perf_sample_data *data, struct pt_regs *regs) { - struct perf_cpu_context *cpuctx; + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); struct perf_event *event; struct hlist_node *node; struct hlist_head *head; - cpuctx = &__get_cpu_var(perf_cpu_context); - rcu_read_lock(); - - head = find_swevent_head_rcu(cpuctx, type, event_id); - + head = find_swevent_head_rcu(swhash, type, event_id); if (!head) goto end; @@ -4347,17 +4354,17 @@ end: int perf_swevent_get_recursion_context(void) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); - return get_recursion_context(cpuctx->recursion); + return get_recursion_context(swhash->recursion); } EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context); void inline perf_swevent_put_recursion_context(int rctx) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); - put_recursion_context(cpuctx->recursion, rctx); + put_recursion_context(swhash->recursion, rctx); } void __perf_sw_event(u32 event_id, u64 nr, int nmi, @@ -4385,12 +4392,10 @@ static void perf_swevent_read(struct perf_event *event) static int perf_swevent_add(struct perf_event *event, int flags) { + struct swevent_htable *swhash = &__get_cpu_var(swevent_htable); struct hw_perf_event *hwc = &event->hw; - struct perf_cpu_context *cpuctx; struct hlist_head *head; - cpuctx = &__get_cpu_var(perf_cpu_context); - if (hwc->sample_period) { hwc->last_period = hwc->sample_period; perf_swevent_set_period(event); @@ -4398,7 +4403,7 @@ static int perf_swevent_add(struct perf_event *event, int flags) hwc->state = !(flags & PERF_EF_START); - head = find_swevent_head(cpuctx, event); + head = find_swevent_head(swhash, event); if (WARN_ON_ONCE(!head)) return -EINVAL; @@ -4424,10 +4429,10 @@ static void perf_swevent_stop(struct perf_event *event, int flags) /* Deref the hlist from the update side */ static inline struct swevent_hlist * -swevent_hlist_deref(struct perf_cpu_context *cpuctx) +swevent_hlist_deref(struct swevent_htable *swhash) { - return 
rcu_dereference_protected(cpuctx->swevent_hlist, - lockdep_is_held(&cpuctx->hlist_mutex)); + return rcu_dereference_protected(swhash->swevent_hlist, + lockdep_is_held(&swhash->hlist_mutex)); } static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) @@ -4438,27 +4443,27 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) kfree(hlist); } -static void swevent_hlist_release(struct perf_cpu_context *cpuctx) +static void swevent_hlist_release(struct swevent_htable *swhash) { - struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); + struct swevent_hlist *hlist = swevent_hlist_deref(swhash); if (!hlist) return; - rcu_assign_pointer(cpuctx->swevent_hlist, NULL); + rcu_assign_pointer(swhash->swevent_hlist, NULL); call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); } static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - mutex_lock(&cpuctx->hlist_mutex); + mutex_lock(&swhash->hlist_mutex); - if (!--cpuctx->hlist_refcount) - swevent_hlist_release(cpuctx); + if (!--swhash->hlist_refcount) + swevent_hlist_release(swhash); - mutex_unlock(&cpuctx->hlist_mutex); + mutex_unlock(&swhash->hlist_mutex); } static void swevent_hlist_put(struct perf_event *event) @@ -4476,12 +4481,12 @@ static void swevent_hlist_put(struct perf_event *event) static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); int err = 0; - mutex_lock(&cpuctx->hlist_mutex); + mutex_lock(&swhash->hlist_mutex); - if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { + if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { struct swevent_hlist *hlist; hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); @@ -4489,11 +4494,11 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) err = -ENOMEM; goto exit; } - rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + rcu_assign_pointer(swhash->swevent_hlist, hlist); } - cpuctx->hlist_refcount++; + swhash->hlist_refcount++; exit: - mutex_unlock(&cpuctx->hlist_mutex); + mutex_unlock(&swhash->hlist_mutex); return err; } @@ -5889,12 +5894,15 @@ int perf_event_init_task(struct task_struct *child) static void __init perf_event_init_all_cpus(void) { - int cpu; struct perf_cpu_context *cpuctx; + struct swevent_htable *swhash; + int cpu; for_each_possible_cpu(cpu) { + swhash = &per_cpu(swevent_htable, cpu); + mutex_init(&swhash->hlist_mutex); + cpuctx = &per_cpu(perf_cpu_context, cpu); - mutex_init(&cpuctx->hlist_mutex); __perf_event_init_context(&cpuctx->ctx, NULL); } } @@ -5902,18 +5910,21 @@ static void __init perf_event_init_all_cpus(void) static void __cpuinit perf_event_init_cpu(int cpu) { struct perf_cpu_context *cpuctx; + struct swevent_htable *swhash; cpuctx = &per_cpu(perf_cpu_context, cpu); - mutex_lock(&cpuctx->hlist_mutex); - if (cpuctx->hlist_refcount > 0) { + swhash = &per_cpu(swevent_htable, cpu); + + mutex_lock(&swhash->hlist_mutex); + if (swhash->hlist_refcount > 0) { struct swevent_hlist *hlist; - hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); - WARN_ON_ONCE(!hlist); - rcu_assign_pointer(cpuctx->swevent_hlist, hlist); + hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); + WARN_ON(!hlist); + rcu_assign_pointer(swhash->swevent_hlist, hlist); } - mutex_unlock(&cpuctx->hlist_mutex); + mutex_unlock(&swhash->hlist_mutex); } #ifdef CONFIG_HOTPLUG_CPU 
@@ -5931,11 +5942,12 @@ static void __perf_event_exit_cpu(void *info) static void perf_event_exit_cpu(int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); struct perf_event_context *ctx = &cpuctx->ctx; - mutex_lock(&cpuctx->hlist_mutex); - swevent_hlist_release(cpuctx); - mutex_unlock(&cpuctx->hlist_mutex); + mutex_lock(&swhash->hlist_mutex); + swevent_hlist_release(swhash); + mutex_unlock(&swhash->hlist_mutex); mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); -- cgit v1.2.3 From b5ab4cd563e7ab49b27957704112a8ecade54e1f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Sep 2010 16:32:21 +0200 Subject: perf: Per cpu-context rotation timer Give each cpu-context its own timer so that it is a self contained entity, this eases the way for per-pmu-per-cpu contexts as well as provides the basic infrastructure to allow different rotation times per pmu. Things to look at: - folding the tick and these TICK_NSEC timers - separate task context rotation Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Frederic Weisbecker Cc: Lin Ming Cc: Yanmin LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 80 +++++++++++++++++++++++++++++++++++++++++------------ kernel/sched.c | 2 -- 2 files changed, 63 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 2c47ed6c4f26..d75e4c8727f9 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -78,6 +78,25 @@ void perf_pmu_enable(struct pmu *pmu) pmu->pmu_enable(pmu); } +static void perf_pmu_rotate_start(void) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + + if (hrtimer_active(&cpuctx->timer)) + return; + + __hrtimer_start_range_ns(&cpuctx->timer, + ns_to_ktime(cpuctx->timer_interval), 0, + HRTIMER_MODE_REL_PINNED, 0); +} + +static void perf_pmu_rotate_stop(void) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + + hrtimer_cancel(&cpuctx->timer); +} + static void get_ctx(struct perf_event_context *ctx) { WARN_ON(!atomic_inc_not_zero(&ctx->refcount)); @@ -281,6 +300,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) } list_add_rcu(&event->event_entry, &ctx->event_list); + if (!ctx->nr_events) + perf_pmu_rotate_start(); ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; @@ -1383,6 +1404,12 @@ void perf_event_task_sched_in(struct task_struct *task) ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE); cpuctx->task_ctx = ctx; + + /* + * Since these rotations are per-cpu, we need to ensure the + * cpu-context we got scheduled on is actually rotating. 
+ */ + perf_pmu_rotate_start(); } #define MAX_INTERRUPTS (~0ULL) @@ -1487,7 +1514,7 @@ static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) } } -static void perf_ctx_adjust_freq(struct perf_event_context *ctx) +static void perf_ctx_adjust_freq(struct perf_event_context *ctx, u64 period) { struct perf_event *event; struct hw_perf_event *hwc; @@ -1524,7 +1551,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx) hwc->freq_count_stamp = now; if (delta > 0) - perf_adjust_period(event, TICK_NSEC, delta); + perf_adjust_period(event, period, delta); } raw_spin_unlock(&ctx->lock); } @@ -1542,30 +1569,39 @@ static void rotate_ctx(struct perf_event_context *ctx) raw_spin_unlock(&ctx->lock); } -void perf_event_task_tick(struct task_struct *curr) +/* + * Cannot race with ->pmu_rotate_start() because this is ran from hardirq + * context, and ->pmu_rotate_start() is called with irqs disabled (both are + * cpu affine, so there are no SMP races). + */ +static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer) { + enum hrtimer_restart restart = HRTIMER_NORESTART; struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; int rotate = 0; - if (!atomic_read(&nr_events)) - return; + cpuctx = container_of(timer, struct perf_cpu_context, timer); - cpuctx = &__get_cpu_var(perf_cpu_context); - if (cpuctx->ctx.nr_events && - cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) - rotate = 1; + if (cpuctx->ctx.nr_events) { + restart = HRTIMER_RESTART; + if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) + rotate = 1; + } - ctx = curr->perf_event_ctxp; - if (ctx && ctx->nr_events && ctx->nr_events != ctx->nr_active) - rotate = 1; + ctx = current->perf_event_ctxp; + if (ctx && ctx->nr_events) { + restart = HRTIMER_RESTART; + if (ctx->nr_events != ctx->nr_active) + rotate = 1; + } - perf_ctx_adjust_freq(&cpuctx->ctx); + perf_ctx_adjust_freq(&cpuctx->ctx, cpuctx->timer_interval); if (ctx) - perf_ctx_adjust_freq(ctx); + perf_ctx_adjust_freq(ctx, cpuctx->timer_interval); if (!rotate) - return; + goto done; cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); if (ctx) @@ -1577,7 +1613,12 @@ void perf_event_task_tick(struct task_struct *curr) cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) - task_ctx_sched_in(curr, EVENT_FLEXIBLE); + task_ctx_sched_in(current, EVENT_FLEXIBLE); + +done: + hrtimer_forward_now(timer, ns_to_ktime(cpuctx->timer_interval)); + + return restart; } static int event_enable_on_exec(struct perf_event *event, @@ -4786,7 +4827,7 @@ static void perf_swevent_start_hrtimer(struct perf_event *event) } __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(period), 0, - HRTIMER_MODE_REL, 0); + HRTIMER_MODE_REL_PINNED, 0); } } @@ -5904,6 +5945,9 @@ static void __init perf_event_init_all_cpus(void) cpuctx = &per_cpu(perf_cpu_context, cpu); __perf_event_init_context(&cpuctx->ctx, NULL); + cpuctx->timer_interval = TICK_NSEC; + hrtimer_init(&cpuctx->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cpuctx->timer.function = perf_event_context_tick; } } @@ -5934,6 +5978,8 @@ static void __perf_event_exit_cpu(void *info) struct perf_event_context *ctx = &cpuctx->ctx; struct perf_event *event, *tmp; + perf_pmu_rotate_stop(); + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) __perf_event_remove_from_context(event); list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) diff --git a/kernel/sched.c b/kernel/sched.c index 09b574e7f4df..66a02ba83c01 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3578,8 +3578,6 @@ void 
scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); raw_spin_unlock(&rq->lock); - perf_event_task_tick(curr); - #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); -- cgit v1.2.3 From 108b02cfce04ee90b0a07ee0b104baffd39f5934 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Sep 2010 14:32:03 +0200 Subject: perf: Per-pmu-per-cpu contexts Allocate per-cpu contexts per pmu. Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Frederic Weisbecker Cc: Lin Ming Cc: Yanmin LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 178 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 110 insertions(+), 68 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index d75e4c8727f9..8ca6e690ffe3 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -34,16 +34,15 @@ #include -/* - * Each CPU has a list of per CPU events: - */ -static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); - static atomic_t nr_events __read_mostly; static atomic_t nr_mmap_events __read_mostly; static atomic_t nr_comm_events __read_mostly; static atomic_t nr_task_events __read_mostly; +static LIST_HEAD(pmus); +static DEFINE_MUTEX(pmus_lock); +static struct srcu_struct pmus_srcu; + /* * perf event paranoia level: * -1 - not paranoid at all @@ -78,9 +77,9 @@ void perf_pmu_enable(struct pmu *pmu) pmu->pmu_enable(pmu); } -static void perf_pmu_rotate_start(void) +static void perf_pmu_rotate_start(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); if (hrtimer_active(&cpuctx->timer)) return; @@ -90,9 +89,9 @@ static void perf_pmu_rotate_start(void) HRTIMER_MODE_REL_PINNED, 0); } -static void perf_pmu_rotate_stop(void) +static void perf_pmu_rotate_stop(struct pmu *pmu) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); hrtimer_cancel(&cpuctx->timer); } @@ -301,7 +300,7 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list_add_rcu(&event->event_entry, &ctx->event_list); if (!ctx->nr_events) - perf_pmu_rotate_start(); + perf_pmu_rotate_start(ctx->pmu); ctx->nr_events++; if (event->attr.inherit_stat) ctx->nr_stat++; @@ -466,6 +465,12 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + /* * Cross CPU call to remove a performance event * @@ -474,9 +479,9 @@ group_sched_out(struct perf_event *group_event, */ static void __perf_event_remove_from_context(void *info) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); /* * If this is a task context, we need to check whether it is @@ -556,8 +561,8 @@ retry: static void __perf_event_disable(void *info) { struct perf_event *event = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); /* * If this is a per-task event, need to check whether this @@ -765,10 +770,10 @@ static void add_event_to_ctx(struct perf_event *event, */ static void 
__perf_install_in_context(void *info) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int err; /* @@ -912,9 +917,9 @@ static void __perf_event_mark_enabled(struct perf_event *event, static void __perf_event_enable(void *info) { struct perf_event *event = info; - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); int err; /* @@ -1188,15 +1193,19 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, void perf_event_task_sched_out(struct task_struct *task, struct task_struct *next) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_event_context *next_ctx; struct perf_event_context *parent; + struct perf_cpu_context *cpuctx; int do_switch = 1; perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); - if (likely(!ctx || !cpuctx->task_ctx)) + if (likely(!ctx)) + return; + + cpuctx = __get_cpu_context(ctx); + if (!cpuctx->task_ctx) return; rcu_read_lock(); @@ -1242,7 +1251,7 @@ void perf_event_task_sched_out(struct task_struct *task, static void task_ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); if (!cpuctx->task_ctx) return; @@ -1360,8 +1369,8 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, static void task_ctx_sched_in(struct task_struct *task, enum event_type_t event_type) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); if (likely(!ctx)) return; @@ -1383,12 +1392,13 @@ static void task_ctx_sched_in(struct task_struct *task, */ void perf_event_task_sched_in(struct task_struct *task) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_cpu_context *cpuctx; if (likely(!ctx)) return; + cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) return; @@ -1409,7 +1419,7 @@ void perf_event_task_sched_in(struct task_struct *task) * Since these rotations are per-cpu, we need to ensure the * cpu-context we got scheduled on is actually rotating. 
*/ - perf_pmu_rotate_start(); + perf_pmu_rotate_start(ctx->pmu); } #define MAX_INTERRUPTS (~0ULL) @@ -1687,9 +1697,9 @@ out: */ static void __perf_event_read(void *info) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_event *event = info; struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); /* * If this is a task context, we need to check whether it is @@ -1962,7 +1972,8 @@ __perf_event_init_context(struct perf_event_context *ctx, ctx->task = task; } -static struct perf_event_context *find_get_context(pid_t pid, int cpu) +static struct perf_event_context * +find_get_context(struct pmu *pmu, pid_t pid, int cpu) { struct perf_event_context *ctx; struct perf_cpu_context *cpuctx; @@ -1986,7 +1997,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) if (!cpu_online(cpu)) return ERR_PTR(-ENODEV); - cpuctx = &per_cpu(perf_cpu_context, cpu); + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); ctx = &cpuctx->ctx; get_ctx(ctx); @@ -2030,6 +2041,7 @@ retry: if (!ctx) goto errout; __perf_event_init_context(ctx, task); + ctx->pmu = pmu; get_ctx(ctx); if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { /* @@ -3745,18 +3757,20 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, static void perf_event_task_event(struct perf_task_event *task_event) { - struct perf_cpu_context *cpuctx; struct perf_event_context *ctx = task_event->task_ctx; + struct perf_cpu_context *cpuctx; + struct pmu *pmu; - rcu_read_lock(); - cpuctx = &get_cpu_var(perf_cpu_context); - perf_event_task_ctx(&cpuctx->ctx, task_event); + rcu_read_lock_sched(); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + perf_event_task_ctx(&cpuctx->ctx, task_event); + } if (!ctx) ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_event_task_ctx(ctx, task_event); - put_cpu_var(perf_cpu_context); - rcu_read_unlock(); + rcu_read_unlock_sched(); } static void perf_event_task(struct task_struct *task, @@ -3861,6 +3875,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; unsigned int size; + struct pmu *pmu; char comm[TASK_COMM_LEN]; memset(comm, 0, sizeof(comm)); @@ -3872,14 +3887,15 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - rcu_read_lock(); - cpuctx = &get_cpu_var(perf_cpu_context); - perf_event_comm_ctx(&cpuctx->ctx, comm_event); + rcu_read_lock_sched(); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + perf_event_comm_ctx(&cpuctx->ctx, comm_event); + } ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_event_comm_ctx(ctx, comm_event); - put_cpu_var(perf_cpu_context); - rcu_read_unlock(); + rcu_read_unlock_sched(); } void perf_event_comm(struct task_struct *task) @@ -3989,6 +4005,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) char tmp[16]; char *buf = NULL; const char *name; + struct pmu *pmu; memset(tmp, 0, sizeof(tmp)); @@ -4040,14 +4057,16 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; - rcu_read_lock(); - cpuctx = &get_cpu_var(perf_cpu_context); - perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); + rcu_read_lock_sched(); + list_for_each_entry_rcu(pmu, &pmus, entry) { + cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + 
perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, + vma->vm_flags & VM_EXEC); + } ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); - put_cpu_var(perf_cpu_context); - rcu_read_unlock(); + rcu_read_unlock_sched(); kfree(buf); } @@ -4982,10 +5001,6 @@ static struct pmu perf_task_clock = { .read = task_clock_event_read, }; -static LIST_HEAD(pmus); -static DEFINE_MUTEX(pmus_lock); -static struct srcu_struct pmus_srcu; - static void perf_pmu_nop_void(struct pmu *pmu) { } @@ -5013,7 +5028,7 @@ static void perf_pmu_cancel_txn(struct pmu *pmu) int perf_pmu_register(struct pmu *pmu) { - int ret; + int cpu, ret; mutex_lock(&pmus_lock); ret = -ENOMEM; @@ -5021,6 +5036,21 @@ int perf_pmu_register(struct pmu *pmu) if (!pmu->pmu_disable_count) goto unlock; + pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); + if (!pmu->pmu_cpu_context) + goto free_pdc; + + for_each_possible_cpu(cpu) { + struct perf_cpu_context *cpuctx; + + cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); + __perf_event_init_context(&cpuctx->ctx, NULL); + cpuctx->ctx.pmu = pmu; + cpuctx->timer_interval = TICK_NSEC; + hrtimer_init(&cpuctx->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + cpuctx->timer.function = perf_event_context_tick; + } + if (!pmu->start_txn) { if (pmu->pmu_enable) { /* @@ -5049,6 +5079,10 @@ unlock: mutex_unlock(&pmus_lock); return ret; + +free_pdc: + free_percpu(pmu->pmu_disable_count); + goto unlock; } void perf_pmu_unregister(struct pmu *pmu) @@ -5057,9 +5091,14 @@ void perf_pmu_unregister(struct pmu *pmu) list_del_rcu(&pmu->entry); mutex_unlock(&pmus_lock); + /* + * We use the pmu list either under SRCU or preempt_disable, + * synchronize_srcu() implies synchronize_sched() so we're good. + */ synchronize_srcu(&pmus_srcu); free_percpu(pmu->pmu_disable_count); + free_percpu(pmu->pmu_cpu_context); } struct pmu *perf_init_event(struct perf_event *event) @@ -5374,7 +5413,7 @@ SYSCALL_DEFINE5(perf_event_open, /* * Get the target context (task or percpu): */ - ctx = find_get_context(pid, cpu); + ctx = find_get_context(event->pmu, pid, cpu); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_alloc; @@ -5489,7 +5528,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err; } - ctx = find_get_context(pid, cpu); + ctx = find_get_context(event->pmu, pid, cpu); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_free; @@ -5833,6 +5872,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, return -ENOMEM; __perf_event_init_context(child_ctx, child); + child_ctx->pmu = event->pmu; child->perf_event_ctxp = child_ctx; get_task_struct(child); } @@ -5935,30 +5975,18 @@ int perf_event_init_task(struct task_struct *child) static void __init perf_event_init_all_cpus(void) { - struct perf_cpu_context *cpuctx; struct swevent_htable *swhash; int cpu; for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); - - cpuctx = &per_cpu(perf_cpu_context, cpu); - __perf_event_init_context(&cpuctx->ctx, NULL); - cpuctx->timer_interval = TICK_NSEC; - hrtimer_init(&cpuctx->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cpuctx->timer.function = perf_event_context_tick; } } static void __cpuinit perf_event_init_cpu(int cpu) { - struct perf_cpu_context *cpuctx; - struct swevent_htable *swhash; - - cpuctx = &per_cpu(perf_cpu_context, cpu); - - swhash = &per_cpu(swevent_htable, cpu); + struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); if 
(swhash->hlist_refcount > 0) { @@ -5972,32 +6000,46 @@ static void __cpuinit perf_event_init_cpu(int cpu) } #ifdef CONFIG_HOTPLUG_CPU -static void __perf_event_exit_cpu(void *info) +static void __perf_event_exit_context(void *__info) { - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_event_context *ctx = &cpuctx->ctx; + struct perf_event_context *ctx = __info; struct perf_event *event, *tmp; - perf_pmu_rotate_stop(); + perf_pmu_rotate_stop(ctx->pmu); list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) __perf_event_remove_from_context(event); list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry) __perf_event_remove_from_context(event); } + +static void perf_event_exit_cpu_context(int cpu) +{ + struct perf_event_context *ctx; + struct pmu *pmu; + int idx; + + idx = srcu_read_lock(&pmus_srcu); + list_for_each_entry_rcu(pmu, &pmus, entry) { + ctx = &this_cpu_ptr(pmu->pmu_cpu_context)->ctx; + + mutex_lock(&ctx->mutex); + smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); + mutex_unlock(&ctx->mutex); + } + srcu_read_unlock(&pmus_srcu, idx); + +} + static void perf_event_exit_cpu(int cpu) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - struct perf_event_context *ctx = &cpuctx->ctx; mutex_lock(&swhash->hlist_mutex); swevent_hlist_release(swhash); mutex_unlock(&swhash->hlist_mutex); - mutex_lock(&ctx->mutex); - smp_call_function_single(cpu, __perf_event_exit_cpu, NULL, 1); - mutex_unlock(&ctx->mutex); + perf_event_exit_cpu_context(cpu); } #else static inline void perf_event_exit_cpu(int cpu) { } -- cgit v1.2.3 From 97dee4f3206622f31396dede2b5ddb8670458f56 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 7 Sep 2010 15:35:33 +0200 Subject: perf: Move some code around Move all inherit code near each other. Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Frederic Weisbecker Cc: Lin Ming Cc: Yanmin LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 200 ++++++++++++++++++++++++++-------------------------- 1 file changed, 100 insertions(+), 100 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 8ca6e690ffe3..dae0e2f30293 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5556,106 +5556,6 @@ err: } EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter); -/* - * inherit a event from parent task to child task: - */ -static struct perf_event * -inherit_event(struct perf_event *parent_event, - struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, - struct perf_event *group_leader, - struct perf_event_context *child_ctx) -{ - struct perf_event *child_event; - - /* - * Instead of creating recursive hierarchies of events, - * we link inherited events back to the original parent, - * which has a filp for sure, which we use as the reference - * count: - */ - if (parent_event->parent) - parent_event = parent_event->parent; - - child_event = perf_event_alloc(&parent_event->attr, - parent_event->cpu, - group_leader, parent_event, - NULL); - if (IS_ERR(child_event)) - return child_event; - get_ctx(child_ctx); - - /* - * Make the child state follow the state of the parent event, - * not its attr.disabled bit. We hold the parent's mutex, - * so we won't race with perf_event_{en, dis}able_family. 
- */ - if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) - child_event->state = PERF_EVENT_STATE_INACTIVE; - else - child_event->state = PERF_EVENT_STATE_OFF; - - if (parent_event->attr.freq) { - u64 sample_period = parent_event->hw.sample_period; - struct hw_perf_event *hwc = &child_event->hw; - - hwc->sample_period = sample_period; - hwc->last_period = sample_period; - - local64_set(&hwc->period_left, sample_period); - } - - child_event->ctx = child_ctx; - child_event->overflow_handler = parent_event->overflow_handler; - - /* - * Link it up in the child's context: - */ - add_event_to_ctx(child_event, child_ctx); - - /* - * Get a reference to the parent filp - we will fput it - * when the child event exits. This is safe to do because - * we are in the parent and we know that the filp still - * exists and has a nonzero count: - */ - atomic_long_inc(&parent_event->filp->f_count); - - /* - * Link this into the parent event's child list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_add_tail(&child_event->child_list, &parent_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - return child_event; -} - -static int inherit_group(struct perf_event *parent_event, - struct task_struct *parent, - struct perf_event_context *parent_ctx, - struct task_struct *child, - struct perf_event_context *child_ctx) -{ - struct perf_event *leader; - struct perf_event *sub; - struct perf_event *child_ctr; - - leader = inherit_event(parent_event, parent, parent_ctx, - child, NULL, child_ctx); - if (IS_ERR(leader)) - return PTR_ERR(leader); - list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { - child_ctr = inherit_event(sub, parent, parent_ctx, - child, leader, child_ctx); - if (IS_ERR(child_ctr)) - return PTR_ERR(child_ctr); - } - return 0; -} - static void sync_child_event(struct perf_event *child_event, struct task_struct *child) { @@ -5844,6 +5744,106 @@ again: put_ctx(ctx); } +/* + * inherit a event from parent task to child task: + */ +static struct perf_event * +inherit_event(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event *group_leader, + struct perf_event_context *child_ctx) +{ + struct perf_event *child_event; + + /* + * Instead of creating recursive hierarchies of events, + * we link inherited events back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + if (parent_event->parent) + parent_event = parent_event->parent; + + child_event = perf_event_alloc(&parent_event->attr, + parent_event->cpu, + group_leader, parent_event, + NULL); + if (IS_ERR(child_event)) + return child_event; + get_ctx(child_ctx); + + /* + * Make the child state follow the state of the parent event, + * not its attr.disabled bit. We hold the parent's mutex, + * so we won't race with perf_event_{en, dis}able_family. 
+ */ + if (parent_event->state >= PERF_EVENT_STATE_INACTIVE) + child_event->state = PERF_EVENT_STATE_INACTIVE; + else + child_event->state = PERF_EVENT_STATE_OFF; + + if (parent_event->attr.freq) { + u64 sample_period = parent_event->hw.sample_period; + struct hw_perf_event *hwc = &child_event->hw; + + hwc->sample_period = sample_period; + hwc->last_period = sample_period; + + local64_set(&hwc->period_left, sample_period); + } + + child_event->ctx = child_ctx; + child_event->overflow_handler = parent_event->overflow_handler; + + /* + * Link it up in the child's context: + */ + add_event_to_ctx(child_event, child_ctx); + + /* + * Get a reference to the parent filp - we will fput it + * when the child event exits. This is safe to do because + * we are in the parent and we know that the filp still + * exists and has a nonzero count: + */ + atomic_long_inc(&parent_event->filp->f_count); + + /* + * Link this into the parent event's child list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_add_tail(&child_event->child_list, &parent_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + return child_event; +} + +static int inherit_group(struct perf_event *parent_event, + struct task_struct *parent, + struct perf_event_context *parent_ctx, + struct task_struct *child, + struct perf_event_context *child_ctx) +{ + struct perf_event *leader; + struct perf_event *sub; + struct perf_event *child_ctr; + + leader = inherit_event(parent_event, parent, parent_ctx, + child, NULL, child_ctx); + if (IS_ERR(leader)) + return PTR_ERR(leader); + list_for_each_entry(sub, &parent_event->sibling_list, group_entry) { + child_ctr = inherit_event(sub, parent, parent_ctx, + child, leader, child_ctx); + if (IS_ERR(child_ctr)) + return PTR_ERR(child_ctr); + } + return 0; +} + static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, -- cgit v1.2.3 From eb184479874238393ac186c4e054d24311c34aaa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 7 Sep 2010 15:55:13 +0200 Subject: perf: Clean up perf_event_context allocation Unify the two perf_event_context allocation sites. 
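Both callers of the old open-coded allocation (the find_get_context() syscall path and the inherit path) now go through alloc_perf_context(), which keeps the zero-initialisation, the pmu assignment and the optional get_task_struct() in one place; find_get_context() still publishes the new context into the task with cmpxchg() and frees/retries if another thread installed one first, as the hunks below show. As a rough user-space sketch of that allocate-then-publish idiom (illustrative only: the names and types here are made up, and C11 atomics stand in for the kernel's cmpxchg()):

#include <stdatomic.h>
#include <stdlib.h>

struct ctx {
        int refcount;
};

/* one slot per task; stays NULL until the first user installs a context */
static _Atomic(struct ctx *) task_ctx;

static struct ctx *get_or_alloc_ctx(void)
{
        struct ctx *ctx, *expected;

        for (;;) {
                ctx = atomic_load(&task_ctx);
                if (ctx)
                        return ctx;                     /* already installed */

                ctx = calloc(1, sizeof(*ctx));          /* zeroed, like kzalloc() */
                if (!ctx)
                        return NULL;
                ctx->refcount = 1;

                expected = NULL;
                if (atomic_compare_exchange_strong(&task_ctx, &expected, ctx))
                        return ctx;                     /* we won the race */

                free(ctx);                              /* lost it: retry, use theirs */
        }
}

int main(void)
{
        struct ctx *a = get_or_alloc_ctx();
        struct ctx *b = get_or_alloc_ctx();             /* reuses the installed context */

        free(a);
        return (a && a == b) ? 0 : 1;
}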
Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Frederic Weisbecker Cc: Lin Ming Cc: Yanmin LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index dae0e2f30293..13d98d756347 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1959,9 +1959,7 @@ exit_put: /* * Initialize the perf_event context in a task_struct: */ -static void -__perf_event_init_context(struct perf_event_context *ctx, - struct task_struct *task) +static void __perf_event_init_context(struct perf_event_context *ctx) { raw_spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); @@ -1969,7 +1967,25 @@ __perf_event_init_context(struct perf_event_context *ctx, INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); - ctx->task = task; +} + +static struct perf_event_context * +alloc_perf_context(struct pmu *pmu, struct task_struct *task) +{ + struct perf_event_context *ctx; + + ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); + if (!ctx) + return NULL; + + __perf_event_init_context(ctx); + if (task) { + ctx->task = task; + get_task_struct(task); + } + ctx->pmu = pmu; + + return ctx; } static struct perf_event_context * @@ -2036,22 +2052,22 @@ retry: } if (!ctx) { - ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); + ctx = alloc_perf_context(pmu, task); err = -ENOMEM; if (!ctx) goto errout; - __perf_event_init_context(ctx, task); - ctx->pmu = pmu; + get_ctx(ctx); + if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { /* * We raced with some other task; use * the context they set. */ + put_task_struct(task); kfree(ctx); goto retry; } - get_task_struct(task); } put_task_struct(task); @@ -5044,7 +5060,7 @@ int perf_pmu_register(struct pmu *pmu) struct perf_cpu_context *cpuctx; cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); - __perf_event_init_context(&cpuctx->ctx, NULL); + __perf_event_init_context(&cpuctx->ctx); cpuctx->ctx.pmu = pmu; cpuctx->timer_interval = TICK_NSEC; hrtimer_init(&cpuctx->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -5866,15 +5882,11 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, * child. */ - child_ctx = kzalloc(sizeof(struct perf_event_context), - GFP_KERNEL); + child_ctx = alloc_perf_context(event->pmu, child); if (!child_ctx) return -ENOMEM; - __perf_event_init_context(child_ctx, child); - child_ctx->pmu = event->pmu; child->perf_event_ctxp = child_ctx; - get_task_struct(child); } ret = inherit_group(event, parent, parent_ctx, @@ -5886,7 +5898,6 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, return ret; } - /* * Initialize the perf_event context in task_struct */ -- cgit v1.2.3 From 8dc85d547285668e509f86c177bcd4ea055bcaaf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Sep 2010 16:50:03 +0200 Subject: perf: Multiple task contexts Provide the infrastructure for multiple task contexts. A more flexible approach would have resulted in more pointer chases in the scheduling hot-paths. This approach has the limitation of a static number of task contexts. Since I expect most external PMUs to be system wide, or at least node wide (as per the intel uncore unit) they won't actually need a task context. 
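Concretely, the task carries a small fixed-size array of context pointers indexed by a per-pmu context number, and the scheduling paths simply walk that array (perf_event_ctxp[ctxn] and for_each_task_context_nr() in the diff below). A stripped-down user-space model of that layout, with simplified names and a made-up context count:

#include <stdio.h>

enum { CTX_HW, CTX_SW, NR_TASK_CONTEXTS };       /* fixed, known at build time */

struct ctx {
        int nr_events;
};

struct task {
        struct ctx *ctxp[NR_TASK_CONTEXTS];      /* models perf_event_ctxp[] */
};

#define for_each_task_context_nr(ctxn) \
        for ((ctxn) = 0; (ctxn) < NR_TASK_CONTEXTS; (ctxn)++)

/* the hot path is one short bounded loop, no chasing a dynamic list */
static void sched_out_all(struct task *task)
{
        int ctxn;

        for_each_task_context_nr(ctxn) {
                struct ctx *ctx = task->ctxp[ctxn];

                if (!ctx)
                        continue;                /* most tasks leave slots empty */
                printf("sched out context %d (%d events)\n",
                       ctxn, ctx->nr_events);
        }
}

int main(void)
{
        struct ctx sw = { .nr_events = 2 };
        struct task t = { .ctxp = { [CTX_SW] = &sw } };

        sched_out_all(&t);
        return 0;
}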
Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Frederic Weisbecker Cc: Lin Ming Cc: Yanmin LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 336 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 231 insertions(+), 105 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 13d98d756347..7223ea875861 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -148,13 +148,13 @@ static u64 primary_event_id(struct perf_event *event) * the context could get moved to another task. */ static struct perf_event_context * -perf_lock_task_context(struct task_struct *task, unsigned long *flags) +perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags) { struct perf_event_context *ctx; rcu_read_lock(); retry: - ctx = rcu_dereference(task->perf_event_ctxp); + ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); if (ctx) { /* * If this context is a clone of another, it might @@ -167,7 +167,7 @@ retry: * can't get swapped on us any more. */ raw_spin_lock_irqsave(&ctx->lock, *flags); - if (ctx != rcu_dereference(task->perf_event_ctxp)) { + if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { raw_spin_unlock_irqrestore(&ctx->lock, *flags); goto retry; } @@ -186,12 +186,13 @@ retry: * can't get swapped to another task. This also increments its * reference count so that the context can't get freed. */ -static struct perf_event_context *perf_pin_task_context(struct task_struct *task) +static struct perf_event_context * +perf_pin_task_context(struct task_struct *task, int ctxn) { struct perf_event_context *ctx; unsigned long flags; - ctx = perf_lock_task_context(task, &flags); + ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { ++ctx->pin_count; raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -1179,28 +1180,15 @@ static void perf_event_sync_stat(struct perf_event_context *ctx, } } -/* - * Called from scheduler to remove the events of the current task, - * with interrupts disabled. - * - * We stop each event and update the event value in event->count. - * - * This does not protect us against NMI, but disable() - * sets the disabled bit in the control field of event _before_ - * accessing the event control register. If a NMI hits, then it will - * not restart the event. 
- */ -void perf_event_task_sched_out(struct task_struct *task, - struct task_struct *next) +void perf_event_context_sched_out(struct task_struct *task, int ctxn, + struct task_struct *next) { - struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event_context *ctx = task->perf_event_ctxp[ctxn]; struct perf_event_context *next_ctx; struct perf_event_context *parent; struct perf_cpu_context *cpuctx; int do_switch = 1; - perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); - if (likely(!ctx)) return; @@ -1210,7 +1198,7 @@ void perf_event_task_sched_out(struct task_struct *task, rcu_read_lock(); parent = rcu_dereference(ctx->parent_ctx); - next_ctx = next->perf_event_ctxp; + next_ctx = next->perf_event_ctxp[ctxn]; if (parent && next_ctx && rcu_dereference(next_ctx->parent_ctx) == parent) { /* @@ -1229,8 +1217,8 @@ void perf_event_task_sched_out(struct task_struct *task, * XXX do we need a memory barrier of sorts * wrt to rcu_dereference() of perf_event_ctxp */ - task->perf_event_ctxp = next_ctx; - next->perf_event_ctxp = ctx; + task->perf_event_ctxp[ctxn] = next_ctx; + next->perf_event_ctxp[ctxn] = ctx; ctx->task = next; next_ctx->task = task; do_switch = 0; @@ -1248,6 +1236,31 @@ void perf_event_task_sched_out(struct task_struct *task, } } +#define for_each_task_context_nr(ctxn) \ + for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++) + +/* + * Called from scheduler to remove the events of the current task, + * with interrupts disabled. + * + * We stop each event and update the event value in event->count. + * + * This does not protect us against NMI, but disable() + * sets the disabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * not restart the event. + */ +void perf_event_task_sched_out(struct task_struct *task, + struct task_struct *next) +{ + int ctxn; + + perf_sw_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, NULL, 0); + + for_each_task_context_nr(ctxn) + perf_event_context_sched_out(task, ctxn, next); +} + static void task_ctx_sched_out(struct perf_event_context *ctx, enum event_type_t event_type) { @@ -1366,38 +1379,23 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, event_type); } -static void task_ctx_sched_in(struct task_struct *task, +static void task_ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type) { - struct perf_event_context *ctx = task->perf_event_ctxp; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_cpu_context *cpuctx; - if (likely(!ctx)) - return; + cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) return; + ctx_sched_in(ctx, cpuctx, event_type); cpuctx->task_ctx = ctx; } -/* - * Called from scheduler to add the events of the current task - * with interrupts disabled. - * - * We restore the event value and then enable it. - * - * This does not protect us against NMI, but enable() - * sets the enabled bit in the control field of event _before_ - * accessing the event control register. If a NMI hits, then it will - * keep the event running. 
- */ -void perf_event_task_sched_in(struct task_struct *task) + +void perf_event_context_sched_in(struct perf_event_context *ctx) { - struct perf_event_context *ctx = task->perf_event_ctxp; struct perf_cpu_context *cpuctx; - if (likely(!ctx)) - return; - cpuctx = __get_cpu_context(ctx); if (cpuctx->task_ctx == ctx) return; @@ -1422,6 +1420,31 @@ void perf_event_task_sched_in(struct task_struct *task) perf_pmu_rotate_start(ctx->pmu); } +/* + * Called from scheduler to add the events of the current task + * with interrupts disabled. + * + * We restore the event value and then enable it. + * + * This does not protect us against NMI, but enable() + * sets the enabled bit in the control field of event _before_ + * accessing the event control register. If a NMI hits, then it will + * keep the event running. + */ +void perf_event_task_sched_in(struct task_struct *task) +{ + struct perf_event_context *ctx; + int ctxn; + + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (likely(!ctx)) + continue; + + perf_event_context_sched_in(ctx); + } +} + #define MAX_INTERRUPTS (~0ULL) static void perf_log_throttle(struct perf_event *event, int enable); @@ -1588,7 +1611,7 @@ static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer) { enum hrtimer_restart restart = HRTIMER_NORESTART; struct perf_cpu_context *cpuctx; - struct perf_event_context *ctx; + struct perf_event_context *ctx = NULL; int rotate = 0; cpuctx = container_of(timer, struct perf_cpu_context, timer); @@ -1599,7 +1622,7 @@ static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer) rotate = 1; } - ctx = current->perf_event_ctxp; + ctx = cpuctx->task_ctx; if (ctx && ctx->nr_events) { restart = HRTIMER_RESTART; if (ctx->nr_events != ctx->nr_active) @@ -1623,7 +1646,7 @@ static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer) cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE); if (ctx) - task_ctx_sched_in(current, EVENT_FLEXIBLE); + task_ctx_sched_in(ctx, EVENT_FLEXIBLE); done: hrtimer_forward_now(timer, ns_to_ktime(cpuctx->timer_interval)); @@ -1650,20 +1673,18 @@ static int event_enable_on_exec(struct perf_event *event, * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. 
*/ -static void perf_event_enable_on_exec(struct task_struct *task) +static void perf_event_enable_on_exec(struct perf_event_context *ctx) { - struct perf_event_context *ctx; struct perf_event *event; unsigned long flags; int enabled = 0; int ret; local_irq_save(flags); - ctx = task->perf_event_ctxp; if (!ctx || !ctx->nr_events) goto out; - __perf_event_task_sched_out(ctx); + task_ctx_sched_out(ctx, EVENT_ALL); raw_spin_lock(&ctx->lock); @@ -1687,7 +1708,7 @@ static void perf_event_enable_on_exec(struct task_struct *task) raw_spin_unlock(&ctx->lock); - perf_event_task_sched_in(task); + perf_event_context_sched_in(ctx); out: local_irq_restore(flags); } @@ -1995,7 +2016,7 @@ find_get_context(struct pmu *pmu, pid_t pid, int cpu) struct perf_cpu_context *cpuctx; struct task_struct *task; unsigned long flags; - int err; + int ctxn, err; if (pid == -1 && cpu != -1) { /* Must be root to operate on a CPU event: */ @@ -2044,8 +2065,13 @@ find_get_context(struct pmu *pmu, pid_t pid, int cpu) if (!ptrace_may_access(task, PTRACE_MODE_READ)) goto errout; + err = -EINVAL; + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + goto errout; + retry: - ctx = perf_lock_task_context(task, &flags); + ctx = perf_lock_task_context(task, ctxn, &flags); if (ctx) { unclone_ctx(ctx); raw_spin_unlock_irqrestore(&ctx->lock, flags); @@ -2059,7 +2085,7 @@ retry: get_ctx(ctx); - if (cmpxchg(&task->perf_event_ctxp, NULL, ctx)) { + if (cmpxchg(&task->perf_event_ctxp[ctxn], NULL, ctx)) { /* * We raced with some other task; use * the context they set. @@ -3773,19 +3799,26 @@ static void perf_event_task_ctx(struct perf_event_context *ctx, static void perf_event_task_event(struct perf_task_event *task_event) { - struct perf_event_context *ctx = task_event->task_ctx; struct perf_cpu_context *cpuctx; + struct perf_event_context *ctx; struct pmu *pmu; + int ctxn; rcu_read_lock_sched(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); perf_event_task_ctx(&cpuctx->ctx, task_event); + + ctx = task_event->task_ctx; + if (!ctx) { + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + continue; + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + } + if (ctx) + perf_event_task_ctx(ctx, task_event); } - if (!ctx) - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_event_task_ctx(ctx, task_event); rcu_read_unlock_sched(); } @@ -3890,9 +3923,10 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) { struct perf_cpu_context *cpuctx; struct perf_event_context *ctx; + char comm[TASK_COMM_LEN]; unsigned int size; struct pmu *pmu; - char comm[TASK_COMM_LEN]; + int ctxn; memset(comm, 0, sizeof(comm)); strlcpy(comm, comm_event->task->comm, sizeof(comm)); @@ -3907,19 +3941,31 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); perf_event_comm_ctx(&cpuctx->ctx, comm_event); + + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + continue; + + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_comm_ctx(ctx, comm_event); } - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_event_comm_ctx(ctx, comm_event); rcu_read_unlock_sched(); } void perf_event_comm(struct task_struct *task) { struct perf_comm_event comm_event; + struct perf_event_context *ctx; + int ctxn; - if (task->perf_event_ctxp) - perf_event_enable_on_exec(task); + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (!ctx) + continue; + + 
perf_event_enable_on_exec(ctx); + } if (!atomic_read(&nr_comm_events)) return; @@ -4022,6 +4068,7 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) char *buf = NULL; const char *name; struct pmu *pmu; + int ctxn; memset(tmp, 0, sizeof(tmp)); @@ -4078,10 +4125,17 @@ got_name: cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); + + ctxn = pmu->task_ctx_nr; + if (ctxn < 0) + continue; + + ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) { + perf_event_mmap_ctx(ctx, mmap_event, + vma->vm_flags & VM_EXEC); + } } - ctx = rcu_dereference(current->perf_event_ctxp); - if (ctx) - perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); rcu_read_unlock_sched(); kfree(buf); @@ -5042,6 +5096,43 @@ static void perf_pmu_cancel_txn(struct pmu *pmu) perf_pmu_enable(pmu); } +/* + * Ensures all contexts with the same task_ctx_nr have the same + * pmu_cpu_context too. + */ +static void *find_pmu_context(int ctxn) +{ + struct pmu *pmu; + + if (ctxn < 0) + return NULL; + + list_for_each_entry(pmu, &pmus, entry) { + if (pmu->task_ctx_nr == ctxn) + return pmu->pmu_cpu_context; + } + + return NULL; +} + +static void free_pmu_context(void * __percpu cpu_context) +{ + struct pmu *pmu; + + mutex_lock(&pmus_lock); + /* + * Like a real lame refcount. + */ + list_for_each_entry(pmu, &pmus, entry) { + if (pmu->pmu_cpu_context == cpu_context) + goto out; + } + + free_percpu(cpu_context); +out: + mutex_unlock(&pmus_lock); +} + int perf_pmu_register(struct pmu *pmu) { int cpu, ret; @@ -5052,6 +5143,10 @@ int perf_pmu_register(struct pmu *pmu) if (!pmu->pmu_disable_count) goto unlock; + pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr); + if (pmu->pmu_cpu_context) + goto got_cpu_context; + pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); if (!pmu->pmu_cpu_context) goto free_pdc; @@ -5067,6 +5162,7 @@ int perf_pmu_register(struct pmu *pmu) cpuctx->timer.function = perf_event_context_tick; } +got_cpu_context: if (!pmu->start_txn) { if (pmu->pmu_enable) { /* @@ -5114,7 +5210,7 @@ void perf_pmu_unregister(struct pmu *pmu) synchronize_srcu(&pmus_srcu); free_percpu(pmu->pmu_disable_count); - free_percpu(pmu->pmu_cpu_context); + free_pmu_context(pmu->pmu_cpu_context); } struct pmu *perf_init_event(struct perf_event *event) @@ -5628,16 +5724,13 @@ __perf_event_exit_task(struct perf_event *child_event, } } -/* - * When a child task exits, feed back event values to parent events. - */ -void perf_event_exit_task(struct task_struct *child) +static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { struct perf_event *child_event, *tmp; struct perf_event_context *child_ctx; unsigned long flags; - if (likely(!child->perf_event_ctxp)) { + if (likely(!child->perf_event_ctxp[ctxn])) { perf_event_task(child, NULL, 0); return; } @@ -5649,7 +5742,7 @@ void perf_event_exit_task(struct task_struct *child) * scheduled, so we are now safe from rescheduling changing * our context. */ - child_ctx = child->perf_event_ctxp; + child_ctx = child->perf_event_ctxp[ctxn]; __perf_event_task_sched_out(child_ctx); /* @@ -5658,7 +5751,7 @@ void perf_event_exit_task(struct task_struct *child) * incremented the context's refcount before we do put_ctx below. 
*/ raw_spin_lock(&child_ctx->lock); - child->perf_event_ctxp = NULL; + child->perf_event_ctxp[ctxn] = NULL; /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all @@ -5711,6 +5804,17 @@ again: put_ctx(child_ctx); } +/* + * When a child task exits, feed back event values to parent events. + */ +void perf_event_exit_task(struct task_struct *child) +{ + int ctxn; + + for_each_task_context_nr(ctxn) + perf_event_exit_task_context(child, ctxn); +} + static void perf_free_event(struct perf_event *event, struct perf_event_context *ctx) { @@ -5732,32 +5836,37 @@ static void perf_free_event(struct perf_event *event, /* * free an unexposed, unused context as created by inheritance by - * init_task below, used by fork() in case of fail. + * perf_event_init_task below, used by fork() in case of fail. */ void perf_event_free_task(struct task_struct *task) { - struct perf_event_context *ctx = task->perf_event_ctxp; + struct perf_event_context *ctx; struct perf_event *event, *tmp; + int ctxn; - if (!ctx) - return; + for_each_task_context_nr(ctxn) { + ctx = task->perf_event_ctxp[ctxn]; + if (!ctx) + continue; - mutex_lock(&ctx->mutex); + mutex_lock(&ctx->mutex); again: - list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry) - perf_free_event(event, ctx); + list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, + group_entry) + perf_free_event(event, ctx); - list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, - group_entry) - perf_free_event(event, ctx); + list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, + group_entry) + perf_free_event(event, ctx); - if (!list_empty(&ctx->pinned_groups) || - !list_empty(&ctx->flexible_groups)) - goto again; + if (!list_empty(&ctx->pinned_groups) || + !list_empty(&ctx->flexible_groups)) + goto again; - mutex_unlock(&ctx->mutex); + mutex_unlock(&ctx->mutex); - put_ctx(ctx); + put_ctx(ctx); + } } /* @@ -5863,17 +5972,18 @@ static int inherit_group(struct perf_event *parent_event, static int inherit_task_group(struct perf_event *event, struct task_struct *parent, struct perf_event_context *parent_ctx, - struct task_struct *child, + struct task_struct *child, int ctxn, int *inherited_all) { int ret; - struct perf_event_context *child_ctx = child->perf_event_ctxp; + struct perf_event_context *child_ctx; if (!event->attr.inherit) { *inherited_all = 0; return 0; } + child_ctx = child->perf_event_ctxp[ctxn]; if (!child_ctx) { /* * This is executed from the parent task context, so @@ -5886,7 +5996,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, if (!child_ctx) return -ENOMEM; - child->perf_event_ctxp = child_ctx; + child->perf_event_ctxp[ctxn] = child_ctx; } ret = inherit_group(event, parent, parent_ctx, @@ -5901,7 +6011,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent, /* * Initialize the perf_event context in task_struct */ -int perf_event_init_task(struct task_struct *child) +int perf_event_init_context(struct task_struct *child, int ctxn) { struct perf_event_context *child_ctx, *parent_ctx; struct perf_event_context *cloned_ctx; @@ -5910,19 +6020,19 @@ int perf_event_init_task(struct task_struct *child) int inherited_all = 1; int ret = 0; - child->perf_event_ctxp = NULL; + child->perf_event_ctxp[ctxn] = NULL; mutex_init(&child->perf_event_mutex); INIT_LIST_HEAD(&child->perf_event_list); - if (likely(!parent->perf_event_ctxp)) + if (likely(!parent->perf_event_ctxp[ctxn])) return 0; /* * If the parent's context is a clone, pin 
it so it won't get * swapped under us. */ - parent_ctx = perf_pin_task_context(parent); + parent_ctx = perf_pin_task_context(parent, ctxn); /* * No need to check if parent_ctx != NULL here; since we saw @@ -5942,20 +6052,20 @@ int perf_event_init_task(struct task_struct *child) * the list, not manipulating it: */ list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) { - ret = inherit_task_group(event, parent, parent_ctx, child, - &inherited_all); + ret = inherit_task_group(event, parent, parent_ctx, + child, ctxn, &inherited_all); if (ret) break; } list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) { - ret = inherit_task_group(event, parent, parent_ctx, child, - &inherited_all); + ret = inherit_task_group(event, parent, parent_ctx, + child, ctxn, &inherited_all); if (ret) break; } - child_ctx = child->perf_event_ctxp; + child_ctx = child->perf_event_ctxp[ctxn]; if (child_ctx && inherited_all) { /* @@ -5984,6 +6094,22 @@ int perf_event_init_task(struct task_struct *child) return ret; } +/* + * Initialize the perf_event context in task_struct + */ +int perf_event_init_task(struct task_struct *child) +{ + int ctxn, ret; + + for_each_task_context_nr(ctxn) { + ret = perf_event_init_context(child, ctxn); + if (ret) + return ret; + } + + return 0; +} + static void __init perf_event_init_all_cpus(void) { struct swevent_htable *swhash; -- cgit v1.2.3 From 89a1e18731959e9953fae15ddc1a983eb15a4f19 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 7 Sep 2010 17:34:50 +0200 Subject: perf: Provide a separate task context for swevents Since software events are always schedulable, mixing them up with hardware events (who are not) can lead to funny scheduling oddities. Giving them their own context solves this. Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Frederic Weisbecker Cc: Lin Ming Cc: Yanmin LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/hw_breakpoint.c | 2 ++ kernel/perf_event.c | 40 +++++++++++++++++++++++++++++----------- 2 files changed, 31 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 6f150095cafe..3b2aaffb65f0 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -610,6 +610,8 @@ static void hw_breakpoint_stop(struct perf_event *bp, int flags) } static struct pmu perf_breakpoint = { + .task_ctx_nr = perf_sw_context, /* could eventually get its own */ + .event_init = hw_breakpoint_event_init, .add = hw_breakpoint_add, .del = hw_breakpoint_del, diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 7223ea875861..357ee8d5e8ae 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4709,6 +4709,8 @@ static int perf_swevent_init(struct perf_event *event) } static struct pmu perf_swevent = { + .task_ctx_nr = perf_sw_context, + .event_init = perf_swevent_init, .add = perf_swevent_add, .del = perf_swevent_del, @@ -4800,6 +4802,8 @@ static int perf_tp_event_init(struct perf_event *event) } static struct pmu perf_tracepoint = { + .task_ctx_nr = perf_sw_context, + .event_init = perf_tp_event_init, .add = perf_trace_add, .del = perf_trace_del, @@ -4988,6 +4992,8 @@ static int cpu_clock_event_init(struct perf_event *event) } static struct pmu perf_cpu_clock = { + .task_ctx_nr = perf_sw_context, + .event_init = cpu_clock_event_init, .add = cpu_clock_event_add, .del = cpu_clock_event_del, @@ -5063,6 +5069,8 @@ static int task_clock_event_init(struct perf_event *event) } static struct pmu perf_task_clock = { + .task_ctx_nr = 
perf_sw_context, + .event_init = task_clock_event_init, .add = task_clock_event_add, .del = task_clock_event_del, @@ -5490,6 +5498,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_context *ctx; struct file *event_file = NULL; struct file *group_file = NULL; + struct pmu *pmu; int event_fd; int fput_needed = 0; int err; @@ -5522,20 +5531,11 @@ SYSCALL_DEFINE5(perf_event_open, goto err_fd; } - /* - * Get the target context (task or percpu): - */ - ctx = find_get_context(event->pmu, pid, cpu); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); - goto err_alloc; - } - if (group_fd != -1) { group_leader = perf_fget_light(group_fd, &fput_needed); if (IS_ERR(group_leader)) { err = PTR_ERR(group_leader); - goto err_context; + goto err_alloc; } group_file = group_leader->filp; if (flags & PERF_FLAG_FD_OUTPUT) @@ -5544,6 +5544,23 @@ SYSCALL_DEFINE5(perf_event_open, group_leader = NULL; } + /* + * Special case software events and allow them to be part of + * any hardware group. + */ + pmu = event->pmu; + if ((pmu->task_ctx_nr == perf_sw_context) && group_leader) + pmu = group_leader->pmu; + + /* + * Get the target context (task or percpu): + */ + ctx = find_get_context(pmu, pid, cpu); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_group_fd; + } + /* * Look up the group leader (we will attach this event to it): */ @@ -5605,8 +5622,9 @@ SYSCALL_DEFINE5(perf_event_open, return event_fd; err_context: - fput_light(group_file, fput_needed); put_ctx(ctx); +err_group_fd: + fput_light(group_file, fput_needed); err_alloc: free_event(event); err_fd: -- cgit v1.2.3 From 1b9a644fece117cfa5474a2388d6b89d1baf8ddf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 7 Sep 2010 18:32:22 +0200 Subject: perf: Optimize context ops Assuming we don't mix events of different pmus onto a single context (with the exeption of software events inside a hardware group) we can now assume that all events on a particular context belong to the same pmu, hence we can disable the pmu for the entire context operations. This reduces the amount of hardware writes. The exception for swevents comes from the fact that the sw pmu disable is a nop. Signed-off-by: Peter Zijlstra Cc: paulus Cc: stephane eranian Cc: Robert Richter Cc: Frederic Weisbecker Cc: Lin Ming Cc: Yanmin LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 357ee8d5e8ae..9819a69a61a1 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1065,6 +1065,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct perf_event *event; raw_spin_lock(&ctx->lock); + perf_pmu_disable(ctx->pmu); ctx->is_active = 0; if (likely(!ctx->nr_events)) goto out; @@ -1083,6 +1084,7 @@ static void ctx_sched_out(struct perf_event_context *ctx, group_sched_out(event, cpuctx, ctx); } out: + perf_pmu_enable(ctx->pmu); raw_spin_unlock(&ctx->lock); } @@ -1400,6 +1402,7 @@ void perf_event_context_sched_in(struct perf_event_context *ctx) if (cpuctx->task_ctx == ctx) return; + perf_pmu_disable(ctx->pmu); /* * We want to keep the following priority order: * cpu pinned (that don't need to move), task pinned, @@ -1418,6 +1421,7 @@ void perf_event_context_sched_in(struct perf_event_context *ctx) * cpu-context we got scheduled on is actually rotating. 
*/ perf_pmu_rotate_start(ctx->pmu); + perf_pmu_enable(ctx->pmu); } /* @@ -1629,6 +1633,7 @@ static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer) rotate = 1; } + perf_pmu_disable(cpuctx->ctx.pmu); perf_ctx_adjust_freq(&cpuctx->ctx, cpuctx->timer_interval); if (ctx) perf_ctx_adjust_freq(ctx, cpuctx->timer_interval); @@ -1649,6 +1654,7 @@ static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer) task_ctx_sched_in(ctx, EVENT_FLEXIBLE); done: + perf_pmu_enable(cpuctx->ctx.pmu); hrtimer_forward_now(timer, ns_to_ktime(cpuctx->timer_interval)); return restart; -- cgit v1.2.3 From 4e231c7962ce711c7d8c2a4dc23ecd1e8fc28363 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Sep 2010 21:01:59 +0200 Subject: perf: Fix up delayed_put_task_struct() I missed a perf_event_ctxp user when converting it to an array. Pull this last user into perf_event.c as well and fix it up. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/exit.c | 4 +--- kernel/perf_event.c | 8 ++++++++ 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/exit.c b/kernel/exit.c index 03120229db28..e2bdf37f9fde 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -149,9 +149,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); -#ifdef CONFIG_PERF_EVENTS - WARN_ON_ONCE(tsk->perf_event_ctxp); -#endif + perf_event_delayed_put(tsk); trace_sched_process_free(tsk); put_task_struct(tsk); } diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 9819a69a61a1..eaf1c5de6dcc 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5893,6 +5893,14 @@ again: } } +void perf_event_delayed_put(struct task_struct *task) +{ + int ctxn; + + for_each_task_context_nr(ctxn) + WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); +} + /* * inherit a event from parent task to child task: */ -- cgit v1.2.3 From cee010ec5211b96f33c5c2208f5c14ebb04b634a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Sep 2010 12:51:54 +0200 Subject: perf: Ensure we call add_event_to_ctx() with the right locks held Even though we call it from the inherit path, where the child is not yet accessible, we need to hold ctx->lock, add_event_to_ctx() assumes IRQs are disabled. Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index eaf1c5de6dcc..f395fb4d9b74 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5913,6 +5913,7 @@ inherit_event(struct perf_event *parent_event, struct perf_event_context *child_ctx) { struct perf_event *child_event; + unsigned long flags; /* * Instead of creating recursive hierarchies of events, @@ -5957,7 +5958,9 @@ inherit_event(struct perf_event *parent_event, /* * Link it up in the child's context: */ + raw_spin_lock_irqsave(&child_ctx->lock, flags); add_event_to_ctx(child_event, child_ctx); + raw_spin_unlock_irqrestore(&child_ctx->lock, flags); /* * Get a reference to the parent filp - we will fput it -- cgit v1.2.3 From e5f4d3394a52ac351f1a479fe136d92fa5228eff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 10 Sep 2010 17:38:06 +0200 Subject: perf: Fix perf_init_event() We ought to return -ENOENT when non of the registered PMUs recognise the requested event. This fixes a boot crash that occurs if no PMU is available but the NMI watchdog tries to register an event. 
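The lookup rule being restored is: ask each registered pmu in turn, treat -ENOENT as "not mine, keep looking", abort on any other error, and if the whole list declines then the caller must see -ENOENT rather than whatever the list cursor happens to point at when the loop falls off the end. A self-contained user-space sketch of that rule, with hypothetical handlers standing in for the pmu list:

#include <errno.h>
#include <stdio.h>

struct event {
        int type;
};

typedef int (*init_fn)(struct event *ev);

static int init_a(struct event *ev) { return ev->type == 1 ? 0 : -ENOENT; }
static int init_b(struct event *ev) { return ev->type == 2 ? 0 : -ENOENT; }

static init_fn handlers[] = { init_a, init_b };

static int find_handler(struct event *ev, init_fn *out)
{
        size_t i;

        for (i = 0; i < sizeof(handlers) / sizeof(handlers[0]); i++) {
                int ret = handlers[i](ev);

                if (!ret) {
                        *out = handlers[i];      /* first handler to claim the event wins */
                        return 0;
                }
                if (ret != -ENOENT)
                        return ret;              /* a real error: stop searching */
        }
        return -ENOENT;                          /* nobody claimed it */
}

int main(void)
{
        struct event ev = { .type = 3 };         /* no handler recognises this */
        init_fn fn;

        printf("lookup: %d\n", find_handler(&ev, &fn));
        return 0;
}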
Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index f395fb4d9b74..f29b52576ec1 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5236,12 +5236,15 @@ struct pmu *perf_init_event(struct perf_event *event) list_for_each_entry_rcu(pmu, &pmus, entry) { int ret = pmu->event_init(event); if (!ret) - break; + goto unlock; + if (ret != -ENOENT) { pmu = ERR_PTR(ret); - break; + goto unlock; } } + pmu = ERR_PTR(-ENOENT); +unlock: srcu_read_unlock(&pmus_srcu, idx); return pmu; -- cgit v1.2.3 From cde8e88498c8de69271fcb6d4dd974979368fa67 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 13 Sep 2010 11:06:55 +0200 Subject: perf: Sanitize the RCU logic Simplify things and simply synchronize against two RCU variants for PMU unregister -- we don't care about performance, its module unload if anything. Reported-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Cc: Paul E. McKenney LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index f29b52576ec1..bc46bff69620 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3810,7 +3810,7 @@ static void perf_event_task_event(struct perf_task_event *task_event) struct pmu *pmu; int ctxn; - rcu_read_lock_sched(); + rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); perf_event_task_ctx(&cpuctx->ctx, task_event); @@ -3825,7 +3825,7 @@ static void perf_event_task_event(struct perf_task_event *task_event) if (ctx) perf_event_task_ctx(ctx, task_event); } - rcu_read_unlock_sched(); + rcu_read_unlock(); } static void perf_event_task(struct task_struct *task, @@ -3943,7 +3943,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) comm_event->event_id.header.size = sizeof(comm_event->event_id) + size; - rcu_read_lock_sched(); + rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); perf_event_comm_ctx(&cpuctx->ctx, comm_event); @@ -3956,7 +3956,7 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) if (ctx) perf_event_comm_ctx(ctx, comm_event); } - rcu_read_unlock_sched(); + rcu_read_unlock(); } void perf_event_comm(struct task_struct *task) @@ -4126,7 +4126,7 @@ got_name: mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size; - rcu_read_lock_sched(); + rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, @@ -4142,7 +4142,7 @@ got_name: vma->vm_flags & VM_EXEC); } } - rcu_read_unlock_sched(); + rcu_read_unlock(); kfree(buf); } @@ -5218,10 +5218,11 @@ void perf_pmu_unregister(struct pmu *pmu) mutex_unlock(&pmus_lock); /* - * We use the pmu list either under SRCU or preempt_disable, - * synchronize_srcu() implies synchronize_sched() so we're good. + * We dereference the pmu list under both SRCU and regular RCU, so + * synchronize against both of those. 
*/ synchronize_srcu(&pmus_srcu); + synchronize_rcu(); free_percpu(pmu->pmu_disable_count); free_pmu_context(pmu->pmu_cpu_context); -- cgit v1.2.3 From 0c67b40872326a5340cab51d79a192a5fbaeb484 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 13 Sep 2010 11:15:58 +0200 Subject: perf: Fix free_event() With the context rework stuff we can actually end up freeing an event before it gets attached to a context. Reported-by: Cyrill Gorcunov Signed-off-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index bc46bff69620..440f9ca067b2 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2150,7 +2150,9 @@ static void free_event(struct perf_event *event) if (event->destroy) event->destroy(event); - put_ctx(event->ctx); + if (event->ctx) + put_ctx(event->ctx); + call_rcu(&event->rcu_head, free_event_rcu); } -- cgit v1.2.3 From 2bccfffd1538f3523847583213567e2f7ce00926 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 9 Sep 2010 08:43:22 -0400 Subject: tracing: Do not reset *pos in set_ftrace_filter After the filtered functions are read, the probed functions are read from the hash in set_ftrace_filter. When the hashed probed functions are read, the *pos passed in is reset. Instead of modifying the pos given to the read function, just record the pos where the filtered functions ended and subtract from that. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index fa7ece649fe1..585ea27025b1 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1368,6 +1368,7 @@ enum { #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { + loff_t func_pos; struct ftrace_page *pg; int hidx; int idx; @@ -1418,12 +1419,15 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) loff_t l; if (!(iter->flags & FTRACE_ITER_HASH)) - *pos = 0; + iter->func_pos = *pos; + + if (iter->func_pos > *pos) + return NULL; iter->flags |= FTRACE_ITER_HASH; iter->hidx = 0; - for (l = 0; l <= *pos; ) { + for (l = 0; l <= (*pos - iter->func_pos); ) { p = t_hash_next(m, p, &l); if (!p) break; -- cgit v1.2.3 From 4aeb69672d011fac5c8df671f3ca89f7987c104e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 9 Sep 2010 10:00:28 -0400 Subject: tracing: Replace typecasted void pointer in set_ftrace_filter code The set_ftrace_filter uses seq_file and reads from two lists. The pointer returned by t_next() can either be of type struct dyn_ftrace or struct ftrace_func_probe. If there is a bug (there was one) the wrong pointer may be used and the reference can cause an oops. This patch makes t_next() and friends only return the iterator structure which now has a pointer of type struct dyn_ftrace and struct ftrace_func_probe. The t_show() can now test if the pointer is NULL or not and if the pointer exists, it is guaranteed to be of the correct type. Now if there's a bug, only wrong data will be shown but not an oops. 
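The shape of the change is easy to show outside the kernel: next() always hands back the iterator itself, the iterator carries one typed pointer per record kind, and show() tests those pointers instead of casting a bare void * and hoping it is the right type. A minimal sketch with simplified structures (not the ftrace types themselves):

#include <stdio.h>

struct func_rec {
        const char *name;
};

struct probe_rec {
        const char *name;
        const char *cmd;
};

struct iterator {
        struct func_rec  *func;                  /* set while walking plain functions */
        struct probe_rec *probe;                 /* set while walking probed functions */
};

static void show(struct iterator *iter)
{
        /* at most one of the typed members is expected to be set */
        if (iter->probe)
                printf("%s:%s\n", iter->probe->name, iter->probe->cmd);
        else if (iter->func)
                printf("%s\n", iter->func->name);
        /* neither set: print nothing, but never dereference a bogus cast */
}

int main(void)
{
        struct func_rec  f = { "schedule" };
        struct probe_rec p = { "do_IRQ", "traceon" };
        struct iterator it = { .func = &f };

        show(&it);

        it.func  = NULL;
        it.probe = &p;
        show(&it);
        return 0;
}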
Cc: Chris Wright Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 67 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 585ea27025b1..c8db0dbb984e 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1368,25 +1368,29 @@ enum { #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { - loff_t func_pos; - struct ftrace_page *pg; - int hidx; - int idx; - unsigned flags; - struct trace_parser parser; + loff_t func_pos; + struct ftrace_page *pg; + struct dyn_ftrace *func; + struct ftrace_func_probe *probe; + struct trace_parser parser; + int hidx; + int idx; + unsigned flags; }; static void * -t_hash_next(struct seq_file *m, void *v, loff_t *pos) +t_hash_next(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; - struct hlist_node *hnd = v; + struct hlist_node *hnd = NULL; struct hlist_head *hhd; WARN_ON(!(iter->flags & FTRACE_ITER_HASH)); (*pos)++; + if (iter->probe) + hnd = &iter->probe->node; retry: if (iter->hidx >= FTRACE_FUNC_HASHSIZE) return NULL; @@ -1409,7 +1413,12 @@ t_hash_next(struct seq_file *m, void *v, loff_t *pos) } } - return hnd; + if (WARN_ON_ONCE(!hnd)) + return NULL; + + iter->probe = hlist_entry(hnd, struct ftrace_func_probe, node); + + return iter; } static void *t_hash_start(struct seq_file *m, loff_t *pos) @@ -1428,19 +1437,24 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) iter->hidx = 0; for (l = 0; l <= (*pos - iter->func_pos); ) { - p = t_hash_next(m, p, &l); + p = t_hash_next(m, &l); if (!p) break; } - return p; + if (!p) + return NULL; + + return iter; } -static int t_hash_show(struct seq_file *m, void *v) +static int +t_hash_show(struct seq_file *m, struct ftrace_iterator *iter) { struct ftrace_func_probe *rec; - struct hlist_node *hnd = v; - rec = hlist_entry(hnd, struct ftrace_func_probe, node); + rec = iter->probe; + if (WARN_ON_ONCE(!rec)) + return -EIO; if (rec->ops->print) return rec->ops->print(m, rec->ip, rec->ops, rec->data); @@ -1461,7 +1475,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) struct dyn_ftrace *rec = NULL; if (iter->flags & FTRACE_ITER_HASH) - return t_hash_next(m, v, pos); + return t_hash_next(m, pos); (*pos)++; @@ -1495,7 +1509,12 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } } - return rec; + if (!rec) + return NULL; + + iter->func = rec; + + return iter; } static void *t_start(struct seq_file *m, loff_t *pos) @@ -1530,10 +1549,14 @@ static void *t_start(struct seq_file *m, loff_t *pos) break; } - if (!p && iter->flags & FTRACE_ITER_FILTER) - return t_hash_start(m, pos); + if (!p) { + if (iter->flags & FTRACE_ITER_FILTER) + return t_hash_start(m, pos); - return p; + return NULL; + } + + return iter; } static void t_stop(struct seq_file *m, void *p) @@ -1544,16 +1567,18 @@ static void t_stop(struct seq_file *m, void *p) static int t_show(struct seq_file *m, void *v) { struct ftrace_iterator *iter = m->private; - struct dyn_ftrace *rec = v; + struct dyn_ftrace *rec; if (iter->flags & FTRACE_ITER_HASH) - return t_hash_show(m, v); + return t_hash_show(m, iter); if (iter->flags & FTRACE_ITER_PRINTALL) { seq_printf(m, "#### all functions enabled ####\n"); return 0; } + rec = iter->func; + if (!rec) return 0; -- cgit v1.2.3 From 98c4fd046f07156ca6055677e8f03d4280be16c1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Sep 2010 11:47:43 -0400 Subject: tracing: Keep track of 
set_ftrace_filter position and allow lseek again This patch keeps track of the index within the elements of set_ftrace_filter and if the position goes backwards, it nicely resets and starts from the beginning again. This allows for lseek and pread to work properly now. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index c8db0dbb984e..2d51166b93fe 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1368,6 +1368,7 @@ enum { #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */ struct ftrace_iterator { + loff_t pos; loff_t func_pos; struct ftrace_page *pg; struct dyn_ftrace *func; @@ -1385,9 +1386,8 @@ t_hash_next(struct seq_file *m, loff_t *pos) struct hlist_node *hnd = NULL; struct hlist_head *hhd; - WARN_ON(!(iter->flags & FTRACE_ITER_HASH)); - (*pos)++; + iter->pos = *pos; if (iter->probe) hnd = &iter->probe->node; @@ -1427,14 +1427,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) void *p = NULL; loff_t l; - if (!(iter->flags & FTRACE_ITER_HASH)) - iter->func_pos = *pos; - if (iter->func_pos > *pos) return NULL; - iter->flags |= FTRACE_ITER_HASH; - iter->hidx = 0; for (l = 0; l <= (*pos - iter->func_pos); ) { p = t_hash_next(m, &l); @@ -1444,6 +1439,9 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos) if (!p) return NULL; + /* Only set this if we have an item */ + iter->flags |= FTRACE_ITER_HASH; + return iter; } @@ -1478,6 +1476,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) return t_hash_next(m, pos); (*pos)++; + iter->pos = *pos; + iter->func_pos = *pos; if (iter->flags & FTRACE_ITER_PRINTALL) return NULL; @@ -1517,6 +1517,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) return iter; } +static void reset_iter_read(struct ftrace_iterator *iter) +{ + iter->pos = 0; + iter->func_pos = 0; + iter->flags &= ~(FTRACE_ITER_PRINTALL & FTRACE_ITER_HASH); +} + static void *t_start(struct seq_file *m, loff_t *pos) { struct ftrace_iterator *iter = m->private; @@ -1524,6 +1531,12 @@ static void *t_start(struct seq_file *m, loff_t *pos) loff_t l; mutex_lock(&ftrace_lock); + /* + * If an lseek was done, then reset and start from beginning. + */ + if (*pos < iter->pos) + reset_iter_read(iter); + /* * For set_ftrace_filter reading, if we have the filter * off, we can short cut and just print out that all @@ -1541,6 +1554,11 @@ static void *t_start(struct seq_file *m, loff_t *pos) if (iter->flags & FTRACE_ITER_HASH) return t_hash_start(m, pos); + /* + * Unfortunately, we need to restart at ftrace_pages_start + * every time we let go of the ftrace_mutex. This is because + * those pointers can change without the lock. 
+ */ iter->pg = ftrace_pages_start; iter->idx = 0; for (l = 0; l <= *pos; ) { @@ -2447,7 +2465,7 @@ static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = no_llseek, + .llseek = ftrace_regex_lseek, .release = ftrace_filter_release, }; -- cgit v1.2.3 From 57c072c7113f54f9512624d6c665db6184448782 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Sep 2010 11:21:11 -0400 Subject: tracing: Fix reading of set_ftrace_filter across lists If we do: # cd /sys/kernel/debug # echo 'do_IRQ:traceon schedule:traceon sys_write:traceon' > \ set_ftrace_filter # cat set_ftrace_filter We get the following output: #### all functions enabled #### sys_write:traceon:unlimited schedule:traceon:unlimited do_IRQ:traceon:unlimited This outputs two lists. One is the fact that all functions are currently enabled for function tracing, the other has three probed functions, which happen to have 'traceon' as their commands. Currently, when reading the first list (functions enabled) the seq_file code will receive a "NULL" from the t_next() function causing it to exit early. This makes "read()" from userspace stop reading the code at this boarder. Although read is allowed to do this, some (broken) applications might consider this an end of file and stop early. This patch adds the start of the second list to t_next() when it finishes the first list. It is a simple change and gives the set_ftrace_filter file nicer reading ability. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2d51166b93fe..1884cf5bc110 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1477,10 +1477,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; iter->pos = *pos; - iter->func_pos = *pos; if (iter->flags & FTRACE_ITER_PRINTALL) - return NULL; + return t_hash_start(m, pos); retry: if (iter->idx >= iter->pg->index) { @@ -1510,8 +1509,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) } if (!rec) - return NULL; + return t_hash_start(m, pos); + iter->func_pos = *pos; iter->func = rec; return iter; -- cgit v1.2.3 From 2bd16212b8eb86f9574e78d6605a5ba9e9aa8c4e Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Tue, 7 Sep 2010 16:53:44 +0200 Subject: tracing: Add funcgraph-irq option for function graph tracer. It's handy to be able to disable the irq related output and not to have to jump over each irq related code, when you have no interrest in it. The option is by default enabled, so there's no change to current behaviour. It affects only the final output, so all the irq related data stay in the ring buffer. 
Signed-off-by: Jiri Olsa LKML-Reference: <20100907145344.GC1912@jolsa.brq.redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 101 ++++++++++++++++++++++++++++++++++- 1 file changed, 100 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index c93bcb248638..8674750a5ece 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -18,6 +18,7 @@ struct fgraph_cpu_data { pid_t last_pid; int depth; + int depth_irq; int ignore; unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; }; @@ -41,6 +42,7 @@ struct fgraph_data { #define TRACE_GRAPH_PRINT_PROC 0x8 #define TRACE_GRAPH_PRINT_DURATION 0x10 #define TRACE_GRAPH_PRINT_ABS_TIME 0x20 +#define TRACE_GRAPH_PRINT_IRQS 0x40 static struct tracer_opt trace_opts[] = { /* Display overruns? (for self-debug purpose) */ @@ -55,13 +57,15 @@ static struct tracer_opt trace_opts[] = { { TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) }, /* Display absolute time of an entry */ { TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) }, + /* Display interrupts */ + { TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) }, { } /* Empty entry */ }; static struct tracer_flags tracer_flags = { /* Don't display overruns and proc by default */ .val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD | - TRACE_GRAPH_PRINT_DURATION, + TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS, .opts = trace_opts }; @@ -855,6 +859,92 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s, return 0; } +/* + * Entry check for irq code + * + * returns 1 if + * - we are inside irq code + * - we just extered irq code + * + * retunns 0 if + * - funcgraph-interrupts option is set + * - we are not inside irq code + */ +static int +check_irq_entry(struct trace_iterator *iter, u32 flags, + unsigned long addr, int depth) +{ + int cpu = iter->cpu; + struct fgraph_data *data = iter->private; + int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + if (flags & TRACE_GRAPH_PRINT_IRQS) + return 0; + + /* + * We are inside the irq code + */ + if (*depth_irq >= 0) + return 1; + + if ((addr < (unsigned long)__irqentry_text_start) || + (addr >= (unsigned long)__irqentry_text_end)) + return 0; + + /* + * We are entering irq code. + */ + *depth_irq = depth; + return 1; +} + +/* + * Return check for irq code + * + * returns 1 if + * - we are inside irq code + * - we just left irq code + * + * returns 0 if + * - funcgraph-interrupts option is set + * - we are not inside irq code + */ +static int +check_irq_return(struct trace_iterator *iter, u32 flags, int depth) +{ + int cpu = iter->cpu; + struct fgraph_data *data = iter->private; + int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + + if (flags & TRACE_GRAPH_PRINT_IRQS) + return 0; + + /* + * We are not inside the irq code. + */ + if (*depth_irq == -1) + return 0; + + /* + * We are inside the irq code, and this is returning entry. + * Let's not trace it and clear the entry depth, since + * we are out of irq code. + * + * This condition ensures that we 'leave the irq code' once + * we are out of the entry depth. Thus protecting us from + * the RETURN entry loss. + */ + if (*depth_irq >= depth) { + *depth_irq = -1; + return 1; + } + + /* + * We are inside the irq code, and this is not the entry. 
+ */ + return 1; +} + static enum print_line_t print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, struct trace_iterator *iter, u32 flags) @@ -865,6 +955,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s, static enum print_line_t ret; int cpu = iter->cpu; + if (check_irq_entry(iter, flags, call->func, call->depth)) + return TRACE_TYPE_HANDLED; + if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func, flags)) return TRACE_TYPE_PARTIAL_LINE; @@ -902,6 +995,9 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, int ret; int i; + if (check_irq_return(iter, flags, trace->depth)) + return TRACE_TYPE_HANDLED; + if (data) { struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; @@ -1210,9 +1306,12 @@ void graph_trace_open(struct trace_iterator *iter) pid_t *pid = &(per_cpu_ptr(data->cpu_data, cpu)->last_pid); int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); int *ignore = &(per_cpu_ptr(data->cpu_data, cpu)->ignore); + int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + *pid = -1; *depth = 0; *ignore = 0; + *depth_irq = -1; } iter->private = data; -- cgit v1.2.3 From b304d0441a4118fadd4c3f16e4dc600c271030b5 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Sep 2010 18:58:33 -0400 Subject: tracing: Do not trace in irq when funcgraph-irq option is zero When the function graph tracer funcgraph-irq option is zero, disable tracing in IRQs. This makes the option have two effects. 1) When reading the trace file, do not display the functions that happen in interrupt context (when detected) 2) [*new*] When recording a trace, skip those that are detected to be in interrupt by the 'in_irq()' function Note, in_irq() is updated at irq_enter() and irq_exit(). There are still functions that are recorded by the function graph tracer that is in interrupt context but outside the irq_enter/exit() routines. Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 8674750a5ece..02c708ae0d42 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -15,6 +15,9 @@ #include "trace.h" #include "trace_output.h" +/* When set, irq functions will be ignored */ +static int ftrace_graph_skip_irqs; + struct fgraph_cpu_data { pid_t last_pid; int depth; @@ -208,6 +211,14 @@ int __trace_graph_entry(struct trace_array *tr, return 1; } +static inline int ftrace_graph_ignore_irqs(void) +{ + if (!ftrace_graph_skip_irqs) + return 0; + + return in_irq(); +} + int trace_graph_entry(struct ftrace_graph_ent *trace) { struct trace_array *tr = graph_array; @@ -222,7 +233,8 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) return 0; /* trace it when it is-nested-in or is a function enabled. 
*/ - if (!(trace->depth || ftrace_graph_addr(trace->func))) + if (!(trace->depth || ftrace_graph_addr(trace->func)) || + ftrace_graph_ignore_irqs()) return 0; local_irq_save(flags); @@ -1334,6 +1346,14 @@ void graph_trace_close(struct trace_iterator *iter) } } +static int func_graph_set_flag(u32 old_flags, u32 bit, int set) +{ + if (bit == TRACE_GRAPH_PRINT_IRQS) + ftrace_graph_skip_irqs = !set; + + return 0; +} + static struct trace_event_functions graph_functions = { .trace = print_graph_function_event, }; @@ -1360,6 +1380,7 @@ static struct tracer graph_trace __read_mostly = { .print_line = print_graph_function, .print_header = print_graph_headers, .flags = &tracer_flags, + .set_flag = func_graph_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_function_graph, #endif -- cgit v1.2.3 From 79e406d7b00ab2b261ae32a59f266fd3b7af6f29 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Sep 2010 22:19:46 -0400 Subject: tracing: Remove leftover FTRACE_ENABLE/DISABLE_MCOUNT enums The enums for FTRACE_ENABLE_MCOUNT and FTRACE_DISABLE_MCOUNT were used as commands to ftrace_run_update_code(). But these commands were used by the old nasty ftrace daemon that has long been slain. This is a clean up patch to remove the references to these enums and simplify the code a little. Reported-by: Wu Zhangjin Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 83a16e9ee518..20aff3f1c719 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -884,10 +884,8 @@ enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), FTRACE_UPDATE_TRACE_FUNC = (1 << 2), - FTRACE_ENABLE_MCOUNT = (1 << 3), - FTRACE_DISABLE_MCOUNT = (1 << 4), - FTRACE_START_FUNC_RET = (1 << 5), - FTRACE_STOP_FUNC_RET = (1 << 6), + FTRACE_START_FUNC_RET = (1 << 3), + FTRACE_STOP_FUNC_RET = (1 << 4), }; static int ftrace_filtered; @@ -1226,8 +1224,6 @@ static void ftrace_shutdown(int command) static void ftrace_startup_sysctl(void) { - int command = FTRACE_ENABLE_MCOUNT; - if (unlikely(ftrace_disabled)) return; @@ -1235,23 +1231,17 @@ static void ftrace_startup_sysctl(void) saved_ftrace_func = NULL; /* ftrace_start_up is true if we want ftrace running */ if (ftrace_start_up) - command |= FTRACE_ENABLE_CALLS; - - ftrace_run_update_code(command); + ftrace_run_update_code(FTRACE_ENABLE_CALLS); } static void ftrace_shutdown_sysctl(void) { - int command = FTRACE_DISABLE_MCOUNT; - if (unlikely(ftrace_disabled)) return; /* ftrace_start_up is true if ftrace is running */ if (ftrace_start_up) - command |= FTRACE_DISABLE_CALLS; - - ftrace_run_update_code(command); + ftrace_run_update_code(FTRACE_DISABLE_CALLS); } static cycle_t ftrace_update_time; -- cgit v1.2.3 From d9ca07a05ce1c42ac9717e54eaea4546a3a80978 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Tue, 14 Sep 2010 15:34:01 +0200 Subject: watchdog: Avoid kernel crash when disabling watchdog In case you boot with the watchdog disabled, i.e., nowatchdog, then, if you try to disable it via /proc/sys/kernel/watchdog, you get a kernel crash. The reason is that you are trying to cancel a hrtimer which has never been initialized. This patch fixes this by skipping execution of watchdog_disable_all_cpus() when the watchdog is marked disabled from boot. 
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4c8f7a23.cae9d80a.2c11.0bb4@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/watchdog.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/watchdog.c b/kernel/watchdog.c index fa71aebda4ff..89eadbb9cefe 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -473,6 +473,9 @@ static void watchdog_disable_all_cpus(void) { int cpu; + if (no_watchdog) + return; + for_each_online_cpu(cpu) watchdog_disable(cpu); -- cgit v1.2.3 From d958077d007d98125766d11e82da2fd6497b91d6 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Mon, 13 Sep 2010 13:01:18 -0700 Subject: hw breakpoints: Fix pid namespace bug Hardware breakpoints can't be registered within pid namespaces because tsk->pid is passed rather than the pid in the current namespace. (See https://bugzilla.kernel.org/show_bug.cgi?id=17281 ) This is a quick fix demonstrating the problem but is not the best method of solving the problem since passing pids internally is not the best way to avoid pid namespace bugs. Subsequent patches will show a better solution. Much thanks to Frederic Weisbecker for doing the bulk of the work finding this bug. Signed-off-by: Matt Helsley Signed-off-by: Peter Zijlstra Cc: Robin Green Cc: Prasad Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Will Deacon Cc: Mahesh Salgaonkar LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/hw_breakpoint.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 3b2aaffb65f0..6122f02cfedf 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -433,7 +433,8 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, struct task_struct *tsk) { - return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); + return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk), + triggered); } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); -- cgit v1.2.3 From 2ebd4ffb6d0cb877787b1e42be8485820158857e Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Mon, 13 Sep 2010 13:01:19 -0700 Subject: perf events: Split out task search into helper Split out the code which searches for non-exiting tasks into its own helper. Creating this helper not only makes the code slightly more readable it prepares to move the search out of find_get_context() in a subsequent commit. Signed-off-by: Matt Helsley Signed-off-by: Peter Zijlstra Cc: Robin Green Cc: Prasad Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Will Deacon Cc: Mahesh Salgaonkar LKML-Reference: <561205417b450b8a4bf7488374541d64b4690431.1284407762.git.matthltc@us.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 63 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 440f9ca067b2..3f5309db72f1 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2015,6 +2015,43 @@ alloc_perf_context(struct pmu *pmu, struct task_struct *task) return ctx; } +static struct task_struct * +find_lively_task_by_vpid(pid_t vpid) +{ + struct task_struct *task; + int err; + + rcu_read_lock(); + if (!vpid) + task = current; + else + task = find_task_by_vpid(vpid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + if (!task) + return ERR_PTR(-ESRCH); + + /* + * Can't attach events to a dying task. 
+ */ + err = -ESRCH; + if (task->flags & PF_EXITING) + goto errout; + + /* Reuse ptrace permission checks for now. */ + err = -EACCES; + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto errout; + + return task; +errout: + put_task_struct(task); + return ERR_PTR(err); + +} + static struct perf_event_context * find_get_context(struct pmu *pmu, pid_t pid, int cpu) { @@ -2047,29 +2084,9 @@ find_get_context(struct pmu *pmu, pid_t pid, int cpu) return ctx; } - rcu_read_lock(); - if (!pid) - task = current; - else - task = find_task_by_vpid(pid); - if (task) - get_task_struct(task); - rcu_read_unlock(); - - if (!task) - return ERR_PTR(-ESRCH); - - /* - * Can't attach events to a dying task. - */ - err = -ESRCH; - if (task->flags & PF_EXITING) - goto errout; - - /* Reuse ptrace permission checks for now. */ - err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) - goto errout; + task = find_lively_task_by_vpid(pid); + if (IS_ERR(task)) + return (void*)task; err = -EINVAL; ctxn = pmu->task_ctx_nr; -- cgit v1.2.3 From 38a81da2205f94e8a2a834b51a6b99c91fc7c2e8 Mon Sep 17 00:00:00 2001 From: Matt Helsley Date: Mon, 13 Sep 2010 13:01:20 -0700 Subject: perf events: Clean up pid passing The kernel perf event creation path shouldn't use find_task_by_vpid() because a vpid exists in a specific namespace. find_task_by_vpid() uses current's pid namespace which isn't always the correct namespace to use for the vpid in all the places perf_event_create_kernel_counter() (and thus find_get_context()) is called. The goal is to clean up pid namespace handling and prevent bugs like: https://bugzilla.kernel.org/show_bug.cgi?id=17281 Instead of using pids switch find_get_context() to use task struct pointers directly. The syscall is responsible for resolving the pid to a task struct. This moves the pid namespace resolution into the syscall much like every other syscall that takes pid parameters. 
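As a rough sketch of the resulting in-kernel calling convention (the wrapper name and attribute values below are invented for illustration; only the perf_event_create_kernel_counter() signature comes from this patch):

#include <linux/perf_event.h>
#include <linux/sched.h>

/*
 * Illustrative wrapper only: a kernel-side user that already holds a
 * task_struct reference creates one counter bound to that task.  With
 * this change no pid/vpid is passed into the perf core any more.
 */
static struct perf_event *
cycles_counter_for_task(struct task_struct *tsk,
                        perf_overflow_handler_t handler)
{
        struct perf_event_attr attr = {
                .type   = PERF_TYPE_HARDWARE,
                .config = PERF_COUNT_HW_CPU_CYCLES,
                .size   = sizeof(attr),
        };

        /* cpu == -1: follow the task; tsk == NULL would mean per-cpu */
        return perf_event_create_kernel_counter(&attr, -1, tsk, handler);
}

Passing NULL instead of a task keeps the per-cpu behaviour, as the watchdog and wide-breakpoint callers in the diff below show.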
Signed-off-by: Matt Helsley Signed-off-by: Peter Zijlstra Cc: Robin Green Cc: Prasad Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Will Deacon Cc: Mahesh Salgaonkar LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/hw_breakpoint.c | 5 ++--- kernel/perf_event.c | 21 ++++++++++----------- kernel/watchdog.c | 2 +- 3 files changed, 13 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 6122f02cfedf..3b714e839c10 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -433,8 +433,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, struct task_struct *tsk) { - return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk), - triggered); + return perf_event_create_kernel_counter(attr, -1, tsk, triggered); } EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); @@ -516,7 +515,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr, get_online_cpus(); for_each_online_cpu(cpu) { pevent = per_cpu_ptr(cpu_events, cpu); - bp = perf_event_create_kernel_counter(attr, cpu, -1, triggered); + bp = perf_event_create_kernel_counter(attr, cpu, NULL, triggered); *pevent = bp; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3f5309db72f1..86f394e15d53 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2053,15 +2053,14 @@ errout: } static struct perf_event_context * -find_get_context(struct pmu *pmu, pid_t pid, int cpu) +find_get_context(struct pmu *pmu, struct task_struct *task, int cpu) { struct perf_event_context *ctx; struct perf_cpu_context *cpuctx; - struct task_struct *task; unsigned long flags; int ctxn, err; - if (pid == -1 && cpu != -1) { + if (!task && cpu != -1) { /* Must be root to operate on a CPU event: */ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); @@ -2084,10 +2083,6 @@ find_get_context(struct pmu *pmu, pid_t pid, int cpu) return ctx; } - task = find_lively_task_by_vpid(pid); - if (IS_ERR(task)) - return (void*)task; - err = -EINVAL; ctxn = pmu->task_ctx_nr; if (ctxn < 0) @@ -5527,6 +5522,7 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_context *ctx; struct file *event_file = NULL; struct file *group_file = NULL; + struct task_struct *task = NULL; struct pmu *pmu; int event_fd; int fput_needed = 0; @@ -5581,10 +5577,13 @@ SYSCALL_DEFINE5(perf_event_open, if ((pmu->task_ctx_nr == perf_sw_context) && group_leader) pmu = group_leader->pmu; + if (pid != -1) + task = find_lively_task_by_vpid(pid); + /* * Get the target context (task or percpu): */ - ctx = find_get_context(pmu, pid, cpu); + ctx = find_get_context(pmu, task, cpu); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_group_fd; @@ -5666,11 +5665,11 @@ err_fd: * * @attr: attributes of the counter to create * @cpu: cpu in which the counter is bound - * @pid: task to profile + * @task: task to profile (NULL for percpu) */ struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, - pid_t pid, + struct task_struct *task, perf_overflow_handler_t overflow_handler) { struct perf_event_context *ctx; @@ -5687,7 +5686,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, goto err; } - ctx = find_get_context(event->pmu, pid, cpu); + ctx = find_get_context(event->pmu, task, cpu); if (IS_ERR(ctx)) { err = PTR_ERR(ctx); goto err_free; diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 89eadbb9cefe..dc8e16824b51 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -358,7 +358,7 @@ static int 
watchdog_nmi_enable(int cpu) /* Try to register using hardware perf events */ wd_attr = &wd_hw_attr; wd_attr->sample_period = hw_nmi_get_sample_period(); - event = perf_event_create_kernel_counter(wd_attr, cpu, -1, watchdog_overflow_callback); + event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback); if (!IS_ERR(event)) { printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n"); goto out_save; -- cgit v1.2.3 From edbaadbe42b0b790618ec49d29626223529d8195 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 15 Sep 2010 10:04:26 +0900 Subject: kprobes: Remove redundant address check Remove call to kernel_text_address() in register_jprobes() because it is called right after in register_kprobe(). Signed-off-by: Namhyung Kim Acked-by: Masami Hiramatsu LKML-Reference: <1284512670-2369-2-git-send-email-namhyung@gmail.com> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 282035f3ae96..8f967016cef0 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1343,14 +1343,11 @@ int __kprobes register_jprobes(struct jprobe **jps, int num) jp = jps[i]; addr = arch_deref_entry_point(jp->entry); - if (!kernel_text_address(addr)) - ret = -EINVAL; - else { - /* Todo: Verify probepoint is a function entry point */ - jp->kp.pre_handler = setjmp_pre_handler; - jp->kp.break_handler = longjmp_break_handler; - ret = register_kprobe(&jp->kp); - } + /* Todo: Verify probepoint is a function entry point */ + jp->kp.pre_handler = setjmp_pre_handler; + jp->kp.break_handler = longjmp_break_handler; + ret = register_kprobe(&jp->kp); + if (ret < 0) { if (i > 0) unregister_jprobes(jps, i); -- cgit v1.2.3 From 05662bdb64c746079de7ac4dc4fb4caa5e8e119f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 15 Sep 2010 10:04:27 +0900 Subject: kprobes: Verify jprobe entry point Verify jprobe's entry point is a function entry point using kallsyms' offset value. 
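For context, a registration that this check now polices can be sketched along the lines of the in-tree jprobe sample (the probed symbol and handler body are illustrative only):

#include <linux/kernel.h>
#include <linux/kprobes.h>

/* The handler mirrors do_fork()'s prototype and must end in jprobe_return() */
static long jdo_fork(unsigned long clone_flags, unsigned long stack_start,
                     struct pt_regs *regs, unsigned long stack_size,
                     int __user *parent_tidptr, int __user *child_tidptr)
{
        pr_info("do_fork: clone_flags=0x%lx\n", clone_flags);
        jprobe_return();        /* control never falls through here */
        return 0;
}

static struct jprobe my_jprobe = {
        .entry  = jdo_fork,
        .kp     = {
                .symbol_name    = "do_fork",
        },
};

With this patch, register_jprobe(&my_jprobe) returns -EINVAL when .entry does not resolve to offset 0 of a function, instead of silently installing a probe whose landing point is not a real function entry.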
Signed-off-by: Namhyung Kim Acked-by: Masami Hiramatsu LKML-Reference: <1284512670-2369-3-git-send-email-namhyung@gmail.com> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 8f967016cef0..1b0dbe067077 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1339,14 +1339,18 @@ int __kprobes register_jprobes(struct jprobe **jps, int num) if (num <= 0) return -EINVAL; for (i = 0; i < num; i++) { - unsigned long addr; + unsigned long addr, offset; jp = jps[i]; addr = arch_deref_entry_point(jp->entry); - /* Todo: Verify probepoint is a function entry point */ - jp->kp.pre_handler = setjmp_pre_handler; - jp->kp.break_handler = longjmp_break_handler; - ret = register_kprobe(&jp->kp); + /* Verify probepoint is a function entry point */ + if (kallsyms_lookup_size_offset(addr, NULL, &offset) && + offset == 0) { + jp->kp.pre_handler = setjmp_pre_handler; + jp->kp.break_handler = longjmp_break_handler; + ret = register_kprobe(&jp->kp); + } else + ret = -EINVAL; if (ret < 0) { if (i > 0) -- cgit v1.2.3 From 6376b2297502e72255b7eb2893c6044ad5a7b5f4 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 15 Sep 2010 10:04:28 +0900 Subject: kprobes: Make functions static Make following (internal) functions static to make sparse happier :-) * get_optimized_kprobe: only called from static functions * kretprobe_table_unlock: _lock function is static * kprobes_optinsn_template_holder: never called but holding asm code Signed-off-by: Namhyung Kim Acked-by: Masami Hiramatsu LKML-Reference: <1284512670-2369-4-git-send-email-namhyung@gmail.com> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1b0dbe067077..c53aad5d7e5e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -399,7 +399,7 @@ static inline int kprobe_optready(struct kprobe *p) * Return an optimized kprobe whose optimizing code replaces * instructions including addr (exclude breakpoint). 
*/ -struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) +static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) { int i; struct kprobe *p = NULL; @@ -857,7 +857,8 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, spin_unlock_irqrestore(hlist_lock, *flags); } -void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) +static void __kprobes kretprobe_table_unlock(unsigned long hash, + unsigned long *flags) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_unlock_irqrestore(hlist_lock, *flags); -- cgit v1.2.3 From 635c17c2b2b4e5cd34f5dcba19d751b4e58533c2 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 15 Sep 2010 10:04:30 +0900 Subject: kprobes: Add sparse context annotations This removes following warnings when build with C=1 warning: context imbalance in 'kretprobe_hash_lock' - wrong count at exit warning: context imbalance in 'kretprobe_table_lock' - wrong count at exit warning: context imbalance in 'kretprobe_hash_unlock' - unexpected unlock warning: context imbalance in 'kretprobe_table_unlock' - unexpected unlock Signed-off-by: Namhyung Kim Acked-by: Masami Hiramatsu LKML-Reference: <1284512670-2369-6-git-send-email-namhyung@gmail.com> Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index c53aad5d7e5e..6dd5359e1f0e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -831,6 +831,7 @@ void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, void __kprobes kretprobe_hash_lock(struct task_struct *tsk, struct hlist_head **head, unsigned long *flags) +__acquires(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); spinlock_t *hlist_lock; @@ -842,6 +843,7 @@ void __kprobes kretprobe_hash_lock(struct task_struct *tsk, static void __kprobes kretprobe_table_lock(unsigned long hash, unsigned long *flags) +__acquires(hlist_lock) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_lock_irqsave(hlist_lock, *flags); @@ -849,6 +851,7 @@ static void __kprobes kretprobe_table_lock(unsigned long hash, void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags) +__releases(hlist_lock) { unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS); spinlock_t *hlist_lock; @@ -859,6 +862,7 @@ void __kprobes kretprobe_hash_unlock(struct task_struct *tsk, static void __kprobes kretprobe_table_unlock(unsigned long hash, unsigned long *flags) +__releases(hlist_lock) { spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash); spin_unlock_irqrestore(hlist_lock, *flags); -- cgit v1.2.3 From d14b12d7adbf214f33eb59f800b5c3d5ed9268e8 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 17 Sep 2010 11:28:47 +0200 Subject: perf_events: Fix broken event grouping Events were not grouped anymore. The reason was that in perf_event_open(), the field event->group_leader was initialized before the function looked up the group_fd to find the event leader. This patch fixes this by reordering the code correctly. 
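From the user side, the affected path is the ordinary two-step group creation, sketched below (error handling and most attr fields omitted; the helper names are illustrative):

#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

static int perf_open(struct perf_event_attr *attr, int group_fd)
{
        /* pid == 0: calling task, cpu == -1: any cpu */
        return syscall(__NR_perf_event_open, attr, 0, -1, group_fd, 0);
}

int open_cycles_and_instructions_group(void)
{
        struct perf_event_attr attr;
        int leader, sibling;

        memset(&attr, 0, sizeof(attr));
        attr.size   = sizeof(attr);
        attr.type   = PERF_TYPE_HARDWARE;
        attr.config = PERF_COUNT_HW_CPU_CYCLES;
        leader = perf_open(&attr, -1);          /* group_fd == -1: new leader */

        attr.config = PERF_COUNT_HW_INSTRUCTIONS;
        sibling = perf_open(&attr, leader);     /* attach to the leader's group */

        return (leader < 0 || sibling < 0) ? -1 : leader;
}

The second call passes the first fd as group_fd, which is precisely the lookup that happened after event allocation and therefore too late before this fix.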
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: Robert Richter LKML-Reference: <20100917093009.360420946@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 86f394e15d53..ce95617f5d2c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5550,17 +5550,11 @@ SYSCALL_DEFINE5(perf_event_open, if (event_fd < 0) return event_fd; - event = perf_event_alloc(&attr, cpu, group_leader, NULL, NULL); - if (IS_ERR(event)) { - err = PTR_ERR(event); - goto err_fd; - } - if (group_fd != -1) { group_leader = perf_fget_light(group_fd, &fput_needed); if (IS_ERR(group_leader)) { err = PTR_ERR(group_leader); - goto err_alloc; + goto err_fd; } group_file = group_leader->filp; if (flags & PERF_FLAG_FD_OUTPUT) @@ -5569,6 +5563,12 @@ SYSCALL_DEFINE5(perf_event_open, group_leader = NULL; } + event = perf_event_alloc(&attr, cpu, group_leader, NULL, NULL); + if (IS_ERR(event)) { + err = PTR_ERR(event); + goto err_fd; + } + /* * Special case software events and allow them to be part of * any hardware group. @@ -5653,7 +5653,6 @@ err_context: put_ctx(ctx); err_group_fd: fput_light(group_file, fput_needed); -err_alloc: free_event(event); err_fd: put_unused_fd(event_fd); -- cgit v1.2.3 From b04243ef7006cda301819f54ee7ce0a3632489e3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Sep 2010 11:28:48 +0200 Subject: perf: Complete software pmu grouping Aside from allowing software events into a !software group, allow adding !software events to pure software groups. Once we've moved the software group and attached the first !software event, the group will no longer be a pure software group and hence no longer be eligible for movement, at which point the straight ctx comparison is correct again. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Robert Richter Cc: Paul Mackerras LKML-Reference: <20100917093009.410784731@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 65 ++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index ce95617f5d2c..6d7eef5f3c41 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5184,6 +5184,7 @@ int perf_pmu_register(struct pmu *pmu) cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu); __perf_event_init_context(&cpuctx->ctx); + cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; cpuctx->timer_interval = TICK_NSEC; hrtimer_init(&cpuctx->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -5517,7 +5518,8 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { - struct perf_event *event, *group_leader = NULL, *output_event = NULL; + struct perf_event *group_leader = NULL, *output_event = NULL; + struct perf_event *event, *sibling; struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; @@ -5525,6 +5527,7 @@ SYSCALL_DEFINE5(perf_event_open, struct task_struct *task = NULL; struct pmu *pmu; int event_fd; + int move_group = 0; int fput_needed = 0; int err; @@ -5574,8 +5577,29 @@ SYSCALL_DEFINE5(perf_event_open, * any hardware group. 
*/ pmu = event->pmu; - if ((pmu->task_ctx_nr == perf_sw_context) && group_leader) - pmu = group_leader->pmu; + + if (group_leader && + (is_software_event(event) != is_software_event(group_leader))) { + if (is_software_event(event)) { + /* + * If event and group_leader are not both a software + * event, and event is, then group leader is not. + * + * Allow the addition of software events to !software + * groups, this is safe because software events never + * fail to schedule. + */ + pmu = group_leader->pmu; + } else if (is_software_event(group_leader) && + (group_leader->group_flags & PERF_GROUP_SOFTWARE)) { + /* + * In case the group is a pure software group, and we + * try to add a hardware event, move the whole group to + * the hardware context. + */ + move_group = 1; + } + } if (pid != -1) task = find_lively_task_by_vpid(pid); @@ -5605,8 +5629,14 @@ SYSCALL_DEFINE5(perf_event_open, * Do not allow to attach to a group in a different * task or CPU context: */ - if (group_leader->ctx != ctx) - goto err_context; + if (move_group) { + if (group_leader->ctx->type != ctx->type) + goto err_context; + } else { + if (group_leader->ctx != ctx) + goto err_context; + } + /* * Only a group leader can be exclusive or pinned */ @@ -5626,9 +5656,34 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; } + if (move_group) { + struct perf_event_context *gctx = group_leader->ctx; + + mutex_lock(&gctx->mutex); + perf_event_remove_from_context(group_leader); + list_for_each_entry(sibling, &group_leader->sibling_list, + group_entry) { + perf_event_remove_from_context(sibling); + put_ctx(gctx); + } + mutex_unlock(&gctx->mutex); + put_ctx(gctx); + } + event->filp = event_file; WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); + + if (move_group) { + perf_install_in_context(ctx, group_leader, cpu); + get_ctx(ctx); + list_for_each_entry(sibling, &group_leader->sibling_list, + group_entry) { + perf_install_in_context(ctx, sibling, cpu); + get_ctx(ctx); + } + } + perf_install_in_context(ctx, event, cpu); ++ctx->generation; mutex_unlock(&ctx->mutex); -- cgit v1.2.3 From 917bdd1c9b7b0f4c22f2504c2f0c1074c8ab9df7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Sep 2010 11:28:49 +0200 Subject: perf: Fix perf_event_exit_cpu_context() Use the right cpu-context.. spotted by preempt warning on hot-unplug Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Robert Richter LKML-Reference: <20100917093009.461794357@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 6d7eef5f3c41..27332e5f51a7 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -6269,14 +6269,13 @@ static void perf_event_exit_cpu_context(int cpu) idx = srcu_read_lock(&pmus_srcu); list_for_each_entry_rcu(pmu, &pmus, entry) { - ctx = &this_cpu_ptr(pmu->pmu_cpu_context)->ctx; + ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx; mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1); mutex_unlock(&ctx->mutex); } srcu_read_unlock(&pmus_srcu, idx); - } static void perf_event_exit_cpu(int cpu) -- cgit v1.2.3 From e9d2b064149ff7ef4acbc65a1b9374ac8b218d3e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 17 Sep 2010 11:28:50 +0200 Subject: perf: Undo the per cpu-context timer stuff Revert the timer per cpu-context timers because of unfortunate nohz interaction. 
Fixing that would have been somewhat ugly, so go back to driving things from the regular tick. Provide a jiffies interval feature for people who want slower rotations. Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Cc: Robert Richter Cc: Yinghai Lu LKML-Reference: <20100917093009.519845633@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 79 +++++++++++++++++++++++++++++++++-------------------- kernel/sched.c | 2 ++ 2 files changed, 51 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 27332e5f51a7..baae1367e945 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -77,23 +77,22 @@ void perf_pmu_enable(struct pmu *pmu) pmu->pmu_enable(pmu); } +static DEFINE_PER_CPU(struct list_head, rotation_list); + +/* + * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized + * because they're strictly cpu affine and rotate_start is called with IRQs + * disabled, while rotate_context is called from IRQ context. + */ static void perf_pmu_rotate_start(struct pmu *pmu) { struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + struct list_head *head = &__get_cpu_var(rotation_list); - if (hrtimer_active(&cpuctx->timer)) - return; + WARN_ON(!irqs_disabled()); - __hrtimer_start_range_ns(&cpuctx->timer, - ns_to_ktime(cpuctx->timer_interval), 0, - HRTIMER_MODE_REL_PINNED, 0); -} - -static void perf_pmu_rotate_stop(struct pmu *pmu) -{ - struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); - - hrtimer_cancel(&cpuctx->timer); + if (list_empty(&cpuctx->rotation_list)) + list_add(&cpuctx->rotation_list, head); } static void get_ctx(struct perf_event_context *ctx) @@ -1607,36 +1606,33 @@ static void rotate_ctx(struct perf_event_context *ctx) } /* - * Cannot race with ->pmu_rotate_start() because this is ran from hardirq - * context, and ->pmu_rotate_start() is called with irqs disabled (both are - * cpu affine, so there are no SMP races). + * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized + * because they're strictly cpu affine and rotate_start is called with IRQs + * disabled, while rotate_context is called from IRQ context. 
*/ -static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer) +static void perf_rotate_context(struct perf_cpu_context *cpuctx) { - enum hrtimer_restart restart = HRTIMER_NORESTART; - struct perf_cpu_context *cpuctx; + u64 interval = (u64)cpuctx->jiffies_interval * TICK_NSEC; struct perf_event_context *ctx = NULL; - int rotate = 0; - - cpuctx = container_of(timer, struct perf_cpu_context, timer); + int rotate = 0, remove = 1; if (cpuctx->ctx.nr_events) { - restart = HRTIMER_RESTART; + remove = 0; if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active) rotate = 1; } ctx = cpuctx->task_ctx; if (ctx && ctx->nr_events) { - restart = HRTIMER_RESTART; + remove = 0; if (ctx->nr_events != ctx->nr_active) rotate = 1; } perf_pmu_disable(cpuctx->ctx.pmu); - perf_ctx_adjust_freq(&cpuctx->ctx, cpuctx->timer_interval); + perf_ctx_adjust_freq(&cpuctx->ctx, interval); if (ctx) - perf_ctx_adjust_freq(ctx, cpuctx->timer_interval); + perf_ctx_adjust_freq(ctx, interval); if (!rotate) goto done; @@ -1654,10 +1650,24 @@ static enum hrtimer_restart perf_event_context_tick(struct hrtimer *timer) task_ctx_sched_in(ctx, EVENT_FLEXIBLE); done: + if (remove) + list_del_init(&cpuctx->rotation_list); + perf_pmu_enable(cpuctx->ctx.pmu); - hrtimer_forward_now(timer, ns_to_ktime(cpuctx->timer_interval)); +} + +void perf_event_task_tick(void) +{ + struct list_head *head = &__get_cpu_var(rotation_list); + struct perf_cpu_context *cpuctx, *tmp; - return restart; + WARN_ON(!irqs_disabled()); + + list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) { + if (cpuctx->jiffies_interval == 1 || + !(jiffies % cpuctx->jiffies_interval)) + perf_rotate_context(cpuctx); + } } static int event_enable_on_exec(struct perf_event *event, @@ -5186,9 +5196,8 @@ int perf_pmu_register(struct pmu *pmu) __perf_event_init_context(&cpuctx->ctx); cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; - cpuctx->timer_interval = TICK_NSEC; - hrtimer_init(&cpuctx->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - cpuctx->timer.function = perf_event_context_tick; + cpuctx->jiffies_interval = 1; + INIT_LIST_HEAD(&cpuctx->rotation_list); } got_cpu_context: @@ -6229,6 +6238,7 @@ static void __init perf_event_init_all_cpus(void) for_each_possible_cpu(cpu) { swhash = &per_cpu(swevent_htable, cpu); mutex_init(&swhash->hlist_mutex); + INIT_LIST_HEAD(&per_cpu(rotation_list, cpu)); } } @@ -6248,6 +6258,15 @@ static void __cpuinit perf_event_init_cpu(int cpu) } #ifdef CONFIG_HOTPLUG_CPU +static void perf_pmu_rotate_stop(struct pmu *pmu) +{ + struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + + WARN_ON(!irqs_disabled()); + + list_del_init(&cpuctx->rotation_list); +} + static void __perf_event_exit_context(void *__info) { struct perf_event_context *ctx = __info; diff --git a/kernel/sched.c b/kernel/sched.c index 1c3ea7a55b7b..794819eab9ca 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3584,6 +3584,8 @@ void scheduler_tick(void) curr->sched_class->task_tick(rq, curr, 0); raw_spin_unlock(&rq->lock); + perf_event_task_tick(); + #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); -- cgit v1.2.3 From 41945f6ccf1e86f87fddf6b32db9cf431c05fb54 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Sep 2010 19:17:24 +0200 Subject: perf: Avoid RCU vs preemption assumptions The per-pmu per-cpu context patch converted things from get_cpu_var() to this_cpu_ptr(), but that only works if rcu_read_lock() actually disables preemption, and since there is no such guarantee, we need to fix that. 
Use the newly introduced {get,put}_cpu_ptr(). Signed-off-by: Peter Zijlstra Cc: Tejun Heo LKML-Reference: <20100917093009.308453028@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index baae1367e945..c16158c77dfd 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3836,18 +3836,20 @@ static void perf_event_task_event(struct perf_task_event *task_event) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); perf_event_task_ctx(&cpuctx->ctx, task_event); ctx = task_event->task_ctx; if (!ctx) { ctxn = pmu->task_ctx_nr; if (ctxn < 0) - continue; + goto next; ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); } if (ctx) perf_event_task_ctx(ctx, task_event); +next: + put_cpu_ptr(pmu->pmu_cpu_context); } rcu_read_unlock(); } @@ -3969,16 +3971,18 @@ static void perf_event_comm_event(struct perf_comm_event *comm_event) rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); perf_event_comm_ctx(&cpuctx->ctx, comm_event); ctxn = pmu->task_ctx_nr; if (ctxn < 0) - continue; + goto next; ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (ctx) perf_event_comm_ctx(ctx, comm_event); +next: + put_cpu_ptr(pmu->pmu_cpu_context); } rcu_read_unlock(); } @@ -4152,19 +4156,21 @@ got_name: rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { - cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); + cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); perf_event_mmap_ctx(&cpuctx->ctx, mmap_event, vma->vm_flags & VM_EXEC); ctxn = pmu->task_ctx_nr; if (ctxn < 0) - continue; + goto next; ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); if (ctx) { perf_event_mmap_ctx(ctx, mmap_event, vma->vm_flags & VM_EXEC); } +next: + put_cpu_ptr(pmu->pmu_cpu_context); } rcu_read_unlock(); -- cgit v1.2.3 From bf5438fca2950b03c21ad868090cc1a8fcd49536 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 17 Sep 2010 11:09:00 -0400 Subject: jump label: Base patch for jump label base patch to implement 'jump labeling'. Based on a new 'asm goto' inline assembly gcc mechanism, we can now branch to labels from an 'asm goto' statment. This allows us to create a 'no-op' fastpath, which can subsequently be patched with a jump to the slowpath code. This is useful for code which might be rarely used, but which we'd like to be able to call, if needed. Tracepoints are the current usecase that these are being implemented for. Acked-by: David S. 
Miller Signed-off-by: Jason Baron LKML-Reference: [ cleaned up some formating ] Signed-off-by: Steven Rostedt --- kernel/Makefile | 2 +- kernel/jump_label.c | 346 ++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/kprobes.c | 1 + kernel/module.c | 6 + 4 files changed, 354 insertions(+), 1 deletion(-) create mode 100644 kernel/jump_label.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 0b72d1a74be0..d52b473c99a1 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -10,7 +10,7 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ notifier.o ksysfs.o pm_qos_params.o sched_clock.o cred.o \ - async.o range.o + async.o range.o jump_label.o obj-$(CONFIG_HAVE_EARLY_RES) += early_res.o obj-y += groups.o diff --git a/kernel/jump_label.c b/kernel/jump_label.c new file mode 100644 index 000000000000..460fd40112b3 --- /dev/null +++ b/kernel/jump_label.c @@ -0,0 +1,346 @@ +/* + * jump label support + * + * Copyright (C) 2009 Jason Baron + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_JUMP_LABEL + +#define JUMP_LABEL_HASH_BITS 6 +#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS) +static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE]; + +/* mutex to protect coming/going of the the jump_label table */ +static DEFINE_MUTEX(jump_label_mutex); + +struct jump_label_entry { + struct hlist_node hlist; + struct jump_entry *table; + int nr_entries; + /* hang modules off here */ + struct hlist_head modules; + unsigned long key; +}; + +struct jump_label_module_entry { + struct hlist_node hlist; + struct jump_entry *table; + int nr_entries; + struct module *mod; +}; + +static int jump_label_cmp(const void *a, const void *b) +{ + const struct jump_entry *jea = a; + const struct jump_entry *jeb = b; + + if (jea->key < jeb->key) + return -1; + + if (jea->key > jeb->key) + return 1; + + return 0; +} + +static void +sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop) +{ + unsigned long size; + + size = (((unsigned long)stop - (unsigned long)start) + / sizeof(struct jump_entry)); + sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL); +} + +static struct jump_label_entry *get_jump_label_entry(jump_label_t key) +{ + struct hlist_head *head; + struct hlist_node *node; + struct jump_label_entry *e; + u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0); + + head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; + hlist_for_each_entry(e, node, head, hlist) { + if (key == e->key) + return e; + } + return NULL; +} + +static struct jump_label_entry * +add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table) +{ + struct hlist_head *head; + struct jump_label_entry *e; + u32 hash; + + e = get_jump_label_entry(key); + if (e) + return ERR_PTR(-EEXIST); + + e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL); + if (!e) + return ERR_PTR(-ENOMEM); + + hash = jhash((void *)&key, sizeof(jump_label_t), 0); + head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)]; + e->key = key; + e->table = table; + e->nr_entries = nr_entries; + INIT_HLIST_HEAD(&(e->modules)); + hlist_add_head(&e->hlist, head); + return e; +} + +static int +build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop) +{ + struct jump_entry *iter, *iter_begin; + struct jump_label_entry *entry; + int count; + + sort_jump_label_entries(start, 
stop); + iter = start; + while (iter < stop) { + entry = get_jump_label_entry(iter->key); + if (!entry) { + iter_begin = iter; + count = 0; + while ((iter < stop) && + (iter->key == iter_begin->key)) { + iter++; + count++; + } + entry = add_jump_label_entry(iter_begin->key, + count, iter_begin); + if (IS_ERR(entry)) + return PTR_ERR(entry); + } else { + WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n"); + return -1; + } + } + return 0; +} + +/*** + * jump_label_update - update jump label text + * @key - key value associated with a a jump label + * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE + * + * Will enable/disable the jump for jump label @key, depending on the + * value of @type. + * + */ + +void jump_label_update(unsigned long key, enum jump_label_type type) +{ + struct jump_entry *iter; + struct jump_label_entry *entry; + struct hlist_node *module_node; + struct jump_label_module_entry *e_module; + int count; + + mutex_lock(&jump_label_mutex); + entry = get_jump_label_entry((jump_label_t)key); + if (entry) { + count = entry->nr_entries; + iter = entry->table; + while (count--) { + if (kernel_text_address(iter->code)) + arch_jump_label_transform(iter, type); + iter++; + } + /* eanble/disable jump labels in modules */ + hlist_for_each_entry(e_module, module_node, &(entry->modules), + hlist) { + count = e_module->nr_entries; + iter = e_module->table; + while (count--) { + if (kernel_text_address(iter->code)) + arch_jump_label_transform(iter, type); + iter++; + } + } + } + mutex_unlock(&jump_label_mutex); +} + +static __init int init_jump_label(void) +{ + int ret; + struct jump_entry *iter_start = __start___jump_table; + struct jump_entry *iter_stop = __stop___jump_table; + struct jump_entry *iter; + + mutex_lock(&jump_label_mutex); + ret = build_jump_label_hashtable(__start___jump_table, + __stop___jump_table); + iter = iter_start; + while (iter < iter_stop) { + arch_jump_label_text_poke_early(iter->code); + iter++; + } + mutex_unlock(&jump_label_mutex); + return ret; +} +early_initcall(init_jump_label); + +#ifdef CONFIG_MODULES + +static struct jump_label_module_entry * +add_jump_label_module_entry(struct jump_label_entry *entry, + struct jump_entry *iter_begin, + int count, struct module *mod) +{ + struct jump_label_module_entry *e; + + e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL); + if (!e) + return ERR_PTR(-ENOMEM); + e->mod = mod; + e->nr_entries = count; + e->table = iter_begin; + hlist_add_head(&e->hlist, &entry->modules); + return e; +} + +static int add_jump_label_module(struct module *mod) +{ + struct jump_entry *iter, *iter_begin; + struct jump_label_entry *entry; + struct jump_label_module_entry *module_entry; + int count; + + /* if the module doesn't have jump label entries, just return */ + if (!mod->num_jump_entries) + return 0; + + sort_jump_label_entries(mod->jump_entries, + mod->jump_entries + mod->num_jump_entries); + iter = mod->jump_entries; + while (iter < mod->jump_entries + mod->num_jump_entries) { + entry = get_jump_label_entry(iter->key); + iter_begin = iter; + count = 0; + while ((iter < mod->jump_entries + mod->num_jump_entries) && + (iter->key == iter_begin->key)) { + iter++; + count++; + } + if (!entry) { + entry = add_jump_label_entry(iter_begin->key, 0, NULL); + if (IS_ERR(entry)) + return PTR_ERR(entry); + } + module_entry = add_jump_label_module_entry(entry, iter_begin, + count, mod); + if (IS_ERR(module_entry)) + return PTR_ERR(module_entry); + } + return 0; +} + +static void 
remove_jump_label_module(struct module *mod) +{ + struct hlist_head *head; + struct hlist_node *node, *node_next, *module_node, *module_node_next; + struct jump_label_entry *e; + struct jump_label_module_entry *e_module; + int i; + + /* if the module doesn't have jump label entries, just return */ + if (!mod->num_jump_entries) + return; + + for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { + head = &jump_label_table[i]; + hlist_for_each_entry_safe(e, node, node_next, head, hlist) { + hlist_for_each_entry_safe(e_module, module_node, + module_node_next, + &(e->modules), hlist) { + if (e_module->mod == mod) { + hlist_del(&e_module->hlist); + kfree(e_module); + } + } + if (hlist_empty(&e->modules) && (e->nr_entries == 0)) { + hlist_del(&e->hlist); + kfree(e); + } + } + } +} + +static int +jump_label_module_notify(struct notifier_block *self, unsigned long val, + void *data) +{ + struct module *mod = data; + int ret = 0; + + switch (val) { + case MODULE_STATE_COMING: + mutex_lock(&jump_label_mutex); + ret = add_jump_label_module(mod); + if (ret) + remove_jump_label_module(mod); + mutex_unlock(&jump_label_mutex); + break; + case MODULE_STATE_GOING: + mutex_lock(&jump_label_mutex); + remove_jump_label_module(mod); + mutex_unlock(&jump_label_mutex); + break; + } + return ret; +} + +/*** + * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop() + * @mod: module to patch + * + * Allow for run-time selection of the optimal nops. Before the module + * loads patch these with arch_get_jump_label_nop(), which is specified by + * the arch specific jump label code. + */ +void jump_label_apply_nops(struct module *mod) +{ + struct jump_entry *iter; + + /* if the module doesn't have jump label entries, just return */ + if (!mod->num_jump_entries) + return; + + iter = mod->jump_entries; + while (iter < mod->jump_entries + mod->num_jump_entries) { + arch_jump_label_text_poke_early(iter->code); + iter++; + } +} + +struct notifier_block jump_label_module_nb = { + .notifier_call = jump_label_module_notify, + .priority = 0, +}; + +static __init int init_jump_label_module(void) +{ + return register_module_notifier(&jump_label_module_nb); +} +early_initcall(init_jump_label_module); + +#endif /* CONFIG_MODULES */ + +#endif diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 6dd5359e1f0e..18904e42a918 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include diff --git a/kernel/module.c b/kernel/module.c index d0b5f8db11b4..eba134157ef6 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -55,6 +55,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -2308,6 +2309,11 @@ static void find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->tracepoints), &mod->num_tracepoints); #endif +#ifdef HAVE_JUMP_LABEL + mod->jump_entries = section_objs(info, "__jump_table", + sizeof(*mod->jump_entries), + &mod->num_jump_entries); +#endif #ifdef CONFIG_EVENT_TRACING mod->trace_events = section_objs(info, "_ftrace_events", sizeof(*mod->trace_events), -- cgit v1.2.3 From e0cf0cd49632552f063fb3ae58691946da45fb2e Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 17 Sep 2010 11:09:04 -0400 Subject: jump label: Initialize workqueue tracepoints *before* they are registered Initialize the workqueue data structures *before* they are registered so that they are ready for callbacks. 
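Stripped of the workqueue specifics, the rule being enforced is simply "state first, registration second"; a minimal sketch with a hypothetical tracepoint (register_trace_my_event() and the state are made up for illustration, the concrete version is in the diff below):

#include <linux/spinlock.h>
#include <linux/list.h>
#include <linux/init.h>

static spinlock_t my_stat_lock;
static struct list_head my_stat_list;

static void probe_my_event(void *ignore, int value)
{
        spin_lock(&my_stat_lock);       /* assumes the lock is already initialised */
        /* ... account 'value' on my_stat_list ... */
        spin_unlock(&my_stat_lock);
}

static int __init my_early_init(void)
{
        spin_lock_init(&my_stat_lock);  /* 1. make the state usable          */
        INIT_LIST_HEAD(&my_stat_list);
        /* 2. only then expose it to callbacks */
        return register_trace_my_event(probe_my_event, NULL);
}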
Signed-off-by: Jason Baron LKML-Reference: Signed-off-by: Steven Rostedt --- kernel/trace/trace_workqueue.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index a7cc3793baf6..209b379a4721 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -263,6 +263,11 @@ int __init trace_workqueue_early_init(void) { int ret, cpu; + for_each_possible_cpu(cpu) { + spin_lock_init(&workqueue_cpu_stat(cpu)->lock); + INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); + } + ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); if (ret) goto out; @@ -279,11 +284,6 @@ int __init trace_workqueue_early_init(void) if (ret) goto no_creation; - for_each_possible_cpu(cpu) { - spin_lock_init(&workqueue_cpu_stat(cpu)->lock); - INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list); - } - return 0; no_creation: -- cgit v1.2.3 From 4c3ef6d79328c0e23ade60cbfc8d496123a6855c Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 17 Sep 2010 11:09:08 -0400 Subject: jump label: Add jump_label_text_reserved() to reserve jump points Add a jump_label_text_reserved(void *start, void *end), so that other pieces of code that want to modify kernel text, can first verify that jump label has not reserved the instruction. Acked-by: Masami Hiramatsu Signed-off-by: Jason Baron LKML-Reference: <06236663a3a7b1c1f13576bb9eccb6d9c17b7bfe.1284733808.git.jbaron@redhat.com> Signed-off-by: Steven Rostedt --- kernel/jump_label.c | 83 +++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/kprobes.c | 3 +- 2 files changed, 85 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/jump_label.c b/kernel/jump_label.c index 460fd40112b3..7be868bf25c6 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -177,6 +177,89 @@ void jump_label_update(unsigned long key, enum jump_label_type type) mutex_unlock(&jump_label_mutex); } +static int addr_conflict(struct jump_entry *entry, void *start, void *end) +{ + if (entry->code <= (unsigned long)end && + entry->code + JUMP_LABEL_NOP_SIZE > (unsigned long)start) + return 1; + + return 0; +} + +#ifdef CONFIG_MODULES + +static int module_conflict(void *start, void *end) +{ + struct hlist_head *head; + struct hlist_node *node, *node_next, *module_node, *module_node_next; + struct jump_label_entry *e; + struct jump_label_module_entry *e_module; + struct jump_entry *iter; + int i, count; + int conflict = 0; + + for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) { + head = &jump_label_table[i]; + hlist_for_each_entry_safe(e, node, node_next, head, hlist) { + hlist_for_each_entry_safe(e_module, module_node, + module_node_next, + &(e->modules), hlist) { + count = e_module->nr_entries; + iter = e_module->table; + while (count--) { + if (addr_conflict(iter, start, end)) { + conflict = 1; + goto out; + } + iter++; + } + } + } + } +out: + return conflict; +} + +#endif + +/*** + * jump_label_text_reserved - check if addr range is reserved + * @start: start text addr + * @end: end text addr + * + * checks if the text addr located between @start and @end + * overlaps with any of the jump label patch addresses. Code + * that wants to modify kernel text should first verify that + * it does not overlap with any of the jump label addresses. 
+ * + * returns 1 if there is an overlap, 0 otherwise + */ +int jump_label_text_reserved(void *start, void *end) +{ + struct jump_entry *iter; + struct jump_entry *iter_start = __start___jump_table; + struct jump_entry *iter_stop = __start___jump_table; + int conflict = 0; + + mutex_lock(&jump_label_mutex); + iter = iter_start; + while (iter < iter_stop) { + if (addr_conflict(iter, start, end)) { + conflict = 1; + goto out; + } + iter++; + } + + /* now check modules */ +#ifdef CONFIG_MODULES + conflict = module_conflict(start, end); +#endif +out: + mutex_unlock(&jump_label_mutex); + return conflict; +} + static __init int init_jump_label(void) { int ret; diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 18904e42a918..ec4210c6501e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1147,7 +1147,8 @@ int __kprobes register_kprobe(struct kprobe *p) preempt_disable(); if (!kernel_text_address((unsigned long) p->addr) || in_kprobes_functions((unsigned long) p->addr) || - ftrace_text_reserved(p->addr, p->addr)) { + ftrace_text_reserved(p->addr, p->addr) || + jump_label_text_reserved(p->addr, p->addr)) { preempt_enable(); return -EINVAL; } -- cgit v1.2.3 From 8f7b50c514206211cc282a4247f7b12f18dee674 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Fri, 17 Sep 2010 11:09:13 -0400 Subject: jump label: Tracepoint support for jump labels Make use of the jump label infrastructure for tracepoints. Signed-off-by: Jason Baron LKML-Reference: Signed-off-by: Steven Rostedt --- kernel/tracepoint.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index c77f3eceea25..d6073a50a6ca 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -25,6 +25,7 @@ #include #include #include +#include extern struct tracepoint __start___tracepoints[]; extern struct tracepoint __stop___tracepoints[]; @@ -263,7 +264,13 @@ static void set_tracepoint(struct tracepoint_entry **entry, * is used. */ rcu_assign_pointer(elem->funcs, (*entry)->funcs); - elem->state = active; + if (!elem->state && active) { + enable_jump_label(&elem->state); + elem->state = active; + } else if (elem->state && !active) { + disable_jump_label(&elem->state); + elem->state = active; + } } /* @@ -277,7 +284,10 @@ static void disable_tracepoint(struct tracepoint *elem) if (elem->unregfunc && elem->state) elem->unregfunc(); - elem->state = 0; + if (elem->state) { + disable_jump_label(&elem->state); + elem->state = 0; + } rcu_assign_pointer(elem->funcs, NULL); } -- cgit v1.2.3 From 540804b5c52065a87d826f7714b18a3ec0b269f9 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 4 Oct 2010 12:00:02 +0200 Subject: perf_events: Fix invalid pointer when pid is invalid This patch fixes an error in perf_event_open() when the pid provided by the user is invalid. find_lively_task_by_vpid() does not return NULL on error but an error code. Without the fix the error code was silently passed to find_get_context() which would eventually cause a invalid pointer dereference. 
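For context, the convention the fix relies on (a generic sketch, not code lifted from the patch): find_lively_task_by_vpid() encodes the errno inside the returned pointer, so callers must test the result with IS_ERR() instead of comparing it against NULL:

	#include <linux/err.h>

	struct task_struct *task;

	task = find_lively_task_by_vpid(pid);
	if (IS_ERR(task))		/* e.g. ERR_PTR(-ESRCH); a NULL check misses this */
		return PTR_ERR(task);	/* hand the errno back to the caller */
	/* only past this point is 'task' safe to dereference */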
Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: paulus@samba.org Cc: davem@davemloft.net Cc: fweisbec@gmail.com Cc: perfmon2-devel@lists.sf.net Cc: eranian@gmail.com Cc: robert.richter@amd.com LKML-Reference: <4ca9a5d1.e8e9d80a.3dbb.ffff8f2e@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index c16158c77dfd..64507eaa2d9e 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5616,8 +5616,13 @@ SYSCALL_DEFINE5(perf_event_open, } } - if (pid != -1) + if (pid != -1) { task = find_lively_task_by_vpid(pid); + if (IS_ERR(task)) { + err = PTR_ERR(task); + goto err_group_fd; + } + } /* * Get the target context (task or percpu): -- cgit v1.2.3 From 84c7991059c9c4530cc911137c5bf508a41ed129 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Sun, 3 Oct 2010 21:41:13 +0100 Subject: perf: New helper function for pmu name Introduce perf_pmu_name() helper function that returns the name of the pmu. This gives us a generic way to get the name of a pmu regardless of how an architecture identifies it internally. Signed-off-by: Matt Fleming Acked-by: Peter Zijlstra Acked-by: Paul Mundt Signed-off-by: Robert Richter --- kernel/perf_event.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 403d1804b198..e2534691db0d 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -85,6 +85,11 @@ void __weak hw_perf_enable(void) { barrier(); } void __weak perf_event_print_debug(void) { } +extern __weak const char *perf_pmu_name(void) +{ + return "pmu"; +} + static DEFINE_PER_CPU(int, perf_disable_count); void perf_disable(void) -- cgit v1.2.3 From 14cae9bd2faf6d0d75702c2e107e75207bcdfec1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 29 Sep 2010 10:08:23 +0200 Subject: tracing: Fix function-graph build warning on 32-bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix kernel/trace/trace_functions_graph.c: In function ‘trace_print_graph_duration’: kernel/trace/trace_functions_graph.c:652: warning: comparison of distinct pointer types lacks a cast when building 36-rc6 on a 32-bit due to the strict type check failing in the min() macro. 
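The mismatch only appears on 32-bit because sizeof() there yields a size_t that is typically 'unsigned int', while 8UL is an unsigned long; min() deliberately refuses to compare operands of distinct types. A minimal illustration (a sketch reusing the names from the hunk below):

	size_t        a = sizeof(nsecs_str);	/* 'unsigned int' on common 32-bit builds */
	unsigned long b = 8UL - len;
	size_t        n;

	n = min(a, b);			/* warning: comparison of distinct pointer types lacks a cast */
	n = min_t(size_t, a, b);	/* fine: both operands are converted to size_t first */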
Signed-off-by: Borislav Petkov Cc: Chase Douglas Cc: Steven Rostedt Cc: Ingo Molnar LKML-Reference: <20100929080823.GA13595@liondog.tnic> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_functions_graph.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 02c708ae0d42..ef49e9370b25 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -665,8 +665,9 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) /* Print nsecs (we don't want to exceed 7 numbers) */ if (len < 7) { - snprintf(nsecs_str, min(sizeof(nsecs_str), 8UL - len), "%03lu", - nsecs_rem); + size_t slen = min_t(size_t, sizeof(nsecs_str), 8UL - len); + + snprintf(nsecs_str, slen, "%03lu", nsecs_rem); ret = trace_seq_printf(s, ".%s", nsecs_str); if (!ret) return TRACE_TYPE_PARTIAL_LINE; -- cgit v1.2.3 From fd02e6f7ae085840d43d780149fcf95a614eca5e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Thu, 14 Oct 2010 12:10:24 +0900 Subject: kprobes: Fix selftest to clear flags field for reusing probes Fix selftest to clear flags field for reusing probes because the flags field can be modified by Kprobes. This also set NULL to kprobe.addr instead of 0. Signed-off-by: Masami Hiramatsu Cc: Rusty Russell Cc: Ananth N Mavinakayanahalli Cc: 2nddept-manager@sdl.hitachi.co.jp LKML-Reference: <20101014031024.4100.50107.stgit@ltc236.sdl.hitachi.co.jp> Signed-off-by: Ingo Molnar --- kernel/test_kprobes.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c index 4f104515a19b..f8b11a283171 100644 --- a/kernel/test_kprobes.c +++ b/kernel/test_kprobes.c @@ -115,7 +115,9 @@ static int test_kprobes(void) int ret; struct kprobe *kps[2] = {&kp, &kp2}; - kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + /* addr and flags should be cleard for reusing kprobe. */ + kp.addr = NULL; + kp.flags = 0; ret = register_kprobes(kps, 2); if (ret < 0) { printk(KERN_ERR "Kprobe smoke test failed: " @@ -210,7 +212,9 @@ static int test_jprobes(void) int ret; struct jprobe *jps[2] = {&jp, &jp2}; - jp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + /* addr and flags should be cleard for reusing kprobe. */ + jp.kp.addr = NULL; + jp.kp.flags = 0; ret = register_jprobes(jps, 2); if (ret < 0) { printk(KERN_ERR "Kprobe smoke test failed: " @@ -323,7 +327,9 @@ static int test_kretprobes(void) int ret; struct kretprobe *rps[2] = {&rp, &rp2}; - rp.kp.addr = 0; /* addr should be cleard for reusing kprobe. */ + /* addr and flags should be cleard for reusing kprobe. */ + rp.kp.addr = NULL; + rp.kp.flags = 0; ret = register_kretprobes(rps, 2); if (ret < 0) { printk(KERN_ERR "Kprobe smoke test failed: " -- cgit v1.2.3 From 72441cb1fd77d092f09ddfac748955703884c9a7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 13 Oct 2010 17:12:30 -0400 Subject: ftrace/x86: Add support for C version of recordmcount This patch adds the support for the C version of recordmcount and compile times show ~ 12% improvement. After verifying this works, other archs can add: HAVE_C_MCOUNT_RECORD in its Kconfig and it will use the C version of recordmcount instead of the perl version. 
Cc: Cc: Michal Marek Cc: linux-kbuild@vger.kernel.org Cc: John Reiser Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 538501c6ea50..df00fbbaf609 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -49,6 +49,11 @@ config HAVE_SYSCALL_TRACEPOINTS help See Documentation/trace/ftrace-design.txt +config HAVE_C_MCOUNT_RECORD + bool + help + C version of recordmcount available? + config TRACER_MAX_TRACE bool -- cgit v1.2.3 From cf4db2597ae93b60efc0a7a4ec08690b75d629b1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 Oct 2010 23:32:44 -0400 Subject: ftrace: Rename config option HAVE_C_MCOUNT_RECORD to HAVE_C_RECORDMCOUNT The config option used by archs to let the build system know that the C version of the recordmcount works for said arch is currently called HAVE_C_MCOUNT_RECORD which enables BUILD_C_RECORDMCOUNT. To be more consistent with the name that all archs may use, it has been renamed to HAVE_C_RECORDMCOUNT. This will be less confusing since we are building a C recordmcount and not a mcount_record. Suggested-by: Ingo Molnar Cc: Cc: Michal Marek Cc: linux-kbuild@vger.kernel.org Cc: John Reiser Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index df00fbbaf609..e550d2eda1df 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -49,7 +49,7 @@ config HAVE_SYSCALL_TRACEPOINTS help See Documentation/trace/ftrace-design.txt -config HAVE_C_MCOUNT_RECORD +config HAVE_C_RECORDMCOUNT bool help C version of recordmcount available? -- cgit v1.2.3 From 907f27840985fe6a0c62e43cd4702c6e04b4bcc7 Mon Sep 17 00:00:00 2001 From: matt mooney Date: Mon, 27 Sep 2010 19:04:53 -0700 Subject: tracing/trivial: Remove cast from void* Unnecessary cast from void* in assignment. Signed-off-by: matt mooney Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 4 ++-- kernel/trace/trace.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 65fb077ea79c..ebd80d50c474 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1638,8 +1638,8 @@ ftrace_failures_open(struct inode *inode, struct file *file) ret = ftrace_avail_open(inode, file); if (!ret) { - m = (struct seq_file *)file->private_data; - iter = (struct ftrace_iterator *)m->private; + m = file->private_data; + iter = m->private; iter->flags = FTRACE_ITER_FAILURES; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9ec59f541156..001bcd2ccf4a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2196,7 +2196,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp) static int tracing_release(struct inode *inode, struct file *file) { - struct seq_file *m = (struct seq_file *)file->private_data; + struct seq_file *m = file->private_data; struct trace_iterator *iter; int cpu; -- cgit v1.2.3 From a9d61173dc1cb63e660ae89e874e51ba4fd2f991 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Fri, 24 Sep 2010 17:41:02 +0200 Subject: tracing: Add proper check for irq_depth routines The check_irq_entry and check_irq_return could be called from graph event context. In such case there's no graph private data allocated. Adding checks to handle this case. 
Signed-off-by: Jiri Olsa LKML-Reference: <20100924154102.GB1818@jolsa.brq.redhat.com> [ Fixed some grammar in the comments ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index ef49e9370b25..4c58ccc6427c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -888,12 +888,20 @@ check_irq_entry(struct trace_iterator *iter, u32 flags, unsigned long addr, int depth) { int cpu = iter->cpu; + int *depth_irq; struct fgraph_data *data = iter->private; - int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); - if (flags & TRACE_GRAPH_PRINT_IRQS) + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) return 0; + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + /* * We are inside the irq code */ @@ -926,12 +934,20 @@ static int check_irq_return(struct trace_iterator *iter, u32 flags, int depth) { int cpu = iter->cpu; + int *depth_irq; struct fgraph_data *data = iter->private; - int *depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); - if (flags & TRACE_GRAPH_PRINT_IRQS) + /* + * If we are either displaying irqs, or we got called as + * a graph event and private data does not exist, + * then we bypass the irq check. + */ + if ((flags & TRACE_GRAPH_PRINT_IRQS) || + (!data)) return 0; + depth_irq = &(per_cpu_ptr(data->cpu_data, cpu)->depth_irq); + /* * We are not inside the irq code. */ -- cgit v1.2.3 From 0a772620a2e21fb55a02f70fe38d4b5c3a5fbbbf Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 23 Sep 2010 14:00:52 +0200 Subject: tracing: Make graph related irqs/preemptsoff functions global Move trace_graph_function() and print_graph_headers_flags() functions to the trace_function_graph.c to be globaly available. 
Signed-off-by: Jiri Olsa LKML-Reference: <1285243253-7372-3-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.h | 4 +++ kernel/trace/trace_functions_graph.c | 63 ++++++++++++++++++++++++++++++++++-- kernel/trace/trace_irqsoff.c | 56 ++++---------------------------- 3 files changed, 71 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d39b3c5454a5..9021f8c0c0c3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -343,6 +343,10 @@ void trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc); +void trace_graph_function(struct trace_array *tr, + unsigned long ip, + unsigned long parent_ip, + unsigned long flags, int pc); void trace_default_header(struct seq_file *m); void print_trace_header(struct seq_file *m, struct trace_iterator *iter); int trace_empty(struct trace_iterator *iter); diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 4c58ccc6427c..6f8fe28acba1 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -262,6 +262,35 @@ int trace_graph_thresh_entry(struct ftrace_graph_ent *trace) return trace_graph_entry(trace); } +static void +__trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long flags, int pc) +{ + u64 time = trace_clock_local(); + struct ftrace_graph_ent ent = { + .func = ip, + .depth = 0, + }; + struct ftrace_graph_ret ret = { + .func = ip, + .depth = 0, + .calltime = time, + .rettime = time, + }; + + __trace_graph_entry(tr, &ent, flags, pc); + __trace_graph_return(tr, &ret, flags, pc); +} + +void +trace_graph_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + __trace_graph_function(tr, parent_ip, flags, pc); + __trace_graph_function(tr, ip, flags, pc); +} + void __trace_graph_return(struct trace_array *tr, struct ftrace_graph_ret *trace, unsigned long flags, @@ -1179,7 +1208,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, enum print_line_t -print_graph_function_flags(struct trace_iterator *iter, u32 flags) +__print_graph_function_flags(struct trace_iterator *iter, u32 flags) { struct ftrace_graph_ent_entry *field; struct fgraph_data *data = iter->private; @@ -1242,7 +1271,18 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags) static enum print_line_t print_graph_function(struct trace_iterator *iter) { - return print_graph_function_flags(iter, tracer_flags.val); + return __print_graph_function_flags(iter, tracer_flags.val); +} + +enum print_line_t print_graph_function_flags(struct trace_iterator *iter, + u32 flags) +{ + if (trace_flags & TRACE_ITER_LATENCY_FMT) + flags |= TRACE_GRAPH_PRINT_DURATION; + else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + return __print_graph_function_flags(iter, flags); } static enum print_line_t @@ -1274,7 +1314,7 @@ static void print_lat_header(struct seq_file *s, u32 flags) seq_printf(s, "#%.*s|||| / \n", size, spaces); } -void print_graph_headers_flags(struct seq_file *s, u32 flags) +static void __print_graph_headers_flags(struct seq_file *s, u32 flags) { int lat = trace_flags & TRACE_ITER_LATENCY_FMT; @@ -1315,6 +1355,23 @@ void print_graph_headers(struct seq_file *s) print_graph_headers_flags(s, tracer_flags.val); } +void print_graph_headers_flags(struct seq_file *s, u32 flags) +{ + struct trace_iterator *iter = s->private; + + if (trace_flags & 
TRACE_ITER_LATENCY_FMT) { + /* print nothing if the buffers are empty */ + if (trace_empty(iter)) + return; + + print_trace_header(s, iter); + flags |= TRACE_GRAPH_PRINT_DURATION; + } else + flags |= TRACE_GRAPH_PRINT_ABS_TIME; + + __print_graph_headers_flags(s, flags); +} + void graph_trace_open(struct trace_iterator *iter) { /* pid and depth on the last trace processed */ diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 73a6b0601f2e..4047e98afcba 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -229,75 +229,33 @@ static void irqsoff_trace_close(struct trace_iterator *iter) static enum print_line_t irqsoff_print_line(struct trace_iterator *iter) { - u32 flags = GRAPH_TRACER_FLAGS; - - if (trace_flags & TRACE_ITER_LATENCY_FMT) - flags |= TRACE_GRAPH_PRINT_DURATION; - else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; - /* * In graph mode call the graph tracer output function, * otherwise go with the TRACE_FN event handler */ if (is_graph()) - return print_graph_function_flags(iter, flags); + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); return TRACE_TYPE_UNHANDLED; } static void irqsoff_print_header(struct seq_file *s) { - if (is_graph()) { - struct trace_iterator *iter = s->private; - u32 flags = GRAPH_TRACER_FLAGS; - - if (trace_flags & TRACE_ITER_LATENCY_FMT) { - /* print nothing if the buffers are empty */ - if (trace_empty(iter)) - return; - - print_trace_header(s, iter); - flags |= TRACE_GRAPH_PRINT_DURATION; - } else - flags |= TRACE_GRAPH_PRINT_ABS_TIME; - - print_graph_headers_flags(s, flags); - } else + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else trace_default_header(s); } -static void -trace_graph_function(struct trace_array *tr, - unsigned long ip, unsigned long flags, int pc) -{ - u64 time = trace_clock_local(); - struct ftrace_graph_ent ent = { - .func = ip, - .depth = 0, - }; - struct ftrace_graph_ret ret = { - .func = ip, - .depth = 0, - .calltime = time, - .rettime = time, - }; - - __trace_graph_entry(tr, &ent, flags, pc); - __trace_graph_return(tr, &ret, flags, pc); -} - static void __trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { - if (!is_graph()) + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else trace_function(tr, ip, parent_ip, flags, pc); - else { - trace_graph_function(tr, parent_ip, flags, pc); - trace_graph_function(tr, ip, flags, pc); - } } #else -- cgit v1.2.3 From 7495a5beaa22f190f4888aa8cbe4827c16575d0a Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Thu, 23 Sep 2010 14:00:53 +0200 Subject: tracing: Graph support for wakeup tracer Add function graph support for wakeup latency tracer. The graph output is enabled by setting the 'display-graph' trace option. 
Signed-off-by: Jiri Olsa LKML-Reference: <1285243253-7372-4-git-send-email-jolsa@redhat.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_wakeup.c | 231 ++++++++++++++++++++++++++++++++++++-- 1 file changed, 221 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 4086eae6e81b..033510dbb322 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -31,13 +31,33 @@ static int wakeup_rt; static arch_spinlock_t wakeup_lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; +static void wakeup_reset(struct trace_array *tr); static void __wakeup_reset(struct trace_array *tr); +static int wakeup_graph_entry(struct ftrace_graph_ent *trace); +static void wakeup_graph_return(struct ftrace_graph_ret *trace); static int save_lat_flag; +#define TRACE_DISPLAY_GRAPH 1 + +static struct tracer_opt trace_opts[] = { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + /* display latency trace as call graph */ + { TRACER_OPT(display-graph, TRACE_DISPLAY_GRAPH) }, +#endif + { } /* Empty entry */ +}; + +static struct tracer_flags tracer_flags = { + .val = 0, + .opts = trace_opts, +}; + +#define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) + #ifdef CONFIG_FUNCTION_TRACER /* - * irqsoff uses its own tracer function to keep the overhead down: + * wakeup uses its own tracer function to keep the overhead down: */ static void wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) @@ -80,8 +100,191 @@ static struct ftrace_ops trace_ops __read_mostly = { .func = wakeup_tracer_call, }; + +static int start_func_tracer(int graph) +{ + int ret; + + if (!graph) + ret = register_ftrace_function(&trace_ops); + else + ret = register_ftrace_graph(&wakeup_graph_return, + &wakeup_graph_entry); + + if (!ret && tracing_is_enabled()) + tracer_enabled = 1; + else + tracer_enabled = 0; + + return ret; +} + +static void stop_func_tracer(int graph) +{ + tracer_enabled = 0; + + if (!graph) + unregister_ftrace_function(&trace_ops); + else + unregister_ftrace_graph(); +} + #endif /* CONFIG_FUNCTION_TRACER */ +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +{ + + if (!(bit & TRACE_DISPLAY_GRAPH)) + return -EINVAL; + + if (!(is_graph() ^ set)) + return 0; + + stop_func_tracer(!set); + + wakeup_reset(wakeup_trace); + tracing_max_latency = 0; + + return start_func_tracer(set); +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu, pc, ret = 0; + + if (likely(!wakeup_task)) + return 0; + + pc = preempt_count(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + if (cpu != wakeup_current_cpu) + goto out_enable; + + data = tr->data[cpu]; + disabled = atomic_inc_return(&data->disabled); + if (unlikely(disabled != 1)) + goto out; + + local_save_flags(flags); + ret = __trace_graph_entry(tr, trace, flags, pc); + +out: + atomic_dec(&data->disabled); + +out_enable: + preempt_enable_notrace(); + return ret; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + long disabled; + int cpu, pc; + + if (likely(!wakeup_task)) + return; + + pc = preempt_count(); + preempt_disable_notrace(); + + cpu = raw_smp_processor_id(); + if (cpu != wakeup_current_cpu) + goto out_enable; + + data = tr->data[cpu]; + disabled = 
atomic_inc_return(&data->disabled); + if (unlikely(disabled != 1)) + goto out; + + local_save_flags(flags); + __trace_graph_return(tr, trace, flags, pc); + +out: + atomic_dec(&data->disabled); + +out_enable: + preempt_enable_notrace(); + return; +} + +static void wakeup_trace_open(struct trace_iterator *iter) +{ + if (is_graph()) + graph_trace_open(iter); +} + +static void wakeup_trace_close(struct trace_iterator *iter) +{ + if (iter->private) + graph_trace_close(iter); +} + +#define GRAPH_TRACER_FLAGS (TRACE_GRAPH_PRINT_PROC) + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + /* + * In graph mode call the graph tracer output function, + * otherwise go with the TRACE_FN event handler + */ + if (is_graph()) + return print_graph_function_flags(iter, GRAPH_TRACER_FLAGS); + + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_print_header(struct seq_file *s) +{ + if (is_graph()) + print_graph_headers_flags(s, GRAPH_TRACER_FLAGS); + else + trace_default_header(s); +} + +static void +__trace_function(struct trace_array *tr, + unsigned long ip, unsigned long parent_ip, + unsigned long flags, int pc) +{ + if (is_graph()) + trace_graph_function(tr, ip, parent_ip, flags, pc); + else + trace_function(tr, ip, parent_ip, flags, pc); +} +#else +#define __trace_function trace_function + +static int wakeup_set_flag(u32 old_flags, u32 bit, int set) +{ + return -EINVAL; +} + +static int wakeup_graph_entry(struct ftrace_graph_ent *trace) +{ + return -1; +} + +static enum print_line_t wakeup_print_line(struct trace_iterator *iter) +{ + return TRACE_TYPE_UNHANDLED; +} + +static void wakeup_graph_return(struct ftrace_graph_ret *trace) { } +static void wakeup_print_header(struct seq_file *s) { } +static void wakeup_trace_open(struct trace_iterator *iter) { } +static void wakeup_trace_close(struct trace_iterator *iter) { } +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + /* * Should this new latency be reported/recorded? */ @@ -152,7 +355,7 @@ probe_wakeup_sched_switch(void *ignore, /* The task we are waiting for is waking up */ data = wakeup_trace->data[wakeup_cpu]; - trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); + __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); T0 = data->preempt_timestamp; @@ -252,7 +455,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) * is not called by an assembly function (where as schedule is) * it should be safe to use it here. 
*/ - trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); + __trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc); out_locked: arch_spin_unlock(&wakeup_lock); @@ -303,12 +506,8 @@ static void start_wakeup_tracer(struct trace_array *tr) */ smp_wmb(); - register_ftrace_function(&trace_ops); - - if (tracing_is_enabled()) - tracer_enabled = 1; - else - tracer_enabled = 0; + if (start_func_tracer(is_graph())) + printk(KERN_ERR "failed to start wakeup tracer\n"); return; fail_deprobe_wake_new: @@ -320,7 +519,7 @@ fail_deprobe: static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; - unregister_ftrace_function(&trace_ops); + stop_func_tracer(is_graph()); unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); unregister_trace_sched_wakeup_new(probe_wakeup, NULL); unregister_trace_sched_wakeup(probe_wakeup, NULL); @@ -379,9 +578,15 @@ static struct tracer wakeup_tracer __read_mostly = .start = wakeup_tracer_start, .stop = wakeup_tracer_stop, .print_max = 1, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, .use_max_tr = 1, }; @@ -394,9 +599,15 @@ static struct tracer wakeup_rt_tracer __read_mostly = .stop = wakeup_tracer_stop, .wait_pipe = poll_wait_pipe, .print_max = 1, + .print_header = wakeup_print_header, + .print_line = wakeup_print_line, + .flags = &tracer_flags, + .set_flag = wakeup_set_flag, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif + .open = wakeup_trace_open, + .close = wakeup_trace_close, .use_max_tr = 1, }; -- cgit v1.2.3 From 542181d3769d001c59cd17573dd4381e87d215f2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Oct 2010 16:38:49 -0400 Subject: tracing: Use one prologue for the wakeup tracer function tracers The wakeup tracer has three types of function tracers. Normal function tracer, function graph entry, and function graph return. Each of these use a complex dance to prevent recursion and whether to trace the data or not (depending on the wake_task variable). This patch moves the duplicate code into a single routine, to prevent future mistakes with modifying duplicate complex code. Cc: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_wakeup.c | 102 +++++++++++++++++++------------------- 1 file changed, 50 insertions(+), 52 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 033510dbb322..31689d2df7f3 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -56,43 +56,73 @@ static struct tracer_flags tracer_flags = { #define is_graph() (tracer_flags.val & TRACE_DISPLAY_GRAPH) #ifdef CONFIG_FUNCTION_TRACER + /* - * wakeup uses its own tracer function to keep the overhead down: + * Prologue for the wakeup function tracers. + * + * Returns 1 if it is OK to continue, and preemption + * is disabled and data->disabled is incremented. + * 0 if the trace is to be ignored, and preemption + * is not disabled and data->disabled is + * kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. 
*/ -static void -wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +static int +func_prolog_preempt_disable(struct trace_array *tr, + struct trace_array_cpu **data, + int *pc) { - struct trace_array *tr = wakeup_trace; - struct trace_array_cpu *data; - unsigned long flags; long disabled; int cpu; - int pc; if (likely(!wakeup_task)) - return; + return 0; - pc = preempt_count(); + *pc = preempt_count(); preempt_disable_notrace(); cpu = raw_smp_processor_id(); if (cpu != wakeup_current_cpu) goto out_enable; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + *data = tr->data[cpu]; + disabled = atomic_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; - local_irq_save(flags); + return 1; - trace_function(tr, ip, parent_ip, flags, pc); +out: + atomic_dec(&(*data)->disabled); + +out_enable: + preempt_enable_notrace(); + return 0; +} + +/* + * wakeup uses its own tracer function to keep the overhead down: + */ +static void +wakeup_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = wakeup_trace; + struct trace_array_cpu *data; + unsigned long flags; + int pc; + + if (!func_prolog_preempt_disable(tr, &data, &pc)) + return; + local_irq_save(flags); + trace_function(tr, ip, parent_ip, flags, pc); local_irq_restore(flags); - out: atomic_dec(&data->disabled); - out_enable: preempt_enable_notrace(); } @@ -154,32 +184,16 @@ static int wakeup_graph_entry(struct ftrace_graph_ent *trace) struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; - int cpu, pc, ret = 0; + int pc, ret = 0; - if (likely(!wakeup_task)) + if (!func_prolog_preempt_disable(tr, &data, &pc)) return 0; - pc = preempt_count(); - preempt_disable_notrace(); - - cpu = raw_smp_processor_id(); - if (cpu != wakeup_current_cpu) - goto out_enable; - - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (unlikely(disabled != 1)) - goto out; - local_save_flags(flags); ret = __trace_graph_entry(tr, trace, flags, pc); - -out: atomic_dec(&data->disabled); - -out_enable: preempt_enable_notrace(); + return ret; } @@ -188,31 +202,15 @@ static void wakeup_graph_return(struct ftrace_graph_ret *trace) struct trace_array *tr = wakeup_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; - int cpu, pc; + int pc; - if (likely(!wakeup_task)) + if (!func_prolog_preempt_disable(tr, &data, &pc)) return; - pc = preempt_count(); - preempt_disable_notrace(); - - cpu = raw_smp_processor_id(); - if (cpu != wakeup_current_cpu) - goto out_enable; - - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - if (unlikely(disabled != 1)) - goto out; - local_save_flags(flags); __trace_graph_return(tr, trace, flags, pc); - -out: atomic_dec(&data->disabled); -out_enable: preempt_enable_notrace(); return; } -- cgit v1.2.3 From 5e6d2b9cfa3a6e7fe62fc0135bc1bd778f5db564 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Oct 2010 19:41:43 -0400 Subject: tracing: Use one prologue for the preempt irqs off tracer function tracers The preempt and irqsoff tracers have three types of function tracers. Normal function tracer, function graph entry, and function graph return. Each of these use a complex dance to prevent recursion and whether to trace the data or not (depending if interrupts are enabled or not). This patch moves the duplicate code into a single routine, to prevent future mistakes with modifying duplicate complex code. 
Cc: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/trace_irqsoff.c | 96 ++++++++++++++++++++++---------------------- 1 file changed, 48 insertions(+), 48 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 4047e98afcba..5cf8c602b880 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -87,14 +87,22 @@ static __cacheline_aligned_in_smp unsigned long max_sequence; #ifdef CONFIG_FUNCTION_TRACER /* - * irqsoff uses its own tracer function to keep the overhead down: + * Prologue for the preempt and irqs off function tracers. + * + * Returns 1 if it is OK to continue, and data->disabled is + * incremented. + * 0 if the trace is to be ignored, and data->disabled + * is kept the same. + * + * Note, this function is also used outside this ifdef but + * inside the #ifdef of the function graph tracer below. + * This is OK, since the function graph tracer is + * dependent on the function tracer. */ -static void -irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +static int func_prolog_dec(struct trace_array *tr, + struct trace_array_cpu **data, + unsigned long *flags) { - struct trace_array *tr = irqsoff_trace; - struct trace_array_cpu *data; - unsigned long flags; long disabled; int cpu; @@ -106,18 +114,38 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) */ cpu = raw_smp_processor_id(); if (likely(!per_cpu(tracing_cpu, cpu))) - return; + return 0; - local_save_flags(flags); + local_save_flags(*flags); /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) - return; + if (!irqs_disabled_flags(*flags)) + return 0; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); + *data = tr->data[cpu]; + disabled = atomic_inc_return(&(*data)->disabled); if (likely(disabled == 1)) - trace_function(tr, ip, parent_ip, flags, preempt_count()); + return 1; + + atomic_dec(&(*data)->disabled); + + return 0; +} + +/* + * irqsoff uses its own tracer function to keep the overhead down: + */ +static void +irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip) +{ + struct trace_array *tr = irqsoff_trace; + struct trace_array_cpu *data; + unsigned long flags; + + if (!func_prolog_dec(tr, &data, &flags)) + return; + + trace_function(tr, ip, parent_ip, flags, preempt_count()); atomic_dec(&data->disabled); } @@ -155,30 +183,16 @@ static int irqsoff_graph_entry(struct ftrace_graph_ent *trace) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; int ret; - int cpu; int pc; - cpu = raw_smp_processor_id(); - if (likely(!per_cpu(tracing_cpu, cpu))) - return 0; - - local_save_flags(flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) + if (!func_prolog_dec(tr, &data, &flags)) return 0; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - ret = __trace_graph_entry(tr, trace, flags, pc); - } else - ret = 0; - + pc = preempt_count(); + ret = __trace_graph_entry(tr, trace, flags, pc); atomic_dec(&data->disabled); + return ret; } @@ -187,27 +201,13 @@ static void irqsoff_graph_return(struct ftrace_graph_ret *trace) struct trace_array *tr = irqsoff_trace; struct trace_array_cpu *data; unsigned long flags; - long disabled; - int cpu; int pc; - cpu = raw_smp_processor_id(); - if (likely(!per_cpu(tracing_cpu, cpu))) - return; - - local_save_flags(flags); - /* slight 
chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(flags)) + if (!func_prolog_dec(tr, &data, &flags)) return; - data = tr->data[cpu]; - disabled = atomic_inc_return(&data->disabled); - - if (likely(disabled == 1)) { - pc = preempt_count(); - __trace_graph_return(tr, trace, flags, pc); - } - + pc = preempt_count(); + __trace_graph_return(tr, trace, flags, pc); atomic_dec(&data->disabled); } -- cgit v1.2.3 From 78c89ba121221d9224a5747803d7fffe51cd6e44 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 Oct 2010 23:22:19 -0400 Subject: tracing: Remove parent recording in latency tracer graph options Even though the parent is recorded with the normal function tracing of the latency tracers (irqsoff and wakeup), the function graph recording is bogus. This is due to the function graph messing with the return stack. The latency tracers pass in as the parent CALLER_ADDR0, which works fine for plain function tracing. But this causes bogus output with the graph tracer: 3) -0 | d.s3. 0.000 us | return_to_handler(); 3) -0 | d.s3. 0.000 us | _raw_spin_unlock_irqrestore(); 3) -0 | d.s3. 0.000 us | return_to_handler(); 3) -0 | d.s3. 0.000 us | trace_hardirqs_on(); The "return_to_handle()" call is the trampoline of the function graph tracer, and is meaningless in this context. Cc: Jiri Olsa Signed-off-by: Steven Rostedt --- kernel/trace/trace_functions_graph.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 6f8fe28acba1..76b05980225c 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -287,7 +287,6 @@ trace_graph_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { - __trace_graph_function(tr, parent_ip, flags, pc); __trace_graph_function(tr, ip, flags, pc); } -- cgit v1.2.3 From c530ccd9a1864a44a7ff35826681229ce9f2357a Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 15 Oct 2010 15:26:01 +0200 Subject: perf_events: Fix bogus context time tracking You can only call update_context_time() when the context is active, i.e., the thread it is attached to is still running. However, perf_event_read() can be called even when the context is inactive, e.g., user read() the counters. The call to update_context_time() must be conditioned on the status of the context, otherwise, bogus time_enabled, time_running may be returned. Here is an example on AMD64. The task program is an example from libpfm4. The -p prints deltas every 1s. $ task -p -e cpu_clk_unhalted sleep 5 2,266,610 cpu_clk_unhalted (0.00% scaling, ena=2,158,982, run=2,158,982) 0 cpu_clk_unhalted (0.00% scaling, ena=2,158,982, run=2,158,982) 0 cpu_clk_unhalted (0.00% scaling, ena=2,158,982, run=2,158,982) 0 cpu_clk_unhalted (0.00% scaling, ena=2,158,982, run=2,158,982) 0 cpu_clk_unhalted (0.00% scaling, ena=2,158,982, run=2,158,982) 5,242,358,071 cpu_clk_unhalted (99.95% scaling, ena=5,000,359,984, run=2,319,270) Whereas if you don't read deltas, e.g., no call to perf_event_read() until the process terminates: $ task -e cpu_clk_unhalted sleep 5 2,497,783 cpu_clk_unhalted (0.00% scaling, ena=2,376,899, run=2,376,899) Notice that time_enable, time_running are bogus in the first example causing bogus scaling. This patch fixes the problem, by conditionally calling update_context_time() in perf_event_read(). 
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Cc: stable@kernel.org LKML-Reference: <4cb856dc.51edd80a.5ae0.38fb@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 1ec3916ffef0..e7eeba1794fd 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -1780,7 +1780,13 @@ static u64 perf_event_read(struct perf_event *event) unsigned long flags; raw_spin_lock_irqsave(&ctx->lock, flags); - update_context_time(ctx); + /* + * may read while context is not active + * (e.g., thread is blocked), in that case + * we cannot update context time + */ + if (ctx->is_active) + update_context_time(ctx); update_event_times(event); raw_spin_unlock_irqrestore(&ctx->lock, flags); } -- cgit v1.2.3 From 8e5fc1a7320baf6076391607515dceb61319b36a Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Fri, 15 Oct 2010 16:54:01 +0200 Subject: perf_events: Fix transaction recovery in group_sched_in() The group_sched_in() function uses a transactional approach to schedule a group of events. In a group, either all events can be scheduled or none are. To schedule each event in, the function calls event_sched_in(). In case of error, event_sched_out() is called on each event in the group. The problem is that event_sched_out() does not completely cancel the effects of event_sched_in(). Furthermore event_sched_out() changes the state of the event as if it had run which is not true is this particular case. Those inconsistencies impact time tracking fields and may lead to events in a group not all reporting the same time_enabled and time_running values. This is demonstrated with the example below: $ task -eunhalted_core_cycles,baclears,baclears -e unhalted_core_cycles,baclears,baclears sleep 5 1946101 unhalted_core_cycles (32.85% scaling, ena=829181, run=556827) 11423 baclears (32.85% scaling, ena=829181, run=556827) 7671 baclears (0.00% scaling, ena=556827, run=556827) 2250443 unhalted_core_cycles (57.83% scaling, ena=962822, run=405995) 11705 baclears (57.83% scaling, ena=962822, run=405995) 11705 baclears (57.83% scaling, ena=962822, run=405995) Notice that in the first group, the last baclears event does not report the same timings as its siblings. This issue comes from the fact that tstamp_stopped is updated by event_sched_out() as if the event had actually run. To solve the issue, we must ensure that, in case of error, there is no change in the event state whatsoever. That means timings must remain as they were when entering group_sched_in(). To do this we defer updating tstamp_running until we know the transaction succeeded. Therefore, we have split event_sched_in() in two parts separating the update to tstamp_running. Similarly, in case of error, we do not want to update tstamp_stopped. Therefore, we have split event_sched_out() in two parts separating the update to tstamp_stopped. 
With this patch, we now get the following output: $ task -eunhalted_core_cycles,baclears,baclears -e unhalted_core_cycles,baclears,baclears sleep 5 2492050 unhalted_core_cycles (71.75% scaling, ena=1093330, run=308841) 11243 baclears (71.75% scaling, ena=1093330, run=308841) 11243 baclears (71.75% scaling, ena=1093330, run=308841) 1852746 unhalted_core_cycles (0.00% scaling, ena=784489, run=784489) 9253 baclears (0.00% scaling, ena=784489, run=784489) 9253 baclears (0.00% scaling, ena=784489, run=784489) Note that the uneven timing between groups is a side effect of the process spending most of its time sleeping, i.e., not enough event rotations (but that's a separate issue). Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra LKML-Reference: <4cb86b4c.41e9d80a.44e9.3e19@mx.google.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 76 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index e7eeba1794fd..634f86a4b2f9 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -412,8 +412,8 @@ event_filter_match(struct perf_event *event) return event->cpu == -1 || event->cpu == smp_processor_id(); } -static void -event_sched_out(struct perf_event *event, +static int +__event_sched_out(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { @@ -432,14 +432,13 @@ event_sched_out(struct perf_event *event, } if (event->state != PERF_EVENT_STATE_ACTIVE) - return; + return 0; event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } - event->tstamp_stopped = ctx->time; event->pmu->del(event, 0); event->oncpu = -1; @@ -448,6 +447,19 @@ event_sched_out(struct perf_event *event, ctx->nr_active--; if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; + return 1; +} + +static void +event_sched_out(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + int ret; + + ret = __event_sched_out(event, cpuctx, ctx); + if (ret) + event->tstamp_stopped = ctx->time; } static void @@ -647,7 +659,7 @@ retry: } static int -event_sched_in(struct perf_event *event, +__event_sched_in(struct perf_event *event, struct perf_cpu_context *cpuctx, struct perf_event_context *ctx) { @@ -667,8 +679,6 @@ event_sched_in(struct perf_event *event, return -EAGAIN; } - event->tstamp_running += ctx->time - event->tstamp_stopped; - if (!is_software_event(event)) cpuctx->active_oncpu++; ctx->nr_active++; @@ -679,6 +689,35 @@ event_sched_in(struct perf_event *event, return 0; } +static inline int +event_sched_in(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + int ret = __event_sched_in(event, cpuctx, ctx); + if (ret) + return ret; + event->tstamp_running += ctx->time - event->tstamp_stopped; + return 0; +} + +static void +group_commit_event_sched_in(struct perf_event *group_event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + struct perf_event *event; + u64 now = ctx->time; + + group_event->tstamp_running += now - group_event->tstamp_stopped; + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(event, &group_event->sibling_list, group_entry) { + event->tstamp_running += now - event->tstamp_stopped; + } +} + static int group_sched_in(struct perf_event *group_event, struct perf_cpu_context *cpuctx, @@ -692,7 
+731,13 @@ group_sched_in(struct perf_event *group_event, pmu->start_txn(pmu); - if (event_sched_in(group_event, cpuctx, ctx)) { + /* + * use __event_sched_in() to delay updating tstamp_running + * until the transaction is committed. In case of failure + * we will keep an unmodified tstamp_running which is a + * requirement to get correct timing information + */ + if (__event_sched_in(group_event, cpuctx, ctx)) { pmu->cancel_txn(pmu); return -EAGAIN; } @@ -701,26 +746,31 @@ group_sched_in(struct perf_event *group_event, * Schedule in siblings as one group (if any): */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { - if (event_sched_in(event, cpuctx, ctx)) { + if (__event_sched_in(event, cpuctx, ctx)) { partial_group = event; goto group_error; } } - if (!pmu->commit_txn(pmu)) + if (!pmu->commit_txn(pmu)) { + /* commit tstamp_running */ + group_commit_event_sched_in(group_event, cpuctx, ctx); return 0; - + } group_error: /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: + * + * use __event_sched_out() to avoid updating tstamp_stopped + * because the event never actually ran */ list_for_each_entry(event, &group_event->sibling_list, group_entry) { if (event == partial_group) break; - event_sched_out(event, cpuctx, ctx); + __event_sched_out(event, cpuctx, ctx); } - event_sched_out(group_event, cpuctx, ctx); + __event_sched_out(group_event, cpuctx, ctx); pmu->cancel_txn(pmu); -- cgit v1.2.3 From e360adbe29241a0194e10e20595360dd7b98a2b3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 Oct 2010 14:01:34 +0800 Subject: irq_work: Add generic hardirq context callbacks Provide a mechanism that allows running code in IRQ context. It is most useful for NMI code that needs to interact with the rest of the system -- like wakeup a task to drain buffers. Perf currently has such a mechanism, so extract that and provide it as a generic feature, independent of perf so that others may also benefit. The IRQ context callback is generated through self-IPIs where possible, or on architectures like powerpc the decrementer (the built-in timer facility) is set to generate an interrupt immediately. Architectures that don't have anything like this get to do with a callback from the timer tick. These architectures can call irq_work_run() at the tail of any IRQ handlers that might enqueue such work (like the perf IRQ handler) to avoid undue latencies in processing the work. 
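To give a feel for the resulting interface, a hypothetical user outside of perf (sketch only; drain_cb and drain_work are invented names, the calls mirror how the perf code below uses them) looks roughly like this:

	#include <linux/irq_work.h>

	static void drain_cb(struct irq_work *work)
	{
		/* runs in hardirq context with IRQs disabled, shortly after queueing */
	}

	static struct irq_work drain_work;

	static void drain_setup(void)
	{
		init_irq_work(&drain_work, drain_cb);	/* bind the callback once */
	}

	static void drain_trigger(void)		/* may be called from NMI context */
	{
		if (!irq_work_queue(&drain_work))
			return;	/* already pending; the queued callback still runs */
	}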
Signed-off-by: Peter Zijlstra Acked-by: Kyle McMartin Acked-by: Martin Schwidefsky [ various fixes ] Signed-off-by: Huang Ying LKML-Reference: <1287036094.7768.291.camel@yhuang-dev> Signed-off-by: Ingo Molnar --- kernel/Makefile | 2 + kernel/irq_work.c | 164 ++++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/perf_event.c | 104 ++------------------------------- kernel/timer.c | 7 ++- 4 files changed, 176 insertions(+), 101 deletions(-) create mode 100644 kernel/irq_work.c (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index d52b473c99a1..4d9bf5f8531f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -23,6 +23,7 @@ CFLAGS_REMOVE_rtmutex-debug.o = -pg CFLAGS_REMOVE_cgroup-debug.o = -pg CFLAGS_REMOVE_sched_clock.o = -pg CFLAGS_REMOVE_perf_event.o = -pg +CFLAGS_REMOVE_irq_work.o = -pg endif obj-$(CONFIG_FREEZER) += freezer.o @@ -100,6 +101,7 @@ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_RING_BUFFER) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o +obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o diff --git a/kernel/irq_work.c b/kernel/irq_work.c new file mode 100644 index 000000000000..f16763ff8481 --- /dev/null +++ b/kernel/irq_work.c @@ -0,0 +1,164 @@ +/* + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra + * + * Provides a framework for enqueueing and running callbacks from hardirq + * context. The enqueueing is NMI-safe. + */ + +#include +#include +#include +#include + +/* + * An entry can be in one of four states: + * + * free NULL, 0 -> {claimed} : free to be used + * claimed NULL, 3 -> {pending} : claimed to be enqueued + * pending next, 3 -> {busy} : queued, pending callback + * busy NULL, 2 -> {free, claimed} : callback in progress, can be claimed + * + * We use the lower two bits of the next pointer to keep PENDING and BUSY + * flags. + */ + +#define IRQ_WORK_PENDING 1UL +#define IRQ_WORK_BUSY 2UL +#define IRQ_WORK_FLAGS 3UL + +static inline bool irq_work_is_set(struct irq_work *entry, int flags) +{ + return (unsigned long)entry->next & flags; +} + +static inline struct irq_work *irq_work_next(struct irq_work *entry) +{ + unsigned long next = (unsigned long)entry->next; + next &= ~IRQ_WORK_FLAGS; + return (struct irq_work *)next; +} + +static inline struct irq_work *next_flags(struct irq_work *entry, int flags) +{ + unsigned long next = (unsigned long)entry; + next |= flags; + return (struct irq_work *)next; +} + +static DEFINE_PER_CPU(struct irq_work *, irq_work_list); + +/* + * Claim the entry so that no one else will poke at it. + */ +static bool irq_work_claim(struct irq_work *entry) +{ + struct irq_work *next, *nflags; + + do { + next = entry->next; + if ((unsigned long)next & IRQ_WORK_PENDING) + return false; + nflags = next_flags(next, IRQ_WORK_FLAGS); + } while (cmpxchg(&entry->next, next, nflags) != next); + + return true; +} + + +void __weak arch_irq_work_raise(void) +{ + /* + * Lame architectures will get the timer tick callback + */ +} + +/* + * Queue the entry and raise the IPI if needed. + */ +static void __irq_work_queue(struct irq_work *entry) +{ + struct irq_work **head, *next; + + head = &get_cpu_var(irq_work_list); + + do { + next = *head; + /* Can assign non-atomic because we keep the flags set. 
*/ + entry->next = next_flags(next, IRQ_WORK_FLAGS); + } while (cmpxchg(head, next, entry) != next); + + /* The list was empty, raise self-interrupt to start processing. */ + if (!irq_work_next(entry)) + arch_irq_work_raise(); + + put_cpu_var(irq_work_list); +} + +/* + * Enqueue the irq_work @entry, returns true on success, failure when the + * @entry was already enqueued by someone else. + * + * Can be re-enqueued while the callback is still in progress. + */ +bool irq_work_queue(struct irq_work *entry) +{ + if (!irq_work_claim(entry)) { + /* + * Already enqueued, can't do! + */ + return false; + } + + __irq_work_queue(entry); + return true; +} +EXPORT_SYMBOL_GPL(irq_work_queue); + +/* + * Run the irq_work entries on this cpu. Requires to be ran from hardirq + * context with local IRQs disabled. + */ +void irq_work_run(void) +{ + struct irq_work *list, **head; + + head = &__get_cpu_var(irq_work_list); + if (*head == NULL) + return; + + BUG_ON(!in_irq()); + BUG_ON(!irqs_disabled()); + + list = xchg(head, NULL); + while (list != NULL) { + struct irq_work *entry = list; + + list = irq_work_next(list); + + /* + * Clear the PENDING bit, after this point the @entry + * can be re-used. + */ + entry->next = next_flags(NULL, IRQ_WORK_BUSY); + entry->func(entry); + /* + * Clear the BUSY bit and return to the free state if + * no-one else claimed it meanwhile. + */ + cmpxchg(&entry->next, next_flags(NULL, IRQ_WORK_BUSY), NULL); + } +} +EXPORT_SYMBOL_GPL(irq_work_run); + +/* + * Synchronize against the irq_work @entry, ensures the entry is not + * currently in use. + */ +void irq_work_sync(struct irq_work *entry) +{ + WARN_ON_ONCE(irqs_disabled()); + + while (irq_work_is_set(entry, IRQ_WORK_BUSY)) + cpu_relax(); +} +EXPORT_SYMBOL_GPL(irq_work_sync); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 634f86a4b2f9..99b9700e74d0 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -2206,12 +2206,11 @@ static void free_event_rcu(struct rcu_head *head) kfree(event); } -static void perf_pending_sync(struct perf_event *event); static void perf_buffer_put(struct perf_buffer *buffer); static void free_event(struct perf_event *event) { - perf_pending_sync(event); + irq_work_sync(&event->pending); if (!event->parent) { atomic_dec(&nr_events); @@ -3162,16 +3161,7 @@ void perf_event_wakeup(struct perf_event *event) } } -/* - * Pending wakeups - * - * Handle the case where we need to wakeup up from NMI (or rq->lock) context. - * - * The NMI bit means we cannot possibly take locks. Therefore, maintain a - * single linked list and use cmpxchg() to add entries lockless. 
- */ - -static void perf_pending_event(struct perf_pending_entry *entry) +static void perf_pending_event(struct irq_work *entry) { struct perf_event *event = container_of(entry, struct perf_event, pending); @@ -3187,89 +3177,6 @@ static void perf_pending_event(struct perf_pending_entry *entry) } } -#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) - -static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { - PENDING_TAIL, -}; - -static void perf_pending_queue(struct perf_pending_entry *entry, - void (*func)(struct perf_pending_entry *)) -{ - struct perf_pending_entry **head; - - if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) - return; - - entry->func = func; - - head = &get_cpu_var(perf_pending_head); - - do { - entry->next = *head; - } while (cmpxchg(head, entry->next, entry) != entry->next); - - set_perf_event_pending(); - - put_cpu_var(perf_pending_head); -} - -static int __perf_pending_run(void) -{ - struct perf_pending_entry *list; - int nr = 0; - - list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); - while (list != PENDING_TAIL) { - void (*func)(struct perf_pending_entry *); - struct perf_pending_entry *entry = list; - - list = list->next; - - func = entry->func; - entry->next = NULL; - /* - * Ensure we observe the unqueue before we issue the wakeup, - * so that we won't be waiting forever. - * -- see perf_not_pending(). - */ - smp_wmb(); - - func(entry); - nr++; - } - - return nr; -} - -static inline int perf_not_pending(struct perf_event *event) -{ - /* - * If we flush on whatever cpu we run, there is a chance we don't - * need to wait. - */ - get_cpu(); - __perf_pending_run(); - put_cpu(); - - /* - * Ensure we see the proper queue state before going to sleep - * so that we do not miss the wakeup. -- see perf_pending_handle() - */ - smp_rmb(); - return event->pending.next == NULL; -} - -static void perf_pending_sync(struct perf_event *event) -{ - wait_event(event->waitq, perf_not_pending(event)); -} - -void perf_event_do_pending(void) -{ - __perf_pending_run(); -} - /* * We assume there is only KVM supporting the callbacks. 
 * Later on, we might change it to a list if there is
@@ -3319,8 +3226,7 @@ static void perf_output_wakeup(struct perf_output_handle *handle)
 
 	if (handle->nmi) {
 		handle->event->pending_wakeup = 1;
-		perf_pending_queue(&handle->event->pending,
-				   perf_pending_event);
+		irq_work_queue(&handle->event->pending);
 	} else
 		perf_event_wakeup(handle->event);
 }
@@ -4356,8 +4262,7 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
 		event->pending_kill = POLL_HUP;
 		if (nmi) {
 			event->pending_disable = 1;
-			perf_pending_queue(&event->pending,
-					   perf_pending_event);
+			irq_work_queue(&event->pending);
 		} else
 			perf_event_disable(event);
 	}
@@ -5374,6 +5279,7 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 	INIT_LIST_HEAD(&event->event_entry);
 	INIT_LIST_HEAD(&event->sibling_list);
 	init_waitqueue_head(&event->waitq);
+	init_irq_work(&event->pending, perf_pending_event);
 
 	mutex_init(&event->mmap_mutex);
diff --git a/kernel/timer.c b/kernel/timer.c
index 97bf05baade7..68a9ae7679b7 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,7 +37,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 
@@ -1279,7 +1279,10 @@ void update_process_times(int user_tick)
 	run_local_timers();
 	rcu_check_callbacks(cpu, user_tick);
 	printk_tick();
-	perf_event_do_pending();
+#ifdef CONFIG_IRQ_WORK
+	if (in_irq())
+		irq_work_run();
+#endif
 	scheduler_tick();
 	run_posix_cpu_timers(p);
 }
-- cgit v1.2.3

From 74c3337c2fc6389d3a57a622a936036b6db6b2e8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Fri, 15 Oct 2010 11:40:29 +0200
Subject: perf: Fix group moving

Matt found we trigger the WARN_ON_ONCE() in perf_group_attach() when we take
the move_group path in perf_event_open(). Since we cannot de-construct the
group (we rely on it to move the events), we have to simply ignore the double
attach. The group state is context invariant and doesn't need changing.

Reported-by: Matt Fleming
Signed-off-by: Peter Zijlstra
LKML-Reference: <1287135757.29097.1368.camel@twins>
Signed-off-by: Ingo Molnar
---
 kernel/perf_event.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)
(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 99b9700e74d0..346dc0e35a0a 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -315,7 +315,12 @@ static void perf_group_attach(struct perf_event *event)
 {
 	struct perf_event *group_leader = event->group_leader;
 
-	WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP);
+	/*
+	 * We can have double attach due to group movement in perf_event_open.
+	 */
+	if (event->attach_state & PERF_ATTACH_GROUP)
+		return;
+
 	event->attach_state |= PERF_ATTACH_GROUP;
 
 	if (group_leader == event)
-- cgit v1.2.3

From e7d0bc047548d76feee6b23f7d3d9da927189a50 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 14 Oct 2010 16:54:51 +0200
Subject: perf: Fix task refcount bugs

Currently it looks like find_lively_task_by_vpid() takes a task ref and relies
on find_get_context() to drop it.

The problem is that perf_event_create_kernel_counter() shouldn't be dropping
task refs.
Signed-off-by: Peter Zijlstra
Acked-by: Frederic Weisbecker
Acked-by: Matt Helsley
LKML-Reference: <20101014203625.278436085@chello.nl>
Signed-off-by: Ingo Molnar
---
 kernel/perf_event.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)
(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 346dc0e35a0a..f928878a1c17 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -2190,11 +2190,9 @@ retry:
 		}
 	}
 
-	put_task_struct(task);
 	return ctx;
 
 errout:
-	put_task_struct(task);
 	return ERR_PTR(err);
 }
 
@@ -5602,7 +5600,7 @@ SYSCALL_DEFINE5(perf_event_open,
 	ctx = find_get_context(pmu, task, cpu);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
-		goto err_group_fd;
+		goto err_task;
 	}
 
 	/*
@@ -5698,6 +5696,9 @@ SYSCALL_DEFINE5(perf_event_open,
 
 err_context:
 	put_ctx(ctx);
+err_task:
+	if (task)
+		put_task_struct(task);
 err_group_fd:
 	fput_light(group_file, fput_needed);
 	free_event(event);
-- cgit v1.2.3

From c6be5a5cb62592d9d661899a2aa78236eb00ffa5 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 14 Oct 2010 16:59:46 +0200
Subject: perf: Find task before event alloc

So that we can pass the task pointer to the event allocation, so that we can
use task associated data during event initialization.

Signed-off-by: Peter Zijlstra
LKML-Reference: <20101014203625.340789919@chello.nl>
Signed-off-by: Ingo Molnar
---
 kernel/perf_event.c | 23 ++++++++++++-----------
 1 file changed, 12 insertions(+), 11 deletions(-)
(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f928878a1c17..b21d06aaef60 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5551,10 +5551,18 @@ SYSCALL_DEFINE5(perf_event_open,
 		group_leader = NULL;
 	}
 
+	if (pid != -1) {
+		task = find_lively_task_by_vpid(pid);
+		if (IS_ERR(task)) {
+			err = PTR_ERR(task);
+			goto err_group_fd;
+		}
+	}
+
 	event = perf_event_alloc(&attr, cpu, group_leader, NULL, NULL);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
-		goto err_fd;
+		goto err_task;
 	}
 
 	/*
@@ -5586,21 +5594,13 @@ SYSCALL_DEFINE5(perf_event_open,
 		}
 	}
 
-	if (pid != -1) {
-		task = find_lively_task_by_vpid(pid);
-		if (IS_ERR(task)) {
-			err = PTR_ERR(task);
-			goto err_group_fd;
-		}
-	}
-
 	/*
 	 * Get the target context (task or percpu):
 	 */
 	ctx = find_get_context(pmu, task, cpu);
 	if (IS_ERR(ctx)) {
 		err = PTR_ERR(ctx);
-		goto err_task;
+		goto err_alloc;
 	}
 
 	/*
@@ -5696,12 +5696,13 @@ SYSCALL_DEFINE5(perf_event_open,
 
 err_context:
 	put_ctx(ctx);
+err_alloc:
+	free_event(event);
 err_task:
 	if (task)
 		put_task_struct(task);
 err_group_fd:
 	fput_light(group_file, fput_needed);
-	free_event(event);
 err_fd:
 	put_unused_fd(event_fd);
 	return err;
-- cgit v1.2.3

From d580ff8699e8811a9af37e9de4dea375401bdeec Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 14 Oct 2010 17:43:23 +0200
Subject: perf, hw_breakpoint: Fix crash in hw_breakpoint creation

hw_breakpoint creation needs to account stuff per-task to ensure there is
always sufficient hardware resources to back these things due to ptrace.

With the perf per pmu context changes the event initialization no longer has
access to the event context, for the simple reason that we need to first find
the pmu (result of initialization) before we can find the context.

This makes hw_breakpoints unhappy, because it can no longer do per task
accounting, cure this by frobbing a task pointer in the event::hw bits for
now...
Signed-off-by: Peter Zijlstra
Cc: Frederic Weisbecker
LKML-Reference: <20101014203625.391543667@chello.nl>
Signed-off-by: Ingo Molnar
---
 kernel/hw_breakpoint.c |  8 ++++----
 kernel/perf_event.c    | 23 ++++++++++++++++++-----
 2 files changed, 22 insertions(+), 9 deletions(-)
(limited to 'kernel')

diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 3b714e839c10..2c9120f0afca 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -113,12 +113,12 @@ static unsigned int max_task_bp_pinned(int cpu, enum bp_type_idx type)
  */
 static int task_bp_pinned(struct perf_event *bp, enum bp_type_idx type)
 {
-	struct perf_event_context *ctx = bp->ctx;
+	struct task_struct *tsk = bp->hw.bp_target;
 	struct perf_event *iter;
 	int count = 0;
 
 	list_for_each_entry(iter, &bp_task_head, hw.bp_list) {
-		if (iter->ctx == ctx && find_slot_idx(iter) == type)
+		if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type)
 			count += hw_breakpoint_weight(iter);
 	}
 
@@ -134,7 +134,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp,
 		    enum bp_type_idx type)
 {
 	int cpu = bp->cpu;
-	struct task_struct *tsk = bp->ctx->task;
+	struct task_struct *tsk = bp->hw.bp_target;
 
 	if (cpu >= 0) {
 		slots->pinned = per_cpu(nr_cpu_bp_pinned[type], cpu);
@@ -213,7 +213,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type,
 	       int weight)
 {
 	int cpu = bp->cpu;
-	struct task_struct *tsk = bp->ctx->task;
+	struct task_struct *tsk = bp->hw.bp_target;
 
 	/* Pinned counter cpu profiling */
 	if (!tsk) {
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index b21d06aaef60..856e20baf13f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -5255,9 +5255,10 @@ unlock:
  */
 static struct perf_event *
 perf_event_alloc(struct perf_event_attr *attr, int cpu,
-		 struct perf_event *group_leader,
-		 struct perf_event *parent_event,
-		 perf_overflow_handler_t overflow_handler)
+		 struct task_struct *task,
+		 struct perf_event *group_leader,
+		 struct perf_event *parent_event,
+		 perf_overflow_handler_t overflow_handler)
 {
 	struct pmu *pmu;
 	struct perf_event *event;
@@ -5299,6 +5300,17 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu,
 
 	event->state		= PERF_EVENT_STATE_INACTIVE;
 
+	if (task) {
+		event->attach_state = PERF_ATTACH_TASK;
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+		/*
+		 * hw_breakpoint is a bit difficult here..
+		 */
+		if (attr->type == PERF_TYPE_BREAKPOINT)
+			event->hw.bp_target = task;
+#endif
+	}
+
 	if (!overflow_handler && parent_event)
 		overflow_handler = parent_event->overflow_handler;
 
@@ -5559,7 +5571,7 @@ SYSCALL_DEFINE5(perf_event_open,
 		}
 	}
 
-	event = perf_event_alloc(&attr, cpu, group_leader, NULL, NULL);
+	event = perf_event_alloc(&attr, cpu, task, group_leader, NULL, NULL);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
 		goto err_task;
@@ -5728,7 +5740,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
 	 * Get the target context (task or percpu):
 	 */
 
-	event = perf_event_alloc(attr, cpu, NULL, NULL, overflow_handler);
+	event = perf_event_alloc(attr, cpu, task, NULL, NULL, overflow_handler);
 	if (IS_ERR(event)) {
 		err = PTR_ERR(event);
 		goto err;
@@ -5996,6 +6008,7 @@ inherit_event(struct perf_event *parent_event,
 
 	child_event = perf_event_alloc(&parent_event->attr,
 					   parent_event->cpu,
+					   child,
 					   group_leader, parent_event,
 					   NULL);
 	if (IS_ERR(child_event))
-- cgit v1.2.3

From 3b6e901f839f42afb40f614418df82c08b01320a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 14 Oct 2010 21:10:38 +0200
Subject: jump_label: Use more consistent naming

Now that there's still only a few users around, rename things to make them
more consistent.

Signed-off-by: Peter Zijlstra
LKML-Reference: <20101014203625.448565169@chello.nl>
Signed-off-by: Ingo Molnar
---
 kernel/tracepoint.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
(limited to 'kernel')

diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index d6073a50a6ca..e95ee7f31d43 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -265,10 +265,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,
 	 */
 	rcu_assign_pointer(elem->funcs, (*entry)->funcs);
 	if (!elem->state && active) {
-		enable_jump_label(&elem->state);
+		jump_label_enable(&elem->state);
 		elem->state = active;
 	} else if (elem->state && !active) {
-		disable_jump_label(&elem->state);
+		jump_label_disable(&elem->state);
 		elem->state = active;
 	}
 }
@@ -285,7 +285,7 @@ static void disable_tracepoint(struct tracepoint *elem)
 		elem->unregfunc();
 
 	if (elem->state) {
-		disable_jump_label(&elem->state);
+		jump_label_disable(&elem->state);
 		elem->state = 0;
 	}
 	rcu_assign_pointer(elem->funcs, NULL);
-- cgit v1.2.3

From 82cd6def9806dcb6a325fb6abbc1d61388a15f6a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 14 Oct 2010 17:57:23 +0200
Subject: perf: Use jump_labels to optimize the scheduler hooks

Trades a call + conditional + ret for an unconditional jmp.

Acked-by: Frederic Weisbecker
Signed-off-by: Peter Zijlstra
LKML-Reference: <20101014203625.501657727@chello.nl>
Signed-off-by: Ingo Molnar
---
 kernel/perf_event.c | 24 +++++++++---------------
 1 file changed, 9 insertions(+), 15 deletions(-)
(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 856e20baf13f..f7febb02ab97 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -34,7 +34,7 @@
 #include
 
-static atomic_t nr_events __read_mostly;
+atomic_t perf_task_events __read_mostly;
 static atomic_t nr_mmap_events __read_mostly;
 static atomic_t nr_comm_events __read_mostly;
 static atomic_t nr_task_events __read_mostly;
@@ -1311,8 +1311,8 @@ void perf_event_context_sched_out(struct task_struct *task, int ctxn,
  * accessing the event control register. If a NMI hits, then it will
  * not restart the event.
  */
-void perf_event_task_sched_out(struct task_struct *task,
-				 struct task_struct *next)
+void __perf_event_task_sched_out(struct task_struct *task,
+				 struct task_struct *next)
 {
 	int ctxn;
 
@@ -1337,14 +1337,6 @@ static void task_ctx_sched_out(struct perf_event_context *ctx,
 	cpuctx->task_ctx = NULL;
 }
 
-/*
- * Called with IRQs disabled
- */
-static void __perf_event_task_sched_out(struct perf_event_context *ctx)
-{
-	task_ctx_sched_out(ctx, EVENT_ALL);
-}
-
 /*
  * Called with IRQs disabled
  */
@@ -1494,7 +1486,7 @@ void perf_event_context_sched_in(struct perf_event_context *ctx)
  * accessing the event control register. If a NMI hits, then it will
  * keep the event running.
  */
-void perf_event_task_sched_in(struct task_struct *task)
+void __perf_event_task_sched_in(struct task_struct *task)
 {
 	struct perf_event_context *ctx;
 	int ctxn;
 
@@ -2216,7 +2208,8 @@ static void free_event(struct perf_event *event)
 	irq_work_sync(&event->pending);
 
 	if (!event->parent) {
-		atomic_dec(&nr_events);
+		if (event->attach_state & PERF_ATTACH_TASK)
+			jump_label_dec(&perf_task_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_dec(&nr_mmap_events);
 		if (event->attr.comm)
@@ -5354,7 +5347,8 @@ done:
 	event->pmu = pmu;
 
 	if (!event->parent) {
-		atomic_inc(&nr_events);
+		if (event->attach_state & PERF_ATTACH_TASK)
+			jump_label_inc(&perf_task_events);
 		if (event->attr.mmap || event->attr.mmap_data)
 			atomic_inc(&nr_mmap_events);
 		if (event->attr.comm)
@@ -5849,7 +5843,7 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
 	 * our context.
 	 */
 	child_ctx = child->perf_event_ctxp[ctxn];
-	__perf_event_task_sched_out(child_ctx);
+	task_ctx_sched_out(child_ctx, EVENT_ALL);
 
 	/*
 	 * Take the context lock here so that if find_get_context is
-- cgit v1.2.3

From 7e54a5a0b655734326dc78c2b5efc1eb35497bb6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Thu, 14 Oct 2010 22:32:45 +0200
Subject: perf: Optimize sw events

Acked-by: Frederic Weisbecker
Signed-off-by: Peter Zijlstra
LKML-Reference:
Signed-off-by: Ingo Molnar
---
 kernel/perf_event.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f7febb02ab97..05ecf6f7c672 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4669,7 +4669,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
 
 	WARN_ON(event->parent);
 
-	atomic_dec(&perf_swevent_enabled[event_id]);
+	jump_label_dec(&perf_swevent_enabled[event_id]);
 	swevent_hlist_put(event);
 }
 
@@ -4699,7 +4699,7 @@ static int perf_swevent_init(struct perf_event *event)
 		if (err)
 			return err;
 
-		atomic_inc(&perf_swevent_enabled[event_id]);
+		jump_label_inc(&perf_swevent_enabled[event_id]);
 		event->destroy = sw_perf_event_destroy;
 	}
-- cgit v1.2.3

From 7e40798f406fe73f9bac496a390daabd8768a8f7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt
Date: Tue, 19 Oct 2010 10:56:19 -0400
Subject: tracing: Fix compile issue for trace_sched_wakeup.c

The function start_func_tracer() was incorrectly added in the
#ifdef CONFIG_FUNCTION_TRACER condition, but is still used even when
function tracing is not enabled.

The calls to register_ftrace_function() and register_ftrace_graph()
become nops (and their arguments are even ignored), thus there is no
reason to hide start_func_tracer() when function tracing is not enabled.
Reported-by: Ingo Molnar
Signed-off-by: Steven Rostedt
---
 kernel/trace/trace_sched_wakeup.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)
(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 31689d2df7f3..7319559ed59f 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -130,6 +130,7 @@ static struct ftrace_ops trace_ops __read_mostly =
 {
 	.func = wakeup_tracer_call,
 };
+#endif /* CONFIG_FUNCTION_TRACER */
 
 static int start_func_tracer(int graph)
 {
@@ -159,8 +160,6 @@ static void stop_func_tracer(int graph)
 	unregister_ftrace_graph();
 }
 
-#endif /* CONFIG_FUNCTION_TRACER */
-
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 static int wakeup_set_flag(u32 old_flags, u32 bit, int set)
 {
-- cgit v1.2.3
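
For readers following the irq_work conversion in the patches above, the new interface boils down to four calls: init_irq_work() binds a callback to an entry, irq_work_queue() enqueues the entry locklessly (and is therefore safe from NMI context), irq_work_run() executes pending entries from the next hardirq, and irq_work_sync() waits for the callback to finish before the entry may be freed. The sketch below is an illustrative, hypothetical client and is not part of any patch in this series; the function and variable names are invented for the example, only the irq_work calls themselves come from the code shown above.

#include <linux/kernel.h>
#include <linux/irq_work.h>

/* Hypothetical example entry; perf uses event->pending the same way. */
static struct irq_work sample_work;

/* Callback: invoked later from irq_work_run(), i.e. hardirq context. */
static void sample_work_func(struct irq_work *work)
{
	pr_info("deferred work ran outside the NMI handler\n");
}

static void sample_init(void)
{
	/* Bind the callback once, before the entry can ever be queued. */
	init_irq_work(&sample_work, sample_work_func);
}

/* Called from an NMI-like context where no locks may be taken. */
static void sample_nmi_handler(void)
{
	/* Lockless enqueue; returns false if the entry is already pending. */
	irq_work_queue(&sample_work);
}

static void sample_teardown(void)
{
	/* Ensure the callback is no longer executing before freeing. */
	irq_work_sync(&sample_work);
}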