From 2239291aeb0379fe47980b0e560e0eb9fd7e82ec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 21 Apr 2010 12:27:06 -0400 Subject: tracing: Remove per event trace registering This patch removes the register functions of TRACE_EVENT() to enable and disable tracepoints. The registering of a event is now down directly in the trace_events.c file. The tracepoint_probe_register() is now called directly. The prototypes are no longer type checked, but this should not be an issue since the tracepoints are created automatically by the macros. If a prototype is incorrect in the TRACE_EVENT() macro, then other macros will catch it. The trace_event_class structure now holds the probes to be called by the callbacks. This removes needing to have each event have a separate pointer for the probe. To handle kprobes and syscalls, since they register probes in a different manner, a "reg" field is added to the ftrace_event_class structure. If the "reg" field is assigned, then it will be called for enabling and disabling of the probe for either ftrace or perf. To let the reg function know what is happening, a new enum (trace_reg) is created that has the type of control that is needed. With this new rework, the 82 kernel events and 618 syscall events has their footprint dramatically lowered: text data bss dec hex filename 4913961 1088356 861512 6863829 68bbd5 vmlinux.orig 4914025 1088868 861512 6864405 68be15 vmlinux.class 4918492 1084612 861512 6864616 68bee8 vmlinux.tracepoint 4900252 1057412 861512 6819176 680d68 vmlinux.regs The size went from 6863829 to 6819176, that's a total of 44K in savings. With tracepoints being continuously added, this is critical that the footprint becomes minimal. v5: Added #ifdef CONFIG_PERF_EVENTS around a reference to perf specific structure in trace_events.c. v4: Fixed trace self tests to check probe because regfunc no longer exists. v3: Updated to handle void *data in beginning of probe parameters. Also added the tracepoint: check_trace_callback_type_##call(). v2: Changed the callback probes to pass void * and typecast the value within the function. Acked-by: Mathieu Desnoyers Acked-by: Masami Hiramatsu Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_event_perf.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'kernel/trace/trace_event_perf.c') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 0565bb42566f..196fe9d26773 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -49,7 +49,12 @@ static int perf_trace_event_enable(struct ftrace_event_call *event) rcu_assign_pointer(perf_trace_buf_nmi, buf); } - ret = event->perf_event_enable(event); + if (event->class->reg) + ret = event->class->reg(event, TRACE_REG_PERF_REGISTER); + else + ret = tracepoint_probe_register(event->name, + event->class->perf_probe, + event); if (!ret) { total_ref_count++; return 0; @@ -75,7 +80,8 @@ int perf_trace_enable(int event_id) mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id && event->perf_event_enable && + if (event->id == event_id && + event->class && event->class->perf_probe && try_module_get(event->mod)) { ret = perf_trace_event_enable(event); break; @@ -93,7 +99,10 @@ static void perf_trace_event_disable(struct ftrace_event_call *event) if (--event->perf_refcount > 0) return; - event->perf_event_disable(event); + if (event->class->reg) + event->class->reg(event, TRACE_REG_PERF_UNREGISTER); + else + tracepoint_probe_unregister(event->name, event->class->perf_probe, event); if (!--total_ref_count) { buf = perf_trace_buf; -- cgit v1.2.3 From 32c0edaeaad74a7883e736ae0f3798784cfc2a80 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 23 Apr 2010 10:38:03 -0400 Subject: tracing: Remove duplicate id information in event structure Now that the trace_event structure is embedded in the ftrace_event_call structure, there is no need for the ftrace_event_call id field. The id field is the same as the trace_event type field. Removing the id and re-arranging the structure brings down the tracepoint footprint by another 5K. text data bss dec hex filename 4913961 1088356 861512 6863829 68bbd5 vmlinux.orig 4895024 1023812 861512 6780348 6775bc vmlinux.print 4894944 1018052 861512 6774508 675eec vmlinux.id Acked-by: Mathieu Desnoyers Acked-by: Masami Hiramatsu Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_event_perf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/trace/trace_event_perf.c') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 196fe9d26773..0a47e8d6b491 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -80,7 +80,7 @@ int perf_trace_enable(int event_id) mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id && + if (event->event.type == event_id && event->class && event->class->perf_probe && try_module_get(event->mod)) { ret = perf_trace_event_enable(event); @@ -128,7 +128,7 @@ void perf_trace_disable(int event_id) mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) { + if (event->event.type == event_id) { perf_trace_event_disable(event); module_put(event->mod); break; -- cgit v1.2.3 From 4f41c013f553957765902fb01475972f0af3e8e7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 18 May 2010 18:08:32 +0200 Subject: perf/ftrace: Optimize perf/tracepoint interaction for single events When we've got but a single event per tracepoint there is no reason to try and multiplex it so don't. Signed-off-by: Peter Zijlstra Tested-by: Ingo Molnar Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'kernel/trace/trace_event_perf.c') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 0565bb42566f..89b780a7c522 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -27,13 +27,15 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) /* Count the events in use (per event id, not per instance) */ static int total_ref_count; -static int perf_trace_event_enable(struct ftrace_event_call *event) +static int perf_trace_event_enable(struct ftrace_event_call *event, void *data) { char *buf; int ret = -ENOMEM; - if (event->perf_refcount++ > 0) + if (event->perf_refcount++ > 0) { + event->perf_data = NULL; return 0; + } if (!total_ref_count) { buf = (char *)alloc_percpu(perf_trace_t); @@ -51,6 +53,7 @@ static int perf_trace_event_enable(struct ftrace_event_call *event) ret = event->perf_event_enable(event); if (!ret) { + event->perf_data = data; total_ref_count++; return 0; } @@ -68,7 +71,7 @@ fail_buf: return ret; } -int perf_trace_enable(int event_id) +int perf_trace_enable(int event_id, void *data) { struct ftrace_event_call *event; int ret = -EINVAL; @@ -77,7 +80,7 @@ int perf_trace_enable(int event_id) list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id && event->perf_event_enable && try_module_get(event->mod)) { - ret = perf_trace_event_enable(event); + ret = perf_trace_event_enable(event, data); break; } } -- cgit v1.2.3 From b7e2ecef92d2e7785e6d76b41e5ba8bcbc45259d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 May 2010 10:52:27 +0200 Subject: perf, trace: Optimize tracepoints by removing IRQ-disable from perf/tracepoint interaction Improves performance. Acked-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: <1274259525.5605.10352.camel@twins> Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 73 ++++++++++++++++------------------------- 1 file changed, 29 insertions(+), 44 deletions(-) (limited to 'kernel/trace/trace_event_perf.c') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 89b780a7c522..a1304f8c4440 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -9,13 +9,9 @@ #include #include "trace.h" -DEFINE_PER_CPU(struct pt_regs, perf_trace_regs); -EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs); - EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); -static char *perf_trace_buf; -static char *perf_trace_buf_nmi; +static char *perf_trace_buf[4]; /* * Force it to be aligned to unsigned long to avoid misaligned accesses @@ -29,7 +25,6 @@ static int total_ref_count; static int perf_trace_event_enable(struct ftrace_event_call *event, void *data) { - char *buf; int ret = -ENOMEM; if (event->perf_refcount++ > 0) { @@ -38,17 +33,16 @@ static int perf_trace_event_enable(struct ftrace_event_call *event, void *data) } if (!total_ref_count) { - buf = (char *)alloc_percpu(perf_trace_t); - if (!buf) - goto fail_buf; - - rcu_assign_pointer(perf_trace_buf, buf); + char *buf; + int i; - buf = (char *)alloc_percpu(perf_trace_t); - if (!buf) - goto fail_buf_nmi; + for (i = 0; i < 4; i++) { + buf = (char *)alloc_percpu(perf_trace_t); + if (!buf) + goto fail_buf; - rcu_assign_pointer(perf_trace_buf_nmi, buf); + rcu_assign_pointer(perf_trace_buf[i], buf); + } } ret = event->perf_event_enable(event); @@ -58,14 +52,15 @@ static int perf_trace_event_enable(struct ftrace_event_call *event, void *data) return 0; } -fail_buf_nmi: +fail_buf: if (!total_ref_count) { - free_percpu(perf_trace_buf_nmi); - free_percpu(perf_trace_buf); - perf_trace_buf_nmi = NULL; - perf_trace_buf = NULL; + int i; + + for (i = 0; i < 4; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } } -fail_buf: event->perf_refcount--; return ret; @@ -91,19 +86,19 @@ int perf_trace_enable(int event_id, void *data) static void perf_trace_event_disable(struct ftrace_event_call *event) { - char *buf, *nmi_buf; - if (--event->perf_refcount > 0) return; event->perf_event_disable(event); if (!--total_ref_count) { - buf = perf_trace_buf; - rcu_assign_pointer(perf_trace_buf, NULL); + char *buf[4]; + int i; - nmi_buf = perf_trace_buf_nmi; - rcu_assign_pointer(perf_trace_buf_nmi, NULL); + for (i = 0; i < 4; i++) { + buf[i] = perf_trace_buf[i]; + rcu_assign_pointer(perf_trace_buf[i], NULL); + } /* * Ensure every events in profiling have finished before @@ -111,8 +106,8 @@ static void perf_trace_event_disable(struct ftrace_event_call *event) */ synchronize_sched(); - free_percpu(buf); - free_percpu(nmi_buf); + for (i = 0; i < 4; i++) + free_percpu(buf[i]); } } @@ -132,47 +127,37 @@ void perf_trace_disable(int event_id) } __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, - int *rctxp, unsigned long *irq_flags) + struct pt_regs *regs, int *rctxp) { struct trace_entry *entry; char *trace_buf, *raw_data; - int pc, cpu; + int pc; BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); pc = preempt_count(); - /* Protect the per cpu buffer, begin the rcu read side */ - local_irq_save(*irq_flags); - *rctxp = perf_swevent_get_recursion_context(); if (*rctxp < 0) goto err_recursion; - cpu = smp_processor_id(); - - if (in_nmi()) - trace_buf = rcu_dereference_sched(perf_trace_buf_nmi); - else - trace_buf = rcu_dereference_sched(perf_trace_buf); - + trace_buf = rcu_dereference_sched(perf_trace_buf[*rctxp]); if (!trace_buf) goto err; - raw_data = per_cpu_ptr(trace_buf, cpu); + raw_data = per_cpu_ptr(trace_buf, smp_processor_id()); /* zero the dead bytes from align to not leak stack to user */ memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); entry = (struct trace_entry *)raw_data; - tracing_generic_entry_update(entry, *irq_flags, pc); + tracing_generic_entry_update(entry, regs->flags, pc); entry->type = type; return raw_data; err: perf_swevent_put_recursion_context(*rctxp); err_recursion: - local_irq_restore(*irq_flags); return NULL; } EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); -- cgit v1.2.3 From 1c024eca51fdc965290acf342ae16a476c2189d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 19 May 2010 14:02:22 +0200 Subject: perf, trace: Optimize tracepoints by using per-tracepoint-per-cpu hlist to track events Avoid the swevent hash-table by using per-tracepoint hlists. Also, avoid conditionals on the fast path by ordering with probe unregister so that we should never get on the callback path without the data being there. Signed-off-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Mike Galbraith Cc: Steven Rostedt LKML-Reference: <20100521090710.473188012@chello.nl> Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 127 +++++++++++++++++++++------------------- 1 file changed, 67 insertions(+), 60 deletions(-) (limited to 'kernel/trace/trace_event_perf.c') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index a1304f8c4440..39d5ea7b0653 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -23,14 +23,25 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) /* Count the events in use (per event id, not per instance) */ static int total_ref_count; -static int perf_trace_event_enable(struct ftrace_event_call *event, void *data) +static int perf_trace_event_init(struct ftrace_event_call *tp_event, + struct perf_event *p_event) { + struct hlist_head *list; int ret = -ENOMEM; + int cpu; - if (event->perf_refcount++ > 0) { - event->perf_data = NULL; + p_event->tp_event = tp_event; + if (tp_event->perf_refcount++ > 0) return 0; - } + + list = alloc_percpu(struct hlist_head); + if (!list) + goto fail; + + for_each_possible_cpu(cpu) + INIT_HLIST_HEAD(per_cpu_ptr(list, cpu)); + + tp_event->perf_events = list; if (!total_ref_count) { char *buf; @@ -39,20 +50,20 @@ static int perf_trace_event_enable(struct ftrace_event_call *event, void *data) for (i = 0; i < 4; i++) { buf = (char *)alloc_percpu(perf_trace_t); if (!buf) - goto fail_buf; + goto fail; - rcu_assign_pointer(perf_trace_buf[i], buf); + perf_trace_buf[i] = buf; } } - ret = event->perf_event_enable(event); - if (!ret) { - event->perf_data = data; - total_ref_count++; - return 0; - } + ret = tp_event->perf_event_enable(tp_event); + if (ret) + goto fail; -fail_buf: + total_ref_count++; + return 0; + +fail: if (!total_ref_count) { int i; @@ -61,21 +72,26 @@ fail_buf: perf_trace_buf[i] = NULL; } } - event->perf_refcount--; + + if (!--tp_event->perf_refcount) { + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + } return ret; } -int perf_trace_enable(int event_id, void *data) +int perf_trace_init(struct perf_event *p_event) { - struct ftrace_event_call *event; + struct ftrace_event_call *tp_event; + int event_id = p_event->attr.config; int ret = -EINVAL; mutex_lock(&event_mutex); - list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id && event->perf_event_enable && - try_module_get(event->mod)) { - ret = perf_trace_event_enable(event, data); + list_for_each_entry(tp_event, &ftrace_events, list) { + if (tp_event->id == event_id && tp_event->perf_event_enable && + try_module_get(tp_event->mod)) { + ret = perf_trace_event_init(tp_event, p_event); break; } } @@ -84,53 +100,52 @@ int perf_trace_enable(int event_id, void *data) return ret; } -static void perf_trace_event_disable(struct ftrace_event_call *event) +int perf_trace_enable(struct perf_event *p_event) { - if (--event->perf_refcount > 0) - return; + struct ftrace_event_call *tp_event = p_event->tp_event; + struct hlist_head *list; - event->perf_event_disable(event); + list = tp_event->perf_events; + if (WARN_ON_ONCE(!list)) + return -EINVAL; - if (!--total_ref_count) { - char *buf[4]; - int i; - - for (i = 0; i < 4; i++) { - buf[i] = perf_trace_buf[i]; - rcu_assign_pointer(perf_trace_buf[i], NULL); - } + list = per_cpu_ptr(list, smp_processor_id()); + hlist_add_head_rcu(&p_event->hlist_entry, list); - /* - * Ensure every events in profiling have finished before - * releasing the buffers - */ - synchronize_sched(); + return 0; +} - for (i = 0; i < 4; i++) - free_percpu(buf[i]); - } +void perf_trace_disable(struct perf_event *p_event) +{ + hlist_del_rcu(&p_event->hlist_entry); } -void perf_trace_disable(int event_id) +void perf_trace_destroy(struct perf_event *p_event) { - struct ftrace_event_call *event; + struct ftrace_event_call *tp_event = p_event->tp_event; + int i; - mutex_lock(&event_mutex); - list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) { - perf_trace_event_disable(event); - module_put(event->mod); - break; + if (--tp_event->perf_refcount > 0) + return; + + tp_event->perf_event_disable(tp_event); + + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + + if (!--total_ref_count) { + for (i = 0; i < 4; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; } } - mutex_unlock(&event_mutex); } __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, struct pt_regs *regs, int *rctxp) { struct trace_entry *entry; - char *trace_buf, *raw_data; + char *raw_data; int pc; BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); @@ -139,13 +154,9 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, *rctxp = perf_swevent_get_recursion_context(); if (*rctxp < 0) - goto err_recursion; - - trace_buf = rcu_dereference_sched(perf_trace_buf[*rctxp]); - if (!trace_buf) - goto err; + return NULL; - raw_data = per_cpu_ptr(trace_buf, smp_processor_id()); + raw_data = per_cpu_ptr(perf_trace_buf[*rctxp], smp_processor_id()); /* zero the dead bytes from align to not leak stack to user */ memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); @@ -155,9 +166,5 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, entry->type = type; return raw_data; -err: - perf_swevent_put_recursion_context(*rctxp); -err_recursion: - return NULL; } EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); -- cgit v1.2.3 From 87f44bbc246c5244c76a701f8eefba7788bce64a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 May 2010 11:02:55 +0200 Subject: perf, trace: Fix !x86 build bug Patch b7e2ecef92 (perf, trace: Optimize tracepoints by removing IRQ-disable from perf/tracepoint interaction) made the unfortunate mistake of assuming the world is x86 only, correct this. The problem was that perf_fetch_caller_regs() did local_save_flags() into regs->flags, and I re-used that to remove another local_save_flags(), forgetting !x86 doesn't have regs->flags. Do the reverse, remove the local_save_flags() from perf_fetch_caller_regs() and let the ftrace site do the local_save_flags() instead. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Cc: acme@redhat.com Cc: efault@gmx.de Cc: fweisbec@gmail.com Cc: rostedt@goodmis.org LKML-Reference: <1274778175.5882.623.camel@twins> Signed-off-by: Ingo Molnar --- kernel/trace/trace_event_perf.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'kernel/trace/trace_event_perf.c') diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 26b8607a0abc..cb6f365016e4 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -157,6 +157,7 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, struct pt_regs *regs, int *rctxp) { struct trace_entry *entry; + unsigned long flags; char *raw_data; int pc; @@ -174,7 +175,8 @@ __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); entry = (struct trace_entry *)raw_data; - tracing_generic_entry_update(entry, regs->flags, pc); + local_save_flags(flags); + tracing_generic_entry_update(entry, flags, pc); entry->type = type; return raw_data; -- cgit v1.2.3