From a8b0ca17b80e92faab46ee7179ba9e99ccb61233 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 27 Jun 2011 14:41:57 +0200 Subject: perf: Remove the nmi parameter from the swevent and overflow interface The nmi parameter indicated if we could do wakeups from the current context, if not, we would set some state and self-IPI and let the resulting interrupt do the wakeup. For the various event classes: - hardware: nmi=0; PMI is in fact an NMI or we run irq_work_run from the PMI-tail (ARM etc.) - tracepoint: nmi=0; since tracepoint could be from NMI context. - software: nmi=[0,1]; some, like the schedule thing cannot perform wakeups, and hence need 0. As one can see, there is very little nmi=1 usage, and the down-side of not using it is that on some platforms some software events can have a jiffy delay in wakeup (when arch_irq_work_raise isn't implemented). The up-side however is that we can remove the nmi parameter and save a bunch of conditionals in fast paths. Signed-off-by: Peter Zijlstra Cc: Michael Cree Cc: Will Deacon Cc: Deng-Cheng Zhu Cc: Anton Blanchard Cc: Eric B Munson Cc: Heiko Carstens Cc: Paul Mundt Cc: David S. Miller Cc: Frederic Weisbecker Cc: Jason Wessel Cc: Don Zickus Link: http://lkml.kernel.org/n/tip-agjev8eu666tvknpb3iaj0fg@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu/perf_event_intel.c') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 41178c826c48..d38b0020f775 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1003,7 +1003,7 @@ again: data.period = event->hw.last_period; - if (perf_event_overflow(event, 1, &data, regs)) + if (perf_event_overflow(event, &data, regs)) x86_pmu_stop(event, 0); } -- cgit v1.2.3 From efc9f05df2dd171280dcb736a4d973ffefd5508e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 6 Jun 2011 16:57:03 +0200 Subject: perf_events: Update Intel extra regs shared constraints management This patch improves the code managing the extra shared registers used for offcore_response events on Intel Nehalem/Westmere. The idea is to use static allocation instead of dynamic allocation. This simplifies greatly the get and put constraint routines for those events. The patch also renames per_core to shared_regs because the same data structure gets used whether or not HT is on. When HT is off, those events still need to coordination because they use a extra MSR that has to be shared within an event group. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110606145703.GA7258@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 260 ++++++++++++++++----------------- 1 file changed, 128 insertions(+), 132 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event_intel.c') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index d38b0020f775..6ad95baff856 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1,25 +1,15 @@ #ifdef CONFIG_CPU_SUP_INTEL -#define MAX_EXTRA_REGS 2 - -/* - * Per register state. - */ -struct er_account { - int ref; /* reference count */ - unsigned int extra_reg; /* extra MSR number */ - u64 extra_config; /* extra MSR config */ -}; - /* - * Per core state - * This used to coordinate shared registers for HT threads. + * Per core/cpu state + * + * Used to coordinate shared registers between HT threads or + * among events on a single PMU. */ -struct intel_percore { - raw_spinlock_t lock; /* protect structure */ - struct er_account regs[MAX_EXTRA_REGS]; - int refcnt; /* number of threads */ - unsigned core_id; +struct intel_shared_regs { + struct er_account regs[EXTRA_REG_MAX]; + int refcnt; /* per-core: #HT threads */ + unsigned core_id; /* per-core: core id */ }; /* @@ -88,16 +78,10 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), EVENT_EXTRA_END }; -static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly = -{ - INTEL_EVENT_CONSTRAINT(0xb7, 0), - EVENT_CONSTRAINT_END -}; - static struct event_constraint intel_westmere_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ @@ -125,18 +109,11 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = static struct extra_reg intel_westmere_extra_regs[] __read_mostly = { - INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), - INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff), + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), EVENT_EXTRA_END }; -static struct event_constraint intel_westmere_percore_constraints[] __read_mostly = -{ - INTEL_EVENT_CONSTRAINT(0xb7, 0), - INTEL_EVENT_CONSTRAINT(0xbb, 0), - EVENT_CONSTRAINT_END -}; - static struct event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ @@ -1037,65 +1014,89 @@ intel_bts_constraints(struct perf_event *event) return NULL; } +/* + * manage allocation of shared extra msr for certain events + * + * sharing can be: + * per-cpu: to be shared between the various events on a single PMU + * per-core: per-cpu + shared by HT threads + */ static struct event_constraint * -intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) +__intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, + struct hw_perf_event_extra *reg) { - struct hw_perf_event *hwc = &event->hw; - unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT; - struct event_constraint *c; - struct intel_percore *pc; + struct event_constraint *c = &emptyconstraint; struct er_account *era; - int i; - int free_slot; - int found; - if (!x86_pmu.percore_constraints || hwc->extra_alloc) - return NULL; + /* already allocated shared msr */ + if (reg->alloc || !cpuc->shared_regs) + return &unconstrained; - for (c = x86_pmu.percore_constraints; c->cmask; c++) { - if (e != c->code) - continue; + era = &cpuc->shared_regs->regs[reg->idx]; + + raw_spin_lock(&era->lock); + + if (!atomic_read(&era->ref) || era->config == reg->config) { + + /* lock in msr value */ + era->config = reg->config; + era->reg = reg->reg; + + /* one more user */ + atomic_inc(&era->ref); + + /* no need to reallocate during incremental event scheduling */ + reg->alloc = 1; /* - * Allocate resource per core. + * All events using extra_reg are unconstrained. + * Avoids calling x86_get_event_constraints() + * + * Must revisit if extra_reg controlling events + * ever have constraints. Worst case we go through + * the regular event constraint table. */ - pc = cpuc->per_core; - if (!pc) - break; - c = &emptyconstraint; - raw_spin_lock(&pc->lock); - free_slot = -1; - found = 0; - for (i = 0; i < MAX_EXTRA_REGS; i++) { - era = &pc->regs[i]; - if (era->ref > 0 && hwc->extra_reg == era->extra_reg) { - /* Allow sharing same config */ - if (hwc->extra_config == era->extra_config) { - era->ref++; - cpuc->percore_used = 1; - hwc->extra_alloc = 1; - c = NULL; - } - /* else conflict */ - found = 1; - break; - } else if (era->ref == 0 && free_slot == -1) - free_slot = i; - } - if (!found && free_slot != -1) { - era = &pc->regs[free_slot]; - era->ref = 1; - era->extra_reg = hwc->extra_reg; - era->extra_config = hwc->extra_config; - cpuc->percore_used = 1; - hwc->extra_alloc = 1; - c = NULL; - } - raw_spin_unlock(&pc->lock); - return c; + c = &unconstrained; } + raw_spin_unlock(&era->lock); - return NULL; + return c; +} + +static void +__intel_shared_reg_put_constraints(struct cpu_hw_events *cpuc, + struct hw_perf_event_extra *reg) +{ + struct er_account *era; + + /* + * only put constraint if extra reg was actually + * allocated. Also takes care of event which do + * not use an extra shared reg + */ + if (!reg->alloc) + return; + + era = &cpuc->shared_regs->regs[reg->idx]; + + /* one fewer user */ + atomic_dec(&era->ref); + + /* allocate again next time */ + reg->alloc = 0; +} + +static struct event_constraint * +intel_shared_regs_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + struct event_constraint *c = NULL; + struct hw_perf_event_extra *xreg; + + xreg = &event->hw.extra_reg; + if (xreg->idx != EXTRA_REG_NONE) + c = __intel_shared_reg_get_constraints(cpuc, xreg); + return c; } static struct event_constraint * @@ -1111,49 +1112,28 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event if (c) return c; - c = intel_percore_constraints(cpuc, event); + c = intel_shared_regs_constraints(cpuc, event); if (c) return c; return x86_get_event_constraints(cpuc, event); } -static void intel_put_event_constraints(struct cpu_hw_events *cpuc, +static void +intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { - struct extra_reg *er; - struct intel_percore *pc; - struct er_account *era; - struct hw_perf_event *hwc = &event->hw; - int i, allref; - - if (!cpuc->percore_used) - return; + struct hw_perf_event_extra *reg; - for (er = x86_pmu.extra_regs; er->msr; er++) { - if (er->event != (hwc->config & er->config_mask)) - continue; + reg = &event->hw.extra_reg; + if (reg->idx != EXTRA_REG_NONE) + __intel_shared_reg_put_constraints(cpuc, reg); +} - pc = cpuc->per_core; - raw_spin_lock(&pc->lock); - for (i = 0; i < MAX_EXTRA_REGS; i++) { - era = &pc->regs[i]; - if (era->ref > 0 && - era->extra_config == hwc->extra_config && - era->extra_reg == er->msr) { - era->ref--; - hwc->extra_alloc = 0; - break; - } - } - allref = 0; - for (i = 0; i < MAX_EXTRA_REGS; i++) - allref += pc->regs[i].ref; - if (allref == 0) - cpuc->percore_used = 0; - raw_spin_unlock(&pc->lock); - break; - } +static void intel_put_event_constraints(struct cpu_hw_events *cpuc, + struct perf_event *event) +{ + intel_put_shared_regs_event_constraints(cpuc, event); } static int intel_pmu_hw_config(struct perf_event *event) @@ -1231,20 +1211,36 @@ static __initconst const struct x86_pmu core_pmu = { .event_constraints = intel_core_event_constraints, }; +static struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + struct intel_shared_regs *regs; + int i; + + regs = kzalloc_node(sizeof(struct intel_shared_regs), + GFP_KERNEL, cpu_to_node(cpu)); + if (regs) { + /* + * initialize the locks to keep lockdep happy + */ + for (i = 0; i < EXTRA_REG_MAX; i++) + raw_spin_lock_init(®s->regs[i].lock); + + regs->core_id = -1; + } + return regs; +} + static int intel_pmu_cpu_prepare(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - if (!cpu_has_ht_siblings()) + if (!x86_pmu.extra_regs) return NOTIFY_OK; - cpuc->per_core = kzalloc_node(sizeof(struct intel_percore), - GFP_KERNEL, cpu_to_node(cpu)); - if (!cpuc->per_core) + cpuc->shared_regs = allocate_shared_regs(cpu); + if (!cpuc->shared_regs) return NOTIFY_BAD; - raw_spin_lock_init(&cpuc->per_core->lock); - cpuc->per_core->core_id = -1; return NOTIFY_OK; } @@ -1260,32 +1256,34 @@ static void intel_pmu_cpu_starting(int cpu) */ intel_pmu_lbr_reset(); - if (!cpu_has_ht_siblings()) + if (!cpuc->shared_regs) return; for_each_cpu(i, topology_thread_cpumask(cpu)) { - struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core; + struct intel_shared_regs *pc; + pc = per_cpu(cpu_hw_events, i).shared_regs; if (pc && pc->core_id == core_id) { - kfree(cpuc->per_core); - cpuc->per_core = pc; + kfree(cpuc->shared_regs); + cpuc->shared_regs = pc; break; } } - cpuc->per_core->core_id = core_id; - cpuc->per_core->refcnt++; + cpuc->shared_regs->core_id = core_id; + cpuc->shared_regs->refcnt++; } static void intel_pmu_cpu_dying(int cpu) { struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu); - struct intel_percore *pc = cpuc->per_core; + struct intel_shared_regs *pc; + pc = cpuc->shared_regs; if (pc) { if (pc->core_id == -1 || --pc->refcnt == 0) kfree(pc); - cpuc->per_core = NULL; + cpuc->shared_regs = NULL; } fini_debug_store_on_cpu(cpu); @@ -1436,7 +1434,6 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_nehalem_event_constraints; x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints; - x86_pmu.percore_constraints = intel_nehalem_percore_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; @@ -1481,7 +1478,6 @@ static __init int intel_pmu_init(void) intel_pmu_lbr_init_nhm(); x86_pmu.event_constraints = intel_westmere_event_constraints; - x86_pmu.percore_constraints = intel_westmere_percore_constraints; x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; x86_pmu.extra_regs = intel_westmere_extra_regs; -- cgit v1.2.3 From cd8a38d33e2528998998bae70a45ad27e442f114 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 6 Jun 2011 16:57:08 +0200 Subject: perf_events: Fix validation of events using an extra reg The validate_group() function needs to validate events with extra shared regs. Within an event group, only events with the same value for the extra reg can co-exist. This was not checked by validate_group() because it was missing the shared_regs logic. This patch changes the allocation of the fake cpuc used for validation to also point to a fake shared_regs structure such that group events be properly testing. It modifies __intel_shared_reg_get_constraints() to use spin_lock_irqsave() to avoid lockdep issues. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110606145708.GA7279@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event_intel.c') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 6ad95baff856..ac02b83e8614 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1027,14 +1027,18 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, { struct event_constraint *c = &emptyconstraint; struct er_account *era; + unsigned long flags; /* already allocated shared msr */ - if (reg->alloc || !cpuc->shared_regs) + if (reg->alloc) return &unconstrained; era = &cpuc->shared_regs->regs[reg->idx]; - - raw_spin_lock(&era->lock); + /* + * we use spin_lock_irqsave() to avoid lockdep issues when + * passing a fake cpuc + */ + raw_spin_lock_irqsave(&era->lock, flags); if (!atomic_read(&era->ref) || era->config == reg->config) { @@ -1058,7 +1062,7 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, */ c = &unconstrained; } - raw_spin_unlock(&era->lock); + raw_spin_unlock_irqrestore(&era->lock, flags); return c; } @@ -1524,4 +1528,8 @@ static int intel_pmu_init(void) return 0; } +static struct intel_shared_regs *allocate_shared_regs(int cpu) +{ + return NULL; +} #endif /* CONFIG_CPU_SUP_INTEL */ -- cgit v1.2.3 From ee89cbc2d48150c7c0e9f2aaac00afde99af098c Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Mon, 6 Jun 2011 16:57:12 +0200 Subject: perf_events: Add Intel Sandy Bridge offcore_response low-level support This patch adds Intel Sandy Bridge offcore_response support by providing the low-level constraint table for those events. On Sandy Bridge, there are two offcore_response events. Each uses its own dedictated extra register. But those registers are NOT shared between sibling CPUs when HT is on unlike Nehalem/Westmere. They are always private to each CPU. But they still need to be controlled within an event group. All events within an event group must use the same value for the extra MSR. That's not controlled by the second patch in this series. Furthermore on Sandy Bridge, the offcore_response events have NO counter constraints contrary to what the official documentation indicates, so drop the events from the contraint table. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20110606145712.GA7304@quad Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event_intel.c') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index ac02b83e8614..a674ae45a472 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -100,8 +100,6 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ - INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */ - INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ EVENT_CONSTRAINT_END @@ -122,6 +120,12 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly = EVENT_CONSTRAINT_END }; +static struct extra_reg intel_snb_extra_regs[] __read_mostly = { + INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), + INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), + EVENT_EXTRA_END +}; + static u64 intel_pmu_event_map(int hw_event) { return intel_perfmon_event_map[hw_event]; @@ -1260,7 +1264,7 @@ static void intel_pmu_cpu_starting(int cpu) */ intel_pmu_lbr_reset(); - if (!cpuc->shared_regs) + if (!cpuc->shared_regs || x86_pmu.regs_no_ht_sharing) return; for_each_cpu(i, topology_thread_cpumask(cpu)) { @@ -1502,6 +1506,9 @@ static __init int intel_pmu_init(void) x86_pmu.event_constraints = intel_snb_event_constraints; x86_pmu.pebs_constraints = intel_snb_pebs_events; + x86_pmu.extra_regs = intel_snb_extra_regs; + /* all extra regs are per-cpu when HT is on */ + x86_pmu.regs_no_ht_sharing = true; /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; -- cgit v1.2.3 From b79e8941fb9af07d810da91b4e29da2bba331b6e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 May 2011 11:08:15 +0200 Subject: perf, intel: Try alternative OFFCORE encodings Since the OFFCORE registers are fully symmetric, try the other one when the specified one is already in use. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1306141897.18455.8.camel@twins Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 44 ++++++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event_intel.c') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index a674ae45a472..5c448622468c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1018,6 +1018,29 @@ intel_bts_constraints(struct perf_event *event) return NULL; } +static bool intel_try_alt_er(struct perf_event *event, int orig_idx) +{ + if (!(x86_pmu.er_flags & ERF_HAS_RSP_1)) + return false; + + if (event->hw.extra_reg.idx == EXTRA_REG_RSP_0) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= 0x01bb; + event->hw.extra_reg.idx = EXTRA_REG_RSP_1; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_1; + } else if (event->hw.extra_reg.idx == EXTRA_REG_RSP_1) { + event->hw.config &= ~INTEL_ARCH_EVENT_MASK; + event->hw.config |= 0x01b7; + event->hw.extra_reg.idx = EXTRA_REG_RSP_0; + event->hw.extra_reg.reg = MSR_OFFCORE_RSP_0; + } + + if (event->hw.extra_reg.idx == orig_idx) + return false; + + return true; +} + /* * manage allocation of shared extra msr for certain events * @@ -1027,16 +1050,19 @@ intel_bts_constraints(struct perf_event *event) */ static struct event_constraint * __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, - struct hw_perf_event_extra *reg) + struct perf_event *event) { struct event_constraint *c = &emptyconstraint; + struct hw_perf_event_extra *reg = &event->hw.extra_reg; struct er_account *era; unsigned long flags; + int orig_idx = reg->idx; /* already allocated shared msr */ if (reg->alloc) return &unconstrained; +again: era = &cpuc->shared_regs->regs[reg->idx]; /* * we use spin_lock_irqsave() to avoid lockdep issues when @@ -1065,6 +1091,9 @@ __intel_shared_reg_get_constraints(struct cpu_hw_events *cpuc, * the regular event constraint table. */ c = &unconstrained; + } else if (intel_try_alt_er(event, orig_idx)) { + raw_spin_unlock(&era->lock); + goto again; } raw_spin_unlock_irqrestore(&era->lock, flags); @@ -1099,11 +1128,10 @@ intel_shared_regs_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { struct event_constraint *c = NULL; - struct hw_perf_event_extra *xreg; - xreg = &event->hw.extra_reg; - if (xreg->idx != EXTRA_REG_NONE) - c = __intel_shared_reg_get_constraints(cpuc, xreg); + if (event->hw.extra_reg.idx != EXTRA_REG_NONE) + c = __intel_shared_reg_get_constraints(cpuc, event); + return c; } @@ -1264,7 +1292,7 @@ static void intel_pmu_cpu_starting(int cpu) */ intel_pmu_lbr_reset(); - if (!cpuc->shared_regs || x86_pmu.regs_no_ht_sharing) + if (!cpuc->shared_regs || (x86_pmu.er_flags & ERF_NO_HT_SHARING)) return; for_each_cpu(i, topology_thread_cpumask(cpu)) { @@ -1489,6 +1517,7 @@ static __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; x86_pmu.extra_regs = intel_westmere_extra_regs; + x86_pmu.er_flags |= ERF_HAS_RSP_1; /* UOPS_ISSUED.STALLED_CYCLES */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; @@ -1508,7 +1537,8 @@ static __init int intel_pmu_init(void) x86_pmu.pebs_constraints = intel_snb_pebs_events; x86_pmu.extra_regs = intel_snb_extra_regs; /* all extra regs are per-cpu when HT is on */ - x86_pmu.regs_no_ht_sharing = true; + x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.er_flags |= ERF_NO_HT_SHARING; /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; -- cgit v1.2.3 From 89d6c0b5bdbb1927775584dcf532d98b3efe1477 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 22 Apr 2011 23:37:06 +0200 Subject: perf, arch: Add generic NODE cache events Add a NODE level to the generic cache events which is used to measure local vs remote memory accesses. Like all other cache events, an ACCESS is HIT+MISS, if there is no way to distinguish between reads and writes do reads only etc.. The below needs filling out for !x86 (which I filled out with unsupported events). I'm fairly sure ARM can leave it like that since it doesn't strike me as an architecture that even has NUMA support. SH might have something since it does appear to have some NUMA bits. Sparc64, PowerPC and MIPS certainly want a good look there since they clearly are NUMA capable. Signed-off-by: Peter Zijlstra Cc: David Miller Cc: Anton Blanchard Cc: David Daney Cc: Deng-Cheng Zhu Cc: Paul Mundt Cc: Will Deacon Cc: Robert Richter Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1303508226.4865.8.camel@laptop Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 59 +++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu/perf_event_intel.c') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 5c448622468c..bf6f92f7c19d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -226,6 +226,21 @@ static __initconst const u64 snb_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, + }; static __initconst const u64 westmere_hw_cache_event_ids @@ -327,6 +342,20 @@ static __initconst const u64 westmere_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, }; /* @@ -379,7 +408,21 @@ static __initconst const u64 nehalem_hw_cache_extra_regs [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS, [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS, }, - } + }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_ALL_DRAM, + [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_REMOTE_DRAM, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_ALL_DRAM, + [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_REMOTE_DRAM, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_ALL_DRAM, + [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_REMOTE_DRAM, + }, + }, }; static __initconst const u64 nehalem_hw_cache_event_ids @@ -481,6 +524,20 @@ static __initconst const u64 nehalem_hw_cache_event_ids [ C(RESULT_MISS) ] = -1, }, }, + [ C(NODE) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = 0x01b7, + [ C(RESULT_MISS) ] = 0x01b7, + }, + }, }; static __initconst const u64 core2_hw_cache_event_ids -- cgit v1.2.3 From 0af3ac1fdb9d5c297b4b07c9e0172531d42b6716 Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Wed, 29 Jun 2011 18:42:36 +0300 Subject: x86, perf: Add constraints for architectural PMU The v1 PMU does not have any fixed counters. Using the v2 constraints, which do have fixed counters, causes an additional choice to be present in the weight calculation, but not when actually scheduling the event, leading to an event being not scheduled at all. Signed-off-by: Avi Kivity Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1309362157-6596-3-git-send-email-avi@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event_intel.c') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index bf6f92f7c19d..45fbb8f7f549 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -112,6 +112,11 @@ static struct extra_reg intel_westmere_extra_regs[] __read_mostly = EVENT_EXTRA_END }; +static struct event_constraint intel_v1_event_constraints[] __read_mostly = +{ + EVENT_CONSTRAINT_END +}; + static struct event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ @@ -1606,11 +1611,19 @@ static __init int intel_pmu_init(void) break; default: - /* - * default constraints for v2 and up - */ - x86_pmu.event_constraints = intel_gen_event_constraints; - pr_cont("generic architected perfmon, "); + switch (x86_pmu.version) { + case 1: + x86_pmu.event_constraints = intel_v1_event_constraints; + pr_cont("generic architected perfmon v1, "); + break; + default: + /* + * default constraints for v2 and up + */ + x86_pmu.event_constraints = intel_gen_event_constraints; + pr_cont("generic architected perfmon, "); + break; + } } return 0; } -- cgit v1.2.3