From 9fac2cf316b070ae43d2ae2525e381ff2d1d68aa Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 24 Jan 2013 16:10:27 +0100 Subject: perf/x86: Add flags to event constraints This patch adds a flags field to each event constraint. It can be used to store event specific features which can then later be used by scheduling code or low-level x86 code. The flags are propagated into event->hw.flags during the get_event_constraint() call. They are cleared during the put_event_constraint() call. This mechanism is going to be used by the PEBS-LL patches. It avoids defining yet another table to hold event specific information. Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: ak@linux.intel.com Cc: jolsa@redhat.com Cc: namhyung.kim@lge.com Link: http://lkml.kernel.org/r/1359040242-8269-4-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/kernel/cpu/perf_event_intel.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu/perf_event_intel.c') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index dab7580c47a..df3beaac339 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1392,8 +1392,11 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) if (x86_pmu.event_constraints) { for_each_event_constraint(c, x86_pmu.event_constraints) { - if ((event->hw.config & c->cmask) == c->code) + if ((event->hw.config & c->cmask) == c->code) { + /* hw.flags zeroed at initialization */ + event->hw.flags |= c->flags; return c; + } } } @@ -1438,6 +1441,7 @@ intel_put_shared_regs_event_constraints(struct cpu_hw_events *cpuc, static void intel_put_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) { + event->hw.flags = 0; intel_put_shared_regs_event_constraints(cpuc, event); } -- cgit v1.2.3-70-g09d2 From f20093eef5f7843a25adfc0512617d4b1ff1aa6e Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 24 Jan 2013 16:10:32 +0100 Subject: perf/x86: Add memory profiling via PEBS Load Latency This patch adds support for memory profiling using the PEBS Load Latency facility. Load accesses are sampled by HW and the instruction address, data address, load latency, data source, tlb, locked information can be saved in the sampling buffer if using the PERF_SAMPLE_COST (for latency), PERF_SAMPLE_ADDR, PERF_SAMPLE_DATA_SRC types. To enable PEBS Load Latency, users have to use the model specific event: - on NHM/WSM: MEM_INST_RETIRED:LATENCY_ABOVE_THRESHOLD - on SNB/IVB: MEM_TRANS_RETIRED:LATENCY_ABOVE_THRESHOLD To make things easier, this patch also exports a generic alias via sysfs: mem-loads. It export the right event encoding based on the host CPU and can be used directly by the perf tool. Loosely based on Intel's Lin Ming patch posted on LKML in July 2011. Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: ak@linux.intel.com Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: namhyung.kim@lge.com Link: http://lkml.kernel.org/r/1359040242-8269-9-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/include/uapi/asm/msr-index.h | 1 + arch/x86/kernel/cpu/perf_event.c | 5 +- arch/x86/kernel/cpu/perf_event.h | 25 +++++- arch/x86/kernel/cpu/perf_event_intel.c | 24 ++++++ arch/x86/kernel/cpu/perf_event_intel_ds.c | 133 ++++++++++++++++++++++++++++-- 5 files changed, 178 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event_intel.c') diff --git a/arch/x86/include/uapi/asm/msr-index.h b/arch/x86/include/uapi/asm/msr-index.h index 892ce40a747..b31798d5e62 100644 --- a/arch/x86/include/uapi/asm/msr-index.h +++ b/arch/x86/include/uapi/asm/msr-index.h @@ -71,6 +71,7 @@ #define MSR_IA32_PEBS_ENABLE 0x000003f1 #define MSR_IA32_DS_AREA 0x00000600 #define MSR_IA32_PERF_CAPABILITIES 0x00000345 +#define MSR_PEBS_LD_LAT_THRESHOLD 0x000003f6 #define MSR_MTRRfix64K_00000 0x00000250 #define MSR_MTRRfix16K_80000 0x00000258 diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 8ba51518f68..5ed7a4c5baf 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1363,7 +1363,7 @@ static __init struct attribute **merge_attr(struct attribute **a, struct attribu return new; } -static ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, +ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, char *page) { struct perf_pmu_events_attr *pmu_attr = \ @@ -1494,6 +1494,9 @@ static int __init init_hw_perf_events(void) x86_pmu.attr_rdpmc = 1; /* enable userspace RDPMC usage by default */ x86_pmu_format_group.attrs = x86_pmu.format_attrs; + if (x86_pmu.event_attrs) + x86_pmu_events_group.attrs = x86_pmu.event_attrs; + if (!x86_pmu.events_sysfs_show) x86_pmu_events_group.attrs = &empty_attrs; else diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 9686d38eb45..f3a9a94e4d2 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -46,6 +46,7 @@ enum extra_reg_type { EXTRA_REG_RSP_0 = 0, /* offcore_response_0 */ EXTRA_REG_RSP_1 = 1, /* offcore_response_1 */ EXTRA_REG_LBR = 2, /* lbr_select */ + EXTRA_REG_LDLAT = 3, /* ld_lat_threshold */ EXTRA_REG_MAX /* number of entries needed */ }; @@ -61,6 +62,10 @@ struct event_constraint { int overlap; int flags; }; +/* + * struct event_constraint flags + */ +#define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */ struct amd_nb { int nb_id; /* NorthBridge id */ @@ -233,6 +238,10 @@ struct cpu_hw_events { #define INTEL_UEVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) +#define INTEL_PLD_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) + #define EVENT_CONSTRAINT_END \ EVENT_CONSTRAINT(0, 0, 0) @@ -262,12 +271,22 @@ struct extra_reg { .msr = (ms), \ .config_mask = (m), \ .valid_mask = (vm), \ - .idx = EXTRA_REG_##i \ + .idx = EXTRA_REG_##i, \ } #define INTEL_EVENT_EXTRA_REG(event, msr, vm, idx) \ EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm, idx) +#define INTEL_UEVENT_EXTRA_REG(event, msr, vm, idx) \ + EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT | \ + ARCH_PERFMON_EVENTSEL_UMASK, vm, idx) + +#define INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(c) \ + INTEL_UEVENT_EXTRA_REG(c, \ + MSR_PEBS_LD_LAT_THRESHOLD, \ + 0xffff, \ + LDLAT) + #define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0, RSP_0) union perf_capabilities { @@ -357,6 +376,7 @@ struct x86_pmu { */ int attr_rdpmc; struct attribute **format_attrs; + struct attribute **event_attrs; ssize_t (*events_sysfs_show)(char *page, u64 config); struct attribute **cpu_events; @@ -648,6 +668,9 @@ int p6_pmu_init(void); int knc_pmu_init(void); +ssize_t events_sysfs_show(struct device *dev, struct device_attribute *attr, + char *page); + #else /* CONFIG_CPU_SUP_INTEL */ static inline void reserve_ds_buffers(void) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index df3beaac339..d5ea5a03cd3 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -81,6 +81,7 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = { INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), EVENT_EXTRA_END }; @@ -136,6 +137,7 @@ static struct extra_reg intel_westmere_extra_regs[] __read_mostly = { INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff, RSP_0), INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x100b), EVENT_EXTRA_END }; @@ -155,9 +157,23 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly = static struct extra_reg intel_snb_extra_regs[] __read_mostly = { INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0x3fffffffffull, RSP_0), INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0x3fffffffffull, RSP_1), + INTEL_UEVENT_PEBS_LDLAT_EXTRA_REG(0x01cd), EVENT_EXTRA_END }; +EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); +EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); + +struct attribute *nhm_events_attrs[] = { + EVENT_PTR(mem_ld_nhm), + NULL, +}; + +struct attribute *snb_events_attrs[] = { + EVENT_PTR(mem_ld_snb), + NULL, +}; + static u64 intel_pmu_event_map(int hw_event) { return intel_perfmon_event_map[hw_event]; @@ -2035,6 +2051,8 @@ __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; + x86_pmu.cpu_events = nhm_events_attrs; + /* UOPS_ISSUED.STALLED_CYCLES */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); @@ -2078,6 +2096,8 @@ __init int intel_pmu_init(void) x86_pmu.extra_regs = intel_westmere_extra_regs; x86_pmu.er_flags |= ERF_HAS_RSP_1; + x86_pmu.cpu_events = nhm_events_attrs; + /* UOPS_ISSUED.STALLED_CYCLES */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); @@ -2106,6 +2126,8 @@ __init int intel_pmu_init(void) x86_pmu.er_flags |= ERF_HAS_RSP_1; x86_pmu.er_flags |= ERF_NO_HT_SHARING; + x86_pmu.cpu_events = snb_events_attrs; + /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); @@ -2132,6 +2154,8 @@ __init int intel_pmu_init(void) x86_pmu.er_flags |= ERF_HAS_RSP_1; x86_pmu.er_flags |= ERF_NO_HT_SHARING; + x86_pmu.cpu_events = snb_events_attrs; + /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index f30d85bcbda..a6400bd0463 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -24,6 +24,92 @@ struct pebs_record_32 { */ +union intel_x86_pebs_dse { + u64 val; + struct { + unsigned int ld_dse:4; + unsigned int ld_stlb_miss:1; + unsigned int ld_locked:1; + unsigned int ld_reserved:26; + }; + struct { + unsigned int st_l1d_hit:1; + unsigned int st_reserved1:3; + unsigned int st_stlb_miss:1; + unsigned int st_locked:1; + unsigned int st_reserved2:26; + }; +}; + + +/* + * Map PEBS Load Latency Data Source encodings to generic + * memory data source information + */ +#define P(a, b) PERF_MEM_S(a, b) +#define OP_LH (P(OP, LOAD) | P(LVL, HIT)) +#define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) + +static const u64 pebs_data_source[] = { + P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ + OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ + OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ + OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ + OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ + OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ + OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ + OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ + OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ + OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ + OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ + OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ + OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */ + OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */ + OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */ + OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ +}; + +static u64 load_latency_data(u64 status) +{ + union intel_x86_pebs_dse dse; + u64 val; + int model = boot_cpu_data.x86_model; + int fam = boot_cpu_data.x86; + + dse.val = status; + + /* + * use the mapping table for bit 0-3 + */ + val = pebs_data_source[dse.ld_dse]; + + /* + * Nehalem models do not support TLB, Lock infos + */ + if (fam == 0x6 && (model == 26 || model == 30 + || model == 31 || model == 46)) { + val |= P(TLB, NA) | P(LOCK, NA); + return val; + } + /* + * bit 4: TLB access + * 0 = did not miss 2nd level TLB + * 1 = missed 2nd level TLB + */ + if (dse.ld_stlb_miss) + val |= P(TLB, MISS) | P(TLB, L2); + else + val |= P(TLB, HIT) | P(TLB, L1) | P(TLB, L2); + + /* + * bit 5: locked prefix + */ + if (dse.ld_locked) + val |= P(LOCK, LOCKED); + + return val; +} + struct pebs_record_core { u64 flags, ip; u64 ax, bx, cx, dx; @@ -364,7 +450,7 @@ struct event_constraint intel_atom_pebs_event_constraints[] = { }; struct event_constraint intel_nehalem_pebs_event_constraints[] = { - INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */ @@ -379,7 +465,7 @@ struct event_constraint intel_nehalem_pebs_event_constraints[] = { }; struct event_constraint intel_westmere_pebs_event_constraints[] = { - INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */ + INTEL_PLD_CONSTRAINT(0x100b, 0xf), /* MEM_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */ INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */ INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */ @@ -399,7 +485,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */ + INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ @@ -413,7 +499,7 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = { INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ - INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */ + INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ @@ -448,6 +534,9 @@ void intel_pmu_pebs_enable(struct perf_event *event) hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; cpuc->pebs_enabled |= 1ULL << hwc->idx; + + if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) + cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); } void intel_pmu_pebs_disable(struct perf_event *event) @@ -560,20 +649,48 @@ static void __intel_pmu_pebs_event(struct perf_event *event, struct pt_regs *iregs, void *__pebs) { /* - * We cast to pebs_record_core since that is a subset of - * both formats and we don't use the other fields in this - * routine. + * We cast to pebs_record_nhm to get the load latency data + * if extra_reg MSR_PEBS_LD_LAT_THRESHOLD used */ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct pebs_record_core *pebs = __pebs; + struct pebs_record_nhm *pebs = __pebs; struct perf_sample_data data; struct pt_regs regs; + u64 sample_type; + int fll; if (!intel_pmu_save_and_restart(event)) return; + fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT; + perf_sample_data_init(&data, 0, event->hw.last_period); + data.period = event->hw.last_period; + sample_type = event->attr.sample_type; + + /* + * if PEBS-LL or PreciseStore + */ + if (fll) { + if (sample_type & PERF_SAMPLE_ADDR) + data.addr = pebs->dla; + + /* + * Use latency for weight (only avail with PEBS-LL) + */ + if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) + data.weight = pebs->lat; + + /* + * data.data_src encodes the data source + */ + if (sample_type & PERF_SAMPLE_DATA_SRC) { + if (fll) + data.data_src.val = load_latency_data(pebs->dse); + } + } + /* * We use the interrupt regs as a base because the PEBS record * does not contain a full regs set, specifically it seems to -- cgit v1.2.3-70-g09d2 From a63fcab45273174e665e6a8c9fa1a79a9046d0d5 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 24 Jan 2013 16:10:33 +0100 Subject: perf/x86: Export PEBS load latency threshold register to sysfs Make the PEBS Load Latency threshold register layout and encoding visible to user level tools. Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: ak@linux.intel.com Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: namhyung.kim@lge.com Link: http://lkml.kernel.org/r/1359040242-8269-10-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/kernel/cpu/perf_event_intel.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel/cpu/perf_event_intel.c') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index d5ea5a03cd3..ae6096b175b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1781,6 +1781,8 @@ static void intel_pmu_flush_branch_stack(void) PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); +PMU_FORMAT_ATTR(ldlat, "config1:0-15"); + static struct attribute *intel_arch3_formats_attr[] = { &format_attr_event.attr, &format_attr_umask.attr, @@ -1791,6 +1793,7 @@ static struct attribute *intel_arch3_formats_attr[] = { &format_attr_cmask.attr, &format_attr_offcore_rsp.attr, /* XXX do NHM/WSM + SNB breakout */ + &format_attr_ldlat.attr, /* PEBS load latency */ NULL, }; -- cgit v1.2.3-70-g09d2 From 9ad64c0f481c37a63dd39842a0fd264bee44a097 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 24 Jan 2013 16:10:34 +0100 Subject: perf/x86: Add support for PEBS Precise Store This patch adds support for PEBS Precise Store which is available on Intel Sandy Bridge and Ivy Bridge processors. To use Precise store, the proper PEBS event must be used: mem_trans_retired:precise_stores. For the perf tool, the generic mem-stores event exported via sysfs can be used directly. Signed-off-by: Stephane Eranian Cc: peterz@infradead.org Cc: ak@linux.intel.com Cc: acme@redhat.com Cc: jolsa@redhat.com Cc: namhyung.kim@lge.com Link: http://lkml.kernel.org/r/1359040242-8269-11-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar Signed-off-by: Arnaldo Carvalho de Melo --- arch/x86/kernel/cpu/perf_event.h | 5 ++++ arch/x86/kernel/cpu/perf_event_intel.c | 2 ++ arch/x86/kernel/cpu/perf_event_intel_ds.c | 49 +++++++++++++++++++++++++++++-- 3 files changed, 54 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu/perf_event_intel.c') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index f3a9a94e4d2..ba9aadfa683 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -66,6 +66,7 @@ struct event_constraint { * struct event_constraint flags */ #define PERF_X86_EVENT_PEBS_LDLAT 0x1 /* ld+ldlat data address sampling */ +#define PERF_X86_EVENT_PEBS_ST 0x2 /* st data address sampling */ struct amd_nb { int nb_id; /* NorthBridge id */ @@ -242,6 +243,10 @@ struct cpu_hw_events { __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_LDLAT) +#define INTEL_PST_CONSTRAINT(c, n) \ + __EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK, \ + HWEIGHT(n), 0, PERF_X86_EVENT_PEBS_ST) + #define EVENT_CONSTRAINT_END \ EVENT_CONSTRAINT(0, 0, 0) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index ae6096b175b..e84c4ba44b5 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -163,6 +163,7 @@ static struct extra_reg intel_snb_extra_regs[] __read_mostly = { EVENT_ATTR_STR(mem-loads, mem_ld_nhm, "event=0x0b,umask=0x10,ldlat=3"); EVENT_ATTR_STR(mem-loads, mem_ld_snb, "event=0xcd,umask=0x1,ldlat=3"); +EVENT_ATTR_STR(mem-stores, mem_st_snb, "event=0xcd,umask=0x2"); struct attribute *nhm_events_attrs[] = { EVENT_PTR(mem_ld_nhm), @@ -171,6 +172,7 @@ struct attribute *nhm_events_attrs[] = { struct attribute *snb_events_attrs[] = { EVENT_PTR(mem_ld_snb), + EVENT_PTR(mem_st_snb), NULL, }; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index a6400bd0463..36dc13d1ad0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -69,6 +69,44 @@ static const u64 pebs_data_source[] = { OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ }; +static u64 precise_store_data(u64 status) +{ + union intel_x86_pebs_dse dse; + u64 val = P(OP, STORE) | P(SNOOP, NA) | P(LVL, L1) | P(TLB, L2); + + dse.val = status; + + /* + * bit 4: TLB access + * 1 = stored missed 2nd level TLB + * + * so it either hit the walker or the OS + * otherwise hit 2nd level TLB + */ + if (dse.st_stlb_miss) + val |= P(TLB, MISS); + else + val |= P(TLB, HIT); + + /* + * bit 0: hit L1 data cache + * if not set, then all we know is that + * it missed L1D + */ + if (dse.st_l1d_hit) + val |= P(LVL, HIT); + else + val |= P(LVL, MISS); + + /* + * bit 5: Locked prefix + */ + if (dse.st_locked) + val |= P(LOCK, LOCKED); + + return val; +} + static u64 load_latency_data(u64 status) { union intel_x86_pebs_dse dse; @@ -486,6 +524,7 @@ struct event_constraint intel_snb_pebs_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ + INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ @@ -500,6 +539,7 @@ struct event_constraint intel_ivb_pebs_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */ INTEL_PLD_CONSTRAINT(0x01cd, 0x8), /* MEM_TRANS_RETIRED.LAT_ABOVE_THR */ + INTEL_PST_CONSTRAINT(0x02cd, 0x8), /* MEM_TRANS_RETIRED.PRECISE_STORES */ INTEL_EVENT_CONSTRAINT(0xd0, 0xf), /* MEM_UOP_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */ INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ @@ -537,6 +577,8 @@ void intel_pmu_pebs_enable(struct perf_event *event) if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); + else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) + cpuc->pebs_enabled |= 1ULL << 63; } void intel_pmu_pebs_disable(struct perf_event *event) @@ -657,12 +699,13 @@ static void __intel_pmu_pebs_event(struct perf_event *event, struct perf_sample_data data; struct pt_regs regs; u64 sample_type; - int fll; + int fll, fst; if (!intel_pmu_save_and_restart(event)) return; fll = event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT; + fst = event->hw.flags & PERF_X86_EVENT_PEBS_ST; perf_sample_data_init(&data, 0, event->hw.last_period); @@ -672,7 +715,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, /* * if PEBS-LL or PreciseStore */ - if (fll) { + if (fll || fst) { if (sample_type & PERF_SAMPLE_ADDR) data.addr = pebs->dla; @@ -688,6 +731,8 @@ static void __intel_pmu_pebs_event(struct perf_event *event, if (sample_type & PERF_SAMPLE_DATA_SRC) { if (fll) data.data_src.val = load_latency_data(pebs->dse); + else + data.data_src.val = precise_store_data(pebs->dse); } } -- cgit v1.2.3-70-g09d2 From f8378f5259647710f0b4ecb814b0a1b0d9040de0 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 8 Mar 2013 15:22:48 -0800 Subject: perf/x86: Add Sandy Bridge constraints for CYCLE_ACTIVITY.* Add CYCLE_ACTIVITY.CYCLES_NO_DISPATCH/CYCLES_L1D_PENDING constraints. These recently documented events have restrictions to counter 0-3 and counter 2 respectively. The perf scheduler needs to know that to schedule them correctly. IvyBridge already has the necessary constraints. Signed-off-by: Andi Kleen Cc: a.p.zijlstra@chello.nl Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1362784968-12542-1-git-send-email-andi@firstfloor.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel/cpu/perf_event_intel.c') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index e84c4ba44b5..2ad2374d53d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -109,6 +109,8 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ + INTEL_UEVENT_CONSTRAINT(0x04a3, 0xf), /* CYCLE_ACTIVITY.CYCLES_NO_DISPATCH */ + INTEL_UEVENT_CONSTRAINT(0x02a3, 0x4), /* CYCLE_ACTIVITY.CYCLES_L1D_PENDING */ EVENT_CONSTRAINT_END }; -- cgit v1.2.3-70-g09d2