| author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-06-11 14:01:07 -0700 | 
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-06-11 14:01:07 -0700 | 
| commit | 8a1ca8cedd108c8e76a6ab34079d0bbb4f244799 (patch) | |
| tree | 636c715524f1718599209cc289908ea44b6cb859 | |
| parent | b640f042faa2a2fad6464f259a8afec06e2f6386 (diff) | |
| parent | 940010c5a314a7bd9b498593bc6ba1718ac5aec5 (diff) | |
Merge branch 'perfcounters-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip
* 'perfcounters-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/linux-2.6-tip: (574 commits)
  perf_counter: Turn off by default
  perf_counter: Add counter->id to the throttle event
  perf_counter: Better align code
  perf_counter: Rename L2 to LL cache
  perf_counter: Standardize event names
  perf_counter: Rename enums
  perf_counter tools: Clean up u64 usage
  perf_counter: Rename perf_counter_limit sysctl
  perf_counter: More paranoia settings
  perf_counter: powerpc: Implement generalized cache events for POWER processors
  perf_counters: powerpc: Add support for POWER7 processors
  perf_counter: Accurate period data
  perf_counter: Introduce struct for sample data
  perf_counter tools: Normalize data using per sample period data
  perf_counter: Annotate exit ctx recursion
  perf_counter tools: Propagate signals properly
  perf_counter tools: Small frequency related fixes
  perf_counter: More aggressive frequency adjustment
  perf_counter/x86: Fix the model number of Intel Core2 processors
  perf_counter, x86: Correct some event and umask values for Intel processors
  ...
138 files changed, 27406 insertions, 85 deletions
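To orient readers before the diff itself: this merge wires the new `perf_counter_open` system call into the powerpc syscall tables (number 319, per the unistd.h hunk below) and adds a `hw_perf_cache_event()` helper that unpacks generalized cache events encoded as `type | (op << 8) | (result << 16)`. The following userspace sketch is editorial, not part of the commit; it assumes the 2009-era `<linux/perf_counter.h>` ABI (`struct perf_counter_attr` with the `type`, `size`, `config`, and `exclude_kernel` fields, and the `PERF_TYPE_*`/`PERF_COUNT_HW_*` constants used in the hunks) and a raw `syscall(2)` invocation, since no libc wrapper existed at the time.

```c
/*
 * Editorial sketch (not part of this commit): open one hardware
 * counter for CPU cycles and one generalized cache counter for L1D
 * read misses on the calling task, then read both.  Assumes the
 * 2009-era perf_counter ABI introduced by this merge; 319 is
 * __NR_perf_counter_open from the powerpc unistd.h hunk below.
 */
#include <linux/perf_counter.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

static int counter_open(struct perf_counter_attr *attr)
{
	/* pid 0 = current task, cpu -1 = any CPU, no group, no flags */
	return syscall(319, attr, 0, -1, -1, 0);
}

int main(void)
{
	struct perf_counter_attr attr;
	unsigned long long count;
	int cyc_fd, l1d_fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_CPU_CYCLES;
	attr.exclude_kernel = 1;  /* becomes MMCR0_FCS in hw_perf_enable() */
	cyc_fd = counter_open(&attr);

	/*
	 * Cache events are packed as type | (op << 8) | (result << 16),
	 * exactly as hw_perf_cache_event() unpacks them below.
	 * exclude_kernel stays set from above.
	 */
	attr.type = PERF_TYPE_HW_CACHE;
	attr.config = PERF_COUNT_HW_CACHE_L1D |
		      (PERF_COUNT_HW_CACHE_OP_READ << 8) |
		      (PERF_COUNT_HW_CACHE_RESULT_MISS << 16);
	l1d_fd = counter_open(&attr);

	if (cyc_fd < 0 || l1d_fd < 0) {
		perror("perf_counter_open");
		return 1;
	}

	/* ... run the workload to be measured ... */

	/* default read_format: a plain read() returns one u64 count */
	if (read(cyc_fd, &count, sizeof(count)) == sizeof(count))
		printf("cycles:        %llu\n", count);
	if (read(l1d_fd, &count, sizeof(count)) == sizeof(count))
		printf("L1D rd misses: %llu\n", count);
	return 0;
}
```

Once such counters are opened, the powerpc code merged below maps them onto physical PMCs, with power_check_constraints() searching each event's alternative codes until a feasible assignment is found.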
diff --git a/MAINTAINERS b/MAINTAINERS index ccdb57524e3..70f961d43d9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4403,6 +4403,16 @@ S:	Maintained  F:	include/linux/delayacct.h  F:	kernel/delayacct.c +PERFORMANCE COUNTER SUBSYSTEM +P:	Peter Zijlstra +M:	a.p.zijlstra@chello.nl +P:	Paul Mackerras +M:	paulus@samba.org +P:	Ingo Molnar +M:	mingo@elte.hu +L:	linux-kernel@vger.kernel.org +S:	Supported +  PERSONALITY HANDLING  P:	Christoph Hellwig  M:	hch@infradead.org diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index b7e034b0a6d..20a44d0c9fd 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -131,5 +131,44 @@ static inline int irqs_disabled_flags(unsigned long flags)   */  struct irq_chip; +#ifdef CONFIG_PERF_COUNTERS +static inline unsigned long test_perf_counter_pending(void) +{ +	unsigned long x; + +	asm volatile("lbz %0,%1(13)" +		: "=r" (x) +		: "i" (offsetof(struct paca_struct, perf_counter_pending))); +	return x; +} + +static inline void set_perf_counter_pending(void) +{ +	asm volatile("stb %0,%1(13)" : : +		"r" (1), +		"i" (offsetof(struct paca_struct, perf_counter_pending))); +} + +static inline void clear_perf_counter_pending(void) +{ +	asm volatile("stb %0,%1(13)" : : +		"r" (0), +		"i" (offsetof(struct paca_struct, perf_counter_pending))); +} + +extern void perf_counter_do_pending(void); + +#else + +static inline unsigned long test_perf_counter_pending(void) +{ +	return 0; +} + +static inline void set_perf_counter_pending(void) {} +static inline void clear_perf_counter_pending(void) {} +static inline void perf_counter_do_pending(void) {} +#endif /* CONFIG_PERF_COUNTERS */ +  #endif	/* __KERNEL__ */  #endif	/* _ASM_POWERPC_HW_IRQ_H */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 082b3aedf14..6ef05572301 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -99,6 +99,7 @@ struct paca_struct {  	u8 soft_enabled;		/* irq soft-enable flag */  	u8 hard_enabled;		/* set if irqs are enabled in MSR */  	u8 io_sync;			/* writel() needs spin_unlock sync */ +	u8 perf_counter_pending;	/* PM interrupt while soft-disabled */  	/* Stuff for accurate time accounting */  	u64 user_time;			/* accumulated usermode TB ticks */ diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h new file mode 100644 index 00000000000..cc7c887705b --- /dev/null +++ b/arch/powerpc/include/asm/perf_counter.h @@ -0,0 +1,98 @@ +/* + * Performance counter support - PowerPC-specific definitions. + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/types.h> + +#define MAX_HWCOUNTERS		8 +#define MAX_EVENT_ALTERNATIVES	8 +#define MAX_LIMITED_HWCOUNTERS	2 + +/* + * This struct provides the constants and functions needed to + * describe the PMU on a particular POWER-family CPU. 
+ */ +struct power_pmu { +	int	n_counter; +	int	max_alternatives; +	u64	add_fields; +	u64	test_adder; +	int	(*compute_mmcr)(u64 events[], int n_ev, +				unsigned int hwc[], u64 mmcr[]); +	int	(*get_constraint)(u64 event, u64 *mskp, u64 *valp); +	int	(*get_alternatives)(u64 event, unsigned int flags, +				    u64 alt[]); +	void	(*disable_pmc)(unsigned int pmc, u64 mmcr[]); +	int	(*limited_pmc_event)(u64 event); +	u32	flags; +	int	n_generic; +	int	*generic_events; +	int	(*cache_events)[PERF_COUNT_HW_CACHE_MAX] +			       [PERF_COUNT_HW_CACHE_OP_MAX] +			       [PERF_COUNT_HW_CACHE_RESULT_MAX]; +}; + +extern struct power_pmu *ppmu; + +/* + * Values for power_pmu.flags + */ +#define PPMU_LIMITED_PMC5_6	1	/* PMC5/6 have limited function */ +#define PPMU_ALT_SIPR		2	/* uses alternate posn for SIPR/HV */ + +/* + * Values for flags to get_alternatives() + */ +#define PPMU_LIMITED_PMC_OK	1	/* can put this on a limited PMC */ +#define PPMU_LIMITED_PMC_REQD	2	/* have to put this on a limited PMC */ +#define PPMU_ONLY_COUNT_RUN	4	/* only counting in run state */ + +struct pt_regs; +extern unsigned long perf_misc_flags(struct pt_regs *regs); +#define perf_misc_flags(regs)	perf_misc_flags(regs) + +extern unsigned long perf_instruction_pointer(struct pt_regs *regs); + +/* + * The power_pmu.get_constraint function returns a 64-bit value and + * a 64-bit mask that express the constraints between this event and + * other events. + * + * The value and mask are divided up into (non-overlapping) bitfields + * of three different types: + * + * Select field: this expresses the constraint that some set of bits + * in MMCR* needs to be set to a specific value for this event.  For a + * select field, the mask contains 1s in every bit of the field, and + * the value contains a unique value for each possible setting of the + * MMCR* bits.  The constraint checking code will ensure that two events + * that set the same field in their masks have the same value in their + * value dwords. + * + * Add field: this expresses the constraint that there can be at most + * N events in a particular class.  A field of k bits can be used for + * N <= 2^(k-1) - 1.  The mask has the most significant bit of the field + * set (and the other bits 0), and the value has only the least significant + * bit of the field set.  In addition, the 'add_fields' and 'test_adder' + * in the struct power_pmu for this processor come into play.  The + * add_fields value contains 1 in the LSB of the field, and the + * test_adder contains 2^(k-1) - 1 - N in the field. + * + * NAND field: this expresses the constraint that you may not have events + * in all of a set of classes.  (For example, on PPC970, you can't select + * events from the FPU, ISU and IDU simultaneously, although any two are + * possible.)  For N classes, the field is N+1 bits wide, and each class + * is assigned one bit from the least-significant N bits.  The mask has + * only the most-significant bit set, and the value has only the bit + * for the event's class set.  The test_adder has the least significant + * bit set in the field. + * + * If an event is not subject to the constraint expressed by a particular + * field, then it will have 0 in both the mask and value for that field. 
+ */ diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index e8018d540e8..fb359b0a693 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -492,11 +492,13 @@  #define   MMCR0_FCHV	0x00000001UL /* freeze conditions in hypervisor mode */  #define SPRN_MMCR1	798  #define SPRN_MMCRA	0x312 +#define   MMCRA_SDSYNC	0x80000000UL /* SDAR synced with SIAR */  #define   MMCRA_SIHV	0x10000000UL /* state of MSR HV when SIAR set */  #define   MMCRA_SIPR	0x08000000UL /* state of MSR PR when SIAR set */  #define   MMCRA_SLOT	0x07000000UL /* SLOT bits (37-39) */  #define   MMCRA_SLOT_SHIFT	24  #define   MMCRA_SAMPLE_ENABLE 0x00000001UL /* enable sampling */ +#define   POWER6_MMCRA_SDSYNC 0x0000080000000000ULL	/* SDAR/SIAR synced */  #define   POWER6_MMCRA_SIHV   0x0000040000000000ULL  #define   POWER6_MMCRA_SIPR   0x0000020000000000ULL  #define   POWER6_MMCRA_THRM	0x00000020UL diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index d98a30dfd41..a0b92de51c7 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -322,6 +322,6 @@ SYSCALL_SPU(epoll_create1)  SYSCALL_SPU(dup3)  SYSCALL_SPU(pipe2)  SYSCALL(inotify_init1) -SYSCALL(ni_syscall) +SYSCALL_SPU(perf_counter_open)  COMPAT_SYS_SPU(preadv)  COMPAT_SYS_SPU(pwritev) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index 3f06f8ec81c..4badac2d11d 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -341,6 +341,7 @@  #define __NR_dup3		316  #define __NR_pipe2		317  #define __NR_inotify_init1	318 +#define __NR_perf_counter_open	319  #define __NR_preadv		320  #define __NR_pwritev		321 diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 71901fbda4a..a2c683403c2 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -94,6 +94,9 @@ obj64-$(CONFIG_AUDIT)		+= compat_audit.o  obj-$(CONFIG_DYNAMIC_FTRACE)	+= ftrace.o  obj-$(CONFIG_FUNCTION_GRAPH_TRACER)	+= ftrace.o +obj-$(CONFIG_PERF_COUNTERS)	+= perf_counter.o power4-pmu.o ppc970-pmu.o \ +				   power5-pmu.o power5+-pmu.o power6-pmu.o \ +				   power7-pmu.o  obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 1e40bc05394..e981d1ce191 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -131,6 +131,7 @@ int main(void)  	DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr));  	DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled));  	DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); +	DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending));  	DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache));  	DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr));  	DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index abfc3233047..43e073477c3 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -526,6 +526,15 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES)  2:  	TRACE_AND_RESTORE_IRQ(r5); +#ifdef CONFIG_PERF_COUNTERS +	/* check paca->perf_counter_pending if we're enabling ints */ +	lbz	r3,PACAPERFPEND(r13) +	and.	
r3,r3,r5 +	beq	27f +	bl	.perf_counter_do_pending +27: +#endif /* CONFIG_PERF_COUNTERS */ +  	/* extract EE bit and use it to restore paca->hard_enabled */  	ld	r3,_MSR(r1)  	rldicl	r4,r3,49,63		/* r0 = (r3 >> 15) & 1 */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 8c1a4966867..feff792ed0f 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -135,6 +135,11 @@ notrace void raw_local_irq_restore(unsigned long en)  			iseries_handle_interrupts();  	} +	if (test_perf_counter_pending()) { +		clear_perf_counter_pending(); +		perf_counter_do_pending(); +	} +  	/*  	 * if (get_paca()->hard_enabled) return;  	 * But again we need to take care that gcc gets hard_enabled directly diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c new file mode 100644 index 00000000000..bb202388170 --- /dev/null +++ b/arch/powerpc/kernel/perf_counter.c @@ -0,0 +1,1263 @@ +/* + * Performance counter support - powerpc architecture code + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/perf_counter.h> +#include <linux/percpu.h> +#include <linux/hardirq.h> +#include <asm/reg.h> +#include <asm/pmc.h> +#include <asm/machdep.h> +#include <asm/firmware.h> +#include <asm/ptrace.h> + +struct cpu_hw_counters { +	int n_counters; +	int n_percpu; +	int disabled; +	int n_added; +	int n_limited; +	u8  pmcs_enabled; +	struct perf_counter *counter[MAX_HWCOUNTERS]; +	u64 events[MAX_HWCOUNTERS]; +	unsigned int flags[MAX_HWCOUNTERS]; +	u64 mmcr[3]; +	struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS]; +	u8  limited_hwidx[MAX_LIMITED_HWCOUNTERS]; +}; +DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); + +struct power_pmu *ppmu; + +/* + * Normally, to ignore kernel events we set the FCS (freeze counters + * in supervisor mode) bit in MMCR0, but if the kernel runs with the + * hypervisor bit set in the MSR, or if we are running on a processor + * where the hypervisor bit is forced to 1 (as on Apple G5 processors), + * then we need to use the FCHV bit to ignore kernel events. + */ +static unsigned int freeze_counters_kernel = MMCR0_FCS; + +static void perf_counter_interrupt(struct pt_regs *regs); + +void perf_counter_print_debug(void) +{ +} + +/* + * Read one performance monitor counter (PMC). + */ +static unsigned long read_pmc(int idx) +{ +	unsigned long val; + +	switch (idx) { +	case 1: +		val = mfspr(SPRN_PMC1); +		break; +	case 2: +		val = mfspr(SPRN_PMC2); +		break; +	case 3: +		val = mfspr(SPRN_PMC3); +		break; +	case 4: +		val = mfspr(SPRN_PMC4); +		break; +	case 5: +		val = mfspr(SPRN_PMC5); +		break; +	case 6: +		val = mfspr(SPRN_PMC6); +		break; +	case 7: +		val = mfspr(SPRN_PMC7); +		break; +	case 8: +		val = mfspr(SPRN_PMC8); +		break; +	default: +		printk(KERN_ERR "oops trying to read PMC%d\n", idx); +		val = 0; +	} +	return val; +} + +/* + * Write one PMC. 
+ */ +static void write_pmc(int idx, unsigned long val) +{ +	switch (idx) { +	case 1: +		mtspr(SPRN_PMC1, val); +		break; +	case 2: +		mtspr(SPRN_PMC2, val); +		break; +	case 3: +		mtspr(SPRN_PMC3, val); +		break; +	case 4: +		mtspr(SPRN_PMC4, val); +		break; +	case 5: +		mtspr(SPRN_PMC5, val); +		break; +	case 6: +		mtspr(SPRN_PMC6, val); +		break; +	case 7: +		mtspr(SPRN_PMC7, val); +		break; +	case 8: +		mtspr(SPRN_PMC8, val); +		break; +	default: +		printk(KERN_ERR "oops trying to write PMC%d\n", idx); +	} +} + +/* + * Check if a set of events can all go on the PMU at once. + * If they can't, this will look at alternative codes for the events + * and see if any combination of alternative codes is feasible. + * The feasible set is returned in event[]. + */ +static int power_check_constraints(u64 event[], unsigned int cflags[], +				   int n_ev) +{ +	u64 mask, value, nv; +	u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; +	u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; +	u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; +	u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS]; +	int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS]; +	int i, j; +	u64 addf = ppmu->add_fields; +	u64 tadd = ppmu->test_adder; + +	if (n_ev > ppmu->n_counter) +		return -1; + +	/* First see if the events will go on as-is */ +	for (i = 0; i < n_ev; ++i) { +		if ((cflags[i] & PPMU_LIMITED_PMC_REQD) +		    && !ppmu->limited_pmc_event(event[i])) { +			ppmu->get_alternatives(event[i], cflags[i], +					       alternatives[i]); +			event[i] = alternatives[i][0]; +		} +		if (ppmu->get_constraint(event[i], &amasks[i][0], +					 &avalues[i][0])) +			return -1; +	} +	value = mask = 0; +	for (i = 0; i < n_ev; ++i) { +		nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf); +		if ((((nv + tadd) ^ value) & mask) != 0 || +		    (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0) +			break; +		value = nv; +		mask |= amasks[i][0]; +	} +	if (i == n_ev) +		return 0;	/* all OK */ + +	/* doesn't work, gather alternatives... */ +	if (!ppmu->get_alternatives) +		return -1; +	for (i = 0; i < n_ev; ++i) { +		choice[i] = 0; +		n_alt[i] = ppmu->get_alternatives(event[i], cflags[i], +						  alternatives[i]); +		for (j = 1; j < n_alt[i]; ++j) +			ppmu->get_constraint(alternatives[i][j], +					     &amasks[i][j], &avalues[i][j]); +	} + +	/* enumerate all possibilities and see if any will work */ +	i = 0; +	j = -1; +	value = mask = nv = 0; +	while (i < n_ev) { +		if (j >= 0) { +			/* we're backtracking, restore context */ +			value = svalues[i]; +			mask = smasks[i]; +			j = choice[i]; +		} +		/* +		 * See if any alternative k for event i, +		 * where k > j, will satisfy the constraints. +		 */ +		while (++j < n_alt[i]) { +			nv = (value | avalues[i][j]) + +				(value & avalues[i][j] & addf); +			if ((((nv + tadd) ^ value) & mask) == 0 && +			    (((nv + tadd) ^ avalues[i][j]) +			     & amasks[i][j]) == 0) +				break; +		} +		if (j >= n_alt[i]) { +			/* +			 * No feasible alternative, backtrack +			 * to event i-1 and continue enumerating its +			 * alternatives from where we got up to. +			 */ +			if (--i < 0) +				return -1; +		} else { +			/* +			 * Found a feasible alternative for event i, +			 * remember where we got up to with this event, +			 * go on to the next event, and start with +			 * the first alternative for it. 
+			 */ +			choice[i] = j; +			svalues[i] = value; +			smasks[i] = mask; +			value = nv; +			mask |= amasks[i][j]; +			++i; +			j = -1; +		} +	} + +	/* OK, we have a feasible combination, tell the caller the solution */ +	for (i = 0; i < n_ev; ++i) +		event[i] = alternatives[i][choice[i]]; +	return 0; +} + +/* + * Check if newly-added counters have consistent settings for + * exclude_{user,kernel,hv} with each other and any previously + * added counters. + */ +static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[], +			  int n_prev, int n_new) +{ +	int eu = 0, ek = 0, eh = 0; +	int i, n, first; +	struct perf_counter *counter; + +	n = n_prev + n_new; +	if (n <= 1) +		return 0; + +	first = 1; +	for (i = 0; i < n; ++i) { +		if (cflags[i] & PPMU_LIMITED_PMC_OK) { +			cflags[i] &= ~PPMU_LIMITED_PMC_REQD; +			continue; +		} +		counter = ctrs[i]; +		if (first) { +			eu = counter->attr.exclude_user; +			ek = counter->attr.exclude_kernel; +			eh = counter->attr.exclude_hv; +			first = 0; +		} else if (counter->attr.exclude_user != eu || +			   counter->attr.exclude_kernel != ek || +			   counter->attr.exclude_hv != eh) { +			return -EAGAIN; +		} +	} + +	if (eu || ek || eh) +		for (i = 0; i < n; ++i) +			if (cflags[i] & PPMU_LIMITED_PMC_OK) +				cflags[i] |= PPMU_LIMITED_PMC_REQD; + +	return 0; +} + +static void power_pmu_read(struct perf_counter *counter) +{ +	long val, delta, prev; + +	if (!counter->hw.idx) +		return; +	/* +	 * Performance monitor interrupts come even when interrupts +	 * are soft-disabled, as long as interrupts are hard-enabled. +	 * Therefore we treat them like NMIs. +	 */ +	do { +		prev = atomic64_read(&counter->hw.prev_count); +		barrier(); +		val = read_pmc(counter->hw.idx); +	} while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev); + +	/* The counters are only 32 bits wide */ +	delta = (val - prev) & 0xfffffffful; +	atomic64_add(delta, &counter->count); +	atomic64_sub(delta, &counter->hw.period_left); +} + +/* + * On some machines, PMC5 and PMC6 can't be written, don't respect + * the freeze conditions, and don't generate interrupts.  This tells + * us if `counter' is using such a PMC. + */ +static int is_limited_pmc(int pmcnum) +{ +	return (ppmu->flags & PPMU_LIMITED_PMC5_6) +		&& (pmcnum == 5 || pmcnum == 6); +} + +static void freeze_limited_counters(struct cpu_hw_counters *cpuhw, +				    unsigned long pmc5, unsigned long pmc6) +{ +	struct perf_counter *counter; +	u64 val, prev, delta; +	int i; + +	for (i = 0; i < cpuhw->n_limited; ++i) { +		counter = cpuhw->limited_counter[i]; +		if (!counter->hw.idx) +			continue; +		val = (counter->hw.idx == 5) ? pmc5 : pmc6; +		prev = atomic64_read(&counter->hw.prev_count); +		counter->hw.idx = 0; +		delta = (val - prev) & 0xfffffffful; +		atomic64_add(delta, &counter->count); +	} +} + +static void thaw_limited_counters(struct cpu_hw_counters *cpuhw, +				  unsigned long pmc5, unsigned long pmc6) +{ +	struct perf_counter *counter; +	u64 val; +	int i; + +	for (i = 0; i < cpuhw->n_limited; ++i) { +		counter = cpuhw->limited_counter[i]; +		counter->hw.idx = cpuhw->limited_hwidx[i]; +		val = (counter->hw.idx == 5) ? pmc5 : pmc6; +		atomic64_set(&counter->hw.prev_count, val); +		perf_counter_update_userpage(counter); +	} +} + +/* + * Since limited counters don't respect the freeze conditions, we + * have to read them immediately after freezing or unfreezing the + * other counters.  
We try to keep the values from the limited + * counters as consistent as possible by keeping the delay (in + * cycles and instructions) between freezing/unfreezing and reading + * the limited counters as small and consistent as possible. + * Therefore, if any limited counters are in use, we read them + * both, and always in the same order, to minimize variability, + * and do it inside the same asm that writes MMCR0. + */ +static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0) +{ +	unsigned long pmc5, pmc6; + +	if (!cpuhw->n_limited) { +		mtspr(SPRN_MMCR0, mmcr0); +		return; +	} + +	/* +	 * Write MMCR0, then read PMC5 and PMC6 immediately. +	 * To ensure we don't get a performance monitor interrupt +	 * between writing MMCR0 and freezing/thawing the limited +	 * counters, we first write MMCR0 with the counter overflow +	 * interrupt enable bits turned off. +	 */ +	asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5" +		     : "=&r" (pmc5), "=&r" (pmc6) +		     : "r" (mmcr0 & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)), +		       "i" (SPRN_MMCR0), +		       "i" (SPRN_PMC5), "i" (SPRN_PMC6)); + +	if (mmcr0 & MMCR0_FC) +		freeze_limited_counters(cpuhw, pmc5, pmc6); +	else +		thaw_limited_counters(cpuhw, pmc5, pmc6); + +	/* +	 * Write the full MMCR0 including the counter overflow interrupt +	 * enable bits, if necessary. +	 */ +	if (mmcr0 & (MMCR0_PMC1CE | MMCR0_PMCjCE)) +		mtspr(SPRN_MMCR0, mmcr0); +} + +/* + * Disable all counters to prevent PMU interrupts and to allow + * counters to be added or removed. + */ +void hw_perf_disable(void) +{ +	struct cpu_hw_counters *cpuhw; +	unsigned long ret; +	unsigned long flags; + +	local_irq_save(flags); +	cpuhw = &__get_cpu_var(cpu_hw_counters); + +	ret = cpuhw->disabled; +	if (!ret) { +		cpuhw->disabled = 1; +		cpuhw->n_added = 0; + +		/* +		 * Check if we ever enabled the PMU on this cpu. +		 */ +		if (!cpuhw->pmcs_enabled) { +			if (ppc_md.enable_pmcs) +				ppc_md.enable_pmcs(); +			cpuhw->pmcs_enabled = 1; +		} + +		/* +		 * Disable instruction sampling if it was enabled +		 */ +		if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { +			mtspr(SPRN_MMCRA, +			      cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); +			mb(); +		} + +		/* +		 * Set the 'freeze counters' bit. +		 * The barrier is to make sure the mtspr has been +		 * executed and the PMU has frozen the counters +		 * before we return. +		 */ +		write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC); +		mb(); +	} +	local_irq_restore(flags); +} + +/* + * Re-enable all counters if disable == 0. + * If we were previously disabled and counters were added, then + * put the new config on the PMU. + */ +void hw_perf_enable(void) +{ +	struct perf_counter *counter; +	struct cpu_hw_counters *cpuhw; +	unsigned long flags; +	long i; +	unsigned long val; +	s64 left; +	unsigned int hwc_index[MAX_HWCOUNTERS]; +	int n_lim; +	int idx; + +	local_irq_save(flags); +	cpuhw = &__get_cpu_var(cpu_hw_counters); +	if (!cpuhw->disabled) { +		local_irq_restore(flags); +		return; +	} +	cpuhw->disabled = 0; + +	/* +	 * If we didn't change anything, or only removed counters, +	 * no need to recalculate MMCR* settings and reset the PMCs. +	 * Just reenable the PMU with the current MMCR* settings +	 * (possibly updated for removal of counters). 
+	 */ +	if (!cpuhw->n_added) { +		mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); +		mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); +		if (cpuhw->n_counters == 0) +			get_lppaca()->pmcregs_in_use = 0; +		goto out_enable; +	} + +	/* +	 * Compute MMCR* values for the new set of counters +	 */ +	if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index, +			       cpuhw->mmcr)) { +		/* shouldn't ever get here */ +		printk(KERN_ERR "oops compute_mmcr failed\n"); +		goto out; +	} + +	/* +	 * Add in MMCR0 freeze bits corresponding to the +	 * attr.exclude_* bits for the first counter. +	 * We have already checked that all counters have the +	 * same values for these bits as the first counter. +	 */ +	counter = cpuhw->counter[0]; +	if (counter->attr.exclude_user) +		cpuhw->mmcr[0] |= MMCR0_FCP; +	if (counter->attr.exclude_kernel) +		cpuhw->mmcr[0] |= freeze_counters_kernel; +	if (counter->attr.exclude_hv) +		cpuhw->mmcr[0] |= MMCR0_FCHV; + +	/* +	 * Write the new configuration to MMCR* with the freeze +	 * bit set and set the hardware counters to their initial values. +	 * Then unfreeze the counters. +	 */ +	get_lppaca()->pmcregs_in_use = 1; +	mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); +	mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); +	mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)) +				| MMCR0_FC); + +	/* +	 * Read off any pre-existing counters that need to move +	 * to another PMC. +	 */ +	for (i = 0; i < cpuhw->n_counters; ++i) { +		counter = cpuhw->counter[i]; +		if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) { +			power_pmu_read(counter); +			write_pmc(counter->hw.idx, 0); +			counter->hw.idx = 0; +		} +	} + +	/* +	 * Initialize the PMCs for all the new and moved counters. +	 */ +	cpuhw->n_limited = n_lim = 0; +	for (i = 0; i < cpuhw->n_counters; ++i) { +		counter = cpuhw->counter[i]; +		if (counter->hw.idx) +			continue; +		idx = hwc_index[i] + 1; +		if (is_limited_pmc(idx)) { +			cpuhw->limited_counter[n_lim] = counter; +			cpuhw->limited_hwidx[n_lim] = idx; +			++n_lim; +			continue; +		} +		val = 0; +		if (counter->hw.sample_period) { +			left = atomic64_read(&counter->hw.period_left); +			if (left < 0x80000000L) +				val = 0x80000000L - left; +		} +		atomic64_set(&counter->hw.prev_count, val); +		counter->hw.idx = idx; +		write_pmc(idx, val); +		perf_counter_update_userpage(counter); +	} +	cpuhw->n_limited = n_lim; +	cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; + + out_enable: +	mb(); +	write_mmcr0(cpuhw, cpuhw->mmcr[0]); + +	/* +	 * Enable instruction sampling if necessary +	 */ +	if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { +		mb(); +		mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); +	} + + out: +	local_irq_restore(flags); +} + +static int collect_events(struct perf_counter *group, int max_count, +			  struct perf_counter *ctrs[], u64 *events, +			  unsigned int *flags) +{ +	int n = 0; +	struct perf_counter *counter; + +	if (!is_software_counter(group)) { +		if (n >= max_count) +			return -1; +		ctrs[n] = group; +		flags[n] = group->hw.counter_base; +		events[n++] = group->hw.config; +	} +	list_for_each_entry(counter, &group->sibling_list, list_entry) { +		if (!is_software_counter(counter) && +		    counter->state != PERF_COUNTER_STATE_OFF) { +			if (n >= max_count) +				return -1; +			ctrs[n] = counter; +			flags[n] = counter->hw.counter_base; +			events[n++] = counter->hw.config; +		} +	} +	return n; +} + +static void counter_sched_in(struct perf_counter *counter, int cpu) +{ +	counter->state = PERF_COUNTER_STATE_ACTIVE; +	counter->oncpu = cpu; +	
counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped; +	if (is_software_counter(counter)) +		counter->pmu->enable(counter); +} + +/* + * Called to enable a whole group of counters. + * Returns 1 if the group was enabled, or -EAGAIN if it could not be. + * Assumes the caller has disabled interrupts and has + * frozen the PMU with hw_perf_save_disable. + */ +int hw_perf_group_sched_in(struct perf_counter *group_leader, +	       struct perf_cpu_context *cpuctx, +	       struct perf_counter_context *ctx, int cpu) +{ +	struct cpu_hw_counters *cpuhw; +	long i, n, n0; +	struct perf_counter *sub; + +	cpuhw = &__get_cpu_var(cpu_hw_counters); +	n0 = cpuhw->n_counters; +	n = collect_events(group_leader, ppmu->n_counter - n0, +			   &cpuhw->counter[n0], &cpuhw->events[n0], +			   &cpuhw->flags[n0]); +	if (n < 0) +		return -EAGAIN; +	if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n)) +		return -EAGAIN; +	i = power_check_constraints(cpuhw->events, cpuhw->flags, n + n0); +	if (i < 0) +		return -EAGAIN; +	cpuhw->n_counters = n0 + n; +	cpuhw->n_added += n; + +	/* +	 * OK, this group can go on; update counter states etc., +	 * and enable any software counters +	 */ +	for (i = n0; i < n0 + n; ++i) +		cpuhw->counter[i]->hw.config = cpuhw->events[i]; +	cpuctx->active_oncpu += n; +	n = 1; +	counter_sched_in(group_leader, cpu); +	list_for_each_entry(sub, &group_leader->sibling_list, list_entry) { +		if (sub->state != PERF_COUNTER_STATE_OFF) { +			counter_sched_in(sub, cpu); +			++n; +		} +	} +	ctx->nr_active += n; + +	return 1; +} + +/* + * Add a counter to the PMU. + * If all counters are not already frozen, then we disable and + * re-enable the PMU in order to get hw_perf_enable to do the + * actual work of reconfiguring the PMU. + */ +static int power_pmu_enable(struct perf_counter *counter) +{ +	struct cpu_hw_counters *cpuhw; +	unsigned long flags; +	int n0; +	int ret = -EAGAIN; + +	local_irq_save(flags); +	perf_disable(); + +	/* +	 * Add the counter to the list (if there is room) +	 * and check whether the total set is still feasible. +	 */ +	cpuhw = &__get_cpu_var(cpu_hw_counters); +	n0 = cpuhw->n_counters; +	if (n0 >= ppmu->n_counter) +		goto out; +	cpuhw->counter[n0] = counter; +	cpuhw->events[n0] = counter->hw.config; +	cpuhw->flags[n0] = counter->hw.counter_base; +	if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1)) +		goto out; +	if (power_check_constraints(cpuhw->events, cpuhw->flags, n0 + 1)) +		goto out; + +	counter->hw.config = cpuhw->events[n0]; +	++cpuhw->n_counters; +	++cpuhw->n_added; + +	ret = 0; + out: +	perf_enable(); +	local_irq_restore(flags); +	return ret; +} + +/* + * Remove a counter from the PMU. 
+ */ +static void power_pmu_disable(struct perf_counter *counter) +{ +	struct cpu_hw_counters *cpuhw; +	long i; +	unsigned long flags; + +	local_irq_save(flags); +	perf_disable(); + +	power_pmu_read(counter); + +	cpuhw = &__get_cpu_var(cpu_hw_counters); +	for (i = 0; i < cpuhw->n_counters; ++i) { +		if (counter == cpuhw->counter[i]) { +			while (++i < cpuhw->n_counters) +				cpuhw->counter[i-1] = cpuhw->counter[i]; +			--cpuhw->n_counters; +			ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); +			if (counter->hw.idx) { +				write_pmc(counter->hw.idx, 0); +				counter->hw.idx = 0; +			} +			perf_counter_update_userpage(counter); +			break; +		} +	} +	for (i = 0; i < cpuhw->n_limited; ++i) +		if (counter == cpuhw->limited_counter[i]) +			break; +	if (i < cpuhw->n_limited) { +		while (++i < cpuhw->n_limited) { +			cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i]; +			cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i]; +		} +		--cpuhw->n_limited; +	} +	if (cpuhw->n_counters == 0) { +		/* disable exceptions if no counters are running */ +		cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); +	} + +	perf_enable(); +	local_irq_restore(flags); +} + +/* + * Re-enable interrupts on a counter after they were throttled + * because they were coming too fast. + */ +static void power_pmu_unthrottle(struct perf_counter *counter) +{ +	s64 val, left; +	unsigned long flags; + +	if (!counter->hw.idx || !counter->hw.sample_period) +		return; +	local_irq_save(flags); +	perf_disable(); +	power_pmu_read(counter); +	left = counter->hw.sample_period; +	counter->hw.last_period = left; +	val = 0; +	if (left < 0x80000000L) +		val = 0x80000000L - left; +	write_pmc(counter->hw.idx, val); +	atomic64_set(&counter->hw.prev_count, val); +	atomic64_set(&counter->hw.period_left, left); +	perf_counter_update_userpage(counter); +	perf_enable(); +	local_irq_restore(flags); +} + +struct pmu power_pmu = { +	.enable		= power_pmu_enable, +	.disable	= power_pmu_disable, +	.read		= power_pmu_read, +	.unthrottle	= power_pmu_unthrottle, +}; + +/* + * Return 1 if we might be able to put counter on a limited PMC, + * or 0 if not. + * A counter can only go on a limited PMC if it counts something + * that a limited PMC can count, doesn't require interrupts, and + * doesn't exclude any processor mode. + */ +static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev, +				 unsigned int flags) +{ +	int n; +	u64 alt[MAX_EVENT_ALTERNATIVES]; + +	if (counter->attr.exclude_user +	    || counter->attr.exclude_kernel +	    || counter->attr.exclude_hv +	    || counter->attr.sample_period) +		return 0; + +	if (ppmu->limited_pmc_event(ev)) +		return 1; + +	/* +	 * The requested event isn't on a limited PMC already; +	 * see if any alternative code goes on a limited PMC. +	 */ +	if (!ppmu->get_alternatives) +		return 0; + +	flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD; +	n = ppmu->get_alternatives(ev, flags, alt); + +	return n > 0; +} + +/* + * Find an alternative event that goes on a normal PMC, if possible, + * and return the event code, or 0 if there is no such alternative. + * (Note: event code 0 is "don't count" on all machines.) 
+ */ +static u64 normal_pmc_alternative(u64 ev, unsigned long flags) +{ +	u64 alt[MAX_EVENT_ALTERNATIVES]; +	int n; + +	flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD); +	n = ppmu->get_alternatives(ev, flags, alt); +	if (!n) +		return 0; +	return alt[0]; +} + +/* Number of perf_counters counting hardware events */ +static atomic_t num_counters; +/* Used to avoid races in calling reserve/release_pmc_hardware */ +static DEFINE_MUTEX(pmc_reserve_mutex); + +/* + * Release the PMU if this is the last perf_counter. + */ +static void hw_perf_counter_destroy(struct perf_counter *counter) +{ +	if (!atomic_add_unless(&num_counters, -1, 1)) { +		mutex_lock(&pmc_reserve_mutex); +		if (atomic_dec_return(&num_counters) == 0) +			release_pmc_hardware(); +		mutex_unlock(&pmc_reserve_mutex); +	} +} + +/* + * Translate a generic cache event config to a raw event code. + */ +static int hw_perf_cache_event(u64 config, u64 *eventp) +{ +	unsigned long type, op, result; +	int ev; + +	if (!ppmu->cache_events) +		return -EINVAL; + +	/* unpack config */ +	type = config & 0xff; +	op = (config >> 8) & 0xff; +	result = (config >> 16) & 0xff; + +	if (type >= PERF_COUNT_HW_CACHE_MAX || +	    op >= PERF_COUNT_HW_CACHE_OP_MAX || +	    result >= PERF_COUNT_HW_CACHE_RESULT_MAX) +		return -EINVAL; + +	ev = (*ppmu->cache_events)[type][op][result]; +	if (ev == 0) +		return -EOPNOTSUPP; +	if (ev == -1) +		return -EINVAL; +	*eventp = ev; +	return 0; +} + +const struct pmu *hw_perf_counter_init(struct perf_counter *counter) +{ +	u64 ev; +	unsigned long flags; +	struct perf_counter *ctrs[MAX_HWCOUNTERS]; +	u64 events[MAX_HWCOUNTERS]; +	unsigned int cflags[MAX_HWCOUNTERS]; +	int n; +	int err; + +	if (!ppmu) +		return ERR_PTR(-ENXIO); +	switch (counter->attr.type) { +	case PERF_TYPE_HARDWARE: +		ev = counter->attr.config; +		if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) +			return ERR_PTR(-EOPNOTSUPP); +		ev = ppmu->generic_events[ev]; +		break; +	case PERF_TYPE_HW_CACHE: +		err = hw_perf_cache_event(counter->attr.config, &ev); +		if (err) +			return ERR_PTR(err); +		break; +	case PERF_TYPE_RAW: +		ev = counter->attr.config; +		break; +	} +	counter->hw.config_base = ev; +	counter->hw.idx = 0; + +	/* +	 * If we are not running on a hypervisor, force the +	 * exclude_hv bit to 0 so that we don't care what +	 * the user set it to. +	 */ +	if (!firmware_has_feature(FW_FEATURE_LPAR)) +		counter->attr.exclude_hv = 0; + +	/* +	 * If this is a per-task counter, then we can use +	 * PM_RUN_* events interchangeably with their non RUN_* +	 * equivalents, e.g. PM_RUN_CYC instead of PM_CYC. +	 * XXX we should check if the task is an idle task. +	 */ +	flags = 0; +	if (counter->ctx->task) +		flags |= PPMU_ONLY_COUNT_RUN; + +	/* +	 * If this machine has limited counters, check whether this +	 * event could go on a limited counter. +	 */ +	if (ppmu->flags & PPMU_LIMITED_PMC5_6) { +		if (can_go_on_limited_pmc(counter, ev, flags)) { +			flags |= PPMU_LIMITED_PMC_OK; +		} else if (ppmu->limited_pmc_event(ev)) { +			/* +			 * The requested event is on a limited PMC, +			 * but we can't use a limited PMC; see if any +			 * alternative goes on a normal PMC. +			 */ +			ev = normal_pmc_alternative(ev, flags); +			if (!ev) +				return ERR_PTR(-EINVAL); +		} +	} + +	/* +	 * If this is in a group, check if it can go on with all the +	 * other hardware counters in the group.  We assume the counter +	 * hasn't been linked into its leader's sibling list at this point. 
+	 */ +	n = 0; +	if (counter->group_leader != counter) { +		n = collect_events(counter->group_leader, ppmu->n_counter - 1, +				   ctrs, events, cflags); +		if (n < 0) +			return ERR_PTR(-EINVAL); +	} +	events[n] = ev; +	ctrs[n] = counter; +	cflags[n] = flags; +	if (check_excludes(ctrs, cflags, n, 1)) +		return ERR_PTR(-EINVAL); +	if (power_check_constraints(events, cflags, n + 1)) +		return ERR_PTR(-EINVAL); + +	counter->hw.config = events[n]; +	counter->hw.counter_base = cflags[n]; +	counter->hw.last_period = counter->hw.sample_period; +	atomic64_set(&counter->hw.period_left, counter->hw.last_period); + +	/* +	 * See if we need to reserve the PMU. +	 * If no counters are currently in use, then we have to take a +	 * mutex to ensure that we don't race with another task doing +	 * reserve_pmc_hardware or release_pmc_hardware. +	 */ +	err = 0; +	if (!atomic_inc_not_zero(&num_counters)) { +		mutex_lock(&pmc_reserve_mutex); +		if (atomic_read(&num_counters) == 0 && +		    reserve_pmc_hardware(perf_counter_interrupt)) +			err = -EBUSY; +		else +			atomic_inc(&num_counters); +		mutex_unlock(&pmc_reserve_mutex); +	} +	counter->destroy = hw_perf_counter_destroy; + +	if (err) +		return ERR_PTR(err); +	return &power_pmu; +} + +/* + * A counter has overflowed; update its count and record + * things if requested.  Note that interrupts are hard-disabled + * here so there is no possibility of being interrupted. + */ +static void record_and_restart(struct perf_counter *counter, long val, +			       struct pt_regs *regs, int nmi) +{ +	u64 period = counter->hw.sample_period; +	s64 prev, delta, left; +	int record = 0; +	u64 addr, mmcra, sdsync; + +	/* we don't have to worry about interrupts here */ +	prev = atomic64_read(&counter->hw.prev_count); +	delta = (val - prev) & 0xfffffffful; +	atomic64_add(delta, &counter->count); + +	/* +	 * See if the total period for this counter has expired, +	 * and update for the next period. +	 */ +	val = 0; +	left = atomic64_read(&counter->hw.period_left) - delta; +	if (period) { +		if (left <= 0) { +			left += period; +			if (left <= 0) +				left = period; +			record = 1; +		} +		if (left < 0x80000000L) +			val = 0x80000000L - left; +	} + +	/* +	 * Finally record data if requested. +	 */ +	if (record) { +		struct perf_sample_data data = { +			.regs	= regs, +			.addr	= 0, +			.period	= counter->hw.last_period, +		}; + +		if (counter->attr.sample_type & PERF_SAMPLE_ADDR) { +			/* +			 * The user wants a data address recorded. +			 * If we're not doing instruction sampling, +			 * give them the SDAR (sampled data address). +			 * If we are doing instruction sampling, then only +			 * give them the SDAR if it corresponds to the +			 * instruction pointed to by SIAR; this is indicated +			 * by the [POWER6_]MMCRA_SDSYNC bit in MMCRA. +			 */ +			mmcra = regs->dsisr; +			sdsync = (ppmu->flags & PPMU_ALT_SIPR) ? +				POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC; +			if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync)) +				data.addr = mfspr(SPRN_SDAR); +		} +		if (perf_counter_overflow(counter, nmi, &data)) { +			/* +			 * Interrupts are coming too fast - throttle them +			 * by setting the counter to 0, so it will be +			 * at least 2^30 cycles until the next interrupt +			 * (assuming each counter counts at most 2 counts +			 * per cycle). 
+			 */ +			val = 0; +			left = ~0ULL >> 1; +		} +	} + +	write_pmc(counter->hw.idx, val); +	atomic64_set(&counter->hw.prev_count, val); +	atomic64_set(&counter->hw.period_left, left); +	perf_counter_update_userpage(counter); +} + +/* + * Called from generic code to get the misc flags (i.e. processor mode) + * for an event. + */ +unsigned long perf_misc_flags(struct pt_regs *regs) +{ +	unsigned long mmcra; + +	if (TRAP(regs) != 0xf00) { +		/* not a PMU interrupt */ +		return user_mode(regs) ? PERF_EVENT_MISC_USER : +			PERF_EVENT_MISC_KERNEL; +	} + +	mmcra = regs->dsisr; +	if (ppmu->flags & PPMU_ALT_SIPR) { +		if (mmcra & POWER6_MMCRA_SIHV) +			return PERF_EVENT_MISC_HYPERVISOR; +		return (mmcra & POWER6_MMCRA_SIPR) ? PERF_EVENT_MISC_USER : +			PERF_EVENT_MISC_KERNEL; +	} +	if (mmcra & MMCRA_SIHV) +		return PERF_EVENT_MISC_HYPERVISOR; +	return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER : +			PERF_EVENT_MISC_KERNEL; +} + +/* + * Called from generic code to get the instruction pointer + * for an event. + */ +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ +	unsigned long mmcra; +	unsigned long ip; +	unsigned long slot; + +	if (TRAP(regs) != 0xf00) +		return regs->nip;	/* not a PMU interrupt */ + +	ip = mfspr(SPRN_SIAR); +	mmcra = regs->dsisr; +	if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) { +		slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT; +		if (slot > 1) +			ip += 4 * (slot - 1); +	} +	return ip; +} + +/* + * Performance monitor interrupt stuff + */ +static void perf_counter_interrupt(struct pt_regs *regs) +{ +	int i; +	struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); +	struct perf_counter *counter; +	long val; +	int found = 0; +	int nmi; + +	if (cpuhw->n_limited) +		freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5), +					mfspr(SPRN_PMC6)); + +	/* +	 * Overload regs->dsisr to store MMCRA so we only need to read it once. +	 */ +	regs->dsisr = mfspr(SPRN_MMCRA); + +	/* +	 * If interrupts were soft-disabled when this PMU interrupt +	 * occurred, treat it as an NMI. +	 */ +	nmi = !regs->softe; +	if (nmi) +		nmi_enter(); +	else +		irq_enter(); + +	for (i = 0; i < cpuhw->n_counters; ++i) { +		counter = cpuhw->counter[i]; +		if (!counter->hw.idx || is_limited_pmc(counter->hw.idx)) +			continue; +		val = read_pmc(counter->hw.idx); +		if ((int)val < 0) { +			/* counter has overflowed */ +			found = 1; +			record_and_restart(counter, val, regs, nmi); +		} +	} + +	/* +	 * In case we didn't find and reset the counter that caused +	 * the interrupt, scan all counters and reset any that are +	 * negative, to avoid getting continual interrupts. +	 * Any that we processed in the previous loop will not be negative. +	 */ +	if (!found) { +		for (i = 0; i < ppmu->n_counter; ++i) { +			if (is_limited_pmc(i + 1)) +				continue; +			val = read_pmc(i + 1); +			if ((int)val < 0) +				write_pmc(i + 1, 0); +		} +	} + +	/* +	 * Reset MMCR0 to its normal value.  This will set PMXE and +	 * clear FC (freeze counters) and PMAO (perf mon alert occurred) +	 * and thus allow interrupts to occur again. +	 * XXX might want to use MSR.PM to keep the counters frozen until +	 * we get back out of this interrupt. 
+	 */ +	write_mmcr0(cpuhw, cpuhw->mmcr[0]); + +	if (nmi) +		nmi_exit(); +	else +		irq_exit(); +} + +void hw_perf_counter_setup(int cpu) +{ +	struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu); + +	memset(cpuhw, 0, sizeof(*cpuhw)); +	cpuhw->mmcr[0] = MMCR0_FC; +} + +extern struct power_pmu power4_pmu; +extern struct power_pmu ppc970_pmu; +extern struct power_pmu power5_pmu; +extern struct power_pmu power5p_pmu; +extern struct power_pmu power6_pmu; +extern struct power_pmu power7_pmu; + +static int init_perf_counters(void) +{ +	unsigned long pvr; + +	/* XXX should get this from cputable */ +	pvr = mfspr(SPRN_PVR); +	switch (PVR_VER(pvr)) { +	case PV_POWER4: +	case PV_POWER4p: +		ppmu = &power4_pmu; +		break; +	case PV_970: +	case PV_970FX: +	case PV_970MP: +		ppmu = &ppc970_pmu; +		break; +	case PV_POWER5: +		ppmu = &power5_pmu; +		break; +	case PV_POWER5p: +		ppmu = &power5p_pmu; +		break; +	case 0x3e: +		ppmu = &power6_pmu; +		break; +	case 0x3f: +		ppmu = &power7_pmu; +		break; +	} + +	/* +	 * Use FCHV to ignore kernel events if MSR.HV is set. +	 */ +	if (mfmsr() & MSR_HV) +		freeze_counters_kernel = MMCR0_FCHV; + +	return 0; +} + +arch_initcall(init_perf_counters); diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c new file mode 100644 index 00000000000..07bd308a5fa --- /dev/null +++ b/arch/powerpc/kernel/power4-pmu.c @@ -0,0 +1,598 @@ +/* + * Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors. + * + * Copyright 2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include <linux/kernel.h> +#include <linux/perf_counter.h> +#include <asm/reg.h> + +/* + * Bits in event code for POWER4 + */ +#define PM_PMC_SH	12	/* PMC number (1-based) for direct events */ +#define PM_PMC_MSK	0xf +#define PM_UNIT_SH	8	/* TTMMUX number and setting - unit select */ +#define PM_UNIT_MSK	0xf +#define PM_LOWER_SH	6 +#define PM_LOWER_MSK	1 +#define PM_LOWER_MSKS	0x40 +#define PM_BYTE_SH	4	/* Byte number of event bus to use */ +#define PM_BYTE_MSK	3 +#define PM_PMCSEL_MSK	7 + +/* + * Unit code values + */ +#define PM_FPU		1 +#define PM_ISU1		2 +#define PM_IFU		3 +#define PM_IDU0		4 +#define PM_ISU1_ALT	6 +#define PM_ISU2		7 +#define PM_IFU_ALT	8 +#define PM_LSU0		9 +#define PM_LSU1		0xc +#define PM_GPS		0xf + +/* + * Bits in MMCR0 for POWER4 + */ +#define MMCR0_PMC1SEL_SH	8 +#define MMCR0_PMC2SEL_SH	1 +#define MMCR_PMCSEL_MSK		0x1f + +/* + * Bits in MMCR1 for POWER4 + */ +#define MMCR1_TTM0SEL_SH	62 +#define MMCR1_TTC0SEL_SH	61 +#define MMCR1_TTM1SEL_SH	59 +#define MMCR1_TTC1SEL_SH	58 +#define MMCR1_TTM2SEL_SH	56 +#define MMCR1_TTC2SEL_SH	55 +#define MMCR1_TTM3SEL_SH	53 +#define MMCR1_TTC3SEL_SH	52 +#define MMCR1_TTMSEL_MSK	3 +#define MMCR1_TD_CP_DBG0SEL_SH	50 +#define MMCR1_TD_CP_DBG1SEL_SH	48 +#define MMCR1_TD_CP_DBG2SEL_SH	46 +#define MMCR1_TD_CP_DBG3SEL_SH	44 +#define MMCR1_DEBUG0SEL_SH	43 +#define MMCR1_DEBUG1SEL_SH	42 +#define MMCR1_DEBUG2SEL_SH	41 +#define MMCR1_DEBUG3SEL_SH	40 +#define MMCR1_PMC1_ADDER_SEL_SH	39 +#define MMCR1_PMC2_ADDER_SEL_SH	38 +#define MMCR1_PMC6_ADDER_SEL_SH	37 +#define MMCR1_PMC5_ADDER_SEL_SH	36 +#define MMCR1_PMC8_ADDER_SEL_SH	35 +#define MMCR1_PMC7_ADDER_SEL_SH	34 +#define MMCR1_PMC3_ADDER_SEL_SH	33 +#define MMCR1_PMC4_ADDER_SEL_SH	32 +#define MMCR1_PMC3SEL_SH	27 +#define MMCR1_PMC4SEL_SH	22 +#define MMCR1_PMC5SEL_SH	17 +#define MMCR1_PMC6SEL_SH	12 +#define MMCR1_PMC7SEL_SH	7 +#define MMCR1_PMC8SEL_SH	2	/* note bit 0 is in MMCRA for GP */ + +static short mmcr1_adder_bits[8] = { +	MMCR1_PMC1_ADDER_SEL_SH, +	MMCR1_PMC2_ADDER_SEL_SH, +	MMCR1_PMC3_ADDER_SEL_SH, +	MMCR1_PMC4_ADDER_SEL_SH, +	MMCR1_PMC5_ADDER_SEL_SH, +	MMCR1_PMC6_ADDER_SEL_SH, +	MMCR1_PMC7_ADDER_SEL_SH, +	MMCR1_PMC8_ADDER_SEL_SH +}; + +/* + * Bits in MMCRA + */ +#define MMCRA_PMC8SEL0_SH	17	/* PMC8SEL bit 0 for GP */ + +/* + * Layout of constraint bits: + * 6666555555555544444444443333333333222222222211111111110000000000 + * 3210987654321098765432109876543210987654321098765432109876543210 + *        |[  >[  >[   >|||[  >[  ><  ><  ><  ><  ><><><><><><><><> + *        | UC1 UC2 UC3 ||| PS1 PS2 B0  B1  B2  B3 P1P2P3P4P5P6P7P8 + * 	  \SMPL	        ||\TTC3SEL + * 		        |\TTC_IFU_SEL + * 		        \TTM2SEL0 + * + * SMPL - SAMPLE_ENABLE constraint + *     56: SAMPLE_ENABLE value 0x0100_0000_0000_0000 + * + * UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2 + *     55: UC1 error 0x0080_0000_0000_0000 + *     54: FPU events needed 0x0040_0000_0000_0000 + *     53: ISU1 events needed 0x0020_0000_0000_0000 + *     52: IDU0|ISU2 events needed 0x0010_0000_0000_0000 + * + * UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0 + *     51: UC2 error 0x0008_0000_0000_0000 + *     50: FPU events needed 0x0004_0000_0000_0000 + *     49: IFU events needed 0x0002_0000_0000_0000 + *     48: LSU0 events needed 0x0001_0000_0000_0000 + * + * UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1 + *     47: UC3 error 0x8000_0000_0000 + *     46: LSU0 events needed 0x4000_0000_0000 + *     45: IFU events needed 0x2000_0000_0000 + *     44: IDU0|ISU2 
events needed 0x1000_0000_0000 + *     43: ISU1 events needed 0x0800_0000_0000 + * + * TTM2SEL0 + *     42: 0 = IDU0 events needed + *     	   1 = ISU2 events needed 0x0400_0000_0000 + * + * TTC_IFU_SEL + *     41: 0 = IFU.U events needed + *     	   1 = IFU.L events needed 0x0200_0000_0000 + * + * TTC3SEL + *     40: 0 = LSU1.U events needed + *     	   1 = LSU1.L events needed 0x0100_0000_0000 + * + * PS1 + *     39: PS1 error 0x0080_0000_0000 + *     36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000 + * + * PS2 + *     35: PS2 error 0x0008_0000_0000 + *     32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000 + * + * B0 + *     28-31: Byte 0 event source 0xf000_0000 + *     	   1 = FPU + * 	   2 = ISU1 + * 	   3 = IFU + * 	   4 = IDU0 + * 	   7 = ISU2 + * 	   9 = LSU0 + * 	   c = LSU1 + * 	   f = GPS + * + * B1, B2, B3 + *     24-27, 20-23, 16-19: Byte 1, 2, 3 event sources + * + * P8 + *     15: P8 error 0x8000 + *     14-15: Count of events needing PMC8 + * + * P1..P7 + *     0-13: Count of events needing PMC1..PMC7 + * + * Note: this doesn't allow events using IFU.U to be combined with events + * using IFU.L, though that is feasible (using TTM0 and TTM2).  However + * there are no listed events for IFU.L (they are debug events not + * verified for performance monitoring) so this shouldn't cause a + * problem. + */ + +static struct unitinfo { +	u64	value, mask; +	int	unit; +	int	lowerbit; +} p4_unitinfo[16] = { +	[PM_FPU]  = { 0x44000000000000ull, 0x88000000000000ull, PM_FPU, 0 }, +	[PM_ISU1] = { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 }, +	[PM_ISU1_ALT] = +		    { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 }, +	[PM_IFU]  = { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 }, +	[PM_IFU_ALT] = +		    { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 }, +	[PM_IDU0] = { 0x10100000000000ull, 0x80840000000000ull, PM_IDU0, 1 }, +	[PM_ISU2] = { 0x10140000000000ull, 0x80840000000000ull, PM_ISU2, 0 }, +	[PM_LSU0] = { 0x01400000000000ull, 0x08800000000000ull, PM_LSU0, 0 }, +	[PM_LSU1] = { 0x00000000000000ull, 0x00010000000000ull, PM_LSU1, 40 }, +	[PM_GPS]  = { 0x00000000000000ull, 0x00000000000000ull, PM_GPS, 0 } +}; + +static unsigned char direct_marked_event[8] = { +	(1<<2) | (1<<3),	/* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */ +	(1<<3) | (1<<5),	/* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */ +	(1<<3),			/* PMC3: PM_MRK_ST_CMPL_INT */ +	(1<<4) | (1<<5),	/* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */ +	(1<<4) | (1<<5),	/* PMC5: PM_MRK_GRP_TIMEO */ +	(1<<3) | (1<<4) | (1<<5), +		/* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */ +	(1<<4) | (1<<5),	/* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */ +	(1<<4),			/* PMC8: PM_MRK_LSU_FIN */ +}; + +/* + * Returns 1 if event counts things relating to marked instructions + * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. + */ +static int p4_marked_instr_event(u64 event) +{ +	int pmc, psel, unit, byte, bit; +	unsigned int mask; + +	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; +	psel = event & PM_PMCSEL_MSK; +	if (pmc) { +		if (direct_marked_event[pmc - 1] & (1 << psel)) +			return 1; +		if (psel == 0)		/* add events */ +			bit = (pmc <= 4)? 
pmc - 1: 8 - pmc; +		else if (psel == 6)	/* decode events */ +			bit = 4; +		else +			return 0; +	} else +		bit = psel; + +	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; +	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; +	mask = 0; +	switch (unit) { +	case PM_LSU1: +		if (event & PM_LOWER_MSKS) +			mask = 1 << 28;		/* byte 7 bit 4 */ +		else +			mask = 6 << 24;		/* byte 3 bits 1 and 2 */ +		break; +	case PM_LSU0: +		/* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */ +		mask = 0x083dff00; +	} +	return (mask >> (byte * 8 + bit)) & 1; +} + +static int p4_get_constraint(u64 event, u64 *maskp, u64 *valp) +{ +	int pmc, byte, unit, lower, sh; +	u64 mask = 0, value = 0; +	int grp = -1; + +	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; +	if (pmc) { +		if (pmc > 8) +			return -1; +		sh = (pmc - 1) * 2; +		mask |= 2 << sh; +		value |= 1 << sh; +		grp = ((pmc - 1) >> 1) & 1; +	} +	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; +	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; +	if (unit) { +		lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK; + +		/* +		 * Bus events on bytes 0 and 2 can be counted +		 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8. +		 */ +		if (!pmc) +			grp = byte & 1; + +		if (!p4_unitinfo[unit].unit) +			return -1; +		mask  |= p4_unitinfo[unit].mask; +		value |= p4_unitinfo[unit].value; +		sh = p4_unitinfo[unit].lowerbit; +		if (sh > 1) +			value |= (u64)lower << sh; +		else if (lower != sh) +			return -1; +		unit = p4_unitinfo[unit].unit; + +		/* Set byte lane select field */ +		mask  |= 0xfULL << (28 - 4 * byte); +		value |= (u64)unit << (28 - 4 * byte); +	} +	if (grp == 0) { +		/* increment PMC1/2/5/6 field */ +		mask  |= 0x8000000000ull; +		value |= 0x1000000000ull; +	} else { +		/* increment PMC3/4/7/8 field */ +		mask  |= 0x800000000ull; +		value |= 0x100000000ull; +	} + +	/* Marked instruction events need sample_enable set */ +	if (p4_marked_instr_event(event)) { +		mask  |= 1ull << 56; +		value |= 1ull << 56; +	} + +	/* PMCSEL=6 decode events on byte 2 need sample_enable clear */ +	if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2) +		mask  |= 1ull << 56; + +	*maskp = mask; +	*valp = value; +	return 0; +} + +static unsigned int ppc_inst_cmpl[] = { +	0x1001, 0x4001, 0x6001, 0x7001, 0x8001 +}; + +static int p4_get_alternatives(u64 event, unsigned int flags, u64 alt[]) +{ +	int i, j, na; + +	alt[0] = event; +	na = 1; + +	/* 2 possibilities for PM_GRP_DISP_REJECT */ +	if (event == 0x8003 || event == 0x0224) { +		alt[1] = event ^ (0x8003 ^ 0x0224); +		return 2; +	} + +	/* 2 possibilities for PM_ST_MISS_L1 */ +	if (event == 0x0c13 || event == 0x0c23) { +		alt[1] = event ^ (0x0c13 ^ 0x0c23); +		return 2; +	} + +	/* several possibilities for PM_INST_CMPL */ +	for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) { +		if (event == ppc_inst_cmpl[i]) { +			for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j) +				if (j != i) +					alt[na++] = ppc_inst_cmpl[j]; +			break; +		} +	} + +	return na; +} + +static int p4_compute_mmcr(u64 event[], int n_ev, +			   unsigned int hwc[], u64 mmcr[]) +{ +	u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0; +	unsigned int pmc, unit, byte, psel, lower; +	unsigned int ttm, grp; +	unsigned int pmc_inuse = 0; +	unsigned int pmc_grp_use[2]; +	unsigned char busbyte[4]; +	unsigned char unituse[16]; +	unsigned int unitlower = 0; +	int i; + +	if (n_ev > 8) +		return -1; + +	/* First pass to count resource use */ +	pmc_grp_use[0] = pmc_grp_use[1] = 0; +	memset(busbyte, 0, sizeof(busbyte)); +	memset(unituse, 0, sizeof(unituse)); +	for (i = 0; i < n_ev; ++i) { +		pmc = (event[i] >> PM_PMC_SH) & 
PM_PMC_MSK; +		if (pmc) { +			if (pmc_inuse & (1 << (pmc - 1))) +				return -1; +			pmc_inuse |= 1 << (pmc - 1); +			/* count 1/2/5/6 vs 3/4/7/8 use */ +			++pmc_grp_use[((pmc - 1) >> 1) & 1]; +		} +		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; +		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; +		lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK; +		if (unit) { +			if (!pmc) +				++pmc_grp_use[byte & 1]; +			if (unit == 6 || unit == 8) +				/* map alt ISU1/IFU codes: 6->2, 8->3 */ +				unit = (unit >> 1) - 1; +			if (busbyte[byte] && busbyte[byte] != unit) +				return -1; +			busbyte[byte] = unit; +			lower <<= unit; +			if (unituse[unit] && lower != (unitlower & lower)) +				return -1; +			unituse[unit] = 1; +			unitlower |= lower; +		} +	} +	if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4) +		return -1; + +	/* +	 * Assign resources and set multiplexer selects. +	 * +	 * Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,10 on TTM2. +	 * Each TTMx can only select one unit, but since +	 * units 2 and 6 are both ISU1, and 3 and 8 are both IFU, +	 * we have some choices. +	 */ +	if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) { +		unituse[6] = 1;		/* Move 2 to 6 */ +		unituse[2] = 0; +	} +	if (unituse[3] & (unituse[1] | unituse[2])) { +		unituse[8] = 1;		/* Move 3 to 8 */ +		unituse[3] = 0; +		unitlower = (unitlower & ~8) | ((unitlower & 8) << 5); +	} +	/* Check only one unit per TTMx */ +	if (unituse[1] + unituse[2] + unituse[3] > 1 || +	    unituse[4] + unituse[6] + unituse[7] > 1 || +	    unituse[8] + unituse[9] > 1 || +	    (unituse[5] | unituse[10] | unituse[11] | +	     unituse[13] | unituse[14])) +		return -1; + +	/* Set TTMxSEL fields.  Note, units 1-3 => TTM0SEL codes 0-2 */ +	mmcr1 |= (u64)(unituse[3] * 2 + unituse[2]) << MMCR1_TTM0SEL_SH; +	mmcr1 |= (u64)(unituse[7] * 3 + unituse[6] * 2) << MMCR1_TTM1SEL_SH; +	mmcr1 |= (u64)unituse[9] << MMCR1_TTM2SEL_SH; + +	/* Set TTCxSEL fields. */ +	if (unitlower & 0xe) +		mmcr1 |= 1ull << MMCR1_TTC0SEL_SH; +	if (unitlower & 0xf0) +		mmcr1 |= 1ull << MMCR1_TTC1SEL_SH; +	if (unitlower & 0xf00) +		mmcr1 |= 1ull << MMCR1_TTC2SEL_SH; +	if (unitlower & 0x7000) +		mmcr1 |= 1ull << MMCR1_TTC3SEL_SH; + +	/* Set byte lane select fields. 
*/ +	for (byte = 0; byte < 4; ++byte) { +		unit = busbyte[byte]; +		if (!unit) +			continue; +		if (unit == 0xf) { +			/* special case for GPS */ +			mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte); +		} else { +			if (!unituse[unit]) +				ttm = unit - 1;		/* 2->1, 3->2 */ +			else +				ttm = unit >> 2; +			mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2*byte); +		} +	} + +	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ +	for (i = 0; i < n_ev; ++i) { +		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; +		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; +		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; +		psel = event[i] & PM_PMCSEL_MSK; +		if (!pmc) { +			/* Bus event or 00xxx direct event (off or cycles) */ +			if (unit) +				psel |= 0x10 | ((byte & 2) << 2); +			for (pmc = 0; pmc < 8; ++pmc) { +				if (pmc_inuse & (1 << pmc)) +					continue; +				grp = (pmc >> 1) & 1; +				if (unit) { +					if (grp == (byte & 1)) +						break; +				} else if (pmc_grp_use[grp] < 4) { +					++pmc_grp_use[grp]; +					break; +				} +			} +			pmc_inuse |= 1 << pmc; +		} else { +			/* Direct event */ +			--pmc; +			if (psel == 0 && (byte & 2)) +				/* add events on higher-numbered bus */ +				mmcr1 |= 1ull << mmcr1_adder_bits[pmc]; +			else if (psel == 6 && byte == 3) +				/* seem to need to set sample_enable here */ +				mmcra |= MMCRA_SAMPLE_ENABLE; +			psel |= 8; +		} +		if (pmc <= 1) +			mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc); +		else +			mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)); +		if (pmc == 7)	/* PMC8 */ +			mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH; +		hwc[i] = pmc; +		if (p4_marked_instr_event(event[i])) +			mmcra |= MMCRA_SAMPLE_ENABLE; +	} + +	if (pmc_inuse & 1) +		mmcr0 |= MMCR0_PMC1CE; +	if (pmc_inuse & 0xfe) +		mmcr0 |= MMCR0_PMCjCE; + +	mmcra |= 0x2000;	/* mark only one IOP per PPC instruction */ + +	/* Return MMCRx values */ +	mmcr[0] = mmcr0; +	mmcr[1] = mmcr1; +	mmcr[2] = mmcra; +	return 0; +} + +static void p4_disable_pmc(unsigned int pmc, u64 mmcr[]) +{ +	/* +	 * Setting the PMCxSEL field to 0 disables PMC x. +	 * (Note that pmc is 0-based here, not 1-based.) +	 */ +	if (pmc <= 1) { +		mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc)); +	} else { +		mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2))); +		if (pmc == 7) +			mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH); +	} +} + +static int p4_generic_events[] = { +	[PERF_COUNT_HW_CPU_CYCLES]		= 7, +	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x1001, +	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x8c10, /* PM_LD_REF_L1 */ +	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3c10, /* PM_LD_MISS_L1 */ +	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x330,  /* PM_BR_ISSUED */ +	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x331,  /* PM_BR_MPRED_CR */ +}; + +#define C(x)	PERF_COUNT_HW_CACHE_##x + +/* + * Table of generalized cache-related events. + * 0 means not supported, -1 means nonsensical, other values + * are event codes. 
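+ * + * For example, assuming the usual generic cache-event packing of + * type | (op << 8) | (result << 16) in the core code, an L1D read + * miss resolves through this table to 0x3c10 (PM_LD_MISS_L1), the + * same code used for PERF_COUNT_HW_CACHE_MISSES above.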
+ */ +static int power4_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { +	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0x8c10,		0x3c10	}, +		[C(OP_WRITE)] = {	0x7c10,		0xc13	}, +		[C(OP_PREFETCH)] = {	0xc35,		0	}, +	}, +	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0	}, +		[C(OP_WRITE)] = {	-1,		-1	}, +		[C(OP_PREFETCH)] = {	0,		0	}, +	}, +	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0	}, +		[C(OP_WRITE)] = {	0,		0	}, +		[C(OP_PREFETCH)] = {	0xc34,		0	}, +	}, +	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0x904	}, +		[C(OP_WRITE)] = {	-1,		-1	}, +		[C(OP_PREFETCH)] = {	-1,		-1	}, +	}, +	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0x900	}, +		[C(OP_WRITE)] = {	-1,		-1	}, +		[C(OP_PREFETCH)] = {	-1,		-1	}, +	}, +	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0x330,		0x331	}, +		[C(OP_WRITE)] = {	-1,		-1	}, +		[C(OP_PREFETCH)] = {	-1,		-1	}, +	}, +}; + +struct power_pmu power4_pmu = { +	.n_counter = 8, +	.max_alternatives = 5, +	.add_fields = 0x0000001100005555ull, +	.test_adder = 0x0011083300000000ull, +	.compute_mmcr = p4_compute_mmcr, +	.get_constraint = p4_get_constraint, +	.get_alternatives = p4_get_alternatives, +	.disable_pmc = p4_disable_pmc, +	.n_generic = ARRAY_SIZE(p4_generic_events), +	.generic_events = p4_generic_events, +	.cache_events = &power4_cache_events, +}; diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c new file mode 100644 index 00000000000..41e5d2d958d --- /dev/null +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -0,0 +1,671 @@ +/* + * Performance counter support for POWER5+/++ (not POWER5) processors. + * + * Copyright 2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include <linux/kernel.h> +#include <linux/perf_counter.h> +#include <asm/reg.h> + +/* + * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3) + */ +#define PM_PMC_SH	20	/* PMC number (1-based) for direct events */ +#define PM_PMC_MSK	0xf +#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH) +#define PM_UNIT_SH	16	/* TTMMUX number and setting - unit select */ +#define PM_UNIT_MSK	0xf +#define PM_BYTE_SH	12	/* Byte number of event bus to use */ +#define PM_BYTE_MSK	7 +#define PM_GRS_SH	8	/* Storage subsystem mux select */ +#define PM_GRS_MSK	7 +#define PM_BUSEVENT_MSK	0x80	/* Set if event uses event bus */ +#define PM_PMCSEL_MSK	0x7f + +/* Values in PM_UNIT field */ +#define PM_FPU		0 +#define PM_ISU0		1 +#define PM_IFU		2 +#define PM_ISU1		3 +#define PM_IDU		4 +#define PM_ISU0_ALT	6 +#define PM_GRS		7 +#define PM_LSU0		8 +#define PM_LSU1		0xc +#define PM_LASTUNIT	0xc + +/* + * Bits in MMCR1 for POWER5+ + */ +#define MMCR1_TTM0SEL_SH	62 +#define MMCR1_TTM1SEL_SH	60 +#define MMCR1_TTM2SEL_SH	58 +#define MMCR1_TTM3SEL_SH	56 +#define MMCR1_TTMSEL_MSK	3 +#define MMCR1_TD_CP_DBG0SEL_SH	54 +#define MMCR1_TD_CP_DBG1SEL_SH	52 +#define MMCR1_TD_CP_DBG2SEL_SH	50 +#define MMCR1_TD_CP_DBG3SEL_SH	48 +#define MMCR1_GRS_L2SEL_SH	46 +#define MMCR1_GRS_L2SEL_MSK	3 +#define MMCR1_GRS_L3SEL_SH	44 +#define MMCR1_GRS_L3SEL_MSK	3 +#define MMCR1_GRS_MCSEL_SH	41 +#define MMCR1_GRS_MCSEL_MSK	7 +#define MMCR1_GRS_FABSEL_SH	39 +#define MMCR1_GRS_FABSEL_MSK	3 +#define MMCR1_PMC1_ADDER_SEL_SH	35 +#define MMCR1_PMC2_ADDER_SEL_SH	34 +#define MMCR1_PMC3_ADDER_SEL_SH	33 +#define MMCR1_PMC4_ADDER_SEL_SH	32 +#define MMCR1_PMC1SEL_SH	25 +#define MMCR1_PMC2SEL_SH	17 +#define MMCR1_PMC3SEL_SH	9 +#define MMCR1_PMC4SEL_SH	1 +#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8) +#define MMCR1_PMCSEL_MSK	0x7f + +/* + * Bits in MMCRA + */ + +/* + * Layout of constraint bits: + * 6666555555555544444444443333333333222222222211111111110000000000 + * 3210987654321098765432109876543210987654321098765432109876543210 + *             [  ><><>< ><> <><>[  >  <  ><  ><  ><  ><><><><><><> + *             NC  G0G1G2 G3 T0T1 UC    B0  B1  B2  B3 P6P5P4P3P2P1 + * + * NC - number of counters + *     51: NC error 0x0008_0000_0000_0000 + *     48-50: number of events needing PMC1-4 0x0007_0000_0000_0000 + * + * G0..G3 - GRS mux constraints + *     46-47: GRS_L2SEL value + *     44-45: GRS_L3SEL value + *     41-43: GRS_MCSEL value + *     39-40: GRS_FABSEL value + *	Note that these match up with their bit positions in MMCR1 + * + * T0 - TTM0 constraint + *     36-37: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0x30_0000_0000 + * + * T1 - TTM1 constraint + *     34-35: TTM1SEL value (0=IDU, 3=GRS) 0x0c_0000_0000 + * + * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS + *     33: UC3 error 0x02_0000_0000 + *     32: FPU|IFU|ISU1 events needed 0x01_0000_0000 + *     31: ISU0 events needed 0x00_8000_0000 + *     30: IDU|GRS events needed 0x00_4000_0000 + * + * B0 + *     24-27: Byte 0 event source 0x0f00_0000 + *	      Encoding as for the event code + * + * B1, B2, B3 + *     20-23, 16-19, 12-15: Byte 1, 2, 3 event sources + * + * P6 + *     11: P6 error 0x800 + *     10-11: Count of events needing PMC6 + * + * P1..P5 + *     0-9: Count of events needing PMC1..PMC5 + */ + +static const int grsel_shift[8] = { +	MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, +	MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, +	MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH +}; + +/* Masks and values for using 
events from the various units */ +static u64 unit_cons[PM_LASTUNIT+1][2] = { +	[PM_FPU] =   { 0x3200000000ull, 0x0100000000ull }, +	[PM_ISU0] =  { 0x0200000000ull, 0x0080000000ull }, +	[PM_ISU1] =  { 0x3200000000ull, 0x3100000000ull }, +	[PM_IFU] =   { 0x3200000000ull, 0x2100000000ull }, +	[PM_IDU] =   { 0x0e00000000ull, 0x0040000000ull }, +	[PM_GRS] =   { 0x0e00000000ull, 0x0c40000000ull }, +}; + +static int power5p_get_constraint(u64 event, u64 *maskp, u64 *valp) +{ +	int pmc, byte, unit, sh; +	int bit, fmask; +	u64 mask = 0, value = 0; + +	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; +	if (pmc) { +		if (pmc > 6) +			return -1; +		sh = (pmc - 1) * 2; +		mask |= 2 << sh; +		value |= 1 << sh; +		if (pmc >= 5 && !(event == 0x500009 || event == 0x600005)) +			return -1; +	} +	if (event & PM_BUSEVENT_MSK) { +		unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; +		if (unit > PM_LASTUNIT) +			return -1; +		if (unit == PM_ISU0_ALT) +			unit = PM_ISU0; +		mask |= unit_cons[unit][0]; +		value |= unit_cons[unit][1]; +		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; +		if (byte >= 4) { +			if (unit != PM_LSU1) +				return -1; +			/* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */ +			++unit; +			byte &= 3; +		} +		if (unit == PM_GRS) { +			bit = event & 7; +			fmask = (bit == 6)? 7: 3; +			sh = grsel_shift[bit]; +			mask |= (u64)fmask << sh; +			value |= (u64)((event >> PM_GRS_SH) & fmask) << sh; +		} +		/* Set byte lane select field */ +		mask  |= 0xfULL << (24 - 4 * byte); +		value |= (u64)unit << (24 - 4 * byte); +	} +	if (pmc < 5) { +		/* need a counter from PMC1-4 set */ +		mask  |= 0x8000000000000ull; +		value |= 0x1000000000000ull; +	} +	*maskp = mask; +	*valp = value; +	return 0; +} + +static int power5p_limited_pmc_event(u64 event) +{ +	int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + +	return pmc == 5 || pmc == 6; +} + +#define MAX_ALT	3	/* at most 3 alternatives for any event */ + +static const unsigned int event_alternatives[][MAX_ALT] = { +	{ 0x100c0,  0x40001f },			/* PM_GCT_FULL_CYC */ +	{ 0x120e4,  0x400002 },			/* PM_GRP_DISP_REJECT */ +	{ 0x230e2,  0x323087 },			/* PM_BR_PRED_CR */ +	{ 0x230e3,  0x223087, 0x3230a0 },	/* PM_BR_PRED_TA */ +	{ 0x410c7,  0x441084 },			/* PM_THRD_L2MISS_BOTH_CYC */ +	{ 0x800c4,  0xc20e0 },			/* PM_DTLB_MISS */ +	{ 0xc50c6,  0xc60e0 },			/* PM_MRK_DTLB_MISS */ +	{ 0x100005, 0x600005 },			/* PM_RUN_CYC */ +	{ 0x100009, 0x200009 },			/* PM_INST_CMPL */ +	{ 0x200015, 0x300015 },			/* PM_LSU_LMQ_SRQ_EMPTY_CYC */ +	{ 0x300009, 0x400009 },			/* PM_INST_DISP */ +}; + +/* + * Scan the alternatives table for a match and return the + * index into the alternatives table if found, else -1. + */ +static int find_alternative(unsigned int event) +{ +	int i, j; + +	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { +		if (event < event_alternatives[i][0]) +			break; +		for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) +			if (event == event_alternatives[i][j]) +				return i; +	} +	return -1; +} + +static const unsigned char bytedecode_alternatives[4][4] = { +	/* PMC 1 */	{ 0x21, 0x23, 0x25, 0x27 }, +	/* PMC 2 */	{ 0x07, 0x17, 0x0e, 0x1e }, +	/* PMC 3 */	{ 0x20, 0x22, 0x24, 0x26 }, +	/* PMC 4 */	{ 0x07, 0x17, 0x0e, 0x1e } +}; + +/* + * Some direct events for decodes of event bus byte 3 have alternative + * PMCSEL values on other counters.  This returns the alternative + * event code for those that do, or -1 otherwise.  This also handles + * alternative PMCSEL values for add events. 
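+ * + * Worked example (illustrative): PMC 1 with PMCSEL 0x23 matches + * bytedecode_alternatives[0][1], so the alternative uses PMC 4 + * (altpmc = 5 - pmc) with PMCSEL bytedecode_alternatives[3][1] = + * 0x17, leaving the unit and byte fields unchanged.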
+ */ +static s64 find_alternative_bdecode(u64 event) +{ +	int pmc, altpmc, pp, j; + +	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; +	if (pmc == 0 || pmc > 4) +		return -1; +	altpmc = 5 - pmc;	/* 1 <-> 4, 2 <-> 3 */ +	pp = event & PM_PMCSEL_MSK; +	for (j = 0; j < 4; ++j) { +		if (bytedecode_alternatives[pmc - 1][j] == pp) { +			return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) | +				(altpmc << PM_PMC_SH) | +				bytedecode_alternatives[altpmc - 1][j]; +		} +	} + +	/* new decode alternatives for power5+ */ +	if (pmc == 1 && (pp == 0x0d || pp == 0x0e)) +		return event + (2 << PM_PMC_SH) + (0x2e - 0x0d); +	if (pmc == 3 && (pp == 0x2e || pp == 0x2f)) +		return event - (2 << PM_PMC_SH) - (0x2e - 0x0d); + +	/* alternative add event encodings */ +	if (pp == 0x10 || pp == 0x28) +		return ((event ^ (0x10 ^ 0x28)) & ~PM_PMC_MSKS) | +			(altpmc << PM_PMC_SH); + +	return -1; +} + +static int power5p_get_alternatives(u64 event, unsigned int flags, u64 alt[]) +{ +	int i, j, nalt = 1; +	int nlim; +	s64 ae; + +	alt[0] = event; +	nalt = 1; +	nlim = power5p_limited_pmc_event(event); +	i = find_alternative(event); +	if (i >= 0) { +		for (j = 0; j < MAX_ALT; ++j) { +			ae = event_alternatives[i][j]; +			if (ae && ae != event) +				alt[nalt++] = ae; +			nlim += power5p_limited_pmc_event(ae); +		} +	} else { +		ae = find_alternative_bdecode(event); +		if (ae > 0) +			alt[nalt++] = ae; +	} + +	if (flags & PPMU_ONLY_COUNT_RUN) { +		/* +		 * We're only counting in RUN state, +		 * so PM_CYC is equivalent to PM_RUN_CYC +		 * and PM_INST_CMPL === PM_RUN_INST_CMPL. +		 * This doesn't include alternatives that don't provide +		 * any extra flexibility in assigning PMCs (e.g. +		 * 0x100005 for PM_RUN_CYC vs. 0xf for PM_CYC). +		 * Note that even with these additional alternatives +		 * we never end up with more than 3 alternatives for any event. +		 */ +		j = nalt; +		for (i = 0; i < nalt; ++i) { +			switch (alt[i]) { +			case 0xf:	/* PM_CYC */ +				alt[j++] = 0x600005;	/* PM_RUN_CYC */ +				++nlim; +				break; +			case 0x600005:	/* PM_RUN_CYC */ +				alt[j++] = 0xf; +				break; +			case 0x100009:	/* PM_INST_CMPL */ +				alt[j++] = 0x500009;	/* PM_RUN_INST_CMPL */ +				++nlim; +				break; +			case 0x500009:	/* PM_RUN_INST_CMPL */ +				alt[j++] = 0x100009;	/* PM_INST_CMPL */ +				alt[j++] = 0x200009; +				break; +			} +		} +		nalt = j; +	} + +	if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) { +		/* remove the limited PMC events */ +		j = 0; +		for (i = 0; i < nalt; ++i) { +			if (!power5p_limited_pmc_event(alt[i])) { +				alt[j] = alt[i]; +				++j; +			} +		} +		nalt = j; +	} else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) { +		/* remove all but the limited PMC events */ +		j = 0; +		for (i = 0; i < nalt; ++i) { +			if (power5p_limited_pmc_event(alt[i])) { +				alt[j] = alt[i]; +				++j; +			} +		} +		nalt = j; +	} + +	return nalt; +} + +/* + * Map of which direct events on which PMCs are marked instruction events. + * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event. + * Bit 0 is set if it is marked for all PMCs. + * The 0x80 bit indicates a byte decode PMCSEL value. 
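+ * + * Reading the table (example): entry 0x03 below is 0xe = 0b1110, so + * PMCSEL 0x03 counts marked instructions on PMC 1, 2 and 3, but not + * as an any-PMC event (bit 0 is clear).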
+ */ +static unsigned char direct_event_is_marked[0x28] = { +	0,	/* 00 */ +	0x1f,	/* 01 PM_IOPS_CMPL */ +	0x2,	/* 02 PM_MRK_GRP_DISP */ +	0xe,	/* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */ +	0,	/* 04 */ +	0x1c,	/* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */ +	0x80,	/* 06 */ +	0x80,	/* 07 */ +	0, 0, 0,/* 08 - 0a */ +	0x18,	/* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */ +	0,	/* 0c */ +	0x80,	/* 0d */ +	0x80,	/* 0e */ +	0,	/* 0f */ +	0,	/* 10 */ +	0x14,	/* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */ +	0,	/* 12 */ +	0x10,	/* 13 PM_MRK_GRP_CMPL */ +	0x1f,	/* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */ +	0x2,	/* 15 PM_MRK_GRP_ISSUED */ +	0x80,	/* 16 */ +	0x80,	/* 17 */ +	0, 0, 0, 0, 0, +	0x80,	/* 1d */ +	0x80,	/* 1e */ +	0,	/* 1f */ +	0x80,	/* 20 */ +	0x80,	/* 21 */ +	0x80,	/* 22 */ +	0x80,	/* 23 */ +	0x80,	/* 24 */ +	0x80,	/* 25 */ +	0x80,	/* 26 */ +	0x80,	/* 27 */ +}; + +/* + * Returns 1 if event counts things relating to marked instructions + * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. + */ +static int power5p_marked_instr_event(u64 event) +{ +	int pmc, psel; +	int bit, byte, unit; +	u32 mask; + +	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; +	psel = event & PM_PMCSEL_MSK; +	if (pmc >= 5) +		return 0; + +	bit = -1; +	if (psel < sizeof(direct_event_is_marked)) { +		if (direct_event_is_marked[psel] & (1 << pmc)) +			return 1; +		if (direct_event_is_marked[psel] & 0x80) +			bit = 4; +		else if (psel == 0x08) +			bit = pmc - 1; +		else if (psel == 0x10) +			bit = 4 - pmc; +		else if (psel == 0x1b && (pmc == 1 || pmc == 3)) +			bit = 4; +	} else if ((psel & 0x48) == 0x40) { +		bit = psel & 7; +	} else if (psel == 0x28) { +		bit = pmc - 1; +	} else if (pmc == 3 && (psel == 0x2e || psel == 0x2f)) { +		bit = 4; +	} + +	if (!(event & PM_BUSEVENT_MSK) || bit == -1) +		return 0; + +	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; +	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; +	if (unit == PM_LSU0) { +		/* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */ +		mask = 0x5dff00; +	} else if (unit == PM_LSU1 && byte >= 4) { +		byte -= 4; +		/* byte 5 bits 6-7, byte 6 bits 0,4, byte 7 bits 0-4,6 */ +		mask = 0x5f11c000; +	} else +		return 0; + +	return (mask >> (byte * 8 + bit)) & 1; +} + +static int power5p_compute_mmcr(u64 event[], int n_ev, +				unsigned int hwc[], u64 mmcr[]) +{ +	u64 mmcr1 = 0; +	u64 mmcra = 0; +	unsigned int pmc, unit, byte, psel; +	unsigned int ttm; +	int i, isbus, bit, grsel; +	unsigned int pmc_inuse = 0; +	unsigned char busbyte[4]; +	unsigned char unituse[16]; +	int ttmuse; + +	if (n_ev > 6) +		return -1; + +	/* First pass to count resource use */ +	memset(busbyte, 0, sizeof(busbyte)); +	memset(unituse, 0, sizeof(unituse)); +	for (i = 0; i < n_ev; ++i) { +		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; +		if (pmc) { +			if (pmc > 6) +				return -1; +			if (pmc_inuse & (1 << (pmc - 1))) +				return -1; +			pmc_inuse |= 1 << (pmc - 1); +		} +		if (event[i] & PM_BUSEVENT_MSK) { +			unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; +			byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; +			if (unit > PM_LASTUNIT) +				return -1; +			if (unit == PM_ISU0_ALT) +				unit = PM_ISU0; +			if (byte >= 4) { +				if (unit != PM_LSU1) +					return -1; +				++unit; +				byte &= 3; +			} +			if (busbyte[byte] && busbyte[byte] != unit) +				return -1; +			busbyte[byte] = unit; +			unituse[unit] = 1; +		} +	} + +	/* +	 * Assign resources and set multiplexer selects. +	 * +	 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only +	 * choice we have to deal with. 
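+	 * +	 * Concretely (a sketch of the code below): if ISU0 is used +	 * together with FPU, IFU or ISU1, which all compete for TTM0, +	 * ISU0 is routed through TTM1 via its alternate code PM_ISU0_ALT.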
+	 */ +	if (unituse[PM_ISU0] & +	    (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) { +		unituse[PM_ISU0_ALT] = 1;	/* move ISU to TTM1 */ +		unituse[PM_ISU0] = 0; +	} +	/* Set TTM[01]SEL fields. */ +	ttmuse = 0; +	for (i = PM_FPU; i <= PM_ISU1; ++i) { +		if (!unituse[i]) +			continue; +		if (ttmuse++) +			return -1; +		mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH; +	} +	ttmuse = 0; +	for (; i <= PM_GRS; ++i) { +		if (!unituse[i]) +			continue; +		if (ttmuse++) +			return -1; +		mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH; +	} +	if (ttmuse > 1) +		return -1; + +	/* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */ +	for (byte = 0; byte < 4; ++byte) { +		unit = busbyte[byte]; +		if (!unit) +			continue; +		if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) { +			/* get ISU0 through TTM1 rather than TTM0 */ +			unit = PM_ISU0_ALT; +		} else if (unit == PM_LSU1 + 1) { +			/* select lower word of LSU1 for this byte */ +			mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte); +		} +		ttm = unit >> 2; +		mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); +	} + +	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ +	for (i = 0; i < n_ev; ++i) { +		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; +		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; +		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; +		psel = event[i] & PM_PMCSEL_MSK; +		isbus = event[i] & PM_BUSEVENT_MSK; +		if (!pmc) { +			/* Bus event or any-PMC direct event */ +			for (pmc = 0; pmc < 4; ++pmc) { +				if (!(pmc_inuse & (1 << pmc))) +					break; +			} +			if (pmc >= 4) +				return -1; +			pmc_inuse |= 1 << pmc; +		} else if (pmc <= 4) { +			/* Direct event */ +			--pmc; +			if (isbus && (byte & 2) && +			    (psel == 8 || psel == 0x10 || psel == 0x28)) +				/* add events on higher-numbered bus */ +				mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc); +		} else { +			/* Instructions or run cycles on PMC5/6 */ +			--pmc; +		} +		if (isbus && unit == PM_GRS) { +			bit = psel & 7; +			grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK; +			mmcr1 |= (u64)grsel << grsel_shift[bit]; +		} +		if (power5p_marked_instr_event(event[i])) +			mmcra |= MMCRA_SAMPLE_ENABLE; +		if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1)) +			/* select alternate byte lane */ +			psel |= 0x10; +		if (pmc <= 3) +			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc); +		hwc[i] = pmc; +	} + +	/* Return MMCRx values */ +	mmcr[0] = 0; +	if (pmc_inuse & 1) +		mmcr[0] = MMCR0_PMC1CE; +	if (pmc_inuse & 0x3e) +		mmcr[0] |= MMCR0_PMCjCE; +	mmcr[1] = mmcr1; +	mmcr[2] = mmcra; +	return 0; +} + +static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[]) +{ +	if (pmc <= 3) +		mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc)); +} + +static int power5p_generic_events[] = { +	[PERF_COUNT_HW_CPU_CYCLES]		= 0xf, +	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x100009, +	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x1c10a8, /* LD_REF_L1 */ +	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3c1088, /* LD_MISS_L1 */ +	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x230e4,  /* BR_ISSUED */ +	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x230e5,  /* BR_MPRED_CR */ +}; + +#define C(x)	PERF_COUNT_HW_CACHE_##x + +/* + * Table of generalized cache-related events. + * 0 means not supported, -1 means nonsensical, other values + * are event codes. 
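+ * + * E.g. the L1D OP_READ row below pairs 0x1c10a8 (LD_REF_L1) with + * 0x3c1088 (LD_MISS_L1), the same codes used for the generic cache + * reference and miss events above.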
+ */ +static int power5p_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { +	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0x1c10a8,	0x3c1088	}, +		[C(OP_WRITE)] = {	0x2c10a8,	0xc10c3		}, +		[C(OP_PREFETCH)] = {	0xc70e7,	-1		}, +	}, +	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0		}, +		[C(OP_WRITE)] = {	-1,		-1		}, +		[C(OP_PREFETCH)] = {	0,		0		}, +	}, +	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0		}, +		[C(OP_WRITE)] = {	0,		0		}, +		[C(OP_PREFETCH)] = {	0xc50c3,	0		}, +	}, +	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0xc20e4,	0x800c4		}, +		[C(OP_WRITE)] = {	-1,		-1		}, +		[C(OP_PREFETCH)] = {	-1,		-1		}, +	}, +	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0x800c0		}, +		[C(OP_WRITE)] = {	-1,		-1		}, +		[C(OP_PREFETCH)] = {	-1,		-1		}, +	}, +	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0x230e4,	0x230e5		}, +		[C(OP_WRITE)] = {	-1,		-1		}, +		[C(OP_PREFETCH)] = {	-1,		-1		}, +	}, +}; + +struct power_pmu power5p_pmu = { +	.n_counter = 6, +	.max_alternatives = MAX_ALT, +	.add_fields = 0x7000000000055ull, +	.test_adder = 0x3000040000000ull, +	.compute_mmcr = power5p_compute_mmcr, +	.get_constraint = power5p_get_constraint, +	.get_alternatives = power5p_get_alternatives, +	.disable_pmc = power5p_disable_pmc, +	.limited_pmc_event = power5p_limited_pmc_event, +	.flags = PPMU_LIMITED_PMC5_6, +	.n_generic = ARRAY_SIZE(power5p_generic_events), +	.generic_events = power5p_generic_events, +	.cache_events = &power5p_cache_events, +}; diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c new file mode 100644 index 00000000000..05600b66221 --- /dev/null +++ b/arch/powerpc/kernel/power5-pmu.c @@ -0,0 +1,611 @@ +/* + * Performance counter support for POWER5 (not POWER5++) processors. + * + * Copyright 2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include <linux/kernel.h> +#include <linux/perf_counter.h> +#include <asm/reg.h> + +/* + * Bits in event code for POWER5 (not POWER5++) + */ +#define PM_PMC_SH	20	/* PMC number (1-based) for direct events */ +#define PM_PMC_MSK	0xf +#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH) +#define PM_UNIT_SH	16	/* TTMMUX number and setting - unit select */ +#define PM_UNIT_MSK	0xf +#define PM_BYTE_SH	12	/* Byte number of event bus to use */ +#define PM_BYTE_MSK	7 +#define PM_GRS_SH	8	/* Storage subsystem mux select */ +#define PM_GRS_MSK	7 +#define PM_BUSEVENT_MSK	0x80	/* Set if event uses event bus */ +#define PM_PMCSEL_MSK	0x7f + +/* Values in PM_UNIT field */ +#define PM_FPU		0 +#define PM_ISU0		1 +#define PM_IFU		2 +#define PM_ISU1		3 +#define PM_IDU		4 +#define PM_ISU0_ALT	6 +#define PM_GRS		7 +#define PM_LSU0		8 +#define PM_LSU1		0xc +#define PM_LASTUNIT	0xc + +/* + * Bits in MMCR1 for POWER5 + */ +#define MMCR1_TTM0SEL_SH	62 +#define MMCR1_TTM1SEL_SH	60 +#define MMCR1_TTM2SEL_SH	58 +#define MMCR1_TTM3SEL_SH	56 +#define MMCR1_TTMSEL_MSK	3 +#define MMCR1_TD_CP_DBG0SEL_SH	54 +#define MMCR1_TD_CP_DBG1SEL_SH	52 +#define MMCR1_TD_CP_DBG2SEL_SH	50 +#define MMCR1_TD_CP_DBG3SEL_SH	48 +#define MMCR1_GRS_L2SEL_SH	46 +#define MMCR1_GRS_L2SEL_MSK	3 +#define MMCR1_GRS_L3SEL_SH	44 +#define MMCR1_GRS_L3SEL_MSK	3 +#define MMCR1_GRS_MCSEL_SH	41 +#define MMCR1_GRS_MCSEL_MSK	7 +#define MMCR1_GRS_FABSEL_SH	39 +#define MMCR1_GRS_FABSEL_MSK	3 +#define MMCR1_PMC1_ADDER_SEL_SH	35 +#define MMCR1_PMC2_ADDER_SEL_SH	34 +#define MMCR1_PMC3_ADDER_SEL_SH	33 +#define MMCR1_PMC4_ADDER_SEL_SH	32 +#define MMCR1_PMC1SEL_SH	25 +#define MMCR1_PMC2SEL_SH	17 +#define MMCR1_PMC3SEL_SH	9 +#define MMCR1_PMC4SEL_SH	1 +#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8) +#define MMCR1_PMCSEL_MSK	0x7f + +/* + * Bits in MMCRA + */ + +/* + * Layout of constraint bits: + * 6666555555555544444444443333333333222222222211111111110000000000 + * 3210987654321098765432109876543210987654321098765432109876543210 + *         <><>[  ><><>< ><> [  >[ >[ ><  ><  ><  ><  ><><><><><><> + *         T0T1 NC G0G1G2 G3  UC PS1PS2 B0  B1  B2  B3 P6P5P4P3P2P1 + * + * T0 - TTM0 constraint + *     54-55: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0xc0_0000_0000_0000 + * + * T1 - TTM1 constraint + *     52-53: TTM1SEL value (0=IDU, 3=GRS) 0x30_0000_0000_0000 + * + * NC - number of counters + *     51: NC error 0x0008_0000_0000_0000 + *     48-50: number of events needing PMC1-4 0x0007_0000_0000_0000 + * + * G0..G3 - GRS mux constraints + *     46-47: GRS_L2SEL value + *     44-45: GRS_L3SEL value + *     41-43: GRS_MCSEL value + *     39-40: GRS_FABSEL value + *	Note that these match up with their bit positions in MMCR1 + * + * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS + *     37: UC3 error 0x20_0000_0000 + *     36: FPU|IFU|ISU1 events needed 0x10_0000_0000 + *     35: ISU0 events needed 0x08_0000_0000 + *     34: IDU|GRS events needed 0x04_0000_0000 + * + * PS1 + *     33: PS1 error 0x2_0000_0000 + *     31-32: count of events needing PMC1/2 0x1_8000_0000 + * + * PS2 + *     30: PS2 error 0x4000_0000 + *     28-29: count of events needing PMC3/4 0x3000_0000 + * + * B0 + *     24-27: Byte 0 event source 0x0f00_0000 + *	      Encoding as for the event code + * + * B1, B2, B3 + *     20-23, 16-19, 12-15: Byte 1, 2, 3 event sources + * + * P1..P6 + *     0-11: Count of events needing PMC1..PMC6 + */ + +static const int grsel_shift[8] = { +	MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, +	MMCR1_GRS_L3SEL_SH, 
MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, +	MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH +}; + +/* Masks and values for using events from the various units */ +static u64 unit_cons[PM_LASTUNIT+1][2] = { +	[PM_FPU] =   { 0xc0002000000000ull, 0x00001000000000ull }, +	[PM_ISU0] =  { 0x00002000000000ull, 0x00000800000000ull }, +	[PM_ISU1] =  { 0xc0002000000000ull, 0xc0001000000000ull }, +	[PM_IFU] =   { 0xc0002000000000ull, 0x80001000000000ull }, +	[PM_IDU] =   { 0x30002000000000ull, 0x00000400000000ull }, +	[PM_GRS] =   { 0x30002000000000ull, 0x30000400000000ull }, +}; + +static int power5_get_constraint(u64 event, u64 *maskp, u64 *valp) +{ +	int pmc, byte, unit, sh; +	int bit, fmask; +	u64 mask = 0, value = 0; +	int grp = -1; + +	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; +	if (pmc) { +		if (pmc > 6) +			return -1; +		sh = (pmc - 1) * 2; +		mask |= 2 << sh; +		value |= 1 << sh; +		if (pmc <= 4) +			grp = (pmc - 1) >> 1; +		else if (event != 0x500009 && event != 0x600005) +			return -1; +	} +	if (event & PM_BUSEVENT_MSK) { +		unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; +		if (unit > PM_LASTUNIT) +			return -1; +		if (unit == PM_ISU0_ALT) +			unit = PM_ISU0; +		mask |= unit_cons[unit][0]; +		value |= unit_cons[unit][1]; +		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; +		if (byte >= 4) { +			if (unit != PM_LSU1) +				return -1; +			/* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */ +			++unit; +			byte &= 3; +		} +		if (unit == PM_GRS) { +			bit = event & 7; +			fmask = (bit == 6)? 7: 3; +			sh = grsel_shift[bit]; +			mask |= (u64)fmask << sh; +			value |= (u64)((event >> PM_GRS_SH) & fmask) << sh; +		} +		/* +		 * Bus events on bytes 0 and 2 can be counted +		 * on PMC1/2; bytes 1 and 3 on PMC3/4. +		 */ +		if (!pmc) +			grp = byte & 1; +		/* Set byte lane select field */ +		mask  |= 0xfULL << (24 - 4 * byte); +		value |= (u64)unit << (24 - 4 * byte); +	} +	if (grp == 0) { +		/* increment PMC1/2 field */ +		mask  |= 0x200000000ull; +		value |= 0x080000000ull; +	} else if (grp == 1) { +		/* increment PMC3/4 field */ +		mask  |= 0x40000000ull; +		value |= 0x10000000ull; +	} +	if (pmc < 5) { +		/* need a counter from PMC1-4 set */ +		mask  |= 0x8000000000000ull; +		value |= 0x1000000000000ull; +	} +	*maskp = mask; +	*valp = value; +	return 0; +} + +#define MAX_ALT	3	/* at most 3 alternatives for any event */ + +static const unsigned int event_alternatives[][MAX_ALT] = { +	{ 0x120e4,  0x400002 },			/* PM_GRP_DISP_REJECT */ +	{ 0x410c7,  0x441084 },			/* PM_THRD_L2MISS_BOTH_CYC */ +	{ 0x100005, 0x600005 },			/* PM_RUN_CYC */ +	{ 0x100009, 0x200009, 0x500009 },	/* PM_INST_CMPL */ +	{ 0x300009, 0x400009 },			/* PM_INST_DISP */ +}; + +/* + * Scan the alternatives table for a match and return the + * index into the alternatives table if found, else -1. + */ +static int find_alternative(u64 event) +{ +	int i, j; + +	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { +		if (event < event_alternatives[i][0]) +			break; +		for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) +			if (event == event_alternatives[i][j]) +				return i; +	} +	return -1; +} + +static const unsigned char bytedecode_alternatives[4][4] = { +	/* PMC 1 */	{ 0x21, 0x23, 0x25, 0x27 }, +	/* PMC 2 */	{ 0x07, 0x17, 0x0e, 0x1e }, +	/* PMC 3 */	{ 0x20, 0x22, 0x24, 0x26 }, +	/* PMC 4 */	{ 0x07, 0x17, 0x0e, 0x1e } +}; + +/* + * Some direct events for decodes of event bus byte 3 have alternative + * PMCSEL values on other counters.  This returns the alternative + * event code for those that do, or -1 otherwise. 
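+ * + * The mapping is symmetric: PMC n pairs with PMC 5 - n, and the + * PMCSEL value is translated through the same column of the + * bytedecode_alternatives table.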
+ */ +static s64 find_alternative_bdecode(u64 event) +{ +	int pmc, altpmc, pp, j; + +	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; +	if (pmc == 0 || pmc > 4) +		return -1; +	altpmc = 5 - pmc;	/* 1 <-> 4, 2 <-> 3 */ +	pp = event & PM_PMCSEL_MSK; +	for (j = 0; j < 4; ++j) { +		if (bytedecode_alternatives[pmc - 1][j] == pp) { +			return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) | +				(altpmc << PM_PMC_SH) | +				bytedecode_alternatives[altpmc - 1][j]; +		} +	} +	return -1; +} + +static int power5_get_alternatives(u64 event, unsigned int flags, u64 alt[]) +{ +	int i, j, nalt = 1; +	s64 ae; + +	alt[0] = event; +	nalt = 1; +	i = find_alternative(event); +	if (i >= 0) { +		for (j = 0; j < MAX_ALT; ++j) { +			ae = event_alternatives[i][j]; +			if (ae && ae != event) +				alt[nalt++] = ae; +		} +	} else { +		ae = find_alternative_bdecode(event); +		if (ae > 0) +			alt[nalt++] = ae; +	} +	return nalt; +} + +/* + * Map of which direct events on which PMCs are marked instruction events. + * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event. + * Bit 0 is set if it is marked for all PMCs. + * The 0x80 bit indicates a byte decode PMCSEL value. + */ +static unsigned char direct_event_is_marked[0x28] = { +	0,	/* 00 */ +	0x1f,	/* 01 PM_IOPS_CMPL */ +	0x2,	/* 02 PM_MRK_GRP_DISP */ +	0xe,	/* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */ +	0,	/* 04 */ +	0x1c,	/* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */ +	0x80,	/* 06 */ +	0x80,	/* 07 */ +	0, 0, 0,/* 08 - 0a */ +	0x18,	/* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */ +	0,	/* 0c */ +	0x80,	/* 0d */ +	0x80,	/* 0e */ +	0,	/* 0f */ +	0,	/* 10 */ +	0x14,	/* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */ +	0,	/* 12 */ +	0x10,	/* 13 PM_MRK_GRP_CMPL */ +	0x1f,	/* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */ +	0x2,	/* 15 PM_MRK_GRP_ISSUED */ +	0x80,	/* 16 */ +	0x80,	/* 17 */ +	0, 0, 0, 0, 0, +	0x80,	/* 1d */ +	0x80,	/* 1e */ +	0,	/* 1f */ +	0x80,	/* 20 */ +	0x80,	/* 21 */ +	0x80,	/* 22 */ +	0x80,	/* 23 */ +	0x80,	/* 24 */ +	0x80,	/* 25 */ +	0x80,	/* 26 */ +	0x80,	/* 27 */ +}; + +/* + * Returns 1 if event counts things relating to marked instructions + * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. 
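+ * + * E.g. direct_event_is_marked[0x05] below is 0x1c, so PMCSEL 0x05 + * (PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN) is marked on + * PMC 2, 3 and 4.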
+ */ +static int power5_marked_instr_event(u64 event) +{ +	int pmc, psel; +	int bit, byte, unit; +	u32 mask; + +	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; +	psel = event & PM_PMCSEL_MSK; +	if (pmc >= 5) +		return 0; + +	bit = -1; +	if (psel < sizeof(direct_event_is_marked)) { +		if (direct_event_is_marked[psel] & (1 << pmc)) +			return 1; +		if (direct_event_is_marked[psel] & 0x80) +			bit = 4; +		else if (psel == 0x08) +			bit = pmc - 1; +		else if (psel == 0x10) +			bit = 4 - pmc; +		else if (psel == 0x1b && (pmc == 1 || pmc == 3)) +			bit = 4; +	} else if ((psel & 0x58) == 0x40) +		bit = psel & 7; + +	if (!(event & PM_BUSEVENT_MSK) || bit == -1) +		return 0; + +	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; +	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; +	if (unit == PM_LSU0) { +		/* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */ +		mask = 0x5dff00; +	} else if (unit == PM_LSU1 && byte >= 4) { +		byte -= 4; +		/* byte 4 bits 1,3,5,7, byte 5 bits 6-7, byte 7 bits 0-4,6 */ +		mask = 0x5f00c0aa; +	} else +		return 0; + +	return (mask >> (byte * 8 + bit)) & 1; +} + +static int power5_compute_mmcr(u64 event[], int n_ev, +			       unsigned int hwc[], u64 mmcr[]) +{ +	u64 mmcr1 = 0; +	u64 mmcra = 0; +	unsigned int pmc, unit, byte, psel; +	unsigned int ttm, grp; +	int i, isbus, bit, grsel; +	unsigned int pmc_inuse = 0; +	unsigned int pmc_grp_use[2]; +	unsigned char busbyte[4]; +	unsigned char unituse[16]; +	int ttmuse; + +	if (n_ev > 6) +		return -1; + +	/* First pass to count resource use */ +	pmc_grp_use[0] = pmc_grp_use[1] = 0; +	memset(busbyte, 0, sizeof(busbyte)); +	memset(unituse, 0, sizeof(unituse)); +	for (i = 0; i < n_ev; ++i) { +		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; +		if (pmc) { +			if (pmc > 6) +				return -1; +			if (pmc_inuse & (1 << (pmc - 1))) +				return -1; +			pmc_inuse |= 1 << (pmc - 1); +			/* count 1/2 vs 3/4 use */ +			if (pmc <= 4) +				++pmc_grp_use[(pmc - 1) >> 1]; +		} +		if (event[i] & PM_BUSEVENT_MSK) { +			unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; +			byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; +			if (unit > PM_LASTUNIT) +				return -1; +			if (unit == PM_ISU0_ALT) +				unit = PM_ISU0; +			if (byte >= 4) { +				if (unit != PM_LSU1) +					return -1; +				++unit; +				byte &= 3; +			} +			if (!pmc) +				++pmc_grp_use[byte & 1]; +			if (busbyte[byte] && busbyte[byte] != unit) +				return -1; +			busbyte[byte] = unit; +			unituse[unit] = 1; +		} +	} +	if (pmc_grp_use[0] > 2 || pmc_grp_use[1] > 2) +		return -1; + +	/* +	 * Assign resources and set multiplexer selects. +	 * +	 * PM_ISU0 can go either on TTM0 or TTM1, but that's the only +	 * choice we have to deal with. +	 */ +	if (unituse[PM_ISU0] & +	    (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) { +		unituse[PM_ISU0_ALT] = 1;	/* move ISU to TTM1 */ +		unituse[PM_ISU0] = 0; +	} +	/* Set TTM[01]SEL fields. */ +	ttmuse = 0; +	for (i = PM_FPU; i <= PM_ISU1; ++i) { +		if (!unituse[i]) +			continue; +		if (ttmuse++) +			return -1; +		mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH; +	} +	ttmuse = 0; +	for (; i <= PM_GRS; ++i) { +		if (!unituse[i]) +			continue; +		if (ttmuse++) +			return -1; +		mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH; +	} +	if (ttmuse > 1) +		return -1; + +	/* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. 
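+	 * The loop below routes each used event-bus byte to its source +	 * unit: ttm = unit >> 2 picks the TTM, and a byte sourced from +	 * the low word of LSU1 (mapped to PM_LSU1 + 1 above) also sets +	 * the corresponding TTM3SEL bit.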
*/ +	for (byte = 0; byte < 4; ++byte) { +		unit = busbyte[byte]; +		if (!unit) +			continue; +		if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) { +			/* get ISU0 through TTM1 rather than TTM0 */ +			unit = PM_ISU0_ALT; +		} else if (unit == PM_LSU1 + 1) { +			/* select lower word of LSU1 for this byte */ +			mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte); +		} +		ttm = unit >> 2; +		mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); +	} + +	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ +	for (i = 0; i < n_ev; ++i) { +		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; +		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; +		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; +		psel = event[i] & PM_PMCSEL_MSK; +		isbus = event[i] & PM_BUSEVENT_MSK; +		if (!pmc) { +			/* Bus event or any-PMC direct event */ +			for (pmc = 0; pmc < 4; ++pmc) { +				if (pmc_inuse & (1 << pmc)) +					continue; +				grp = (pmc >> 1) & 1; +				if (isbus) { +					if (grp == (byte & 1)) +						break; +				} else if (pmc_grp_use[grp] < 2) { +					++pmc_grp_use[grp]; +					break; +				} +			} +			pmc_inuse |= 1 << pmc; +		} else if (pmc <= 4) { +			/* Direct event */ +			--pmc; +			if ((psel == 8 || psel == 0x10) && isbus && (byte & 2)) +				/* add events on higher-numbered bus */ +				mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc); +		} else { +			/* Instructions or run cycles on PMC5/6 */ +			--pmc; +		} +		if (isbus && unit == PM_GRS) { +			bit = psel & 7; +			grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK; +			mmcr1 |= (u64)grsel << grsel_shift[bit]; +		} +		if (power5_marked_instr_event(event[i])) +			mmcra |= MMCRA_SAMPLE_ENABLE; +		if (pmc <= 3) +			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc); +		hwc[i] = pmc; +	} + +	/* Return MMCRx values */ +	mmcr[0] = 0; +	if (pmc_inuse & 1) +		mmcr[0] = MMCR0_PMC1CE; +	if (pmc_inuse & 0x3e) +		mmcr[0] |= MMCR0_PMCjCE; +	mmcr[1] = mmcr1; +	mmcr[2] = mmcra; +	return 0; +} + +static void power5_disable_pmc(unsigned int pmc, u64 mmcr[]) +{ +	if (pmc <= 3) +		mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc)); +} + +static int power5_generic_events[] = { +	[PERF_COUNT_HW_CPU_CYCLES]		= 0xf, +	[PERF_COUNT_HW_INSTRUCTIONS]		= 0x100009, +	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4c1090, /* LD_REF_L1 */ +	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3c1088, /* LD_MISS_L1 */ +	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x230e4,  /* BR_ISSUED */ +	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x230e5,  /* BR_MPRED_CR */ +}; + +#define C(x)	PERF_COUNT_HW_CACHE_##x + +/* + * Table of generalized cache-related events. + * 0 means not supported, -1 means nonsensical, other values + * are event codes. 
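+ * + * E.g. the L1D OP_READ row below pairs 0x4c1090 (LD_REF_L1) with + * 0x3c1088 (LD_MISS_L1), matching the generic cache reference and + * miss events above.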
+ */ +static int power5_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { +	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0x4c1090,	0x3c1088	}, +		[C(OP_WRITE)] = {	0x3c1090,	0xc10c3		}, +		[C(OP_PREFETCH)] = {	0xc70e7,	0		}, +	}, +	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0		}, +		[C(OP_WRITE)] = {	-1,		-1		}, +		[C(OP_PREFETCH)] = {	0,		0		}, +	}, +	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0x3c309b	}, +		[C(OP_WRITE)] = {	0,		0		}, +		[C(OP_PREFETCH)] = {	0xc50c3,	0		}, +	}, +	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0x2c4090,	0x800c4		}, +		[C(OP_WRITE)] = {	-1,		-1		}, +		[C(OP_PREFETCH)] = {	-1,		-1		}, +	}, +	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0x800c0		}, +		[C(OP_WRITE)] = {	-1,		-1		}, +		[C(OP_PREFETCH)] = {	-1,		-1		}, +	}, +	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0x230e4,	0x230e5		}, +		[C(OP_WRITE)] = {	-1,		-1		}, +		[C(OP_PREFETCH)] = {	-1,		-1		}, +	}, +}; + +struct power_pmu power5_pmu = { +	.n_counter = 6, +	.max_alternatives = MAX_ALT, +	.add_fields = 0x7000090000555ull, +	.test_adder = 0x3000490000000ull, +	.compute_mmcr = power5_compute_mmcr, +	.get_constraint = power5_get_constraint, +	.get_alternatives = power5_get_alternatives, +	.disable_pmc = power5_disable_pmc, +	.n_generic = ARRAY_SIZE(power5_generic_events), +	.generic_events = power5_generic_events, +	.cache_events = &power5_cache_events, +}; diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c new file mode 100644 index 00000000000..46f74bebcfd --- /dev/null +++ b/arch/powerpc/kernel/power6-pmu.c @@ -0,0 +1,532 @@ +/* + * Performance counter support for POWER6 processors. + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <linux/kernel.h> +#include <linux/perf_counter.h> +#include <asm/reg.h> + +/* + * Bits in event code for POWER6 + */ +#define PM_PMC_SH	20	/* PMC number (1-based) for direct events */ +#define PM_PMC_MSK	0x7 +#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH) +#define PM_UNIT_SH	16	/* Unit event comes (TTMxSEL encoding) */ +#define PM_UNIT_MSK	0xf +#define PM_UNIT_MSKS	(PM_UNIT_MSK << PM_UNIT_SH) +#define PM_LLAV		0x8000	/* Load lookahead match value */ +#define PM_LLA		0x4000	/* Load lookahead match enable */ +#define PM_BYTE_SH	12	/* Byte of event bus to use */ +#define PM_BYTE_MSK	3 +#define PM_SUBUNIT_SH	8	/* Subunit event comes from (NEST_SEL enc.) 
*/ +#define PM_SUBUNIT_MSK	7 +#define PM_SUBUNIT_MSKS	(PM_SUBUNIT_MSK << PM_SUBUNIT_SH) +#define PM_PMCSEL_MSK	0xff	/* PMCxSEL value */ +#define PM_BUSEVENT_MSK	0xf3700 + +/* + * Bits in MMCR1 for POWER6 + */ +#define MMCR1_TTM0SEL_SH	60 +#define MMCR1_TTMSEL_SH(n)	(MMCR1_TTM0SEL_SH - (n) * 4) +#define MMCR1_TTMSEL_MSK	0xf +#define MMCR1_TTMSEL(m, n)	(((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK) +#define MMCR1_NESTSEL_SH	45 +#define MMCR1_NESTSEL_MSK	0x7 +#define MMCR1_NESTSEL(m)	(((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK) +#define MMCR1_PMC1_LLA		((u64)1 << 44) +#define MMCR1_PMC1_LLA_VALUE	((u64)1 << 39) +#define MMCR1_PMC1_ADDR_SEL	((u64)1 << 35) +#define MMCR1_PMC1SEL_SH	24 +#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8) +#define MMCR1_PMCSEL_MSK	0xff + +/* + * Map of which direct events on which PMCs are marked instruction events. + * Indexed by PMCSEL value >> 1. + * Bottom 4 bits are a map of which PMCs are interesting, + * top 4 bits say what sort of event: + *   0 = direct marked event, + *   1 = byte decode event, + *   4 = add/and event (PMC1 -> bits 0 & 4), + *   5 = add/and event (PMC1 -> bits 1 & 5), + *   6 = add/and event (PMC1 -> bits 2 & 6), + *   7 = add/and event (PMC1 -> bits 3 & 7). + */ +static unsigned char direct_event_is_marked[0x60 >> 1] = { +	0,	/* 00 */ +	0,	/* 02 */ +	0,	/* 04 */ +	0x07,	/* 06 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */ +	0x04,	/* 08 PM_MRK_DFU_FIN */ +	0x06,	/* 0a PM_MRK_IFU_FIN, PM_MRK_INST_FIN */ +	0,	/* 0c */ +	0,	/* 0e */ +	0x02,	/* 10 PM_MRK_INST_DISP */ +	0x08,	/* 12 PM_MRK_LSU_DERAT_MISS */ +	0,	/* 14 */ +	0,	/* 16 */ +	0x0c,	/* 18 PM_THRESH_TIMEO, PM_MRK_INST_FIN */ +	0x0f,	/* 1a PM_MRK_INST_DISP, PM_MRK_{FXU,FPU,LSU}_FIN */ +	0x01,	/* 1c PM_MRK_INST_ISSUED */ +	0,	/* 1e */ +	0,	/* 20 */ +	0,	/* 22 */ +	0,	/* 24 */ +	0,	/* 26 */ +	0x15,	/* 28 PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L3MISS */ +	0,	/* 2a */ +	0,	/* 2c */ +	0,	/* 2e */ +	0x4f,	/* 30 */ +	0x7f,	/* 32 */ +	0x4f,	/* 34 */ +	0x5f,	/* 36 */ +	0x6f,	/* 38 */ +	0x4f,	/* 3a */ +	0,	/* 3c */ +	0x08,	/* 3e PM_MRK_INST_TIMEO */ +	0x1f,	/* 40 */ +	0x1f,	/* 42 */ +	0x1f,	/* 44 */ +	0x1f,	/* 46 */ +	0x1f,	/* 48 */ +	0x1f,	/* 4a */ +	0x1f,	/* 4c */ +	0x1f,	/* 4e */ +	0,	/* 50 */ +	0x05,	/* 52 PM_MRK_BR_TAKEN, PM_MRK_BR_MPRED */ +	0x1c,	/* 54 PM_MRK_PTEG_FROM_L3MISS, PM_MRK_PTEG_FROM_L2MISS */ +	0x02,	/* 56 PM_MRK_LD_MISS_L1 */ +	0,	/* 58 */ +	0,	/* 5a */ +	0,	/* 5c */ +	0,	/* 5e */ +}; + +/* + * Masks showing for each unit which bits are marked events. + * These masks are in LE order, i.e. 0x00000001 is byte 0, bit 0. + */ +static u32 marked_bus_events[16] = { +	0x01000000,	/* direct events set 1: byte 3 bit 0 */ +	0x00010000,	/* direct events set 2: byte 2 bit 0 */ +	0, 0, 0, 0,	/* IDU, IFU, nest: nothing */ +	0x00000088,	/* VMX set 1: byte 0 bits 3, 7 */ +	0x000000c0,	/* VMX set 2: byte 0 bits 4-7 */ +	0x04010000,	/* LSU set 1: byte 2 bit 0, byte 3 bit 2 */ +	0xff010000u,	/* LSU set 2: byte 2 bit 0, all of byte 3 */ +	0,		/* LSU set 3 */ +	0x00000010,	/* VMX set 3: byte 0 bit 4 */ +	0,		/* BFP set 1 */ +	0x00000022,	/* BFP set 2: byte 0 bits 1, 5 */ +	0, 0 +}; +	 +/* + * Returns 1 if event counts things relating to marked instructions + * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. 
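+ * + * Example (illustrative): PMCSEL 0x08 (PM_MRK_DFU_FIN) indexes + * direct_event_is_marked[0x08 >> 1] = 0x04, meaning only PMC 3 is + * interesting and the top nibble of 0 marks it as a direct event.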
+ */ +static int power6_marked_instr_event(u64 event) +{ +	int pmc, psel, ptype; +	int bit, byte, unit; +	u32 mask; + +	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; +	psel = (event & PM_PMCSEL_MSK) >> 1;	/* drop edge/level bit */ +	if (pmc >= 5) +		return 0; + +	bit = -1; +	if (psel < sizeof(direct_event_is_marked)) { +		ptype = direct_event_is_marked[psel]; +		if (pmc == 0 || !(ptype & (1 << (pmc - 1)))) +			return 0; +		ptype >>= 4; +		if (ptype == 0) +			return 1; +		if (ptype == 1) +			bit = 0; +		else +			bit = ptype ^ (pmc - 1); +	} else if ((psel & 0x48) == 0x40) +		bit = psel & 7; + +	if (!(event & PM_BUSEVENT_MSK) || bit == -1) +		return 0; + +	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; +	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; +	mask = marked_bus_events[unit]; +	return (mask >> (byte * 8 + bit)) & 1; +} + +/* + * Assign PMC numbers and compute MMCR1 value for a set of events + */ +static int p6_compute_mmcr(u64 event[], int n_ev, +			   unsigned int hwc[], u64 mmcr[]) +{ +	u64 mmcr1 = 0; +	u64 mmcra = 0; +	int i; +	unsigned int pmc, ev, b, u, s, psel; +	unsigned int ttmset = 0; +	unsigned int pmc_inuse = 0; + +	if (n_ev > 6) +		return -1; +	for (i = 0; i < n_ev; ++i) { +		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; +		if (pmc) { +			if (pmc_inuse & (1 << (pmc - 1))) +				return -1;	/* collision! */ +			pmc_inuse |= 1 << (pmc - 1); +		} +	} +	for (i = 0; i < n_ev; ++i) { +		ev = event[i]; +		pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK; +		if (pmc) { +			--pmc; +		} else { +			/* can go on any PMC; find a free one */ +			for (pmc = 0; pmc < 4; ++pmc) +				if (!(pmc_inuse & (1 << pmc))) +					break; +			if (pmc >= 4) +				return -1; +			pmc_inuse |= 1 << pmc; +		} +		hwc[i] = pmc; +		psel = ev & PM_PMCSEL_MSK; +		if (ev & PM_BUSEVENT_MSK) { +			/* this event uses the event bus */ +			b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK; +			u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK; +			/* check for conflict on this byte of event bus */ +			if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u) +				return -1; +			mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b); +			ttmset |= 1 << b; +			if (u == 5) { +				/* Nest events have a further mux */ +				s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK; +				if ((ttmset & 0x10) && +				    MMCR1_NESTSEL(mmcr1) != s) +					return -1; +				ttmset |= 0x10; +				mmcr1 |= (u64)s << MMCR1_NESTSEL_SH; +			} +			if (0x30 <= psel && psel <= 0x3d) { +				/* these need the PMCx_ADDR_SEL bits */ +				if (b >= 2) +					mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc; +			} +			/* bus select values are different for PMC3/4 */ +			if (pmc >= 2 && (psel & 0x90) == 0x80) +				psel ^= 0x20; +		} +		if (ev & PM_LLA) { +			mmcr1 |= MMCR1_PMC1_LLA >> pmc; +			if (ev & PM_LLAV) +				mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc; +		} +		if (power6_marked_instr_event(event[i])) +			mmcra |= MMCRA_SAMPLE_ENABLE; +		if (pmc < 4) +			mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc); +	} +	mmcr[0] = 0; +	if (pmc_inuse & 1) +		mmcr[0] = MMCR0_PMC1CE; +	if (pmc_inuse & 0xe) +		mmcr[0] |= MMCR0_PMCjCE; +	mmcr[1] = mmcr1; +	mmcr[2] = mmcra; +	return 0; +} + +/* + * Layout of constraint bits: + * + *	0-1	add field: number of uses of PMC1 (max 1) + *	2-3, 4-5, 6-7, 8-9, 10-11: ditto for PMC2, 3, 4, 5, 6 + *	12-15	add field: number of uses of PMC1-4 (max 4) + *	16-19	select field: unit on byte 0 of event bus + *	20-23, 24-27, 28-31 ditto for bytes 1, 2, 3 + *	32-34	select field: nest (subunit) event selector + */ +static int p6_get_constraint(u64 event, u64 *maskp, u64 *valp) +{ +	int pmc, byte, sh, subunit; +	u64 mask = 0, value = 0; + +	
pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; +	if (pmc) { +		if (pmc > 4 && !(event == 0x500009 || event == 0x600005)) +			return -1; +		sh = (pmc - 1) * 2; +		mask |= 2 << sh; +		value |= 1 << sh; +	} +	if (event & PM_BUSEVENT_MSK) { +		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; +		sh = byte * 4 + (16 - PM_UNIT_SH); +		mask |= PM_UNIT_MSKS << sh; +		value |= (u64)(event & PM_UNIT_MSKS) << sh; +		if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) { +			subunit = (event >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK; +			mask  |= (u64)PM_SUBUNIT_MSK << 32; +			value |= (u64)subunit << 32; +		} +	} +	if (pmc <= 4) { +		mask  |= 0x8000;	/* add field for count of PMC1-4 uses */ +		value |= 0x1000; +	} +	*maskp = mask; +	*valp = value; +	return 0; +} + +static int p6_limited_pmc_event(u64 event) +{ +	int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + +	return pmc == 5 || pmc == 6; +} + +#define MAX_ALT	4	/* at most 4 alternatives for any event */ + +static const unsigned int event_alternatives[][MAX_ALT] = { +	{ 0x0130e8, 0x2000f6, 0x3000fc },	/* PM_PTEG_RELOAD_VALID */ +	{ 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */ +	{ 0x080088, 0x200054, 0x3000f0 },	/* PM_ST_MISS_L1 */ +	{ 0x10000a, 0x2000f4, 0x600005 },	/* PM_RUN_CYC */ +	{ 0x10000b, 0x2000f5 },			/* PM_RUN_COUNT */ +	{ 0x10000e, 0x400010 },			/* PM_PURR */ +	{ 0x100010, 0x4000f8 },			/* PM_FLUSH */ +	{ 0x10001a, 0x200010 },			/* PM_MRK_INST_DISP */ +	{ 0x100026, 0x3000f8 },			/* PM_TB_BIT_TRANS */ +	{ 0x100054, 0x2000f0 },			/* PM_ST_FIN */ +	{ 0x100056, 0x2000fc },			/* PM_L1_ICACHE_MISS */ +	{ 0x1000f0, 0x40000a },			/* PM_INST_IMC_MATCH_CMPL */ +	{ 0x1000f8, 0x200008 },			/* PM_GCT_EMPTY_CYC */ +	{ 0x1000fc, 0x400006 },			/* PM_LSU_DERAT_MISS_CYC */ +	{ 0x20000e, 0x400007 },			/* PM_LSU_DERAT_MISS */ +	{ 0x200012, 0x300012 },			/* PM_INST_DISP */ +	{ 0x2000f2, 0x3000f2 },			/* PM_INST_DISP */ +	{ 0x2000f8, 0x300010 },			/* PM_EXT_INT */ +	{ 0x2000fe, 0x300056 },			/* PM_DATA_FROM_L2MISS */ +	{ 0x2d0030, 0x30001a },			/* PM_MRK_FPU_FIN */ +	{ 0x30000a, 0x400018 },			/* PM_MRK_INST_FIN */ +	{ 0x3000f6, 0x40000e },			/* PM_L1_DCACHE_RELOAD_VALID */ +	{ 0x3000fe, 0x400056 },			/* PM_DATA_FROM_L3MISS */ +}; + +/* + * This could be made more efficient with a binary search on + * a presorted list, if necessary + */ +static int find_alternatives_list(u64 event) +{ +	int i, j; +	unsigned int alt; + +	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { +		if (event < event_alternatives[i][0]) +			return -1; +		for (j = 0; j < MAX_ALT; ++j) { +			alt = event_alternatives[i][j]; +			if (!alt || event < alt) +				break; +			if (event == alt) +				return i; +		} +	} +	return -1; +} + +static int p6_get_alternatives(u64 event, unsigned int flags, u64 alt[]) +{ +	int i, j, nlim; +	unsigned int psel, pmc; +	unsigned int nalt = 1; +	u64 aevent; + +	alt[0] = event; +	nlim = p6_limited_pmc_event(event); + +	/* check the alternatives table */ +	i = find_alternatives_list(event); +	if (i >= 0) { +		/* copy out alternatives from list */ +		for (j = 0; j < MAX_ALT; ++j) { +			aevent = event_alternatives[i][j]; +			if (!aevent) +				break; +			if (aevent != event) +				alt[nalt++] = aevent; +			nlim += p6_limited_pmc_event(aevent); +		} + +	} else { +		/* Check for alternative ways of computing sum events */ +		/* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */ +		psel = event & (PM_PMCSEL_MSK & ~1);	/* ignore edge bit */ +		pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; +		if (pmc && (psel == 0x32 || psel == 0x34)) +			alt[nalt++] = ((event ^ 0x6) & 
~PM_PMC_MSKS) | +				((5 - pmc) << PM_PMC_SH); + +		/* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */ +		if (pmc && (psel == 0x38 || psel == 0x3a)) +			alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) | +				((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH); +	} + +	if (flags & PPMU_ONLY_COUNT_RUN) { +		/* +		 * We're only counting in RUN state, +		 * so PM_CYC is equivalent to PM_RUN_CYC, +		 * PM_INST_CMPL === PM_RUN_INST_CMPL, PM_PURR === PM_RUN_PURR. +		 * This doesn't include alternatives that don't provide +		 * any extra flexibility in assigning PMCs (e.g. +		 * 0x10000a for PM_RUN_CYC vs. 0x1e for PM_CYC). +		 * Note that even with these additional alternatives +		 * we never end up with more than 4 alternatives for any event. +		 */ +		j = nalt; +		for (i = 0; i < nalt; ++i) { +			switch (alt[i]) { +			case 0x1e:	/* PM_CYC */ +				alt[j++] = 0x600005;	/* PM_RUN_CYC */ +				++nlim; +				break; +			case 0x10000a:	/* PM_RUN_CYC */ +				alt[j++] = 0x1e;	/* PM_CYC */ +				break; +			case 2:		/* PM_INST_CMPL */ +				alt[j++] = 0x500009;	/* PM_RUN_INST_CMPL */ +				++nlim; +				break; +			case 0x500009:	/* PM_RUN_INST_CMPL */ +				alt[j++] = 2;		/* PM_INST_CMPL */ +				break; +			case 0x10000e:	/* PM_PURR */ +				alt[j++] = 0x4000f4;	/* PM_RUN_PURR */ +				break; +			case 0x4000f4:	/* PM_RUN_PURR */ +				alt[j++] = 0x10000e;	/* PM_PURR */ +				break; +			} +		} +		nalt = j; +	} + +	if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) { +		/* remove the limited PMC events */ +		j = 0; +		for (i = 0; i < nalt; ++i) { +			if (!p6_limited_pmc_event(alt[i])) { +				alt[j] = alt[i]; +				++j; +			} +		} +		nalt = j; +	} else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) { +		/* remove all but the limited PMC events */ +		j = 0; +		for (i = 0; i < nalt; ++i) { +			if (p6_limited_pmc_event(alt[i])) { +				alt[j] = alt[i]; +				++j; +			} +		} +		nalt = j; +	} + +	return nalt; +} + +static void p6_disable_pmc(unsigned int pmc, u64 mmcr[]) +{ +	/* Set PMCxSEL to 0 to disable PMCx */ +	if (pmc <= 3) +		mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc)); +} + +static int power6_generic_events[] = { +	[PERF_COUNT_HW_CPU_CYCLES]		= 0x1e, +	[PERF_COUNT_HW_INSTRUCTIONS]		= 2, +	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x280030, /* LD_REF_L1 */ +	[PERF_COUNT_HW_CACHE_MISSES]		= 0x30000c, /* LD_MISS_L1 */ +	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x410a0,  /* BR_PRED */ +	[PERF_COUNT_HW_BRANCH_MISSES]		= 0x400052, /* BR_MPRED */ +}; + +#define C(x)	PERF_COUNT_HW_CACHE_##x + +/* + * Table of generalized cache-related events. + * 0 means not supported, -1 means nonsensical, other values + * are event codes. + * The "DTLB" and "ITLB" events relate to the DERAT and IERAT. 
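+ * + * E.g. the DTLB OP_READ miss entry below, 0x20000e + * (PM_LSU_DERAT_MISS), counts DERAT misses rather than hardware TLB + * misses, hence the note above.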
+ */ +static int power6_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { +	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0x80082,	0x80080		}, +		[C(OP_WRITE)] = {	0x80086,	0x80088		}, +		[C(OP_PREFETCH)] = {	0x810a4,	0		}, +	}, +	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0x100056 	}, +		[C(OP_WRITE)] = {	-1,		-1		}, +		[C(OP_PREFETCH)] = {	0x4008c,	0		}, +	}, +	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0x150730,	0x250532	}, +		[C(OP_WRITE)] = {	0x250432,	0x150432	}, +		[C(OP_PREFETCH)] = {	0x810a6,	0		}, +	}, +	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0x20000e	}, +		[C(OP_WRITE)] = {	-1,		-1		}, +		[C(OP_PREFETCH)] = {	-1,		-1		}, +	}, +	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0x420ce		}, +		[C(OP_WRITE)] = {	-1,		-1		}, +		[C(OP_PREFETCH)] = {	-1,		-1		}, +	}, +	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0x430e6,	0x400052	}, +		[C(OP_WRITE)] = {	-1,		-1		}, +		[C(OP_PREFETCH)] = {	-1,		-1		}, +	}, +}; + +struct power_pmu power6_pmu = { +	.n_counter = 6, +	.max_alternatives = MAX_ALT, +	.add_fields = 0x1555, +	.test_adder = 0x3000, +	.compute_mmcr = p6_compute_mmcr, +	.get_constraint = p6_get_constraint, +	.get_alternatives = p6_get_alternatives, +	.disable_pmc = p6_disable_pmc, +	.limited_pmc_event = p6_limited_pmc_event, +	.flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR, +	.n_generic = ARRAY_SIZE(power6_generic_events), +	.generic_events = power6_generic_events, +	.cache_events = &power6_cache_events, +}; diff --git a/arch/powerpc/kernel/power7-pmu.c b/arch/powerpc/kernel/power7-pmu.c new file mode 100644 index 00000000000..b3f7d1216ba --- /dev/null +++ b/arch/powerpc/kernel/power7-pmu.c @@ -0,0 +1,357 @@ +/* + * Performance counter support for POWER7 processors. + * + * Copyright 2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include <linux/kernel.h> +#include <linux/perf_counter.h> +#include <asm/reg.h> + +/* + * Bits in event code for POWER7 + */ +#define PM_PMC_SH	16	/* PMC number (1-based) for direct events */ +#define PM_PMC_MSK	0xf +#define PM_PMC_MSKS	(PM_PMC_MSK << PM_PMC_SH) +#define PM_UNIT_SH	12	/* TTMMUX number and setting - unit select */ +#define PM_UNIT_MSK	0xf +#define PM_COMBINE_SH	11	/* Combined event bit */ +#define PM_COMBINE_MSK	1 +#define PM_COMBINE_MSKS	0x800 +#define PM_L2SEL_SH	8	/* L2 event select */ +#define PM_L2SEL_MSK	7 +#define PM_PMCSEL_MSK	0xff + +/* + * Bits in MMCR1 for POWER7 + */ +#define MMCR1_TTM0SEL_SH	60 +#define MMCR1_TTM1SEL_SH	56 +#define MMCR1_TTM2SEL_SH	52 +#define MMCR1_TTM3SEL_SH	48 +#define MMCR1_TTMSEL_MSK	0xf +#define MMCR1_L2SEL_SH		45 +#define MMCR1_L2SEL_MSK		7 +#define MMCR1_PMC1_COMBINE_SH	35 +#define MMCR1_PMC2_COMBINE_SH	34 +#define MMCR1_PMC3_COMBINE_SH	33 +#define MMCR1_PMC4_COMBINE_SH	32 +#define MMCR1_PMC1SEL_SH	24 +#define MMCR1_PMC2SEL_SH	16 +#define MMCR1_PMC3SEL_SH	8 +#define MMCR1_PMC4SEL_SH	0 +#define MMCR1_PMCSEL_SH(n)	(MMCR1_PMC1SEL_SH - (n) * 8) +#define MMCR1_PMCSEL_MSK	0xff + +/* + * Bits in MMCRA + */ + +/* + * Layout of constraint bits: + * 6666555555555544444444443333333333222222222211111111110000000000 + * 3210987654321098765432109876543210987654321098765432109876543210 + *                                                 [  ><><><><><><> + *                                                  NC P6P5P4P3P2P1 + * + * NC - number of counters + *     15: NC error 0x8000 + *     12-14: number of events needing PMC1-4 0x7000 + * + * P6 + *     11: P6 error 0x800 + *     10-11: Count of events needing PMC6 + * + * P1..P5 + *     0-9: Count of events needing PMC1..PMC5 + */ + +static int power7_get_constraint(u64 event, u64 *maskp, u64 *valp) +{ +	int pmc, sh; +	u64 mask = 0, value = 0; + +	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; +	if (pmc) { +		if (pmc > 6) +			return -1; +		sh = (pmc - 1) * 2; +		mask |= 2 << sh; +		value |= 1 << sh; +		if (pmc >= 5 && !(event == 0x500fa || event == 0x600f4)) +			return -1; +	} +	if (pmc < 5) { +		/* need a counter from PMC1-4 set */ +		mask  |= 0x8000; +		value |= 0x1000; +	} +	*maskp = mask; +	*valp = value; +	return 0; +} + +#define MAX_ALT	2	/* at most 2 alternatives for any event */ + +static const unsigned int event_alternatives[][MAX_ALT] = { +	{ 0x200f2, 0x300f2 },		/* PM_INST_DISP */ +	{ 0x200f4, 0x600f4 },		/* PM_RUN_CYC */ +	{ 0x400fa, 0x500fa },		/* PM_RUN_INST_CMPL */ +}; + +/* + * Scan the alternatives table for a match and return the + * index into the alternatives table if found, else -1. 
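+ * The table is assumed to be sorted on the first field, which is
+ * what lets the loop below stop as soon as the event sorts before
+ * the current row.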
+ */ +static int find_alternative(u64 event) +{ +	int i, j; + +	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { +		if (event < event_alternatives[i][0]) +			break; +		for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) +			if (event == event_alternatives[i][j]) +				return i; +	} +	return -1; +} + +static s64 find_alternative_decode(u64 event) +{ +	int pmc, psel; + +	/* this only handles the 4x decode events */ +	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; +	psel = event & PM_PMCSEL_MSK; +	if ((pmc == 2 || pmc == 4) && (psel & ~7) == 0x40) +		return event - (1 << PM_PMC_SH) + 8; +	if ((pmc == 1 || pmc == 3) && (psel & ~7) == 0x48) +		return event + (1 << PM_PMC_SH) - 8; +	return -1; +} + +static int power7_get_alternatives(u64 event, unsigned int flags, u64 alt[]) +{ +	int i, j, nalt = 1; +	s64 ae; + +	alt[0] = event; +	nalt = 1; +	i = find_alternative(event); +	if (i >= 0) { +		for (j = 0; j < MAX_ALT; ++j) { +			ae = event_alternatives[i][j]; +			if (ae && ae != event) +				alt[nalt++] = ae; +		} +	} else { +		ae = find_alternative_decode(event); +		if (ae > 0) +			alt[nalt++] = ae; +	} + +	if (flags & PPMU_ONLY_COUNT_RUN) { +		/* +		 * We're only counting in RUN state, +		 * so PM_CYC is equivalent to PM_RUN_CYC +		 * and PM_INST_CMPL === PM_RUN_INST_CMPL. +		 * This doesn't include alternatives that don't provide +		 * any extra flexibility in assigning PMCs. +		 */ +		j = nalt; +		for (i = 0; i < nalt; ++i) { +			switch (alt[i]) { +			case 0x1e:	/* PM_CYC */ +				alt[j++] = 0x600f4;	/* PM_RUN_CYC */ +				break; +			case 0x600f4:	/* PM_RUN_CYC */ +				alt[j++] = 0x1e; +				break; +			case 0x2:	/* PM_PPC_CMPL */ +				alt[j++] = 0x500fa;	/* PM_RUN_INST_CMPL */ +				break; +			case 0x500fa:	/* PM_RUN_INST_CMPL */ +				alt[j++] = 0x2;	/* PM_PPC_CMPL */ +				break; +			} +		} +		nalt = j; +	} + +	return nalt; +} + +/* + * Returns 1 if event counts things relating to marked instructions + * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. 
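+ * Only PMC1-4 can count marked-instruction events on POWER7,
+ * hence the early return below for pmc >= 5.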
+ */
+static int power7_marked_instr_event(u64 event)
+{
+	int pmc, psel;
+	int unit;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	psel = event & PM_PMCSEL_MSK & ~1;	/* trim off edge/level bit */
+	if (pmc >= 5)
+		return 0;
+
+	switch (psel >> 4) {
+	case 2:
+		return pmc == 2 || pmc == 4;
+	case 3:
+		if (psel == 0x3c)
+			return pmc == 1;
+		if (psel == 0x3e)
+			return pmc != 2;
+		return 1;
+	case 4:
+	case 5:
+		return unit == 0xd;
+	case 6:
+		if (psel == 0x64)
+			return pmc >= 3;
+	case 8:
+		return unit == 0xd;
+	}
+	return 0;
+}
+
+static int power7_compute_mmcr(u64 event[], int n_ev,
+			       unsigned int hwc[], u64 mmcr[])
+{
+	u64 mmcr1 = 0;
+	u64 mmcra = 0;
+	unsigned int pmc, unit, combine, l2sel, psel;
+	unsigned int pmc_inuse = 0;
+	int i;
+
+	/* First pass to count resource use */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		if (pmc) {
+			if (pmc > 6)
+				return -1;
+			if (pmc_inuse & (1 << (pmc - 1)))
+				return -1;
+			pmc_inuse |= 1 << (pmc - 1);
+		}
+	}
+
+	/* Second pass: assign PMCs, set all MMCR1 fields */
+	for (i = 0; i < n_ev; ++i) {
+		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK;
+		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK;
+		combine = (event[i] >> PM_COMBINE_SH) & PM_COMBINE_MSK;
+		l2sel = (event[i] >> PM_L2SEL_SH) & PM_L2SEL_MSK;
+		psel = event[i] & PM_PMCSEL_MSK;
+		if (!pmc) {
+			/* Bus event or any-PMC direct event */
+			for (pmc = 0; pmc < 4; ++pmc) {
+				if (!(pmc_inuse & (1 << pmc)))
+					break;
+			}
+			if (pmc >= 4)
+				return -1;
+			pmc_inuse |= 1 << pmc;
+		} else {
+			/* Direct or decoded event */
+			--pmc;
+		}
+		if (pmc <= 3) {
+			mmcr1 |= (u64) unit << (MMCR1_TTM0SEL_SH - 4 * pmc);
+			mmcr1 |= (u64) combine << (MMCR1_PMC1_COMBINE_SH - pmc);
+			mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc);
+			if (unit == 6)	/* L2 events */
+				mmcr1 |= (u64) l2sel << MMCR1_L2SEL_SH;
+		}
+		if (power7_marked_instr_event(event[i]))
+			mmcra |= MMCRA_SAMPLE_ENABLE;
+		hwc[i] = pmc;
+	}
+
+	/* Return MMCRx values */
+	mmcr[0] = 0;
+	if (pmc_inuse & 1)
+		mmcr[0] = MMCR0_PMC1CE;
+	if (pmc_inuse & 0x3e)
+		mmcr[0] |= MMCR0_PMCjCE;
+	mmcr[1] = mmcr1;
+	mmcr[2] = mmcra;
+	return 0;
+}
+
+static void power7_disable_pmc(unsigned int pmc, u64 mmcr[])
+{
+	if (pmc <= 3)
+		mmcr[1] &= ~(0xffULL << MMCR1_PMCSEL_SH(pmc));
+}
+
+static int power7_generic_events[] = {
+	[PERF_COUNT_HW_CPU_CYCLES] = 0x1e,
+	[PERF_COUNT_HW_INSTRUCTIONS] = 2,
+	[PERF_COUNT_HW_CACHE_REFERENCES] = 0xc880,	/* LD_REF_L1_LSU */
+	[PERF_COUNT_HW_CACHE_MISSES] = 0x400f0,		/* LD_MISS_L1 */
+	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x10068,	/* BRU_FIN */
+	[PERF_COUNT_HW_BRANCH_MISSES] = 0x400f6,	/* BR_MPRED */
+};
+
+#define C(x)	PERF_COUNT_HW_CACHE_##x
+
+/*
+ * Table of generalized cache-related events.
+ * 0 means not supported, -1 means nonsensical, other values
+ * are event codes.
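+ * The values are raw POWER7 event codes in the PM_PMC/PM_UNIT/
+ * PM_PMCSEL encoding defined at the top of this file.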
+ */ +static int power7_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { +	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0x400f0,	0xc880	}, +		[C(OP_WRITE)] = {	0,		0x300f0	}, +		[C(OP_PREFETCH)] = {	0xd8b8,		0	}, +	}, +	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0x200fc	}, +		[C(OP_WRITE)] = {	-1,		-1	}, +		[C(OP_PREFETCH)] = {	0x408a,		0	}, +	}, +	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0x6080,		0x6084	}, +		[C(OP_WRITE)] = {	0x6082,		0x6086	}, +		[C(OP_PREFETCH)] = {	0,		0	}, +	}, +	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0x300fc	}, +		[C(OP_WRITE)] = {	-1,		-1	}, +		[C(OP_PREFETCH)] = {	-1,		-1	}, +	}, +	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0x400fc	}, +		[C(OP_WRITE)] = {	-1,		-1	}, +		[C(OP_PREFETCH)] = {	-1,		-1	}, +	}, +	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0x10068,	0x400f6	}, +		[C(OP_WRITE)] = {	-1,		-1	}, +		[C(OP_PREFETCH)] = {	-1,		-1	}, +	}, +}; + +struct power_pmu power7_pmu = { +	.n_counter = 6, +	.max_alternatives = MAX_ALT + 1, +	.add_fields = 0x1555ull, +	.test_adder = 0x3000ull, +	.compute_mmcr = power7_compute_mmcr, +	.get_constraint = power7_get_constraint, +	.get_alternatives = power7_get_alternatives, +	.disable_pmc = power7_disable_pmc, +	.n_generic = ARRAY_SIZE(power7_generic_events), +	.generic_events = power7_generic_events, +	.cache_events = &power7_cache_events, +}; diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c new file mode 100644 index 00000000000..ba0a357a89f --- /dev/null +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -0,0 +1,482 @@ +/* + * Performance counter support for PPC970-family processors. + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include <linux/string.h> +#include <linux/perf_counter.h> +#include <asm/reg.h> + +/* + * Bits in event code for PPC970 + */ +#define PM_PMC_SH	12	/* PMC number (1-based) for direct events */ +#define PM_PMC_MSK	0xf +#define PM_UNIT_SH	8	/* TTMMUX number and setting - unit select */ +#define PM_UNIT_MSK	0xf +#define PM_SPCSEL_SH	6 +#define PM_SPCSEL_MSK	3 +#define PM_BYTE_SH	4	/* Byte number of event bus to use */ +#define PM_BYTE_MSK	3 +#define PM_PMCSEL_MSK	0xf + +/* Values in PM_UNIT field */ +#define PM_NONE		0 +#define PM_FPU		1 +#define PM_VPU		2 +#define PM_ISU		3 +#define PM_IFU		4 +#define PM_IDU		5 +#define PM_STS		6 +#define PM_LSU0		7 +#define PM_LSU1U	8 +#define PM_LSU1L	9 +#define PM_LASTUNIT	9 + +/* + * Bits in MMCR0 for PPC970 + */ +#define MMCR0_PMC1SEL_SH	8 +#define MMCR0_PMC2SEL_SH	1 +#define MMCR_PMCSEL_MSK		0x1f + +/* + * Bits in MMCR1 for PPC970 + */ +#define MMCR1_TTM0SEL_SH	62 +#define MMCR1_TTM1SEL_SH	59 +#define MMCR1_TTM3SEL_SH	53 +#define MMCR1_TTMSEL_MSK	3 +#define MMCR1_TD_CP_DBG0SEL_SH	50 +#define MMCR1_TD_CP_DBG1SEL_SH	48 +#define MMCR1_TD_CP_DBG2SEL_SH	46 +#define MMCR1_TD_CP_DBG3SEL_SH	44 +#define MMCR1_PMC1_ADDER_SEL_SH	39 +#define MMCR1_PMC2_ADDER_SEL_SH	38 +#define MMCR1_PMC6_ADDER_SEL_SH	37 +#define MMCR1_PMC5_ADDER_SEL_SH	36 +#define MMCR1_PMC8_ADDER_SEL_SH	35 +#define MMCR1_PMC7_ADDER_SEL_SH	34 +#define MMCR1_PMC3_ADDER_SEL_SH	33 +#define MMCR1_PMC4_ADDER_SEL_SH	32 +#define MMCR1_PMC3SEL_SH	27 +#define MMCR1_PMC4SEL_SH	22 +#define MMCR1_PMC5SEL_SH	17 +#define MMCR1_PMC6SEL_SH	12 +#define MMCR1_PMC7SEL_SH	7 +#define MMCR1_PMC8SEL_SH	2 + +static short mmcr1_adder_bits[8] = { +	MMCR1_PMC1_ADDER_SEL_SH, +	MMCR1_PMC2_ADDER_SEL_SH, +	MMCR1_PMC3_ADDER_SEL_SH, +	MMCR1_PMC4_ADDER_SEL_SH, +	MMCR1_PMC5_ADDER_SEL_SH, +	MMCR1_PMC6_ADDER_SEL_SH, +	MMCR1_PMC7_ADDER_SEL_SH, +	MMCR1_PMC8_ADDER_SEL_SH +}; + +/* + * Bits in MMCRA + */ + +/* + * Layout of constraint bits: + * 6666555555555544444444443333333333222222222211111111110000000000 + * 3210987654321098765432109876543210987654321098765432109876543210 + *               <><><>[  >[  >[  ><  ><  ><  ><  ><><><><><><><><> + *               SPT0T1 UC  PS1 PS2 B0  B1  B2  B3 P1P2P3P4P5P6P7P8 + * + * SP - SPCSEL constraint + *     48-49: SPCSEL value 0x3_0000_0000_0000 + * + * T0 - TTM0 constraint + *     46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000 + * + * T1 - TTM1 constraint + *     44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000 + * + * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS + *     43: UC3 error 0x0800_0000_0000 + *     42: FPU|IFU|VPU events needed 0x0400_0000_0000 + *     41: ISU events needed 0x0200_0000_0000 + *     40: IDU|STS events needed 0x0100_0000_0000 + * + * PS1 + *     39: PS1 error 0x0080_0000_0000 + *     36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000 + * + * PS2 + *     35: PS2 error 0x0008_0000_0000 + *     32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000 + * + * B0 + *     28-31: Byte 0 event source 0xf000_0000 + *	      Encoding as for the event code + * + * B1, B2, B3 + *     24-27, 20-23, 16-19: Byte 1, 2, 3 event sources + * + * P1 + *     15: P1 error 0x8000 + *     14-15: Count of events needing PMC1 + * + * P2..P8 + *     0-13: Count of events needing PMC2..PMC8 + */ + +static unsigned char direct_marked_event[8] = { +	(1<<2) | (1<<3),	/* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */ +	(1<<3) | (1<<5),	/* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */ +	(1<<3) | (1<<5),	/* PMC3: PM_MRK_ST_CMPL_INT, PM_MRK_VMX_FIN */ +	
(1<<4) | (1<<5),	/* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */
+	(1<<4) | (1<<5),	/* PMC5: PM_GRP_MRK, PM_MRK_GRP_TIMEO */
+	(1<<3) | (1<<4) | (1<<5),
+		/* PMC6: PM_MRK_ST_STS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */
+	(1<<4) | (1<<5),	/* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */
+	(1<<4)			/* PMC8: PM_MRK_LSU_FIN */
+};
+
+/*
+ * Returns 1 if event counts things relating to marked instructions
+ * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not.
+ */
+static int p970_marked_instr_event(u64 event)
+{
+	int pmc, psel, unit, byte, bit;
+	unsigned int mask;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	psel = event & PM_PMCSEL_MSK;
+	if (pmc) {
+		if (direct_marked_event[pmc - 1] & (1 << psel))
+			return 1;
+		if (psel == 0)		/* add events */
+			bit = (pmc <= 4)? pmc - 1: 8 - pmc;
+		else if (psel == 7 || psel == 13)	/* decode events */
+			bit = 4;
+		else
+			return 0;
+	} else
+		bit = psel;
+
+	byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	mask = 0;
+	switch (unit) {
+	case PM_VPU:
+		mask = 0x4c;		/* byte 0 bits 2,3,6 */
+		break;
+	case PM_LSU0:
+		/* byte 2 bits 0,2,3,4,6; all of byte 1 */
+		mask = 0x085dff00;
+		break;
+	case PM_LSU1L:
+		mask = 0x50 << 24;	/* byte 3 bits 4,6 */
+		break;
+	}
+	return (mask >> (byte * 8 + bit)) & 1;
+}
+
+/* Masks and values for using events from the various units */
+static u64 unit_cons[PM_LASTUNIT+1][2] = {
+	[PM_FPU] =   { 0xc80000000000ull, 0x040000000000ull },
+	[PM_VPU] =   { 0xc80000000000ull, 0xc40000000000ull },
+	[PM_ISU] =   { 0x080000000000ull, 0x020000000000ull },
+	[PM_IFU] =   { 0xc80000000000ull, 0x840000000000ull },
+	[PM_IDU] =   { 0x380000000000ull, 0x010000000000ull },
+	[PM_STS] =   { 0x380000000000ull, 0x310000000000ull },
+};
+
+static int p970_get_constraint(u64 event, u64 *maskp, u64 *valp)
+{
+	int pmc, byte, unit, sh, spcsel;
+	u64 mask = 0, value = 0;
+	int grp = -1;
+
+	pmc = (event >> PM_PMC_SH) & PM_PMC_MSK;
+	if (pmc) {
+		if (pmc > 8)
+			return -1;
+		sh = (pmc - 1) * 2;
+		mask |= 2 << sh;
+		value |= 1 << sh;
+		grp = ((pmc - 1) >> 1) & 1;
+	}
+	unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK;
+	if (unit) {
+		if (unit > PM_LASTUNIT)
+			return -1;
+		mask |= unit_cons[unit][0];
+		value |= unit_cons[unit][1];
+		byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK;
+		/*
+		 * Bus events on bytes 0 and 2 can be counted
+		 * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8.
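+		 * (The low bit of the byte number therefore selects the
+		 * PMC group, which is what "grp = byte & 1" computes below.)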
+		 */ +		if (!pmc) +			grp = byte & 1; +		/* Set byte lane select field */ +		mask  |= 0xfULL << (28 - 4 * byte); +		value |= (u64)unit << (28 - 4 * byte); +	} +	if (grp == 0) { +		/* increment PMC1/2/5/6 field */ +		mask  |= 0x8000000000ull; +		value |= 0x1000000000ull; +	} else if (grp == 1) { +		/* increment PMC3/4/7/8 field */ +		mask  |= 0x800000000ull; +		value |= 0x100000000ull; +	} +	spcsel = (event >> PM_SPCSEL_SH) & PM_SPCSEL_MSK; +	if (spcsel) { +		mask  |= 3ull << 48; +		value |= (u64)spcsel << 48; +	} +	*maskp = mask; +	*valp = value; +	return 0; +} + +static int p970_get_alternatives(u64 event, unsigned int flags, u64 alt[]) +{ +	alt[0] = event; + +	/* 2 alternatives for LSU empty */ +	if (event == 0x2002 || event == 0x3002) { +		alt[1] = event ^ 0x1000; +		return 2; +	} +		 +	return 1; +} + +static int p970_compute_mmcr(u64 event[], int n_ev, +			     unsigned int hwc[], u64 mmcr[]) +{ +	u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0; +	unsigned int pmc, unit, byte, psel; +	unsigned int ttm, grp; +	unsigned int pmc_inuse = 0; +	unsigned int pmc_grp_use[2]; +	unsigned char busbyte[4]; +	unsigned char unituse[16]; +	unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 }; +	unsigned char ttmuse[2]; +	unsigned char pmcsel[8]; +	int i; +	int spcsel; + +	if (n_ev > 8) +		return -1; + +	/* First pass to count resource use */ +	pmc_grp_use[0] = pmc_grp_use[1] = 0; +	memset(busbyte, 0, sizeof(busbyte)); +	memset(unituse, 0, sizeof(unituse)); +	for (i = 0; i < n_ev; ++i) { +		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; +		if (pmc) { +			if (pmc_inuse & (1 << (pmc - 1))) +				return -1; +			pmc_inuse |= 1 << (pmc - 1); +			/* count 1/2/5/6 vs 3/4/7/8 use */ +			++pmc_grp_use[((pmc - 1) >> 1) & 1]; +		} +		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; +		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; +		if (unit) { +			if (unit > PM_LASTUNIT) +				return -1; +			if (!pmc) +				++pmc_grp_use[byte & 1]; +			if (busbyte[byte] && busbyte[byte] != unit) +				return -1; +			busbyte[byte] = unit; +			unituse[unit] = 1; +		} +	} +	if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4) +		return -1; + +	/* +	 * Assign resources and set multiplexer selects. +	 * +	 * PM_ISU can go either on TTM0 or TTM1, but that's the only +	 * choice we have to deal with. +	 */ +	if (unituse[PM_ISU] & +	    (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU])) +		unitmap[PM_ISU] = 2 | 4;	/* move ISU to TTM1 */ +	/* Set TTM[01]SEL fields. */ +	ttmuse[0] = ttmuse[1] = 0; +	for (i = PM_FPU; i <= PM_STS; ++i) { +		if (!unituse[i]) +			continue; +		ttm = unitmap[i]; +		++ttmuse[(ttm >> 2) & 1]; +		mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH; +	} +	/* Check only one unit per TTMx */ +	if (ttmuse[0] > 1 || ttmuse[1] > 1) +		return -1; + +	/* Set byte lane select fields and TTM3SEL. 
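+	 * Units up to PM_STS are routed through TTM0/TTM1, LSU0 through
+	 * TTM2, and the two LSU1 halves through TTM3, with TTM3SEL
+	 * choosing the half for each byte lane.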
*/ +	for (byte = 0; byte < 4; ++byte) { +		unit = busbyte[byte]; +		if (!unit) +			continue; +		if (unit <= PM_STS) +			ttm = (unitmap[unit] >> 2) & 1; +		else if (unit == PM_LSU0) +			ttm = 2; +		else { +			ttm = 3; +			if (unit == PM_LSU1L && byte >= 2) +				mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte); +		} +		mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); +	} + +	/* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ +	memset(pmcsel, 0x8, sizeof(pmcsel));	/* 8 means don't count */ +	for (i = 0; i < n_ev; ++i) { +		pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; +		unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; +		byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; +		psel = event[i] & PM_PMCSEL_MSK; +		if (!pmc) { +			/* Bus event or any-PMC direct event */ +			if (unit) +				psel |= 0x10 | ((byte & 2) << 2); +			else +				psel |= 8; +			for (pmc = 0; pmc < 8; ++pmc) { +				if (pmc_inuse & (1 << pmc)) +					continue; +				grp = (pmc >> 1) & 1; +				if (unit) { +					if (grp == (byte & 1)) +						break; +				} else if (pmc_grp_use[grp] < 4) { +					++pmc_grp_use[grp]; +					break; +				} +			} +			pmc_inuse |= 1 << pmc; +		} else { +			/* Direct event */ +			--pmc; +			if (psel == 0 && (byte & 2)) +				/* add events on higher-numbered bus */ +				mmcr1 |= 1ull << mmcr1_adder_bits[pmc]; +		} +		pmcsel[pmc] = psel; +		hwc[i] = pmc; +		spcsel = (event[i] >> PM_SPCSEL_SH) & PM_SPCSEL_MSK; +		mmcr1 |= spcsel; +		if (p970_marked_instr_event(event[i])) +			mmcra |= MMCRA_SAMPLE_ENABLE; +	} +	for (pmc = 0; pmc < 2; ++pmc) +		mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc); +	for (; pmc < 8; ++pmc) +		mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)); +	if (pmc_inuse & 1) +		mmcr0 |= MMCR0_PMC1CE; +	if (pmc_inuse & 0xfe) +		mmcr0 |= MMCR0_PMCjCE; + +	mmcra |= 0x2000;	/* mark only one IOP per PPC instruction */ + +	/* Return MMCRx values */ +	mmcr[0] = mmcr0; +	mmcr[1] = mmcr1; +	mmcr[2] = mmcra; +	return 0; +} + +static void p970_disable_pmc(unsigned int pmc, u64 mmcr[]) +{ +	int shift, i; + +	if (pmc <= 1) { +		shift = MMCR0_PMC1SEL_SH - 7 * pmc; +		i = 0; +	} else { +		shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2); +		i = 1; +	} +	/* +	 * Setting the PMCxSEL field to 0x08 disables PMC x. +	 */ +	mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift); +} + +static int ppc970_generic_events[] = { +	[PERF_COUNT_HW_CPU_CYCLES]		= 7, +	[PERF_COUNT_HW_INSTRUCTIONS]		= 1, +	[PERF_COUNT_HW_CACHE_REFERENCES]	= 0x8810, /* PM_LD_REF_L1 */ +	[PERF_COUNT_HW_CACHE_MISSES]		= 0x3810, /* PM_LD_MISS_L1 */ +	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x431,  /* PM_BR_ISSUED */ +	[PERF_COUNT_HW_BRANCH_MISSES] 		= 0x327,  /* PM_GRP_BR_MPRED */ +}; + +#define C(x)	PERF_COUNT_HW_CACHE_##x + +/* + * Table of generalized cache-related events. + * 0 means not supported, -1 means nonsensical, other values + * are event codes. 
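+ * As above, these are raw PPC970 event codes in the PM_PMC/PM_UNIT/
+ * PM_BYTE encoding defined at the top of this file.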
+ */ +static int ppc970_cache_events[C(MAX)][C(OP_MAX)][C(RESULT_MAX)] = { +	[C(L1D)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0x8810,		0x3810	}, +		[C(OP_WRITE)] = {	0x7810,		0x813	}, +		[C(OP_PREFETCH)] = {	0x731,		0	}, +	}, +	[C(L1I)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0	}, +		[C(OP_WRITE)] = {	-1,		-1	}, +		[C(OP_PREFETCH)] = {	0,		0	}, +	}, +	[C(LL)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0	}, +		[C(OP_WRITE)] = {	0,		0	}, +		[C(OP_PREFETCH)] = {	0x733,		0	}, +	}, +	[C(DTLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0x704	}, +		[C(OP_WRITE)] = {	-1,		-1	}, +		[C(OP_PREFETCH)] = {	-1,		-1	}, +	}, +	[C(ITLB)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0,		0x700	}, +		[C(OP_WRITE)] = {	-1,		-1	}, +		[C(OP_PREFETCH)] = {	-1,		-1	}, +	}, +	[C(BPU)] = {		/* 	RESULT_ACCESS	RESULT_MISS */ +		[C(OP_READ)] = {	0x431,		0x327	}, +		[C(OP_WRITE)] = {	-1,		-1	}, +		[C(OP_PREFETCH)] = {	-1,		-1	}, +	}, +}; + +struct power_pmu ppc970_pmu = { +	.n_counter = 8, +	.max_alternatives = 2, +	.add_fields = 0x001100005555ull, +	.test_adder = 0x013300000000ull, +	.compute_mmcr = p970_compute_mmcr, +	.get_constraint = p970_get_constraint, +	.get_alternatives = p970_get_alternatives, +	.disable_pmc = p970_disable_pmc, +	.n_generic = ARRAY_SIZE(ppc970_generic_events), +	.generic_events = ppc970_generic_events, +	.cache_events = &ppc970_cache_events, +}; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 76993941cac..5beffc8f481 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -29,6 +29,7 @@  #include <linux/module.h>  #include <linux/kprobes.h>  #include <linux/kdebug.h> +#include <linux/perf_counter.h>  #include <asm/firmware.h>  #include <asm/page.h> @@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,  		die("Weird page fault", regs, SIGSEGV);  	} +	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); +  	/* When running in the kernel we expect faults to occur only to  	 * addresses in user space.  All other faults represent errors in the  	 * kernel and should generate an OOPS.  Unfortunately, in the case of an @@ -309,6 +312,8 @@ good_area:  	}  	if (ret & VM_FAULT_MAJOR) {  		current->maj_flt++; +		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, +				     regs, address);  #ifdef CONFIG_PPC_SMLPAR  		if (firmware_has_feature(FW_FEATURE_CMO)) {  			preempt_disable(); @@ -316,8 +321,11 @@ good_area:  			preempt_enable();  		}  #endif -	} else +	} else {  		current->min_flt++; +		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, +				     regs, address); +	}  	up_read(&mm->mmap_sem);  	return 0; diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 9da795e4933..732ee93a8e9 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -1,6 +1,7 @@  config PPC64  	bool "64-bit kernel"  	default n +	select HAVE_PERF_COUNTERS  	help  	  This option selects whether a 32-bit or a 64-bit kernel  	  will be built. 
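
A minimal user-space sketch of how the new syscall could be exercised to
read the software page-fault counter wired up in do_page_fault() above.
This is illustrative only and not part of the merge: it assumes this
era's ABI values (PERF_TYPE_SOFTWARE == 1, PERF_COUNT_SW_PAGE_FAULTS == 2),
a 64-byte attribute block with type/size/config at its head, and the
x86-64 syscall number 298 added in the unistd_64.h hunk below;
"perf_counter_attr_sketch" is a made-up stand-in for the kernel's
struct perf_counter_attr.

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Abbreviated stand-in for struct perf_counter_attr (assumption). */
struct perf_counter_attr_sketch {
	uint32_t type;		/* PERF_TYPE_SOFTWARE */
	uint32_t size;		/* size of this block, for ABI versioning */
	uint64_t config;	/* PERF_COUNT_SW_PAGE_FAULTS */
	uint64_t pad[6];	/* remaining fields left zero (defaults) */
};

int main(void)
{
	struct perf_counter_attr_sketch attr;
	uint64_t count = 0;
	long fd;

	memset(&attr, 0, sizeof(attr));
	attr.type = 1;			/* PERF_TYPE_SOFTWARE (assumed) */
	attr.size = sizeof(attr);	/* 64 bytes, assumed v0 ABI size */
	attr.config = 2;		/* PERF_COUNT_SW_PAGE_FAULTS (assumed) */

	/* args: attr, pid (0 = current task), cpu (-1 = any), group_fd, flags */
	fd = syscall(298, &attr, 0, -1, -1, 0UL);
	if (fd < 0) {
		perror("perf_counter_open");
		return 1;
	}

	/* ... do some work that takes page faults ... */

	if (read((int)fd, &count, sizeof(count)) == sizeof(count))
		printf("page faults: %llu\n", (unsigned long long)count);
	return 0;
}
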
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index aafae3b140d..68f5578fe38 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -739,6 +739,7 @@ config X86_UP_IOAPIC
 config X86_LOCAL_APIC
 	def_bool y
 	depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
+	select HAVE_PERF_COUNTERS if (!M386 && !M486)
 
 config X86_IO_APIC
 	def_bool y
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index dcef387ddc3..e590261ba05 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -825,10 +825,11 @@ ia32_sys_call_table:
 	.quad compat_sys_signalfd4
 	.quad sys_eventfd2
 	.quad sys_epoll_create1
-	.quad sys_dup3			/* 330 */
+	.quad sys_dup3				/* 330 */
 	.quad sys_pipe2
 	.quad sys_inotify_init1
 	.quad compat_sys_preadv
 	.quad compat_sys_pwritev
 	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
+	.quad sys_perf_counter_open
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
index 85b46fba422..aff9f1fcdcd 100644
--- a/arch/x86/include/asm/atomic_32.h
+++ b/arch/x86/include/asm/atomic_32.h
@@ -247,5 +247,241 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u)
 #define smp_mb__before_atomic_inc()	barrier()
 #define smp_mb__after_atomic_inc()	barrier()
 
+/* A 64-bit atomic type */
+
+typedef struct {
+	unsigned long long counter;
+} atomic64_t;
+
+#define ATOMIC64_INIT(val)	{ (val) }
+
+/**
+ * __atomic64_read - read atomic64 variable
+ * @ptr: pointer of type atomic64_t
+ *
+ * Reads the value of @ptr; the read itself is not guaranteed
+ * to be atomic and doesn't imply a read memory barrier.
+ */
+#define __atomic64_read(ptr)		((ptr)->counter)
+
+static inline unsigned long long
+cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new)
+{
+	asm volatile(
+
+		LOCK_PREFIX "cmpxchg8b (%[ptr])\n"
+
+		     :		"=A" (old)
+
+		     : [ptr]	"D" (ptr),
+				"A" (old),
+				"b" (ll_low(new)),
+				"c" (ll_high(new))
+
+		     : "memory");
+
+	return old;
+}
+
+static inline unsigned long long
+atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val,
+		 unsigned long long new_val)
+{
+	return cmpxchg8b(&ptr->counter, old_val, new_val);
+}
+
+/**
+ * atomic64_xchg - xchg atomic64 variable
+ * @ptr:      pointer to type atomic64_t
+ * @new_val:  value to assign
+ *
+ * Atomically xchgs the value of @ptr to @new_val and returns
+ * the old value.
+ */
+
+static inline unsigned long long
+atomic64_xchg(atomic64_t *ptr, unsigned long long new_val)
+{
+	unsigned long long old_val;
+
+	do {
+		old_val = __atomic64_read(ptr);
+	} while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+
+	return old_val;
+}
+
+/**
+ * atomic64_set - set atomic64 variable
+ * @ptr:      pointer to type atomic64_t
+ * @new_val:  value to assign
+ *
+ * Atomically sets the value of @ptr to @new_val.
+ */
+static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val)
+{
+	atomic64_xchg(ptr, new_val);
+}
+
+/**
+ * atomic64_read - read atomic64 variable
+ * @ptr:      pointer to type atomic64_t
+ *
+ * Atomically reads the value of @ptr and returns it.
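+ * (A plain 64-bit load is not atomic on 32-bit x86, so the loop
+ * below uses cmpxchg8b to exchange the value with itself, which
+ * yields an atomic read.)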
+ */
+static inline unsigned long long atomic64_read(atomic64_t *ptr)
+{
+	unsigned long long curr_val;
+
+	do {
+		curr_val = __atomic64_read(ptr);
+	} while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val);
+
+	return curr_val;
+}
+
+/**
+ * atomic64_add_return - add and return
+ * @delta: integer value to add
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns @delta + *@ptr
+ */
+static inline unsigned long long
+atomic64_add_return(unsigned long long delta, atomic64_t *ptr)
+{
+	unsigned long long old_val, new_val;
+
+	do {
+		old_val = __atomic64_read(ptr);
+		new_val = old_val + delta;
+
+	} while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val);
+
+	return new_val;
+}
+
+static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr)
+{
+	return atomic64_add_return(-delta, ptr);
+}
+
+static inline long atomic64_inc_return(atomic64_t *ptr)
+{
+	return atomic64_add_return(1, ptr);
+}
+
+static inline long atomic64_dec_return(atomic64_t *ptr)
+{
+	return atomic64_sub_return(1, ptr);
+}
+
+/**
+ * atomic64_add - add integer to atomic64 variable
+ * @delta: integer value to add
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr.
+ */
+static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr)
+{
+	atomic64_add_return(delta, ptr);
+}
+
+/**
+ * atomic64_sub - subtract from atomic64 variable
+ * @delta: integer value to subtract
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr.
+ */
+static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr)
+{
+	atomic64_add(-delta, ptr);
+}
+
+/**
+ * atomic64_sub_and_test - subtract value from variable and test result
+ * @delta: integer value to subtract
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically subtracts @delta from @ptr and returns
+ * true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int
+atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr)
+{
+	unsigned long long old_val = atomic64_sub_return(delta, ptr);
+
+	return old_val == 0;
+}
+
+/**
+ * atomic64_inc - increment atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1.
+ */
+static inline void atomic64_inc(atomic64_t *ptr)
+{
+	atomic64_add(1, ptr);
+}
+
+/**
+ * atomic64_dec - decrement atomic64 variable
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1.
+ */
+static inline void atomic64_dec(atomic64_t *ptr)
+{
+	atomic64_sub(1, ptr);
+}
+
+/**
+ * atomic64_dec_and_test - decrement and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically decrements @ptr by 1 and
+ * returns true if the result is 0, or false for all other
+ * cases.
+ */
+static inline int atomic64_dec_and_test(atomic64_t *ptr)
+{
+	return atomic64_sub_and_test(1, ptr);
+}
+
+/**
+ * atomic64_inc_and_test - increment and test
+ * @ptr: pointer to type atomic64_t
+ *
+ * Atomically increments @ptr by 1
+ * and returns true if the result is zero, or false for all
+ * other cases.
+ */
+static inline int atomic64_inc_and_test(atomic64_t *ptr)
+{
+	return atomic64_sub_and_test(-1, ptr);
+}
+
+/**
+ * atomic64_add_negative - add and test if negative
+ * @delta: integer value to add
+ * @ptr:   pointer to type atomic64_t
+ *
+ * Atomically adds @delta to @ptr and returns true
+ * if the result is negative, or false when
+ * result is greater than or equal to zero.
+ */ +static inline int +atomic64_add_negative(unsigned long long delta, atomic64_t *ptr) +{ +	long long old_val = atomic64_add_return(delta, ptr); + +	return old_val < 0; +} +  #include <asm-generic/atomic.h>  #endif /* _ASM_X86_ATOMIC_32_H */ diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index c2e6bedaf25..d750a10ccad 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -49,7 +49,7 @@ BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)  BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)  #ifdef CONFIG_PERF_COUNTERS -BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) +BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR)  #endif  #ifdef CONFIG_X86_MCE_P4THERMAL diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 37555e52f98..9ebc5c25503 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -13,6 +13,8 @@ typedef struct {  	unsigned int irq_spurious_count;  #endif  	unsigned int generic_irqs;	/* arch dependent */ +	unsigned int apic_perf_irqs; +	unsigned int apic_pending_irqs;  #ifdef CONFIG_SMP  	unsigned int irq_resched_count;  	unsigned int irq_call_count; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 3bd1777a4c8..6df45f63966 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -29,6 +29,8 @@  extern void apic_timer_interrupt(void);  extern void generic_interrupt(void);  extern void error_interrupt(void); +extern void perf_pending_interrupt(void); +  extern void spurious_interrupt(void);  extern void thermal_interrupt(void);  extern void reschedule_interrupt(void); diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h deleted file mode 100644 index fa0fd068bc2..00000000000 --- a/arch/x86/include/asm/intel_arch_perfmon.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H -#define _ASM_X86_INTEL_ARCH_PERFMON_H - -#define MSR_ARCH_PERFMON_PERFCTR0		0xc1 -#define MSR_ARCH_PERFMON_PERFCTR1		0xc2 - -#define MSR_ARCH_PERFMON_EVENTSEL0		0x186 -#define MSR_ARCH_PERFMON_EVENTSEL1		0x187 - -#define ARCH_PERFMON_EVENTSEL0_ENABLE	(1 << 22) -#define ARCH_PERFMON_EVENTSEL_INT	(1 << 20) -#define ARCH_PERFMON_EVENTSEL_OS	(1 << 17) -#define ARCH_PERFMON_EVENTSEL_USR	(1 << 16) - -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL	(0x3c) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK	(0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ -	(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) - -union cpuid10_eax { -	struct { -		unsigned int version_id:8; -		unsigned int num_counters:8; -		unsigned int bit_width:8; -		unsigned int mask_length:8; -	} split; -	unsigned int full; -}; - -#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 910b5a3d675..e997be98c9b 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -108,14 +108,14 @@  #define LOCAL_TIMER_VECTOR		0xef  /* - * Performance monitoring interrupt vector: + * Generic system vector for platform specific use   */ -#define LOCAL_PERF_VECTOR		0xee +#define GENERIC_INTERRUPT_VECTOR	0xed  /* - * Generic system vector for platform specific use + * Performance monitoring pending work vector:   */ -#define GENERIC_INTERRUPT_VECTOR	0xed +#define LOCAL_PENDING_VECTOR		0xec  /*   * First APIC vector available 
to drivers: (vectors 0x30-0xee) we diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h new file mode 100644 index 00000000000..876ed97147b --- /dev/null +++ b/arch/x86/include/asm/perf_counter.h @@ -0,0 +1,100 @@ +#ifndef _ASM_X86_PERF_COUNTER_H +#define _ASM_X86_PERF_COUNTER_H + +/* + * Performance counter hw details: + */ + +#define X86_PMC_MAX_GENERIC					8 +#define X86_PMC_MAX_FIXED					3 + +#define X86_PMC_IDX_GENERIC				        0 +#define X86_PMC_IDX_FIXED				       32 +#define X86_PMC_IDX_MAX					       64 + +#define MSR_ARCH_PERFMON_PERFCTR0			      0xc1 +#define MSR_ARCH_PERFMON_PERFCTR1			      0xc2 + +#define MSR_ARCH_PERFMON_EVENTSEL0			     0x186 +#define MSR_ARCH_PERFMON_EVENTSEL1			     0x187 + +#define ARCH_PERFMON_EVENTSEL0_ENABLE			  (1 << 22) +#define ARCH_PERFMON_EVENTSEL_INT			  (1 << 20) +#define ARCH_PERFMON_EVENTSEL_OS			  (1 << 17) +#define ARCH_PERFMON_EVENTSEL_USR			  (1 << 16) + +/* + * Includes eventsel and unit mask as well: + */ +#define ARCH_PERFMON_EVENT_MASK				    0xffff + +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL		      0x3c +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK		(0x00 << 8) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 		 0 +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ +		(1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) + +#define ARCH_PERFMON_BRANCH_MISSES_RETIRED			 6 + +/* + * Intel "Architectural Performance Monitoring" CPUID + * detection/enumeration details: + */ +union cpuid10_eax { +	struct { +		unsigned int version_id:8; +		unsigned int num_counters:8; +		unsigned int bit_width:8; +		unsigned int mask_length:8; +	} split; +	unsigned int full; +}; + +union cpuid10_edx { +	struct { +		unsigned int num_counters_fixed:4; +		unsigned int reserved:28; +	} split; +	unsigned int full; +}; + + +/* + * Fixed-purpose performance counters: + */ + +/* + * All 3 fixed-mode PMCs are configured via this single MSR: + */ +#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL			0x38d + +/* + * The counts are available in three separate MSRs: + */ + +/* Instr_Retired.Any: */ +#define MSR_ARCH_PERFMON_FIXED_CTR0			0x309 +#define X86_PMC_IDX_FIXED_INSTRUCTIONS			(X86_PMC_IDX_FIXED + 0) + +/* CPU_CLK_Unhalted.Core: */ +#define MSR_ARCH_PERFMON_FIXED_CTR1			0x30a +#define X86_PMC_IDX_FIXED_CPU_CYCLES			(X86_PMC_IDX_FIXED + 1) + +/* CPU_CLK_Unhalted.Ref: */ +#define MSR_ARCH_PERFMON_FIXED_CTR2			0x30b +#define X86_PMC_IDX_FIXED_BUS_CYCLES			(X86_PMC_IDX_FIXED + 2) + +extern void set_perf_counter_pending(void); + +#define clear_perf_counter_pending()	do { } while (0) +#define test_perf_counter_pending()	(0) + +#ifdef CONFIG_PERF_COUNTERS +extern void init_hw_perf_counters(void); +extern void perf_counters_lapic_init(void); +#else +static inline void init_hw_perf_counters(void)		{ } +static inline void perf_counters_lapic_init(void)	{ } +#endif + +#endif /* _ASM_X86_PERF_COUNTER_H */ diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 708dae61262..732a3070615 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -341,6 +341,7 @@  #define __NR_preadv		333  #define __NR_pwritev		334  #define __NR_rt_tgsigqueueinfo	335 +#define __NR_perf_counter_open	336  #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 4e2b0540440..900e1617e67 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -659,7 +659,8 @@ __SYSCALL(__NR_preadv, sys_preadv)  __SYSCALL(__NR_pwritev, sys_pwritev)  
#define __NR_rt_tgsigqueueinfo			297  __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) - +#define __NR_perf_counter_open			298 +__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open)  #ifndef __NO_STUBS  #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index a4c9cf0bf70..076d3881f3d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,6 +14,7 @@   *	Mikael Pettersson	:	PM converted to driver model.   */ +#include <linux/perf_counter.h>  #include <linux/kernel_stat.h>  #include <linux/mc146818rtc.h>  #include <linux/acpi_pmtmr.h> @@ -34,6 +35,7 @@  #include <linux/smp.h>  #include <linux/mm.h> +#include <asm/perf_counter.h>  #include <asm/pgalloc.h>  #include <asm/atomic.h>  #include <asm/mpspec.h> @@ -1187,6 +1189,7 @@ void __cpuinit setup_local_APIC(void)  		apic_write(APIC_ESR, 0);  	}  #endif +	perf_counters_lapic_init();  	preempt_disable(); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4e242f9a06e..3efcb2b96a1 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -1,5 +1,5 @@  # -# Makefile for x86-compatible CPU details and quirks +# Makefile for x86-compatible CPU details, features and quirks  #  # Don't trace early stages of a secondary CPU boot @@ -23,11 +23,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR)		+= centaur.o  obj-$(CONFIG_CPU_SUP_TRANSMETA_32)	+= transmeta.o  obj-$(CONFIG_CPU_SUP_UMC_32)		+= umc.o -obj-$(CONFIG_X86_MCE)	+= mcheck/ -obj-$(CONFIG_MTRR)	+= mtrr/ -obj-$(CONFIG_CPU_FREQ)	+= cpufreq/ +obj-$(CONFIG_PERF_COUNTERS)		+= perf_counter.o -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o +obj-$(CONFIG_X86_MCE)			+= mcheck/ +obj-$(CONFIG_MTRR)			+= mtrr/ +obj-$(CONFIG_CPU_FREQ)			+= cpufreq/ + +obj-$(CONFIG_X86_LOCAL_APIC)		+= perfctr-watchdog.o  quiet_cmd_mkcapflags = MKCAP   $@        cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b0517aa2bd3..3ffdcfa9abd 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,6 +13,7 @@  #include <linux/io.h>  #include <asm/stackprotector.h> +#include <asm/perf_counter.h>  #include <asm/mmu_context.h>  #include <asm/hypervisor.h>  #include <asm/processor.h> @@ -874,6 +875,7 @@ void __init identify_boot_cpu(void)  #else  	vgetcpu_set_mode();  #endif +	init_hw_perf_counters();  }  void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c new file mode 100644 index 00000000000..895c82e7845 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -0,0 +1,1704 @@ +/* + * Performance counter x86 architecture code + * + *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + *  Copyright (C) 2009 Jaswinder Singh Rajput + *  Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * + *  For licencing details see kernel-base/COPYING + */ + +#include <linux/perf_counter.h> +#include <linux/capability.h> +#include <linux/notifier.h> +#include <linux/hardirq.h> +#include <linux/kprobes.h> +#include <linux/module.h> +#include <linux/kdebug.h> +#include <linux/sched.h> +#include <linux/uaccess.h> + +#include <asm/apic.h> +#include <asm/stacktrace.h> +#include <asm/nmi.h> + +static u64 perf_counter_mask __read_mostly; + +struct cpu_hw_counters { +	
struct perf_counter	*counters[X86_PMC_IDX_MAX]; +	unsigned long		used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; +	unsigned long		active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; +	unsigned long		interrupts; +	int			enabled; +}; + +/* + * struct x86_pmu - generic x86 pmu + */ +struct x86_pmu { +	const char	*name; +	int		version; +	int		(*handle_irq)(struct pt_regs *); +	void		(*disable_all)(void); +	void		(*enable_all)(void); +	void		(*enable)(struct hw_perf_counter *, int); +	void		(*disable)(struct hw_perf_counter *, int); +	unsigned	eventsel; +	unsigned	perfctr; +	u64		(*event_map)(int); +	u64		(*raw_event)(u64); +	int		max_events; +	int		num_counters; +	int		num_counters_fixed; +	int		counter_bits; +	u64		counter_mask; +	u64		max_period; +	u64		intel_ctrl; +}; + +static struct x86_pmu x86_pmu __read_mostly; + +static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { +	.enabled = 1, +}; + +/* + * Intel PerfMon v3. Used on Core2 and later. + */ +static const u64 intel_perfmon_event_map[] = +{ +  [PERF_COUNT_HW_CPU_CYCLES]		= 0x003c, +  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0, +  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x4f2e, +  [PERF_COUNT_HW_CACHE_MISSES]		= 0x412e, +  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4, +  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5, +  [PERF_COUNT_HW_BUS_CYCLES]		= 0x013c, +}; + +static u64 intel_pmu_event_map(int event) +{ +	return intel_perfmon_event_map[event]; +} + +/* + * Generalized hw caching related event table, filled + * in on a per model basis. A value of 0 means + * 'not supported', -1 means 'event makes no sense on + * this CPU', any other value means the raw event + * ID. + */ + +#define C(x) PERF_COUNT_HW_CACHE_##x + +static u64 __read_mostly hw_cache_event_ids +				[PERF_COUNT_HW_CACHE_MAX] +				[PERF_COUNT_HW_CACHE_OP_MAX] +				[PERF_COUNT_HW_CACHE_RESULT_MAX]; + +static const u64 nehalem_hw_cache_event_ids +				[PERF_COUNT_HW_CACHE_MAX] +				[PERF_COUNT_HW_CACHE_OP_MAX] +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI            */ +		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE         */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI            */ +		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE         */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS        */ +		[ C(RESULT_MISS)   ] = 0x024e, /* L1D_PREFETCH.MISS            */ +	}, + }, + [ C(L1I ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                    */ +		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                   */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x0, +		[ C(RESULT_MISS)   ] = 0x0, +	}, + }, + [ C(LL  ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS               */ +		[ C(RESULT_MISS)   ] = 0x0224, /* L2_RQSTS.LD_MISS             */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS                */ +		[ C(RESULT_MISS)   ] = 0x0824, /* L2_RQSTS.RFO_MISS            */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference                */ +		[ C(RESULT_MISS)   ] = 0x412e, /* LLC Misses                   */ +	}, + }, + [ C(DTLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI   (alias)  */ +		[ C(RESULT_MISS)   ] = 0x0108, /* 
DTLB_LOAD_MISSES.ANY         */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI   (alias)  */ +		[ C(RESULT_MISS)   ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS  */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x0, +		[ C(RESULT_MISS)   ] = 0x0, +	}, + }, + [ C(ITLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P           */ +		[ C(RESULT_MISS)   ] = 0x20c8, /* ITLB_MISS_RETIRED            */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, + [ C(BPU ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */ +		[ C(RESULT_MISS)   ] = 0x03e8, /* BPU_CLEARS.ANY               */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, +}; + +static const u64 core2_hw_cache_event_ids +				[PERF_COUNT_HW_CACHE_MAX] +				[PERF_COUNT_HW_CACHE_OP_MAX] +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI          */ +		[ C(RESULT_MISS)   ] = 0x0140, /* L1D_CACHE_LD.I_STATE       */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI          */ +		[ C(RESULT_MISS)   ] = 0x0141, /* L1D_CACHE_ST.I_STATE       */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS      */ +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(L1I ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS                  */ +		[ C(RESULT_MISS)   ] = 0x0081, /* L1I.MISSES                 */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(LL  ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */ +		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */ +		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(DTLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI  (alias) */ +		[ C(RESULT_MISS)   ] = 0x0208, /* DTLB_MISSES.MISS_LD        */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI  (alias) */ +		[ C(RESULT_MISS)   ] = 0x0808, /* DTLB_MISSES.MISS_ST        */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(ITLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */ +		[ C(RESULT_MISS)   ] = 0x1282, /* ITLBMISSES                 */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, + [ C(BPU ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */ +		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, 
+		[ C(RESULT_MISS)   ] = -1, +	}, + }, +}; + +static const u64 atom_hw_cache_event_ids +				[PERF_COUNT_HW_CACHE_MAX] +				[PERF_COUNT_HW_CACHE_OP_MAX] +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD               */ +		[ C(RESULT_MISS)   ] = 0, +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST               */ +		[ C(RESULT_MISS)   ] = 0, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0x0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(L1I ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS                  */ +		[ C(RESULT_MISS)   ] = 0x0280, /* L1I.MISSES                 */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(LL  ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI                 */ +		[ C(RESULT_MISS)   ] = 0x4129, /* L2_LD.ISTATE               */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI                 */ +		[ C(RESULT_MISS)   ] = 0x412A, /* L2_ST.ISTATE               */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(DTLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI  (alias) */ +		[ C(RESULT_MISS)   ] = 0x0508, /* DTLB_MISSES.MISS_LD        */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI  (alias) */ +		[ C(RESULT_MISS)   ] = 0x0608, /* DTLB_MISSES.MISS_ST        */ +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(ITLB) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P         */ +		[ C(RESULT_MISS)   ] = 0x0282, /* ITLB.MISSES                */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, + [ C(BPU ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY        */ +		[ C(RESULT_MISS)   ] = 0x00c5, /* BP_INST_RETIRED.MISPRED    */ +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = -1, +		[ C(RESULT_MISS)   ] = -1, +	}, + }, +}; + +static u64 intel_pmu_raw_event(u64 event) +{ +#define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL +#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL +#define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL +#define CORE_EVNTSEL_INV_MASK		0x00800000ULL +#define CORE_EVNTSEL_COUNTER_MASK	0xFF000000ULL + +#define CORE_EVNTSEL_MASK		\ +	(CORE_EVNTSEL_EVENT_MASK |	\ +	 CORE_EVNTSEL_UNIT_MASK  |	\ +	 CORE_EVNTSEL_EDGE_MASK  |	\ +	 CORE_EVNTSEL_INV_MASK  |	\ +	 CORE_EVNTSEL_COUNTER_MASK) + +	return event & CORE_EVNTSEL_MASK; +} + +static const u64 amd_0f_hw_cache_event_ids +				[PERF_COUNT_HW_CACHE_MAX] +				[PERF_COUNT_HW_CACHE_OP_MAX] +				[PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, +	[ C(OP_WRITE) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, +	[ C(OP_PREFETCH) ] = { +		[ C(RESULT_ACCESS) ] = 0, +		[ C(RESULT_MISS)   ] = 0, +	}, + }, + [ C(L1I ) ] = { +	[ C(OP_READ) ] = { +		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches  */ +		[ 
C(RESULT_MISS)   ] = 0x0081, /* Instruction cache misses   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(LL  ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(DTLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = 0,
+		[ C(RESULT_MISS)   ] = 0,
+	},
+ },
+ [ C(ITLB) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches        */
+		[ C(RESULT_MISS)   ] = 0x0085, /* Instr. fetch ITLB misses   */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+ [ C(BPU ) ] = {
+	[ C(OP_READ) ] = {
+		[ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr.      */
+		[ C(RESULT_MISS)   ] = 0x00c3, /* Retired Mispredicted BI    */
+	},
+	[ C(OP_WRITE) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+	[ C(OP_PREFETCH) ] = {
+		[ C(RESULT_ACCESS) ] = -1,
+		[ C(RESULT_MISS)   ] = -1,
+	},
+ },
+};
+
+/*
+ * AMD Performance Monitor K7 and later.
+ */
+static const u64 amd_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]		= 0x0076,
+  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0080,
+  [PERF_COUNT_HW_CACHE_MISSES]		= 0x0081,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
+};
+
+static u64 amd_pmu_event_map(int event)
+{
+	return amd_perfmon_event_map[event];
+}
+
+static u64 amd_pmu_raw_event(u64 event)
+{
+#define K7_EVNTSEL_EVENT_MASK	0x7000000FFULL
+#define K7_EVNTSEL_UNIT_MASK	0x00000FF00ULL
+#define K7_EVNTSEL_EDGE_MASK	0x000040000ULL
+#define K7_EVNTSEL_INV_MASK	0x000800000ULL
+#define K7_EVNTSEL_COUNTER_MASK	0x0FF000000ULL
+
+#define K7_EVNTSEL_MASK			\
+	(K7_EVNTSEL_EVENT_MASK |	\
+	 K7_EVNTSEL_UNIT_MASK  |	\
+	 K7_EVNTSEL_EDGE_MASK  |	\
+	 K7_EVNTSEL_INV_MASK   |	\
+	 K7_EVNTSEL_COUNTER_MASK)
+
+	return event & K7_EVNTSEL_MASK;
+}
+
+/*
+ * Propagate counter elapsed time into the generic counter.
+ * Can only be executed on the CPU where the counter is active.
+ * Returns the delta events processed.
+ */
+static u64
+x86_perf_counter_update(struct perf_counter *counter,
+			struct hw_perf_counter *hwc, int idx)
+{
+	int shift = 64 - x86_pmu.counter_bits;
+	u64 prev_raw_count, new_raw_count;
+	s64 delta;
+
+	/*
+	 * Careful: an NMI might modify the previous counter value.
+	 *
+	 * Our tactic to handle this is to first atomically read and
+	 * exchange a new raw count - then add that new-prev delta
+	 * count to the generic counter atomically:
+	 */
+again:
+	prev_raw_count = atomic64_read(&hwc->prev_count);
+	rdmsrl(hwc->counter_base + idx, new_raw_count);
+
+	if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+					new_raw_count) != prev_raw_count)
+		goto again;
+
+	/*
+	 * Now we have the new raw value and have updated the prev
+	 * timestamp already. We can now calculate the elapsed delta
+	 * (counter-)time and add that to the generic counter.
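+	 * (The shift pair below makes the subtraction wrap at the width
+	 * of the hardware counter, so the computed delta stays correct
+	 * across a counter rollover.)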
+	 * +	 * Careful, not all hw sign-extends above the physical width +	 * of the count. +	 */ +	delta = (new_raw_count << shift) - (prev_raw_count << shift); +	delta >>= shift; + +	atomic64_add(delta, &counter->count); +	atomic64_sub(delta, &hwc->period_left); + +	return new_raw_count; +} + +static atomic_t active_counters; +static DEFINE_MUTEX(pmc_reserve_mutex); + +static bool reserve_pmc_hardware(void) +{ +	int i; + +	if (nmi_watchdog == NMI_LOCAL_APIC) +		disable_lapic_nmi_watchdog(); + +	for (i = 0; i < x86_pmu.num_counters; i++) { +		if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) +			goto perfctr_fail; +	} + +	for (i = 0; i < x86_pmu.num_counters; i++) { +		if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) +			goto eventsel_fail; +	} + +	return true; + +eventsel_fail: +	for (i--; i >= 0; i--) +		release_evntsel_nmi(x86_pmu.eventsel + i); + +	i = x86_pmu.num_counters; + +perfctr_fail: +	for (i--; i >= 0; i--) +		release_perfctr_nmi(x86_pmu.perfctr + i); + +	if (nmi_watchdog == NMI_LOCAL_APIC) +		enable_lapic_nmi_watchdog(); + +	return false; +} + +static void release_pmc_hardware(void) +{ +	int i; + +	for (i = 0; i < x86_pmu.num_counters; i++) { +		release_perfctr_nmi(x86_pmu.perfctr + i); +		release_evntsel_nmi(x86_pmu.eventsel + i); +	} + +	if (nmi_watchdog == NMI_LOCAL_APIC) +		enable_lapic_nmi_watchdog(); +} + +static void hw_perf_counter_destroy(struct perf_counter *counter) +{ +	if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { +		release_pmc_hardware(); +		mutex_unlock(&pmc_reserve_mutex); +	} +} + +static inline int x86_pmu_initialized(void) +{ +	return x86_pmu.handle_irq != NULL; +} + +static inline int +set_ext_hw_attr(struct hw_perf_counter *hwc, struct perf_counter_attr *attr) +{ +	unsigned int cache_type, cache_op, cache_result; +	u64 config, val; + +	config = attr->config; + +	cache_type = (config >>  0) & 0xff; +	if (cache_type >= PERF_COUNT_HW_CACHE_MAX) +		return -EINVAL; + +	cache_op = (config >>  8) & 0xff; +	if (cache_op >= PERF_COUNT_HW_CACHE_OP_MAX) +		return -EINVAL; + +	cache_result = (config >> 16) & 0xff; +	if (cache_result >= PERF_COUNT_HW_CACHE_RESULT_MAX) +		return -EINVAL; + +	val = hw_cache_event_ids[cache_type][cache_op][cache_result]; + +	if (val == 0) +		return -ENOENT; + +	if (val == -1) +		return -EINVAL; + +	hwc->config |= val; + +	return 0; +} + +/* + * Setup the hardware configuration for a given attr_type + */ +static int __hw_perf_counter_init(struct perf_counter *counter) +{ +	struct perf_counter_attr *attr = &counter->attr; +	struct hw_perf_counter *hwc = &counter->hw; +	int err; + +	if (!x86_pmu_initialized()) +		return -ENODEV; + +	err = 0; +	if (!atomic_inc_not_zero(&active_counters)) { +		mutex_lock(&pmc_reserve_mutex); +		if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) +			err = -EBUSY; +		else +			atomic_inc(&active_counters); +		mutex_unlock(&pmc_reserve_mutex); +	} +	if (err) +		return err; + +	/* +	 * Generate PMC IRQs: +	 * (keep 'enabled' bit clear for now) +	 */ +	hwc->config = ARCH_PERFMON_EVENTSEL_INT; + +	/* +	 * Count user and OS events unless requested not to. 
+	 */ +	if (!attr->exclude_user) +		hwc->config |= ARCH_PERFMON_EVENTSEL_USR; +	if (!attr->exclude_kernel) +		hwc->config |= ARCH_PERFMON_EVENTSEL_OS; + +	if (!hwc->sample_period) { +		hwc->sample_period = x86_pmu.max_period; +		hwc->last_period = hwc->sample_period; +		atomic64_set(&hwc->period_left, hwc->sample_period); +	} + +	counter->destroy = hw_perf_counter_destroy; + +	/* +	 * Raw event type provide the config in the event structure +	 */ +	if (attr->type == PERF_TYPE_RAW) { +		hwc->config |= x86_pmu.raw_event(attr->config); +		return 0; +	} + +	if (attr->type == PERF_TYPE_HW_CACHE) +		return set_ext_hw_attr(hwc, attr); + +	if (attr->config >= x86_pmu.max_events) +		return -EINVAL; +	/* +	 * The generic map: +	 */ +	hwc->config |= x86_pmu.event_map(attr->config); + +	return 0; +} + +static void intel_pmu_disable_all(void) +{ +	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); +} + +static void amd_pmu_disable_all(void) +{ +	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	int idx; + +	if (!cpuc->enabled) +		return; + +	cpuc->enabled = 0; +	/* +	 * ensure we write the disable before we start disabling the +	 * counters proper, so that amd_pmu_enable_counter() does the +	 * right thing. +	 */ +	barrier(); + +	for (idx = 0; idx < x86_pmu.num_counters; idx++) { +		u64 val; + +		if (!test_bit(idx, cpuc->active_mask)) +			continue; +		rdmsrl(MSR_K7_EVNTSEL0 + idx, val); +		if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) +			continue; +		val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; +		wrmsrl(MSR_K7_EVNTSEL0 + idx, val); +	} +} + +void hw_perf_disable(void) +{ +	if (!x86_pmu_initialized()) +		return; +	return x86_pmu.disable_all(); +} + +static void intel_pmu_enable_all(void) +{ +	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); +} + +static void amd_pmu_enable_all(void) +{ +	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	int idx; + +	if (cpuc->enabled) +		return; + +	cpuc->enabled = 1; +	barrier(); + +	for (idx = 0; idx < x86_pmu.num_counters; idx++) { +		u64 val; + +		if (!test_bit(idx, cpuc->active_mask)) +			continue; +		rdmsrl(MSR_K7_EVNTSEL0 + idx, val); +		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) +			continue; +		val |= ARCH_PERFMON_EVENTSEL0_ENABLE; +		wrmsrl(MSR_K7_EVNTSEL0 + idx, val); +	} +} + +void hw_perf_enable(void) +{ +	if (!x86_pmu_initialized()) +		return; +	x86_pmu.enable_all(); +} + +static inline u64 intel_pmu_get_status(void) +{ +	u64 status; + +	rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + +	return status; +} + +static inline void intel_pmu_ack_status(u64 ack) +{ +	wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ +	int err; +	err = checking_wrmsrl(hwc->config_base + idx, +			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); +} + +static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ +	int err; +	err = checking_wrmsrl(hwc->config_base + idx, +			      hwc->config); +} + +static inline void +intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) +{ +	int idx = __idx - X86_PMC_IDX_FIXED; +	u64 ctrl_val, mask; +	int err; + +	mask = 0xfULL << (idx * 4); + +	rdmsrl(hwc->config_base, ctrl_val); +	ctrl_val &= ~mask; +	err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static inline void +intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ +	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { +		intel_pmu_disable_fixed(hwc, idx); +		return; +	} + +	x86_pmu_disable_counter(hwc, idx); +} + 
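[Editor's example] __hw_perf_counter_init() above accepts three kinds of attr: a raw model-specific event (passed through x86_pmu.raw_event()), a generalized cache event decoded by set_ext_hw_attr(), or a generic hardware id looked up via x86_pmu.event_map(). A minimal user-space sketch of how the cache-event config is packed - the enum values mirror the perf_counter.h definitions later in this patch; nothing here is kernel code:

#include <stdint.h>
#include <stdio.h>

enum { CACHE_L1D = 0 };		/* PERF_COUNT_HW_CACHE_L1D         */
enum { OP_READ = 0 };		/* PERF_COUNT_HW_CACHE_OP_READ     */
enum { RESULT_MISS = 1 };	/* PERF_COUNT_HW_CACHE_RESULT_MISS */

/* Pack byte 0 = cache, byte 1 = op, byte 2 = result - exactly the
 * layout set_ext_hw_attr() unpacks: */
static uint64_t hw_cache_config(uint64_t cache, uint64_t op, uint64_t result)
{
	return cache | (op << 8) | (result << 16);
}

int main(void)
{
	/* L1 data-cache read misses -> config 0x10000: */
	printf("%#llx\n", (unsigned long long)
	       hw_cache_config(CACHE_L1D, OP_READ, RESULT_MISS));
	return 0;
}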
+static inline void +amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ +	x86_pmu_disable_counter(hwc, idx); +} + +static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); + +/* + * Set the next IRQ period, based on the hwc->period_left value. + * To be called with the counter disabled in hw: + */ +static int +x86_perf_counter_set_period(struct perf_counter *counter, +			     struct hw_perf_counter *hwc, int idx) +{ +	s64 left = atomic64_read(&hwc->period_left); +	s64 period = hwc->sample_period; +	int err, ret = 0; + +	/* +	 * If we are way outside a reasonable range then just skip forward: +	 */ +	if (unlikely(left <= -period)) { +		left = period; +		atomic64_set(&hwc->period_left, left); +		hwc->last_period = period; +		ret = 1; +	} + +	if (unlikely(left <= 0)) { +		left += period; +		atomic64_set(&hwc->period_left, left); +		hwc->last_period = period; +		ret = 1; +	} +	/* +	 * Quirk: certain CPUs don't like it if just 1 event is left: +	 */ +	if (unlikely(left < 2)) +		left = 2; + +	if (left > x86_pmu.max_period) +		left = x86_pmu.max_period; + +	per_cpu(prev_left[idx], smp_processor_id()) = left; + +	/* +	 * The hw counter starts counting from this counter offset, +	 * mark it to be able to extract future deltas: +	 */ +	atomic64_set(&hwc->prev_count, (u64)-left); + +	err = checking_wrmsrl(hwc->counter_base + idx, +			     (u64)(-left) & x86_pmu.counter_mask); + +	return ret; +} + +static inline void +intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) +{ +	int idx = __idx - X86_PMC_IDX_FIXED; +	u64 ctrl_val, bits, mask; +	int err; + +	/* +	 * Enable IRQ generation (0x8), +	 * and enable ring-3 counting (0x2) and ring-0 counting (0x1) +	 * if requested: +	 */ +	bits = 0x8ULL; +	if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) +		bits |= 0x2; +	if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) +		bits |= 0x1; +	bits <<= (idx * 4); +	mask = 0xfULL << (idx * 4); + +	rdmsrl(hwc->config_base, ctrl_val); +	ctrl_val &= ~mask; +	ctrl_val |= bits; +	err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + +static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ +	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { +		intel_pmu_enable_fixed(hwc, idx); +		return; +	} + +	x86_pmu_enable_counter(hwc, idx); +} + +static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ +	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + +	if (cpuc->enabled) +		x86_pmu_enable_counter(hwc, idx); +	else +		x86_pmu_disable_counter(hwc, idx); +} + +static int +fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) +{ +	unsigned int event; + +	if (!x86_pmu.num_counters_fixed) +		return -1; + +	event = hwc->config & ARCH_PERFMON_EVENT_MASK; + +	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) +		return X86_PMC_IDX_FIXED_INSTRUCTIONS; +	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES))) +		return X86_PMC_IDX_FIXED_CPU_CYCLES; +	if (unlikely(event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES))) +		return X86_PMC_IDX_FIXED_BUS_CYCLES; + +	return -1; +} + +/* + * Find a PMC slot for the freshly enabled / scheduled in counter: + */ +static int x86_pmu_enable(struct perf_counter *counter) +{ +	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	struct hw_perf_counter *hwc = &counter->hw; +	int idx; + +	idx = fixed_mode_idx(counter, hwc); +	if (idx >= 0) { +		/* +		 * Try to get the fixed counter, if that is already taken +		 * then try to get a generic counter: +		 */ +		if 
(test_and_set_bit(idx, cpuc->used_mask)) +			goto try_generic; + +		hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; +		/* +		 * We set it so that counter_base + idx in wrmsr/rdmsr maps to +		 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: +		 */ +		hwc->counter_base = +			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; +		hwc->idx = idx; +	} else { +		idx = hwc->idx; +		/* Try to get the previous generic counter again */ +		if (test_and_set_bit(idx, cpuc->used_mask)) { +try_generic: +			idx = find_first_zero_bit(cpuc->used_mask, +						  x86_pmu.num_counters); +			if (idx == x86_pmu.num_counters) +				return -EAGAIN; + +			set_bit(idx, cpuc->used_mask); +			hwc->idx = idx; +		} +		hwc->config_base  = x86_pmu.eventsel; +		hwc->counter_base = x86_pmu.perfctr; +	} + +	perf_counters_lapic_init(); + +	x86_pmu.disable(hwc, idx); + +	cpuc->counters[idx] = counter; +	set_bit(idx, cpuc->active_mask); + +	x86_perf_counter_set_period(counter, hwc, idx); +	x86_pmu.enable(hwc, idx); + +	return 0; +} + +static void x86_pmu_unthrottle(struct perf_counter *counter) +{ +	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	struct hw_perf_counter *hwc = &counter->hw; + +	if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || +				cpuc->counters[hwc->idx] != counter)) +		return; + +	x86_pmu.enable(hwc, hwc->idx); +} + +void perf_counter_print_debug(void) +{ +	u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; +	struct cpu_hw_counters *cpuc; +	unsigned long flags; +	int cpu, idx; + +	if (!x86_pmu.num_counters) +		return; + +	local_irq_save(flags); + +	cpu = smp_processor_id(); +	cpuc = &per_cpu(cpu_hw_counters, cpu); + +	if (x86_pmu.version >= 2) { +		rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); +		rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); +		rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); +		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); + +		pr_info("\n"); +		pr_info("CPU#%d: ctrl:       %016llx\n", cpu, ctrl); +		pr_info("CPU#%d: status:     %016llx\n", cpu, status); +		pr_info("CPU#%d: overflow:   %016llx\n", cpu, overflow); +		pr_info("CPU#%d: fixed:      %016llx\n", cpu, fixed); +	} +	pr_info("CPU#%d: used:       %016llx\n", cpu, *(u64 *)cpuc->used_mask); + +	for (idx = 0; idx < x86_pmu.num_counters; idx++) { +		rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); +		rdmsrl(x86_pmu.perfctr  + idx, pmc_count); + +		prev_left = per_cpu(prev_left[idx], cpu); + +		pr_info("CPU#%d:   gen-PMC%d ctrl:  %016llx\n", +			cpu, idx, pmc_ctrl); +		pr_info("CPU#%d:   gen-PMC%d count: %016llx\n", +			cpu, idx, pmc_count); +		pr_info("CPU#%d:   gen-PMC%d left:  %016llx\n", +			cpu, idx, prev_left); +	} +	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { +		rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); + +		pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", +			cpu, idx, pmc_count); +	} +	local_irq_restore(flags); +} + +static void x86_pmu_disable(struct perf_counter *counter) +{ +	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); +	struct hw_perf_counter *hwc = &counter->hw; +	int idx = hwc->idx; + +	/* +	 * Must be done before we disable, otherwise the nmi handler +	 * could reenable again: +	 */ +	clear_bit(idx, cpuc->active_mask); +	x86_pmu.disable(hwc, idx); + +	/* +	 * Make sure the cleared pointer becomes visible before we +	 * (potentially) free the counter: +	 */ +	barrier(); + +	/* +	 * Drain the remaining delta count out of a counter +	 * that we are disabling: +	 */ +	x86_perf_counter_update(counter, hwc, idx); +	cpuc->counters[idx] = NULL; +	clear_bit(idx, 
cpuc->used_mask); +} + +/* + * Save and restart an expired counter. Called by NMI contexts, + * so it has to be careful about preempting normal counter ops: + */ +static int intel_pmu_save_and_restart(struct perf_counter *counter) +{ +	struct hw_perf_counter *hwc = &counter->hw; +	int idx = hwc->idx; +	int ret; + +	x86_perf_counter_update(counter, hwc, idx); +	ret = x86_perf_counter_set_period(counter, hwc, idx); + +	if (counter->state == PERF_COUNTER_STATE_ACTIVE) +		intel_pmu_enable_counter(hwc, idx); + +	return ret; +} + +static void intel_pmu_reset(void) +{ +	unsigned long flags; +	int idx; + +	if (!x86_pmu.num_counters) +		return; + +	local_irq_save(flags); + +	printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + +	for (idx = 0; idx < x86_pmu.num_counters; idx++) { +		checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); +		checking_wrmsrl(x86_pmu.perfctr  + idx, 0ull); +	} +	for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { +		checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); +	} + +	local_irq_restore(flags); +} + + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static int intel_pmu_handle_irq(struct pt_regs *regs) +{ +	struct perf_sample_data data; +	struct cpu_hw_counters *cpuc; +	int bit, cpu, loops; +	u64 ack, status; + +	data.regs = regs; +	data.addr = 0; + +	cpu = smp_processor_id(); +	cpuc = &per_cpu(cpu_hw_counters, cpu); + +	perf_disable(); +	status = intel_pmu_get_status(); +	if (!status) { +		perf_enable(); +		return 0; +	} + +	loops = 0; +again: +	if (++loops > 100) { +		WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); +		perf_counter_print_debug(); +		intel_pmu_reset(); +		perf_enable(); +		return 1; +	} + +	inc_irq_stat(apic_perf_irqs); +	ack = status; +	for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { +		struct perf_counter *counter = cpuc->counters[bit]; + +		clear_bit(bit, (unsigned long *) &status); +		if (!test_bit(bit, cpuc->active_mask)) +			continue; + +		if (!intel_pmu_save_and_restart(counter)) +			continue; + +		if (perf_counter_overflow(counter, 1, &data)) +			intel_pmu_disable_counter(&counter->hw, bit); +	} + +	intel_pmu_ack_status(ack); + +	/* +	 * Repeat if there is more work to be done: +	 */ +	status = intel_pmu_get_status(); +	if (status) +		goto again; + +	perf_enable(); + +	return 1; +} + +static int amd_pmu_handle_irq(struct pt_regs *regs) +{ +	struct perf_sample_data data; +	struct cpu_hw_counters *cpuc; +	struct perf_counter *counter; +	struct hw_perf_counter *hwc; +	int cpu, idx, handled = 0; +	u64 val; + +	data.regs = regs; +	data.addr = 0; + +	cpu = smp_processor_id(); +	cpuc = &per_cpu(cpu_hw_counters, cpu); + +	for (idx = 0; idx < x86_pmu.num_counters; idx++) { +		if (!test_bit(idx, cpuc->active_mask)) +			continue; + +		counter = cpuc->counters[idx]; +		hwc = &counter->hw; + +		val = x86_perf_counter_update(counter, hwc, idx); +		if (val & (1ULL << (x86_pmu.counter_bits - 1))) +			continue; + +		/* +		 * counter overflow +		 */ +		handled		= 1; +		data.period	= counter->hw.last_period; + +		if (!x86_perf_counter_set_period(counter, hwc, idx)) +			continue; + +		if (perf_counter_overflow(counter, 1, &data)) +			amd_pmu_disable_counter(hwc, idx); +	} + +	if (handled) +		inc_irq_stat(apic_perf_irqs); + +	return handled; +} + +void smp_perf_pending_interrupt(struct pt_regs *regs) +{ +	irq_enter(); +	ack_APIC_irq(); +	inc_irq_stat(apic_pending_irqs); +	perf_counter_do_pending(); +	irq_exit(); +} + +void set_perf_counter_pending(void) +{ +	
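/* (Editor's aside on the overflow test in amd_pmu_handle_irq() above: counters are programmed to -left, so bit 47 stays set while the counter climbs towards the 2^48 wrap; after the wrap the raw value is small again and bit 47 reads clear, which is what flags the overflow. E.g. with left = 100000 the counter starts at 0xFFFFFFFE7960, and the first read that comes back with bit 47 clear marks an expired period - this is also why the AMD max_period below keeps the top bit free.) */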
apic->send_IPI_self(LOCAL_PENDING_VECTOR); +} + +void perf_counters_lapic_init(void) +{ +	if (!x86_pmu_initialized()) +		return; + +	/* +	 * Always use NMI for PMU +	 */ +	apic_write(APIC_LVTPC, APIC_DM_NMI); +} + +static int __kprobes +perf_counter_nmi_handler(struct notifier_block *self, +			 unsigned long cmd, void *__args) +{ +	struct die_args *args = __args; +	struct pt_regs *regs; + +	if (!atomic_read(&active_counters)) +		return NOTIFY_DONE; + +	switch (cmd) { +	case DIE_NMI: +	case DIE_NMI_IPI: +		break; + +	default: +		return NOTIFY_DONE; +	} + +	regs = args->regs; + +	apic_write(APIC_LVTPC, APIC_DM_NMI); +	/* +	 * Can't rely on the handled return value to say it was our NMI, two +	 * counters could trigger 'simultaneously' raising two back-to-back NMIs. +	 * +	 * If the first NMI handles both, the latter will be empty and daze +	 * the CPU. +	 */ +	x86_pmu.handle_irq(regs); + +	return NOTIFY_STOP; +} + +static __read_mostly struct notifier_block perf_counter_nmi_notifier = { +	.notifier_call		= perf_counter_nmi_handler, +	.next			= NULL, +	.priority		= 1 +}; + +static struct x86_pmu intel_pmu = { +	.name			= "Intel", +	.handle_irq		= intel_pmu_handle_irq, +	.disable_all		= intel_pmu_disable_all, +	.enable_all		= intel_pmu_enable_all, +	.enable			= intel_pmu_enable_counter, +	.disable		= intel_pmu_disable_counter, +	.eventsel		= MSR_ARCH_PERFMON_EVENTSEL0, +	.perfctr		= MSR_ARCH_PERFMON_PERFCTR0, +	.event_map		= intel_pmu_event_map, +	.raw_event		= intel_pmu_raw_event, +	.max_events		= ARRAY_SIZE(intel_perfmon_event_map), +	/* +	 * Intel PMCs cannot be accessed sanely above 32 bit width, +	 * so we install an artificial 1<<31 period regardless of +	 * the generic counter period: +	 */ +	.max_period		= (1ULL << 31) - 1, +}; + +static struct x86_pmu amd_pmu = { +	.name			= "AMD", +	.handle_irq		= amd_pmu_handle_irq, +	.disable_all		= amd_pmu_disable_all, +	.enable_all		= amd_pmu_enable_all, +	.enable			= amd_pmu_enable_counter, +	.disable		= amd_pmu_disable_counter, +	.eventsel		= MSR_K7_EVNTSEL0, +	.perfctr		= MSR_K7_PERFCTR0, +	.event_map		= amd_pmu_event_map, +	.raw_event		= amd_pmu_raw_event, +	.max_events		= ARRAY_SIZE(amd_perfmon_event_map), +	.num_counters		= 4, +	.counter_bits		= 48, +	.counter_mask		= (1ULL << 48) - 1, +	/* use highest bit to detect overflow */ +	.max_period		= (1ULL << 47) - 1, +}; + +static int intel_pmu_init(void) +{ +	union cpuid10_edx edx; +	union cpuid10_eax eax; +	unsigned int unused; +	unsigned int ebx; +	int version; + +	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) +		return -ENODEV; + +	/* +	 * Check whether the Architectural PerfMon supports +	 * Branch Misses Retired Event or not. 
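+	 * +	 * (Editor's sketch: the same leaf can be probed from user space +	 * with GCC's <cpuid.h>, under the assumption of an x86 build: +	 * +	 *	unsigned int ax, bx, cx, dx; +	 *	if (__get_cpuid(10, &ax, &bx, &cx, &dx)) +	 *		printf("v%u, %u counters, %u bits\n", ax & 0xff, +	 *		       (ax >> 8) & 0xff, (ax >> 16) & 0xff); +	 * +	 * EAX[7:0] is the perfmon version, EAX[15:8] the generic counter +	 * count, EAX[23:16] the counter width and EAX[31:24] the length of +	 * the EBX event-availability vector - the check below insists that +	 * vector is long enough to describe the branch-misses event.)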
+	 */ +	cpuid(10, &eax.full, &ebx, &unused, &edx.full); +	if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) +		return -ENODEV; + +	version = eax.split.version_id; +	if (version < 2) +		return -ENODEV; + +	x86_pmu				= intel_pmu; +	x86_pmu.version			= version; +	x86_pmu.num_counters		= eax.split.num_counters; +	x86_pmu.counter_bits		= eax.split.bit_width; +	x86_pmu.counter_mask		= (1ULL << eax.split.bit_width) - 1; + +	/* +	 * Quirk: v2 perfmon does not report fixed-purpose counters, so +	 * assume at least 3 counters: +	 */ +	x86_pmu.num_counters_fixed	= max((int)edx.split.num_counters_fixed, 3); + +	rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + +	/* +	 * Install the hw-cache-events table: +	 */ +	switch (boot_cpu_data.x86_model) { +	case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ +	case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ +	case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ +	case 29: /* six-core 45 nm xeon "Dunnington" */ +		memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, +		       sizeof(hw_cache_event_ids)); + +		pr_cont("Core2 events, "); +		break; +	default: +	case 26: +		memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, +		       sizeof(hw_cache_event_ids)); + +		pr_cont("Nehalem/Corei7 events, "); +		break; +	case 28: +		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, +		       sizeof(hw_cache_event_ids)); + +		pr_cont("Atom events, "); +		break; +	} +	return 0; +} + +static int amd_pmu_init(void) +{ +	x86_pmu = amd_pmu; + +	switch (boot_cpu_data.x86) { +	case 0x0f: +	case 0x10: +	case 0x11: +		memcpy(hw_cache_event_ids, amd_0f_hw_cache_event_ids, +		       sizeof(hw_cache_event_ids)); + +		pr_cont("AMD Family 0f/10/11 events, "); +		break; +	} +	return 0; +} + +void __init init_hw_perf_counters(void) +{ +	int err; + +	pr_info("Performance Counters: "); + +	switch (boot_cpu_data.x86_vendor) { +	case X86_VENDOR_INTEL: +		err = intel_pmu_init(); +		break; +	case X86_VENDOR_AMD: +		err = amd_pmu_init(); +		break; +	default: +		return; +	} +	if (err != 0) { +		pr_cont("no PMU driver, software counters only.\n"); +		return; +	} + +	pr_cont("%s PMU driver.\n", x86_pmu.name); + +	if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { +		x86_pmu.num_counters = X86_PMC_MAX_GENERIC; +		WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", +		     x86_pmu.num_counters, X86_PMC_MAX_GENERIC); +	} +	perf_counter_mask = (1 << x86_pmu.num_counters) - 1; +	perf_max_counters = x86_pmu.num_counters; + +	if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { +		x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; +		WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", +		     x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); +	} + +	perf_counter_mask |= +		((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; + +	perf_counters_lapic_init(); +	register_die_notifier(&perf_counter_nmi_notifier); + +	pr_info("... version:                 %d\n",     x86_pmu.version); +	pr_info("... bit width:               %d\n",     x86_pmu.counter_bits); +	pr_info("... generic counters:        %d\n",     x86_pmu.num_counters); +	pr_info("... value mask:              %016Lx\n", x86_pmu.counter_mask); +	pr_info("... max period:              %016Lx\n", x86_pmu.max_period); +	pr_info("... fixed-purpose counters:  %d\n",     x86_pmu.num_counters_fixed); +	pr_info("... 
counter mask:            %016Lx\n", perf_counter_mask); +} + +static inline void x86_pmu_read(struct perf_counter *counter) +{ +	x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); +} + +static const struct pmu pmu = { +	.enable		= x86_pmu_enable, +	.disable	= x86_pmu_disable, +	.read		= x86_pmu_read, +	.unthrottle	= x86_pmu_unthrottle, +}; + +const struct pmu *hw_perf_counter_init(struct perf_counter *counter) +{ +	int err; + +	err = __hw_perf_counter_init(counter); +	if (err) +		return ERR_PTR(err); + +	return &pmu; +} + +/* + * callchain support + */ + +static inline +void callchain_store(struct perf_callchain_entry *entry, unsigned long ip) +{ +	if (entry->nr < MAX_STACK_DEPTH) +		entry->ip[entry->nr++] = ip; +} + +static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); + + +static void +backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ +	/* Ignore warnings */ +} + +static void backtrace_warning(void *data, char *msg) +{ +	/* Ignore warnings */ +} + +static int backtrace_stack(void *data, char *name) +{ +	/* Don't bother with IRQ stacks for now */ +	return -1; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ +	struct perf_callchain_entry *entry = data; + +	if (reliable) +		callchain_store(entry, addr); +} + +static const struct stacktrace_ops backtrace_ops = { +	.warning		= backtrace_warning, +	.warning_symbol		= backtrace_warning_symbol, +	.stack			= backtrace_stack, +	.address		= backtrace_address, +}; + +static void +perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ +	unsigned long bp; +	char *stack; +	int nr = entry->nr; + +	callchain_store(entry, instruction_pointer(regs)); + +	stack = ((char *)regs + sizeof(struct pt_regs)); +#ifdef CONFIG_FRAME_POINTER +	bp = frame_pointer(regs); +#else +	bp = 0; +#endif + +	dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); + +	entry->kernel = entry->nr - nr; +} + + +struct stack_frame { +	const void __user	*next_fp; +	unsigned long		return_address; +}; + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ +	int ret; + +	if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) +		return 0; + +	ret = 1; +	pagefault_disable(); +	if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) +		ret = 0; +	pagefault_enable(); + +	return ret; +} + +static void +perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ +	struct stack_frame frame; +	const void __user *fp; +	int nr = entry->nr; + +	regs = (struct pt_regs *)current->thread.sp0 - 1; +	fp   = (void __user *)regs->bp; + +	callchain_store(entry, regs->ip); + +	while (entry->nr < MAX_STACK_DEPTH) { +		frame.next_fp	     = NULL; +		frame.return_address = 0; + +		if (!copy_stack_frame(fp, &frame)) +			break; + +		if ((unsigned long)fp < user_stack_pointer(regs)) +			break; + +		callchain_store(entry, frame.return_address); +		fp = frame.next_fp; +	} + +	entry->user = entry->nr - nr; +} + +static void +perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ +	int is_user; + +	if (!regs) +		return; + +	is_user = user_mode(regs); + +	if (!current || current->pid == 0) +		return; + +	if (is_user && current->state != TASK_RUNNING) +		return; + +	if (!is_user) +		perf_callchain_kernel(regs, entry); + +	if (current->mm) +		perf_callchain_user(regs, entry); +} + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ +	struct 
perf_callchain_entry *entry; + +	if (in_nmi()) +		entry = &__get_cpu_var(nmi_entry); +	else +		entry = &__get_cpu_var(irq_entry); + +	entry->nr = 0; +	entry->hv = 0; +	entry->kernel = 0; +	entry->user = 0; + +	perf_do_callchain(regs, entry); + +	return entry; +} diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index f6c70a164e3..d6f5b9fbde3 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -19,8 +19,8 @@  #include <linux/nmi.h>  #include <linux/kprobes.h> -#include <asm/genapic.h> -#include <asm/intel_arch_perfmon.h> +#include <asm/apic.h> +#include <asm/perf_counter.h>  struct nmi_watchdog_ctlblk {  	unsigned int cccr_msr; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1c17d7c751a..a4742a340d8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1012,6 +1012,11 @@ apicinterrupt ERROR_APIC_VECTOR \  apicinterrupt SPURIOUS_APIC_VECTOR \  	spurious_interrupt smp_spurious_interrupt +#ifdef CONFIG_PERF_COUNTERS +apicinterrupt LOCAL_PENDING_VECTOR \ +	perf_pending_interrupt smp_perf_pending_interrupt +#endif +  /*   * Exception entry points.   */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 9a391bbb8ba..38287b5f116 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -62,6 +62,14 @@ static int show_other_interrupts(struct seq_file *p, int prec)  	for_each_online_cpu(j)  		seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count);  	seq_printf(p, "  Spurious interrupts\n"); +	seq_printf(p, "%*s: ", prec, "CNT"); +	for_each_online_cpu(j) +		seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); +	seq_printf(p, "  Performance counter interrupts\n"); +	seq_printf(p, "%*s: ", prec, "PND"); +	for_each_online_cpu(j) +		seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); +	seq_printf(p, "  Performance pending work\n");  #endif  	if (generic_interrupt_extension) {  		seq_printf(p, "%*s: ", prec, "PLT"); @@ -165,6 +173,8 @@ u64 arch_irq_stat_cpu(unsigned int cpu)  #ifdef CONFIG_X86_LOCAL_APIC  	sum += irq_stats(cpu)->apic_timer_irqs;  	sum += irq_stats(cpu)->irq_spurious_count; +	sum += irq_stats(cpu)->apic_perf_irqs; +	sum += irq_stats(cpu)->apic_pending_irqs;  #endif  	if (generic_interrupt_extension)  		sum += irq_stats(cpu)->generic_irqs; diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 2e08b10ad51..267c6624c77 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -181,10 +181,15 @@ static void __init apic_intr_init(void)  {  	smp_intr_init(); -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_THERMAL_VECTOR  	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +#endif +#ifdef CONFIG_X86_THRESHOLD  	alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);  #endif +#if defined(CONFIG_X86_NEW_MCE) && defined(CONFIG_X86_LOCAL_APIC) +	alloc_intr_gate(MCE_SELF_VECTOR, mce_self_interrupt); +#endif  #if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)  	/* self generated IPI for local APIC timer */ @@ -199,18 +204,10 @@ static void __init apic_intr_init(void)  	/* Performance monitoring interrupts: */  # ifdef CONFIG_PERF_COUNTERS -	alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);  	alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);  # endif  #endif - -#ifdef CONFIG_X86_32 -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) -	/* thermal monitor LVT interrupt */ -	alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); -#endif -#endif  
}  /** diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 14425166b8e..0a813b17b17 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,7 +6,6 @@   *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes   *  2000-2002   x86-64 support by Andi Kleen   */ -  #include <linux/sched.h>  #include <linux/mm.h>  #include <linux/smp.h> diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 734f92c02dd..d51321ddafd 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -335,3 +335,4 @@ ENTRY(sys_call_table)  	.long sys_preadv  	.long sys_pwritev  	.long sys_rt_tgsigqueueinfo	/* 335 */ +	.long sys_perf_counter_open diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ede024531f8..07d60c870ce 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -942,8 +942,13 @@ void __init trap_init(void)  #endif  	set_intr_gate(19, &simd_coprocessor_error); +	/* Reserve all the builtin and the syscall vector: */ +	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) +		set_bit(i, used_vectors); +  #ifdef CONFIG_IA32_EMULATION  	set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); +	set_bit(IA32_SYSCALL_VECTOR, used_vectors);  #endif  #ifdef CONFIG_X86_32 @@ -960,14 +965,9 @@ void __init trap_init(void)  	}  	set_system_trap_gate(SYSCALL_VECTOR, &system_call); +	set_bit(SYSCALL_VECTOR, used_vectors);  #endif -	/* Reserve all the builtin and the syscall vector: */ -	for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) -		set_bit(i, used_vectors); - -	set_bit(IA32_SYSCALL_VECTOR, used_vectors); -  	/*  	 * Should be a barrier for any external CPU state:  	 */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 5ec7ae36661..c6acc632637 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -10,6 +10,7 @@  #include <linux/bootmem.h>		/* max_low_pfn			*/  #include <linux/kprobes.h>		/* __kprobes, ...		*/  #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/ +#include <linux/perf_counter.h>		/* perf_swcounter_event		*/  #include <asm/traps.h>			/* dotraplinkage, ...		*/  #include <asm/pgalloc.h>		/* pgd_*(), ...			
*/ @@ -1013,6 +1014,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)  	if (unlikely(error_code & PF_RSVD))  		pgtable_bad(regs, error_code, address); +	perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address); +  	/*  	 * If we're in an interrupt, have no user context or are running  	 * in an atomic region then we must not take the fault: @@ -1106,10 +1109,15 @@ good_area:  		return;  	} -	if (fault & VM_FAULT_MAJOR) +	if (fault & VM_FAULT_MAJOR) {  		tsk->maj_flt++; -	else +		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, +				     regs, address); +	} else {  		tsk->min_flt++; +		perf_swcounter_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, +				     regs, address); +	}  	check_v8086_mode(regs, address, tsk); diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 3b285e656e2..b07dd8d0b32 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self,  	switch (val) {  	case DIE_NMI: -		if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu))) -			ret = NOTIFY_STOP; +	case DIE_NMI_IPI: +		model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)); +		ret = NOTIFY_STOP;  		break;  	default:  		break; @@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy)  static struct notifier_block profile_exceptions_nb = {  	.notifier_call = profile_exceptions_notify,  	.next = NULL, -	.priority = 0 +	.priority = 2  };  static int nmi_setup(void) diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 10131fbdaad..4da7230b3d1 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -18,7 +18,7 @@  #include <asm/msr.h>  #include <asm/apic.h>  #include <asm/nmi.h> -#include <asm/intel_arch_perfmon.h> +#include <asm/perf_counter.h>  #include "op_x86_model.h"  #include "op_counter.h" @@ -136,6 +136,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs,  	u64 val;  	int i; +	/* +	 * This can happen if perf counters are in use when +	 * we steal the die notifier NMI. 
+	 */ +	if (unlikely(!reset_value)) +		goto out; +  	for (i = 0 ; i < num_counters; ++i) {  		if (!reset_value[i])  			continue; @@ -146,6 +153,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs,  		}  	} +out:  	/* Only P6 based Pentium M need to re-unmask the apic vector but it  	 * doesn't hurt other P6 variant */  	apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 1241f118ab5..58bc00f68b1 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -338,6 +338,8 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)  		}  	} +	current->mm->context.vdso = (void *)addr; +  	if (compat_uses_vma || !compat) {  		/*  		 * MAYWRITE to allow gdb to COW and set breakpoints @@ -358,11 +360,13 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)  			goto up_fail;  	} -	current->mm->context.vdso = (void *)addr;  	current_thread_info()->sysenter_return =  		VDSO32_SYMBOL(addr, SYSENTER_RETURN);    up_fail: +	if (ret) +		current->mm->context.vdso = NULL; +  	up_write(&mm->mmap_sem);  	return ret; diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index cac083386e0..21e1aeb9f3e 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -116,15 +116,18 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)  		goto up_fail;  	} +	current->mm->context.vdso = (void *)addr; +  	ret = install_special_mapping(mm, addr, vdso_size,  				      VM_READ|VM_EXEC|  				      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC|  				      VM_ALWAYSDUMP,  				      vdso_pages); -	if (ret) +	if (ret) { +		current->mm->context.vdso = NULL;  		goto up_fail; +	} -	current->mm->context.vdso = (void *)addr;  up_fail:  	up_write(&mm->mmap_sem);  	return ret; diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index d6a807f4077..39a05b5fa9c 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -25,6 +25,7 @@  #include <linux/kbd_kern.h>  #include <linux/proc_fs.h>  #include <linux/quotaops.h> +#include <linux/perf_counter.h>  #include <linux/kernel.h>  #include <linux/module.h>  #include <linux/suspend.h> @@ -243,6 +244,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty)  	struct pt_regs *regs = get_irq_regs();  	if (regs)  		show_regs(regs); +	perf_counter_print_debug();  }  static struct sysrq_key_op sysrq_showregs_op = {  	.handler	= sysrq_handle_showregs, diff --git a/fs/exec.c b/fs/exec.c index a7fcd975c6b..e639957d7a5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -33,6 +33,7 @@  #include <linux/string.h>  #include <linux/init.h>  #include <linux/pagemap.h> +#include <linux/perf_counter.h>  #include <linux/highmem.h>  #include <linux/spinlock.h>  #include <linux/key.h> @@ -922,6 +923,7 @@ void set_task_comm(struct task_struct *tsk, char *buf)  	task_lock(tsk);  	strlcpy(tsk->comm, buf, sizeof(tsk->comm));  	task_unlock(tsk); +	perf_counter_comm(tsk);  }  int flush_old_exec(struct linux_binprm * bprm) @@ -990,6 +992,13 @@ int flush_old_exec(struct linux_binprm * bprm)  	current->personality &= ~bprm->per_clear; +	/* +	 * Flush performance counters when crossing a +	 * security domain: +	 */ +	if (!get_dumpable(current->mm)) +		perf_counter_exit_task(current); +  	/* An exec changes our domain. 
We are no longer part of the thread  	   group */ diff --git a/include/asm-generic/atomic.h b/include/asm-generic/atomic.h index 3673a13b670..81d3be459ef 100644 --- a/include/asm-generic/atomic.h +++ b/include/asm-generic/atomic.h @@ -134,7 +134,7 @@ static inline long atomic_long_add_unless(atomic_long_t *l, long a, long u)  #define atomic_long_cmpxchg(l, old, new) \  	(atomic64_cmpxchg((atomic64_t *)(l), (old), (new)))  #define atomic_long_xchg(v, new) \ -	(atomic64_xchg((atomic64_t *)(l), (new))) +	(atomic64_xchg((atomic64_t *)(v), (new)))  #else  /*  BITS_PER_LONG == 64  */ diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 6646bfc7b89..28b1f30601b 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -108,6 +108,15 @@ extern struct group_info init_groups;  extern struct cred init_cred; +#ifdef CONFIG_PERF_COUNTERS +# define INIT_PERF_COUNTERS(tsk)					\ +	.perf_counter_mutex = 						\ +		 __MUTEX_INITIALIZER(tsk.perf_counter_mutex),		\ +	.perf_counter_list = LIST_HEAD_INIT(tsk.perf_counter_list), +#else +# define INIT_PERF_COUNTERS(tsk) +#endif +  /*   *  INIT_TASK is used to set up the first task table, touch at   * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -171,6 +180,7 @@ extern struct cred init_cred;  	},								\  	.dirties = INIT_PROP_LOCAL_SINGLE(dirties),			\  	INIT_IDS							\ +	INIT_PERF_COUNTERS(tsk)						\  	INIT_TRACE_IRQFLAGS						\  	INIT_LOCKDEP							\  	INIT_FTRACE_GRAPH						\ diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 0c8b89f28a9..a77c6007dc9 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -81,7 +81,12 @@ static inline unsigned int kstat_irqs(unsigned int irq)  	return sum;  } + +/* + * Lock/unlock the current runqueue - to extract task statistics: + */  extern unsigned long long task_delta_exec(struct task_struct *); +  extern void account_user_time(struct task_struct *, cputime_t, cputime_t);  extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t);  extern void account_steal_time(cputime_t); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h new file mode 100644 index 00000000000..6e133954e2e --- /dev/null +++ b/include/linux/perf_counter.h @@ -0,0 +1,697 @@ +/* + *  Performance counters: + * + *    Copyright (C) 2008-2009, Thomas Gleixner <tglx@linutronix.de> + *    Copyright (C) 2008-2009, Red Hat, Inc., Ingo Molnar + *    Copyright (C) 2008-2009, Red Hat, Inc., Peter Zijlstra + * + *  Data type definitions, declarations, prototypes. 
+ * + *    Started by: Thomas Gleixner and Ingo Molnar + * + *  For licencing details see kernel-base/COPYING + */ +#ifndef _LINUX_PERF_COUNTER_H +#define _LINUX_PERF_COUNTER_H + +#include <linux/types.h> +#include <linux/ioctl.h> +#include <asm/byteorder.h> + +/* + * User-space ABI bits: + */ + +/* + * attr.type + */ +enum perf_type_id { +	PERF_TYPE_HARDWARE			= 0, +	PERF_TYPE_SOFTWARE			= 1, +	PERF_TYPE_TRACEPOINT			= 2, +	PERF_TYPE_HW_CACHE			= 3, +	PERF_TYPE_RAW				= 4, + +	PERF_TYPE_MAX,				/* non-ABI */ +}; + +/* + * Generalized performance counter event types, used by the + * attr.event_id parameter of the sys_perf_counter_open() + * syscall: + */ +enum perf_hw_id { +	/* +	 * Common hardware events, generalized by the kernel: +	 */ +	PERF_COUNT_HW_CPU_CYCLES		= 0, +	PERF_COUNT_HW_INSTRUCTIONS		= 1, +	PERF_COUNT_HW_CACHE_REFERENCES		= 2, +	PERF_COUNT_HW_CACHE_MISSES		= 3, +	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4, +	PERF_COUNT_HW_BRANCH_MISSES		= 5, +	PERF_COUNT_HW_BUS_CYCLES		= 6, + +	PERF_COUNT_HW_MAX,			/* non-ABI */ +}; + +/* + * Generalized hardware cache counters: + * + *       { L1-D, L1-I, LLC, ITLB, DTLB, BPU } x + *       { read, write, prefetch } x + *       { accesses, misses } + */ +enum perf_hw_cache_id { +	PERF_COUNT_HW_CACHE_L1D			= 0, +	PERF_COUNT_HW_CACHE_L1I			= 1, +	PERF_COUNT_HW_CACHE_LL			= 2, +	PERF_COUNT_HW_CACHE_DTLB		= 3, +	PERF_COUNT_HW_CACHE_ITLB		= 4, +	PERF_COUNT_HW_CACHE_BPU			= 5, + +	PERF_COUNT_HW_CACHE_MAX,		/* non-ABI */ +}; + +enum perf_hw_cache_op_id { +	PERF_COUNT_HW_CACHE_OP_READ		= 0, +	PERF_COUNT_HW_CACHE_OP_WRITE		= 1, +	PERF_COUNT_HW_CACHE_OP_PREFETCH		= 2, + +	PERF_COUNT_HW_CACHE_OP_MAX,		/* non-ABI */ +}; + +enum perf_hw_cache_op_result_id { +	PERF_COUNT_HW_CACHE_RESULT_ACCESS	= 0, +	PERF_COUNT_HW_CACHE_RESULT_MISS		= 1, + +	PERF_COUNT_HW_CACHE_RESULT_MAX,		/* non-ABI */ +}; + +/* + * Special "software" counters provided by the kernel, even if the hardware + * does not support performance counters. These counters measure various + * physical and sw events of the kernel (and allow the profiling of them as + * well): + */ +enum perf_sw_ids { +	PERF_COUNT_SW_CPU_CLOCK			= 0, +	PERF_COUNT_SW_TASK_CLOCK		= 1, +	PERF_COUNT_SW_PAGE_FAULTS		= 2, +	PERF_COUNT_SW_CONTEXT_SWITCHES		= 3, +	PERF_COUNT_SW_CPU_MIGRATIONS		= 4, +	PERF_COUNT_SW_PAGE_FAULTS_MIN		= 5, +	PERF_COUNT_SW_PAGE_FAULTS_MAJ		= 6, + +	PERF_COUNT_SW_MAX,			/* non-ABI */ +}; + +/* + * Bits that can be set in attr.sample_type to request information + * in the overflow packets. + */ +enum perf_counter_sample_format { +	PERF_SAMPLE_IP				= 1U << 0, +	PERF_SAMPLE_TID				= 1U << 1, +	PERF_SAMPLE_TIME			= 1U << 2, +	PERF_SAMPLE_ADDR			= 1U << 3, +	PERF_SAMPLE_GROUP			= 1U << 4, +	PERF_SAMPLE_CALLCHAIN			= 1U << 5, +	PERF_SAMPLE_ID				= 1U << 6, +	PERF_SAMPLE_CPU				= 1U << 7, +	PERF_SAMPLE_PERIOD			= 1U << 8, +}; + +/* + * Bits that can be set in attr.read_format to request that + * reads on the counter should return the indicated quantities, + * in increasing order of bit value, after the counter value. + */ +enum perf_counter_read_format { +	PERF_FORMAT_TOTAL_TIME_ENABLED		= 1U << 0, +	PERF_FORMAT_TOTAL_TIME_RUNNING		= 1U << 1, +	PERF_FORMAT_ID				= 1U << 2, +}; + +/* + * Hardware event to monitor via a performance monitoring counter: + */ +struct perf_counter_attr { +	/* +	 * Major type: hardware/software/tracepoint/etc. +	 */ +	__u32			type; +	__u32			__reserved_1; + +	/* +	 * Type specific configuration information. 
+	 */ +	__u64			config; + +	union { +		__u64		sample_period; +		__u64		sample_freq; +	}; + +	__u64			sample_type; +	__u64			read_format; + +	__u64			disabled       :  1, /* off by default        */ +				inherit	       :  1, /* children inherit it   */ +				pinned	       :  1, /* must always be on PMU */ +				exclusive      :  1, /* only group on PMU     */ +				exclude_user   :  1, /* don't count user      */ +				exclude_kernel :  1, /* ditto kernel          */ +				exclude_hv     :  1, /* ditto hypervisor      */ +				exclude_idle   :  1, /* don't count when idle */ +				mmap           :  1, /* include mmap data     */ +				comm	       :  1, /* include comm data     */ +				freq           :  1, /* use freq, not period  */ + +				__reserved_2   : 53; + +	__u32			wakeup_events;	/* wakeup every n events */ +	__u32			__reserved_3; + +	__u64			__reserved_4; +}; + +/* + * Ioctls that can be done on a perf counter fd: + */ +#define PERF_COUNTER_IOC_ENABLE		_IO ('$', 0) +#define PERF_COUNTER_IOC_DISABLE	_IO ('$', 1) +#define PERF_COUNTER_IOC_REFRESH	_IO ('$', 2) +#define PERF_COUNTER_IOC_RESET		_IO ('$', 3) +#define PERF_COUNTER_IOC_PERIOD		_IOW('$', 4, u64) + +enum perf_counter_ioc_flags { +	PERF_IOC_FLAG_GROUP		= 1U << 0, +}; + +/* + * Structure of the page that can be mapped via mmap + */ +struct perf_counter_mmap_page { +	__u32	version;		/* version number of this structure */ +	__u32	compat_version;		/* lowest version this is compat with */ + +	/* +	 * Bits needed to read the hw counters in user-space. +	 * +	 *   u32 seq; +	 *   s64 count; +	 * +	 *   do { +	 *     seq = pc->lock; +	 * +	 *     barrier() +	 *     if (pc->index) { +	 *       count = pmc_read(pc->index - 1); +	 *       count += pc->offset; +	 *     } else +	 *       goto regular_read; +	 * +	 *     barrier(); +	 *   } while (pc->lock != seq); +	 * +	 * NOTE: for obvious reason this only works on self-monitoring +	 *       processes. +	 */ +	__u32	lock;			/* seqlock for synchronization */ +	__u32	index;			/* hardware counter identifier */ +	__s64	offset;			/* add to hardware counter value */ + +	/* +	 * Control data for the mmap() data buffer. +	 * +	 * User-space reading this value should issue an rmb(), on SMP capable +	 * platforms, after reading this value -- see perf_counter_wakeup(). +	 */ +	__u64   data_head;		/* head in the data section */ +}; + +#define PERF_EVENT_MISC_CPUMODE_MASK		(3 << 0) +#define PERF_EVENT_MISC_CPUMODE_UNKNOWN		(0 << 0) +#define PERF_EVENT_MISC_KERNEL			(1 << 0) +#define PERF_EVENT_MISC_USER			(2 << 0) +#define PERF_EVENT_MISC_HYPERVISOR		(3 << 0) +#define PERF_EVENT_MISC_OVERFLOW		(1 << 2) + +struct perf_event_header { +	__u32	type; +	__u16	misc; +	__u16	size; +}; + +enum perf_event_type { + +	/* +	 * The MMAP events record the PROT_EXEC mappings so that we can +	 * correlate userspace IPs to code. 
They have the following structure: +	 * +	 * struct { +	 *	struct perf_event_header	header; +	 * +	 *	u32				pid, tid; +	 *	u64				addr; +	 *	u64				len; +	 *	u64				pgoff; +	 *	char				filename[]; +	 * }; +	 */ +	PERF_EVENT_MMAP			= 1, + +	/* +	 * struct { +	 *	struct perf_event_header	header; +	 * +	 *	u32				pid, tid; +	 *	char				comm[]; +	 * }; +	 */ +	PERF_EVENT_COMM			= 3, + +	/* +	 * struct { +	 *	struct perf_event_header	header; +	 *	u64				time; +	 *	u64				id; +	 *	u64				sample_period; +	 * }; +	 */ +	PERF_EVENT_PERIOD		= 4, + +	/* +	 * struct { +	 *	struct perf_event_header	header; +	 *	u64				time; +	 *	u64				id; +	 * }; +	 */ +	PERF_EVENT_THROTTLE		= 5, +	PERF_EVENT_UNTHROTTLE		= 6, + +	/* +	 * struct { +	 *	struct perf_event_header	header; +	 *	u32				pid, ppid; +	 * }; +	 */ +	PERF_EVENT_FORK			= 7, + +	/* +	 * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field +	 * will be PERF_RECORD_* +	 * +	 * struct { +	 *	struct perf_event_header	header; +	 * +	 *	{ u64			ip;	  } && PERF_RECORD_IP +	 *	{ u32			pid, tid; } && PERF_RECORD_TID +	 *	{ u64			time;     } && PERF_RECORD_TIME +	 *	{ u64			addr;     } && PERF_RECORD_ADDR +	 *	{ u64			config;   } && PERF_RECORD_CONFIG +	 *	{ u32			cpu, res; } && PERF_RECORD_CPU +	 * +	 *	{ u64			nr; +	 *	  { u64 id, val; }	cnt[nr];  } && PERF_RECORD_GROUP +	 * +	 *	{ u16			nr, +	 *				hv, +	 *				kernel, +	 *				user; +	 *	  u64			ips[nr];  } && PERF_RECORD_CALLCHAIN +	 * }; +	 */ +}; + +#ifdef __KERNEL__ +/* + * Kernel-internal data types and definitions: + */ + +#ifdef CONFIG_PERF_COUNTERS +# include <asm/perf_counter.h> +#endif + +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/rculist.h> +#include <linux/rcupdate.h> +#include <linux/spinlock.h> +#include <linux/hrtimer.h> +#include <linux/fs.h> +#include <linux/pid_namespace.h> +#include <asm/atomic.h> + +struct task_struct; + +/** + * struct hw_perf_counter - performance counter hardware details: + */ +struct hw_perf_counter { +#ifdef CONFIG_PERF_COUNTERS +	union { +		struct { /* hardware */ +			u64		config; +			unsigned long	config_base; +			unsigned long	counter_base; +			int		idx; +		}; +		union { /* software */ +			atomic64_t	count; +			struct hrtimer	hrtimer; +		}; +	}; +	atomic64_t			prev_count; +	u64				sample_period; +	u64				last_period; +	atomic64_t			period_left; +	u64				interrupts; + +	u64				freq_count; +	u64				freq_interrupts; +	u64				freq_stamp; +#endif +}; + +struct perf_counter; + +/** + * struct pmu - generic performance monitoring unit + */ +struct pmu { +	int (*enable)			(struct perf_counter *counter); +	void (*disable)			(struct perf_counter *counter); +	void (*read)			(struct perf_counter *counter); +	void (*unthrottle)		(struct perf_counter *counter); +}; + +/** + * enum perf_counter_active_state - the states of a counter + */ +enum perf_counter_active_state { +	PERF_COUNTER_STATE_ERROR	= -2, +	PERF_COUNTER_STATE_OFF		= -1, +	PERF_COUNTER_STATE_INACTIVE	=  0, +	PERF_COUNTER_STATE_ACTIVE	=  1, +}; + +struct file; + +struct perf_mmap_data { +	struct rcu_head			rcu_head; +	int				nr_pages;	/* nr of data pages  */ +	int				nr_locked;	/* nr pages mlocked  */ + +	atomic_t			poll;		/* POLL_ for wakeups */ +	atomic_t			events;		/* event limit       */ + +	atomic_long_t			head;		/* write position    */ +	atomic_long_t			done_head;	/* completed head    */ + +	atomic_t			lock;		/* concurrent writes */ + +	atomic_t			wakeup;		/* needs a wakeup    */ + +	struct perf_counter_mmap_page   *user_page; +	void				*data_pages[0]; +}; + +struct 
perf_pending_entry { +	struct perf_pending_entry *next; +	void (*func)(struct perf_pending_entry *); +}; + +/** + * struct perf_counter - performance counter kernel representation: + */ +struct perf_counter { +#ifdef CONFIG_PERF_COUNTERS +	struct list_head		list_entry; +	struct list_head		event_entry; +	struct list_head		sibling_list; +	int				nr_siblings; +	struct perf_counter		*group_leader; +	const struct pmu		*pmu; + +	enum perf_counter_active_state	state; +	atomic64_t			count; + +	/* +	 * These are the total time in nanoseconds that the counter +	 * has been enabled (i.e. eligible to run, and the task has +	 * been scheduled in, if this is a per-task counter) +	 * and running (scheduled onto the CPU), respectively. +	 * +	 * They are computed from tstamp_enabled, tstamp_running and +	 * tstamp_stopped when the counter is in INACTIVE or ACTIVE state. +	 */ +	u64				total_time_enabled; +	u64				total_time_running; + +	/* +	 * These are timestamps used for computing total_time_enabled +	 * and total_time_running when the counter is in INACTIVE or +	 * ACTIVE state, measured in nanoseconds from an arbitrary point +	 * in time. +	 * tstamp_enabled: the notional time when the counter was enabled +	 * tstamp_running: the notional time when the counter was scheduled on +	 * tstamp_stopped: in INACTIVE state, the notional time when the +	 *	counter was scheduled off. +	 */ +	u64				tstamp_enabled; +	u64				tstamp_running; +	u64				tstamp_stopped; + +	struct perf_counter_attr	attr; +	struct hw_perf_counter		hw; + +	struct perf_counter_context	*ctx; +	struct file			*filp; + +	/* +	 * These accumulate total time (in nanoseconds) that children +	 * counters have been enabled and running, respectively. +	 */ +	atomic64_t			child_total_time_enabled; +	atomic64_t			child_total_time_running; + +	/* +	 * Protect attach/detach and child_list: +	 */ +	struct mutex			child_mutex; +	struct list_head		child_list; +	struct perf_counter		*parent; + +	int				oncpu; +	int				cpu; + +	struct list_head		owner_entry; +	struct task_struct		*owner; + +	/* mmap bits */ +	struct mutex			mmap_mutex; +	atomic_t			mmap_count; +	struct perf_mmap_data		*data; + +	/* poll related */ +	wait_queue_head_t		waitq; +	struct fasync_struct		*fasync; + +	/* delayed work for NMIs and such */ +	int				pending_wakeup; +	int				pending_kill; +	int				pending_disable; +	struct perf_pending_entry	pending; + +	atomic_t			event_limit; + +	void (*destroy)(struct perf_counter *); +	struct rcu_head			rcu_head; + +	struct pid_namespace		*ns; +	u64				id; +#endif +}; + +/** + * struct perf_counter_context - counter context structure + * + * Used as a container for task counters and CPU counters as well: + */ +struct perf_counter_context { +	/* +	 * Protect the states of the counters in the list, +	 * nr_active, and the list: +	 */ +	spinlock_t			lock; +	/* +	 * Protect the list of counters.  Locking either mutex or lock +	 * is sufficient to ensure the list doesn't change; to change +	 * the list you need to lock both the mutex and the spinlock. +	 */ +	struct mutex			mutex; + +	struct list_head		counter_list; +	struct list_head		event_list; +	int				nr_counters; +	int				nr_active; +	int				is_active; +	atomic_t			refcount; +	struct task_struct		*task; + +	/* +	 * Context clock, runs when context enabled. +	 */ +	u64				time; +	u64				timestamp; + +	/* +	 * These fields let us detect when two contexts have both +	 * been cloned (inherited) from a common ancestor. 
+	 */ +	struct perf_counter_context	*parent_ctx; +	u64				parent_gen; +	u64				generation; +	int				pin_count; +	struct rcu_head			rcu_head; +}; + +/** + * struct perf_counter_cpu_context - per cpu counter context structure + */ +struct perf_cpu_context { +	struct perf_counter_context	ctx; +	struct perf_counter_context	*task_ctx; +	int				active_oncpu; +	int				max_pertask; +	int				exclusive; + +	/* +	 * Recursion avoidance: +	 * +	 * task, softirq, irq, nmi context +	 */ +	int				recursion[4]; +}; + +#ifdef CONFIG_PERF_COUNTERS + +/* + * Set by architecture code: + */ +extern int perf_max_counters; + +extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter); + +extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); +extern void perf_counter_task_sched_out(struct task_struct *task, +					struct task_struct *next, int cpu); +extern void perf_counter_task_tick(struct task_struct *task, int cpu); +extern int perf_counter_init_task(struct task_struct *child); +extern void perf_counter_exit_task(struct task_struct *child); +extern void perf_counter_free_task(struct task_struct *task); +extern void perf_counter_do_pending(void); +extern void perf_counter_print_debug(void); +extern void __perf_disable(void); +extern bool __perf_enable(void); +extern void perf_disable(void); +extern void perf_enable(void); +extern int perf_counter_task_disable(void); +extern int perf_counter_task_enable(void); +extern int hw_perf_group_sched_in(struct perf_counter *group_leader, +	       struct perf_cpu_context *cpuctx, +	       struct perf_counter_context *ctx, int cpu); +extern void perf_counter_update_userpage(struct perf_counter *counter); + +struct perf_sample_data { +	struct pt_regs			*regs; +	u64				addr; +	u64				period; +}; + +extern int perf_counter_overflow(struct perf_counter *counter, int nmi, +				 struct perf_sample_data *data); + +/* + * Return 1 for a software counter, 0 for a hardware counter + */ +static inline int is_software_counter(struct perf_counter *counter) +{ +	return (counter->attr.type != PERF_TYPE_RAW) && +		(counter->attr.type != PERF_TYPE_HARDWARE); +} + +extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); + +extern void __perf_counter_mmap(struct vm_area_struct *vma); + +static inline void perf_counter_mmap(struct vm_area_struct *vma) +{ +	if (vma->vm_flags & VM_EXEC) +		__perf_counter_mmap(vma); +} + +extern void perf_counter_comm(struct task_struct *tsk); +extern void perf_counter_fork(struct task_struct *tsk); + +extern void perf_counter_task_migration(struct task_struct *task, int cpu); + +#define MAX_STACK_DEPTH			255 + +struct perf_callchain_entry { +	u16				nr; +	u16				hv; +	u16				kernel; +	u16				user; +	u64				ip[MAX_STACK_DEPTH]; +}; + +extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); + +extern int sysctl_perf_counter_paranoid; +extern int sysctl_perf_counter_mlock; +extern int sysctl_perf_counter_sample_rate; + +extern void perf_counter_init(void); + +#ifndef perf_misc_flags +#define perf_misc_flags(regs)	(user_mode(regs) ? 
PERF_EVENT_MISC_USER : \ +				 PERF_EVENT_MISC_KERNEL) +#define perf_instruction_pointer(regs)	instruction_pointer(regs) +#endif + +#else +static inline void +perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ } +static inline void +perf_counter_task_sched_out(struct task_struct *task, +			    struct task_struct *next, int cpu)		{ } +static inline void +perf_counter_task_tick(struct task_struct *task, int cpu)		{ } +static inline int perf_counter_init_task(struct task_struct *child)	{ return 0; } +static inline void perf_counter_exit_task(struct task_struct *child)	{ } +static inline void perf_counter_free_task(struct task_struct *task)	{ } +static inline void perf_counter_do_pending(void)			{ } +static inline void perf_counter_print_debug(void)			{ } +static inline void perf_disable(void)					{ } +static inline void perf_enable(void)					{ } +static inline int perf_counter_task_disable(void)	{ return -EINVAL; } +static inline int perf_counter_task_enable(void)	{ return -EINVAL; } + +static inline void +perf_swcounter_event(u32 event, u64 nr, int nmi, +		     struct pt_regs *regs, u64 addr)			{ } + +static inline void perf_counter_mmap(struct vm_area_struct *vma)	{ } +static inline void perf_counter_comm(struct task_struct *tsk)		{ } +static inline void perf_counter_fork(struct task_struct *tsk)		{ } +static inline void perf_counter_init(void)				{ } +static inline void perf_counter_task_migration(struct task_struct *task, +					       int cpu)			{ } +#endif + +#endif /* __KERNEL__ */ +#endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/prctl.h b/include/linux/prctl.h index 48d887e3c6e..b00df4c79c6 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -85,4 +85,7 @@  #define PR_SET_TIMERSLACK 29  #define PR_GET_TIMERSLACK 30 +#define PR_TASK_PERF_COUNTERS_DISABLE		31 +#define PR_TASK_PERF_COUNTERS_ENABLE		32 +  #endif /* _LINUX_PRCTL_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 42bf2766111..4896fdfec91 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -99,6 +99,7 @@ struct robust_list_head;  struct bio;  struct fs_struct;  struct bts_context; +struct perf_counter_context;  /*   * List of flags we want to share for kernel threads, @@ -139,6 +140,7 @@ extern unsigned long nr_running(void);  extern unsigned long nr_uninterruptible(void);  extern unsigned long nr_iowait(void);  extern void calc_global_load(void); +extern u64 cpu_nr_migrations(int cpu);  extern unsigned long get_parent_ip(unsigned long addr); @@ -674,6 +676,10 @@ struct user_struct {  	struct work_struct work;  #endif  #endif + +#ifdef CONFIG_PERF_COUNTERS +	atomic_long_t locked_vm; +#endif  };  extern int uids_sysfs_init(void); @@ -1073,9 +1079,10 @@ struct sched_entity {  	u64			last_wakeup;  	u64			avg_overlap; +	u64			nr_migrations; +  	u64			start_runtime;  	u64			avg_wakeup; -	u64			nr_migrations;  #ifdef CONFIG_SCHEDSTATS  	u64			wait_start; @@ -1396,6 +1403,11 @@ struct task_struct {  	struct list_head pi_state_list;  	struct futex_pi_state *pi_state_cache;  #endif +#ifdef CONFIG_PERF_COUNTERS +	struct perf_counter_context *perf_counter_ctxp; +	struct mutex perf_counter_mutex; +	struct list_head perf_counter_list; +#endif  #ifdef CONFIG_NUMA  	struct mempolicy *mempolicy;  	short il_next; @@ -2410,6 +2422,13 @@ static inline void inc_syscw(struct task_struct *tsk)  #define TASK_SIZE_OF(tsk)	TASK_SIZE  #endif +/* + * Call the function if the target task is executing on a CPU right now: + */ +extern void task_oncpu_function_call(struct 
task_struct *p, +				     void (*func) (void *info), void *info); + + +  #ifdef CONFIG_MM_OWNER  extern void mm_update_next_owner(struct mm_struct *mm);  extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 30520844b8d..c6c84ad8bd7 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -55,6 +55,7 @@ struct compat_timeval;  struct robust_list_head;  struct getcpu_cache;  struct old_linux_dirent; +struct perf_counter_attr;  #include <linux/types.h>  #include <linux/aio_abi.h> @@ -755,4 +756,8 @@ asmlinkage long sys_pipe(int __user *);  int kernel_execve(const char *filename, char *const argv[], char *const envp[]); + + +asmlinkage long sys_perf_counter_open( +		const struct perf_counter_attr __user *attr_uptr, +		pid_t pid, int cpu, int group_fd, unsigned long flags);  #endif diff --git a/init/Kconfig b/init/Kconfig index d4e9671347e..9b68fee8d79 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -933,6 +933,40 @@ config AIO            by some high performance threaded applications. Disabling            this option saves about 7k. +config HAVE_PERF_COUNTERS +	bool + +menu "Performance Counters" + +config PERF_COUNTERS +	bool "Kernel Performance Counters" +	depends on HAVE_PERF_COUNTERS +	select ANON_INODES +	help +	  Enable kernel support for performance counter hardware. + +	  Performance counters are special hardware registers available +	  on most modern CPUs. These registers count the number of certain +	  types of hw events, such as instructions executed, cache misses +	  suffered, or branches mis-predicted - without slowing down the +	  kernel or applications. These registers can also trigger interrupts +	  when a threshold number of events has passed - and can thus be +	  used to profile the code that runs on that CPU. + +	  The Linux Performance Counter subsystem provides an abstraction of +	  these hardware capabilities, available via a system call. It +	  provides per task and per CPU counters, and it provides event +	  capabilities on top of those. + +	  Say Y if unsure. 
+ +config EVENT_PROFILE +	bool "Tracepoint profile sources" +	depends on PERF_COUNTERS && EVENT_TRACER +	default y + +endmenu +  config VM_EVENT_COUNTERS  	default y  	bool "Enable VM event counters for /proc/vmstat" if EMBEDDED diff --git a/kernel/Makefile b/kernel/Makefile index a35eee3436d..90b53f6dc22 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -96,6 +96,7 @@ obj-$(CONFIG_TRACING) += trace/  obj-$(CONFIG_X86_DS) += trace/  obj-$(CONFIG_SMP) += sched_cpupri.o  obj-$(CONFIG_SLOW_WORK) += slow-work.o +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o  ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)  # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/exit.c b/kernel/exit.c index 51d1fe3fb7a..b6c90b5ef50 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,6 +48,7 @@  #include <linux/tracehook.h>  #include <linux/fs_struct.h>  #include <linux/init_task.h> +#include <linux/perf_counter.h>  #include <trace/events/sched.h>  #include <asm/uaccess.h> @@ -154,6 +155,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp)  {  	struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); +#ifdef CONFIG_PERF_COUNTERS +	WARN_ON_ONCE(tsk->perf_counter_ctxp); +#endif  	trace_sched_process_free(tsk);  	put_task_struct(tsk);  } @@ -170,6 +174,7 @@ repeat:  	atomic_dec(&__task_cred(p)->user->processes);  	proc_flush_task(p); +  	write_lock_irq(&tasklist_lock);  	tracehook_finish_release_task(p);  	__exit_signal(p); @@ -971,16 +976,19 @@ NORET_TYPE void do_exit(long code)  		module_put(tsk->binfmt->module);  	proc_exit_connector(tsk); + +	/* +	 * Flush inherited counters to the parent - before the parent +	 * gets woken up by child-exit notifications. +	 */ +	perf_counter_exit_task(tsk); +  	exit_notify(tsk, group_dead);  #ifdef CONFIG_NUMA  	mpol_put(tsk->mempolicy);  	tsk->mempolicy = NULL;  #endif  #ifdef CONFIG_FUTEX -	/* -	 * This must happen late, after the PID is not -	 * hashed anymore: -	 */  	if (unlikely(!list_empty(&tsk->pi_state_list)))  		exit_pi_state_list(tsk);  	if (unlikely(current->pi_state_cache)) diff --git a/kernel/fork.c b/kernel/fork.c index bb762b4dd21..4430eb1376f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -62,6 +62,7 @@  #include <linux/blkdev.h>  #include <linux/fs_struct.h>  #include <linux/magic.h> +#include <linux/perf_counter.h>  #include <asm/pgtable.h>  #include <asm/pgalloc.h> @@ -1096,6 +1097,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,  	/* Perform scheduler related setup. Assign this task to a CPU. 
*/  	sched_fork(p, clone_flags); +	retval = perf_counter_init_task(p); +	if (retval) +		goto bad_fork_cleanup_policy; +  	if ((retval = audit_alloc(p)))  		goto bad_fork_cleanup_policy;  	/* copy all the process information */ @@ -1290,6 +1295,7 @@ bad_fork_cleanup_semundo:  bad_fork_cleanup_audit:  	audit_free(p);  bad_fork_cleanup_policy: +	perf_counter_free_task(p);  #ifdef CONFIG_NUMA  	mpol_put(p->mempolicy);  bad_fork_cleanup_cgroup: @@ -1403,6 +1409,12 @@ long do_fork(unsigned long clone_flags,  		if (clone_flags & CLONE_VFORK) {  			p->vfork_done = &vfork;  			init_completion(&vfork); +		} else if (!(clone_flags & CLONE_VM)) { +			/* +			 * vfork will do an exec which will call +			 * set_task_comm() +			 */ +			perf_counter_fork(p);  		}  		audit_finish_fork(p); diff --git a/kernel/mutex.c b/kernel/mutex.c index e5cc0cd28d5..947b3ad551f 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count);   *   * This function is similar to (but not equivalent to) down().   */ -void inline __sched mutex_lock(struct mutex *lock) +void __sched mutex_lock(struct mutex *lock)  {  	might_sleep();  	/* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c new file mode 100644 index 00000000000..ef5d8a5b245 --- /dev/null +++ b/kernel/perf_counter.c @@ -0,0 +1,4260 @@ +/* + * Performance counter core code + * + *  Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> + *  Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + *  Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + *  Copyright  ©  2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> + * + *  For licensing details see kernel-base/COPYING + */ + +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <linux/file.h> +#include <linux/poll.h> +#include <linux/sysfs.h> +#include <linux/dcache.h> +#include <linux/percpu.h> +#include <linux/ptrace.h> +#include <linux/vmstat.h> +#include <linux/hardirq.h> +#include <linux/rculist.h> +#include <linux/uaccess.h> +#include <linux/syscalls.h> +#include <linux/anon_inodes.h> +#include <linux/kernel_stat.h> +#include <linux/perf_counter.h> + +#include <asm/irq_regs.h> + +/* + * Each CPU has a list of per CPU counters: + */ +DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); + +int perf_max_counters __read_mostly = 1; +static int perf_reserved_percpu __read_mostly; +static int perf_overcommit __read_mostly = 1; + +static atomic_t nr_counters __read_mostly; +static atomic_t nr_mmap_counters __read_mostly; +static atomic_t nr_comm_counters __read_mostly; + +/* + * perf counter paranoia level: + *  0 - not paranoid + *  1 - disallow cpu counters to unpriv + *  2 - disallow kernel profiling to unpriv + */ +int sysctl_perf_counter_paranoid __read_mostly; + +static inline bool perf_paranoid_cpu(void) +{ +	return sysctl_perf_counter_paranoid > 0; +} + +static inline bool perf_paranoid_kernel(void) +{ +	return sysctl_perf_counter_paranoid > 1; +} + +int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ + +/* + * max perf counter sample rate + */ +int sysctl_perf_counter_sample_rate __read_mostly = 100000; + +static atomic64_t perf_counter_id; + +/* + * Lock for (sysadmin-configurable) counter reservations: + */ +static DEFINE_SPINLOCK(perf_resource_lock); + +/* + * Architecture provided APIs - weak aliases: + */ +extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter) +{ +	return NULL; +} + +void __weak 
hw_perf_disable(void)		{ barrier(); } +void __weak hw_perf_enable(void)		{ barrier(); } + +void __weak hw_perf_counter_setup(int cpu)	{ barrier(); } + +int __weak +hw_perf_group_sched_in(struct perf_counter *group_leader, +	       struct perf_cpu_context *cpuctx, +	       struct perf_counter_context *ctx, int cpu) +{ +	return 0; +} + +void __weak perf_counter_print_debug(void)	{ } + +static DEFINE_PER_CPU(int, disable_count); + +void __perf_disable(void) +{ +	__get_cpu_var(disable_count)++; +} + +bool __perf_enable(void) +{ +	return !--__get_cpu_var(disable_count); +} + +void perf_disable(void) +{ +	__perf_disable(); +	hw_perf_disable(); +} + +void perf_enable(void) +{ +	if (__perf_enable()) +		hw_perf_enable(); +} + +static void get_ctx(struct perf_counter_context *ctx) +{ +	atomic_inc(&ctx->refcount); +} + +static void free_ctx(struct rcu_head *head) +{ +	struct perf_counter_context *ctx; + +	ctx = container_of(head, struct perf_counter_context, rcu_head); +	kfree(ctx); +} + +static void put_ctx(struct perf_counter_context *ctx) +{ +	if (atomic_dec_and_test(&ctx->refcount)) { +		if (ctx->parent_ctx) +			put_ctx(ctx->parent_ctx); +		if (ctx->task) +			put_task_struct(ctx->task); +		call_rcu(&ctx->rcu_head, free_ctx); +	} +} + +/* + * Get the perf_counter_context for a task and lock it. + * This has to cope with the fact that until it is locked, + * the context could get moved to another task. + */ +static struct perf_counter_context * +perf_lock_task_context(struct task_struct *task, unsigned long *flags) +{ +	struct perf_counter_context *ctx; + +	rcu_read_lock(); + retry: +	ctx = rcu_dereference(task->perf_counter_ctxp); +	if (ctx) { +		/* +		 * If this context is a clone of another, it might +		 * get swapped for another underneath us by +		 * perf_counter_task_sched_out, though the +		 * rcu_read_lock() protects us from any context +		 * getting freed.  Lock the context and check if it +		 * got swapped before we could get the lock, and retry +		 * if so.  If we locked the right context, then it +		 * can't get swapped on us any more. +		 */ +		spin_lock_irqsave(&ctx->lock, *flags); +		if (ctx != rcu_dereference(task->perf_counter_ctxp)) { +			spin_unlock_irqrestore(&ctx->lock, *flags); +			goto retry; +		} +	} +	rcu_read_unlock(); +	return ctx; +} + +/* + * Get the context for a task and increment its pin_count so it + * can't get swapped to another task.  This also increments its + * reference count so that the context can't get freed. + */ +static struct perf_counter_context *perf_pin_task_context(struct task_struct *task) +{ +	struct perf_counter_context *ctx; +	unsigned long flags; + +	ctx = perf_lock_task_context(task, &flags); +	if (ctx) { +		++ctx->pin_count; +		get_ctx(ctx); +		spin_unlock_irqrestore(&ctx->lock, flags); +	} +	return ctx; +} + +static void perf_unpin_context(struct perf_counter_context *ctx) +{ +	unsigned long flags; + +	spin_lock_irqsave(&ctx->lock, flags); +	--ctx->pin_count; +	spin_unlock_irqrestore(&ctx->lock, flags); +	put_ctx(ctx); +} + +/* + * Add a counter to the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. 
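+ * (ctx->mutex serializes the blocking callers; ctx->lock is also + * taken by the sched_in/sched_out paths that walk these lists.) 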
+ */ +static void +list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +{ +	struct perf_counter *group_leader = counter->group_leader; + +	/* +	 * Depending on whether it is a standalone or sibling counter, +	 * add it straight to the context's counter list, or to the group +	 * leader's sibling list: +	 */ +	if (group_leader == counter) +		list_add_tail(&counter->list_entry, &ctx->counter_list); +	else { +		list_add_tail(&counter->list_entry, &group_leader->sibling_list); +		group_leader->nr_siblings++; +	} + +	list_add_rcu(&counter->event_entry, &ctx->event_list); +	ctx->nr_counters++; +} + +/* + * Remove a counter from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ +static void +list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +{ +	struct perf_counter *sibling, *tmp; + +	if (list_empty(&counter->list_entry)) +		return; +	ctx->nr_counters--; + +	list_del_init(&counter->list_entry); +	list_del_rcu(&counter->event_entry); + +	if (counter->group_leader != counter) +		counter->group_leader->nr_siblings--; + +	/* +	 * If this was a group counter with sibling counters then +	 * upgrade the siblings to singleton counters by adding them +	 * to the context list directly: +	 */ +	list_for_each_entry_safe(sibling, tmp, +				 &counter->sibling_list, list_entry) { + +		list_move_tail(&sibling->list_entry, &ctx->counter_list); +		sibling->group_leader = sibling; +	} +} + +static void +counter_sched_out(struct perf_counter *counter, +		  struct perf_cpu_context *cpuctx, +		  struct perf_counter_context *ctx) +{ +	if (counter->state != PERF_COUNTER_STATE_ACTIVE) +		return; + +	counter->state = PERF_COUNTER_STATE_INACTIVE; +	counter->tstamp_stopped = ctx->time; +	counter->pmu->disable(counter); +	counter->oncpu = -1; + +	if (!is_software_counter(counter)) +		cpuctx->active_oncpu--; +	ctx->nr_active--; +	if (counter->attr.exclusive || !cpuctx->active_oncpu) +		cpuctx->exclusive = 0; +} + +static void +group_sched_out(struct perf_counter *group_counter, +		struct perf_cpu_context *cpuctx, +		struct perf_counter_context *ctx) +{ +	struct perf_counter *counter; + +	if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) +		return; + +	counter_sched_out(group_counter, cpuctx, ctx); + +	/* +	 * Schedule out siblings (if any): +	 */ +	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) +		counter_sched_out(counter, cpuctx, ctx); + +	if (group_counter->attr.exclusive) +		cpuctx->exclusive = 0; +} + +/* + * Cross CPU call to remove a performance counter + * + * We disable the counter on the hardware level first. After that we + * remove it from the context list. + */ +static void __perf_counter_remove_from_context(void *info) +{ +	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); +	struct perf_counter *counter = info; +	struct perf_counter_context *ctx = counter->ctx; + +	/* +	 * If this is a task context, we need to check whether it is +	 * the current task context of this cpu. If not it has been +	 * scheduled out before the smp call arrived. +	 */ +	if (ctx->task && cpuctx->task_ctx != ctx) +		return; + +	spin_lock(&ctx->lock); +	/* +	 * Protect the list operation against NMI by disabling the +	 * counters on a global level. 
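+	 * With the counters disabled, no PMU interrupt can come in and +	 * observe the list in a half-updated state. 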
+	 */ +	perf_disable(); + +	counter_sched_out(counter, cpuctx, ctx); + +	list_del_counter(counter, ctx); + +	if (!ctx->task) { +		/* +		 * Allow more per task counters with respect to the +		 * reservation: +		 */ +		cpuctx->max_pertask = +			min(perf_max_counters - ctx->nr_counters, +			    perf_max_counters - perf_reserved_percpu); +	} + +	perf_enable(); +	spin_unlock(&ctx->lock); +} + + +/* + * Remove the counter from a task's (or a CPU's) list of counters. + * + * Must be called with ctx->mutex held. + * + * CPU counters are removed with an smp call. For task counters we only + * call when the task is on a CPU. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid.  This is OK when called from perf_release since + * that only calls us on the top-level context, which can't be a clone. + * When called from perf_counter_exit_task, it's OK because the + * context has been detached from its task. + */ +static void perf_counter_remove_from_context(struct perf_counter *counter) +{ +	struct perf_counter_context *ctx = counter->ctx; +	struct task_struct *task = ctx->task; + +	if (!task) { +		/* +		 * Per cpu counters are removed via an smp call and +		 * the removal is always successful. +		 */ +		smp_call_function_single(counter->cpu, +					 __perf_counter_remove_from_context, +					 counter, 1); +		return; +	} + +retry: +	task_oncpu_function_call(task, __perf_counter_remove_from_context, +				 counter); + +	spin_lock_irq(&ctx->lock); +	/* +	 * If the context is active we need to retry the smp call. +	 */ +	if (ctx->nr_active && !list_empty(&counter->list_entry)) { +		spin_unlock_irq(&ctx->lock); +		goto retry; +	} + +	/* +	 * The lock prevents this context from being scheduled in, so we +	 * can remove the counter safely if the call above did not +	 * succeed. +	 */ +	if (!list_empty(&counter->list_entry)) { +		list_del_counter(counter, ctx); +	} +	spin_unlock_irq(&ctx->lock); +} + +static inline u64 perf_clock(void) +{ +	return cpu_clock(smp_processor_id()); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_counter_context *ctx) +{ +	u64 now = perf_clock(); + +	ctx->time += now - ctx->timestamp; +	ctx->timestamp = now; +} + +/* + * Update the total_time_enabled and total_time_running fields for a counter. + */ +static void update_counter_times(struct perf_counter *counter) +{ +	struct perf_counter_context *ctx = counter->ctx; +	u64 run_end; + +	if (counter->state < PERF_COUNTER_STATE_INACTIVE) +		return; + +	counter->total_time_enabled = ctx->time - counter->tstamp_enabled; + +	if (counter->state == PERF_COUNTER_STATE_INACTIVE) +		run_end = counter->tstamp_stopped; +	else +		run_end = ctx->time; + +	counter->total_time_running = run_end - counter->tstamp_running; +} + +/* + * Update total_time_enabled and total_time_running for all counters in a group. 
+ */ +static void update_group_times(struct perf_counter *leader) +{ +	struct perf_counter *counter; + +	update_counter_times(leader); +	list_for_each_entry(counter, &leader->sibling_list, list_entry) +		update_counter_times(counter); +} + +/* + * Cross CPU call to disable a performance counter + */ +static void __perf_counter_disable(void *info) +{ +	struct perf_counter *counter = info; +	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); +	struct perf_counter_context *ctx = counter->ctx; + +	/* +	 * If this is a per-task counter, need to check whether this +	 * counter's task is the current task on this cpu. +	 */ +	if (ctx->task && cpuctx->task_ctx != ctx) +		return; + +	spin_lock(&ctx->lock); + +	/* +	 * If the counter is on, turn it off. +	 * If it is in error state, leave it in error state. +	 */ +	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { +		update_context_time(ctx); +		update_counter_times(counter); +		if (counter == counter->group_leader) +			group_sched_out(counter, cpuctx, ctx); +		else +			counter_sched_out(counter, cpuctx, ctx); +		counter->state = PERF_COUNTER_STATE_OFF; +	} + +	spin_unlock(&ctx->lock); +} + +/* + * Disable a counter. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid.  This condition is satisfied when called through + * perf_counter_for_each_child or perf_counter_for_each because they + * hold the top-level counter's child_mutex, so any descendant that + * goes to exit will block in sync_child_counter. + * When called from perf_pending_counter it's OK because counter->ctx + * is the current context on this CPU and preemption is disabled, + * hence we can't get into perf_counter_task_sched_out for this context. + */ +static void perf_counter_disable(struct perf_counter *counter) +{ +	struct perf_counter_context *ctx = counter->ctx; +	struct task_struct *task = ctx->task; + +	if (!task) { +		/* +		 * Disable the counter on the cpu that it's on +		 */ +		smp_call_function_single(counter->cpu, __perf_counter_disable, +					 counter, 1); +		return; +	} + + retry: +	task_oncpu_function_call(task, __perf_counter_disable, counter); + +	spin_lock_irq(&ctx->lock); +	/* +	 * If the counter is still active, we need to retry the cross-call. +	 */ +	if (counter->state == PERF_COUNTER_STATE_ACTIVE) { +		spin_unlock_irq(&ctx->lock); +		goto retry; +	} + +	/* +	 * Since we have the lock this context can't be scheduled +	 * in, so we can change the state safely. 
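+	 * If the counter is still INACTIVE here, the cross-call did not +	 * disable it (the task was no longer running it), so mark it +	 * OFF ourselves. 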
+	 */ +	if (counter->state == PERF_COUNTER_STATE_INACTIVE) { +		update_counter_times(counter); +		counter->state = PERF_COUNTER_STATE_OFF; +	} + +	spin_unlock_irq(&ctx->lock); +} + +static int +counter_sched_in(struct perf_counter *counter, +		 struct perf_cpu_context *cpuctx, +		 struct perf_counter_context *ctx, +		 int cpu) +{ +	if (counter->state <= PERF_COUNTER_STATE_OFF) +		return 0; + +	counter->state = PERF_COUNTER_STATE_ACTIVE; +	counter->oncpu = cpu;	/* TODO: put 'cpu' into cpuctx->cpu */ +	/* +	 * The new state must be visible before we turn it on in the hardware: +	 */ +	smp_wmb(); + +	if (counter->pmu->enable(counter)) { +		counter->state = PERF_COUNTER_STATE_INACTIVE; +		counter->oncpu = -1; +		return -EAGAIN; +	} + +	counter->tstamp_running += ctx->time - counter->tstamp_stopped; + +	if (!is_software_counter(counter)) +		cpuctx->active_oncpu++; +	ctx->nr_active++; + +	if (counter->attr.exclusive) +		cpuctx->exclusive = 1; + +	return 0; +} + +static int +group_sched_in(struct perf_counter *group_counter, +	       struct perf_cpu_context *cpuctx, +	       struct perf_counter_context *ctx, +	       int cpu) +{ +	struct perf_counter *counter, *partial_group; +	int ret; + +	if (group_counter->state == PERF_COUNTER_STATE_OFF) +		return 0; + +	ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); +	if (ret) +		return ret < 0 ? ret : 0; + +	if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) +		return -EAGAIN; + +	/* +	 * Schedule in siblings as one group (if any): +	 */ +	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { +		if (counter_sched_in(counter, cpuctx, ctx, cpu)) { +			partial_group = counter; +			goto group_error; +		} +	} + +	return 0; + +group_error: +	/* +	 * Groups can be scheduled in as one unit only, so undo any +	 * partial group before returning: +	 */ +	list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { +		if (counter == partial_group) +			break; +		counter_sched_out(counter, cpuctx, ctx); +	} +	counter_sched_out(group_counter, cpuctx, ctx); + +	return -EAGAIN; +} + +/* + * Return 1 for a group consisting entirely of software counters, + * 0 if the group contains any hardware counters. + */ +static int is_software_only_group(struct perf_counter *leader) +{ +	struct perf_counter *counter; + +	if (!is_software_counter(leader)) +		return 0; + +	list_for_each_entry(counter, &leader->sibling_list, list_entry) +		if (!is_software_counter(counter)) +			return 0; + +	return 1; +} + +/* + * Work out whether we can put this counter group on the CPU now. + */ +static int group_can_go_on(struct perf_counter *counter, +			   struct perf_cpu_context *cpuctx, +			   int can_add_hw) +{ +	/* +	 * Groups consisting entirely of software counters can always go on. +	 */ +	if (is_software_only_group(counter)) +		return 1; +	/* +	 * If an exclusive group is already on, no other hardware +	 * counters can go on. +	 */ +	if (cpuctx->exclusive) +		return 0; +	/* +	 * If this group is exclusive and there are already +	 * counters on the CPU, it can't go on. +	 */ +	if (counter->attr.exclusive && cpuctx->active_oncpu) +		return 0; +	/* +	 * Otherwise, try to add it if all previous groups were able +	 * to go on. 
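+	 * (the caller clears can_add_hw as soon as one hardware group +	 * fails to schedule, so later groups cannot jump the queue) 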
+	 */ +	return can_add_hw; +} + +static void add_counter_to_ctx(struct perf_counter *counter, +			       struct perf_counter_context *ctx) +{ +	list_add_counter(counter, ctx); +	counter->tstamp_enabled = ctx->time; +	counter->tstamp_running = ctx->time; +	counter->tstamp_stopped = ctx->time; +} + +/* + * Cross CPU call to install and enable a performance counter + * + * Must be called with ctx->mutex held + */ +static void __perf_install_in_context(void *info) +{ +	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); +	struct perf_counter *counter = info; +	struct perf_counter_context *ctx = counter->ctx; +	struct perf_counter *leader = counter->group_leader; +	int cpu = smp_processor_id(); +	int err; + +	/* +	 * If this is a task context, we need to check whether it is +	 * the current task context of this cpu. If not it has been +	 * scheduled out before the smp call arrived. +	 * Or possibly this is the right context but it isn't +	 * on this cpu because it had no counters. +	 */ +	if (ctx->task && cpuctx->task_ctx != ctx) { +		if (cpuctx->task_ctx || ctx->task != current) +			return; +		cpuctx->task_ctx = ctx; +	} + +	spin_lock(&ctx->lock); +	ctx->is_active = 1; +	update_context_time(ctx); + +	/* +	 * Protect the list operation against NMI by disabling the +	 * counters on a global level. NOP for non NMI based counters. +	 */ +	perf_disable(); + +	add_counter_to_ctx(counter, ctx); + +	/* +	 * Don't put the counter on if it is disabled or if +	 * it is in a group and the group isn't on. +	 */ +	if (counter->state != PERF_COUNTER_STATE_INACTIVE || +	    (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)) +		goto unlock; + +	/* +	 * An exclusive counter can't go on if there are already active +	 * hardware counters, and no hardware counter can go on if there +	 * is already an exclusive counter on. +	 */ +	if (!group_can_go_on(counter, cpuctx, 1)) +		err = -EEXIST; +	else +		err = counter_sched_in(counter, cpuctx, ctx, cpu); + +	if (err) { +		/* +		 * This counter couldn't go on.  If it is in a group +		 * then we have to pull the whole group off. +		 * If the counter group is pinned then put it in error state. +		 */ +		if (leader != counter) +			group_sched_out(leader, cpuctx, ctx); +		if (leader->attr.pinned) { +			update_group_times(leader); +			leader->state = PERF_COUNTER_STATE_ERROR; +		} +	} + +	if (!err && !ctx->task && cpuctx->max_pertask) +		cpuctx->max_pertask--; + + unlock: +	perf_enable(); + +	spin_unlock(&ctx->lock); +} + +/* + * Attach a performance counter to a context + * + * First we add the counter to the list with the hardware enable bit + * in counter->hw_config cleared. + * + * If the counter is attached to a task which is on a CPU we use an smp + * call to enable it in the task context. The task might have been + * scheduled away, but we check this in the smp call again. + * + * Must be called with ctx->mutex held. + */ +static void +perf_install_in_context(struct perf_counter_context *ctx, +			struct perf_counter *counter, +			int cpu) +{ +	struct task_struct *task = ctx->task; + +	if (!task) { +		/* +		 * Per cpu counters are installed via an smp call and +		 * the install is always successful. +		 */ +		smp_call_function_single(cpu, __perf_install_in_context, +					 counter, 1); +		return; +	} + +retry: +	task_oncpu_function_call(task, __perf_install_in_context, +				 counter); + +	spin_lock_irq(&ctx->lock); +	/* +	 * we need to retry the smp call. 
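+	 * The cross-call does nothing if the task was scheduled out +	 * before it ran, so if the context is active and the counter is +	 * still not on the list we must try again. 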
+	 */ +	if (ctx->is_active && list_empty(&counter->list_entry)) { +		spin_unlock_irq(&ctx->lock); +		goto retry; +	} + +	/* +	 * The lock prevents this context from being scheduled in, so we +	 * can add the counter safely if the call above did not +	 * succeed. +	 */ +	if (list_empty(&counter->list_entry)) +		add_counter_to_ctx(counter, ctx); +	spin_unlock_irq(&ctx->lock); +} + +/* + * Cross CPU call to enable a performance counter + */ +static void __perf_counter_enable(void *info) +{ +	struct perf_counter *counter = info; +	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); +	struct perf_counter_context *ctx = counter->ctx; +	struct perf_counter *leader = counter->group_leader; +	int err; + +	/* +	 * If this is a per-task counter, need to check whether this +	 * counter's task is the current task on this cpu. +	 */ +	if (ctx->task && cpuctx->task_ctx != ctx) { +		if (cpuctx->task_ctx || ctx->task != current) +			return; +		cpuctx->task_ctx = ctx; +	} + +	spin_lock(&ctx->lock); +	ctx->is_active = 1; +	update_context_time(ctx); + +	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) +		goto unlock; +	counter->state = PERF_COUNTER_STATE_INACTIVE; +	counter->tstamp_enabled = ctx->time - counter->total_time_enabled; + +	/* +	 * If the counter is in a group and isn't the group leader, +	 * then don't put it on unless the group is on. +	 */ +	if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) +		goto unlock; + +	if (!group_can_go_on(counter, cpuctx, 1)) { +		err = -EEXIST; +	} else { +		perf_disable(); +		if (counter == leader) +			err = group_sched_in(counter, cpuctx, ctx, +					     smp_processor_id()); +		else +			err = counter_sched_in(counter, cpuctx, ctx, +					       smp_processor_id()); +		perf_enable(); +	} + +	if (err) { +		/* +		 * If this counter can't go on and it's part of a +		 * group, then the whole group has to come off. +		 */ +		if (leader != counter) +			group_sched_out(leader, cpuctx, ctx); +		if (leader->attr.pinned) { +			update_group_times(leader); +			leader->state = PERF_COUNTER_STATE_ERROR; +		} +	} + + unlock: +	spin_unlock(&ctx->lock); +} + +/* + * Enable a counter. + * + * If counter->ctx is a cloned context, callers must make sure that + * every task struct that counter->ctx->task could possibly point to + * remains valid.  This condition is satisfied when called through + * perf_counter_for_each_child or perf_counter_for_each as described + * for perf_counter_disable. + */ +static void perf_counter_enable(struct perf_counter *counter) +{ +	struct perf_counter_context *ctx = counter->ctx; +	struct task_struct *task = ctx->task; + +	if (!task) { +		/* +		 * Enable the counter on the cpu that it's on +		 */ +		smp_call_function_single(counter->cpu, __perf_counter_enable, +					 counter, 1); +		return; +	} + +	spin_lock_irq(&ctx->lock); +	if (counter->state >= PERF_COUNTER_STATE_INACTIVE) +		goto out; + +	/* +	 * If the counter is in error state, clear that first. +	 * That way, if we see the counter in error state below, we +	 * know that it has gone back into error state, as distinct +	 * from the task having been scheduled away before the +	 * cross-call arrived. +	 */ +	if (counter->state == PERF_COUNTER_STATE_ERROR) +		counter->state = PERF_COUNTER_STATE_OFF; + + retry: +	spin_unlock_irq(&ctx->lock); +	task_oncpu_function_call(task, __perf_counter_enable, counter); + +	spin_lock_irq(&ctx->lock); + +	/* +	 * If the context is active and the counter is still off, +	 * we need to retry the cross-call. 
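+	 * (the task may have been scheduled away again before +	 * __perf_counter_enable could run) 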
+	 */ +	if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF) +		goto retry; + +	/* +	 * Since we have the lock this context can't be scheduled +	 * in, so we can change the state safely. +	 */ +	if (counter->state == PERF_COUNTER_STATE_OFF) { +		counter->state = PERF_COUNTER_STATE_INACTIVE; +		counter->tstamp_enabled = +			ctx->time - counter->total_time_enabled; +	} + out: +	spin_unlock_irq(&ctx->lock); +} + +static int perf_counter_refresh(struct perf_counter *counter, int refresh) +{ +	/* +	 * not supported on inherited counters +	 */ +	if (counter->attr.inherit) +		return -EINVAL; + +	atomic_add(refresh, &counter->event_limit); +	perf_counter_enable(counter); + +	return 0; +} + +void __perf_counter_sched_out(struct perf_counter_context *ctx, +			      struct perf_cpu_context *cpuctx) +{ +	struct perf_counter *counter; + +	spin_lock(&ctx->lock); +	ctx->is_active = 0; +	if (likely(!ctx->nr_counters)) +		goto out; +	update_context_time(ctx); + +	perf_disable(); +	if (ctx->nr_active) { +		list_for_each_entry(counter, &ctx->counter_list, list_entry) { +			if (counter != counter->group_leader) +				counter_sched_out(counter, cpuctx, ctx); +			else +				group_sched_out(counter, cpuctx, ctx); +		} +	} +	perf_enable(); + out: +	spin_unlock(&ctx->lock); +} + +/* + * Test whether two contexts are equivalent, i.e. whether they + * have both been cloned from the same version of the same context + * and they both have the same number of enabled counters. + * If the number of enabled counters is the same, then the set + * of enabled counters should be the same, because these are both + * inherited contexts, therefore we can't access individual counters + * in them directly with an fd; we can only enable/disable all + * counters via prctl, or enable/disable all counters in a family + * via ioctl, which will have the same effect on both contexts. + */ +static int context_equiv(struct perf_counter_context *ctx1, +			 struct perf_counter_context *ctx2) +{ +	return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx +		&& ctx1->parent_gen == ctx2->parent_gen +		&& !ctx1->pin_count && !ctx2->pin_count; +} + +/* + * Called from scheduler to remove the counters of the current task, + * with interrupts disabled. + * + * We stop each counter and update the counter value in counter->count. + * + * This does not protect us against NMI, but disable() + * sets the disabled bit in the control field of counter _before_ + * accessing the counter control register. If a NMI hits, then it will + * not restart the counter. + */ +void perf_counter_task_sched_out(struct task_struct *task, +				 struct task_struct *next, int cpu) +{ +	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); +	struct perf_counter_context *ctx = task->perf_counter_ctxp; +	struct perf_counter_context *next_ctx; +	struct perf_counter_context *parent; +	struct pt_regs *regs; +	int do_switch = 1; + +	regs = task_pt_regs(task); +	perf_swcounter_event(PERF_COUNT_SW_CONTEXT_SWITCHES, 1, 1, regs, 0); + +	if (likely(!ctx || !cpuctx->task_ctx)) +		return; + +	update_context_time(ctx); + +	rcu_read_lock(); +	parent = rcu_dereference(ctx->parent_ctx); +	next_ctx = next->perf_counter_ctxp; +	if (parent && next_ctx && +	    rcu_dereference(next_ctx->parent_ctx) == parent) { +		/* +		 * Looks like the two contexts are clones, so we might be +		 * able to optimize the context switch.  
We lock both +		 * contexts and check that they are clones under the +		 * lock (including re-checking that neither has been +		 * uncloned in the meantime).  It doesn't matter which +		 * order we take the locks because no other cpu could +		 * be trying to lock both of these tasks. +		 */ +		spin_lock(&ctx->lock); +		spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); +		if (context_equiv(ctx, next_ctx)) { +			/* +			 * XXX do we need a memory barrier of sorts +			 * wrt to rcu_dereference() of perf_counter_ctxp +			 */ +			task->perf_counter_ctxp = next_ctx; +			next->perf_counter_ctxp = ctx; +			ctx->task = next; +			next_ctx->task = task; +			do_switch = 0; +		} +		spin_unlock(&next_ctx->lock); +		spin_unlock(&ctx->lock); +	} +	rcu_read_unlock(); + +	if (do_switch) { +		__perf_counter_sched_out(ctx, cpuctx); +		cpuctx->task_ctx = NULL; +	} +} + +/* + * Called with IRQs disabled + */ +static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) +{ +	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + +	if (!cpuctx->task_ctx) +		return; + +	if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) +		return; + +	__perf_counter_sched_out(ctx, cpuctx); +	cpuctx->task_ctx = NULL; +} + +/* + * Called with IRQs disabled + */ +static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) +{ +	__perf_counter_sched_out(&cpuctx->ctx, cpuctx); +} + +static void +__perf_counter_sched_in(struct perf_counter_context *ctx, +			struct perf_cpu_context *cpuctx, int cpu) +{ +	struct perf_counter *counter; +	int can_add_hw = 1; + +	spin_lock(&ctx->lock); +	ctx->is_active = 1; +	if (likely(!ctx->nr_counters)) +		goto out; + +	ctx->timestamp = perf_clock(); + +	perf_disable(); + +	/* +	 * First go through the list and put on any pinned groups +	 * in order to give them the best chance of going on. +	 */ +	list_for_each_entry(counter, &ctx->counter_list, list_entry) { +		if (counter->state <= PERF_COUNTER_STATE_OFF || +		    !counter->attr.pinned) +			continue; +		if (counter->cpu != -1 && counter->cpu != cpu) +			continue; + +		if (counter != counter->group_leader) +			counter_sched_in(counter, cpuctx, ctx, cpu); +		else { +			if (group_can_go_on(counter, cpuctx, 1)) +				group_sched_in(counter, cpuctx, ctx, cpu); +		} + +		/* +		 * If this pinned group hasn't been scheduled, +		 * put it in error state. +		 */ +		if (counter->state == PERF_COUNTER_STATE_INACTIVE) { +			update_group_times(counter); +			counter->state = PERF_COUNTER_STATE_ERROR; +		} +	} + +	list_for_each_entry(counter, &ctx->counter_list, list_entry) { +		/* +		 * Ignore counters in OFF or ERROR state, and +		 * ignore pinned counters since we did them already. +		 */ +		if (counter->state <= PERF_COUNTER_STATE_OFF || +		    counter->attr.pinned) +			continue; + +		/* +		 * Listen to the 'cpu' scheduling filter constraint +		 * of counters: +		 */ +		if (counter->cpu != -1 && counter->cpu != cpu) +			continue; + +		if (counter != counter->group_leader) { +			if (counter_sched_in(counter, cpuctx, ctx, cpu)) +				can_add_hw = 0; +		} else { +			if (group_can_go_on(counter, cpuctx, can_add_hw)) { +				if (group_sched_in(counter, cpuctx, ctx, cpu)) +					can_add_hw = 0; +			} +		} +	} +	perf_enable(); + out: +	spin_unlock(&ctx->lock); +} + +/* + * Called from scheduler to add the counters of the current task + * with interrupts disabled. + * + * We restore the counter value and then enable it. 
+ * + * This does not protect us against NMI, but enable() + * sets the enabled bit in the control field of counter _before_ + * accessing the counter control register. If a NMI hits, then it will + * keep the counter running. + */ +void perf_counter_task_sched_in(struct task_struct *task, int cpu) +{ +	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); +	struct perf_counter_context *ctx = task->perf_counter_ctxp; + +	if (likely(!ctx)) +		return; +	if (cpuctx->task_ctx == ctx) +		return; +	__perf_counter_sched_in(ctx, cpuctx, cpu); +	cpuctx->task_ctx = ctx; +} + +static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +{ +	struct perf_counter_context *ctx = &cpuctx->ctx; + +	__perf_counter_sched_in(ctx, cpuctx, cpu); +} + +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_counter *counter, int enable); +static void perf_log_period(struct perf_counter *counter, u64 period); + +static void perf_adjust_period(struct perf_counter *counter, u64 events) +{ +	struct hw_perf_counter *hwc = &counter->hw; +	u64 period, sample_period; +	s64 delta; + +	events *= hwc->sample_period; +	period = div64_u64(events, counter->attr.sample_freq); + +	delta = (s64)(period - hwc->sample_period); +	delta = (delta + 7) / 8; /* low pass filter */ + +	sample_period = hwc->sample_period + delta; + +	if (!sample_period) +		sample_period = 1; + +	perf_log_period(counter, sample_period); + +	hwc->sample_period = sample_period; +} + +static void perf_ctx_adjust_freq(struct perf_counter_context *ctx) +{ +	struct perf_counter *counter; +	struct hw_perf_counter *hwc; +	u64 interrupts, freq; + +	spin_lock(&ctx->lock); +	list_for_each_entry(counter, &ctx->counter_list, list_entry) { +		if (counter->state != PERF_COUNTER_STATE_ACTIVE) +			continue; + +		hwc = &counter->hw; + +		interrupts = hwc->interrupts; +		hwc->interrupts = 0; + +		/* +		 * unthrottle counters on the tick +		 */ +		if (interrupts == MAX_INTERRUPTS) { +			perf_log_throttle(counter, 1); +			counter->pmu->unthrottle(counter); +			interrupts = 2*sysctl_perf_counter_sample_rate/HZ; +		} + +		if (!counter->attr.freq || !counter->attr.sample_freq) +			continue; + +		/* +		 * if the specified freq < HZ then we need to skip ticks +		 */ +		if (counter->attr.sample_freq < HZ) { +			freq = counter->attr.sample_freq; + +			hwc->freq_count += freq; +			hwc->freq_interrupts += interrupts; + +			if (hwc->freq_count < HZ) +				continue; + +			interrupts = hwc->freq_interrupts; +			hwc->freq_interrupts = 0; +			hwc->freq_count -= HZ; +		} else +			freq = HZ; + +		perf_adjust_period(counter, freq * interrupts); + +		/* +		 * In order to avoid being stalled by an (accidental) huge +		 * sample period, force reset the sample period if we didn't +		 * get any events in this freq period. 
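+		 * Clearing period_left makes the PMU pick up the new, +		 * smaller sample_period when the counter is re-enabled. 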
+		 */ +		if (!interrupts) { +			perf_disable(); +			counter->pmu->disable(counter); +			atomic_set(&hwc->period_left, 0); +			counter->pmu->enable(counter); +			perf_enable(); +		} +	} +	spin_unlock(&ctx->lock); +} + +/* + * Round-robin a context's counters: + */ +static void rotate_ctx(struct perf_counter_context *ctx) +{ +	struct perf_counter *counter; + +	if (!ctx->nr_counters) +		return; + +	spin_lock(&ctx->lock); +	/* +	 * Rotate the first entry last (works just fine for group counters too): +	 */ +	perf_disable(); +	list_for_each_entry(counter, &ctx->counter_list, list_entry) { +		list_move_tail(&counter->list_entry, &ctx->counter_list); +		break; +	} +	perf_enable(); + +	spin_unlock(&ctx->lock); +} + +void perf_counter_task_tick(struct task_struct *curr, int cpu) +{ +	struct perf_cpu_context *cpuctx; +	struct perf_counter_context *ctx; + +	if (!atomic_read(&nr_counters)) +		return; + +	cpuctx = &per_cpu(perf_cpu_context, cpu); +	ctx = curr->perf_counter_ctxp; + +	perf_ctx_adjust_freq(&cpuctx->ctx); +	if (ctx) +		perf_ctx_adjust_freq(ctx); + +	perf_counter_cpu_sched_out(cpuctx); +	if (ctx) +		__perf_counter_task_sched_out(ctx); + +	rotate_ctx(&cpuctx->ctx); +	if (ctx) +		rotate_ctx(ctx); + +	perf_counter_cpu_sched_in(cpuctx, cpu); +	if (ctx) +		perf_counter_task_sched_in(curr, cpu); +} + +/* + * Cross CPU call to read the hardware counter + */ +static void __read(void *info) +{ +	struct perf_counter *counter = info; +	struct perf_counter_context *ctx = counter->ctx; +	unsigned long flags; + +	local_irq_save(flags); +	if (ctx->is_active) +		update_context_time(ctx); +	counter->pmu->read(counter); +	update_counter_times(counter); +	local_irq_restore(flags); +} + +static u64 perf_counter_read(struct perf_counter *counter) +{ +	/* +	 * If counter is enabled and currently active on a CPU, update the +	 * value in the counter structure: +	 */ +	if (counter->state == PERF_COUNTER_STATE_ACTIVE) { +		smp_call_function_single(counter->oncpu, +					 __read, counter, 1); +	} else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { +		update_counter_times(counter); +	} + +	return atomic64_read(&counter->count); +} + +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, +			    struct task_struct *task) +{ +	memset(ctx, 0, sizeof(*ctx)); +	spin_lock_init(&ctx->lock); +	mutex_init(&ctx->mutex); +	INIT_LIST_HEAD(&ctx->counter_list); +	INIT_LIST_HEAD(&ctx->event_list); +	atomic_set(&ctx->refcount, 1); +	ctx->task = task; +} + +static struct perf_counter_context *find_get_context(pid_t pid, int cpu) +{ +	struct perf_counter_context *parent_ctx; +	struct perf_counter_context *ctx; +	struct perf_cpu_context *cpuctx; +	struct task_struct *task; +	unsigned long flags; +	int err; + +	/* +	 * If cpu is not a wildcard then this is a percpu counter: +	 */ +	if (cpu != -1) { +		/* Must be root to operate on a CPU counter: */ +		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) +			return ERR_PTR(-EACCES); + +		if (cpu < 0 || cpu > num_possible_cpus()) +			return ERR_PTR(-EINVAL); + +		/* +		 * We could be clever and allow to attach a counter to an +		 * offline CPU and activate it when the CPU comes up, but +		 * that's for later. 
+		 */ +		if (!cpu_isset(cpu, cpu_online_map)) +			return ERR_PTR(-ENODEV); + +		cpuctx = &per_cpu(perf_cpu_context, cpu); +		ctx = &cpuctx->ctx; +		get_ctx(ctx); + +		return ctx; +	} + +	rcu_read_lock(); +	if (!pid) +		task = current; +	else +		task = find_task_by_vpid(pid); +	if (task) +		get_task_struct(task); +	rcu_read_unlock(); + +	if (!task) +		return ERR_PTR(-ESRCH); + +	/* +	 * Can't attach counters to a dying task. +	 */ +	err = -ESRCH; +	if (task->flags & PF_EXITING) +		goto errout; + +	/* Reuse ptrace permission checks for now. */ +	err = -EACCES; +	if (!ptrace_may_access(task, PTRACE_MODE_READ)) +		goto errout; + + retry: +	ctx = perf_lock_task_context(task, &flags); +	if (ctx) { +		parent_ctx = ctx->parent_ctx; +		if (parent_ctx) { +			put_ctx(parent_ctx); +			ctx->parent_ctx = NULL;		/* no longer a clone */ +		} +		/* +		 * Get an extra reference before dropping the lock so that +		 * this context won't get freed if the task exits. +		 */ +		get_ctx(ctx); +		spin_unlock_irqrestore(&ctx->lock, flags); +	} + +	if (!ctx) { +		ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); +		err = -ENOMEM; +		if (!ctx) +			goto errout; +		__perf_counter_init_context(ctx, task); +		get_ctx(ctx); +		if (cmpxchg(&task->perf_counter_ctxp, NULL, ctx)) { +			/* +			 * We raced with some other task; use +			 * the context they set. +			 */ +			kfree(ctx); +			goto retry; +		} +		get_task_struct(task); +	} + +	put_task_struct(task); +	return ctx; + + errout: +	put_task_struct(task); +	return ERR_PTR(err); +} + +static void free_counter_rcu(struct rcu_head *head) +{ +	struct perf_counter *counter; + +	counter = container_of(head, struct perf_counter, rcu_head); +	if (counter->ns) +		put_pid_ns(counter->ns); +	kfree(counter); +} + +static void perf_pending_sync(struct perf_counter *counter); + +static void free_counter(struct perf_counter *counter) +{ +	perf_pending_sync(counter); + +	atomic_dec(&nr_counters); +	if (counter->attr.mmap) +		atomic_dec(&nr_mmap_counters); +	if (counter->attr.comm) +		atomic_dec(&nr_comm_counters); + +	if (counter->destroy) +		counter->destroy(counter); + +	put_ctx(counter->ctx); +	call_rcu(&counter->rcu_head, free_counter_rcu); +} + +/* + * Called when the last reference to the file is gone. + */ +static int perf_release(struct inode *inode, struct file *file) +{ +	struct perf_counter *counter = file->private_data; +	struct perf_counter_context *ctx = counter->ctx; + +	file->private_data = NULL; + +	WARN_ON_ONCE(ctx->parent_ctx); +	mutex_lock(&ctx->mutex); +	perf_counter_remove_from_context(counter); +	mutex_unlock(&ctx->mutex); + +	mutex_lock(&counter->owner->perf_counter_mutex); +	list_del_init(&counter->owner_entry); +	mutex_unlock(&counter->owner->perf_counter_mutex); +	put_task_struct(counter->owner); + +	free_counter(counter); + +	return 0; +} + +/* + * Read the performance counter - simple non blocking version for now + */ +static ssize_t +perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) +{ +	u64 values[3]; +	int n; + +	/* +	 * Return end-of-file for a read on a counter that is in +	 * error state (i.e. because it was pinned but it couldn't be +	 * scheduled on to the CPU at some point). 
+	 */ +	if (counter->state == PERF_COUNTER_STATE_ERROR) +		return 0; + +	WARN_ON_ONCE(counter->ctx->parent_ctx); +	mutex_lock(&counter->child_mutex); +	values[0] = perf_counter_read(counter); +	n = 1; +	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) +		values[n++] = counter->total_time_enabled + +			atomic64_read(&counter->child_total_time_enabled); +	if (counter->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) +		values[n++] = counter->total_time_running + +			atomic64_read(&counter->child_total_time_running); +	if (counter->attr.read_format & PERF_FORMAT_ID) +		values[n++] = counter->id; +	mutex_unlock(&counter->child_mutex); + +	if (count < n * sizeof(u64)) +		return -EINVAL; +	count = n * sizeof(u64); + +	if (copy_to_user(buf, values, count)) +		return -EFAULT; + +	return count; +} + +static ssize_t +perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ +	struct perf_counter *counter = file->private_data; + +	return perf_read_hw(counter, buf, count); +} + +static unsigned int perf_poll(struct file *file, poll_table *wait) +{ +	struct perf_counter *counter = file->private_data; +	struct perf_mmap_data *data; +	unsigned int events = POLL_HUP; + +	rcu_read_lock(); +	data = rcu_dereference(counter->data); +	if (data) +		events = atomic_xchg(&data->poll, 0); +	rcu_read_unlock(); + +	poll_wait(file, &counter->waitq, wait); + +	return events; +} + +static void perf_counter_reset(struct perf_counter *counter) +{ +	(void)perf_counter_read(counter); +	atomic64_set(&counter->count, 0); +	perf_counter_update_userpage(counter); +} + +static void perf_counter_for_each_sibling(struct perf_counter *counter, +					  void (*func)(struct perf_counter *)) +{ +	struct perf_counter_context *ctx = counter->ctx; +	struct perf_counter *sibling; + +	WARN_ON_ONCE(ctx->parent_ctx); +	mutex_lock(&ctx->mutex); +	counter = counter->group_leader; + +	func(counter); +	list_for_each_entry(sibling, &counter->sibling_list, list_entry) +		func(sibling); +	mutex_unlock(&ctx->mutex); +} + +/* + * Holding the top-level counter's child_mutex means that any + * descendant process that has inherited this counter will block + * in sync_child_counter if it goes to exit, thus satisfying the + * task existence requirements of perf_counter_enable/disable. 
+ */ +static void perf_counter_for_each_child(struct perf_counter *counter, +					void (*func)(struct perf_counter *)) +{ +	struct perf_counter *child; + +	WARN_ON_ONCE(counter->ctx->parent_ctx); +	mutex_lock(&counter->child_mutex); +	func(counter); +	list_for_each_entry(child, &counter->child_list, child_list) +		func(child); +	mutex_unlock(&counter->child_mutex); +} + +static void perf_counter_for_each(struct perf_counter *counter, +				  void (*func)(struct perf_counter *)) +{ +	struct perf_counter *child; + +	WARN_ON_ONCE(counter->ctx->parent_ctx); +	mutex_lock(&counter->child_mutex); +	perf_counter_for_each_sibling(counter, func); +	list_for_each_entry(child, &counter->child_list, child_list) +		perf_counter_for_each_sibling(child, func); +	mutex_unlock(&counter->child_mutex); +} + +static int perf_counter_period(struct perf_counter *counter, u64 __user *arg) +{ +	struct perf_counter_context *ctx = counter->ctx; +	unsigned long size; +	int ret = 0; +	u64 value; + +	if (!counter->attr.sample_period) +		return -EINVAL; + +	size = copy_from_user(&value, arg, sizeof(value)); +	if (size) +		return -EFAULT; + +	if (!value) +		return -EINVAL; + +	spin_lock_irq(&ctx->lock); +	if (counter->attr.freq) { +		if (value > sysctl_perf_counter_sample_rate) { +			ret = -EINVAL; +			goto unlock; +		} + +		counter->attr.sample_freq = value; +	} else { +		perf_log_period(counter, value); + +		counter->attr.sample_period = value; +		counter->hw.sample_period = value; +	} +unlock: +	spin_unlock_irq(&ctx->lock); + +	return ret; +} + +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ +	struct perf_counter *counter = file->private_data; +	void (*func)(struct perf_counter *); +	u32 flags = arg; + +	switch (cmd) { +	case PERF_COUNTER_IOC_ENABLE: +		func = perf_counter_enable; +		break; +	case PERF_COUNTER_IOC_DISABLE: +		func = perf_counter_disable; +		break; +	case PERF_COUNTER_IOC_RESET: +		func = perf_counter_reset; +		break; + +	case PERF_COUNTER_IOC_REFRESH: +		return perf_counter_refresh(counter, arg); + +	case PERF_COUNTER_IOC_PERIOD: +		return perf_counter_period(counter, (u64 __user *)arg); + +	default: +		return -ENOTTY; +	} + +	if (flags & PERF_IOC_FLAG_GROUP) +		perf_counter_for_each(counter, func); +	else +		perf_counter_for_each_child(counter, func); + +	return 0; +} + +int perf_counter_task_enable(void) +{ +	struct perf_counter *counter; + +	mutex_lock(&current->perf_counter_mutex); +	list_for_each_entry(counter, &current->perf_counter_list, owner_entry) +		perf_counter_for_each_child(counter, perf_counter_enable); +	mutex_unlock(&current->perf_counter_mutex); + +	return 0; +} + +int perf_counter_task_disable(void) +{ +	struct perf_counter *counter; + +	mutex_lock(&current->perf_counter_mutex); +	list_for_each_entry(counter, &current->perf_counter_list, owner_entry) +		perf_counter_for_each_child(counter, perf_counter_disable); +	mutex_unlock(&current->perf_counter_mutex); + +	return 0; +} + +/* + * Callers need to ensure there can be no nesting of this function, otherwise + * the seqlock logic goes bad. We cannot serialize this because the arch + * code calls this from NMI context. + */ +void perf_counter_update_userpage(struct perf_counter *counter) +{ +	struct perf_counter_mmap_page *userpg; +	struct perf_mmap_data *data; + +	rcu_read_lock(); +	data = rcu_dereference(counter->data); +	if (!data) +		goto unlock; + +	userpg = data->user_page; + +	/* +	 * Disable preemption so as to not let the corresponding user-space +	 * spin too long if we get preempted. 
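+	 * userpg->lock is incremented once before and once after the +	 * update, seqcount-style, so user-space can detect a concurrent +	 * update and retry its reads. 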
+	 */ +	preempt_disable(); +	++userpg->lock; +	barrier(); +	userpg->index = counter->hw.idx; +	userpg->offset = atomic64_read(&counter->count); +	if (counter->state == PERF_COUNTER_STATE_ACTIVE) +		userpg->offset -= atomic64_read(&counter->hw.prev_count); + +	barrier(); +	++userpg->lock; +	preempt_enable(); +unlock: +	rcu_read_unlock(); +} + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ +	struct perf_counter *counter = vma->vm_file->private_data; +	struct perf_mmap_data *data; +	int ret = VM_FAULT_SIGBUS; + +	rcu_read_lock(); +	data = rcu_dereference(counter->data); +	if (!data) +		goto unlock; + +	if (vmf->pgoff == 0) { +		vmf->page = virt_to_page(data->user_page); +	} else { +		int nr = vmf->pgoff - 1; + +		if ((unsigned)nr > data->nr_pages) +			goto unlock; + +		vmf->page = virt_to_page(data->data_pages[nr]); +	} +	get_page(vmf->page); +	ret = 0; +unlock: +	rcu_read_unlock(); + +	return ret; +} + +static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) +{ +	struct perf_mmap_data *data; +	unsigned long size; +	int i; + +	WARN_ON(atomic_read(&counter->mmap_count)); + +	size = sizeof(struct perf_mmap_data); +	size += nr_pages * sizeof(void *); + +	data = kzalloc(size, GFP_KERNEL); +	if (!data) +		goto fail; + +	data->user_page = (void *)get_zeroed_page(GFP_KERNEL); +	if (!data->user_page) +		goto fail_user_page; + +	for (i = 0; i < nr_pages; i++) { +		data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); +		if (!data->data_pages[i]) +			goto fail_data_pages; +	} + +	data->nr_pages = nr_pages; +	atomic_set(&data->lock, -1); + +	rcu_assign_pointer(counter->data, data); + +	return 0; + +fail_data_pages: +	for (i--; i >= 0; i--) +		free_page((unsigned long)data->data_pages[i]); + +	free_page((unsigned long)data->user_page); + +fail_user_page: +	kfree(data); + +fail: +	return -ENOMEM; +} + +static void __perf_mmap_data_free(struct rcu_head *rcu_head) +{ +	struct perf_mmap_data *data; +	int i; + +	data = container_of(rcu_head, struct perf_mmap_data, rcu_head); + +	free_page((unsigned long)data->user_page); +	for (i = 0; i < data->nr_pages; i++) +		free_page((unsigned long)data->data_pages[i]); +	kfree(data); +} + +static void perf_mmap_data_free(struct perf_counter *counter) +{ +	struct perf_mmap_data *data = counter->data; + +	WARN_ON(atomic_read(&counter->mmap_count)); + +	rcu_assign_pointer(counter->data, NULL); +	call_rcu(&data->rcu_head, __perf_mmap_data_free); +} + +static void perf_mmap_open(struct vm_area_struct *vma) +{ +	struct perf_counter *counter = vma->vm_file->private_data; + +	atomic_inc(&counter->mmap_count); +} + +static void perf_mmap_close(struct vm_area_struct *vma) +{ +	struct perf_counter *counter = vma->vm_file->private_data; + +	WARN_ON_ONCE(counter->ctx->parent_ctx); +	if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { +		struct user_struct *user = current_user(); + +		atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); +		vma->vm_mm->locked_vm -= counter->data->nr_locked; +		perf_mmap_data_free(counter); +		mutex_unlock(&counter->mmap_mutex); +	} +} + +static struct vm_operations_struct perf_mmap_vmops = { +	.open  = perf_mmap_open, +	.close = perf_mmap_close, +	.fault = perf_mmap_fault, +}; + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ +	struct perf_counter *counter = file->private_data; +	unsigned long user_locked, user_lock_limit; +	struct user_struct *user = current_user(); +	unsigned long locked, lock_limit; +	unsigned long vma_size; +	
unsigned long nr_pages; +	long user_extra, extra; +	int ret = 0; + +	if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) +		return -EINVAL; + +	vma_size = vma->vm_end - vma->vm_start; +	nr_pages = (vma_size / PAGE_SIZE) - 1; + +	/* +	 * If we have data pages ensure they're a power-of-two number, so we +	 * can do bitmasks instead of modulo. +	 */ +	if (nr_pages != 0 && !is_power_of_2(nr_pages)) +		return -EINVAL; + +	if (vma_size != PAGE_SIZE * (1 + nr_pages)) +		return -EINVAL; + +	if (vma->vm_pgoff != 0) +		return -EINVAL; + +	WARN_ON_ONCE(counter->ctx->parent_ctx); +	mutex_lock(&counter->mmap_mutex); +	if (atomic_inc_not_zero(&counter->mmap_count)) { +		if (nr_pages != counter->data->nr_pages) +			ret = -EINVAL; +		goto unlock; +	} + +	user_extra = nr_pages + 1; +	user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); + +	/* +	 * Increase the limit linearly with more CPUs: +	 */ +	user_lock_limit *= num_online_cpus(); + +	user_locked = atomic_long_read(&user->locked_vm) + user_extra; + +	extra = 0; +	if (user_locked > user_lock_limit) +		extra = user_locked - user_lock_limit; + +	lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; +	lock_limit >>= PAGE_SHIFT; +	locked = vma->vm_mm->locked_vm + extra; + +	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { +		ret = -EPERM; +		goto unlock; +	} + +	WARN_ON(counter->data); +	ret = perf_mmap_data_alloc(counter, nr_pages); +	if (ret) +		goto unlock; + +	atomic_set(&counter->mmap_count, 1); +	atomic_long_add(user_extra, &user->locked_vm); +	vma->vm_mm->locked_vm += extra; +	counter->data->nr_locked = extra; +unlock: +	mutex_unlock(&counter->mmap_mutex); + +	vma->vm_flags &= ~VM_MAYWRITE; +	vma->vm_flags |= VM_RESERVED; +	vma->vm_ops = &perf_mmap_vmops; + +	return ret; +} + +static int perf_fasync(int fd, struct file *filp, int on) +{ +	struct inode *inode = filp->f_path.dentry->d_inode; +	struct perf_counter *counter = filp->private_data; +	int retval; + +	mutex_lock(&inode->i_mutex); +	retval = fasync_helper(fd, filp, on, &counter->fasync); +	mutex_unlock(&inode->i_mutex); + +	if (retval < 0) +		return retval; + +	return 0; +} + +static const struct file_operations perf_fops = { +	.release		= perf_release, +	.read			= perf_read, +	.poll			= perf_poll, +	.unlocked_ioctl		= perf_ioctl, +	.compat_ioctl		= perf_ioctl, +	.mmap			= perf_mmap, +	.fasync			= perf_fasync, +}; + +/* + * Perf counter wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_counter_wakeup(struct perf_counter *counter) +{ +	wake_up_all(&counter->waitq); + +	if (counter->pending_kill) { +		kill_fasync(&counter->fasync, SIGIO, counter->pending_kill); +		counter->pending_kill = 0; +	} +} + +/* + * Pending wakeups + * + * Handle the case where we need to wake up from NMI (or rq->lock) context. + * + * The NMI bit means we cannot possibly take locks. Therefore, maintain a + * single linked list and use cmpxchg() to add entries lockless. 
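+ * An entry whose ->next pointer is NULL is not queued; cmpxchg() + * lets us claim an entry exactly once, and PENDING_TAIL terminates + * the list. 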
+ */ + +static void perf_pending_counter(struct perf_pending_entry *entry) +{ +	struct perf_counter *counter = container_of(entry, +			struct perf_counter, pending); + +	if (counter->pending_disable) { +		counter->pending_disable = 0; +		perf_counter_disable(counter); +	} + +	if (counter->pending_wakeup) { +		counter->pending_wakeup = 0; +		perf_counter_wakeup(counter); +	} +} + +#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) + +static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { +	PENDING_TAIL, +}; + +static void perf_pending_queue(struct perf_pending_entry *entry, +			       void (*func)(struct perf_pending_entry *)) +{ +	struct perf_pending_entry **head; + +	if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) +		return; + +	entry->func = func; + +	head = &get_cpu_var(perf_pending_head); + +	do { +		entry->next = *head; +	} while (cmpxchg(head, entry->next, entry) != entry->next); + +	set_perf_counter_pending(); + +	put_cpu_var(perf_pending_head); +} + +static int __perf_pending_run(void) +{ +	struct perf_pending_entry *list; +	int nr = 0; + +	list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); +	while (list != PENDING_TAIL) { +		void (*func)(struct perf_pending_entry *); +		struct perf_pending_entry *entry = list; + +		list = list->next; + +		func = entry->func; +		entry->next = NULL; +		/* +		 * Ensure we observe the unqueue before we issue the wakeup, +		 * so that we won't be waiting forever. +		 * -- see perf_not_pending(). +		 */ +		smp_wmb(); + +		func(entry); +		nr++; +	} + +	return nr; +} + +static inline int perf_not_pending(struct perf_counter *counter) +{ +	/* +	 * If we flush on whatever cpu we run, there is a chance we don't +	 * need to wait. +	 */ +	get_cpu(); +	__perf_pending_run(); +	put_cpu(); + +	/* +	 * Ensure we see the proper queue state before going to sleep +	 * so that we do not miss the wakeup. -- see perf_pending_handle() +	 */ +	smp_rmb(); +	return counter->pending.next == NULL; +} + +static void perf_pending_sync(struct perf_counter *counter) +{ +	wait_event(counter->waitq, perf_not_pending(counter)); +} + +void perf_counter_do_pending(void) +{ +	__perf_pending_run(); +} + +/* + * Callchain support -- arch specific + */ + +__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ +	return NULL; +} + +/* + * Output + */ + +struct perf_output_handle { +	struct perf_counter	*counter; +	struct perf_mmap_data	*data; +	unsigned long		head; +	unsigned long		offset; +	int			nmi; +	int			overflow; +	int			locked; +	unsigned long		flags; +}; + +static void perf_output_wakeup(struct perf_output_handle *handle) +{ +	atomic_set(&handle->data->poll, POLL_IN); + +	if (handle->nmi) { +		handle->counter->pending_wakeup = 1; +		perf_pending_queue(&handle->counter->pending, +				   perf_pending_counter); +	} else +		perf_counter_wakeup(handle->counter); +} + +/* + * Curious locking construct. + * + * We need to ensure a later event doesn't publish a head when a former + * event isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * What we do is serialize between CPUs so we only have to deal with NMI + * nesting on a single CPU. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event completes. 
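+ *
+ * A sketch of the intended behaviour on one CPU (no new semantics,
+ * just the code below in sequence):
+ *
+ *	perf_output_lock()	 - takes data->lock, handle->locked = 1
+ *	  <NMI>
+ *	    perf_output_lock()	 - lock is already ours, locked = 0
+ *	    perf_output_unlock() - records done_head, publishes nothing
+ *	  </NMI>
+ *	perf_output_unlock()	 - publishes user_page->data_head, wakeup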
+ */ +static void perf_output_lock(struct perf_output_handle *handle) +{ +	struct perf_mmap_data *data = handle->data; +	int cpu; + +	handle->locked = 0; + +	local_irq_save(handle->flags); +	cpu = smp_processor_id(); + +	if (in_nmi() && atomic_read(&data->lock) == cpu) +		return; + +	while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) +		cpu_relax(); + +	handle->locked = 1; +} + +static void perf_output_unlock(struct perf_output_handle *handle) +{ +	struct perf_mmap_data *data = handle->data; +	unsigned long head; +	int cpu; + +	data->done_head = data->head; + +	if (!handle->locked) +		goto out; + +again: +	/* +	 * The xchg implies a full barrier that ensures all writes are done +	 * before we publish the new head, matched by a rmb() in userspace when +	 * reading this position. +	 */ +	while ((head = atomic_long_xchg(&data->done_head, 0))) +		data->user_page->data_head = head; + +	/* +	 * NMI can happen here, which means we can miss a done_head update. +	 */ + +	cpu = atomic_xchg(&data->lock, -1); +	WARN_ON_ONCE(cpu != smp_processor_id()); + +	/* +	 * Therefore we have to validate we did not indeed do so. +	 */ +	if (unlikely(atomic_long_read(&data->done_head))) { +		/* +		 * Since we had it locked, we can lock it again. +		 */ +		while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) +			cpu_relax(); + +		goto again; +	} + +	if (atomic_xchg(&data->wakeup, 0)) +		perf_output_wakeup(handle); +out: +	local_irq_restore(handle->flags); +} + +static int perf_output_begin(struct perf_output_handle *handle, +			     struct perf_counter *counter, unsigned int size, +			     int nmi, int overflow) +{ +	struct perf_mmap_data *data; +	unsigned int offset, head; + +	/* +	 * For inherited counters we send all the output towards the parent. +	 */ +	if (counter->parent) +		counter = counter->parent; + +	rcu_read_lock(); +	data = rcu_dereference(counter->data); +	if (!data) +		goto out; + +	handle->data	 = data; +	handle->counter	 = counter; +	handle->nmi	 = nmi; +	handle->overflow = overflow; + +	if (!data->nr_pages) +		goto fail; + +	perf_output_lock(handle); + +	do { +		offset = head = atomic_long_read(&data->head); +		head += size; +	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset); + +	handle->offset	= offset; +	handle->head	= head; + +	if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) +		atomic_set(&data->wakeup, 1); + +	return 0; + +fail: +	perf_output_wakeup(handle); +out: +	rcu_read_unlock(); + +	return -ENOSPC; +} + +static void perf_output_copy(struct perf_output_handle *handle, +			     const void *buf, unsigned int len) +{ +	unsigned int pages_mask; +	unsigned int offset; +	unsigned int size; +	void **pages; + +	offset		= handle->offset; +	pages_mask	= handle->data->nr_pages - 1; +	pages		= handle->data->data_pages; + +	do { +		unsigned int page_offset; +		int nr; + +		nr	    = (offset >> PAGE_SHIFT) & pages_mask; +		page_offset = offset & (PAGE_SIZE - 1); +		size	    = min_t(unsigned int, PAGE_SIZE - page_offset, len); + +		memcpy(pages[nr] + page_offset, buf, size); + +		len	    -= size; +		buf	    += size; +		offset	    += size; +	} while (len); + +	handle->offset = offset; + +	/* +	 * Check we didn't copy past our reservation window, taking the +	 * possible unsigned int wrap into account. 
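+	 *
+	 * E.g. head == 8 and offset == 12 gives head - offset == -4 when
+	 * read as a long, so the check below fires; any offset still
+	 * inside the reservation yields a non-negative difference, even
+	 * when the unsigned values themselves have wrapped.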
+	 */ +	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); +} + +#define perf_output_put(handle, x) \ +	perf_output_copy((handle), &(x), sizeof(x)) + +static void perf_output_end(struct perf_output_handle *handle) +{ +	struct perf_counter *counter = handle->counter; +	struct perf_mmap_data *data = handle->data; + +	int wakeup_events = counter->attr.wakeup_events; + +	if (handle->overflow && wakeup_events) { +		int events = atomic_inc_return(&data->events); +		if (events >= wakeup_events) { +			atomic_sub(wakeup_events, &data->events); +			atomic_set(&data->wakeup, 1); +		} +	} + +	perf_output_unlock(handle); +	rcu_read_unlock(); +} + +static u32 perf_counter_pid(struct perf_counter *counter, struct task_struct *p) +{ +	/* +	 * only top level counters have the pid namespace they were created in +	 */ +	if (counter->parent) +		counter = counter->parent; + +	return task_tgid_nr_ns(p, counter->ns); +} + +static u32 perf_counter_tid(struct perf_counter *counter, struct task_struct *p) +{ +	/* +	 * only top level counters have the pid namespace they were created in +	 */ +	if (counter->parent) +		counter = counter->parent; + +	return task_pid_nr_ns(p, counter->ns); +} + +static void perf_counter_output(struct perf_counter *counter, int nmi, +				struct perf_sample_data *data) +{ +	int ret; +	u64 sample_type = counter->attr.sample_type; +	struct perf_output_handle handle; +	struct perf_event_header header; +	u64 ip; +	struct { +		u32 pid, tid; +	} tid_entry; +	struct { +		u64 id; +		u64 counter; +	} group_entry; +	struct perf_callchain_entry *callchain = NULL; +	int callchain_size = 0; +	u64 time; +	struct { +		u32 cpu, reserved; +	} cpu_entry; + +	header.type = 0; +	header.size = sizeof(header); + +	header.misc = PERF_EVENT_MISC_OVERFLOW; +	header.misc |= perf_misc_flags(data->regs); + +	if (sample_type & PERF_SAMPLE_IP) { +		ip = perf_instruction_pointer(data->regs); +		header.type |= PERF_SAMPLE_IP; +		header.size += sizeof(ip); +	} + +	if (sample_type & PERF_SAMPLE_TID) { +		/* namespace issues */ +		tid_entry.pid = perf_counter_pid(counter, current); +		tid_entry.tid = perf_counter_tid(counter, current); + +		header.type |= PERF_SAMPLE_TID; +		header.size += sizeof(tid_entry); +	} + +	if (sample_type & PERF_SAMPLE_TIME) { +		/* +		 * Maybe do better on x86 and provide cpu_clock_nmi() +		 */ +		time = sched_clock(); + +		header.type |= PERF_SAMPLE_TIME; +		header.size += sizeof(u64); +	} + +	if (sample_type & PERF_SAMPLE_ADDR) { +		header.type |= PERF_SAMPLE_ADDR; +		header.size += sizeof(u64); +	} + +	if (sample_type & PERF_SAMPLE_ID) { +		header.type |= PERF_SAMPLE_ID; +		header.size += sizeof(u64); +	} + +	if (sample_type & PERF_SAMPLE_CPU) { +		header.type |= PERF_SAMPLE_CPU; +		header.size += sizeof(cpu_entry); + +		cpu_entry.cpu = raw_smp_processor_id(); +	} + +	if (sample_type & PERF_SAMPLE_PERIOD) { +		header.type |= PERF_SAMPLE_PERIOD; +		header.size += sizeof(u64); +	} + +	if (sample_type & PERF_SAMPLE_GROUP) { +		header.type |= PERF_SAMPLE_GROUP; +		header.size += sizeof(u64) + +			counter->nr_siblings * sizeof(group_entry); +	} + +	if (sample_type & PERF_SAMPLE_CALLCHAIN) { +		callchain = perf_callchain(data->regs); + +		if (callchain) { +			callchain_size = (1 + callchain->nr) * sizeof(u64); + +			header.type |= PERF_SAMPLE_CALLCHAIN; +			header.size += callchain_size; +		} +	} + +	ret = perf_output_begin(&handle, counter, header.size, nmi, 1); +	if (ret) +		return; + +	perf_output_put(&handle, header); + +	if (sample_type & PERF_SAMPLE_IP) +		perf_output_put(&handle, 
ip); + +	if (sample_type & PERF_SAMPLE_TID) +		perf_output_put(&handle, tid_entry); + +	if (sample_type & PERF_SAMPLE_TIME) +		perf_output_put(&handle, time); + +	if (sample_type & PERF_SAMPLE_ADDR) +		perf_output_put(&handle, data->addr); + +	if (sample_type & PERF_SAMPLE_ID) +		perf_output_put(&handle, counter->id); + +	if (sample_type & PERF_SAMPLE_CPU) +		perf_output_put(&handle, cpu_entry); + +	if (sample_type & PERF_SAMPLE_PERIOD) +		perf_output_put(&handle, data->period); + +	/* +	 * XXX PERF_SAMPLE_GROUP vs inherited counters seems difficult. +	 */ +	if (sample_type & PERF_SAMPLE_GROUP) { +		struct perf_counter *leader, *sub; +		u64 nr = counter->nr_siblings; + +		perf_output_put(&handle, nr); + +		leader = counter->group_leader; +		list_for_each_entry(sub, &leader->sibling_list, list_entry) { +			if (sub != counter) +				sub->pmu->read(sub); + +			group_entry.id = sub->id; +			group_entry.counter = atomic64_read(&sub->count); + +			perf_output_put(&handle, group_entry); +		} +	} + +	if (callchain) +		perf_output_copy(&handle, callchain, callchain_size); + +	perf_output_end(&handle); +} + +/* + * fork tracking + */ + +struct perf_fork_event { +	struct task_struct	*task; + +	struct { +		struct perf_event_header	header; + +		u32				pid; +		u32				ppid; +	} event; +}; + +static void perf_counter_fork_output(struct perf_counter *counter, +				     struct perf_fork_event *fork_event) +{ +	struct perf_output_handle handle; +	int size = fork_event->event.header.size; +	struct task_struct *task = fork_event->task; +	int ret = perf_output_begin(&handle, counter, size, 0, 0); + +	if (ret) +		return; + +	fork_event->event.pid = perf_counter_pid(counter, task); +	fork_event->event.ppid = perf_counter_pid(counter, task->real_parent); + +	perf_output_put(&handle, fork_event->event); +	perf_output_end(&handle); +} + +static int perf_counter_fork_match(struct perf_counter *counter) +{ +	if (counter->attr.comm || counter->attr.mmap) +		return 1; + +	return 0; +} + +static void perf_counter_fork_ctx(struct perf_counter_context *ctx, +				  struct perf_fork_event *fork_event) +{ +	struct perf_counter *counter; + +	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) +		return; + +	rcu_read_lock(); +	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { +		if (perf_counter_fork_match(counter)) +			perf_counter_fork_output(counter, fork_event); +	} +	rcu_read_unlock(); +} + +static void perf_counter_fork_event(struct perf_fork_event *fork_event) +{ +	struct perf_cpu_context *cpuctx; +	struct perf_counter_context *ctx; + +	cpuctx = &get_cpu_var(perf_cpu_context); +	perf_counter_fork_ctx(&cpuctx->ctx, fork_event); +	put_cpu_var(perf_cpu_context); + +	rcu_read_lock(); +	/* +	 * doesn't really matter which of the child contexts the +	 * events ends up in. 
+	 */ +	ctx = rcu_dereference(current->perf_counter_ctxp); +	if (ctx) +		perf_counter_fork_ctx(ctx, fork_event); +	rcu_read_unlock(); +} + +void perf_counter_fork(struct task_struct *task) +{ +	struct perf_fork_event fork_event; + +	if (!atomic_read(&nr_comm_counters) && +	    !atomic_read(&nr_mmap_counters)) +		return; + +	fork_event = (struct perf_fork_event){ +		.task	= task, +		.event  = { +			.header = { +				.type = PERF_EVENT_FORK, +				.size = sizeof(fork_event.event), +			}, +		}, +	}; + +	perf_counter_fork_event(&fork_event); +} + +/* + * comm tracking + */ + +struct perf_comm_event { +	struct task_struct	*task; +	char			*comm; +	int			comm_size; + +	struct { +		struct perf_event_header	header; + +		u32				pid; +		u32				tid; +	} event; +}; + +static void perf_counter_comm_output(struct perf_counter *counter, +				     struct perf_comm_event *comm_event) +{ +	struct perf_output_handle handle; +	int size = comm_event->event.header.size; +	int ret = perf_output_begin(&handle, counter, size, 0, 0); + +	if (ret) +		return; + +	comm_event->event.pid = perf_counter_pid(counter, comm_event->task); +	comm_event->event.tid = perf_counter_tid(counter, comm_event->task); + +	perf_output_put(&handle, comm_event->event); +	perf_output_copy(&handle, comm_event->comm, +				   comm_event->comm_size); +	perf_output_end(&handle); +} + +static int perf_counter_comm_match(struct perf_counter *counter) +{ +	if (counter->attr.comm) +		return 1; + +	return 0; +} + +static void perf_counter_comm_ctx(struct perf_counter_context *ctx, +				  struct perf_comm_event *comm_event) +{ +	struct perf_counter *counter; + +	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) +		return; + +	rcu_read_lock(); +	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { +		if (perf_counter_comm_match(counter)) +			perf_counter_comm_output(counter, comm_event); +	} +	rcu_read_unlock(); +} + +static void perf_counter_comm_event(struct perf_comm_event *comm_event) +{ +	struct perf_cpu_context *cpuctx; +	struct perf_counter_context *ctx; +	unsigned int size; +	char *comm = comm_event->task->comm; + +	size = ALIGN(strlen(comm)+1, sizeof(u64)); + +	comm_event->comm = comm; +	comm_event->comm_size = size; + +	comm_event->event.header.size = sizeof(comm_event->event) + size; + +	cpuctx = &get_cpu_var(perf_cpu_context); +	perf_counter_comm_ctx(&cpuctx->ctx, comm_event); +	put_cpu_var(perf_cpu_context); + +	rcu_read_lock(); +	/* +	 * doesn't really matter which of the child contexts the +	 * events ends up in. 
+	 */ +	ctx = rcu_dereference(current->perf_counter_ctxp); +	if (ctx) +		perf_counter_comm_ctx(ctx, comm_event); +	rcu_read_unlock(); +} + +void perf_counter_comm(struct task_struct *task) +{ +	struct perf_comm_event comm_event; + +	if (!atomic_read(&nr_comm_counters)) +		return; + +	comm_event = (struct perf_comm_event){ +		.task	= task, +		.event  = { +			.header = { .type = PERF_EVENT_COMM, }, +		}, +	}; + +	perf_counter_comm_event(&comm_event); +} + +/* + * mmap tracking + */ + +struct perf_mmap_event { +	struct vm_area_struct	*vma; + +	const char		*file_name; +	int			file_size; + +	struct { +		struct perf_event_header	header; + +		u32				pid; +		u32				tid; +		u64				start; +		u64				len; +		u64				pgoff; +	} event; +}; + +static void perf_counter_mmap_output(struct perf_counter *counter, +				     struct perf_mmap_event *mmap_event) +{ +	struct perf_output_handle handle; +	int size = mmap_event->event.header.size; +	int ret = perf_output_begin(&handle, counter, size, 0, 0); + +	if (ret) +		return; + +	mmap_event->event.pid = perf_counter_pid(counter, current); +	mmap_event->event.tid = perf_counter_tid(counter, current); + +	perf_output_put(&handle, mmap_event->event); +	perf_output_copy(&handle, mmap_event->file_name, +				   mmap_event->file_size); +	perf_output_end(&handle); +} + +static int perf_counter_mmap_match(struct perf_counter *counter, +				   struct perf_mmap_event *mmap_event) +{ +	if (counter->attr.mmap) +		return 1; + +	return 0; +} + +static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, +				  struct perf_mmap_event *mmap_event) +{ +	struct perf_counter *counter; + +	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) +		return; + +	rcu_read_lock(); +	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { +		if (perf_counter_mmap_match(counter, mmap_event)) +			perf_counter_mmap_output(counter, mmap_event); +	} +	rcu_read_unlock(); +} + +static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) +{ +	struct perf_cpu_context *cpuctx; +	struct perf_counter_context *ctx; +	struct vm_area_struct *vma = mmap_event->vma; +	struct file *file = vma->vm_file; +	unsigned int size; +	char tmp[16]; +	char *buf = NULL; +	const char *name; + +	if (file) { +		buf = kzalloc(PATH_MAX, GFP_KERNEL); +		if (!buf) { +			name = strncpy(tmp, "//enomem", sizeof(tmp)); +			goto got_name; +		} +		name = d_path(&file->f_path, buf, PATH_MAX); +		if (IS_ERR(name)) { +			name = strncpy(tmp, "//toolong", sizeof(tmp)); +			goto got_name; +		} +	} else { +		name = arch_vma_name(mmap_event->vma); +		if (name) +			goto got_name; + +		if (!vma->vm_mm) { +			name = strncpy(tmp, "[vdso]", sizeof(tmp)); +			goto got_name; +		} + +		name = strncpy(tmp, "//anon", sizeof(tmp)); +		goto got_name; +	} + +got_name: +	size = ALIGN(strlen(name)+1, sizeof(u64)); + +	mmap_event->file_name = name; +	mmap_event->file_size = size; + +	mmap_event->event.header.size = sizeof(mmap_event->event) + size; + +	cpuctx = &get_cpu_var(perf_cpu_context); +	perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); +	put_cpu_var(perf_cpu_context); + +	rcu_read_lock(); +	/* +	 * doesn't really matter which of the child contexts the +	 * events ends up in. 
+	 */ +	ctx = rcu_dereference(current->perf_counter_ctxp); +	if (ctx) +		perf_counter_mmap_ctx(ctx, mmap_event); +	rcu_read_unlock(); + +	kfree(buf); +} + +void __perf_counter_mmap(struct vm_area_struct *vma) +{ +	struct perf_mmap_event mmap_event; + +	if (!atomic_read(&nr_mmap_counters)) +		return; + +	mmap_event = (struct perf_mmap_event){ +		.vma	= vma, +		.event  = { +			.header = { .type = PERF_EVENT_MMAP, }, +			.start  = vma->vm_start, +			.len    = vma->vm_end - vma->vm_start, +			.pgoff  = vma->vm_pgoff, +		}, +	}; + +	perf_counter_mmap_event(&mmap_event); +} + +/* + * Log sample_period changes so that analyzing tools can re-normalize the + * event flow. + */ + +struct freq_event { +	struct perf_event_header	header; +	u64				time; +	u64				id; +	u64				period; +}; + +static void perf_log_period(struct perf_counter *counter, u64 period) +{ +	struct perf_output_handle handle; +	struct freq_event event; +	int ret; + +	if (counter->hw.sample_period == period) +		return; + +	if (counter->attr.sample_type & PERF_SAMPLE_PERIOD) +		return; + +	event = (struct freq_event) { +		.header = { +			.type = PERF_EVENT_PERIOD, +			.misc = 0, +			.size = sizeof(event), +		}, +		.time = sched_clock(), +		.id = counter->id, +		.period = period, +	}; + +	ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0); +	if (ret) +		return; + +	perf_output_put(&handle, event); +	perf_output_end(&handle); +} + +/* + * IRQ throttle logging + */ + +static void perf_log_throttle(struct perf_counter *counter, int enable) +{ +	struct perf_output_handle handle; +	int ret; + +	struct { +		struct perf_event_header	header; +		u64				time; +		u64				id; +	} throttle_event = { +		.header = { +			.type = PERF_EVENT_THROTTLE + 1, +			.misc = 0, +			.size = sizeof(throttle_event), +		}, +		.time	= sched_clock(), +		.id	= counter->id, +	}; + +	ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); +	if (ret) +		return; + +	perf_output_put(&handle, throttle_event); +	perf_output_end(&handle); +} + +/* + * Generic counter overflow handling. 
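+ *
+ * A worked example of the throttling check below: assuming HZ == 1000
+ * and sysctl_perf_counter_sample_rate == 100000, a counter may take up
+ * to 100 overflow interrupts per tick; on the 101st, HZ * interrupts
+ * exceeds the sample rate and the counter is throttled until the next
+ * unthrottle.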
+ */ + +int perf_counter_overflow(struct perf_counter *counter, int nmi, +			  struct perf_sample_data *data) +{ +	int events = atomic_read(&counter->event_limit); +	int throttle = counter->pmu->unthrottle != NULL; +	struct hw_perf_counter *hwc = &counter->hw; +	int ret = 0; + +	if (!throttle) { +		hwc->interrupts++; +	} else { +		if (hwc->interrupts != MAX_INTERRUPTS) { +			hwc->interrupts++; +			if (HZ * hwc->interrupts > +					(u64)sysctl_perf_counter_sample_rate) { +				hwc->interrupts = MAX_INTERRUPTS; +				perf_log_throttle(counter, 0); +				ret = 1; +			} +		} else { +			/* +			 * Keep re-disabling counters even though on the previous +			 * pass we disabled it - just in case we raced with a +			 * sched-in and the counter got enabled again: +			 */ +			ret = 1; +		} +	} + +	if (counter->attr.freq) { +		u64 now = sched_clock(); +		s64 delta = now - hwc->freq_stamp; + +		hwc->freq_stamp = now; + +		if (delta > 0 && delta < TICK_NSEC) +			perf_adjust_period(counter, NSEC_PER_SEC / (int)delta); +	} + +	/* +	 * XXX event_limit might not quite work as expected on inherited +	 * counters +	 */ + +	counter->pending_kill = POLL_IN; +	if (events && atomic_dec_and_test(&counter->event_limit)) { +		ret = 1; +		counter->pending_kill = POLL_HUP; +		if (nmi) { +			counter->pending_disable = 1; +			perf_pending_queue(&counter->pending, +					   perf_pending_counter); +		} else +			perf_counter_disable(counter); +	} + +	perf_counter_output(counter, nmi, data); +	return ret; +} + +/* + * Generic software counter infrastructure + */ + +static void perf_swcounter_update(struct perf_counter *counter) +{ +	struct hw_perf_counter *hwc = &counter->hw; +	u64 prev, now; +	s64 delta; + +again: +	prev = atomic64_read(&hwc->prev_count); +	now = atomic64_read(&hwc->count); +	if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) +		goto again; + +	delta = now - prev; + +	atomic64_add(delta, &counter->count); +	atomic64_sub(delta, &hwc->period_left); +} + +static void perf_swcounter_set_period(struct perf_counter *counter) +{ +	struct hw_perf_counter *hwc = &counter->hw; +	s64 left = atomic64_read(&hwc->period_left); +	s64 period = hwc->sample_period; + +	if (unlikely(left <= -period)) { +		left = period; +		atomic64_set(&hwc->period_left, left); +		hwc->last_period = period; +	} + +	if (unlikely(left <= 0)) { +		left += period; +		atomic64_add(period, &hwc->period_left); +		hwc->last_period = period; +	} + +	atomic64_set(&hwc->prev_count, -left); +	atomic64_set(&hwc->count, -left); +} + +static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +{ +	enum hrtimer_restart ret = HRTIMER_RESTART; +	struct perf_sample_data data; +	struct perf_counter *counter; +	u64 period; + +	counter	= container_of(hrtimer, struct perf_counter, hw.hrtimer); +	counter->pmu->read(counter); + +	data.addr = 0; +	data.regs = get_irq_regs(); +	/* +	 * In case we exclude kernel IPs or are somehow not in interrupt +	 * context, provide the next best thing, the user IP. 
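+	 *
+	 * That is: if kernel IPs are excluded, or we have no interrupt
+	 * regs at all, fall back to task_pt_regs(current) - unless user
+	 * IPs are excluded too, in which case data.regs may remain NULL
+	 * and the overflow below is simply skipped.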
+	 */ +	if ((counter->attr.exclude_kernel || !data.regs) && +			!counter->attr.exclude_user) +		data.regs = task_pt_regs(current); + +	if (data.regs) { +		if (perf_counter_overflow(counter, 0, &data)) +			ret = HRTIMER_NORESTART; +	} + +	period = max_t(u64, 10000, counter->hw.sample_period); +	hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + +	return ret; +} + +static void perf_swcounter_overflow(struct perf_counter *counter, +				    int nmi, struct pt_regs *regs, u64 addr) +{ +	struct perf_sample_data data = { +		.regs	= regs, +		.addr	= addr, +		.period	= counter->hw.last_period, +	}; + +	perf_swcounter_update(counter); +	perf_swcounter_set_period(counter); +	if (perf_counter_overflow(counter, nmi, &data)) +		/* soft-disable the counter */ +		; + +} + +static int perf_swcounter_is_counting(struct perf_counter *counter) +{ +	struct perf_counter_context *ctx; +	unsigned long flags; +	int count; + +	if (counter->state == PERF_COUNTER_STATE_ACTIVE) +		return 1; + +	if (counter->state != PERF_COUNTER_STATE_INACTIVE) +		return 0; + +	/* +	 * If the counter is inactive, it could be just because +	 * its task is scheduled out, or because it's in a group +	 * which could not go on the PMU.  We want to count in +	 * the first case but not the second.  If the context is +	 * currently active then an inactive software counter must +	 * be the second case.  If it's not currently active then +	 * we need to know whether the counter was active when the +	 * context was last active, which we can determine by +	 * comparing counter->tstamp_stopped with ctx->time. +	 * +	 * We are within an RCU read-side critical section, +	 * which protects the existence of *ctx. +	 */ +	ctx = counter->ctx; +	spin_lock_irqsave(&ctx->lock, flags); +	count = 1; +	/* Re-check state now we have the lock */ +	if (counter->state < PERF_COUNTER_STATE_INACTIVE || +	    counter->ctx->is_active || +	    counter->tstamp_stopped < ctx->time) +		count = 0; +	spin_unlock_irqrestore(&ctx->lock, flags); +	return count; +} + +static int perf_swcounter_match(struct perf_counter *counter, +				enum perf_type_id type, +				u32 event, struct pt_regs *regs) +{ +	if (!perf_swcounter_is_counting(counter)) +		return 0; + +	if (counter->attr.type != type) +		return 0; +	if (counter->attr.config != event) +		return 0; + +	if (regs) { +		if (counter->attr.exclude_user && user_mode(regs)) +			return 0; + +		if (counter->attr.exclude_kernel && !user_mode(regs)) +			return 0; +	} + +	return 1; +} + +static void perf_swcounter_add(struct perf_counter *counter, u64 nr, +			       int nmi, struct pt_regs *regs, u64 addr) +{ +	int neg = atomic64_add_negative(nr, &counter->hw.count); + +	if (counter->hw.sample_period && !neg && regs) +		perf_swcounter_overflow(counter, nmi, regs, addr); +} + +static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, +				     enum perf_type_id type, u32 event, +				     u64 nr, int nmi, struct pt_regs *regs, +				     u64 addr) +{ +	struct perf_counter *counter; + +	if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) +		return; + +	rcu_read_lock(); +	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { +		if (perf_swcounter_match(counter, type, event, regs)) +			perf_swcounter_add(counter, nr, nmi, regs, addr); +	} +	rcu_read_unlock(); +} + +static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) +{ +	if (in_nmi()) +		return &cpuctx->recursion[3]; + +	if (in_irq()) +		return &cpuctx->recursion[2]; + +	if (in_softirq()) +		return &cpuctx->recursion[1]; + +	
return &cpuctx->recursion[0]; +} + +static void __perf_swcounter_event(enum perf_type_id type, u32 event, +				   u64 nr, int nmi, struct pt_regs *regs, +				   u64 addr) +{ +	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); +	int *recursion = perf_swcounter_recursion_context(cpuctx); +	struct perf_counter_context *ctx; + +	if (*recursion) +		goto out; + +	(*recursion)++; +	barrier(); + +	perf_swcounter_ctx_event(&cpuctx->ctx, type, event, +				 nr, nmi, regs, addr); +	rcu_read_lock(); +	/* +	 * doesn't really matter which of the child contexts the +	 * events ends up in. +	 */ +	ctx = rcu_dereference(current->perf_counter_ctxp); +	if (ctx) +		perf_swcounter_ctx_event(ctx, type, event, nr, nmi, regs, addr); +	rcu_read_unlock(); + +	barrier(); +	(*recursion)--; + +out: +	put_cpu_var(perf_cpu_context); +} + +void +perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) +{ +	__perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); +} + +static void perf_swcounter_read(struct perf_counter *counter) +{ +	perf_swcounter_update(counter); +} + +static int perf_swcounter_enable(struct perf_counter *counter) +{ +	perf_swcounter_set_period(counter); +	return 0; +} + +static void perf_swcounter_disable(struct perf_counter *counter) +{ +	perf_swcounter_update(counter); +} + +static const struct pmu perf_ops_generic = { +	.enable		= perf_swcounter_enable, +	.disable	= perf_swcounter_disable, +	.read		= perf_swcounter_read, +}; + +/* + * Software counter: cpu wall time clock + */ + +static void cpu_clock_perf_counter_update(struct perf_counter *counter) +{ +	int cpu = raw_smp_processor_id(); +	s64 prev; +	u64 now; + +	now = cpu_clock(cpu); +	prev = atomic64_read(&counter->hw.prev_count); +	atomic64_set(&counter->hw.prev_count, now); +	atomic64_add(now - prev, &counter->count); +} + +static int cpu_clock_perf_counter_enable(struct perf_counter *counter) +{ +	struct hw_perf_counter *hwc = &counter->hw; +	int cpu = raw_smp_processor_id(); + +	atomic64_set(&hwc->prev_count, cpu_clock(cpu)); +	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	hwc->hrtimer.function = perf_swcounter_hrtimer; +	if (hwc->sample_period) { +		u64 period = max_t(u64, 10000, hwc->sample_period); +		__hrtimer_start_range_ns(&hwc->hrtimer, +				ns_to_ktime(period), 0, +				HRTIMER_MODE_REL, 0); +	} + +	return 0; +} + +static void cpu_clock_perf_counter_disable(struct perf_counter *counter) +{ +	if (counter->hw.sample_period) +		hrtimer_cancel(&counter->hw.hrtimer); +	cpu_clock_perf_counter_update(counter); +} + +static void cpu_clock_perf_counter_read(struct perf_counter *counter) +{ +	cpu_clock_perf_counter_update(counter); +} + +static const struct pmu perf_ops_cpu_clock = { +	.enable		= cpu_clock_perf_counter_enable, +	.disable	= cpu_clock_perf_counter_disable, +	.read		= cpu_clock_perf_counter_read, +}; + +/* + * Software counter: task time clock + */ + +static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) +{ +	u64 prev; +	s64 delta; + +	prev = atomic64_xchg(&counter->hw.prev_count, now); +	delta = now - prev; +	atomic64_add(delta, &counter->count); +} + +static int task_clock_perf_counter_enable(struct perf_counter *counter) +{ +	struct hw_perf_counter *hwc = &counter->hw; +	u64 now; + +	now = counter->ctx->time; + +	atomic64_set(&hwc->prev_count, now); +	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); +	hwc->hrtimer.function = perf_swcounter_hrtimer; +	if (hwc->sample_period) { +		u64 period = max_t(u64, 10000, 
hwc->sample_period); +		__hrtimer_start_range_ns(&hwc->hrtimer, +				ns_to_ktime(period), 0, +				HRTIMER_MODE_REL, 0); +	} + +	return 0; +} + +static void task_clock_perf_counter_disable(struct perf_counter *counter) +{ +	if (counter->hw.sample_period) +		hrtimer_cancel(&counter->hw.hrtimer); +	task_clock_perf_counter_update(counter, counter->ctx->time); + +} + +static void task_clock_perf_counter_read(struct perf_counter *counter) +{ +	u64 time; + +	if (!in_nmi()) { +		update_context_time(counter->ctx); +		time = counter->ctx->time; +	} else { +		u64 now = perf_clock(); +		u64 delta = now - counter->ctx->timestamp; +		time = counter->ctx->time + delta; +	} + +	task_clock_perf_counter_update(counter, time); +} + +static const struct pmu perf_ops_task_clock = { +	.enable		= task_clock_perf_counter_enable, +	.disable	= task_clock_perf_counter_disable, +	.read		= task_clock_perf_counter_read, +}; + +/* + * Software counter: cpu migrations + */ +void perf_counter_task_migration(struct task_struct *task, int cpu) +{ +	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); +	struct perf_counter_context *ctx; + +	perf_swcounter_ctx_event(&cpuctx->ctx, PERF_TYPE_SOFTWARE, +				 PERF_COUNT_SW_CPU_MIGRATIONS, +				 1, 1, NULL, 0); + +	ctx = perf_pin_task_context(task); +	if (ctx) { +		perf_swcounter_ctx_event(ctx, PERF_TYPE_SOFTWARE, +					 PERF_COUNT_SW_CPU_MIGRATIONS, +					 1, 1, NULL, 0); +		perf_unpin_context(ctx); +	} +} + +#ifdef CONFIG_EVENT_PROFILE +void perf_tpcounter_event(int event_id) +{ +	struct pt_regs *regs = get_irq_regs(); + +	if (!regs) +		regs = task_pt_regs(current); + +	__perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); +} +EXPORT_SYMBOL_GPL(perf_tpcounter_event); + +extern int ftrace_profile_enable(int); +extern void ftrace_profile_disable(int); + +static void tp_perf_counter_destroy(struct perf_counter *counter) +{ +	ftrace_profile_disable(perf_event_id(&counter->attr)); +} + +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) +{ +	int event_id = perf_event_id(&counter->attr); +	int ret; + +	ret = ftrace_profile_enable(event_id); +	if (ret) +		return NULL; + +	counter->destroy = tp_perf_counter_destroy; + +	return &perf_ops_generic; +} +#else +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) +{ +	return NULL; +} +#endif + +static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) +{ +	const struct pmu *pmu = NULL; + +	/* +	 * Software counters (currently) can't in general distinguish +	 * between user, kernel and hypervisor events. +	 * However, context switches and cpu migrations are considered +	 * to be kernel events, and page faults are never hypervisor +	 * events. +	 */ +	switch (counter->attr.config) { +	case PERF_COUNT_SW_CPU_CLOCK: +		pmu = &perf_ops_cpu_clock; + +		break; +	case PERF_COUNT_SW_TASK_CLOCK: +		/* +		 * If the user instantiates this as a per-cpu counter, +		 * use the cpu_clock counter instead. 
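+		 *
+		 * (A per-cpu counter, e.g. one opened with pid == -1 and
+		 * cpu == 0, has a context with ctx->task == NULL, which
+		 * is what the check below keys off.)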
+		 */ +		if (counter->ctx->task) +			pmu = &perf_ops_task_clock; +		else +			pmu = &perf_ops_cpu_clock; + +		break; +	case PERF_COUNT_SW_PAGE_FAULTS: +	case PERF_COUNT_SW_PAGE_FAULTS_MIN: +	case PERF_COUNT_SW_PAGE_FAULTS_MAJ: +	case PERF_COUNT_SW_CONTEXT_SWITCHES: +	case PERF_COUNT_SW_CPU_MIGRATIONS: +		pmu = &perf_ops_generic; +		break; +	} + +	return pmu; +} + +/* + * Allocate and initialize a counter structure + */ +static struct perf_counter * +perf_counter_alloc(struct perf_counter_attr *attr, +		   int cpu, +		   struct perf_counter_context *ctx, +		   struct perf_counter *group_leader, +		   gfp_t gfpflags) +{ +	const struct pmu *pmu; +	struct perf_counter *counter; +	struct hw_perf_counter *hwc; +	long err; + +	counter = kzalloc(sizeof(*counter), gfpflags); +	if (!counter) +		return ERR_PTR(-ENOMEM); + +	/* +	 * Single counters are their own group leaders, with an +	 * empty sibling list: +	 */ +	if (!group_leader) +		group_leader = counter; + +	mutex_init(&counter->child_mutex); +	INIT_LIST_HEAD(&counter->child_list); + +	INIT_LIST_HEAD(&counter->list_entry); +	INIT_LIST_HEAD(&counter->event_entry); +	INIT_LIST_HEAD(&counter->sibling_list); +	init_waitqueue_head(&counter->waitq); + +	mutex_init(&counter->mmap_mutex); + +	counter->cpu		= cpu; +	counter->attr		= *attr; +	counter->group_leader	= group_leader; +	counter->pmu		= NULL; +	counter->ctx		= ctx; +	counter->oncpu		= -1; + +	counter->ns		= get_pid_ns(current->nsproxy->pid_ns); +	counter->id		= atomic64_inc_return(&perf_counter_id); + +	counter->state		= PERF_COUNTER_STATE_INACTIVE; + +	if (attr->disabled) +		counter->state = PERF_COUNTER_STATE_OFF; + +	pmu = NULL; + +	hwc = &counter->hw; +	hwc->sample_period = attr->sample_period; +	if (attr->freq && attr->sample_freq) +		hwc->sample_period = 1; + +	atomic64_set(&hwc->period_left, hwc->sample_period); + +	/* +	 * we currently do not support PERF_SAMPLE_GROUP on inherited counters +	 */ +	if (attr->inherit && (attr->sample_type & PERF_SAMPLE_GROUP)) +		goto done; + +	if (attr->type == PERF_TYPE_RAW) { +		pmu = hw_perf_counter_init(counter); +		goto done; +	} + +	switch (attr->type) { +	case PERF_TYPE_HARDWARE: +	case PERF_TYPE_HW_CACHE: +		pmu = hw_perf_counter_init(counter); +		break; + +	case PERF_TYPE_SOFTWARE: +		pmu = sw_perf_counter_init(counter); +		break; + +	case PERF_TYPE_TRACEPOINT: +		pmu = tp_perf_counter_init(counter); +		break; +	} +done: +	err = 0; +	if (!pmu) +		err = -EINVAL; +	else if (IS_ERR(pmu)) +		err = PTR_ERR(pmu); + +	if (err) { +		if (counter->ns) +			put_pid_ns(counter->ns); +		kfree(counter); +		return ERR_PTR(err); +	} + +	counter->pmu = pmu; + +	atomic_inc(&nr_counters); +	if (counter->attr.mmap) +		atomic_inc(&nr_mmap_counters); +	if (counter->attr.comm) +		atomic_inc(&nr_comm_counters); + +	return counter; +} + +/** + * sys_perf_counter_open - open a performance counter, associate it to a task/cpu + * + * @attr_uptr:	event type attributes for monitoring/sampling + * @pid:		target pid + * @cpu:		target cpu + * @group_fd:		group leader counter fd + */ +SYSCALL_DEFINE5(perf_counter_open, +		const struct perf_counter_attr __user *, attr_uptr, +		pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) +{ +	struct perf_counter *counter, *group_leader; +	struct perf_counter_attr attr; +	struct perf_counter_context *ctx; +	struct file *counter_file = NULL; +	struct file *group_file = NULL; +	int fput_needed = 0; +	int fput_needed2 = 0; +	int ret; + +	/* for future expandability... 
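+	 *
+	 * A minimal caller, as a sketch (there is no glibc wrapper, so
+	 * this assumes __NR_perf_counter_open is wired up on the target
+	 * architecture):
+	 *
+	 *	struct perf_counter_attr attr = {
+	 *		.type	= PERF_TYPE_HARDWARE,
+	 *		.config	= PERF_COUNT_HW_CPU_CYCLES,
+	 *	};
+	 *	int fd = syscall(__NR_perf_counter_open, &attr, 0, -1, -1, 0);
+	 *
+	 * pid 0 means the current task, cpu -1 means any cpu, group_fd -1
+	 * means no group leader, and flags must be 0: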
+	 */
+	if (flags)
+		return -EINVAL;
+
+	if (copy_from_user(&attr, attr_uptr, sizeof(attr)) != 0)
+		return -EFAULT;
+
+	if (!attr.exclude_kernel) {
+		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+			return -EACCES;
+	}
+
+	if (attr.freq) {
+		if (attr.sample_freq > sysctl_perf_counter_sample_rate)
+			return -EINVAL;
+	}
+
+	/*
+	 * Get the target context (task or percpu):
+	 */
+	ctx = find_get_context(pid, cpu);
+	if (IS_ERR(ctx))
+		return PTR_ERR(ctx);
+
+	/*
+	 * Look up the group leader (we will attach this counter to it):
+	 */
+	group_leader = NULL;
+	if (group_fd != -1) {
+		ret = -EINVAL;
+		group_file = fget_light(group_fd, &fput_needed);
+		if (!group_file)
+			goto err_put_context;
+		if (group_file->f_op != &perf_fops)
+			goto err_put_context;
+
+		group_leader = group_file->private_data;
+		/*
+		 * Do not allow a recursive hierarchy (this new sibling
+		 * becoming part of another group-sibling):
+		 */
+		if (group_leader->group_leader != group_leader)
+			goto err_put_context;
+		/*
+		 * Do not allow attaching to a group in a different
+		 * task or CPU context:
+		 */
+		if (group_leader->ctx != ctx)
+			goto err_put_context;
+		/*
+		 * Only a group leader can be exclusive or pinned
+		 */
+		if (attr.exclusive || attr.pinned)
+			goto err_put_context;
+	}
+
+	counter = perf_counter_alloc(&attr, cpu, ctx, group_leader,
+				     GFP_KERNEL);
+	ret = PTR_ERR(counter);
+	if (IS_ERR(counter))
+		goto err_put_context;
+
+	ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0);
+	if (ret < 0)
+		goto err_free_put_context;
+
+	counter_file = fget_light(ret, &fput_needed2);
+	if (!counter_file)
+		goto err_free_put_context;
+
+	counter->filp = counter_file;
+	WARN_ON_ONCE(ctx->parent_ctx);
+	mutex_lock(&ctx->mutex);
+	perf_install_in_context(ctx, counter, cpu);
+	++ctx->generation;
+	mutex_unlock(&ctx->mutex);
+
+	counter->owner = current;
+	get_task_struct(current);
+	mutex_lock(&current->perf_counter_mutex);
+	list_add_tail(&counter->owner_entry, &current->perf_counter_list);
+	mutex_unlock(&current->perf_counter_mutex);
+
+	fput_light(counter_file, fput_needed2);
+
+out_fput:
+	fput_light(group_file, fput_needed);
+
+	return ret;
+
+err_free_put_context:
+	kfree(counter);
+
+err_put_context:
+	put_ctx(ctx);
+
+	goto out_fput;
+}
+
+/*
+ * inherit a counter from parent task to child task:
+ */
+static struct perf_counter *
+inherit_counter(struct perf_counter *parent_counter,
+	      struct task_struct *parent,
+	      struct perf_counter_context *parent_ctx,
+	      struct task_struct *child,
+	      struct perf_counter *group_leader,
+	      struct perf_counter_context *child_ctx)
+{
+	struct perf_counter *child_counter;
+
+	/*
+	 * Instead of creating recursive hierarchies of counters,
+	 * we link inherited counters back to the original parent,
+	 * which has a filp for sure, which we use as the reference
+	 * count:
+	 */
+	if (parent_counter->parent)
+		parent_counter = parent_counter->parent;
+
+	child_counter = perf_counter_alloc(&parent_counter->attr,
+					   parent_counter->cpu, child_ctx,
+					   group_leader, GFP_KERNEL);
+	if (IS_ERR(child_counter))
+		return child_counter;
+	get_ctx(child_ctx);
+
+	/*
+	 * Make the child state follow the state of the parent counter,
+	 * not its attr.disabled bit.  We hold the parent's mutex,
+	 * so we won't race with perf_counter_{en, dis}able_family.
+	 */ +	if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) +		child_counter->state = PERF_COUNTER_STATE_INACTIVE; +	else +		child_counter->state = PERF_COUNTER_STATE_OFF; + +	if (parent_counter->attr.freq) +		child_counter->hw.sample_period = parent_counter->hw.sample_period; + +	/* +	 * Link it up in the child's context: +	 */ +	add_counter_to_ctx(child_counter, child_ctx); + +	child_counter->parent = parent_counter; +	/* +	 * inherit into child's child as well: +	 */ +	child_counter->attr.inherit = 1; + +	/* +	 * Get a reference to the parent filp - we will fput it +	 * when the child counter exits. This is safe to do because +	 * we are in the parent and we know that the filp still +	 * exists and has a nonzero count: +	 */ +	atomic_long_inc(&parent_counter->filp->f_count); + +	/* +	 * Link this into the parent counter's child list +	 */ +	WARN_ON_ONCE(parent_counter->ctx->parent_ctx); +	mutex_lock(&parent_counter->child_mutex); +	list_add_tail(&child_counter->child_list, &parent_counter->child_list); +	mutex_unlock(&parent_counter->child_mutex); + +	return child_counter; +} + +static int inherit_group(struct perf_counter *parent_counter, +	      struct task_struct *parent, +	      struct perf_counter_context *parent_ctx, +	      struct task_struct *child, +	      struct perf_counter_context *child_ctx) +{ +	struct perf_counter *leader; +	struct perf_counter *sub; +	struct perf_counter *child_ctr; + +	leader = inherit_counter(parent_counter, parent, parent_ctx, +				 child, NULL, child_ctx); +	if (IS_ERR(leader)) +		return PTR_ERR(leader); +	list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { +		child_ctr = inherit_counter(sub, parent, parent_ctx, +					    child, leader, child_ctx); +		if (IS_ERR(child_ctr)) +			return PTR_ERR(child_ctr); +	} +	return 0; +} + +static void sync_child_counter(struct perf_counter *child_counter, +			       struct perf_counter *parent_counter) +{ +	u64 child_val; + +	child_val = atomic64_read(&child_counter->count); + +	/* +	 * Add back the child's count to the parent's count: +	 */ +	atomic64_add(child_val, &parent_counter->count); +	atomic64_add(child_counter->total_time_enabled, +		     &parent_counter->child_total_time_enabled); +	atomic64_add(child_counter->total_time_running, +		     &parent_counter->child_total_time_running); + +	/* +	 * Remove this counter from the parent's list +	 */ +	WARN_ON_ONCE(parent_counter->ctx->parent_ctx); +	mutex_lock(&parent_counter->child_mutex); +	list_del_init(&child_counter->child_list); +	mutex_unlock(&parent_counter->child_mutex); + +	/* +	 * Release the parent counter, if this was the last +	 * reference to it. +	 */ +	fput(parent_counter->filp); +} + +static void +__perf_counter_exit_task(struct perf_counter *child_counter, +			 struct perf_counter_context *child_ctx) +{ +	struct perf_counter *parent_counter; + +	update_counter_times(child_counter); +	perf_counter_remove_from_context(child_counter); + +	parent_counter = child_counter->parent; +	/* +	 * It can happen that parent exits first, and has counters +	 * that are still around due to the child reference. These +	 * counters need to be zapped - but otherwise linger. +	 */ +	if (parent_counter) { +		sync_child_counter(child_counter, parent_counter); +		free_counter(child_counter); +	} +} + +/* + * When a child task exits, feed back counter values to parent counters. 
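+ *
+ * E.g. for one inherited counter the flow below is roughly:
+ *
+ *	child exits
+ *	  __perf_counter_exit_task()
+ *	    sync_child_counter()	- folds the child's count and
+ *					  times into the parent, drops
+ *					  the parent filp reference
+ *	    free_counter()		- frees the child counter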
+ */ +void perf_counter_exit_task(struct task_struct *child) +{ +	struct perf_counter *child_counter, *tmp; +	struct perf_counter_context *child_ctx; +	unsigned long flags; + +	if (likely(!child->perf_counter_ctxp)) +		return; + +	local_irq_save(flags); +	/* +	 * We can't reschedule here because interrupts are disabled, +	 * and either child is current or it is a task that can't be +	 * scheduled, so we are now safe from rescheduling changing +	 * our context. +	 */ +	child_ctx = child->perf_counter_ctxp; +	__perf_counter_task_sched_out(child_ctx); + +	/* +	 * Take the context lock here so that if find_get_context is +	 * reading child->perf_counter_ctxp, we wait until it has +	 * incremented the context's refcount before we do put_ctx below. +	 */ +	spin_lock(&child_ctx->lock); +	child->perf_counter_ctxp = NULL; +	if (child_ctx->parent_ctx) { +		/* +		 * This context is a clone; unclone it so it can't get +		 * swapped to another process while we're removing all +		 * the counters from it. +		 */ +		put_ctx(child_ctx->parent_ctx); +		child_ctx->parent_ctx = NULL; +	} +	spin_unlock(&child_ctx->lock); +	local_irq_restore(flags); + +	/* +	 * We can recurse on the same lock type through: +	 * +	 *   __perf_counter_exit_task() +	 *     sync_child_counter() +	 *       fput(parent_counter->filp) +	 *         perf_release() +	 *           mutex_lock(&ctx->mutex) +	 * +	 * But since its the parent context it won't be the same instance. +	 */ +	mutex_lock_nested(&child_ctx->mutex, SINGLE_DEPTH_NESTING); + +again: +	list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, +				 list_entry) +		__perf_counter_exit_task(child_counter, child_ctx); + +	/* +	 * If the last counter was a group counter, it will have appended all +	 * its siblings to the list, but we obtained 'tmp' before that which +	 * will still point to the list head terminating the iteration. +	 */ +	if (!list_empty(&child_ctx->counter_list)) +		goto again; + +	mutex_unlock(&child_ctx->mutex); + +	put_ctx(child_ctx); +} + +/* + * free an unexposed, unused context as created by inheritance by + * init_task below, used by fork() in case of fail. + */ +void perf_counter_free_task(struct task_struct *task) +{ +	struct perf_counter_context *ctx = task->perf_counter_ctxp; +	struct perf_counter *counter, *tmp; + +	if (!ctx) +		return; + +	mutex_lock(&ctx->mutex); +again: +	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) { +		struct perf_counter *parent = counter->parent; + +		if (WARN_ON_ONCE(!parent)) +			continue; + +		mutex_lock(&parent->child_mutex); +		list_del_init(&counter->child_list); +		mutex_unlock(&parent->child_mutex); + +		fput(parent->filp); + +		list_del_counter(counter, ctx); +		free_counter(counter); +	} + +	if (!list_empty(&ctx->counter_list)) +		goto again; + +	mutex_unlock(&ctx->mutex); + +	put_ctx(ctx); +} + +/* + * Initialize the perf_counter context in task_struct + */ +int perf_counter_init_task(struct task_struct *child) +{ +	struct perf_counter_context *child_ctx, *parent_ctx; +	struct perf_counter_context *cloned_ctx; +	struct perf_counter *counter; +	struct task_struct *parent = current; +	int inherited_all = 1; +	int ret = 0; + +	child->perf_counter_ctxp = NULL; + +	mutex_init(&child->perf_counter_mutex); +	INIT_LIST_HEAD(&child->perf_counter_list); + +	if (likely(!parent->perf_counter_ctxp)) +		return 0; + +	/* +	 * This is executed from the parent task context, so inherit +	 * counters that have been marked for cloning. 
+	 * First allocate and initialize a context for the child. +	 */ + +	child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); +	if (!child_ctx) +		return -ENOMEM; + +	__perf_counter_init_context(child_ctx, child); +	child->perf_counter_ctxp = child_ctx; +	get_task_struct(child); + +	/* +	 * If the parent's context is a clone, pin it so it won't get +	 * swapped under us. +	 */ +	parent_ctx = perf_pin_task_context(parent); + +	/* +	 * No need to check if parent_ctx != NULL here; since we saw +	 * it non-NULL earlier, the only reason for it to become NULL +	 * is if we exit, and since we're currently in the middle of +	 * a fork we can't be exiting at the same time. +	 */ + +	/* +	 * Lock the parent list. No need to lock the child - not PID +	 * hashed yet and not running, so nobody can access it. +	 */ +	mutex_lock(&parent_ctx->mutex); + +	/* +	 * We dont have to disable NMIs - we are only looking at +	 * the list, not manipulating it: +	 */ +	list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) { +		if (counter != counter->group_leader) +			continue; + +		if (!counter->attr.inherit) { +			inherited_all = 0; +			continue; +		} + +		ret = inherit_group(counter, parent, parent_ctx, +					     child, child_ctx); +		if (ret) { +			inherited_all = 0; +			break; +		} +	} + +	if (inherited_all) { +		/* +		 * Mark the child context as a clone of the parent +		 * context, or of whatever the parent is a clone of. +		 * Note that if the parent is a clone, it could get +		 * uncloned at any point, but that doesn't matter +		 * because the list of counters and the generation +		 * count can't have changed since we took the mutex. +		 */ +		cloned_ctx = rcu_dereference(parent_ctx->parent_ctx); +		if (cloned_ctx) { +			child_ctx->parent_ctx = cloned_ctx; +			child_ctx->parent_gen = parent_ctx->parent_gen; +		} else { +			child_ctx->parent_ctx = parent_ctx; +			child_ctx->parent_gen = parent_ctx->generation; +		} +		get_ctx(child_ctx->parent_ctx); +	} + +	mutex_unlock(&parent_ctx->mutex); + +	perf_unpin_context(parent_ctx); + +	return ret; +} + +static void __cpuinit perf_counter_init_cpu(int cpu) +{ +	struct perf_cpu_context *cpuctx; + +	cpuctx = &per_cpu(perf_cpu_context, cpu); +	__perf_counter_init_context(&cpuctx->ctx, NULL); + +	spin_lock(&perf_resource_lock); +	cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; +	spin_unlock(&perf_resource_lock); + +	hw_perf_counter_setup(cpu); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __perf_counter_exit_cpu(void *info) +{ +	struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); +	struct perf_counter_context *ctx = &cpuctx->ctx; +	struct perf_counter *counter, *tmp; + +	list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) +		__perf_counter_remove_from_context(counter); +} +static void perf_counter_exit_cpu(int cpu) +{ +	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); +	struct perf_counter_context *ctx = &cpuctx->ctx; + +	mutex_lock(&ctx->mutex); +	smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); +	mutex_unlock(&ctx->mutex); +} +#else +static inline void perf_counter_exit_cpu(int cpu) { } +#endif + +static int __cpuinit +perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ +	unsigned int cpu = (long)hcpu; + +	switch (action) { + +	case CPU_UP_PREPARE: +	case CPU_UP_PREPARE_FROZEN: +		perf_counter_init_cpu(cpu); +		break; + +	case CPU_DOWN_PREPARE: +	case CPU_DOWN_PREPARE_FROZEN: +		perf_counter_exit_cpu(cpu); +		break; + +	
default: +		break; +	} + +	return NOTIFY_OK; +} + +/* + * This has to have a higher priority than migration_notifier in sched.c. + */ +static struct notifier_block __cpuinitdata perf_cpu_nb = { +	.notifier_call		= perf_cpu_notify, +	.priority		= 20, +}; + +void __init perf_counter_init(void) +{ +	perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, +			(void *)(long)smp_processor_id()); +	register_cpu_notifier(&perf_cpu_nb); +} + +static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) +{ +	return sprintf(buf, "%d\n", perf_reserved_percpu); +} + +static ssize_t +perf_set_reserve_percpu(struct sysdev_class *class, +			const char *buf, +			size_t count) +{ +	struct perf_cpu_context *cpuctx; +	unsigned long val; +	int err, cpu, mpt; + +	err = strict_strtoul(buf, 10, &val); +	if (err) +		return err; +	if (val > perf_max_counters) +		return -EINVAL; + +	spin_lock(&perf_resource_lock); +	perf_reserved_percpu = val; +	for_each_online_cpu(cpu) { +		cpuctx = &per_cpu(perf_cpu_context, cpu); +		spin_lock_irq(&cpuctx->ctx.lock); +		mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, +			  perf_max_counters - perf_reserved_percpu); +		cpuctx->max_pertask = mpt; +		spin_unlock_irq(&cpuctx->ctx.lock); +	} +	spin_unlock(&perf_resource_lock); + +	return count; +} + +static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) +{ +	return sprintf(buf, "%d\n", perf_overcommit); +} + +static ssize_t +perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) +{ +	unsigned long val; +	int err; + +	err = strict_strtoul(buf, 10, &val); +	if (err) +		return err; +	if (val > 1) +		return -EINVAL; + +	spin_lock(&perf_resource_lock); +	perf_overcommit = val; +	spin_unlock(&perf_resource_lock); + +	return count; +} + +static SYSDEV_CLASS_ATTR( +				reserve_percpu, +				0644, +				perf_show_reserve_percpu, +				perf_set_reserve_percpu +			); + +static SYSDEV_CLASS_ATTR( +				overcommit, +				0644, +				perf_show_overcommit, +				perf_set_overcommit +			); + +static struct attribute *perfclass_attrs[] = { +	&attr_reserve_percpu.attr, +	&attr_overcommit.attr, +	NULL +}; + +static struct attribute_group perfclass_attr_group = { +	.attrs			= perfclass_attrs, +	.name			= "perf_counters", +}; + +static int __init perf_counter_sysfs_init(void) +{ +	return sysfs_create_group(&cpu_sysdev_class.kset.kobj, +				  &perfclass_attr_group); +} +device_initcall(perf_counter_sysfs_init); diff --git a/kernel/sched.c b/kernel/sched.c index dcf2dc28931..f04aa966450 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,6 +39,7 @@  #include <linux/completion.h>  #include <linux/kernel_stat.h>  #include <linux/debug_locks.h> +#include <linux/perf_counter.h>  #include <linux/security.h>  #include <linux/notifier.h>  #include <linux/profile.h> @@ -579,6 +580,7 @@ struct rq {  	struct load_weight load;  	unsigned long nr_load_updates;  	u64 nr_switches; +	u64 nr_migrations_in;  	struct cfs_rq cfs;  	struct rt_rq rt; @@ -691,7 +693,7 @@ static inline int cpu_of(struct rq *rq)  #define task_rq(p)		cpu_rq(task_cpu(p))  #define cpu_curr(cpu)		(cpu_rq(cpu)->curr) -static inline void update_rq_clock(struct rq *rq) +inline void update_rq_clock(struct rq *rq)  {  	rq->clock = sched_clock_cpu(cpu_of(rq));  } @@ -1968,12 +1970,16 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)  		p->se.sleep_start -= clock_offset;  	if (p->se.block_start)  		p->se.block_start -= clock_offset; +#endif  	if (old_cpu != new_cpu) { -		schedstat_inc(p, se.nr_migrations); +		
p->se.nr_migrations++; +		new_rq->nr_migrations_in++; +#ifdef CONFIG_SCHEDSTATS  		if (task_hot(p, old_rq->clock, NULL))  			schedstat_inc(p, se.nr_forced2_migrations); -	}  #endif +		perf_counter_task_migration(p, new_cpu); +	}  	p->se.vruntime -= old_cfsrq->min_vruntime -  					 new_cfsrq->min_vruntime; @@ -2368,6 +2374,27 @@ static int sched_balance_self(int cpu, int flag)  #endif /* CONFIG_SMP */ +/** + * task_oncpu_function_call - call a function on the cpu on which a task runs + * @p:		the task to evaluate + * @func:	the function to be called + * @info:	the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + */ +void task_oncpu_function_call(struct task_struct *p, +			      void (*func) (void *info), void *info) +{ +	int cpu; + +	preempt_disable(); +	cpu = task_cpu(p); +	if (task_curr(p)) +		smp_call_function_single(cpu, func, info, 1); +	preempt_enable(); +} +  /***   * try_to_wake_up - wake up a thread   * @p: the to-be-woken-up thread @@ -2535,6 +2562,7 @@ static void __sched_fork(struct task_struct *p)  	p->se.exec_start		= 0;  	p->se.sum_exec_runtime		= 0;  	p->se.prev_sum_exec_runtime	= 0; +	p->se.nr_migrations		= 0;  	p->se.last_wakeup		= 0;  	p->se.avg_overlap		= 0;  	p->se.start_runtime		= 0; @@ -2765,6 +2793,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)  	 */  	prev_state = prev->state;  	finish_arch_switch(prev); +	perf_counter_task_sched_in(current, cpu_of(rq));  	finish_lock_switch(rq, prev);  #ifdef CONFIG_SMP  	if (post_schedule) @@ -2980,6 +3009,15 @@ static void calc_load_account_active(struct rq *this_rq)  }  /* + * Externally visible per-cpu scheduler statistics: + * cpu_nr_migrations(cpu) - number of migrations into that cpu + */ +u64 cpu_nr_migrations(int cpu) +{ +	return cpu_rq(cpu)->nr_migrations_in; +} + +/*   * Update rq->cpu_load[] statistics. This function is usually called every   * scheduler tick (TICK_NSEC).   */ @@ -5077,6 +5115,8 @@ void scheduler_tick(void)  	curr->sched_class->task_tick(rq, curr, 0);  	spin_unlock(&rq->lock); +	perf_counter_task_tick(curr, cpu); +  #ifdef CONFIG_SMP  	rq->idle_at_tick = idle_cpu(cpu);  	trigger_load_balance(rq, cpu); @@ -5292,6 +5332,7 @@ need_resched_nonpreemptible:  	if (likely(prev != next)) {  		sched_info_switch(prev, next); +		perf_counter_task_sched_out(prev, next, cpu);  		rq->nr_switches++;  		rq->curr = next; @@ -7535,8 +7576,10 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)  	return NOTIFY_OK;  } -/* Register at highest priority so that task migration (migrate_all_tasks) - * happens before everything else. +/* + * Register at high priority so that task migration (migrate_all_tasks) + * happens before everything else.  This has to be lower priority than + * the notifier in the perf_counter subsystem, though.   */  static struct notifier_block __cpuinitdata migration_notifier = {  	.notifier_call = migration_call, @@ -9214,7 +9257,7 @@ void __init sched_init(void)  		 * 1024) and two child groups A0 and A1 (of weight 1024 each),  		 * then A0's share of the cpu resource is:  		 * -		 * 	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% +		 *	A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%  		 *  		 * We achieve this by letting init_task_group's tasks sit  		 * directly in rq->cfs (i.e init_task_group->se[] = NULL). 
@@ -9319,6 +9362,8 @@ void __init sched_init(void)  	alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);  #endif /* SMP */ +	perf_counter_init(); +  	scheduler_running = 1;  } diff --git a/kernel/sys.c b/kernel/sys.c index e7998cf3149..438d99a38c8 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -14,6 +14,7 @@  #include <linux/prctl.h>  #include <linux/highuid.h>  #include <linux/fs.h> +#include <linux/perf_counter.h>  #include <linux/resource.h>  #include <linux/kernel.h>  #include <linux/kexec.h> @@ -1793,6 +1794,12 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,  		case PR_SET_TSC:  			error = SET_TSC_CTL(arg2);  			break; +		case PR_TASK_PERF_COUNTERS_DISABLE: +			error = perf_counter_task_disable(); +			break; +		case PR_TASK_PERF_COUNTERS_ENABLE: +			error = perf_counter_task_enable(); +			break;  		case PR_GET_TIMERSLACK:  			error = current->timer_slack_ns;  			break; diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 27dad296738..68320f6b07b 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -175,3 +175,6 @@ cond_syscall(compat_sys_timerfd_settime);  cond_syscall(compat_sys_timerfd_gettime);  cond_syscall(sys_eventfd);  cond_syscall(sys_eventfd2); + +/* performance counters: */ +cond_syscall(sys_perf_counter_open); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 944ba03cae1..ce664f98e3f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,6 +49,7 @@  #include <linux/reboot.h>  #include <linux/ftrace.h>  #include <linux/slow-work.h> +#include <linux/perf_counter.h>  #include <asm/uaccess.h>  #include <asm/processor.h> @@ -932,6 +933,32 @@ static struct ctl_table kern_table[] = {  		.child		= slow_work_sysctls,  	},  #endif +#ifdef CONFIG_PERF_COUNTERS +	{ +		.ctl_name	= CTL_UNNUMBERED, +		.procname	= "perf_counter_paranoid", +		.data		= &sysctl_perf_counter_paranoid, +		.maxlen		= sizeof(sysctl_perf_counter_paranoid), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec, +	}, +	{ +		.ctl_name	= CTL_UNNUMBERED, +		.procname	= "perf_counter_mlock_kb", +		.data		= &sysctl_perf_counter_mlock, +		.maxlen		= sizeof(sysctl_perf_counter_mlock), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec, +	}, +	{ +		.ctl_name	= CTL_UNNUMBERED, +		.procname	= "perf_counter_max_sample_rate", +		.data		= &sysctl_perf_counter_sample_rate, +		.maxlen		= sizeof(sysctl_perf_counter_sample_rate), +		.mode		= 0644, +		.proc_handler	= &proc_dointvec, +	}, +#endif  /*   * NOTE: do not add new entries to this table unless you have read   * Documentation/sysctl/ctl_unnumbered.txt diff --git a/kernel/timer.c b/kernel/timer.c index a26ed294f93..c01e568935e 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,6 +37,7 @@  #include <linux/delay.h>  #include <linux/tick.h>  #include <linux/kallsyms.h> +#include <linux/perf_counter.h>  #include <asm/uaccess.h>  #include <asm/unistd.h> @@ -1129,6 +1130,8 @@ static void run_timer_softirq(struct softirq_action *h)  {  	struct tvec_base *base = __get_cpu_var(tvec_bases); +	perf_counter_do_pending(); +  	hrtimer_run_pending();  	if (time_after_eq(jiffies, base->timer_jiffies)) diff --git a/mm/mmap.c b/mm/mmap.c index 2b43fa1aa3c..34579b23ebd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -28,6 +28,7 @@  #include <linux/mempolicy.h>  #include <linux/rmap.h>  #include <linux/mmu_notifier.h> +#include <linux/perf_counter.h>  #include <asm/uaccess.h>  #include <asm/cacheflush.h> @@ -1222,6 +1223,8 @@ munmap_back:  	if (correct_wcount)  		atomic_inc(&inode->i_writecount);  out: +	perf_counter_mmap(vma); +  	mm->total_vm += len >> 
PAGE_SHIFT;  	vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT);  	if (vm_flags & VM_LOCKED) { @@ -2308,6 +2311,8 @@ int install_special_mapping(struct mm_struct *mm,  	mm->total_vm += len >> PAGE_SHIFT; +	perf_counter_mmap(vma); +  	return 0;  } diff --git a/mm/mprotect.c b/mm/mprotect.c index 258197b76fb..d80311baeb2 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -23,6 +23,7 @@  #include <linux/swapops.h>  #include <linux/mmu_notifier.h>  #include <linux/migrate.h> +#include <linux/perf_counter.h>  #include <asm/uaccess.h>  #include <asm/pgtable.h>  #include <asm/cacheflush.h> @@ -299,6 +300,7 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len,  		error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);  		if (error)  			goto out; +		perf_counter_mmap(vma);  		nstart = tmp;  		if (nstart < prev->vm_end) diff --git a/tools/perf/.gitignore b/tools/perf/.gitignore new file mode 100644 index 00000000000..d69a759a104 --- /dev/null +++ b/tools/perf/.gitignore @@ -0,0 +1,16 @@ +PERF-BUILD-OPTIONS +PERF-CFLAGS +PERF-GUI-VARS +PERF-VERSION-FILE +perf +perf-help +perf-record +perf-report +perf-stat +perf-top +perf*.1 +perf*.xml +common-cmds.h +tags +TAGS +cscope* diff --git a/tools/perf/Documentation/Makefile b/tools/perf/Documentation/Makefile new file mode 100644 index 00000000000..5457192e1b4 --- /dev/null +++ b/tools/perf/Documentation/Makefile @@ -0,0 +1,300 @@ +MAN1_TXT= \ +	$(filter-out $(addsuffix .txt, $(ARTICLES) $(SP_ARTICLES)), \ +		$(wildcard perf-*.txt)) \ +	perf.txt +MAN5_TXT= +MAN7_TXT= + +MAN_TXT = $(MAN1_TXT) $(MAN5_TXT) $(MAN7_TXT) +MAN_XML=$(patsubst %.txt,%.xml,$(MAN_TXT)) +MAN_HTML=$(patsubst %.txt,%.html,$(MAN_TXT)) + +DOC_HTML=$(MAN_HTML) + +ARTICLES = +# with their own formatting rules. +SP_ARTICLES = +API_DOCS = $(patsubst %.txt,%,$(filter-out technical/api-index-skel.txt technical/api-index.txt, $(wildcard technical/api-*.txt))) +SP_ARTICLES += $(API_DOCS) +SP_ARTICLES += technical/api-index + +DOC_HTML += $(patsubst %,%.html,$(ARTICLES) $(SP_ARTICLES)) + +DOC_MAN1=$(patsubst %.txt,%.1,$(MAN1_TXT)) +DOC_MAN5=$(patsubst %.txt,%.5,$(MAN5_TXT)) +DOC_MAN7=$(patsubst %.txt,%.7,$(MAN7_TXT)) + +prefix?=$(HOME) +bindir?=$(prefix)/bin +htmldir?=$(prefix)/share/doc/perf-doc +pdfdir?=$(prefix)/share/doc/perf-doc +mandir?=$(prefix)/share/man +man1dir=$(mandir)/man1 +man5dir=$(mandir)/man5 +man7dir=$(mandir)/man7 +# DESTDIR= + +ASCIIDOC=asciidoc +ASCIIDOC_EXTRA = +MANPAGE_XSL = manpage-normal.xsl +XMLTO_EXTRA = +INSTALL?=install +RM ?= rm -f +DOC_REF = origin/man +HTML_REF = origin/html + +infodir?=$(prefix)/share/info +MAKEINFO=makeinfo +INSTALL_INFO=install-info +DOCBOOK2X_TEXI=docbook2x-texi +DBLATEX=dblatex +ifndef PERL_PATH +	PERL_PATH = /usr/bin/perl +endif + +-include ../config.mak.autogen +-include ../config.mak + +# +# For asciidoc ... +#	-7.1.2,	no extra settings are needed. +#	8.0-,	set ASCIIDOC8. +# + +# +# For docbook-xsl ... +#	-1.68.1,	set ASCIIDOC_NO_ROFF? (based on changelog from 1.73.0) +#	1.69.0,		no extra settings are needed? +#	1.69.1-1.71.0,	set DOCBOOK_SUPPRESS_SP? +#	1.71.1,		no extra settings are needed? +#	1.72.0,		set DOCBOOK_XSL_172. +#	1.73.0-,	set ASCIIDOC_NO_ROFF +# + +# +# If you had been using DOCBOOK_XSL_172 in an attempt to get rid +# of 'the ".ft C" problem' in your generated manpages, and you +# instead ended up with weird characters around callouts, try +# using ASCIIDOC_NO_ROFF instead (it works fine with ASCIIDOC8). 
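For instance (an illustrative invocation, with versions assumed rather than detected), a system with asciidoc 8.x and docbook-xsl 1.72.0 would build the man pages with:

	$ make ASCIIDOC8=1 DOCBOOK_XSL_172=1 man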
+# + +ifdef ASCIIDOC8 +ASCIIDOC_EXTRA += -a asciidoc7compatible +endif +ifdef DOCBOOK_XSL_172 +ASCIIDOC_EXTRA += -a perf-asciidoc-no-roff +MANPAGE_XSL = manpage-1.72.xsl +else +	ifdef ASCIIDOC_NO_ROFF +	# docbook-xsl after 1.72 needs the regular XSL, but will not +	# pass-thru raw roff codes from asciidoc.conf, so turn them off. +	ASCIIDOC_EXTRA += -a perf-asciidoc-no-roff +	endif +endif +ifdef MAN_BOLD_LITERAL +XMLTO_EXTRA += -m manpage-bold-literal.xsl +endif +ifdef DOCBOOK_SUPPRESS_SP +XMLTO_EXTRA += -m manpage-suppress-sp.xsl +endif + +SHELL_PATH ?= $(SHELL) +# Shell quote; +SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH)) + +# +# Please note that there is a minor bug in asciidoc. +# The version after 6.0.3 _will_ include the patch found here: +#   http://marc.theaimsgroup.com/?l=perf&m=111558757202243&w=2 +# +# Until that version is released you may have to apply the patch +# yourself - yes, all 6 characters of it! +# + +QUIET_SUBDIR0  = +$(MAKE) -C # space to separate -C and subdir +QUIET_SUBDIR1  = + +ifneq ($(findstring $(MAKEFLAGS),w),w) +PRINT_DIR = --no-print-directory +else # "make -w" +NO_SUBDIR = : +endif + +ifneq ($(findstring $(MAKEFLAGS),s),s) +ifndef V +	QUIET_ASCIIDOC	= @echo '   ' ASCIIDOC $@; +	QUIET_XMLTO	= @echo '   ' XMLTO $@; +	QUIET_DB2TEXI	= @echo '   ' DB2TEXI $@; +	QUIET_MAKEINFO	= @echo '   ' MAKEINFO $@; +	QUIET_DBLATEX	= @echo '   ' DBLATEX $@; +	QUIET_XSLTPROC	= @echo '   ' XSLTPROC $@; +	QUIET_GEN	= @echo '   ' GEN $@; +	QUIET_STDERR	= 2> /dev/null +	QUIET_SUBDIR0	= +@subdir= +	QUIET_SUBDIR1	= ;$(NO_SUBDIR) echo '   ' SUBDIR $$subdir; \ +			  $(MAKE) $(PRINT_DIR) -C $$subdir +	export V +endif +endif + +all: html man + +html: $(DOC_HTML) + +$(DOC_HTML) $(DOC_MAN1) $(DOC_MAN5) $(DOC_MAN7): asciidoc.conf + +man: man1 man5 man7 +man1: $(DOC_MAN1) +man5: $(DOC_MAN5) +man7: $(DOC_MAN7) + +info: perf.info perfman.info + +pdf: user-manual.pdf + +install: install-man + +install-man: man +	$(INSTALL) -d -m 755 $(DESTDIR)$(man1dir) +#	$(INSTALL) -d -m 755 $(DESTDIR)$(man5dir) +#	$(INSTALL) -d -m 755 $(DESTDIR)$(man7dir) +	$(INSTALL) -m 644 $(DOC_MAN1) $(DESTDIR)$(man1dir) +#	$(INSTALL) -m 644 $(DOC_MAN5) $(DESTDIR)$(man5dir) +#	$(INSTALL) -m 644 $(DOC_MAN7) $(DESTDIR)$(man7dir) + +install-info: info +	$(INSTALL) -d -m 755 $(DESTDIR)$(infodir) +	$(INSTALL) -m 644 perf.info perfman.info $(DESTDIR)$(infodir) +	if test -r $(DESTDIR)$(infodir)/dir; then \ +	  $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perf.info ;\ +	  $(INSTALL_INFO) --info-dir=$(DESTDIR)$(infodir) perfman.info ;\ +	else \ +	  echo "No directory found in $(DESTDIR)$(infodir)" >&2 ; \ +	fi + +install-pdf: pdf +	$(INSTALL) -d -m 755 $(DESTDIR)$(pdfdir) +	$(INSTALL) -m 644 user-manual.pdf $(DESTDIR)$(pdfdir) + +install-html: html +	'$(SHELL_PATH_SQ)' ./install-webdoc.sh $(DESTDIR)$(htmldir) + +../PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE +	$(QUIET_SUBDIR0)../ $(QUIET_SUBDIR1) PERF-VERSION-FILE + +-include ../PERF-VERSION-FILE + +# +# Determine "include::" file references in asciidoc files. 
+# +doc.dep : $(wildcard *.txt) build-docdep.perl +	$(QUIET_GEN)$(RM) $@+ $@ && \ +	$(PERL_PATH) ./build-docdep.perl >$@+ $(QUIET_STDERR) && \ +	mv $@+ $@ + +-include doc.dep + +cmds_txt = cmds-ancillaryinterrogators.txt \ +	cmds-ancillarymanipulators.txt \ +	cmds-mainporcelain.txt \ +	cmds-plumbinginterrogators.txt \ +	cmds-plumbingmanipulators.txt \ +	cmds-synchingrepositories.txt \ +	cmds-synchelpers.txt \ +	cmds-purehelpers.txt \ +	cmds-foreignscminterface.txt + +$(cmds_txt): cmd-list.made + +cmd-list.made: cmd-list.perl ../command-list.txt $(MAN1_TXT) +	$(QUIET_GEN)$(RM) $@ && \ +	$(PERL_PATH) ./cmd-list.perl ../command-list.txt $(QUIET_STDERR) && \ +	date >$@ + +clean: +	$(RM) *.xml *.xml+ *.html *.html+ *.1 *.5 *.7 +	$(RM) *.texi *.texi+ *.texi++ perf.info perfman.info +	$(RM) howto-index.txt howto/*.html doc.dep +	$(RM) technical/api-*.html technical/api-index.txt +	$(RM) $(cmds_txt) *.made + +$(MAN_HTML): %.html : %.txt +	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \ +	$(ASCIIDOC) -b xhtml11 -d manpage -f asciidoc.conf \ +		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \ +	mv $@+ $@ + +%.1 %.5 %.7 : %.xml +	$(QUIET_XMLTO)$(RM) $@ && \ +	xmlto -m $(MANPAGE_XSL) $(XMLTO_EXTRA) man $< + +%.xml : %.txt +	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \ +	$(ASCIIDOC) -b docbook -d manpage -f asciidoc.conf \ +		$(ASCIIDOC_EXTRA) -aperf_version=$(PERF_VERSION) -o $@+ $< && \ +	mv $@+ $@ + +XSLT = docbook.xsl +XSLTOPTS = --xinclude --stringparam html.stylesheet docbook-xsl.css + +user-manual.html: user-manual.xml +	$(QUIET_XSLTPROC)xsltproc $(XSLTOPTS) -o $@ $(XSLT) $< + +perf.info: user-manual.texi +	$(QUIET_MAKEINFO)$(MAKEINFO) --no-split -o $@ user-manual.texi + +user-manual.texi: user-manual.xml +	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \ +	$(DOCBOOK2X_TEXI) user-manual.xml --encoding=UTF-8 --to-stdout >$@++ && \ +	$(PERL_PATH) fix-texi.perl <$@++ >$@+ && \ +	rm $@++ && \ +	mv $@+ $@ + +user-manual.pdf: user-manual.xml +	$(QUIET_DBLATEX)$(RM) $@+ $@ && \ +	$(DBLATEX) -o $@+ -p /etc/asciidoc/dblatex/asciidoc-dblatex.xsl -s /etc/asciidoc/dblatex/asciidoc-dblatex.sty $< && \ +	mv $@+ $@ + +perfman.texi: $(MAN_XML) cat-texi.perl +	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \ +	($(foreach xml,$(MAN_XML),$(DOCBOOK2X_TEXI) --encoding=UTF-8 \ +		--to-stdout $(xml) &&) true) > $@++ && \ +	$(PERL_PATH) cat-texi.perl $@ <$@++ >$@+ && \ +	rm $@++ && \ +	mv $@+ $@ + +perfman.info: perfman.texi +	$(QUIET_MAKEINFO)$(MAKEINFO) --no-split --no-validate $*.texi + +$(patsubst %.txt,%.texi,$(MAN_TXT)): %.texi : %.xml +	$(QUIET_DB2TEXI)$(RM) $@+ $@ && \ +	$(DOCBOOK2X_TEXI) --to-stdout $*.xml >$@+ && \ +	mv $@+ $@ + +howto-index.txt: howto-index.sh $(wildcard howto/*.txt) +	$(QUIET_GEN)$(RM) $@+ $@ && \ +	'$(SHELL_PATH_SQ)' ./howto-index.sh $(wildcard howto/*.txt) >$@+ && \ +	mv $@+ $@ + +$(patsubst %,%.html,$(ARTICLES)) : %.html : %.txt +	$(QUIET_ASCIIDOC)$(ASCIIDOC) -b xhtml11 $*.txt + +WEBDOC_DEST = /pub/software/tools/perf/docs + +$(patsubst %.txt,%.html,$(wildcard howto/*.txt)): %.html : %.txt +	$(QUIET_ASCIIDOC)$(RM) $@+ $@ && \ +	sed -e '1,/^$$/d' $< | $(ASCIIDOC) -b xhtml11 - >$@+ && \ +	mv $@+ $@ + +install-webdoc : html +	'$(SHELL_PATH_SQ)' ./install-webdoc.sh $(WEBDOC_DEST) + +quick-install: quick-install-man + +quick-install-man: +	'$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(DOC_REF) $(DESTDIR)$(mandir) + +quick-install-html: +	'$(SHELL_PATH_SQ)' ./install-doc-quick.sh $(HTML_REF) $(DESTDIR)$(htmldir) + +.PHONY: .FORCE-PERF-VERSION-FILE diff --git a/tools/perf/Documentation/asciidoc.conf 
b/tools/perf/Documentation/asciidoc.conf new file mode 100644 index 00000000000..356b23a4033 --- /dev/null +++ b/tools/perf/Documentation/asciidoc.conf @@ -0,0 +1,91 @@ +## linkperf: macro +# +# Usage: linkperf:command[manpage-section] +# +# Note, {0} is the manpage section, while {target} is the command. +# +# Show PERF link as: <command>(<section>); if section is defined, else just show +# the command. + +[macros] +(?su)[\\]?(?P<name>linkperf):(?P<target>\S*?)\[(?P<attrlist>.*?)\]= + +[attributes] +asterisk=* +plus=+ +caret=^ +startsb=[ +endsb=] +tilde=~ + +ifdef::backend-docbook[] +[linkperf-inlinemacro] +{0%{target}} +{0#<citerefentry>} +{0#<refentrytitle>{target}</refentrytitle><manvolnum>{0}</manvolnum>} +{0#</citerefentry>} +endif::backend-docbook[] + +ifdef::backend-docbook[] +ifndef::perf-asciidoc-no-roff[] +# "unbreak" docbook-xsl v1.68 for manpages. v1.69 works with or without this. +# v1.72 breaks with this because it replaces dots not in roff requests. +[listingblock] +<example><title>{title}</title> +<literallayout> +ifdef::doctype-manpage[] +
.ft C
 +endif::doctype-manpage[] +| +ifdef::doctype-manpage[] +
.ft
 +endif::doctype-manpage[] +</literallayout> +{title#}</example> +endif::perf-asciidoc-no-roff[] + +ifdef::perf-asciidoc-no-roff[] +ifdef::doctype-manpage[] +# The following two small workarounds insert a simple paragraph after screen +[listingblock] +<example><title>{title}</title> +<literallayout> +| +</literallayout><simpara></simpara> +{title#}</example> + +[verseblock] +<formalpara{id? id="{id}"}><title>{title}</title><para> +{title%}<literallayout{id? id="{id}"}> +{title#}<literallayout> +| +</literallayout> +{title#}</para></formalpara> +{title%}<simpara></simpara> +endif::doctype-manpage[] +endif::perf-asciidoc-no-roff[] +endif::backend-docbook[] + +ifdef::doctype-manpage[] +ifdef::backend-docbook[] +[header] +template::[header-declarations] +<refentry> +<refmeta> +<refentrytitle>{mantitle}</refentrytitle> +<manvolnum>{manvolnum}</manvolnum> +<refmiscinfo class="source">perf</refmiscinfo> +<refmiscinfo class="version">{perf_version}</refmiscinfo> +<refmiscinfo class="manual">perf Manual</refmiscinfo> +</refmeta> +<refnamediv> +  <refname>{manname}</refname> +  <refpurpose>{manpurpose}</refpurpose> +</refnamediv> +endif::backend-docbook[] +endif::doctype-manpage[] + +ifdef::backend-xhtml11[] +[linkperf-inlinemacro] +<a href="{target}.html">{target}{0?({0})}</a> +endif::backend-xhtml11[] diff --git a/tools/perf/Documentation/manpage-1.72.xsl b/tools/perf/Documentation/manpage-1.72.xsl new file mode 100644 index 00000000000..b4d315cb8c4 --- /dev/null +++ b/tools/perf/Documentation/manpage-1.72.xsl @@ -0,0 +1,14 @@ +<!-- manpage-1.72.xsl: +     special settings for manpages rendered from asciidoc+docbook +     handles peculiarities in docbook-xsl 1.72.0 --> +<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" +		version="1.0"> + +<xsl:import href="manpage-base.xsl"/> + +<!-- these are the special values for the roff control characters +     needed for docbook-xsl 1.72.0 --> +<xsl:param name="git.docbook.backslash">▓</xsl:param> +<xsl:param name="git.docbook.dot"      >⌂</xsl:param> + +</xsl:stylesheet> diff --git a/tools/perf/Documentation/manpage-base.xsl b/tools/perf/Documentation/manpage-base.xsl new file mode 100644 index 00000000000..a264fa61609 --- /dev/null +++ b/tools/perf/Documentation/manpage-base.xsl @@ -0,0 +1,35 @@ +<!-- manpage-base.xsl: +     special formatting for manpages rendered from asciidoc+docbook --> +<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" +		version="1.0"> + +<!-- these params silence some output from xmlto --> +<xsl:param name="man.output.quietly" select="1"/> +<xsl:param name="refentry.meta.get.quietly" select="1"/> + +<!-- convert asciidoc callouts to man page format; +     git.docbook.backslash and git.docbook.dot params +     must be supplied by another XSL file or other means --> +<xsl:template match="co"> +	<xsl:value-of select="concat( +			      $git.docbook.backslash,'fB(', +			      substring-after(@id,'-'),')', +			      $git.docbook.backslash,'fR')"/> +</xsl:template> +<xsl:template match="calloutlist"> +	<xsl:value-of select="$git.docbook.dot"/> +	<xsl:text>sp
</xsl:text> +	<xsl:apply-templates/> +	<xsl:text>
</xsl:text> +</xsl:template> +<xsl:template match="callout"> +	<xsl:value-of select="concat( +			      $git.docbook.backslash,'fB', +			      substring-after(@arearefs,'-'), +			      '. ',$git.docbook.backslash,'fR')"/> +	<xsl:apply-templates/> +	<xsl:value-of select="$git.docbook.dot"/> +	<xsl:text>br
</xsl:text> +</xsl:template> + +</xsl:stylesheet> diff --git a/tools/perf/Documentation/manpage-bold-literal.xsl b/tools/perf/Documentation/manpage-bold-literal.xsl new file mode 100644 index 00000000000..608eb5df628 --- /dev/null +++ b/tools/perf/Documentation/manpage-bold-literal.xsl @@ -0,0 +1,17 @@ +<!-- manpage-bold-literal.xsl: +     special formatting for manpages rendered from asciidoc+docbook --> +<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" +		version="1.0"> + +<!-- render literal text as bold (instead of plain or monospace); +     this makes literal text easier to distinguish in manpages +     viewed on a tty --> +<xsl:template match="literal"> +	<xsl:value-of select="$git.docbook.backslash"/> +	<xsl:text>fB</xsl:text> +	<xsl:apply-templates/> +	<xsl:value-of select="$git.docbook.backslash"/> +	<xsl:text>fR</xsl:text> +</xsl:template> + +</xsl:stylesheet> diff --git a/tools/perf/Documentation/manpage-normal.xsl b/tools/perf/Documentation/manpage-normal.xsl new file mode 100644 index 00000000000..a48f5b11f3d --- /dev/null +++ b/tools/perf/Documentation/manpage-normal.xsl @@ -0,0 +1,13 @@ +<!-- manpage-normal.xsl: +     special settings for manpages rendered from asciidoc+docbook +     handles anything we want to keep away from docbook-xsl 1.72.0 --> +<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" +		version="1.0"> + +<xsl:import href="manpage-base.xsl"/> + +<!-- these are the normal values for the roff control characters --> +<xsl:param name="git.docbook.backslash">\</xsl:param> +<xsl:param name="git.docbook.dot"	>.</xsl:param> + +</xsl:stylesheet> diff --git a/tools/perf/Documentation/manpage-suppress-sp.xsl b/tools/perf/Documentation/manpage-suppress-sp.xsl new file mode 100644 index 00000000000..a63c7632a87 --- /dev/null +++ b/tools/perf/Documentation/manpage-suppress-sp.xsl @@ -0,0 +1,21 @@ +<!-- manpage-suppress-sp.xsl: +     special settings for manpages rendered from asciidoc+docbook +     handles erroneous, inline .sp in manpage output of some +     versions of docbook-xsl --> +<xsl:stylesheet xmlns:xsl="http://www.w3.org/1999/XSL/Transform" +		version="1.0"> + +<!-- attempt to work around spurious .sp at the tail of the line +     that some versions of docbook stylesheets seem to add --> +<xsl:template match="simpara"> +  <xsl:variable name="content"> +    <xsl:apply-templates/> +  </xsl:variable> +  <xsl:value-of select="normalize-space($content)"/> +  <xsl:if test="not(ancestor::authorblurb) and +                not(ancestor::personblurb)"> +    <xsl:text>

</xsl:text> +  </xsl:if> +</xsl:template> + +</xsl:stylesheet> diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt new file mode 100644 index 00000000000..c9dcade0683 --- /dev/null +++ b/tools/perf/Documentation/perf-annotate.txt @@ -0,0 +1,29 @@ +perf-annotate(1) +============== + +NAME +---- +perf-annotate - Read perf.data (created by perf record) and display annotated code + +SYNOPSIS +-------- +[verse] +'perf annotate' [-i <file> | --input=file] symbol_name + +DESCRIPTION +----------- +This command reads the input file and displays an annotated version of the +code. If the object file has debug symbols then the source code will be +displayed alongside assembly code. + +If there is no debug info in the object, then annotated assembly is displayed. + +OPTIONS +------- +-i:: +--input=:: +        Input file name. (default: perf.data) + +SEE ALSO +-------- +linkperf:perf-record[1] diff --git a/tools/perf/Documentation/perf-help.txt b/tools/perf/Documentation/perf-help.txt new file mode 100644 index 00000000000..514391818d1 --- /dev/null +++ b/tools/perf/Documentation/perf-help.txt @@ -0,0 +1,38 @@ +perf-help(1) +============ + +NAME +---- +perf-help - display help information about perf + +SYNOPSIS +-------- +'perf help' [-a|--all] [COMMAND] + +DESCRIPTION +----------- + +With no options and no COMMAND given, the synopsis of the 'perf' +command and a list of the most commonly used perf commands are printed +on the standard output. + +If the option '--all' or '-a' is given, then all available commands are +printed on the standard output. + +If a perf command is named, a manual page for that command is brought +up. The 'man' program is used by default for this purpose, but this +can be overridden by other options or configuration variables. + +Note that `perf --help ...` is identical to `perf help ...` because the +former is internally converted into the latter. + +OPTIONS +------- +-a:: +--all:: +	Prints all the available commands on the standard output. This +	option supersedes any other option. + +PERF +---- +Part of the linkperf:perf[1] suite diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt new file mode 100644 index 00000000000..8290b942266 --- /dev/null +++ b/tools/perf/Documentation/perf-list.txt @@ -0,0 +1,25 @@ +perf-list(1) +============ + +NAME +---- +perf-list - List all symbolic event types + +SYNOPSIS +-------- +[verse] +'perf list' + +DESCRIPTION +----------- +This command displays the symbolic event types which can be selected in the +various perf commands with the -e option. + +OPTIONS +------- +None + +SEE ALSO +-------- +linkperf:perf-stat[1], linkperf:perf-top[1], +linkperf:perf-record[1] diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt new file mode 100644 index 00000000000..1dbc1eeb4c0 --- /dev/null +++ b/tools/perf/Documentation/perf-record.txt @@ -0,0 +1,42 @@ +perf-record(1) +============== + +NAME +---- +perf-record - Run a command and record its profile into perf.data + +SYNOPSIS +-------- +[verse] +'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] <command> +'perf record' [-e <EVENT> | --event=EVENT] [-l] [-a] -- <command> [<options>] + +DESCRIPTION +----------- +This command runs a command and gathers a performance counter profile +from it, into perf.data - without displaying anything. + +This file can then be inspected later on, using 'perf report'. 
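A minimal record-then-report session might look like this ('sleep 1' is just a stand-in workload; output omitted):

	$ perf record -- sleep 1
	$ perf report -i perf.data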
+
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+-e::
+--event=::
+	Select the PMU event. Selection can be a symbolic event name
+	(use 'perf list' to list all events) or a raw PMU
+	event (eventsel+umask) in the form of rNNN where NNN is a
+	hexadecimal event descriptor.
+
+-a::
+        system-wide collection
+
+-l::
+        scale counter values
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
new file mode 100644
index 00000000000..52d3fc6846a
--- /dev/null
+++ b/tools/perf/Documentation/perf-report.txt
@@ -0,0 +1,26 @@
+perf-report(1)
+==============
+
+NAME
+----
+perf-report - Read perf.data (created by perf record) and display the profile
+
+SYNOPSIS
+--------
+[verse]
+'perf report' [-i <file> | --input=file]
+
+DESCRIPTION
+-----------
+This command displays the performance counter profile information recorded
+via perf record.
+
+OPTIONS
+-------
+-i::
+--input=::
+        Input file name. (default: perf.data)
+
+SEE ALSO
+--------
+linkperf:perf-stat[1]
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
new file mode 100644
index 00000000000..c368a72721d
--- /dev/null
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -0,0 +1,66 @@
+perf-stat(1)
+============
+
+NAME
+----
+perf-stat - Run a command and gather performance counter statistics
+
+SYNOPSIS
+--------
+[verse]
+'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+'perf stat' [-e <EVENT> | --event=EVENT] [-l] [-a] -- <command> [<options>]
+
+DESCRIPTION
+-----------
+This command runs a command and gathers performance counter statistics
+from it.
+
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+
+-e::
+--event=::
+	Select the PMU event. Selection can be a symbolic event name
+	(use 'perf list' to list all events) or a raw PMU
+	event (eventsel+umask) in the form of rNNN where NNN is a
+	hexadecimal event descriptor.
+
+-i::
+--inherit::
+        child tasks inherit counters
+-p::
+--pid=<pid>::
+        stat events on existing pid
+
+-a::
+        system-wide collection
+
+-l::
+        scale counter values
+
+EXAMPLES
+--------
+
+$ perf stat -- make -j
+
+ Performance counter stats for 'make -j':
+
+    8117.370256  task clock ticks     #      11.281 CPU utilization factor
+            678  context switches     #       0.000 M/sec
+            133  CPU migrations       #       0.000 M/sec
+         235724  pagefaults           #       0.029 M/sec
+    24821162526  CPU cycles           #    3057.784 M/sec
+    18687303457  instructions         #    2302.138 M/sec
+      172158895  cache references     #      21.209 M/sec
+       27075259  cache misses         #       3.335 M/sec
+
+ Wall-clock time elapsed:   719.554352 msecs
+
+SEE ALSO
+--------
+linkperf:perf-top[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt
new file mode 100644
index 00000000000..539d0128972
--- /dev/null
+++ b/tools/perf/Documentation/perf-top.txt
@@ -0,0 +1,39 @@
+perf-top(1)
+===========
+
+NAME
+----
+perf-top - Run a command and profile it
+
+SYNOPSIS
+--------
+[verse]
+'perf top' [-e <EVENT> | --event=EVENT] [-l] [-a] <command>
+
+DESCRIPTION
+-----------
+This command runs a command and gathers a performance counter profile
+from it.
+
+
+OPTIONS
+-------
+<command>...::
+	Any command you can specify in a shell.
+
+-e::
+--event=::
+	Select the PMU event. Selection can be a symbolic event name
+	(use 'perf list' to list all events) or a raw PMU
+	event (eventsel+umask) in the form of rNNN where NNN is a
+	hexadecimal event descriptor.
+
+-a::
+        system-wide collection
+
+-l::
+        scale counter values
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-list[1]
diff --git a/tools/perf/Documentation/perf.txt b/tools/perf/Documentation/perf.txt
new file mode 100644
index 00000000000..69c83255719
--- /dev/null
+++ b/tools/perf/Documentation/perf.txt
@@ -0,0 +1,24 @@
+perf(1)
+=======
+
+NAME
+----
+perf - Performance analysis tools for Linux
+
+SYNOPSIS
+--------
+[verse]
+'perf' [--version] [--help] COMMAND [ARGS]
+
+DESCRIPTION
+-----------
+Performance counters for Linux are a new kernel-based subsystem
+that provides a framework for all things performance analysis. It
+covers hardware-level features (CPU/PMU, Performance Monitoring Unit)
+as well as software features (software counters, tracepoints).
+
+SEE ALSO
+--------
+linkperf:perf-stat[1], linkperf:perf-top[1],
+linkperf:perf-record[1], linkperf:perf-report[1],
+linkperf:perf-list[1]
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
new file mode 100644
index 00000000000..0cbd5d6874e
--- /dev/null
+++ b/tools/perf/Makefile
@@ -0,0 +1,929 @@
+# The default target of this Makefile is...
+all::
+
+# Define V=1 to have a more verbose compile.
+#
+# Define SNPRINTF_RETURNS_BOGUS if you are on a system where snprintf()
+# or vsnprintf() return -1 instead of the number of characters which would
+# have been written to the final string if enough space had been available.
+#
+# Define FREAD_READS_DIRECTORIES if you are on a system which succeeds
+# when attempting to read from an fopen'ed directory.
+#
+# Define NO_OPENSSL environment variable if you do not have OpenSSL.
+# This also implies MOZILLA_SHA1.
+#
+# Define CURLDIR=/foo/bar if your curl header and library files are in
+# /foo/bar/include and /foo/bar/lib directories.
+#
+# Define EXPATDIR=/foo/bar if your expat header and library files are in
+# /foo/bar/include and /foo/bar/lib directories.
+#
+# Define NO_D_INO_IN_DIRENT if you don't have d_ino in your struct dirent.
+#
+# Define NO_D_TYPE_IN_DIRENT if your platform defines DT_UNKNOWN but lacks
+# d_type in struct dirent (latest Cygwin -- will be fixed soonish).
+#
+# Define NO_C99_FORMAT if your formatted IO functions (printf/scanf et.al.)
+# do not support the 'size specifiers' introduced by C99, namely ll, hh,
+# j, z, t. (representing long long int, char, intmax_t, size_t, ptrdiff_t).
+# Some C compilers supported these specifiers prior to C99 as an extension.
+#
+# Define NO_STRCASESTR if you don't have strcasestr.
+#
+# Define NO_MEMMEM if you don't have memmem.
+#
+# Define NO_STRTOUMAX if you don't have strtoumax in the C library.
+# If your compiler also does not support long long or does not have
+# strtoull, define NO_STRTOULL.
+#
+# Define NO_SETENV if you don't have setenv in the C library.
+#
+# Define NO_UNSETENV if you don't have unsetenv in the C library.
+#
+# Define NO_MKDTEMP if you don't have mkdtemp in the C library.
+#
+# Define NO_SYS_SELECT_H if you don't have sys/select.h.
+#
+# Define NO_SYMLINK_HEAD if you never want .perf/HEAD to be a symbolic link.
+# Enable it on Windows.  By default, symrefs are still used.
+#
+# Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability
+# tests.
These tests take up a significant amount of the total test time +# but are not needed unless you plan to talk to SVN repos. +# +# Define NO_FINK if you are building on Darwin/Mac OS X, have Fink +# installed in /sw, but don't want PERF to link against any libraries +# installed there.  If defined you may specify your own (or Fink's) +# include directories and library directories by defining CFLAGS +# and LDFLAGS appropriately. +# +# Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X, +# have DarwinPorts installed in /opt/local, but don't want PERF to +# link against any libraries installed there.  If defined you may +# specify your own (or DarwinPort's) include directories and +# library directories by defining CFLAGS and LDFLAGS appropriately. +# +# Define PPC_SHA1 environment variable when running make to make use of +# a bundled SHA1 routine optimized for PowerPC. +# +# Define ARM_SHA1 environment variable when running make to make use of +# a bundled SHA1 routine optimized for ARM. +# +# Define MOZILLA_SHA1 environment variable when running make to make use of +# a bundled SHA1 routine coming from Mozilla. It is GPL'd and should be fast +# on non-x86 architectures (e.g. PowerPC), while the OpenSSL version (default +# choice) has very fast version optimized for i586. +# +# Define NEEDS_SSL_WITH_CRYPTO if you need -lcrypto with -lssl (Darwin). +# +# Define NEEDS_LIBICONV if linking with libc is not enough (Darwin). +# +# Define NEEDS_SOCKET if linking with libc is not enough (SunOS, +# Patrick Mauritz). +# +# Define NO_MMAP if you want to avoid mmap. +# +# Define NO_PTHREADS if you do not have or do not want to use Pthreads. +# +# Define NO_PREAD if you have a problem with pread() system call (e.g. +# cygwin.dll before v1.5.22). +# +# Define NO_FAST_WORKING_DIRECTORY if accessing objects in pack files is +# generally faster on your platform than accessing the working directory. +# +# Define NO_TRUSTABLE_FILEMODE if your filesystem may claim to support +# the executable mode bit, but doesn't really do so. +# +# Define NO_IPV6 if you lack IPv6 support and getaddrinfo(). +# +# Define NO_SOCKADDR_STORAGE if your platform does not have struct +# sockaddr_storage. +# +# Define NO_ICONV if your libc does not properly support iconv. +# +# Define OLD_ICONV if your library has an old iconv(), where the second +# (input buffer pointer) parameter is declared with type (const char **). +# +# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound. +# +# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib" +# that tells runtime paths to dynamic libraries; +# "-Wl,-rpath=/path/lib" is used instead. +# +# Define USE_NSEC below if you want perf to care about sub-second file mtimes +# and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and +# it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely +# randomly break unless your underlying filesystem supports those sub-second +# times (my ext3 doesn't). +# +# Define USE_ST_TIMESPEC if your "struct stat" uses "st_ctimespec" instead of +# "st_ctim" +# +# Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec" +# available.  This automatically turns USE_NSEC off. +# +# Define USE_STDEV below if you want perf to care about the underlying device +# change being considered an inode change from the update-index perspective. +# +# Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks +# field that counts the on-disk footprint in 512-byte blocks. 
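These knobs are plain make variables, so they can be set on the command line or kept in a config.mak override file (which the Makefile includes further down). A hypothetical example for a platform missing strcasestr and mkdtemp:

	# config.mak (illustrative)
	NO_STRCASESTR = 1
	NO_MKDTEMP = 1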
+# +# Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8 +# +# Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72. +# +# Define NO_PERL_MAKEMAKER if you cannot use Makefiles generated by perl's +# MakeMaker (e.g. using ActiveState under Cygwin). +# +# Define NO_PERL if you do not want Perl scripts or libraries at all. +# +# Define INTERNAL_QSORT to use Git's implementation of qsort(), which +# is a simplified version of the merge sort used in glibc. This is +# recommended if Git triggers O(n^2) behavior in your platform's qsort(). +# +# Define NO_EXTERNAL_GREP if you don't want "perf grep" to ever call +# your external grep (e.g., if your system lacks grep, if its grep is +# broken, or spawning external process is slower than built-in grep perf has). + +PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE +	@$(SHELL_PATH) util/PERF-VERSION-GEN +-include PERF-VERSION-FILE + +uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') +uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not') +uname_O := $(shell sh -c 'uname -o 2>/dev/null || echo not') +uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not') +uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not') +uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not') + +# CFLAGS and LDFLAGS are for the users to override from the command line. + +CFLAGS = -ggdb3 -Wall -Werror -Wstrict-prototypes -Wmissing-declarations -Wmissing-prototypes -std=gnu99 -Wdeclaration-after-statement -O6 +LDFLAGS = -lpthread -lrt -lelf +ALL_CFLAGS = $(CFLAGS) +ALL_LDFLAGS = $(LDFLAGS) +STRIP ?= strip + +# Among the variables below, these: +#   perfexecdir +#   template_dir +#   mandir +#   infodir +#   htmldir +#   ETC_PERFCONFIG (but not sysconfdir) +# can be specified as a relative path some/where/else; +# this is interpreted as relative to $(prefix) and "perf" at +# runtime figures out where they are based on the path to the executable. +# This can help installing the suite in a relocatable way. + +prefix = $(HOME) +bindir_relative = bin +bindir = $(prefix)/$(bindir_relative) +mandir = share/man +infodir = share/info +perfexecdir = libexec/perf-core +sharedir = $(prefix)/share +template_dir = share/perf-core/templates +htmldir = share/doc/perf-doc +ifeq ($(prefix),/usr) +sysconfdir = /etc +ETC_PERFCONFIG = $(sysconfdir)/perfconfig +else +sysconfdir = $(prefix)/etc +ETC_PERFCONFIG = etc/perfconfig +endif +lib = lib +# DESTDIR= + +export prefix bindir sharedir sysconfdir + +CC = gcc +AR = ar +RM = rm -f +TAR = tar +FIND = find +INSTALL = install +RPMBUILD = rpmbuild +PTHREAD_LIBS = -lpthread + +# sparse is architecture-neutral, which means that we need to tell it +# explicitly what architecture to check for. Fix this up for yours.. +SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__ + + + +### --- END CONFIGURATION SECTION --- + +# Those must not be GNU-specific; they are shared with perl/ which may +# be built by a different compiler. (Note that this is an artifact now +# but it still might be nice to keep that distinction.) +BASIC_CFLAGS = +BASIC_LDFLAGS = + +# Guard against environment variables +BUILTIN_OBJS = +BUILT_INS = +COMPAT_CFLAGS = +COMPAT_OBJS = +LIB_H = +LIB_OBJS = +SCRIPT_PERL = +SCRIPT_SH = +TEST_PROGRAMS = + +# +# No scripts right now: +# + +# SCRIPT_SH += perf-am.sh + +# +# No Perl scripts right now: +# + +# SCRIPT_PERL += perf-add--interactive.perl + +SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \ +	  $(patsubst %.perl,%,$(SCRIPT_PERL)) + +# Empty... +EXTRA_PROGRAMS = + +# ... 
and all the rest that could be moved out of bindir to perfexecdir +PROGRAMS += $(EXTRA_PROGRAMS) + +# +# Single 'perf' binary right now: +# +PROGRAMS += perf + +# List built-in command $C whose implementation cmd_$C() is not in +# builtin-$C.o but is linked in as part of some other command. +# +# None right now: +# +# BUILT_INS += perf-init $X + +# what 'all' will build and 'install' will install, in perfexecdir +ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS) + +# what 'all' will build but not install in perfexecdir +OTHER_PROGRAMS = perf$X + +# Set paths to tools early so that they can be used for version tests. +ifndef SHELL_PATH +	SHELL_PATH = /bin/sh +endif +ifndef PERL_PATH +	PERL_PATH = /usr/bin/perl +endif + +export PERL_PATH + +LIB_FILE=libperf.a + +LIB_H += ../../include/linux/perf_counter.h +LIB_H += perf.h +LIB_H += util/list.h +LIB_H += util/rbtree.h +LIB_H += util/levenshtein.h +LIB_H += util/parse-options.h +LIB_H += util/parse-events.h +LIB_H += util/quote.h +LIB_H += util/util.h +LIB_H += util/help.h +LIB_H += util/strbuf.h +LIB_H += util/string.h +LIB_H += util/run-command.h +LIB_H += util/sigchain.h +LIB_H += util/symbol.h +LIB_H += util/color.h + +LIB_OBJS += util/abspath.o +LIB_OBJS += util/alias.o +LIB_OBJS += util/config.o +LIB_OBJS += util/ctype.o +LIB_OBJS += util/environment.o +LIB_OBJS += util/exec_cmd.o +LIB_OBJS += util/help.o +LIB_OBJS += util/levenshtein.o +LIB_OBJS += util/parse-options.o +LIB_OBJS += util/parse-events.o +LIB_OBJS += util/path.o +LIB_OBJS += util/rbtree.o +LIB_OBJS += util/run-command.o +LIB_OBJS += util/quote.o +LIB_OBJS += util/strbuf.o +LIB_OBJS += util/string.o +LIB_OBJS += util/usage.o +LIB_OBJS += util/wrapper.o +LIB_OBJS += util/sigchain.o +LIB_OBJS += util/symbol.o +LIB_OBJS += util/color.o +LIB_OBJS += util/pager.o + +BUILTIN_OBJS += builtin-annotate.o +BUILTIN_OBJS += builtin-help.o +BUILTIN_OBJS += builtin-list.o +BUILTIN_OBJS += builtin-record.o +BUILTIN_OBJS += builtin-report.o +BUILTIN_OBJS += builtin-stat.o +BUILTIN_OBJS += builtin-top.o + +PERFLIBS = $(LIB_FILE) +EXTLIBS = + +# +# Platform specific tweaks +# + +# We choose to avoid "if .. else if .. else .. endif endif" +# because maintaining the nesting to match is a pain.  If +# we had "elif" things would have been much nicer... + +-include config.mak.autogen +-include config.mak + +ifeq ($(uname_S),Darwin) +	ifndef NO_FINK +		ifeq ($(shell test -d /sw/lib && echo y),y) +			BASIC_CFLAGS += -I/sw/include +			BASIC_LDFLAGS += -L/sw/lib +		endif +	endif +	ifndef NO_DARWIN_PORTS +		ifeq ($(shell test -d /opt/local/lib && echo y),y) +			BASIC_CFLAGS += -I/opt/local/include +			BASIC_LDFLAGS += -L/opt/local/lib +		endif +	endif +	PTHREAD_LIBS = +endif + +ifndef CC_LD_DYNPATH +	ifdef NO_R_TO_GCC_LINKER +		# Some gcc does not accept and pass -R to the linker to specify +		# the runtime dynamic library path. 
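		# (Illustrative note, not part of the patch: CC_LD_DYNPATH
		# is concatenated directly with a directory, so the
		# -Wl,-rpath, assignment just below expands to e.g.
		# -Wl,-rpath,/foo/bar/lib, while the -R fallback expands to
		# -R/foo/bar/lib; compare the
		# $(CC_LD_DYNPATH)$(ZLIB_PATH)/$(lib) use further down.)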
+		CC_LD_DYNPATH = -Wl,-rpath, +	else +		CC_LD_DYNPATH = -R +	endif +endif + +ifdef ZLIB_PATH +	BASIC_CFLAGS += -I$(ZLIB_PATH)/include +	EXTLIBS += -L$(ZLIB_PATH)/$(lib) $(CC_LD_DYNPATH)$(ZLIB_PATH)/$(lib) +endif +EXTLIBS += -lz + +ifdef NEEDS_SOCKET +	EXTLIBS += -lsocket +endif +ifdef NEEDS_NSL +	EXTLIBS += -lnsl +endif +ifdef NO_D_TYPE_IN_DIRENT +	BASIC_CFLAGS += -DNO_D_TYPE_IN_DIRENT +endif +ifdef NO_D_INO_IN_DIRENT +	BASIC_CFLAGS += -DNO_D_INO_IN_DIRENT +endif +ifdef NO_ST_BLOCKS_IN_STRUCT_STAT +	BASIC_CFLAGS += -DNO_ST_BLOCKS_IN_STRUCT_STAT +endif +ifdef USE_NSEC +	BASIC_CFLAGS += -DUSE_NSEC +endif +ifdef USE_ST_TIMESPEC +	BASIC_CFLAGS += -DUSE_ST_TIMESPEC +endif +ifdef NO_NSEC +	BASIC_CFLAGS += -DNO_NSEC +endif +ifdef NO_C99_FORMAT +	BASIC_CFLAGS += -DNO_C99_FORMAT +endif +ifdef SNPRINTF_RETURNS_BOGUS +	COMPAT_CFLAGS += -DSNPRINTF_RETURNS_BOGUS +	COMPAT_OBJS += compat/snprintf.o +endif +ifdef FREAD_READS_DIRECTORIES +	COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES +	COMPAT_OBJS += compat/fopen.o +endif +ifdef NO_SYMLINK_HEAD +	BASIC_CFLAGS += -DNO_SYMLINK_HEAD +endif +ifdef NO_STRCASESTR +	COMPAT_CFLAGS += -DNO_STRCASESTR +	COMPAT_OBJS += compat/strcasestr.o +endif +ifdef NO_STRTOUMAX +	COMPAT_CFLAGS += -DNO_STRTOUMAX +	COMPAT_OBJS += compat/strtoumax.o +endif +ifdef NO_STRTOULL +	COMPAT_CFLAGS += -DNO_STRTOULL +endif +ifdef NO_SETENV +	COMPAT_CFLAGS += -DNO_SETENV +	COMPAT_OBJS += compat/setenv.o +endif +ifdef NO_MKDTEMP +	COMPAT_CFLAGS += -DNO_MKDTEMP +	COMPAT_OBJS += compat/mkdtemp.o +endif +ifdef NO_UNSETENV +	COMPAT_CFLAGS += -DNO_UNSETENV +	COMPAT_OBJS += compat/unsetenv.o +endif +ifdef NO_SYS_SELECT_H +	BASIC_CFLAGS += -DNO_SYS_SELECT_H +endif +ifdef NO_MMAP +	COMPAT_CFLAGS += -DNO_MMAP +	COMPAT_OBJS += compat/mmap.o +else +	ifdef USE_WIN32_MMAP +		COMPAT_CFLAGS += -DUSE_WIN32_MMAP +		COMPAT_OBJS += compat/win32mmap.o +	endif +endif +ifdef NO_PREAD +	COMPAT_CFLAGS += -DNO_PREAD +	COMPAT_OBJS += compat/pread.o +endif +ifdef NO_FAST_WORKING_DIRECTORY +	BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY +endif +ifdef NO_TRUSTABLE_FILEMODE +	BASIC_CFLAGS += -DNO_TRUSTABLE_FILEMODE +endif +ifdef NO_IPV6 +	BASIC_CFLAGS += -DNO_IPV6 +endif +ifdef NO_UINTMAX_T +	BASIC_CFLAGS += -Duintmax_t=uint32_t +endif +ifdef NO_SOCKADDR_STORAGE +ifdef NO_IPV6 +	BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in +else +	BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in6 +endif +endif +ifdef NO_INET_NTOP +	LIB_OBJS += compat/inet_ntop.o +endif +ifdef NO_INET_PTON +	LIB_OBJS += compat/inet_pton.o +endif + +ifdef NO_ICONV +	BASIC_CFLAGS += -DNO_ICONV +endif + +ifdef OLD_ICONV +	BASIC_CFLAGS += -DOLD_ICONV +endif + +ifdef NO_DEFLATE_BOUND +	BASIC_CFLAGS += -DNO_DEFLATE_BOUND +endif + +ifdef PPC_SHA1 +	SHA1_HEADER = "ppc/sha1.h" +	LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o +else +ifdef ARM_SHA1 +	SHA1_HEADER = "arm/sha1.h" +	LIB_OBJS += arm/sha1.o arm/sha1_arm.o +else +ifdef MOZILLA_SHA1 +	SHA1_HEADER = "mozilla-sha1/sha1.h" +	LIB_OBJS += mozilla-sha1/sha1.o +else +	SHA1_HEADER = <openssl/sha.h> +	EXTLIBS += $(LIB_4_CRYPTO) +endif +endif +endif +ifdef NO_PERL_MAKEMAKER +	export NO_PERL_MAKEMAKER +endif +ifdef NO_HSTRERROR +	COMPAT_CFLAGS += -DNO_HSTRERROR +	COMPAT_OBJS += compat/hstrerror.o +endif +ifdef NO_MEMMEM +	COMPAT_CFLAGS += -DNO_MEMMEM +	COMPAT_OBJS += compat/memmem.o +endif +ifdef INTERNAL_QSORT +	COMPAT_CFLAGS += -DINTERNAL_QSORT +	COMPAT_OBJS += compat/qsort.o +endif +ifdef RUNTIME_PREFIX +	COMPAT_CFLAGS += -DRUNTIME_PREFIX +endif + +ifdef DIR_HAS_BSD_GROUP_SEMANTICS +	COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS 
+endif +ifdef NO_EXTERNAL_GREP +	BASIC_CFLAGS += -DNO_EXTERNAL_GREP +endif + +ifeq ($(PERL_PATH),) +NO_PERL=NoThanks +endif + +QUIET_SUBDIR0  = +$(MAKE) -C # space to separate -C and subdir +QUIET_SUBDIR1  = + +ifneq ($(findstring $(MAKEFLAGS),w),w) +PRINT_DIR = --no-print-directory +else # "make -w" +NO_SUBDIR = : +endif + +ifneq ($(findstring $(MAKEFLAGS),s),s) +ifndef V +	QUIET_CC       = @echo '   ' CC $@; +	QUIET_AR       = @echo '   ' AR $@; +	QUIET_LINK     = @echo '   ' LINK $@; +	QUIET_BUILT_IN = @echo '   ' BUILTIN $@; +	QUIET_GEN      = @echo '   ' GEN $@; +	QUIET_SUBDIR0  = +@subdir= +	QUIET_SUBDIR1  = ;$(NO_SUBDIR) echo '   ' SUBDIR $$subdir; \ +			 $(MAKE) $(PRINT_DIR) -C $$subdir +	export V +	export QUIET_GEN +	export QUIET_BUILT_IN +endif +endif + +ifdef ASCIIDOC8 +	export ASCIIDOC8 +endif + +# Shell quote (do not use $(call) to accommodate ancient setups); + +SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER)) +ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG)) + +DESTDIR_SQ = $(subst ','\'',$(DESTDIR)) +bindir_SQ = $(subst ','\'',$(bindir)) +bindir_relative_SQ = $(subst ','\'',$(bindir_relative)) +mandir_SQ = $(subst ','\'',$(mandir)) +infodir_SQ = $(subst ','\'',$(infodir)) +perfexecdir_SQ = $(subst ','\'',$(perfexecdir)) +template_dir_SQ = $(subst ','\'',$(template_dir)) +htmldir_SQ = $(subst ','\'',$(htmldir)) +prefix_SQ = $(subst ','\'',$(prefix)) + +SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH)) +PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH)) + +LIBS = $(PERFLIBS) $(EXTLIBS) + +BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \ +	$(COMPAT_CFLAGS) +LIB_OBJS += $(COMPAT_OBJS) + +ALL_CFLAGS += $(BASIC_CFLAGS) +ALL_LDFLAGS += $(BASIC_LDFLAGS) + +export TAR INSTALL DESTDIR SHELL_PATH + + +### Build rules + +SHELL = $(SHELL_PATH) + +all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) PERF-BUILD-OPTIONS +ifneq (,$X) +	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';) +endif + +all:: + +please_set_SHELL_PATH_to_a_more_modern_shell: +	@$$(:) + +shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell + +strip: $(PROGRAMS) perf$X +	$(STRIP) $(STRIP_OPTS) $(PROGRAMS) perf$X + +perf.o: perf.c common-cmds.h PERF-CFLAGS +	$(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \ +		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \ +		$(ALL_CFLAGS) -c $(filter %.c,$^) + +perf$X: perf.o $(BUILTIN_OBJS) $(PERFLIBS) +	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ perf.o \ +		$(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS) + +builtin-help.o: builtin-help.c common-cmds.h PERF-CFLAGS +	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \ +		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \ +		'-DPERF_MAN_PATH="$(mandir_SQ)"' \ +		'-DPERF_INFO_PATH="$(infodir_SQ)"' $< + +$(BUILT_INS): perf$X +	$(QUIET_BUILT_IN)$(RM) $@ && \ +	ln perf$X $@ 2>/dev/null || \ +	ln -s perf$X $@ 2>/dev/null || \ +	cp perf$X $@ + +common-cmds.h: util/generate-cmdlist.sh command-list.txt + +common-cmds.h: $(wildcard Documentation/perf-*.txt) +	$(QUIET_GEN)util/generate-cmdlist.sh > $@+ && mv $@+ $@ + +$(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh +	$(QUIET_GEN)$(RM) $@ $@+ && \ +	sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \ +	    -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \ +	    -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \ +	    -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \ +	    -e 's/@@NO_CURL@@/$(NO_CURL)/g' \ +	    $@.sh >$@+ && \ +	chmod +x $@+ && \ +	mv $@+ $@ + +configure: configure.ac +	$(QUIET_GEN)$(RM) $@ $<+ && \ +	sed -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \ +	    $< > 
$<+ && \ +	autoconf -o $@ $<+ && \ +	$(RM) $<+ + +# These can record PERF_VERSION +perf.o perf.spec \ +	$(patsubst %.sh,%,$(SCRIPT_SH)) \ +	$(patsubst %.perl,%,$(SCRIPT_PERL)) \ +	: PERF-VERSION-FILE + +%.o: %.c PERF-CFLAGS +	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $< +%.s: %.c PERF-CFLAGS +	$(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $< +%.o: %.S +	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $< + +util/exec_cmd.o: util/exec_cmd.c PERF-CFLAGS +	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \ +		'-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \ +		'-DBINDIR="$(bindir_relative_SQ)"' \ +		'-DPREFIX="$(prefix_SQ)"' \ +		$< + +builtin-init-db.o: builtin-init-db.c PERF-CFLAGS +	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $< + +util/config.o: util/config.c PERF-CFLAGS +	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $< + +perf-%$X: %.o $(PERFLIBS) +	$(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS) + +$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H) +$(patsubst perf-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h) +builtin-revert.o wt-status.o: wt-status.h + +$(LIB_FILE): $(LIB_OBJS) +	$(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS) + +doc: +	$(MAKE) -C Documentation all + +man: +	$(MAKE) -C Documentation man + +html: +	$(MAKE) -C Documentation html + +info: +	$(MAKE) -C Documentation info + +pdf: +	$(MAKE) -C Documentation pdf + +TAGS: +	$(RM) TAGS +	$(FIND) . -name '*.[hcS]' -print | xargs etags -a + +tags: +	$(RM) tags +	$(FIND) . -name '*.[hcS]' -print | xargs ctags -a + +cscope: +	$(RM) cscope* +	$(FIND) . -name '*.[hcS]' -print | xargs cscope -b + +### Detect prefix changes +TRACK_CFLAGS = $(subst ','\'',$(ALL_CFLAGS)):\ +             $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ) + +PERF-CFLAGS: .FORCE-PERF-CFLAGS +	@FLAGS='$(TRACK_CFLAGS)'; \ +	    if test x"$$FLAGS" != x"`cat PERF-CFLAGS 2>/dev/null`" ; then \ +		echo 1>&2 "    * new build flags or prefix"; \ +		echo "$$FLAGS" >PERF-CFLAGS; \ +            fi + +# We need to apply sq twice, once to protect from the shell +# that runs PERF-BUILD-OPTIONS, and then again to protect it +# and the first level quoting from the shell that runs "echo". +PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS +	@echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@ +	@echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@ +	@echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@ +	@echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@ + +### Testing rules + +# +# None right now: +# +# TEST_PROGRAMS += test-something$X + +all:: $(TEST_PROGRAMS) + +# GNU make supports exporting all variables by "export" without parameters. +# However, the environment gets quite big, and some programs have problems +# with that. 
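# (Illustrative contrast, not from the patch: a bare "export" directive
# with no arguments would push every variable in this Makefile, LIB_OBJS
# and ALL_CFLAGS included, into the environment of each spawned shell;
# exporting only selected names, as below, keeps child environments
# small.)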
+ +export NO_SVN_TESTS + +check: common-cmds.h +	if sparse; \ +	then \ +		for i in *.c */*.c; \ +		do \ +			sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \ +		done; \ +	else \ +		echo 2>&1 "Did you mean 'make test'?"; \ +		exit 1; \ +	fi + +remove-dashes: +	./fixup-builtins $(BUILT_INS) $(PROGRAMS) $(SCRIPTS) + +### Installation rules + +ifneq ($(filter /%,$(firstword $(template_dir))),) +template_instdir = $(template_dir) +else +template_instdir = $(prefix)/$(template_dir) +endif +export template_instdir + +ifneq ($(filter /%,$(firstword $(perfexecdir))),) +perfexec_instdir = $(perfexecdir) +else +perfexec_instdir = $(prefix)/$(perfexecdir) +endif +perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir)) +export perfexec_instdir + +install: all +	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)' +	$(INSTALL) perf$X '$(DESTDIR_SQ)$(bindir_SQ)' +ifdef BUILT_INS +	$(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' +	$(INSTALL) $(BUILT_INS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' +ifneq (,$X) +	$(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';) +endif +endif + +install-doc: +	$(MAKE) -C Documentation install + +install-man: +	$(MAKE) -C Documentation install-man + +install-html: +	$(MAKE) -C Documentation install-html + +install-info: +	$(MAKE) -C Documentation install-info + +install-pdf: +	$(MAKE) -C Documentation install-pdf + +quick-install-doc: +	$(MAKE) -C Documentation quick-install + +quick-install-man: +	$(MAKE) -C Documentation quick-install-man + +quick-install-html: +	$(MAKE) -C Documentation quick-install-html + + +### Maintainer's dist rules +# +# None right now +# +# +# perf.spec: perf.spec.in +#	sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+ +#	mv $@+ $@ +# +# PERF_TARNAME=perf-$(PERF_VERSION) +# dist: perf.spec perf-archive$(X) configure +#	./perf-archive --format=tar \ +#		--prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar +#	@mkdir -p $(PERF_TARNAME) +#	@cp perf.spec configure $(PERF_TARNAME) +#	@echo $(PERF_VERSION) > $(PERF_TARNAME)/version +#	$(TAR) rf $(PERF_TARNAME).tar \ +#		$(PERF_TARNAME)/perf.spec \ +#		$(PERF_TARNAME)/configure \ +#		$(PERF_TARNAME)/version +#	@$(RM) -r $(PERF_TARNAME) +#	gzip -f -9 $(PERF_TARNAME).tar +# +# htmldocs = perf-htmldocs-$(PERF_VERSION) +# manpages = perf-manpages-$(PERF_VERSION) +# dist-doc: +#	$(RM) -r .doc-tmp-dir +#	mkdir .doc-tmp-dir +#	$(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc +#	cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar . +#	gzip -n -9 -f $(htmldocs).tar +#	: +#	$(RM) -r .doc-tmp-dir +#	mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7 +#	$(MAKE) -C Documentation DESTDIR=./ \ +#		man1dir=../.doc-tmp-dir/man1 \ +#		man5dir=../.doc-tmp-dir/man5 \ +#		man7dir=../.doc-tmp-dir/man7 \ +#		install +#	cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar . 
+#	gzip -n -9 -f $(manpages).tar +#	$(RM) -r .doc-tmp-dir +# +# rpm: dist +#	$(RPMBUILD) -ta $(PERF_TARNAME).tar.gz + +### Cleaning rules + +distclean: clean +#	$(RM) configure + +clean: +	$(RM) *.o */*.o $(LIB_FILE) +	$(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X +	$(RM) $(TEST_PROGRAMS) +	$(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope* +	$(RM) -r autom4te.cache +	$(RM) config.log config.mak.autogen config.mak.append config.status config.cache +	$(RM) -r $(PERF_TARNAME) .doc-tmp-dir +	$(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz +	$(RM) $(htmldocs).tar.gz $(manpages).tar.gz +	$(MAKE) -C Documentation/ clean +	$(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS + +.PHONY: all install clean strip +.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell +.PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS +.PHONY: .FORCE-PERF-BUILD-OPTIONS + +### Make sure built-ins do not have dups and listed in perf.c +# +check-builtins:: +	./check-builtins.sh + +### Test suite coverage testing +# +# None right now +# +# .PHONY: coverage coverage-clean coverage-build coverage-report +# +# coverage: +#	$(MAKE) coverage-build +#	$(MAKE) coverage-report +# +# coverage-clean: +#	rm -f *.gcda *.gcno +# +# COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs +# COVERAGE_LDFLAGS = $(CFLAGS)  -O0 -lgcov +# +# coverage-build: coverage-clean +#	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all +#	$(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \ +#		-j1 test +# +# coverage-report: +#	gcov -b *.c */*.c +#	grep '^function.*called 0 ' *.c.gcov */*.c.gcov \ +#		| sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \ +#		| tee coverage-untested-functions diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c new file mode 100644 index 00000000000..b1ed5f766cb --- /dev/null +++ b/tools/perf/builtin-annotate.c @@ -0,0 +1,1356 @@ +/* + * builtin-annotate.c + * + * Builtin annotate command: Analyze the perf.data input file, + * look up and read DSOs and symbol information and display + * a histogram of results, along various sorting keys. + */ +#include "builtin.h" + +#include "util/util.h" + +#include "util/color.h" +#include "util/list.h" +#include "util/cache.h" +#include "util/rbtree.h" +#include "util/symbol.h" +#include "util/string.h" + +#include "perf.h" + +#include "util/parse-options.h" +#include "util/parse-events.h" + +#define SHOW_KERNEL	1 +#define SHOW_USER	2 +#define SHOW_HV		4 + +static char		const *input_name = "perf.data"; +static char		*vmlinux = "vmlinux"; + +static char		default_sort_order[] = "comm,symbol"; +static char		*sort_order = default_sort_order; + +static int		input; +static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV; + +static int		dump_trace = 0; +#define dprintf(x...)	
do { if (dump_trace) printf(x); } while (0) + +static int		verbose; + +static unsigned long	page_size; +static unsigned long	mmap_window = 32; + +struct ip_event { +	struct perf_event_header header; +	__u64 ip; +	__u32 pid, tid; +}; + +struct mmap_event { +	struct perf_event_header header; +	__u32 pid, tid; +	__u64 start; +	__u64 len; +	__u64 pgoff; +	char filename[PATH_MAX]; +}; + +struct comm_event { +	struct perf_event_header header; +	__u32 pid, tid; +	char comm[16]; +}; + +struct fork_event { +	struct perf_event_header header; +	__u32 pid, ppid; +}; + +struct period_event { +	struct perf_event_header header; +	__u64 time; +	__u64 id; +	__u64 sample_period; +}; + +typedef union event_union { +	struct perf_event_header	header; +	struct ip_event			ip; +	struct mmap_event		mmap; +	struct comm_event		comm; +	struct fork_event		fork; +	struct period_event		period; +} event_t; + +static LIST_HEAD(dsos); +static struct dso *kernel_dso; +static struct dso *vdso; + + +static void dsos__add(struct dso *dso) +{ +	list_add_tail(&dso->node, &dsos); +} + +static struct dso *dsos__find(const char *name) +{ +	struct dso *pos; + +	list_for_each_entry(pos, &dsos, node) +		if (strcmp(pos->name, name) == 0) +			return pos; +	return NULL; +} + +static struct dso *dsos__findnew(const char *name) +{ +	struct dso *dso = dsos__find(name); +	int nr; + +	if (dso) +		return dso; + +	dso = dso__new(name, 0); +	if (!dso) +		goto out_delete_dso; + +	nr = dso__load(dso, NULL, verbose); +	if (nr < 0) { +		if (verbose) +			fprintf(stderr, "Failed to open: %s\n", name); +		goto out_delete_dso; +	} +	if (!nr && verbose) { +		fprintf(stderr, +		"No symbols found in: %s, maybe install a debug package?\n", +				name); +	} + +	dsos__add(dso); + +	return dso; + +out_delete_dso: +	dso__delete(dso); +	return NULL; +} + +static void dsos__fprintf(FILE *fp) +{ +	struct dso *pos; + +	list_for_each_entry(pos, &dsos, node) +		dso__fprintf(pos, fp); +} + +static struct symbol *vdso__find_symbol(struct dso *dso, __u64 ip) +{ +	return dso__find_symbol(kernel_dso, ip); +} + +static int load_kernel(void) +{ +	int err; + +	kernel_dso = dso__new("[kernel]", 0); +	if (!kernel_dso) +		return -1; + +	err = dso__load_kernel(kernel_dso, vmlinux, NULL, verbose); +	if (err) { +		dso__delete(kernel_dso); +		kernel_dso = NULL; +	} else +		dsos__add(kernel_dso); + +	vdso = dso__new("[vdso]", 0); +	if (!vdso) +		return -1; + +	vdso->find_symbol = vdso__find_symbol; + +	dsos__add(vdso); + +	return err; +} + +struct map { +	struct list_head node; +	__u64	 start; +	__u64	 end; +	__u64	 pgoff; +	__u64	 (*map_ip)(struct map *, __u64); +	struct dso	 *dso; +}; + +static __u64 map__map_ip(struct map *map, __u64 ip) +{ +	return ip - map->start + map->pgoff; +} + +static __u64 vdso__map_ip(struct map *map, __u64 ip) +{ +	return ip; +} + +static struct map *map__new(struct mmap_event *event) +{ +	struct map *self = malloc(sizeof(*self)); + +	if (self != NULL) { +		const char *filename = event->filename; + +		self->start = event->start; +		self->end   = event->start + event->len; +		self->pgoff = event->pgoff; + +		self->dso = dsos__findnew(filename); +		if (self->dso == NULL) +			goto out_delete; + +		if (self->dso == vdso) +			self->map_ip = vdso__map_ip; +		else +			self->map_ip = map__map_ip; +	} +	return self; +out_delete: +	free(self); +	return NULL; +} + +static struct map *map__clone(struct map *self) +{ +	struct map *map = malloc(sizeof(*self)); + +	if (!map) +		return NULL; + +	memcpy(map, self, sizeof(*self)); + +	return map; +} + +static int 
map__overlap(struct map *l, struct map *r) +{ +	if (l->start > r->start) { +		struct map *t = l; +		l = r; +		r = t; +	} + +	if (l->end > r->start) +		return 1; + +	return 0; +} + +static size_t map__fprintf(struct map *self, FILE *fp) +{ +	return fprintf(fp, " %Lx-%Lx %Lx %s\n", +		       self->start, self->end, self->pgoff, self->dso->name); +} + + +struct thread { +	struct rb_node	 rb_node; +	struct list_head maps; +	pid_t		 pid; +	char		 *comm; +}; + +static struct thread *thread__new(pid_t pid) +{ +	struct thread *self = malloc(sizeof(*self)); + +	if (self != NULL) { +		self->pid = pid; +		self->comm = malloc(32); +		if (self->comm) +			snprintf(self->comm, 32, ":%d", self->pid); +		INIT_LIST_HEAD(&self->maps); +	} + +	return self; +} + +static int thread__set_comm(struct thread *self, const char *comm) +{ +	if (self->comm) +		free(self->comm); +	self->comm = strdup(comm); +	return self->comm ? 0 : -ENOMEM; +} + +static size_t thread__fprintf(struct thread *self, FILE *fp) +{ +	struct map *pos; +	size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm); + +	list_for_each_entry(pos, &self->maps, node) +		ret += map__fprintf(pos, fp); + +	return ret; +} + + +static struct rb_root threads; +static struct thread *last_match; + +static struct thread *threads__findnew(pid_t pid) +{ +	struct rb_node **p = &threads.rb_node; +	struct rb_node *parent = NULL; +	struct thread *th; + +	/* +	 * Front-end cache - PID lookups come in blocks, +	 * so most of the time we don't have to look up +	 * the full rbtree: +	 */ +	if (last_match && last_match->pid == pid) +		return last_match; + +	while (*p != NULL) { +		parent = *p; +		th = rb_entry(parent, struct thread, rb_node); + +		if (th->pid == pid) { +			last_match = th; +			return th; +		} + +		if (pid < th->pid) +			p = &(*p)->rb_left; +		else +			p = &(*p)->rb_right; +	} + +	th = thread__new(pid); +	if (th != NULL) { +		rb_link_node(&th->rb_node, parent, p); +		rb_insert_color(&th->rb_node, &threads); +		last_match = th; +	} + +	return th; +} + +static void thread__insert_map(struct thread *self, struct map *map) +{ +	struct map *pos, *tmp; + +	list_for_each_entry_safe(pos, tmp, &self->maps, node) { +		if (map__overlap(pos, map)) { +			list_del_init(&pos->node); +			/* XXX leaks dsos */ +			free(pos); +		} +	} + +	list_add_tail(&map->node, &self->maps); +} + +static int thread__fork(struct thread *self, struct thread *parent) +{ +	struct map *map; + +	if (self->comm) +		free(self->comm); +	self->comm = strdup(parent->comm); +	if (!self->comm) +		return -ENOMEM; + +	list_for_each_entry(map, &parent->maps, node) { +		struct map *new = map__clone(map); +		if (!new) +			return -ENOMEM; +		thread__insert_map(self, new); +	} + +	return 0; +} + +static struct map *thread__find_map(struct thread *self, __u64 ip) +{ +	struct map *pos; + +	if (self == NULL) +		return NULL; + +	list_for_each_entry(pos, &self->maps, node) +		if (ip >= pos->start && ip <= pos->end) +			return pos; + +	return NULL; +} + +static size_t threads__fprintf(FILE *fp) +{ +	size_t ret = 0; +	struct rb_node *nd; + +	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) { +		struct thread *pos = rb_entry(nd, struct thread, rb_node); + +		ret += thread__fprintf(pos, fp); +	} + +	return ret; +} + +/* + * histogram, sorted on item, collects counts + */ + +static struct rb_root hist; + +struct hist_entry { +	struct rb_node	 rb_node; + +	struct thread	 *thread; +	struct map	 *map; +	struct dso	 *dso; +	struct symbol	 *sym; +	__u64	 ip; +	char		 level; + +	uint32_t	 count; +}; + +/* + * 
configurable sorting bits + */ + +struct sort_entry { +	struct list_head list; + +	char *header; + +	int64_t (*cmp)(struct hist_entry *, struct hist_entry *); +	int64_t (*collapse)(struct hist_entry *, struct hist_entry *); +	size_t	(*print)(FILE *fp, struct hist_entry *); +}; + +/* --sort pid */ + +static int64_t +sort__thread_cmp(struct hist_entry *left, struct hist_entry *right) +{ +	return right->thread->pid - left->thread->pid; +} + +static size_t +sort__thread_print(FILE *fp, struct hist_entry *self) +{ +	return fprintf(fp, "%16s:%5d", self->thread->comm ?: "", self->thread->pid); +} + +static struct sort_entry sort_thread = { +	.header = "         Command:  Pid", +	.cmp	= sort__thread_cmp, +	.print	= sort__thread_print, +}; + +/* --sort comm */ + +static int64_t +sort__comm_cmp(struct hist_entry *left, struct hist_entry *right) +{ +	return right->thread->pid - left->thread->pid; +} + +static int64_t +sort__comm_collapse(struct hist_entry *left, struct hist_entry *right) +{ +	char *comm_l = left->thread->comm; +	char *comm_r = right->thread->comm; + +	if (!comm_l || !comm_r) { +		if (!comm_l && !comm_r) +			return 0; +		else if (!comm_l) +			return -1; +		else +			return 1; +	} + +	return strcmp(comm_l, comm_r); +} + +static size_t +sort__comm_print(FILE *fp, struct hist_entry *self) +{ +	return fprintf(fp, "%16s", self->thread->comm); +} + +static struct sort_entry sort_comm = { +	.header		= "         Command", +	.cmp		= sort__comm_cmp, +	.collapse	= sort__comm_collapse, +	.print		= sort__comm_print, +}; + +/* --sort dso */ + +static int64_t +sort__dso_cmp(struct hist_entry *left, struct hist_entry *right) +{ +	struct dso *dso_l = left->dso; +	struct dso *dso_r = right->dso; + +	if (!dso_l || !dso_r) { +		if (!dso_l && !dso_r) +			return 0; +		else if (!dso_l) +			return -1; +		else +			return 1; +	} + +	return strcmp(dso_l->name, dso_r->name); +} + +static size_t +sort__dso_print(FILE *fp, struct hist_entry *self) +{ +	if (self->dso) +		return fprintf(fp, "%-25s", self->dso->name); + +	return fprintf(fp, "%016llx         ", (__u64)self->ip); +} + +static struct sort_entry sort_dso = { +	.header = "Shared Object            ", +	.cmp	= sort__dso_cmp, +	.print	= sort__dso_print, +}; + +/* --sort symbol */ + +static int64_t +sort__sym_cmp(struct hist_entry *left, struct hist_entry *right) +{ +	__u64 ip_l, ip_r; + +	if (left->sym == right->sym) +		return 0; + +	ip_l = left->sym ? left->sym->start : left->ip; +	ip_r = right->sym ? right->sym->start : right->ip; + +	return (int64_t)(ip_r - ip_l); +} + +static size_t +sort__sym_print(FILE *fp, struct hist_entry *self) +{ +	size_t ret = 0; + +	if (verbose) +		ret += fprintf(fp, "%#018llx  ", (__u64)self->ip); + +	if (self->sym) { +		ret += fprintf(fp, "[%c] %s", +			self->dso == kernel_dso ? 
'k' : '.', self->sym->name); +	} else { +		ret += fprintf(fp, "%#016llx", (__u64)self->ip); +	} + +	return ret; +} + +static struct sort_entry sort_sym = { +	.header = "Symbol", +	.cmp	= sort__sym_cmp, +	.print	= sort__sym_print, +}; + +static int sort__need_collapse = 0; + +struct sort_dimension { +	char			*name; +	struct sort_entry	*entry; +	int			taken; +}; + +static struct sort_dimension sort_dimensions[] = { +	{ .name = "pid",	.entry = &sort_thread,	}, +	{ .name = "comm",	.entry = &sort_comm,	}, +	{ .name = "dso",	.entry = &sort_dso,	}, +	{ .name = "symbol",	.entry = &sort_sym,	}, +}; + +static LIST_HEAD(hist_entry__sort_list); + +static int sort_dimension__add(char *tok) +{ +	int i; + +	for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) { +		struct sort_dimension *sd = &sort_dimensions[i]; + +		if (sd->taken) +			continue; + +		if (strncasecmp(tok, sd->name, strlen(tok))) +			continue; + +		if (sd->entry->collapse) +			sort__need_collapse = 1; + +		list_add_tail(&sd->entry->list, &hist_entry__sort_list); +		sd->taken = 1; + +		return 0; +	} + +	return -ESRCH; +} + +static int64_t +hist_entry__cmp(struct hist_entry *left, struct hist_entry *right) +{ +	struct sort_entry *se; +	int64_t cmp = 0; + +	list_for_each_entry(se, &hist_entry__sort_list, list) { +		cmp = se->cmp(left, right); +		if (cmp) +			break; +	} + +	return cmp; +} + +static int64_t +hist_entry__collapse(struct hist_entry *left, struct hist_entry *right) +{ +	struct sort_entry *se; +	int64_t cmp = 0; + +	list_for_each_entry(se, &hist_entry__sort_list, list) { +		int64_t (*f)(struct hist_entry *, struct hist_entry *); + +		f = se->collapse ?: se->cmp; + +		cmp = f(left, right); +		if (cmp) +			break; +	} + +	return cmp; +} + +/* + * collect histogram counts + */ +static void hist_hit(struct hist_entry *he, __u64 ip) +{ +	unsigned int sym_size, offset; +	struct symbol *sym = he->sym; + +	he->count++; + +	if (!sym || !sym->hist) +		return; + +	sym_size = sym->end - sym->start; +	offset = ip - sym->start; + +	if (offset >= sym_size) +		return; + +	sym->hist_sum++; +	sym->hist[offset]++; + +	if (verbose >= 3) +		printf("%p %s: count++ [ip: %p, %08Lx] => %Ld\n", +			(void *)(unsigned long)he->sym->start, +			he->sym->name, +			(void *)(unsigned long)ip, ip - he->sym->start, +			sym->hist[offset]); +} + +static int +hist_entry__add(struct thread *thread, struct map *map, struct dso *dso, +		struct symbol *sym, __u64 ip, char level) +{ +	struct rb_node **p = &hist.rb_node; +	struct rb_node *parent = NULL; +	struct hist_entry *he; +	struct hist_entry entry = { +		.thread	= thread, +		.map	= map, +		.dso	= dso, +		.sym	= sym, +		.ip	= ip, +		.level	= level, +		.count	= 1, +	}; +	int cmp; + +	while (*p != NULL) { +		parent = *p; +		he = rb_entry(parent, struct hist_entry, rb_node); + +		cmp = hist_entry__cmp(&entry, he); + +		if (!cmp) { +			hist_hit(he, ip); + +			return 0; +		} + +		if (cmp < 0) +			p = &(*p)->rb_left; +		else +			p = &(*p)->rb_right; +	} + +	he = malloc(sizeof(*he)); +	if (!he) +		return -ENOMEM; +	*he = entry; +	rb_link_node(&he->rb_node, parent, p); +	rb_insert_color(&he->rb_node, &hist); + +	return 0; +} + +static void hist_entry__free(struct hist_entry *he) +{ +	free(he); +} + +/* + * collapse the histogram + */ + +static struct rb_root collapse_hists; + +static void collapse__insert_entry(struct hist_entry *he) +{ +	struct rb_node **p = &collapse_hists.rb_node; +	struct rb_node *parent = NULL; +	struct hist_entry *iter; +	int64_t cmp; + +	while (*p != NULL) { +		parent = *p; +		iter = rb_entry(parent, struct 
hist_entry, rb_node); + +		cmp = hist_entry__collapse(iter, he); + +		if (!cmp) { +			iter->count += he->count; +			hist_entry__free(he); +			return; +		} + +		if (cmp < 0) +			p = &(*p)->rb_left; +		else +			p = &(*p)->rb_right; +	} + +	rb_link_node(&he->rb_node, parent, p); +	rb_insert_color(&he->rb_node, &collapse_hists); +} + +static void collapse__resort(void) +{ +	struct rb_node *next; +	struct hist_entry *n; + +	if (!sort__need_collapse) +		return; + +	next = rb_first(&hist); +	while (next) { +		n = rb_entry(next, struct hist_entry, rb_node); +		next = rb_next(&n->rb_node); + +		rb_erase(&n->rb_node, &hist); +		collapse__insert_entry(n); +	} +} + +/* + * reverse the map, sort on count. + */ + +static struct rb_root output_hists; + +static void output__insert_entry(struct hist_entry *he) +{ +	struct rb_node **p = &output_hists.rb_node; +	struct rb_node *parent = NULL; +	struct hist_entry *iter; + +	while (*p != NULL) { +		parent = *p; +		iter = rb_entry(parent, struct hist_entry, rb_node); + +		if (he->count > iter->count) +			p = &(*p)->rb_left; +		else +			p = &(*p)->rb_right; +	} + +	rb_link_node(&he->rb_node, parent, p); +	rb_insert_color(&he->rb_node, &output_hists); +} + +static void output__resort(void) +{ +	struct rb_node *next; +	struct hist_entry *n; +	struct rb_root *tree = &hist; + +	if (sort__need_collapse) +		tree = &collapse_hists; + +	next = rb_first(tree); + +	while (next) { +		n = rb_entry(next, struct hist_entry, rb_node); +		next = rb_next(&n->rb_node); + +		rb_erase(&n->rb_node, tree); +		output__insert_entry(n); +	} +} + +static void register_idle_thread(void) +{ +	struct thread *thread = threads__findnew(0); + +	if (thread == NULL || +			thread__set_comm(thread, "[idle]")) { +		fprintf(stderr, "problem inserting idle task.\n"); +		exit(-1); +	} +} + +static unsigned long total = 0, +		     total_mmap = 0, +		     total_comm = 0, +		     total_fork = 0, +		     total_unknown = 0; + +static int +process_overflow_event(event_t *event, unsigned long offset, unsigned long head) +{ +	char level; +	int show = 0; +	struct dso *dso = NULL; +	struct thread *thread = threads__findnew(event->ip.pid); +	__u64 ip = event->ip.ip; +	struct map *map = NULL; + +	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p\n", +		(void *)(offset + head), +		(void *)(long)(event->header.size), +		event->header.misc, +		event->ip.pid, +		(void *)(long)ip); + +	if (thread == NULL) { +		fprintf(stderr, "problem processing %d event, skipping it.\n", +			event->header.type); +		return -1; +	} + +	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid); + +	if (event->header.misc & PERF_EVENT_MISC_KERNEL) { +		show = SHOW_KERNEL; +		level = 'k'; + +		dso = kernel_dso; + +		dprintf(" ...... dso: %s\n", dso->name); + +	} else if (event->header.misc & PERF_EVENT_MISC_USER) { + +		show = SHOW_USER; +		level = '.'; + +		map = thread__find_map(thread, ip); +		if (map != NULL) { +			ip = map->map_ip(map, ip); +			dso = map->dso; +		} else { +			/* +			 * If this is outside of all known maps, +			 * and is a negative address, try to look it +			 * up in the kernel dso, as it might be a +			 * vsyscall (which executes in user-mode): +			 */ +			if ((long long)ip < 0) +				dso = kernel_dso; +		} +		dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>"); + +	} else { +		show = SHOW_HV; +		level = 'H'; +		dprintf(" ...... 
dso: [hypervisor]\n"); +	} + +	if (show & show_mask) { +		struct symbol *sym = NULL; + +		if (dso) +			sym = dso->find_symbol(dso, ip); + +		if (hist_entry__add(thread, map, dso, sym, ip, level)) { +			fprintf(stderr, +		"problem incrementing symbol count, skipping event\n"); +			return -1; +		} +	} +	total++; + +	return 0; +} + +static int +process_mmap_event(event_t *event, unsigned long offset, unsigned long head) +{ +	struct thread *thread = threads__findnew(event->mmap.pid); +	struct map *map = map__new(&event->mmap); + +	dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n", +		(void *)(offset + head), +		(void *)(long)(event->header.size), +		event->mmap.pid, +		(void *)(long)event->mmap.start, +		(void *)(long)event->mmap.len, +		(void *)(long)event->mmap.pgoff, +		event->mmap.filename); + +	if (thread == NULL || map == NULL) { +		dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n"); +		return 0; +	} + +	thread__insert_map(thread, map); +	total_mmap++; + +	return 0; +} + +static int +process_comm_event(event_t *event, unsigned long offset, unsigned long head) +{ +	struct thread *thread = threads__findnew(event->comm.pid); + +	dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n", +		(void *)(offset + head), +		(void *)(long)(event->header.size), +		event->comm.comm, event->comm.pid); + +	if (thread == NULL || +	    thread__set_comm(thread, event->comm.comm)) { +		dprintf("problem processing PERF_EVENT_COMM, skipping event.\n"); +		return -1; +	} +	total_comm++; + +	return 0; +} + +static int +process_fork_event(event_t *event, unsigned long offset, unsigned long head) +{ +	struct thread *thread = threads__findnew(event->fork.pid); +	struct thread *parent = threads__findnew(event->fork.ppid); + +	dprintf("%p [%p]: PERF_EVENT_FORK: %d:%d\n", +		(void *)(offset + head), +		(void *)(long)(event->header.size), +		event->fork.pid, event->fork.ppid); + +	if (!thread || !parent || thread__fork(thread, parent)) { +		dprintf("problem processing PERF_EVENT_FORK, skipping event.\n"); +		return -1; +	} +	total_fork++; + +	return 0; +} + +static int +process_period_event(event_t *event, unsigned long offset, unsigned long head) +{ +	dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n", +		(void *)(offset + head), +		(void *)(long)(event->header.size), +		event->period.time, +		event->period.id, +		event->period.sample_period); + +	return 0; +} + +static int +process_event(event_t *event, unsigned long offset, unsigned long head) +{ +	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) +		return process_overflow_event(event, offset, head); + +	switch (event->header.type) { +	case PERF_EVENT_MMAP: +		return process_mmap_event(event, offset, head); + +	case PERF_EVENT_COMM: +		return process_comm_event(event, offset, head); + +	case PERF_EVENT_FORK: +		return process_fork_event(event, offset, head); + +	case PERF_EVENT_PERIOD: +		return process_period_event(event, offset, head); +	/* +	 * We don't process them right now but they are fine: +	 */ + +	case PERF_EVENT_THROTTLE: +	case PERF_EVENT_UNTHROTTLE: +		return 0; + +	default: +		return -1; +	} + +	return 0; +} + +static int +parse_line(FILE *file, struct symbol *sym, __u64 start, __u64 len) +{ +	char *line = NULL, *tmp, *tmp2; +	unsigned int offset; +	size_t line_len; +	__u64 line_ip; +	int ret; +	char *c; + +	if (getline(&line, &line_len, file) < 0) +		return -1; +	if (!line) +		return -1; + +	c = strchr(line, '\n'); +	if (c) +		*c = 0; + +	line_ip = -1; +	offset = 0; +	ret = -2; + +	/* +	 * Strip leading spaces: +	 */ +	
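/* +	 * A hypothetical example of an objdump -dS disassembly line +	 * this parser accepts: +	 * +	 *   "  c01000a6:	55	push   %ebp" +	 * +	 * i.e. optional leading spaces, a hex address, then ':'. +	 */ +	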
tmp = line; +	while (*tmp) { +		if (*tmp != ' ') +			break; +		tmp++; +	} + +	if (*tmp) { +		/* +		 * Parse hex addresses followed by ':' +		 */ +		line_ip = strtoull(tmp, &tmp2, 16); +		if (*tmp2 != ':') +			line_ip = -1; +	} + +	if (line_ip != -1) { +		unsigned int hits = 0; +		double percent = 0.0; +		char *color = PERF_COLOR_NORMAL; + +		offset = line_ip - start; +		if (offset < len) +			hits = sym->hist[offset]; + +		if (sym->hist_sum) +			percent = 100.0 * hits / sym->hist_sum; + +		/* +		 * We color high-overhead entries in red, mid-overhead +		 * entries in green - and keep the low overhead places +		 * normal: +		 */ +		if (percent >= 5.0) +			color = PERF_COLOR_RED; +		else { +			if (percent > 0.5) +				color = PERF_COLOR_GREEN; +		} + +		color_fprintf(stdout, color, " %7.2f", percent); +		printf(" :	"); +		color_fprintf(stdout, PERF_COLOR_BLUE, "%s\n", line); +	} else { +		if (!*line) +			printf("         :\n"); +		else +			printf("         :	%s\n", line); +	} + +	return 0; +} + +static void annotate_sym(struct dso *dso, struct symbol *sym) +{ +	char *filename = dso->name; +	__u64 start, end, len; +	char command[PATH_MAX*2]; +	FILE *file; + +	if (!filename) +		return; +	if (dso == kernel_dso) +		filename = vmlinux; + +	printf("\n------------------------------------------------\n"); +	printf(" Percent |	Source code & Disassembly of %s\n", filename); +	printf("------------------------------------------------\n"); + +	if (verbose >= 2) +		printf("annotating [%p] %30s : [%p] %30s\n", dso, dso->name, sym, sym->name); + +	start = sym->obj_start; +	if (!start) +		start = sym->start; + +	end = start + sym->end - sym->start + 1; +	len = sym->end - sym->start; + +	sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s", (__u64)start, (__u64)end, filename); + +	if (verbose >= 3) +		printf("doing: %s\n", command); + +	file = popen(command, "r"); +	if (!file) +		return; + +	while (!feof(file)) { +		if (parse_line(file, sym, start, len) < 0) +			break; +	} + +	pclose(file); +} + +static void find_annotations(void) +{ +	struct rb_node *nd; +	struct dso *dso; +	int count = 0; + +	list_for_each_entry(dso, &dsos, node) { + +		for (nd = rb_first(&dso->syms); nd; nd = rb_next(nd)) { +			struct symbol *sym = rb_entry(nd, struct symbol, rb_node); + +			if (sym->hist) { +				annotate_sym(dso, sym); +				count++; +			} +		} +	} + +	if (!count) +		printf(" Error: symbol '%s' not present amongst the samples.\n", sym_hist_filter); +} + +static int __cmd_annotate(void) +{ +	int ret, rc = EXIT_FAILURE; +	unsigned long offset = 0; +	unsigned long head = 0; +	struct stat stat; +	event_t *event; +	uint32_t size; +	char *buf; + +	register_idle_thread(); + +	input = open(input_name, O_RDONLY); +	if (input < 0) { +		perror("failed to open file"); +		exit(-1); +	} + +	ret = fstat(input, &stat); +	if (ret < 0) { +		perror("failed to stat file"); +		exit(-1); +	} + +	if (!stat.st_size) { +		fprintf(stderr, "zero-sized file, nothing to do!\n"); +		exit(0); +	} + +	if (load_kernel() < 0) { +		perror("failed to load kernel symbols"); +		return EXIT_FAILURE; +	} + +remap: +	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ, +			   MAP_SHARED, input, offset); +	if (buf == MAP_FAILED) { +		perror("failed to mmap file"); +		exit(-1); +	} + +more: +	event = (event_t *)(buf + head); + +	size = event->header.size; +	if (!size) +		size = 8; + +	if (head + event->header.size >= page_size * mmap_window) { +		unsigned long shift = page_size * (head / page_size); +		int ret; + +		ret = 
munmap(buf, page_size * mmap_window); +	assert(ret == 0); + +	offset += shift; +	head -= shift; +	goto remap; +	} + +	size = event->header.size; + +	dprintf("%p [%p]: event: %d\n", +			(void *)(offset + head), +			(void *)(long)event->header.size, +			event->header.type); + +	if (!size || process_event(event, offset, head) < 0) { + +		dprintf("%p [%p]: skipping unknown header type: %d\n", +			(void *)(offset + head), +			(void *)(long)(event->header.size), +			event->header.type); + +		total_unknown++; + +		/* +		 * assume we lost track of the stream, check alignment, and +		 * increment a single u64 in the hope to catch on again 'soon'. +		 */ + +		if (unlikely(head & 7)) +			head &= ~7ULL; + +		size = 8; +	} + +	head += size; + +	if (offset + head < stat.st_size) +		goto more; + +	rc = EXIT_SUCCESS; +	close(input); + +	dprintf("      IP events: %10ld\n", total); +	dprintf("    mmap events: %10ld\n", total_mmap); +	dprintf("    comm events: %10ld\n", total_comm); +	dprintf("    fork events: %10ld\n", total_fork); +	dprintf(" unknown events: %10ld\n", total_unknown); + +	if (dump_trace) +		return 0; + +	if (verbose >= 3) +		threads__fprintf(stdout); + +	if (verbose >= 2) +		dsos__fprintf(stdout); + +	collapse__resort(); +	output__resort(); + +	find_annotations(); + +	return rc; +} + +static const char * const annotate_usage[] = { +	"perf annotate [<options>] <command>", +	NULL +}; + +static const struct option options[] = { +	OPT_STRING('i', "input", &input_name, "file", +		    "input file name"), +	OPT_STRING('s', "symbol", &sym_hist_filter, "symbol", +		    "symbol to annotate"), +	OPT_BOOLEAN('v', "verbose", &verbose, +		    "be more verbose (show symbol address, etc)"), +	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, +		    "dump raw trace in ASCII"), +	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"), +	OPT_END() +}; + +static void setup_sorting(void) +{ +	char *tmp, *tok, *str = strdup(sort_order); + +	for (tok = strtok_r(str, ", ", &tmp); +			tok; tok = strtok_r(NULL, ", ", &tmp)) { +		if (sort_dimension__add(tok) < 0) { +			error("Unknown --sort key: `%s'", tok); +			usage_with_options(annotate_usage, options); +		} +	} + +	free(str); +} + +int cmd_annotate(int argc, const char **argv, const char *prefix) +{ +	symbol__init(); + +	page_size = getpagesize(); + +	argc = parse_options(argc, argv, options, annotate_usage, 0); + +	setup_sorting(); + +	if (argc) { +		/* +		 * Special case: if there's an argument left then assume that +		 * it's a symbol filter: +		 */ +		if (argc > 1) +			usage_with_options(annotate_usage, options); + +		sym_hist_filter = argv[0]; +	} + +	if (!sym_hist_filter) +		usage_with_options(annotate_usage, options); + +	setup_pager(); + +	return __cmd_annotate(); +} diff --git a/tools/perf/builtin-help.c b/tools/perf/builtin-help.c new file mode 100644 index 00000000000..0f32dc3f3c4 --- /dev/null +++ b/tools/perf/builtin-help.c @@ -0,0 +1,461 @@ +/* + * builtin-help.c + * + * Builtin help command + */ +#include "util/cache.h" +#include "builtin.h" +#include "util/exec_cmd.h" +#include "common-cmds.h" +#include "util/parse-options.h" +#include "util/run-command.h" +#include "util/help.h" + +static struct man_viewer_list { +	struct man_viewer_list *next; +	char name[FLEX_ARRAY]; +} *man_viewer_list; + +static struct man_viewer_info_list { +	struct man_viewer_info_list *next; +	const char *info; +	char name[FLEX_ARRAY]; +} *man_viewer_info_list; + +enum help_format { +	HELP_FORMAT_MAN, +	HELP_FORMAT_INFO, +	HELP_FORMAT_WEB, +}; + +static int 
show_all = 0; +static enum help_format help_format = HELP_FORMAT_MAN; +static struct option builtin_help_options[] = { +	OPT_BOOLEAN('a', "all", &show_all, "print all available commands"), +	OPT_SET_INT('m', "man", &help_format, "show man page", HELP_FORMAT_MAN), +	OPT_SET_INT('w', "web", &help_format, "show manual in web browser", +			HELP_FORMAT_WEB), +	OPT_SET_INT('i', "info", &help_format, "show info page", +			HELP_FORMAT_INFO), +	OPT_END(), +}; + +static const char * const builtin_help_usage[] = { +	"perf help [--all] [--man|--web|--info] [command]", +	NULL +}; + +static enum help_format parse_help_format(const char *format) +{ +	if (!strcmp(format, "man")) +		return HELP_FORMAT_MAN; +	if (!strcmp(format, "info")) +		return HELP_FORMAT_INFO; +	if (!strcmp(format, "web") || !strcmp(format, "html")) +		return HELP_FORMAT_WEB; +	die("unrecognized help format '%s'", format); +} + +static const char *get_man_viewer_info(const char *name) +{ +	struct man_viewer_info_list *viewer; + +	for (viewer = man_viewer_info_list; viewer; viewer = viewer->next) +	{ +		if (!strcasecmp(name, viewer->name)) +			return viewer->info; +	} +	return NULL; +} + +static int check_emacsclient_version(void) +{ +	struct strbuf buffer = STRBUF_INIT; +	struct child_process ec_process; +	const char *argv_ec[] = { "emacsclient", "--version", NULL }; +	int version; + +	/* emacsclient prints its version number on stderr */ +	memset(&ec_process, 0, sizeof(ec_process)); +	ec_process.argv = argv_ec; +	ec_process.err = -1; +	ec_process.stdout_to_stderr = 1; +	if (start_command(&ec_process)) { +		fprintf(stderr, "Failed to start emacsclient.\n"); +		return -1; +	} +	strbuf_read(&buffer, ec_process.err, 20); +	close(ec_process.err); + +	/* +	 * Don't bother checking return value, because "emacsclient --version" +	 * seems to always exit with code 1. +	 */ +	finish_command(&ec_process); + +	if (prefixcmp(buffer.buf, "emacsclient")) { +		fprintf(stderr, "Failed to parse emacsclient version.\n"); +		strbuf_release(&buffer); +		return -1; +	} + +	strbuf_remove(&buffer, 0, strlen("emacsclient")); +	version = atoi(buffer.buf); + +	if (version < 22) { +		fprintf(stderr, +			"emacsclient version '%d' too old (< 22).\n", +			version); +		strbuf_release(&buffer); +		return -1; +	} + +	strbuf_release(&buffer); +	return 0; +} + +static void exec_woman_emacs(const char* path, const char *page) +{ +	if (!check_emacsclient_version()) { +		/* This works only with emacsclient version >= 22. */ +		struct strbuf man_page = STRBUF_INIT; + +		if (!path) +			path = "emacsclient"; +		strbuf_addf(&man_page, "(woman \"%s\")", page); +		execlp(path, "emacsclient", "-e", man_page.buf, NULL); +		warning("failed to exec '%s': %s", path, strerror(errno)); +	} +} + +static void exec_man_konqueror(const char* path, const char *page) +{ +	const char *display = getenv("DISPLAY"); +	if (display && *display) { +		struct strbuf man_page = STRBUF_INIT; +		const char *filename = "kfmclient"; + +		/* It's simpler to launch konqueror using kfmclient. 
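The code below rewrites a trailing "konqueror" in the configured viewer path to the kfmclient helper sitting next to it, and asks for the man: URL to be opened in a new tab of a running instance. 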
*/ +		if (path) { +			const char *file = strrchr(path, '/'); +			if (file && !strcmp(file + 1, "konqueror")) { +				char *new = strdup(path); +				char *dest = strrchr(new, '/'); + +				/* strlen("konqueror") == strlen("kfmclient") */ +				strcpy(dest + 1, "kfmclient"); +				path = new; +			} +			if (file) +				filename = file; +		} else +			path = "kfmclient"; +		strbuf_addf(&man_page, "man:%s(1)", page); +		execlp(path, filename, "newTab", man_page.buf, NULL); +		warning("failed to exec '%s': %s", path, strerror(errno)); +	} +} + +static void exec_man_man(const char* path, const char *page) +{ +	if (!path) +		path = "man"; +	execlp(path, "man", page, NULL); +	warning("failed to exec '%s': %s", path, strerror(errno)); +} + +static void exec_man_cmd(const char *cmd, const char *page) +{ +	struct strbuf shell_cmd = STRBUF_INIT; +	strbuf_addf(&shell_cmd, "%s %s", cmd, page); +	execl("/bin/sh", "sh", "-c", shell_cmd.buf, NULL); +	warning("failed to exec '%s': %s", cmd, strerror(errno)); +} + +static void add_man_viewer(const char *name) +{ +	struct man_viewer_list **p = &man_viewer_list; +	size_t len = strlen(name); + +	while (*p) +		p = &((*p)->next); +	*p = calloc(1, (sizeof(**p) + len + 1)); +	strncpy((*p)->name, name, len); +} + +static int supported_man_viewer(const char *name, size_t len) +{ +	return (!strncasecmp("man", name, len) || +		!strncasecmp("woman", name, len) || +		!strncasecmp("konqueror", name, len)); +} + +static void do_add_man_viewer_info(const char *name, +				   size_t len, +				   const char *value) +{ +	struct man_viewer_info_list *new = calloc(1, sizeof(*new) + len + 1); + +	strncpy(new->name, name, len); +	new->info = strdup(value); +	new->next = man_viewer_info_list; +	man_viewer_info_list = new; +} + +static int add_man_viewer_path(const char *name, +			       size_t len, +			       const char *value) +{ +	if (supported_man_viewer(name, len)) +		do_add_man_viewer_info(name, len, value); +	else +		warning("'%s': path for unsupported man viewer.\n" +			"Please consider using 'man.<tool>.cmd' instead.", +			name); + +	return 0; +} + +static int add_man_viewer_cmd(const char *name, +			      size_t len, +			      const char *value) +{ +	if (supported_man_viewer(name, len)) +		warning("'%s': cmd for supported man viewer.\n" +			"Please consider using 'man.<tool>.path' instead.", +			name); +	else +		do_add_man_viewer_info(name, len, value); + +	return 0; +} + +static int add_man_viewer_info(const char *var, const char *value) +{ +	const char *name = var + 4; +	const char *subkey = strrchr(name, '.'); + +	if (!subkey) +		return error("Config with no key for man viewer: %s", name); + +	if (!strcmp(subkey, ".path")) { +		if (!value) +			return config_error_nonbool(var); +		return add_man_viewer_path(name, subkey - name, value); +	} +	if (!strcmp(subkey, ".cmd")) { +		if (!value) +			return config_error_nonbool(var); +		return add_man_viewer_cmd(name, subkey - name, value); +	} + +	warning("'%s': unsupported man viewer sub key.", subkey); +	return 0; +} + +static int perf_help_config(const char *var, const char *value, void *cb) +{ +	if (!strcmp(var, "help.format")) { +		if (!value) +			return config_error_nonbool(var); +		help_format = parse_help_format(value); +		return 0; +	} +	if (!strcmp(var, "man.viewer")) { +		if (!value) +			return config_error_nonbool(var); +		add_man_viewer(value); +		return 0; +	} +	if (!prefixcmp(var, "man.")) +		return add_man_viewer_info(var, value); + +	return perf_default_config(var, value, cb); +} + +static struct cmdnames main_cmds, 
other_cmds; + +void list_common_cmds_help(void) +{ +	int i, longest = 0; + +	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) { +		if (longest < strlen(common_cmds[i].name)) +			longest = strlen(common_cmds[i].name); +	} + +	puts(" The most commonly used perf commands are:"); +	for (i = 0; i < ARRAY_SIZE(common_cmds); i++) { +		printf("   %s   ", common_cmds[i].name); +		mput_char(' ', longest - strlen(common_cmds[i].name)); +		puts(common_cmds[i].help); +	} +} + +static int is_perf_command(const char *s) +{ +	return is_in_cmdlist(&main_cmds, s) || +		is_in_cmdlist(&other_cmds, s); +} + +static const char *prepend(const char *prefix, const char *cmd) +{ +	size_t pre_len = strlen(prefix); +	size_t cmd_len = strlen(cmd); +	char *p = malloc(pre_len + cmd_len + 1); +	memcpy(p, prefix, pre_len); +	strcpy(p + pre_len, cmd); +	return p; +} + +static const char *cmd_to_page(const char *perf_cmd) +{ +	if (!perf_cmd) +		return "perf"; +	else if (!prefixcmp(perf_cmd, "perf")) +		return perf_cmd; +	else if (is_perf_command(perf_cmd)) +		return prepend("perf-", perf_cmd); +	else +		return prepend("perf-", perf_cmd); +} + +static void setup_man_path(void) +{ +	struct strbuf new_path = STRBUF_INIT; +	const char *old_path = getenv("MANPATH"); + +	/* We should always put ':' after our path. If there is no +	 * old_path, the ':' at the end will let 'man' to try +	 * system-wide paths after ours to find the manual page. If +	 * there is old_path, we need ':' as delimiter. */ +	strbuf_addstr(&new_path, system_path(PERF_MAN_PATH)); +	strbuf_addch(&new_path, ':'); +	if (old_path) +		strbuf_addstr(&new_path, old_path); + +	setenv("MANPATH", new_path.buf, 1); + +	strbuf_release(&new_path); +} + +static void exec_viewer(const char *name, const char *page) +{ +	const char *info = get_man_viewer_info(name); + +	if (!strcasecmp(name, "man")) +		exec_man_man(info, page); +	else if (!strcasecmp(name, "woman")) +		exec_woman_emacs(info, page); +	else if (!strcasecmp(name, "konqueror")) +		exec_man_konqueror(info, page); +	else if (info) +		exec_man_cmd(info, page); +	else +		warning("'%s': unknown man viewer.", name); +} + +static void show_man_page(const char *perf_cmd) +{ +	struct man_viewer_list *viewer; +	const char *page = cmd_to_page(perf_cmd); +	const char *fallback = getenv("PERF_MAN_VIEWER"); + +	setup_man_path(); +	for (viewer = man_viewer_list; viewer; viewer = viewer->next) +	{ +		exec_viewer(viewer->name, page); /* will return when unable */ +	} +	if (fallback) +		exec_viewer(fallback, page); +	exec_viewer("man", page); +	die("no man viewer handled the request"); +} + +static void show_info_page(const char *perf_cmd) +{ +	const char *page = cmd_to_page(perf_cmd); +	setenv("INFOPATH", system_path(PERF_INFO_PATH), 1); +	execlp("info", "info", "perfman", page, NULL); +} + +static void get_html_page_path(struct strbuf *page_path, const char *page) +{ +	struct stat st; +	const char *html_path = system_path(PERF_HTML_PATH); + +	/* Check that we have a perf documentation directory. */ +	if (stat(mkpath("%s/perf.html", html_path), &st) +	    || !S_ISREG(st.st_mode)) +		die("'%s': not a documentation directory.", html_path); + +	strbuf_init(page_path, 0); +	strbuf_addf(page_path, "%s/%s.html", html_path, page); +} + +/* + * If open_html is not defined in a platform-specific way (see for + * example compat/mingw.h), we use the script web--browse to display + * HTML. 
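+ * The fallback just execs that helper script, passing "-c help.browser" + * so the actual browser can be picked via the help.browser config key.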
+ */ +#ifndef open_html +static void open_html(const char *path) +{ +	execl_perf_cmd("web--browse", "-c", "help.browser", path, NULL); +} +#endif + +static void show_html_page(const char *perf_cmd) +{ +	const char *page = cmd_to_page(perf_cmd); +	struct strbuf page_path; /* it leaks but we exec below */ + +	get_html_page_path(&page_path, page); + +	open_html(page_path.buf); +} + +int cmd_help(int argc, const char **argv, const char *prefix) +{ +	const char *alias; +	load_command_list("perf-", &main_cmds, &other_cmds); + +	perf_config(perf_help_config, NULL); + +	argc = parse_options(argc, argv, builtin_help_options, +			builtin_help_usage, 0); + +	if (show_all) { +		printf("\n usage: %s\n\n", perf_usage_string); +		list_commands("perf commands", &main_cmds, &other_cmds); +		printf(" %s\n\n", perf_more_info_string); +		return 0; +	} + +	if (!argv[0]) { +		printf("\n usage: %s\n\n", perf_usage_string); +		list_common_cmds_help(); +		printf("\n %s\n\n", perf_more_info_string); +		return 0; +	} + +	alias = alias_lookup(argv[0]); +	if (alias && !is_perf_command(argv[0])) { +		printf("`perf %s' is aliased to `%s'\n", argv[0], alias); +		return 0; +	} + +	switch (help_format) { +	case HELP_FORMAT_MAN: +		show_man_page(argv[0]); +		break; +	case HELP_FORMAT_INFO: +		show_info_page(argv[0]); +		break; +	case HELP_FORMAT_WEB: +		show_html_page(argv[0]); +		break; +	} + +	return 0; +} diff --git a/tools/perf/builtin-list.c b/tools/perf/builtin-list.c new file mode 100644 index 00000000000..fe60e37c96e --- /dev/null +++ b/tools/perf/builtin-list.c @@ -0,0 +1,20 @@ +/* + * builtin-list.c + * + * Builtin list command: list all event types + * + * Copyright (C) 2009, Thomas Gleixner <tglx@linutronix.de> + * Copyright (C) 2008-2009, Red Hat Inc, Ingo Molnar <mingo@redhat.com> + */ +#include "builtin.h" + +#include "perf.h" + +#include "util/parse-options.h" +#include "util/parse-events.h" + +int cmd_list(int argc, const char **argv, const char *prefix) +{ +	print_events(); +	return 0; +} diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c new file mode 100644 index 00000000000..29259e74dcf --- /dev/null +++ b/tools/perf/builtin-record.c @@ -0,0 +1,582 @@ +/* + * builtin-record.c + * + * Builtin record command: Record the profile of a workload + * (or a CPU, or a PID) into the perf.data output file - for + * later analysis via perf report. 
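+ * + * The resulting file is a raw stream of perf_event_header records + * (samples, mmap and comm events) as produced by the kernel, plus + * equivalent records synthesized from /proc for already-running tasks.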
+ */ +#include "builtin.h" + +#include "perf.h" + +#include "util/util.h" +#include "util/parse-options.h" +#include "util/parse-events.h" +#include "util/string.h" + +#include <unistd.h> +#include <sched.h> + +#define ALIGN(x, a)		__ALIGN_MASK(x, (typeof(x))(a)-1) +#define __ALIGN_MASK(x, mask)	(((x)+(mask))&~(mask)) + +static int			fd[MAX_NR_CPUS][MAX_COUNTERS]; + +static long			default_interval		= 100000; + +static int			nr_cpus				= 0; +static unsigned int		page_size; +static unsigned int		mmap_pages			= 128; +static int			freq				= 0; +static int			output; +static const char		*output_name			= "perf.data"; +static int			group				= 0; +static unsigned int		realtime_prio			= 0; +static int			system_wide			= 0; +static pid_t			target_pid			= -1; +static int			inherit				= 1; +static int			force				= 0; +static int			append_file			= 0; +static int			verbose				= 0; + +static long			samples; +static struct timeval		last_read; +static struct timeval		this_read; + +static __u64			bytes_written; + +static struct pollfd		event_array[MAX_NR_CPUS * MAX_COUNTERS]; + +static int			nr_poll; +static int			nr_cpu; + +struct mmap_event { +	struct perf_event_header	header; +	__u32				pid; +	__u32				tid; +	__u64				start; +	__u64				len; +	__u64				pgoff; +	char				filename[PATH_MAX]; +}; + +struct comm_event { +	struct perf_event_header	header; +	__u32				pid; +	__u32				tid; +	char				comm[16]; +}; + + +struct mmap_data { +	int			counter; +	void			*base; +	unsigned int		mask; +	unsigned int		prev; +}; + +static struct mmap_data		mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; + +static unsigned int mmap_read_head(struct mmap_data *md) +{ +	struct perf_counter_mmap_page *pc = md->base; +	int head; + +	head = pc->data_head; +	rmb(); + +	return head; +} + +static void mmap_read(struct mmap_data *md) +{ +	unsigned int head = mmap_read_head(md); +	unsigned int old = md->prev; +	unsigned char *data = md->base + page_size; +	unsigned long size; +	void *buf; +	int diff; + +	gettimeofday(&this_read, NULL); + +	/* +	 * If we're further behind than half the buffer, there's a chance +	 * the writer will bite our tail and mess up the samples under us. +	 * +	 * If we somehow ended up ahead of the head, we got messed up. +	 * +	 * In either case, truncate and restart at head. +	 */ +	diff = head - old; +	if (diff > md->mask / 2 || diff < 0) { +		struct timeval iv; +		unsigned long msecs; + +		timersub(&this_read, &last_read, &iv); +		msecs = iv.tv_sec*1000 + iv.tv_usec/1000; + +		fprintf(stderr, "WARNING: failed to keep up with mmap data." +				"  Last read %lu msecs ago.\n", msecs); + +		/* +		 * head points to a known good entry, start there. 
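+		 * Everything between the stale 'old' position and the new +		 * head may already be overwritten, so the samples in that +		 * window are simply dropped.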
+		 */ +		old = head; +	} + +	last_read = this_read; + +	if (old != head) +		samples++; + +	size = head - old; + +	if ((old & md->mask) + size != (head & md->mask)) { +		buf = &data[old & md->mask]; +		size = md->mask + 1 - (old & md->mask); +		old += size; + +		while (size) { +			int ret = write(output, buf, size); + +			if (ret < 0) +				die("failed to write"); + +			size -= ret; +			buf += ret; + +			bytes_written += ret; +		} +	} + +	buf = &data[old & md->mask]; +	size = head - old; +	old += size; + +	while (size) { +		int ret = write(output, buf, size); + +		if (ret < 0) +			die("failed to write"); + +		size -= ret; +		buf += ret; + +		bytes_written += ret; +	} + +	md->prev = old; +} + +static volatile int done = 0; +static volatile int signr = -1; + +static void sig_handler(int sig) +{ +	done = 1; +	signr = sig; +} + +static void sig_atexit(void) +{ +	if (signr == -1) +		return; + +	signal(signr, SIG_DFL); +	kill(getpid(), signr); +} + +static void pid_synthesize_comm_event(pid_t pid, int full) +{ +	struct comm_event comm_ev; +	char filename[PATH_MAX]; +	char bf[BUFSIZ]; +	int fd, ret; +	size_t size; +	char *field, *sep; +	DIR *tasks; +	struct dirent dirent, *next; + +	snprintf(filename, sizeof(filename), "/proc/%d/stat", pid); + +	fd = open(filename, O_RDONLY); +	if (fd < 0) { +		fprintf(stderr, "couldn't open %s\n", filename); +		exit(EXIT_FAILURE); +	} +	if (read(fd, bf, sizeof(bf)) < 0) { +		fprintf(stderr, "couldn't read %s\n", filename); +		exit(EXIT_FAILURE); +	} +	close(fd); + +	/* 9027 (cat) R 6747 9027 6747 34816 9027 ... */ +	memset(&comm_ev, 0, sizeof(comm_ev)); +	field = strchr(bf, '('); +	if (field == NULL) +		goto out_failure; +	sep = strchr(++field, ')'); +	if (sep == NULL) +		goto out_failure; +	size = sep - field; +	memcpy(comm_ev.comm, field, size++); + +	comm_ev.pid = pid; +	comm_ev.header.type = PERF_EVENT_COMM; +	size = ALIGN(size, sizeof(__u64)); +	comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size); + +	if (!full) { +		comm_ev.tid = pid; + +		ret = write(output, &comm_ev, comm_ev.header.size); +		if (ret < 0) { +			perror("failed to write"); +			exit(-1); +		} +		return; +	} + +	snprintf(filename, sizeof(filename), "/proc/%d/task", pid); + +	tasks = opendir(filename); +	while (!readdir_r(tasks, &dirent, &next) && next) { +		char *end; +		pid = strtol(dirent.d_name, &end, 10); +		if (*end) +			continue; + +		comm_ev.tid = pid; + +		ret = write(output, &comm_ev, comm_ev.header.size); +		if (ret < 0) { +			perror("failed to write"); +			exit(-1); +		} +	} +	closedir(tasks); +	return; + +out_failure: +	fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n", +		filename); +	exit(EXIT_FAILURE); +} + +static void pid_synthesize_mmap_samples(pid_t pid) +{ +	char filename[PATH_MAX]; +	FILE *fp; + +	snprintf(filename, sizeof(filename), "/proc/%d/maps", pid); + +	fp = fopen(filename, "r"); +	if (fp == NULL) { +		fprintf(stderr, "couldn't open %s\n", filename); +		exit(EXIT_FAILURE); +	} +	while (1) { +		char bf[BUFSIZ], *pbf = bf; +		struct mmap_event mmap_ev = { +			.header.type = PERF_EVENT_MMAP, +		}; +		int n; +		size_t size; +		if (fgets(bf, sizeof(bf), fp) == NULL) +			break; + +		/* 00400000-0040c000 r-xp 00000000 fd:01 41038  /bin/cat */ +		n = hex2u64(pbf, &mmap_ev.start); +		if (n < 0) +			continue; +		pbf += n + 1; +		n = hex2u64(pbf, &mmap_ev.len); +		if (n < 0) +			continue; +		pbf += n + 3; +		if (*pbf == 'x') { /* vm_exec */ +			char *execname = strrchr(bf, ' '); + +			if (execname == NULL || execname[1] != '/') +				continue; + +			
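/* step past the separating space so execname points at the path itself */ +			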
execname += 1; +			size = strlen(execname); +			execname[size - 1] = '\0'; /* Remove \n */ +			memcpy(mmap_ev.filename, execname, size); +			size = ALIGN(size, sizeof(__u64)); +			mmap_ev.len -= mmap_ev.start; +			mmap_ev.header.size = (sizeof(mmap_ev) - +					       (sizeof(mmap_ev.filename) - size)); +			mmap_ev.pid = pid; +			mmap_ev.tid = pid; + +			if (write(output, &mmap_ev, mmap_ev.header.size) < 0) { +				perror("failed to write"); +				exit(-1); +			} +		} +	} + +	fclose(fp); +} + +static void synthesize_samples(void) +{ +	DIR *proc; +	struct dirent dirent, *next; + +	proc = opendir("/proc"); + +	while (!readdir_r(proc, &dirent, &next) && next) { +		char *end; +		pid_t pid; + +		pid = strtol(dirent.d_name, &end, 10); +		if (*end) /* only interested in proper numerical dirents */ +			continue; + +		pid_synthesize_comm_event(pid, 1); +		pid_synthesize_mmap_samples(pid); +	} + +	closedir(proc); +} + +static int group_fd; + +static void create_counter(int counter, int cpu, pid_t pid) +{ +	struct perf_counter_attr *attr = attrs + counter; +	int track = 1; + +	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID; +	if (freq) { +		attr->sample_type	|= PERF_SAMPLE_PERIOD; +		attr->freq		= 1; +		attr->sample_freq	= freq; +	} +	attr->mmap		= track; +	attr->comm		= track; +	attr->inherit		= (cpu < 0) && inherit; +	attr->disabled		= 1; + +	track = 0; /* only the first counter needs these */ + +try_again: +	fd[nr_cpu][counter] = sys_perf_counter_open(attr, pid, cpu, group_fd, 0); + +	if (fd[nr_cpu][counter] < 0) { +		int err = errno; + +		if (err == EPERM) +			die("Permission error - are you root?\n"); + +		/* +		 * If it's cycles then fall back to hrtimer +		 * based cpu-clock-tick sw counter, which +		 * is always available even if no PMU support: +		 */ +		if (attr->type == PERF_TYPE_HARDWARE +			&& attr->config == PERF_COUNT_HW_CPU_CYCLES) { + +			if (verbose) +				warning(" ... 
trying to fall back to cpu-clock-ticks\n"); +			attr->type = PERF_TYPE_SOFTWARE; +			attr->config = PERF_COUNT_SW_CPU_CLOCK; +			goto try_again; +		} +		printf("\n"); +		error("perfcounter syscall returned with %d (%s)\n", +			fd[nr_cpu][counter], strerror(err)); +		die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n"); +		exit(-1); +	} + +	assert(fd[nr_cpu][counter] >= 0); +	fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK); + +	/* +	 * First counter acts as the group leader: +	 */ +	if (group && group_fd == -1) +		group_fd = fd[nr_cpu][counter]; + +	event_array[nr_poll].fd = fd[nr_cpu][counter]; +	event_array[nr_poll].events = POLLIN; +	nr_poll++; + +	mmap_array[nr_cpu][counter].counter = counter; +	mmap_array[nr_cpu][counter].prev = 0; +	mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1; +	mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size, +			PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0); +	if (mmap_array[nr_cpu][counter].base == MAP_FAILED) { +		error("failed to mmap with %d (%s)\n", errno, strerror(errno)); +		exit(-1); +	} + +	ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE); +} + +static void open_counters(int cpu, pid_t pid) +{ +	int counter; + +	if (pid > 0) { +		pid_synthesize_comm_event(pid, 0); +		pid_synthesize_mmap_samples(pid); +	} + +	group_fd = -1; +	for (counter = 0; counter < nr_counters; counter++) +		create_counter(counter, cpu, pid); + +	nr_cpu++; +} + +static int __cmd_record(int argc, const char **argv) +{ +	int i, counter; +	struct stat st; +	pid_t pid; +	int flags; +	int ret; + +	page_size = sysconf(_SC_PAGE_SIZE); +	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); +	assert(nr_cpus <= MAX_NR_CPUS); +	assert(nr_cpus >= 0); + +	if (!stat(output_name, &st) && !force && !append_file) { +		fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n", +				output_name); +		exit(-1); +	} + +	flags = O_CREAT|O_RDWR; +	if (append_file) +		flags |= O_APPEND; +	else +		flags |= O_TRUNC; + +	output = open(output_name, flags, S_IRUSR|S_IWUSR); +	if (output < 0) { +		perror("failed to create output file"); +		exit(-1); +	} + +	if (!system_wide) { +		open_counters(-1, target_pid != -1 ? target_pid : getpid()); +	} else for (i = 0; i < nr_cpus; i++) +		open_counters(i, target_pid); + +	atexit(sig_atexit); +	signal(SIGCHLD, sig_handler); +	signal(SIGINT, sig_handler); + +	if (target_pid == -1 && argc) { +		pid = fork(); +		if (pid < 0) +			perror("failed to fork"); + +		if (!pid) { +			if (execvp(argv[0], (char **)argv)) { +				perror(argv[0]); +				exit(-1); +			} +		} +	} + +	if (realtime_prio) { +		struct sched_param param; + +		param.sched_priority = realtime_prio; +		if (sched_setscheduler(0, SCHED_FIFO, &param)) { +			printf("Could not set realtime priority.\n"); +			exit(-1); +		} +	} + +	if (system_wide) +		synthesize_samples(); + +	while (!done) { +		int hits = samples; + +		for (i = 0; i < nr_cpu; i++) { +			for (counter = 0; counter < nr_counters; counter++) +				mmap_read(&mmap_array[i][counter]); +		} + +		if (hits == samples) +			ret = poll(event_array, nr_poll, 100); +	} + +	/* +	 * Approximate RIP event size: 24 bytes. 
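+	 * That is sizeof(struct perf_event_header) plus the __u64 ip and +	 * the __u32 pid/tid pair of the IP | TID sample layout set up in +	 * create_counter(): 8 + 8 + 4 + 4 bytes (a PERIOD field, when freq +	 * sampling is enabled, makes it slightly larger - hence only an +	 * approximation for the bytes_written / 24 estimate below).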
+	 */ +	fprintf(stderr, +		"[ perf record: Captured and wrote %.3f MB %s (~%lld samples) ]\n", +		(double)bytes_written / 1024.0 / 1024.0, +		output_name, +		bytes_written / 24); + +	return 0; +} + +static const char * const record_usage[] = { +	"perf record [<options>] [<command>]", +	"perf record [<options>] -- <command> [<options>]", +	NULL +}; + +static const struct option options[] = { +	OPT_CALLBACK('e', "event", NULL, "event", +		     "event selector. use 'perf list' to list available events", +		     parse_events), +	OPT_INTEGER('p', "pid", &target_pid, +		    "record events on existing pid"), +	OPT_INTEGER('r', "realtime", &realtime_prio, +		    "collect data with this RT SCHED_FIFO priority"), +	OPT_BOOLEAN('a', "all-cpus", &system_wide, +			    "system-wide collection from all CPUs"), +	OPT_BOOLEAN('A', "append", &append_file, +			    "append to the output file to do incremental profiling"), +	OPT_BOOLEAN('f', "force", &force, +			"overwrite existing data file"), +	OPT_LONG('c', "count", &default_interval, +		    "event period to sample"), +	OPT_STRING('o', "output", &output_name, "file", +		    "output file name"), +	OPT_BOOLEAN('i', "inherit", &inherit, +		    "child tasks inherit counters"), +	OPT_INTEGER('F', "freq", &freq, +		    "profile at this frequency"), +	OPT_INTEGER('m', "mmap-pages", &mmap_pages, +		    "number of mmap data pages"), +	OPT_BOOLEAN('v', "verbose", &verbose, +		    "be more verbose (show counter open errors, etc)"), +	OPT_END() +}; + +int cmd_record(int argc, const char **argv, const char *prefix) +{ +	int counter; + +	argc = parse_options(argc, argv, options, record_usage, 0); +	if (!argc && target_pid == -1 && !system_wide) +		usage_with_options(record_usage, options); + +	if (!nr_counters) +		nr_counters = 1; + +	for (counter = 0; counter < nr_counters; counter++) { +		if (attrs[counter].sample_period) +			continue; + +		attrs[counter].sample_period = default_interval; +	} + +	return __cmd_record(argc, argv); +} diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c new file mode 100644 index 00000000000..82fa93b4db9 --- /dev/null +++ b/tools/perf/builtin-report.c @@ -0,0 +1,1316 @@ +/* + * builtin-report.c + * + * Builtin report command: Analyze the perf.data input file, + * look up and read DSOs and symbol information and display + * a histogram of results, along various sorting keys. + */ +#include "builtin.h" + +#include "util/util.h" + +#include "util/color.h" +#include "util/list.h" +#include "util/cache.h" +#include "util/rbtree.h" +#include "util/symbol.h" +#include "util/string.h" + +#include "perf.h" + +#include "util/parse-options.h" +#include "util/parse-events.h" + +#define SHOW_KERNEL	1 +#define SHOW_USER	2 +#define SHOW_HV		4 + +static char		const *input_name = "perf.data"; +static char		*vmlinux = NULL; + +static char		default_sort_order[] = "comm,dso"; +static char		*sort_order = default_sort_order; + +static int		input; +static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV; + +static int		dump_trace = 0; +#define dprintf(x...)	
do { if (dump_trace) printf(x); } while (0) + +static int		verbose; +static int		full_paths; + +static unsigned long	page_size; +static unsigned long	mmap_window = 32; + +struct ip_event { +	struct perf_event_header header; +	__u64 ip; +	__u32 pid, tid; +	__u64 period; +}; + +struct mmap_event { +	struct perf_event_header header; +	__u32 pid, tid; +	__u64 start; +	__u64 len; +	__u64 pgoff; +	char filename[PATH_MAX]; +}; + +struct comm_event { +	struct perf_event_header header; +	__u32 pid, tid; +	char comm[16]; +}; + +struct fork_event { +	struct perf_event_header header; +	__u32 pid, ppid; +}; + +struct period_event { +	struct perf_event_header header; +	__u64 time; +	__u64 id; +	__u64 sample_period; +}; + +typedef union event_union { +	struct perf_event_header	header; +	struct ip_event			ip; +	struct mmap_event		mmap; +	struct comm_event		comm; +	struct fork_event		fork; +	struct period_event		period; +} event_t; + +static LIST_HEAD(dsos); +static struct dso *kernel_dso; +static struct dso *vdso; + +static void dsos__add(struct dso *dso) +{ +	list_add_tail(&dso->node, &dsos); +} + +static struct dso *dsos__find(const char *name) +{ +	struct dso *pos; + +	list_for_each_entry(pos, &dsos, node) +		if (strcmp(pos->name, name) == 0) +			return pos; +	return NULL; +} + +static struct dso *dsos__findnew(const char *name) +{ +	struct dso *dso = dsos__find(name); +	int nr; + +	if (dso) +		return dso; + +	dso = dso__new(name, 0); +	if (!dso) +		goto out_delete_dso; + +	nr = dso__load(dso, NULL, verbose); +	if (nr < 0) { +		if (verbose) +			fprintf(stderr, "Failed to open: %s\n", name); +		goto out_delete_dso; +	} +	if (!nr && verbose) { +		fprintf(stderr, +		"No symbols found in: %s, maybe install a debug package?\n", +				name); +	} + +	dsos__add(dso); + +	return dso; + +out_delete_dso: +	dso__delete(dso); +	return NULL; +} + +static void dsos__fprintf(FILE *fp) +{ +	struct dso *pos; + +	list_for_each_entry(pos, &dsos, node) +		dso__fprintf(pos, fp); +} + +static struct symbol *vdso__find_symbol(struct dso *dso, __u64 ip) +{ +	return dso__find_symbol(kernel_dso, ip); +} + +static int load_kernel(void) +{ +	int err; + +	kernel_dso = dso__new("[kernel]", 0); +	if (!kernel_dso) +		return -1; + +	err = dso__load_kernel(kernel_dso, vmlinux, NULL, verbose); +	if (err) { +		dso__delete(kernel_dso); +		kernel_dso = NULL; +	} else +		dsos__add(kernel_dso); + +	vdso = dso__new("[vdso]", 0); +	if (!vdso) +		return -1; + +	vdso->find_symbol = vdso__find_symbol; + +	dsos__add(vdso); + +	return err; +} + +static char __cwd[PATH_MAX]; +static char *cwd = __cwd; +static int cwdlen; + +static int strcommon(const char *pathname) +{ +	int n = 0; + +	while (pathname[n] == cwd[n] && n < cwdlen) +		++n; + +	return n; +} + +struct map { +	struct list_head node; +	__u64	 start; +	__u64	 end; +	__u64	 pgoff; +	__u64	 (*map_ip)(struct map *, __u64); +	struct dso	 *dso; +}; + +static __u64 map__map_ip(struct map *map, __u64 ip) +{ +	return ip - map->start + map->pgoff; +} + +static __u64 vdso__map_ip(struct map *map, __u64 ip) +{ +	return ip; +} + +static inline int is_anon_memory(const char *filename) +{ +     return strcmp(filename, "//anon") == 0; +} + +static struct map *map__new(struct mmap_event *event) +{ +	struct map *self = malloc(sizeof(*self)); + +	if (self != NULL) { +		const char *filename = event->filename; +		char newfilename[PATH_MAX]; +		int anon; + +		if (cwd) { +			int n = strcommon(filename); + +			if (n == cwdlen) { +				snprintf(newfilename, sizeof(newfilename), +					 ".%s", filename + n); +				
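/* the mmap path starts with our cwd - use the shorter "./..." form */ +				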
filename = newfilename; +			} +		} + +		anon = is_anon_memory(filename); + +		if (anon) { +			snprintf(newfilename, sizeof(newfilename), "/tmp/perf-%d.map", event->pid); +			filename = newfilename; +		} + +		self->start = event->start; +		self->end   = event->start + event->len; +		self->pgoff = event->pgoff; + +		self->dso = dsos__findnew(filename); +		if (self->dso == NULL) +			goto out_delete; + +		if (self->dso == vdso || anon) +			self->map_ip = vdso__map_ip; +		else +			self->map_ip = map__map_ip; +	} +	return self; +out_delete: +	free(self); +	return NULL; +} + +static struct map *map__clone(struct map *self) +{ +	struct map *map = malloc(sizeof(*self)); + +	if (!map) +		return NULL; + +	memcpy(map, self, sizeof(*self)); + +	return map; +} + +static int map__overlap(struct map *l, struct map *r) +{ +	if (l->start > r->start) { +		struct map *t = l; +		l = r; +		r = t; +	} + +	if (l->end > r->start) +		return 1; + +	return 0; +} + +static size_t map__fprintf(struct map *self, FILE *fp) +{ +	return fprintf(fp, " %Lx-%Lx %Lx %s\n", +		       self->start, self->end, self->pgoff, self->dso->name); +} + + +struct thread { +	struct rb_node	 rb_node; +	struct list_head maps; +	pid_t		 pid; +	char		 *comm; +}; + +static struct thread *thread__new(pid_t pid) +{ +	struct thread *self = malloc(sizeof(*self)); + +	if (self != NULL) { +		self->pid = pid; +		self->comm = malloc(32); +		if (self->comm) +			snprintf(self->comm, 32, ":%d", self->pid); +		INIT_LIST_HEAD(&self->maps); +	} + +	return self; +} + +static int thread__set_comm(struct thread *self, const char *comm) +{ +	if (self->comm) +		free(self->comm); +	self->comm = strdup(comm); +	return self->comm ? 0 : -ENOMEM; +} + +static size_t thread__fprintf(struct thread *self, FILE *fp) +{ +	struct map *pos; +	size_t ret = fprintf(fp, "Thread %d %s\n", self->pid, self->comm); + +	list_for_each_entry(pos, &self->maps, node) +		ret += map__fprintf(pos, fp); + +	return ret; +} + + +static struct rb_root threads; +static struct thread *last_match; + +static struct thread *threads__findnew(pid_t pid) +{ +	struct rb_node **p = &threads.rb_node; +	struct rb_node *parent = NULL; +	struct thread *th; + +	/* +	 * Front-end cache - PID lookups come in blocks, +	 * so most of the time we don't have to look up +	 * the full rbtree: +	 */ +	if (last_match && last_match->pid == pid) +		return last_match; + +	while (*p != NULL) { +		parent = *p; +		th = rb_entry(parent, struct thread, rb_node); + +		if (th->pid == pid) { +			last_match = th; +			return th; +		} + +		if (pid < th->pid) +			p = &(*p)->rb_left; +		else +			p = &(*p)->rb_right; +	} + +	th = thread__new(pid); +	if (th != NULL) { +		rb_link_node(&th->rb_node, parent, p); +		rb_insert_color(&th->rb_node, &threads); +		last_match = th; +	} + +	return th; +} + +static void thread__insert_map(struct thread *self, struct map *map) +{ +	struct map *pos, *tmp; + +	list_for_each_entry_safe(pos, tmp, &self->maps, node) { +		if (map__overlap(pos, map)) { +			list_del_init(&pos->node); +			/* XXX leaks dsos */ +			free(pos); +		} +	} + +	list_add_tail(&map->node, &self->maps); +} + +static int thread__fork(struct thread *self, struct thread *parent) +{ +	struct map *map; + +	if (self->comm) +		free(self->comm); +	self->comm = strdup(parent->comm); +	if (!self->comm) +		return -ENOMEM; + +	list_for_each_entry(map, &parent->maps, node) { +		struct map *new = map__clone(map); +		if (!new) +			return -ENOMEM; +		thread__insert_map(self, new); +	} + +	return 0; +} + +static struct map *thread__find_map(struct 
thread *self, __u64 ip) +{ +	struct map *pos; + +	if (self == NULL) +		return NULL; + +	list_for_each_entry(pos, &self->maps, node) +		if (ip >= pos->start && ip <= pos->end) +			return pos; + +	return NULL; +} + +static size_t threads__fprintf(FILE *fp) +{ +	size_t ret = 0; +	struct rb_node *nd; + +	for (nd = rb_first(&threads); nd; nd = rb_next(nd)) { +		struct thread *pos = rb_entry(nd, struct thread, rb_node); + +		ret += thread__fprintf(pos, fp); +	} + +	return ret; +} + +/* + * histogram, sorted on item, collects counts + */ + +static struct rb_root hist; + +struct hist_entry { +	struct rb_node	 rb_node; + +	struct thread	 *thread; +	struct map	 *map; +	struct dso	 *dso; +	struct symbol	 *sym; +	__u64		 ip; +	char		 level; + +	__u64		 count; +}; + +/* + * configurable sorting bits + */ + +struct sort_entry { +	struct list_head list; + +	char *header; + +	int64_t (*cmp)(struct hist_entry *, struct hist_entry *); +	int64_t (*collapse)(struct hist_entry *, struct hist_entry *); +	size_t	(*print)(FILE *fp, struct hist_entry *); +}; + +/* --sort pid */ + +static int64_t +sort__thread_cmp(struct hist_entry *left, struct hist_entry *right) +{ +	return right->thread->pid - left->thread->pid; +} + +static size_t +sort__thread_print(FILE *fp, struct hist_entry *self) +{ +	return fprintf(fp, "%16s:%5d", self->thread->comm ?: "", self->thread->pid); +} + +static struct sort_entry sort_thread = { +	.header = "         Command:  Pid", +	.cmp	= sort__thread_cmp, +	.print	= sort__thread_print, +}; + +/* --sort comm */ + +static int64_t +sort__comm_cmp(struct hist_entry *left, struct hist_entry *right) +{ +	return right->thread->pid - left->thread->pid; +} + +static int64_t +sort__comm_collapse(struct hist_entry *left, struct hist_entry *right) +{ +	char *comm_l = left->thread->comm; +	char *comm_r = right->thread->comm; + +	if (!comm_l || !comm_r) { +		if (!comm_l && !comm_r) +			return 0; +		else if (!comm_l) +			return -1; +		else +			return 1; +	} + +	return strcmp(comm_l, comm_r); +} + +static size_t +sort__comm_print(FILE *fp, struct hist_entry *self) +{ +	return fprintf(fp, "%16s", self->thread->comm); +} + +static struct sort_entry sort_comm = { +	.header		= "         Command", +	.cmp		= sort__comm_cmp, +	.collapse	= sort__comm_collapse, +	.print		= sort__comm_print, +}; + +/* --sort dso */ + +static int64_t +sort__dso_cmp(struct hist_entry *left, struct hist_entry *right) +{ +	struct dso *dso_l = left->dso; +	struct dso *dso_r = right->dso; + +	if (!dso_l || !dso_r) { +		if (!dso_l && !dso_r) +			return 0; +		else if (!dso_l) +			return -1; +		else +			return 1; +	} + +	return strcmp(dso_l->name, dso_r->name); +} + +static size_t +sort__dso_print(FILE *fp, struct hist_entry *self) +{ +	if (self->dso) +		return fprintf(fp, "%-25s", self->dso->name); + +	return fprintf(fp, "%016llx         ", (__u64)self->ip); +} + +static struct sort_entry sort_dso = { +	.header = "Shared Object            ", +	.cmp	= sort__dso_cmp, +	.print	= sort__dso_print, +}; + +/* --sort symbol */ + +static int64_t +sort__sym_cmp(struct hist_entry *left, struct hist_entry *right) +{ +	__u64 ip_l, ip_r; + +	if (left->sym == right->sym) +		return 0; + +	ip_l = left->sym ? left->sym->start : left->ip; +	ip_r = right->sym ? 
right->sym->start : right->ip; + +	return (int64_t)(ip_r - ip_l); +} + +static size_t +sort__sym_print(FILE *fp, struct hist_entry *self) +{ +	size_t ret = 0; + +	if (verbose) +		ret += fprintf(fp, "%#018llx  ", (__u64)self->ip); + +	if (self->sym) { +		ret += fprintf(fp, "[%c] %s", +			self->dso == kernel_dso ? 'k' : '.', self->sym->name); +	} else { +		ret += fprintf(fp, "%#016llx", (__u64)self->ip); +	} + +	return ret; +} + +static struct sort_entry sort_sym = { +	.header = "Symbol", +	.cmp	= sort__sym_cmp, +	.print	= sort__sym_print, +}; + +static int sort__need_collapse = 0; + +struct sort_dimension { +	char			*name; +	struct sort_entry	*entry; +	int			taken; +}; + +static struct sort_dimension sort_dimensions[] = { +	{ .name = "pid",	.entry = &sort_thread,	}, +	{ .name = "comm",	.entry = &sort_comm,	}, +	{ .name = "dso",	.entry = &sort_dso,	}, +	{ .name = "symbol",	.entry = &sort_sym,	}, +}; + +static LIST_HEAD(hist_entry__sort_list); + +static int sort_dimension__add(char *tok) +{ +	int i; + +	for (i = 0; i < ARRAY_SIZE(sort_dimensions); i++) { +		struct sort_dimension *sd = &sort_dimensions[i]; + +		if (sd->taken) +			continue; + +		if (strncasecmp(tok, sd->name, strlen(tok))) +			continue; + +		if (sd->entry->collapse) +			sort__need_collapse = 1; + +		list_add_tail(&sd->entry->list, &hist_entry__sort_list); +		sd->taken = 1; + +		return 0; +	} + +	return -ESRCH; +} + +static int64_t +hist_entry__cmp(struct hist_entry *left, struct hist_entry *right) +{ +	struct sort_entry *se; +	int64_t cmp = 0; + +	list_for_each_entry(se, &hist_entry__sort_list, list) { +		cmp = se->cmp(left, right); +		if (cmp) +			break; +	} + +	return cmp; +} + +static int64_t +hist_entry__collapse(struct hist_entry *left, struct hist_entry *right) +{ +	struct sort_entry *se; +	int64_t cmp = 0; + +	list_for_each_entry(se, &hist_entry__sort_list, list) { +		int64_t (*f)(struct hist_entry *, struct hist_entry *); + +		f = se->collapse ?: se->cmp; + +		cmp = f(left, right); +		if (cmp) +			break; +	} + +	return cmp; +} + +static size_t +hist_entry__fprintf(FILE *fp, struct hist_entry *self, __u64 total_samples) +{ +	struct sort_entry *se; +	size_t ret; + +	if (total_samples) { +		double percent = self->count * 100.0 / total_samples; +		char *color = PERF_COLOR_NORMAL; + +		/* +		 * We color high-overhead entries in red, mid-overhead +		 * entries in green - and keep the low overhead places +		 * normal: +		 */ +		if (percent >= 5.0) { +			color = PERF_COLOR_RED; +		} else { +			if (percent >= 0.5) +				color = PERF_COLOR_GREEN; +		} + +		ret = color_fprintf(fp, color, "   %6.2f%%", +				(self->count * 100.0) / total_samples); +	} else +		ret = fprintf(fp, "%12Ld ", self->count); + +	list_for_each_entry(se, &hist_entry__sort_list, list) { +		fprintf(fp, "  "); +		ret += se->print(fp, self); +	} + +	ret += fprintf(fp, "\n"); + +	return ret; +} + +/* + * collect histogram counts + */ + +static int +hist_entry__add(struct thread *thread, struct map *map, struct dso *dso, +		struct symbol *sym, __u64 ip, char level, __u64 count) +{ +	struct rb_node **p = &hist.rb_node; +	struct rb_node *parent = NULL; +	struct hist_entry *he; +	struct hist_entry entry = { +		.thread	= thread, +		.map	= map, +		.dso	= dso, +		.sym	= sym, +		.ip	= ip, +		.level	= level, +		.count	= count, +	}; +	int cmp; + +	while (*p != NULL) { +		parent = *p; +		he = rb_entry(parent, struct hist_entry, rb_node); + +		cmp = hist_entry__cmp(&entry, he); + +		if (!cmp) { +			he->count += count; +			return 0; +		} + +		if (cmp < 0) +			p = 
&(*p)->rb_left; +		else +			p = &(*p)->rb_right; +	} + +	he = malloc(sizeof(*he)); +	if (!he) +		return -ENOMEM; +	*he = entry; +	rb_link_node(&he->rb_node, parent, p); +	rb_insert_color(&he->rb_node, &hist); + +	return 0; +} + +static void hist_entry__free(struct hist_entry *he) +{ +	free(he); +} + +/* + * collapse the histogram + */ + +static struct rb_root collapse_hists; + +static void collapse__insert_entry(struct hist_entry *he) +{ +	struct rb_node **p = &collapse_hists.rb_node; +	struct rb_node *parent = NULL; +	struct hist_entry *iter; +	int64_t cmp; + +	while (*p != NULL) { +		parent = *p; +		iter = rb_entry(parent, struct hist_entry, rb_node); + +		cmp = hist_entry__collapse(iter, he); + +		if (!cmp) { +			iter->count += he->count; +			hist_entry__free(he); +			return; +		} + +		if (cmp < 0) +			p = &(*p)->rb_left; +		else +			p = &(*p)->rb_right; +	} + +	rb_link_node(&he->rb_node, parent, p); +	rb_insert_color(&he->rb_node, &collapse_hists); +} + +static void collapse__resort(void) +{ +	struct rb_node *next; +	struct hist_entry *n; + +	if (!sort__need_collapse) +		return; + +	next = rb_first(&hist); +	while (next) { +		n = rb_entry(next, struct hist_entry, rb_node); +		next = rb_next(&n->rb_node); + +		rb_erase(&n->rb_node, &hist); +		collapse__insert_entry(n); +	} +} + +/* + * reverse the map, sort on count. + */ + +static struct rb_root output_hists; + +static void output__insert_entry(struct hist_entry *he) +{ +	struct rb_node **p = &output_hists.rb_node; +	struct rb_node *parent = NULL; +	struct hist_entry *iter; + +	while (*p != NULL) { +		parent = *p; +		iter = rb_entry(parent, struct hist_entry, rb_node); + +		if (he->count > iter->count) +			p = &(*p)->rb_left; +		else +			p = &(*p)->rb_right; +	} + +	rb_link_node(&he->rb_node, parent, p); +	rb_insert_color(&he->rb_node, &output_hists); +} + +static void output__resort(void) +{ +	struct rb_node *next; +	struct hist_entry *n; +	struct rb_root *tree = &hist; + +	if (sort__need_collapse) +		tree = &collapse_hists; + +	next = rb_first(tree); + +	while (next) { +		n = rb_entry(next, struct hist_entry, rb_node); +		next = rb_next(&n->rb_node); + +		rb_erase(&n->rb_node, tree); +		output__insert_entry(n); +	} +} + +static size_t output__fprintf(FILE *fp, __u64 total_samples) +{ +	struct hist_entry *pos; +	struct sort_entry *se; +	struct rb_node *nd; +	size_t ret = 0; + +	fprintf(fp, "\n"); +	fprintf(fp, "#\n"); +	fprintf(fp, "# (%Ld samples)\n", (__u64)total_samples); +	fprintf(fp, "#\n"); + +	fprintf(fp, "# Overhead"); +	list_for_each_entry(se, &hist_entry__sort_list, list) +		fprintf(fp, "  %s", se->header); +	fprintf(fp, "\n"); + +	fprintf(fp, "# ........"); +	list_for_each_entry(se, &hist_entry__sort_list, list) { +		int i; + +		fprintf(fp, "  "); +		for (i = 0; i < strlen(se->header); i++) +			fprintf(fp, "."); +	} +	fprintf(fp, "\n"); + +	fprintf(fp, "#\n"); + +	for (nd = rb_first(&output_hists); nd; nd = rb_next(nd)) { +		pos = rb_entry(nd, struct hist_entry, rb_node); +		ret += hist_entry__fprintf(fp, pos, total_samples); +	} + +	if (!strcmp(sort_order, default_sort_order)) { +		fprintf(fp, "#\n"); +		fprintf(fp, "# (For more details, try: perf report --sort comm,dso,symbol)\n"); +		fprintf(fp, "#\n"); +	} +	fprintf(fp, "\n"); + +	return ret; +} + +static void register_idle_thread(void) +{ +	struct thread *thread = threads__findnew(0); + +	if (thread == NULL || +			thread__set_comm(thread, "[idle]")) { +		fprintf(stderr, "problem inserting idle task.\n"); +		exit(-1); +	} +} + +static unsigned long total = 0, +		     
total_mmap = 0,
+		     total_comm = 0,
+		     total_fork = 0,
+		     total_unknown = 0;
+
+static int
+process_overflow_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	char level;
+	int show = 0;
+	struct dso *dso = NULL;
+	struct thread *thread = threads__findnew(event->ip.pid);
+	__u64 ip = event->ip.ip;
+	__u64 period = 1;
+	struct map *map = NULL;
+
+	if (event->header.type & PERF_SAMPLE_PERIOD)
+		period = event->ip.period;
+
+	dprintf("%p [%p]: PERF_EVENT (IP, %d): %d: %p period: %Ld\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->header.misc,
+		event->ip.pid,
+		(void *)(long)ip,
+		(long long)period);
+
+	if (thread == NULL) {
+		fprintf(stderr, "problem processing %d event, skipping it.\n",
+			event->header.type);
+		return -1;
+	}
+
+	dprintf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+	if (event->header.misc & PERF_EVENT_MISC_KERNEL) {
+		show = SHOW_KERNEL;
+		level = 'k';
+
+		dso = kernel_dso;
+
+		dprintf(" ...... dso: %s\n", dso->name);
+
+	} else if (event->header.misc & PERF_EVENT_MISC_USER) {
+
+		show = SHOW_USER;
+		level = '.';
+
+		map = thread__find_map(thread, ip);
+		if (map != NULL) {
+			ip = map->map_ip(map, ip);
+			dso = map->dso;
+		} else {
+			/*
+			 * If this is outside of all known maps,
+			 * and is a negative address, try to look it
+			 * up in the kernel dso, as it might be a
+			 * vsyscall (which executes in user-mode):
+			 */
+			if ((long long)ip < 0)
+				dso = kernel_dso;
+		}
+		dprintf(" ...... dso: %s\n", dso ? dso->name : "<not found>");
+
+	} else {
+		show = SHOW_HV;
+		level = 'H';
+		dprintf(" ...... dso: [hypervisor]\n");
+	}
+
+	if (show & show_mask) {
+		struct symbol *sym = NULL;
+
+		if (dso)
+			sym = dso->find_symbol(dso, ip);
+
+		if (hist_entry__add(thread, map, dso, sym, ip, level, period)) {
+			fprintf(stderr,
+		"problem incrementing symbol count, skipping event\n");
+			return -1;
+		}
+	}
+	total += period;
+
+	return 0;
+}
+
+static int
+process_mmap_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->mmap.pid);
+	struct map *map = map__new(&event->mmap);
+
+	dprintf("%p [%p]: PERF_EVENT_MMAP %d: [%p(%p) @ %p]: %s\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->mmap.pid,
+		(void *)(long)event->mmap.start,
+		(void *)(long)event->mmap.len,
+		(void *)(long)event->mmap.pgoff,
+		event->mmap.filename);
+
+	if (thread == NULL || map == NULL) {
+		dprintf("problem processing PERF_EVENT_MMAP, skipping event.\n");
+		return 0;
+	}
+
+	thread__insert_map(thread, map);
+	total_mmap++;
+
+	return 0;
+}
+
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->comm.pid);
+
+	dprintf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->comm.comm, event->comm.pid);
+
+	if (thread == NULL ||
+	    thread__set_comm(thread, event->comm.comm)) {
+		dprintf("problem processing PERF_EVENT_COMM, skipping event.\n");
+		return -1;
+	}
+	total_comm++;
+
+	return 0;
+}
+
+static int
+process_fork_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread = threads__findnew(event->fork.pid);
+	struct thread *parent = threads__findnew(event->fork.ppid);
+
+	dprintf("%p [%p]: PERF_EVENT_FORK: %d:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->fork.pid,
event->fork.ppid); + +	if (!thread || !parent || thread__fork(thread, parent)) { +		dprintf("problem processing PERF_EVENT_FORK, skipping event.\n"); +		return -1; +	} +	total_fork++; + +	return 0; +} + +static int +process_period_event(event_t *event, unsigned long offset, unsigned long head) +{ +	dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n", +		(void *)(offset + head), +		(void *)(long)(event->header.size), +		event->period.time, +		event->period.id, +		event->period.sample_period); + +	return 0; +} + +static int +process_event(event_t *event, unsigned long offset, unsigned long head) +{ +	if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) +		return process_overflow_event(event, offset, head); + +	switch (event->header.type) { +	case PERF_EVENT_MMAP: +		return process_mmap_event(event, offset, head); + +	case PERF_EVENT_COMM: +		return process_comm_event(event, offset, head); + +	case PERF_EVENT_FORK: +		return process_fork_event(event, offset, head); + +	case PERF_EVENT_PERIOD: +		return process_period_event(event, offset, head); +	/* +	 * We dont process them right now but they are fine: +	 */ + +	case PERF_EVENT_THROTTLE: +	case PERF_EVENT_UNTHROTTLE: +		return 0; + +	default: +		return -1; +	} + +	return 0; +} + +static int __cmd_report(void) +{ +	int ret, rc = EXIT_FAILURE; +	unsigned long offset = 0; +	unsigned long head = 0; +	struct stat stat; +	event_t *event; +	uint32_t size; +	char *buf; + +	register_idle_thread(); + +	input = open(input_name, O_RDONLY); +	if (input < 0) { +		fprintf(stderr, " failed to open file: %s", input_name); +		if (!strcmp(input_name, "perf.data")) +			fprintf(stderr, "  (try 'perf record' first)"); +		fprintf(stderr, "\n"); +		exit(-1); +	} + +	ret = fstat(input, &stat); +	if (ret < 0) { +		perror("failed to stat file"); +		exit(-1); +	} + +	if (!stat.st_size) { +		fprintf(stderr, "zero-sized file, nothing to do!\n"); +		exit(0); +	} + +	if (load_kernel() < 0) { +		perror("failed to load kernel symbols"); +		return EXIT_FAILURE; +	} + +	if (!full_paths) { +		if (getcwd(__cwd, sizeof(__cwd)) == NULL) { +			perror("failed to get the current directory"); +			return EXIT_FAILURE; +		} +		cwdlen = strlen(cwd); +	} else { +		cwd = NULL; +		cwdlen = 0; +	} +remap: +	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ, +			   MAP_SHARED, input, offset); +	if (buf == MAP_FAILED) { +		perror("failed to mmap file"); +		exit(-1); +	} + +more: +	event = (event_t *)(buf + head); + +	size = event->header.size; +	if (!size) +		size = 8; + +	if (head + event->header.size >= page_size * mmap_window) { +		unsigned long shift = page_size * (head / page_size); +		int ret; + +		ret = munmap(buf, page_size * mmap_window); +		assert(ret == 0); + +		offset += shift; +		head -= shift; +		goto remap; +	} + +	size = event->header.size; + +	dprintf("%p [%p]: event: %d\n", +			(void *)(offset + head), +			(void *)(long)event->header.size, +			event->header.type); + +	if (!size || process_event(event, offset, head) < 0) { + +		dprintf("%p [%p]: skipping unknown header type: %d\n", +			(void *)(offset + head), +			(void *)(long)(event->header.size), +			event->header.type); + +		total_unknown++; + +		/* +		 * assume we lost track of the stream, check alignment, and +		 * increment a single u64 in the hope to catch on again 'soon'. 
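+		 * The output stream is u64-aligned, so this at least keeps
+		 * the following reads on a valid record boundary.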
+		 */ + +		if (unlikely(head & 7)) +			head &= ~7ULL; + +		size = 8; +	} + +	head += size; + +	if (offset + head < stat.st_size) +		goto more; + +	rc = EXIT_SUCCESS; +	close(input); + +	dprintf("      IP events: %10ld\n", total); +	dprintf("    mmap events: %10ld\n", total_mmap); +	dprintf("    comm events: %10ld\n", total_comm); +	dprintf("    fork events: %10ld\n", total_fork); +	dprintf(" unknown events: %10ld\n", total_unknown); + +	if (dump_trace) +		return 0; + +	if (verbose >= 3) +		threads__fprintf(stdout); + +	if (verbose >= 2) +		dsos__fprintf(stdout); + +	collapse__resort(); +	output__resort(); +	output__fprintf(stdout, total); + +	return rc; +} + +static const char * const report_usage[] = { +	"perf report [<options>] <command>", +	NULL +}; + +static const struct option options[] = { +	OPT_STRING('i', "input", &input_name, "file", +		    "input file name"), +	OPT_BOOLEAN('v', "verbose", &verbose, +		    "be more verbose (show symbol address, etc)"), +	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace, +		    "dump raw trace in ASCII"), +	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"), +	OPT_STRING('s', "sort", &sort_order, "key[,key2...]", +		   "sort by key(s): pid, comm, dso, symbol. Default: pid,symbol"), +	OPT_BOOLEAN('P', "full-paths", &full_paths, +		    "Don't shorten the pathnames taking into account the cwd"), +	OPT_END() +}; + +static void setup_sorting(void) +{ +	char *tmp, *tok, *str = strdup(sort_order); + +	for (tok = strtok_r(str, ", ", &tmp); +			tok; tok = strtok_r(NULL, ", ", &tmp)) { +		if (sort_dimension__add(tok) < 0) { +			error("Unknown --sort key: `%s'", tok); +			usage_with_options(report_usage, options); +		} +	} + +	free(str); +} + +int cmd_report(int argc, const char **argv, const char *prefix) +{ +	symbol__init(); + +	page_size = getpagesize(); + +	argc = parse_options(argc, argv, options, report_usage, 0); + +	setup_sorting(); + +	/* +	 * Any (unrecognized) arguments left? +	 */ +	if (argc) +		usage_with_options(report_usage, options); + +	setup_pager(); + +	return __cmd_report(); +} diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c new file mode 100644 index 00000000000..c43e4a97dc4 --- /dev/null +++ b/tools/perf/builtin-stat.c @@ -0,0 +1,367 @@ +/* + * builtin-stat.c + * + * Builtin stat command: Give a precise performance counters summary + * overview about any workload, CPU or specific PID. + * + * Sample output: + +   $ perf stat ~/hackbench 10 +   Time: 0.104 + +    Performance counter stats for '/home/mingo/hackbench': + +       1255.538611  task clock ticks     #      10.143 CPU utilization factor +             54011  context switches     #       0.043 M/sec +               385  CPU migrations       #       0.000 M/sec +             17755  pagefaults           #       0.014 M/sec +        3808323185  CPU cycles           #    3033.219 M/sec +        1575111190  instructions         #    1254.530 M/sec +          17367895  cache references     #      13.833 M/sec +           7674421  cache misses         #       6.112 M/sec + +    Wall-clock time elapsed:   123.786620 msecs + + * + * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com> + * + * Improvements and fixes by: + * + *   Arjan van de Ven <arjan@linux.intel.com> + *   Yanmin Zhang <yanmin.zhang@intel.com> + *   Wu Fengguang <fengguang.wu@intel.com> + *   Mike Galbraith <efault@gmx.de> + *   Paul Mackerras <paulus@samba.org> + * + * Released under the GPL v2. 
(and only v2, not any later version) + */ + +#include "perf.h" +#include "builtin.h" +#include "util/util.h" +#include "util/parse-options.h" +#include "util/parse-events.h" + +#include <sys/prctl.h> + +static struct perf_counter_attr default_attrs[MAX_COUNTERS] = { + +  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK	}, +  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES}, +  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS	}, +  { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS	}, + +  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES	}, +  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS	}, +  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES}, +  { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES	}, + +}; + +static int			system_wide			=  0; +static int			inherit				=  1; +static int			verbose				=  0; + +static int			fd[MAX_NR_CPUS][MAX_COUNTERS]; + +static int			target_pid			= -1; +static int			nr_cpus				=  0; +static unsigned int		page_size; + +static int			scale				=  1; + +static const unsigned int default_count[] = { +	1000000, +	1000000, +	  10000, +	  10000, +	1000000, +	  10000, +}; + +static __u64			event_res[MAX_COUNTERS][3]; +static __u64			event_scaled[MAX_COUNTERS]; + +static __u64			runtime_nsecs; +static __u64			walltime_nsecs; +static __u64			runtime_cycles; + +static void create_perf_stat_counter(int counter) +{ +	struct perf_counter_attr *attr = attrs + counter; + +	if (scale) +		attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | +				    PERF_FORMAT_TOTAL_TIME_RUNNING; + +	if (system_wide) { +		int cpu; +		for (cpu = 0; cpu < nr_cpus; cpu ++) { +			fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0); +			if (fd[cpu][counter] < 0 && verbose) { +				printf("Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n", counter, fd[cpu][counter], strerror(errno)); +			} +		} +	} else { +		attr->inherit	= inherit; +		attr->disabled	= 1; + +		fd[0][counter] = sys_perf_counter_open(attr, 0, -1, -1, 0); +		if (fd[0][counter] < 0 && verbose) { +			printf("Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n", counter, fd[0][counter], strerror(errno)); +		} +	} +} + +/* + * Does the counter have nsecs as a unit? + */ +static inline int nsec_counter(int counter) +{ +	if (attrs[counter].type != PERF_TYPE_SOFTWARE) +		return 0; + +	if (attrs[counter].config == PERF_COUNT_SW_CPU_CLOCK) +		return 1; + +	if (attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) +		return 1; + +	return 0; +} + +/* + * Read out the results of a single counter: + */ +static void read_counter(int counter) +{ +	__u64 *count, single_count[3]; +	ssize_t res; +	int cpu, nv; +	int scaled; + +	count = event_res[counter]; + +	count[0] = count[1] = count[2] = 0; + +	nv = scale ? 
3 : 1; +	for (cpu = 0; cpu < nr_cpus; cpu ++) { +		if (fd[cpu][counter] < 0) +			continue; + +		res = read(fd[cpu][counter], single_count, nv * sizeof(__u64)); +		assert(res == nv * sizeof(__u64)); + +		count[0] += single_count[0]; +		if (scale) { +			count[1] += single_count[1]; +			count[2] += single_count[2]; +		} +	} + +	scaled = 0; +	if (scale) { +		if (count[2] == 0) { +			event_scaled[counter] = -1; +			count[0] = 0; +			return; +		} + +		if (count[2] < count[1]) { +			event_scaled[counter] = 1; +			count[0] = (unsigned long long) +				((double)count[0] * count[1] / count[2] + 0.5); +		} +	} +	/* +	 * Save the full runtime - to allow normalization during printout: +	 */ +	if (attrs[counter].type == PERF_TYPE_SOFTWARE && +		attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) +		runtime_nsecs = count[0]; +	if (attrs[counter].type == PERF_TYPE_HARDWARE && +		attrs[counter].config == PERF_COUNT_HW_CPU_CYCLES) +		runtime_cycles = count[0]; +} + +/* + * Print out the results of a single counter: + */ +static void print_counter(int counter) +{ +	__u64 *count; +	int scaled; + +	count = event_res[counter]; +	scaled = event_scaled[counter]; + +	if (scaled == -1) { +		fprintf(stderr, " %14s  %-20s\n", +			"<not counted>", event_name(counter)); +		return; +	} + +	if (nsec_counter(counter)) { +		double msecs = (double)count[0] / 1000000; + +		fprintf(stderr, " %14.6f  %-20s", +			msecs, event_name(counter)); +		if (attrs[counter].type == PERF_TYPE_SOFTWARE && +			attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) { + +			if (walltime_nsecs) +				fprintf(stderr, " # %11.3f CPU utilization factor", +					(double)count[0] / (double)walltime_nsecs); +		} +	} else { +		fprintf(stderr, " %14Ld  %-20s", +			count[0], event_name(counter)); +		if (runtime_nsecs) +			fprintf(stderr, " # %11.3f M/sec", +				(double)count[0]/runtime_nsecs*1000.0); +		if (runtime_cycles && +			attrs[counter].type == PERF_TYPE_HARDWARE && +				attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) { + +			fprintf(stderr, " # %1.3f per cycle", +				(double)count[0] / (double)runtime_cycles); +		} +	} +	if (scaled) +		fprintf(stderr, "  (scaled from %.2f%%)", +			(double) count[2] / count[1] * 100); +	fprintf(stderr, "\n"); +} + +static int do_perf_stat(int argc, const char **argv) +{ +	unsigned long long t0, t1; +	int counter; +	int status; +	int pid; +	int i; + +	if (!system_wide) +		nr_cpus = 1; + +	for (counter = 0; counter < nr_counters; counter++) +		create_perf_stat_counter(counter); + +	/* +	 * Enable counters and exec the command: +	 */ +	t0 = rdclock(); +	prctl(PR_TASK_PERF_COUNTERS_ENABLE); + +	if ((pid = fork()) < 0) +		perror("failed to fork"); + +	if (!pid) { +		if (execvp(argv[0], (char **)argv)) { +			perror(argv[0]); +			exit(-1); +		} +	} + +	while (wait(&status) >= 0) +		; + +	prctl(PR_TASK_PERF_COUNTERS_DISABLE); +	t1 = rdclock(); + +	walltime_nsecs = t1 - t0; + +	fflush(stdout); + +	fprintf(stderr, "\n"); +	fprintf(stderr, " Performance counter stats for \'%s", argv[0]); + +	for (i = 1; i < argc; i++) +		fprintf(stderr, " %s", argv[i]); + +	fprintf(stderr, "\':\n"); +	fprintf(stderr, "\n"); + +	for (counter = 0; counter < nr_counters; counter++) +		read_counter(counter); + +	for (counter = 0; counter < nr_counters; counter++) +		print_counter(counter); + + +	fprintf(stderr, "\n"); +	fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n", +			(double)(t1-t0)/1e6); +	fprintf(stderr, "\n"); + +	return 0; +} + +static volatile int signr = -1; + +static void skip_signal(int signo) +{ +	signr = signo; +} + 
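+/*
+ * skip_signal() only records which signal arrived; sig_atexit() below
+ * re-raises it via the default handler once the counts have been
+ * printed, so the parent shell still sees the expected termination
+ * status.
+ */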
+static void sig_atexit(void) +{ +	if (signr == -1) +		return; + +	signal(signr, SIG_DFL); +	kill(getpid(), signr); +} + +static const char * const stat_usage[] = { +	"perf stat [<options>] <command>", +	NULL +}; + +static const struct option options[] = { +	OPT_CALLBACK('e', "event", NULL, "event", +		     "event selector. use 'perf list' to list available events", +		     parse_events), +	OPT_BOOLEAN('i', "inherit", &inherit, +		    "child tasks inherit counters"), +	OPT_INTEGER('p', "pid", &target_pid, +		    "stat events on existing pid"), +	OPT_BOOLEAN('a', "all-cpus", &system_wide, +			    "system-wide collection from all CPUs"), +	OPT_BOOLEAN('S', "scale", &scale, +			    "scale/normalize counters"), +	OPT_BOOLEAN('v', "verbose", &verbose, +		    "be more verbose (show counter open errors, etc)"), +	OPT_END() +}; + +int cmd_stat(int argc, const char **argv, const char *prefix) +{ +	page_size = sysconf(_SC_PAGE_SIZE); + +	memcpy(attrs, default_attrs, sizeof(attrs)); + +	argc = parse_options(argc, argv, options, stat_usage, 0); +	if (!argc) +		usage_with_options(stat_usage, options); + +	if (!nr_counters) +		nr_counters = 8; + +	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); +	assert(nr_cpus <= MAX_NR_CPUS); +	assert(nr_cpus >= 0); + +	/* +	 * We dont want to block the signals - that would cause +	 * child tasks to inherit that and Ctrl-C would not work. +	 * What we want is for Ctrl-C to work in the exec()-ed +	 * task, but being ignored by perf stat itself: +	 */ +	atexit(sig_atexit); +	signal(SIGINT,  skip_signal); +	signal(SIGALRM, skip_signal); +	signal(SIGABRT, skip_signal); + +	return do_perf_stat(argc, argv); +} diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c new file mode 100644 index 00000000000..fe338d3c5d7 --- /dev/null +++ b/tools/perf/builtin-top.c @@ -0,0 +1,736 @@ +/* + * builtin-top.c + * + * Builtin top command: Display a continuously updated profile of + * any workload, CPU or specific PID. + * + * Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com> + * + * Improvements and fixes by: + * + *   Arjan van de Ven <arjan@linux.intel.com> + *   Yanmin Zhang <yanmin.zhang@intel.com> + *   Wu Fengguang <fengguang.wu@intel.com> + *   Mike Galbraith <efault@gmx.de> + *   Paul Mackerras <paulus@samba.org> + * + * Released under the GPL v2. 
(and only v2, not any later version) + */ +#include "builtin.h" + +#include "perf.h" + +#include "util/symbol.h" +#include "util/color.h" +#include "util/util.h" +#include "util/rbtree.h" +#include "util/parse-options.h" +#include "util/parse-events.h" + +#include <assert.h> +#include <fcntl.h> + +#include <stdio.h> + +#include <errno.h> +#include <time.h> +#include <sched.h> +#include <pthread.h> + +#include <sys/syscall.h> +#include <sys/ioctl.h> +#include <sys/poll.h> +#include <sys/prctl.h> +#include <sys/wait.h> +#include <sys/uio.h> +#include <sys/mman.h> + +#include <linux/unistd.h> +#include <linux/types.h> + +static int			fd[MAX_NR_CPUS][MAX_COUNTERS]; + +static int			system_wide			=  0; + +static int			default_interval		= 100000; + +static __u64			count_filter			=  5; +static int			print_entries			= 15; + +static int			target_pid			= -1; +static int			profile_cpu			= -1; +static int			nr_cpus				=  0; +static unsigned int		realtime_prio			=  0; +static int			group				=  0; +static unsigned int		page_size; +static unsigned int		mmap_pages			= 16; +static int			freq				=  0; +static int			verbose				=  0; + +static char			*sym_filter; +static unsigned long		filter_start; +static unsigned long		filter_end; + +static int			delay_secs			=  2; +static int			zero; +static int			dump_symtab; + +/* + * Symbols + */ + +static __u64			min_ip; +static __u64			max_ip = -1ll; + +struct sym_entry { +	struct rb_node		rb_node; +	struct list_head	node; +	unsigned long		count[MAX_COUNTERS]; +	unsigned long		snap_count; +	double			weight; +	int			skip; +}; + +struct sym_entry		*sym_filter_entry; + +struct dso			*kernel_dso; + +/* + * Symbols will be added here in record_ip and will get out + * after decayed. + */ +static LIST_HEAD(active_symbols); +static pthread_mutex_t active_symbols_lock = PTHREAD_MUTEX_INITIALIZER; + +/* + * Ordering weight: count-1 * count-2 * ... 
/ count-n
+ */
+static double sym_weight(const struct sym_entry *sym)
+{
+	double weight = sym->snap_count;
+	int counter;
+
+	for (counter = 1; counter < nr_counters-1; counter++)
+		weight *= sym->count[counter];
+
+	weight /= (sym->count[counter] + 1);
+
+	return weight;
+}
+
+static long			samples;
+static long			userspace_samples;
+static const char		CONSOLE_CLEAR[] = "\033[H\033[2J";
+
+static void __list_insert_active_sym(struct sym_entry *syme)
+{
+	list_add(&syme->node, &active_symbols);
+}
+
+static void list_remove_active_sym(struct sym_entry *syme)
+{
+	pthread_mutex_lock(&active_symbols_lock);
+	list_del_init(&syme->node);
+	pthread_mutex_unlock(&active_symbols_lock);
+}
+
+static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se)
+{
+	struct rb_node **p = &tree->rb_node;
+	struct rb_node *parent = NULL;
+	struct sym_entry *iter;
+
+	while (*p != NULL) {
+		parent = *p;
+		iter = rb_entry(parent, struct sym_entry, rb_node);
+
+		if (se->weight > iter->weight)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&se->rb_node, parent, p);
+	rb_insert_color(&se->rb_node, tree);
+}
+
+static void print_sym_table(void)
+{
+	int printed = 0, j;
+	int counter;
+	float samples_per_sec = samples/delay_secs;
+	float ksamples_per_sec = (samples-userspace_samples)/delay_secs;
+	float sum_ksamples = 0.0;
+	struct sym_entry *syme, *n;
+	struct rb_root tmp = RB_ROOT;
+	struct rb_node *nd;
+
+	samples = userspace_samples = 0;
+
+	/* Sort the active symbols */
+	pthread_mutex_lock(&active_symbols_lock);
+	syme = list_entry(active_symbols.next, struct sym_entry, node);
+	pthread_mutex_unlock(&active_symbols_lock);
+
+	list_for_each_entry_safe_from(syme, n, &active_symbols, node) {
+		syme->snap_count = syme->count[0];
+		if (syme->snap_count != 0) {
+			syme->weight = sym_weight(syme);
+			rb_insert_active_sym(&tmp, syme);
+			sum_ksamples += syme->snap_count;
+
+			for (j = 0; j < nr_counters; j++)
+				syme->count[j] = zero ?
0 : syme->count[j] * 7 / 8; +		} else +			list_remove_active_sym(syme); +	} + +	puts(CONSOLE_CLEAR); + +	printf( +"------------------------------------------------------------------------------\n"); +	printf( "   PerfTop:%8.0f irqs/sec  kernel:%4.1f%% [", +		samples_per_sec, +		100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec))); + +	if (nr_counters == 1) { +		printf("%Ld", attrs[0].sample_period); +		if (freq) +			printf("Hz "); +		else +			printf(" "); +	} + +	for (counter = 0; counter < nr_counters; counter++) { +		if (counter) +			printf("/"); + +		printf("%s", event_name(counter)); +	} + +	printf( "], "); + +	if (target_pid != -1) +		printf(" (target_pid: %d", target_pid); +	else +		printf(" (all"); + +	if (profile_cpu != -1) +		printf(", cpu: %d)\n", profile_cpu); +	else { +		if (target_pid != -1) +			printf(")\n"); +		else +			printf(", %d CPUs)\n", nr_cpus); +	} + +	printf("------------------------------------------------------------------------------\n\n"); + +	if (nr_counters == 1) +		printf("             samples    pcnt"); +	else +		printf("  weight     samples    pcnt"); + +	printf("         RIP          kernel function\n" +	       	       "  ______     _______   _____   ________________   _______________\n\n" +	); + +	for (nd = rb_first(&tmp); nd; nd = rb_next(nd)) { +		struct sym_entry *syme = rb_entry(nd, struct sym_entry, rb_node); +		struct symbol *sym = (struct symbol *)(syme + 1); +		char *color = PERF_COLOR_NORMAL; +		double pcnt; + +		if (++printed > print_entries || syme->snap_count < count_filter) +			continue; + +		pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) / +					 sum_ksamples)); + +		/* +		 * We color high-overhead entries in red, mid-overhead +		 * entries in green - and keep the low overhead places +		 * normal: +		 */ +		if (pcnt >= 5.0) { +			color = PERF_COLOR_RED; +		} else { +			if (pcnt >= 0.5) +				color = PERF_COLOR_GREEN; +		} + +		if (nr_counters == 1) +			printf("%20.2f - ", syme->weight); +		else +			printf("%9.1f %10ld - ", syme->weight, syme->snap_count); + +		color_fprintf(stdout, color, "%4.1f%%", pcnt); +		printf(" - %016llx : %s\n", sym->start, sym->name); +	} +} + +static void *display_thread(void *arg) +{ +	struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; +	int delay_msecs = delay_secs * 1000; + +	printf("PerfTop refresh period: %d seconds\n", delay_secs); + +	do { +		print_sym_table(); +	} while (!poll(&stdin_poll, 1, delay_msecs) == 1); + +	printf("key pressed - exiting.\n"); +	exit(0); + +	return NULL; +} + +static int symbol_filter(struct dso *self, struct symbol *sym) +{ +	static int filter_match; +	struct sym_entry *syme; +	const char *name = sym->name; + +	if (!strcmp(name, "_text") || +	    !strcmp(name, "_etext") || +	    !strcmp(name, "_sinittext") || +	    !strncmp("init_module", name, 11) || +	    !strncmp("cleanup_module", name, 14) || +	    strstr(name, "_text_start") || +	    strstr(name, "_text_end")) +		return 1; + +	syme = dso__sym_priv(self, sym); +	/* Tag samples to be skipped. 
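	 * (the idle routines would otherwise dominate the profile)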
*/ +	if (!strcmp("default_idle", name) || +	    !strcmp("cpu_idle", name) || +	    !strcmp("enter_idle", name) || +	    !strcmp("exit_idle", name) || +	    !strcmp("mwait_idle", name)) +		syme->skip = 1; + +	if (filter_match == 1) { +		filter_end = sym->start; +		filter_match = -1; +		if (filter_end - filter_start > 10000) { +			fprintf(stderr, +				"hm, too large filter symbol <%s> - skipping.\n", +				sym_filter); +			fprintf(stderr, "symbol filter start: %016lx\n", +				filter_start); +			fprintf(stderr, "                end: %016lx\n", +				filter_end); +			filter_end = filter_start = 0; +			sym_filter = NULL; +			sleep(1); +		} +	} + +	if (filter_match == 0 && sym_filter && !strcmp(name, sym_filter)) { +		filter_match = 1; +		filter_start = sym->start; +	} + + +	return 0; +} + +static int parse_symbols(void) +{ +	struct rb_node *node; +	struct symbol  *sym; + +	kernel_dso = dso__new("[kernel]", sizeof(struct sym_entry)); +	if (kernel_dso == NULL) +		return -1; + +	if (dso__load_kernel(kernel_dso, NULL, symbol_filter, 1) != 0) +		goto out_delete_dso; + +	node = rb_first(&kernel_dso->syms); +	sym = rb_entry(node, struct symbol, rb_node); +	min_ip = sym->start; + +	node = rb_last(&kernel_dso->syms); +	sym = rb_entry(node, struct symbol, rb_node); +	max_ip = sym->end; + +	if (dump_symtab) +		dso__fprintf(kernel_dso, stderr); + +	return 0; + +out_delete_dso: +	dso__delete(kernel_dso); +	kernel_dso = NULL; +	return -1; +} + +#define TRACE_COUNT     3 + +/* + * Binary search in the histogram table and record the hit: + */ +static void record_ip(__u64 ip, int counter) +{ +	struct symbol *sym = dso__find_symbol(kernel_dso, ip); + +	if (sym != NULL) { +		struct sym_entry *syme = dso__sym_priv(kernel_dso, sym); + +		if (!syme->skip) { +			syme->count[counter]++; +			pthread_mutex_lock(&active_symbols_lock); +			if (list_empty(&syme->node) || !syme->node.next) +				__list_insert_active_sym(syme); +			pthread_mutex_unlock(&active_symbols_lock); +			return; +		} +	} + +	samples--; +} + +static void process_event(__u64 ip, int counter) +{ +	samples++; + +	if (ip < min_ip || ip > max_ip) { +		userspace_samples++; +		return; +	} + +	record_ip(ip, counter); +} + +struct mmap_data { +	int			counter; +	void			*base; +	unsigned int		mask; +	unsigned int		prev; +}; + +static unsigned int mmap_read_head(struct mmap_data *md) +{ +	struct perf_counter_mmap_page *pc = md->base; +	int head; + +	head = pc->data_head; +	rmb(); + +	return head; +} + +struct timeval last_read, this_read; + +static void mmap_read_counter(struct mmap_data *md) +{ +	unsigned int head = mmap_read_head(md); +	unsigned int old = md->prev; +	unsigned char *data = md->base + page_size; +	int diff; + +	gettimeofday(&this_read, NULL); + +	/* +	 * If we're further behind than half the buffer, there's a chance +	 * the writer will bite our tail and mess up the samples under us. +	 * +	 * If we somehow ended up ahead of the head, we got messed up. +	 * +	 * In either case, truncate and restart at head. +	 */ +	diff = head - old; +	if (diff > md->mask / 2 || diff < 0) { +		struct timeval iv; +		unsigned long msecs; + +		timersub(&this_read, &last_read, &iv); +		msecs = iv.tv_sec*1000 + iv.tv_usec/1000; + +		fprintf(stderr, "WARNING: failed to keep up with mmap data." +				"  Last read %lu msecs ago.\n", msecs); + +		/* +		 * head points to a known good entry, start there. 
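+		 * (whatever was pending between the old tail and head
+		 * is discarded)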
+		 */ +		old = head; +	} + +	last_read = this_read; + +	for (; old != head;) { +		struct ip_event { +			struct perf_event_header header; +			__u64 ip; +			__u32 pid, target_pid; +		}; +		struct mmap_event { +			struct perf_event_header header; +			__u32 pid, target_pid; +			__u64 start; +			__u64 len; +			__u64 pgoff; +			char filename[PATH_MAX]; +		}; + +		typedef union event_union { +			struct perf_event_header header; +			struct ip_event ip; +			struct mmap_event mmap; +		} event_t; + +		event_t *event = (event_t *)&data[old & md->mask]; + +		event_t event_copy; + +		size_t size = event->header.size; + +		/* +		 * Event straddles the mmap boundary -- header should always +		 * be inside due to u64 alignment of output. +		 */ +		if ((old & md->mask) + size != ((old + size) & md->mask)) { +			unsigned int offset = old; +			unsigned int len = min(sizeof(*event), size), cpy; +			void *dst = &event_copy; + +			do { +				cpy = min(md->mask + 1 - (offset & md->mask), len); +				memcpy(dst, &data[offset & md->mask], cpy); +				offset += cpy; +				dst += cpy; +				len -= cpy; +			} while (len); + +			event = &event_copy; +		} + +		old += size; + +		if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) { +			if (event->header.type & PERF_SAMPLE_IP) +				process_event(event->ip.ip, md->counter); +		} +	} + +	md->prev = old; +} + +static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; +static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; + +static void mmap_read(void) +{ +	int i, counter; + +	for (i = 0; i < nr_cpus; i++) { +		for (counter = 0; counter < nr_counters; counter++) +			mmap_read_counter(&mmap_array[i][counter]); +	} +} + +int nr_poll; +int group_fd; + +static void start_counter(int i, int counter) +{ +	struct perf_counter_attr *attr; +	unsigned int cpu; + +	cpu = profile_cpu; +	if (target_pid == -1 && profile_cpu == -1) +		cpu = i; + +	attr = attrs + counter; + +	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID; +	attr->freq		= freq; + +try_again: +	fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0); + +	if (fd[i][counter] < 0) { +		int err = errno; + +		if (err == EPERM) +			die("No permission - are you root?\n"); +		/* +		 * If it's cycles then fall back to hrtimer +		 * based cpu-clock-tick sw counter, which +		 * is always available even if no PMU support: +		 */ +		if (attr->type == PERF_TYPE_HARDWARE +			&& attr->config == PERF_COUNT_HW_CPU_CYCLES) { + +			if (verbose) +				warning(" ... 
trying to fall back to cpu-clock-ticks\n");
+
+			attr->type = PERF_TYPE_SOFTWARE;
+			attr->config = PERF_COUNT_SW_CPU_CLOCK;
+			goto try_again;
+		}
+		printf("\n");
+		error("perfcounter syscall returned with %d (%s)\n",
+			fd[i][counter], strerror(err));
+		die("No CONFIG_PERF_COUNTERS=y kernel support configured?\n");
+		exit(-1);
+	}
+	assert(fd[i][counter] >= 0);
+	fcntl(fd[i][counter], F_SETFL, O_NONBLOCK);
+
+	/*
+	 * First counter acts as the group leader:
+	 */
+	if (group && group_fd == -1)
+		group_fd = fd[i][counter];
+
+	event_array[nr_poll].fd = fd[i][counter];
+	event_array[nr_poll].events = POLLIN;
+	nr_poll++;
+
+	mmap_array[i][counter].counter = counter;
+	mmap_array[i][counter].prev = 0;
+	mmap_array[i][counter].mask = mmap_pages*page_size - 1;
+	mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+			PROT_READ, MAP_SHARED, fd[i][counter], 0);
+	if (mmap_array[i][counter].base == MAP_FAILED)
+		die("failed to mmap with %d (%s)\n", errno, strerror(errno));
+}
+
+static int __cmd_top(void)
+{
+	pthread_t thread;
+	int i, counter;
+	int ret;
+
+	for (i = 0; i < nr_cpus; i++) {
+		group_fd = -1;
+		for (counter = 0; counter < nr_counters; counter++)
+			start_counter(i, counter);
+	}
+
+	/* Wait for a minimal set of events before starting the snapshot */
+	poll(event_array, nr_poll, 100);
+
+	mmap_read();
+
+	if (pthread_create(&thread, NULL, display_thread, NULL)) {
+		printf("Could not create display thread.\n");
+		exit(-1);
+	}
+
+	if (realtime_prio) {
+		struct sched_param param;
+
+		param.sched_priority = realtime_prio;
+		if (sched_setscheduler(0, SCHED_FIFO, &param)) {
+			printf("Could not set realtime priority.\n");
+			exit(-1);
+		}
+	}
+
+	while (1) {
+		int hits = samples;
+
+		mmap_read();
+
+		if (hits == samples)
+			ret = poll(event_array, nr_poll, 100);
+	}
+
+	return 0;
+}
+
+static const char * const top_usage[] = {
+	"perf top [<options>]",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_CALLBACK('e', "event", NULL, "event",
+		     "event selector. use 'perf list' to list available events",
+		     parse_events),
+	OPT_INTEGER('c', "count", &default_interval,
+		    "event period to sample"),
+	OPT_INTEGER('p', "pid", &target_pid,
+		    "profile events on existing pid"),
+	OPT_BOOLEAN('a', "all-cpus", &system_wide,
+			    "system-wide collection from all CPUs"),
+	OPT_INTEGER('C', "CPU", &profile_cpu,
+		    "CPU to profile on"),
+	OPT_INTEGER('m', "mmap-pages", &mmap_pages,
+		    "number of mmap data pages"),
+	OPT_INTEGER('r', "realtime", &realtime_prio,
+		    "collect data with this RT SCHED_FIFO priority"),
+	OPT_INTEGER('d', "delay", &delay_secs,
+		    "number of seconds to delay between refreshes"),
+	OPT_BOOLEAN('D', "dump-symtab", &dump_symtab,
+			    "dump the symbol table used for profiling"),
+	OPT_INTEGER('f', "count-filter", &count_filter,
+		    "only display functions with more events than this"),
+	OPT_BOOLEAN('g', "group", &group,
+			    "put the counters into a counter group"),
+	OPT_STRING('s', "sym-filter", &sym_filter, "pattern",
+		    "only display symbols matching this pattern"),
+	OPT_BOOLEAN('z', "zero", &zero,
+		    "zero history across updates"),
+	OPT_INTEGER('F', "freq", &freq,
+		    "profile at this frequency"),
+	OPT_INTEGER('E', "entries", &print_entries,
+		    "display this many functions"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show counter open errors, etc)"),
+	OPT_END()
+};
+
+int cmd_top(int argc, const char **argv, const char *prefix)
+{
+	int counter;
+
+	page_size = sysconf(_SC_PAGE_SIZE);
+
+	argc = parse_options(argc, argv, options, top_usage, 0);
+	if (argc)
+		usage_with_options(top_usage, options);
+
+	if (freq) {
+		default_interval = freq;
+		freq = 1;
+	}
+
+	/* CPU and PID are mutually exclusive */
+	if (target_pid != -1 && profile_cpu != -1) {
+		printf("WARNING: PID switch overriding CPU\n");
+		sleep(1);
+		profile_cpu = -1;
+	}
+
+	if (!nr_counters)
+		nr_counters = 1;
+
+	if (delay_secs < 1)
+		delay_secs = 1;
+
+	parse_symbols();
+
+	/*
+	 * Fill in the ones not specifically initialized via -c:
+	 */
+	for (counter = 0; counter < nr_counters; counter++) {
+		if (attrs[counter].sample_period)
+			continue;
+
+		attrs[counter].sample_period = default_interval;
+	}
+
+	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
+	assert(nr_cpus <= MAX_NR_CPUS);
+	assert(nr_cpus >= 0);
+
+	if (target_pid != -1 || profile_cpu != -1)
+		nr_cpus = 1;
+
+	return __cmd_top();
+}
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
new file mode 100644
index 00000000000..51d168230ee
--- /dev/null
+++ b/tools/perf/builtin.h
@@ -0,0 +1,26 @@
+#ifndef BUILTIN_H
+#define BUILTIN_H
+
+#include "util/util.h"
+#include "util/strbuf.h"
+
+extern const char perf_version_string[];
+extern const char perf_usage_string[];
+extern const char perf_more_info_string[];
+
+extern void list_common_cmds_help(void);
+extern const char *help_unknown_cmd(const char *cmd);
+extern void prune_packed_objects(int);
+extern int read_line_with_nul(char *buf, int size, FILE *file);
+extern int check_pager_config(const char *cmd);
+
+extern int cmd_annotate(int argc, const char **argv, const char *prefix);
+extern int cmd_help(int argc, const char **argv, const char *prefix);
+extern int cmd_record(int argc, const char **argv, const char *prefix);
+extern int cmd_report(int argc, const char **argv, const char *prefix);
+extern int cmd_stat(int argc, const char **argv, const char *prefix);
+extern int cmd_top(int argc, const char **argv, const char *prefix);
+extern int cmd_version(int argc,
const char **argv, const char *prefix); +extern int cmd_list(int argc, const char **argv, const char *prefix); + +#endif diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt new file mode 100644 index 00000000000..eebce30afbc --- /dev/null +++ b/tools/perf/command-list.txt @@ -0,0 +1,10 @@ +# +# List of known perf commands. +# command name			category [deprecated] [common] +# +perf-annotate			mainporcelain common +perf-list			mainporcelain common +perf-record			mainporcelain common +perf-report			mainporcelain common +perf-stat			mainporcelain common +perf-top			mainporcelain common diff --git a/tools/perf/design.txt b/tools/perf/design.txt new file mode 100644 index 00000000000..860e116d979 --- /dev/null +++ b/tools/perf/design.txt @@ -0,0 +1,442 @@ + +Performance Counters for Linux +------------------------------ + +Performance counters are special hardware registers available on most modern +CPUs. These registers count the number of certain types of hw events: such +as instructions executed, cachemisses suffered, or branches mis-predicted - +without slowing down the kernel or applications. These registers can also +trigger interrupts when a threshold number of events have passed - and can +thus be used to profile the code that runs on that CPU. + +The Linux Performance Counter subsystem provides an abstraction of these +hardware capabilities. It provides per task and per CPU counters, counter +groups, and it provides event capabilities on top of those.  It +provides "virtual" 64-bit counters, regardless of the width of the +underlying hardware counters. + +Performance counters are accessed via special file descriptors. +There's one file descriptor per virtual counter used. + +The special file descriptor is opened via the perf_counter_open() +system call: + +   int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr, +			     pid_t pid, int cpu, int group_fd, +			     unsigned long flags); + +The syscall returns the new fd. The fd can be used via the normal +VFS system calls: read() can be used to read the counter, fcntl() +can be used to set the blocking mode, etc. + +Multiple counters can be kept open at a time, and the counters +can be poll()ed. + +When creating a new counter fd, 'perf_counter_hw_event' is: + +struct perf_counter_hw_event { +        /* +         * The MSB of the config word signifies if the rest contains cpu +         * specific (raw) counter configuration data, if unset, the next +         * 7 bits are an event type and the rest of the bits are the event +         * identifier. 
+         */ +        __u64                   config; + +        __u64                   irq_period; +        __u32                   record_type; +        __u32                   read_format; + +        __u64                   disabled       :  1, /* off by default        */ +                                inherit        :  1, /* children inherit it   */ +                                pinned         :  1, /* must always be on PMU */ +                                exclusive      :  1, /* only group on PMU     */ +                                exclude_user   :  1, /* don't count user      */ +                                exclude_kernel :  1, /* ditto kernel          */ +                                exclude_hv     :  1, /* ditto hypervisor      */ +                                exclude_idle   :  1, /* don't count when idle */ +                                mmap           :  1, /* include mmap data     */ +                                munmap         :  1, /* include munmap data   */ +                                comm           :  1, /* include comm data     */ + +                                __reserved_1   : 52; + +        __u32                   extra_config_len; +        __u32                   wakeup_events;  /* wakeup every n events */ + +        __u64                   __reserved_2; +        __u64                   __reserved_3; +}; + +The 'config' field specifies what the counter should count.  It +is divided into 3 bit-fields: + +raw_type: 1 bit   (most significant bit)	0x8000_0000_0000_0000 +type:	  7 bits  (next most significant)	0x7f00_0000_0000_0000 +event_id: 56 bits (least significant)		0x00ff_ffff_ffff_ffff + +If 'raw_type' is 1, then the counter will count a hardware event +specified by the remaining 63 bits of event_config.  The encoding is +machine-specific. + +If 'raw_type' is 0, then the 'type' field says what kind of counter +this is, with the following encoding: + +enum perf_event_types { +	PERF_TYPE_HARDWARE		= 0, +	PERF_TYPE_SOFTWARE		= 1, +	PERF_TYPE_TRACEPOINT		= 2, +}; + +A counter of PERF_TYPE_HARDWARE will count the hardware event +specified by 'event_id': + +/* + * Generalized performance counter event types, used by the hw_event.event_id + * parameter of the sys_perf_counter_open() syscall: + */ +enum hw_event_ids { +	/* +	 * Common hardware events, generalized by the kernel: +	 */ +	PERF_COUNT_HW_CPU_CYCLES		= 0, +	PERF_COUNT_HW_INSTRUCTIONS		= 1, +	PERF_COUNT_HW_CACHE_REFERENCES	= 2, +	PERF_COUNT_HW_CACHE_MISSES		= 3, +	PERF_COUNT_HW_BRANCH_INSTRUCTIONS	= 4, +	PERF_COUNT_HW_BRANCH_MISSES	= 5, +	PERF_COUNT_HW_BUS_CYCLES		= 6, +}; + +These are standardized types of events that work relatively uniformly +on all CPUs that implement Performance Counters support under Linux, +although there may be variations (e.g., different CPUs might count +cache references and misses at different levels of the cache hierarchy). +If a CPU is not able to count the selected event, then the system call +will return -EINVAL. + +More hw_event_types are supported as well, but they are CPU-specific +and accessed as raw events.  For example, to count "External bus +cycles while bus lock signal asserted" events on Intel Core CPUs, pass +in a 0x4064 event_id value and set hw_event.raw_type to 1. + +A counter of type PERF_TYPE_SOFTWARE will count one of the available +software events, selected by 'event_id': + +/* + * Special "software" counters provided by the kernel, even if the hardware + * does not support performance counters. 
These counters measure various + * physical and sw events of the kernel (and allow the profiling of them as + * well): + */ +enum sw_event_ids { +	PERF_COUNT_SW_CPU_CLOCK		= 0, +	PERF_COUNT_SW_TASK_CLOCK		= 1, +	PERF_COUNT_SW_PAGE_FAULTS		= 2, +	PERF_COUNT_SW_CONTEXT_SWITCHES	= 3, +	PERF_COUNT_SW_CPU_MIGRATIONS	= 4, +	PERF_COUNT_SW_PAGE_FAULTS_MIN	= 5, +	PERF_COUNT_SW_PAGE_FAULTS_MAJ	= 6, +}; + +Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event +tracer is available, and event_id values can be obtained from +/debug/tracing/events/*/*/id + + +Counters come in two flavours: counting counters and sampling +counters.  A "counting" counter is one that is used for counting the +number of events that occur, and is characterised by having +irq_period = 0. + + +A read() on a counter returns the current value of the counter and possible +additional values as specified by 'read_format', each value is a u64 (8 bytes) +in size. + +/* + * Bits that can be set in hw_event.read_format to request that + * reads on the counter should return the indicated quantities, + * in increasing order of bit value, after the counter value. + */ +enum perf_counter_read_format { +        PERF_FORMAT_TOTAL_TIME_ENABLED  =  1, +        PERF_FORMAT_TOTAL_TIME_RUNNING  =  2, +}; + +Using these additional values one can establish the overcommit ratio for a +particular counter allowing one to take the round-robin scheduling effect +into account. + + +A "sampling" counter is one that is set up to generate an interrupt +every N events, where N is given by 'irq_period'.  A sampling counter +has irq_period > 0. The record_type controls what data is recorded on each +interrupt: + +/* + * Bits that can be set in hw_event.record_type to request information + * in the overflow packets. + */ +enum perf_counter_record_format { +        PERF_RECORD_IP          = 1U << 0, +        PERF_RECORD_TID         = 1U << 1, +        PERF_RECORD_TIME        = 1U << 2, +        PERF_RECORD_ADDR        = 1U << 3, +        PERF_RECORD_GROUP       = 1U << 4, +        PERF_RECORD_CALLCHAIN   = 1U << 5, +}; + +Such (and other) events will be recorded in a ring-buffer, which is +available to user-space using mmap() (see below). + +The 'disabled' bit specifies whether the counter starts out disabled +or enabled.  If it is initially disabled, it can be enabled by ioctl +or prctl (see below). + +The 'inherit' bit, if set, specifies that this counter should count +events on descendant tasks as well as the task specified.  This only +applies to new descendents, not to any existing descendents at the +time the counter is created (nor to any new descendents of existing +descendents). + +The 'pinned' bit, if set, specifies that the counter should always be +on the CPU if at all possible.  It only applies to hardware counters +and only to group leaders.  If a pinned counter cannot be put onto the +CPU (e.g. because there are not enough hardware counters or because of +a conflict with some other event), then the counter goes into an +'error' state, where reads return end-of-file (i.e. read() returns 0) +until the counter is subsequently enabled or disabled. + +The 'exclusive' bit, if set, specifies that when this counter's group +is on the CPU, it should be the only group using the CPU's counters. 
+
+
+A "sampling" counter is one that is set up to generate an interrupt
+every N events, where N is given by 'irq_period'.  A sampling counter
+has irq_period > 0.  The 'record_type' field controls what data is
+recorded on each interrupt:
+
+/*
+ * Bits that can be set in hw_event.record_type to request information
+ * in the overflow packets.
+ */
+enum perf_counter_record_format {
+        PERF_RECORD_IP          = 1U << 0,
+        PERF_RECORD_TID         = 1U << 1,
+        PERF_RECORD_TIME        = 1U << 2,
+        PERF_RECORD_ADDR        = 1U << 3,
+        PERF_RECORD_GROUP       = 1U << 4,
+        PERF_RECORD_CALLCHAIN   = 1U << 5,
+};
+
+Such (and other) events will be recorded in a ring-buffer, which is
+available to user-space using mmap() (see below).
+
+The 'disabled' bit specifies whether the counter starts out disabled
+or enabled.  If it is initially disabled, it can be enabled by ioctl
+or prctl (see below).
+
+The 'inherit' bit, if set, specifies that this counter should count
+events on descendant tasks as well as the task specified.  This only
+applies to new descendants, not to any existing descendants at the
+time the counter is created (nor to any new descendants of existing
+descendants).
+
+The 'pinned' bit, if set, specifies that the counter should always be
+on the CPU if at all possible.  It only applies to hardware counters
+and only to group leaders.  If a pinned counter cannot be put onto the
+CPU (e.g. because there are not enough hardware counters or because of
+a conflict with some other event), then the counter goes into an
+'error' state, where reads return end-of-file (i.e. read() returns 0)
+until the counter is subsequently enabled or disabled.
+
+The 'exclusive' bit, if set, specifies that when this counter's group
+is on the CPU, it should be the only group using the CPU's counters.
+In the future, this will allow sophisticated monitoring programs to
+supply extra configuration information via 'extra_config_len' to
+exploit advanced features of the CPU's Performance Monitor Unit (PMU)
+that are not otherwise accessible and that might disrupt other
+hardware counters.
+
+The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a
+way to request that counting of events be restricted to times when the
+CPU is in user, kernel and/or hypervisor mode.
+
+The 'mmap' and 'munmap' bits allow recording of PROT_EXEC mmap/munmap
+operations; these can be used to relate userspace IP addresses to actual
+code, even after the mapping (or even the whole process) is gone.  These
+events are recorded in the ring-buffer (see below).
+
+The 'comm' bit allows tracking of process comm data on process creation.
+This too is recorded in the ring-buffer (see below).
+
+The 'pid' parameter to the sys_perf_counter_open() system call allows the
+counter to be specific to a task:
+
+ pid == 0: if the pid parameter is zero, the counter is attached to the
+ current task.
+
+ pid > 0: the counter is attached to a specific task (if the current task
+ has sufficient privilege to do so)
+
+ pid < 0: all tasks are counted (per cpu counters)
+
+The 'cpu' parameter allows a counter to be made specific to a CPU:
+
+ cpu >= 0: the counter is restricted to a specific CPU
+ cpu == -1: the counter counts on all CPUs
+
+(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.)
+
+A 'pid > 0' and 'cpu == -1' counter is a per-task counter that counts
+events of that task and 'follows' that task to whatever CPU the task
+gets scheduled to.  Per-task counters can be created by any user, for
+their own tasks.
+
+A 'pid == -1' and 'cpu == x' counter is a per-CPU counter that counts
+all events on CPU-x.  Per-CPU counters need CAP_SYS_ADMIN privilege.
+
+The 'flags' parameter is currently unused and must be zero.
+
+The 'group_fd' parameter allows counter "groups" to be set up.  A
+counter group has one counter which is the group "leader".  The leader
+is created first, with group_fd = -1 in the sys_perf_counter_open call
+that creates it.  The rest of the group members are created
+subsequently, with group_fd giving the fd of the group leader.
+(A single counter on its own is created with group_fd = -1 and is
+considered to be a group with only 1 member.)
+
+A counter group is scheduled onto the CPU as a unit, that is, it will
+only be put onto the CPU if all of the counters in the group can be
+put onto the CPU.  This means that the values of the member counters
+can be meaningfully compared, added, divided (to get ratios), etc.,
+with each other, since they have counted events for the same set of
+executed instructions.
+
+
+As stated, asynchronous events, like counter overflow or PROT_EXEC mmap
+tracking, are logged into a ring-buffer.  This ring-buffer is created and
+accessed through mmap().
+
+The mmap size should be 1+2^n pages, where the first page is a meta-data page
+(struct perf_counter_mmap_page) that contains various bits of information such
+as where the ring-buffer head is.
+
+/*
+ * Structure of the page that can be mapped via mmap
+ */
+struct perf_counter_mmap_page {
+        __u32   version;                /* version number of this structure */
+        __u32   compat_version;         /* lowest version this is compat with */
+
+        /*
+         * Bits needed to read the hw counters in user-space.
+         *
+         *   u32 seq;
+         *   s64 count;
+         *
+         *   do {
+         *     seq = pc->lock;
+         *
+         *     barrier();
+         *     if (pc->index) {
+         *       count = pmc_read(pc->index - 1);
+         *       count += pc->offset;
+         *     } else
+         *       goto regular_read;
+         *
+         *     barrier();
+         *   } while (pc->lock != seq);
+         *
+         * NOTE: for obvious reasons this only works on self-monitoring
+         *       processes.
+         */
+        __u32   lock;                   /* seqlock for synchronization */
+        __u32   index;                  /* hardware counter identifier */
+        __s64   offset;                 /* add to hardware counter value */
+
+        /*
+         * Control data for the mmap() data buffer.
+         *
+         * On SMP-capable platforms, user-space should issue an rmb()
+         * after reading this value -- see perf_counter_wakeup().
+         */
+        __u32   data_head;              /* head in the data section */
+};
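+
+For example, a monitoring program might map a counter's pages like this
+(a sketch; 'n' is chosen by the program and error checking is omitted):
+
+	struct perf_counter_mmap_page *pc;
+	size_t pagesize = sysconf(_SC_PAGE_SIZE);
+
+	/* 1 meta-data page followed by 2^n ring-buffer pages */
+	pc = mmap(NULL, (1 + (1 << n)) * pagesize,
+		  PROT_READ, MAP_SHARED, fd, 0);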
+
+NOTE: the hw-counter userspace bits are arch-specific and are currently only
+      implemented on powerpc.
+
+The following 2^n pages are the ring-buffer, which contains events of the form:
+
+#define PERF_EVENT_MISC_KERNEL          (1 << 0)
+#define PERF_EVENT_MISC_USER            (1 << 1)
+#define PERF_EVENT_MISC_OVERFLOW        (1 << 2)
+
+struct perf_event_header {
+        __u32   type;
+        __u16   misc;
+        __u16   size;
+};
+
+enum perf_event_type {
+
+        /*
+         * The MMAP events record the PROT_EXEC mappings so that we can
+         * correlate userspace IPs to code. They have the following structure:
+         *
+         * struct {
+         *      struct perf_event_header        header;
+         *
+         *      u32                             pid, tid;
+         *      u64                             addr;
+         *      u64                             len;
+         *      u64                             pgoff;
+         *      char                            filename[];
+         * };
+         */
+        PERF_EVENT_MMAP                 = 1,
+        PERF_EVENT_MUNMAP               = 2,
+
+        /*
+         * struct {
+         *      struct perf_event_header        header;
+         *
+         *      u32                             pid, tid;
+         *      char                            comm[];
+         * };
+         */
+        PERF_EVENT_COMM                 = 3,
+
+        /*
+         * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field
+         * will be PERF_RECORD_*
+         *
+         * struct {
+         *      struct perf_event_header        header;
+         *
+         *      { u64                   ip;       } && PERF_RECORD_IP
+         *      { u32                   pid, tid; } && PERF_RECORD_TID
+         *      { u64                   time;     } && PERF_RECORD_TIME
+         *      { u64                   addr;     } && PERF_RECORD_ADDR
+         *
+         *      { u64                   nr;
+         *        { u64 event, val; }   cnt[nr];  } && PERF_RECORD_GROUP
+         *
+         *      { u16                   nr,
+         *                              hv,
+         *                              kernel,
+         *                              user;
+         *        u64                   ips[nr];  } && PERF_RECORD_CALLCHAIN
+         * };
+         */
+};
+
+NOTE: PERF_RECORD_CALLCHAIN is arch-specific and currently only implemented
+      on x86.
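+
+A purely illustrative user-space reader of this ring-buffer could walk the
+events along these lines -- 'pc' being the mapped meta-data page from above,
+'data' a byte pointer at the first ring-buffer page, and 'ring_size' the
+(power of two) 2^n * page-size byte size of the data area; all three are
+hypothetical names set up by the monitoring program:
+
+	__u64 head, old = 0;
+
+	for (;;) {
+		head = pc->data_head;
+		rmb();	/* per the data_head comment above */
+
+		while (old != head) {
+			struct perf_event_header *hdr;
+
+			hdr = (struct perf_event_header *)
+				(data + (old & (ring_size - 1)));
+			/* decode hdr->type / hdr->misc here; a real reader
+			 * must also handle records wrapping the buffer edge */
+			old += hdr->size;
+		}
+		/* then wait for more events, e.g. via poll() (see below) */
+	}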
+
+Notification of new events is possible through poll()/select()/epoll() and
+fcntl()-managed signals.
+
+Normally a notification is generated for every page filled; however, one can
+additionally set perf_counter_hw_event.wakeup_events to generate one every
+so many counter overflow events.
+
+Future work will include a splice() interface to the ring-buffer.
+
+
+Counters can be enabled and disabled in two ways: via ioctl and via
+prctl.  When a counter is disabled, it doesn't count or generate
+events but does continue to exist and maintain its count value.
+
+An individual counter or counter group can be enabled with
+
+	ioctl(fd, PERF_COUNTER_IOC_ENABLE);
+
+or disabled with
+
+	ioctl(fd, PERF_COUNTER_IOC_DISABLE);
+
+Enabling or disabling the leader of a group enables or disables the
+whole group; that is, while the group leader is disabled, none of the
+counters in the group will count.  Enabling or disabling a member of a
+group other than the leader only affects that counter: disabling a
+non-leader stops that counter from counting but doesn't affect any
+other counter.
+
+Additionally, non-inherited overflow counters can use
+
+	ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr);
+
+to enable a counter for 'nr' events, after which it gets disabled again.
+
+A process can enable or disable all the counter groups that are
+attached to it, using prctl:
+
+	prctl(PR_TASK_PERF_COUNTERS_ENABLE);
+
+	prctl(PR_TASK_PERF_COUNTERS_DISABLE);
+
+This applies to all counters on the current process, whether created
+by this process or by another, and doesn't affect any counters that
+this process has created on other processes.  It only enables or
+disables the group leaders, not any other members in the groups.
+
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
new file mode 100644
index 00000000000..4eb72593370
--- /dev/null
+++ b/tools/perf/perf.c
@@ -0,0 +1,428 @@
+/*
+ * perf.c
+ *
+ * Performance analysis utility.
+ *
+ * This is the main hub from which the sub-commands (perf stat,
+ * perf top, perf record, perf report, etc.) are started.
+ */
+#include "builtin.h"
+
+#include "util/exec_cmd.h"
+#include "util/cache.h"
+#include "util/quote.h"
+#include "util/run-command.h"
+
+const char perf_usage_string[] =
+	"perf [--version] [--help] COMMAND [ARGS]";
+
+const char perf_more_info_string[] =
+	"See 'perf help COMMAND' for more information on a specific command.";
+
+static int use_pager = -1;
+struct pager_config {
+	const char *cmd;
+	int val;
+};
+
+static int pager_command_config(const char *var, const char *value, void *data)
+{
+	struct pager_config *c = data;
+	if (!prefixcmp(var, "pager.") && !strcmp(var + 6, c->cmd))
+		c->val = perf_config_bool(var, value);
+	return 0;
+}
+
+/* returns 0 for "no pager", 1 for "use pager", and -1 for "not specified" */
+int check_pager_config(const char *cmd)
+{
+	struct pager_config c;
+	c.cmd = cmd;
+	c.val = -1;
+	perf_config(pager_command_config, &c);
+	return c.val;
+}
+
+static void commit_pager_choice(void) {
+	switch (use_pager) {
+	case 0:
+		setenv("PERF_PAGER", "cat", 1);
+		break;
+	case 1:
+		/* setup_pager(); */
+		break;
+	default:
+		break;
+	}
+}
+
+static int handle_options(const char ***argv, int *argc, int *envchanged)
+{
+	int handled = 0;
+
+	while (*argc > 0) {
+		const char *cmd = (*argv)[0];
+		if (cmd[0] != '-')
+			break;
+
+		/*
+		 * For legacy reasons, the "version" and "help"
+		 * commands can be written with "--" prepended
+		 * to make them look like flags.
+		 */
+		if (!strcmp(cmd, "--help") || !strcmp(cmd, "--version"))
+			break;
+
+		/*
+		 * Check remaining flags.
+		 */
+		if (!prefixcmp(cmd, "--exec-path")) {
+			cmd += 11;
+			if (*cmd == '=')
+				perf_set_argv_exec_path(cmd + 1);
+			else {
+				puts(perf_exec_path());
+				exit(0);
+			}
+		} else if (!strcmp(cmd, "--html-path")) {
+			puts(system_path(PERF_HTML_PATH));
+			exit(0);
+		} else if (!strcmp(cmd, "-p") || !strcmp(cmd, "--paginate")) {
+			use_pager = 1;
+		} else if (!strcmp(cmd, "--no-pager")) {
+			use_pager = 0;
+			if (envchanged)
+				*envchanged = 1;
+		} else if (!strcmp(cmd, "--perf-dir")) {
+			if (*argc < 2) {
+				fprintf(stderr, "No directory given for --perf-dir.\n");
+				usage(perf_usage_string);
+			}
+			setenv(PERF_DIR_ENVIRONMENT, (*argv)[1], 1);
+			if (envchanged)
+				*envchanged = 1;
+			(*argv)++;
+			(*argc)--;
+			handled++;
+		} else if (!prefixcmp(cmd, "--perf-dir=")) {
+			setenv(PERF_DIR_ENVIRONMENT, cmd + 11, 1);
+			if (envchanged)
+				*envchanged = 1;
+		} else if (!strcmp(cmd, "--work-tree")) {
+			if (*argc < 2) {
+				fprintf(stderr, "No directory given for --work-tree.\n");
+				usage(perf_usage_string);
+			}
+			setenv(PERF_WORK_TREE_ENVIRONMENT, (*argv)[1], 1);
+			if (envchanged)
+				*envchanged = 1;
+			(*argv)++;
+			(*argc)--;
+		} else if (!prefixcmp(cmd, "--work-tree=")) {
+			setenv(PERF_WORK_TREE_ENVIRONMENT, cmd + 12, 1);
+			if (envchanged)
+				*envchanged = 1;
+		} else {
+			fprintf(stderr, "Unknown option: %s\n", cmd);
+			usage(perf_usage_string);
+		}
+
+		(*argv)++;
+		(*argc)--;
+		handled++;
+	}
+	return handled;
+}
+
+static int handle_alias(int *argcp, const char ***argv)
+{
+	int envchanged = 0, ret = 0, saved_errno = errno;
+	int count, option_count;
+	const char **new_argv;
+	const char *alias_command;
+	char *alias_string;
+
+	alias_command = (*argv)[0];
+	alias_string = alias_lookup(alias_command);
+	if (alias_string) {
+		if (alias_string[0] == '!') {
+			if (*argcp > 1) {
+				struct strbuf buf;
+
+				strbuf_init(&buf, PATH_MAX);
+				strbuf_addstr(&buf, alias_string);
+				sq_quote_argv(&buf, (*argv) + 1, PATH_MAX);
+				free(alias_string);
+				alias_string = buf.buf;
+			}
+			ret = system(alias_string + 1);
+			if (ret >= 0 && WIFEXITED(ret) &&
+			    WEXITSTATUS(ret) != 127)
+				exit(WEXITSTATUS(ret));
+			die("Failed to run '%s' when expanding alias '%s'",
+			    alias_string + 1, alias_command);
+		}
+		count = split_cmdline(alias_string, &new_argv);
+		if (count < 0)
+			die("Bad alias.%s string", alias_command);
+		option_count = handle_options(&new_argv, &count, &envchanged);
+		if (envchanged)
+			die("alias '%s' changes environment variables\n"
+				 "You can use '!perf' in the alias to do this.",
+				 alias_command);
+		memmove(new_argv - option_count, new_argv,
+				count * sizeof(char *));
+		new_argv -= option_count;
+
+		if (count < 1)
+			die("empty alias for %s", alias_command);
+
+		if (!strcmp(alias_command, new_argv[0]))
+			die("recursive alias: %s", alias_command);
+
+		new_argv = realloc(new_argv, sizeof(char *) *
+				    (count + *argcp + 1));
+		/* insert after command name */
+		memcpy(new_argv + count, *argv + 1, sizeof(char *) * *argcp);
+		new_argv[count + *argcp] = NULL;
+
+		*argv = new_argv;
+		*argcp += count - 1;
+
+		ret = 1;
+	}
+
+	errno = saved_errno;
+
+	return ret;
+}
+
+const char perf_version_string[] = PERF_VERSION;
+
+#define RUN_SETUP	(1<<0)
+#define USE_PAGER	(1<<1)
+/*
+ * require working tree to be present -- anything using this needs
+ *
RUN_SETUP for reading from the configuration file. + */ +#define NEED_WORK_TREE	(1<<2) + +struct cmd_struct { +	const char *cmd; +	int (*fn)(int, const char **, const char *); +	int option; +}; + +static int run_builtin(struct cmd_struct *p, int argc, const char **argv) +{ +	int status; +	struct stat st; +	const char *prefix; + +	prefix = NULL; +	if (p->option & RUN_SETUP) +		prefix = NULL; /* setup_perf_directory(); */ + +	if (use_pager == -1 && p->option & RUN_SETUP) +		use_pager = check_pager_config(p->cmd); +	if (use_pager == -1 && p->option & USE_PAGER) +		use_pager = 1; +	commit_pager_choice(); + +	if (p->option & NEED_WORK_TREE) +		/* setup_work_tree() */; + +	status = p->fn(argc, argv, prefix); +	if (status) +		return status & 0xff; + +	/* Somebody closed stdout? */ +	if (fstat(fileno(stdout), &st)) +		return 0; +	/* Ignore write errors for pipes and sockets.. */ +	if (S_ISFIFO(st.st_mode) || S_ISSOCK(st.st_mode)) +		return 0; + +	/* Check for ENOSPC and EIO errors.. */ +	if (fflush(stdout)) +		die("write failure on standard output: %s", strerror(errno)); +	if (ferror(stdout)) +		die("unknown write failure on standard output"); +	if (fclose(stdout)) +		die("close failed on standard output: %s", strerror(errno)); +	return 0; +} + +static void handle_internal_command(int argc, const char **argv) +{ +	const char *cmd = argv[0]; +	static struct cmd_struct commands[] = { +		{ "help", cmd_help, 0 }, +		{ "list", cmd_list, 0 }, +		{ "record", cmd_record, 0 }, +		{ "report", cmd_report, 0 }, +		{ "stat", cmd_stat, 0 }, +		{ "top", cmd_top, 0 }, +		{ "annotate", cmd_annotate, 0 }, +		{ "version", cmd_version, 0 }, +	}; +	int i; +	static const char ext[] = STRIP_EXTENSION; + +	if (sizeof(ext) > 1) { +		i = strlen(argv[0]) - strlen(ext); +		if (i > 0 && !strcmp(argv[0] + i, ext)) { +			char *argv0 = strdup(argv[0]); +			argv[0] = cmd = argv0; +			argv0[i] = '\0'; +		} +	} + +	/* Turn "perf cmd --help" into "perf help cmd" */ +	if (argc > 1 && !strcmp(argv[1], "--help")) { +		argv[1] = argv[0]; +		argv[0] = cmd = "help"; +	} + +	for (i = 0; i < ARRAY_SIZE(commands); i++) { +		struct cmd_struct *p = commands+i; +		if (strcmp(p->cmd, cmd)) +			continue; +		exit(run_builtin(p, argc, argv)); +	} +} + +static void execv_dashed_external(const char **argv) +{ +	struct strbuf cmd = STRBUF_INIT; +	const char *tmp; +	int status; + +	strbuf_addf(&cmd, "perf-%s", argv[0]); + +	/* +	 * argv[0] must be the perf command, but the argv array +	 * belongs to the caller, and may be reused in +	 * subsequent loop iterations. Save argv[0] and +	 * restore it on error. +	 */ +	tmp = argv[0]; +	argv[0] = cmd.buf; + +	/* +	 * if we fail because the command is not found, it is +	 * OK to return. Otherwise, we just pass along the status code. +	 */ +	status = run_command_v_opt(argv, 0); +	if (status != -ERR_RUN_COMMAND_EXEC) { +		if (IS_RUN_COMMAND_ERR(status)) +			die("unable to run '%s'", argv[0]); +		exit(-status); +	} +	errno = ENOENT; /* as if we called execvp */ + +	argv[0] = tmp; + +	strbuf_release(&cmd); +} + +static int run_argv(int *argcp, const char ***argv) +{ +	int done_alias = 0; + +	while (1) { +		/* See if it's an internal command */ +		handle_internal_command(*argcp, *argv); + +		/* .. 
then try the external ones */
+		execv_dashed_external(*argv);
+
+		/* It could be an alias -- this works around the insanity
+		 * of overriding "perf log" with "perf show" by having
+		 * alias.log = show
+		 */
+		if (done_alias || !handle_alias(argcp, argv))
+			break;
+		done_alias = 1;
+	}
+
+	return done_alias;
+}
+
+
+int main(int argc, const char **argv)
+{
+	const char *cmd;
+
+	cmd = perf_extract_argv0_path(argv[0]);
+	if (!cmd)
+		cmd = "perf-help";
+
+	/*
+	 * "perf-xxxx" is the same as "perf xxxx", but we obviously:
+	 *
+	 *  - cannot take flags in between the "perf" and the "xxxx".
+	 *  - cannot execute it externally (since it would just do
+	 *    the same thing over again)
+	 *
+	 * So we just directly call the internal command handler, and
+	 * die if that one cannot handle it.
+	 */
+	if (!prefixcmp(cmd, "perf-")) {
+		cmd += 5;
+		argv[0] = cmd;
+		handle_internal_command(argc, argv);
+		die("cannot handle %s internally", cmd);
+	}
+
+	/* Look for flags.. */
+	argv++;
+	argc--;
+	handle_options(&argv, &argc, NULL);
+	commit_pager_choice();
+	if (argc > 0) {
+		if (!prefixcmp(argv[0], "--"))
+			argv[0] += 2;
+	} else {
+		/* The user didn't specify a command; give them help */
+		printf("\n usage: %s\n\n", perf_usage_string);
+		list_common_cmds_help();
+		printf("\n %s\n\n", perf_more_info_string);
+		exit(1);
+	}
+	cmd = argv[0];
+
+	/*
+	 * We use PATH to find perf commands, but we prepend some higher
+	 * precedence paths: the "--exec-path" option, the PERF_EXEC_PATH
+	 * environment, and the $(perfexecdir) from the Makefile at build
+	 * time.
+	 */
+	setup_path();
+
+	while (1) {
+		static int done_help = 0;
+		static int was_alias = 0;
+
+		was_alias = run_argv(&argc, &argv);
+		if (errno != ENOENT)
+			break;
+
+		if (was_alias) {
+			fprintf(stderr, "Expansion of alias '%s' failed; "
+				"'%s' is not a perf-command\n",
+				cmd, argv[0]);
+			exit(1);
+		}
+		if (!done_help) {
+			cmd = argv[0] = help_unknown_cmd(cmd);
+			done_help = 1;
+		} else
+			break;
+	}
+
+	fprintf(stderr, "Failed to run command '%s': %s\n",
+		cmd, strerror(errno));
+
+	return 1;
+}
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
new file mode 100644
index 00000000000..af0a5046d74
--- /dev/null
+++ b/tools/perf/perf.h
@@ -0,0 +1,67 @@
+#ifndef _PERF_PERF_H
+#define _PERF_PERF_H
+
+#if defined(__x86_64__) || defined(__i386__)
+#include "../../arch/x86/include/asm/unistd.h"
+#define rmb()		asm volatile("lfence" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory")
+#endif
+
+#ifdef __powerpc__
+#include "../../arch/powerpc/include/asm/unistd.h"
+#define rmb()		asm volatile ("sync" ::: "memory")
+#define cpu_relax()	asm volatile ("" ::: "memory")
+#endif
+
+#include <time.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+
+#include "../../include/linux/perf_counter.h"
+
+/*
+ * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all
+ * counters in the current task.
+ */ +#define PR_TASK_PERF_COUNTERS_DISABLE   31 +#define PR_TASK_PERF_COUNTERS_ENABLE    32 + +#ifndef NSEC_PER_SEC +# define NSEC_PER_SEC			1000000000ULL +#endif + +static inline unsigned long long rdclock(void) +{ +	struct timespec ts; + +	clock_gettime(CLOCK_MONOTONIC, &ts); +	return ts.tv_sec * 1000000000ULL + ts.tv_nsec; +} + +/* + * Pick up some kernel type conventions: + */ +#define __user +#define asmlinkage + +#define unlikely(x)	__builtin_expect(!!(x), 0) +#define min(x, y) ({				\ +	typeof(x) _min1 = (x);			\ +	typeof(y) _min2 = (y);			\ +	(void) (&_min1 == &_min2);		\ +	_min1 < _min2 ? _min1 : _min2; }) + +static inline int +sys_perf_counter_open(struct perf_counter_attr *attr_uptr, +		      pid_t pid, int cpu, int group_fd, +		      unsigned long flags) +{ +	return syscall(__NR_perf_counter_open, attr_uptr, pid, cpu, +		       group_fd, flags); +} + +#define MAX_COUNTERS			256 +#define MAX_NR_CPUS			256 + +#endif diff --git a/tools/perf/util/PERF-VERSION-GEN b/tools/perf/util/PERF-VERSION-GEN new file mode 100755 index 00000000000..c561d1538c0 --- /dev/null +++ b/tools/perf/util/PERF-VERSION-GEN @@ -0,0 +1,42 @@ +#!/bin/sh + +GVF=PERF-VERSION-FILE +DEF_VER=v0.0.1.PERF + +LF=' +' + +# First see if there is a version file (included in release tarballs), +# then try git-describe, then default. +if test -f version +then +	VN=$(cat version) || VN="$DEF_VER" +elif test -d .git -o -f .git && +	VN=$(git describe --abbrev=4 HEAD 2>/dev/null) && +	case "$VN" in +	*$LF*) (exit 1) ;; +	v[0-9]*) +		git update-index -q --refresh +		test -z "$(git diff-index --name-only HEAD --)" || +		VN="$VN-dirty" ;; +	esac +then +	VN=$(echo "$VN" | sed -e 's/-/./g'); +else +	VN="$DEF_VER" +fi + +VN=$(expr "$VN" : v*'\(.*\)') + +if test -r $GVF +then +	VC=$(sed -e 's/^PERF_VERSION = //' <$GVF) +else +	VC=unset +fi +test "$VN" = "$VC" || { +	echo >&2 "PERF_VERSION = $VN" +	echo "PERF_VERSION = $VN" >$GVF +} + + diff --git a/tools/perf/util/abspath.c b/tools/perf/util/abspath.c new file mode 100644 index 00000000000..61d33b81fc9 --- /dev/null +++ b/tools/perf/util/abspath.c @@ -0,0 +1,117 @@ +#include "cache.h" + +/* + * Do not use this for inspecting *tracked* content.  When path is a + * symlink to a directory, we do not want to say it is a directory when + * dealing with tracked content in the working tree. + */ +static int is_directory(const char *path) +{ +	struct stat st; +	return (!stat(path, &st) && S_ISDIR(st.st_mode)); +} + +/* We allow "recursive" symbolic links. Only within reason, though. 
*/ +#define MAXDEPTH 5 + +const char *make_absolute_path(const char *path) +{ +	static char bufs[2][PATH_MAX + 1], *buf = bufs[0], *next_buf = bufs[1]; +	char cwd[1024] = ""; +	int buf_index = 1, len; + +	int depth = MAXDEPTH; +	char *last_elem = NULL; +	struct stat st; + +	if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX) +		die ("Too long path: %.*s", 60, path); + +	while (depth--) { +		if (!is_directory(buf)) { +			char *last_slash = strrchr(buf, '/'); +			if (last_slash) { +				*last_slash = '\0'; +				last_elem = xstrdup(last_slash + 1); +			} else { +				last_elem = xstrdup(buf); +				*buf = '\0'; +			} +		} + +		if (*buf) { +			if (!*cwd && !getcwd(cwd, sizeof(cwd))) +				die ("Could not get current working directory"); + +			if (chdir(buf)) +				die ("Could not switch to '%s'", buf); +		} +		if (!getcwd(buf, PATH_MAX)) +			die ("Could not get current working directory"); + +		if (last_elem) { +			int len = strlen(buf); +			if (len + strlen(last_elem) + 2 > PATH_MAX) +				die ("Too long path name: '%s/%s'", +						buf, last_elem); +			buf[len] = '/'; +			strcpy(buf + len + 1, last_elem); +			free(last_elem); +			last_elem = NULL; +		} + +		if (!lstat(buf, &st) && S_ISLNK(st.st_mode)) { +			len = readlink(buf, next_buf, PATH_MAX); +			if (len < 0) +				die ("Invalid symlink: %s", buf); +			if (PATH_MAX <= len) +				die("symbolic link too long: %s", buf); +			next_buf[len] = '\0'; +			buf = next_buf; +			buf_index = 1 - buf_index; +			next_buf = bufs[buf_index]; +		} else +			break; +	} + +	if (*cwd && chdir(cwd)) +		die ("Could not change back to '%s'", cwd); + +	return buf; +} + +static const char *get_pwd_cwd(void) +{ +	static char cwd[PATH_MAX + 1]; +	char *pwd; +	struct stat cwd_stat, pwd_stat; +	if (getcwd(cwd, PATH_MAX) == NULL) +		return NULL; +	pwd = getenv("PWD"); +	if (pwd && strcmp(pwd, cwd)) { +		stat(cwd, &cwd_stat); +		if (!stat(pwd, &pwd_stat) && +		    pwd_stat.st_dev == cwd_stat.st_dev && +		    pwd_stat.st_ino == cwd_stat.st_ino) { +			strlcpy(cwd, pwd, PATH_MAX); +		} +	} +	return cwd; +} + +const char *make_nonrelative_path(const char *path) +{ +	static char buf[PATH_MAX + 1]; + +	if (is_absolute_path(path)) { +		if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX) +			die("Too long path: %.*s", 60, path); +	} else { +		const char *cwd = get_pwd_cwd(); +		if (!cwd) +			die("Cannot determine the current working directory"); +		if (snprintf(buf, PATH_MAX, "%s/%s", cwd, path) >= PATH_MAX) +			die("Too long path: %.*s", 60, path); +	} +	return buf; +} diff --git a/tools/perf/util/alias.c b/tools/perf/util/alias.c new file mode 100644 index 00000000000..9b3dd2b428d --- /dev/null +++ b/tools/perf/util/alias.c @@ -0,0 +1,77 @@ +#include "cache.h" + +static const char *alias_key; +static char *alias_val; + +static int alias_lookup_cb(const char *k, const char *v, void *cb) +{ +	if (!prefixcmp(k, "alias.") && !strcmp(k+6, alias_key)) { +		if (!v) +			return config_error_nonbool(k); +		alias_val = strdup(v); +		return 0; +	} +	return 0; +} + +char *alias_lookup(const char *alias) +{ +	alias_key = alias; +	alias_val = NULL; +	perf_config(alias_lookup_cb, NULL); +	return alias_val; +} + +int split_cmdline(char *cmdline, const char ***argv) +{ +	int src, dst, count = 0, size = 16; +	char quoted = 0; + +	*argv = malloc(sizeof(char*) * size); + +	/* split alias_string */ +	(*argv)[count++] = cmdline; +	for (src = dst = 0; cmdline[src];) { +		char c = cmdline[src]; +		if (!quoted && isspace(c)) { +			cmdline[dst++] = 0; +			while (cmdline[++src] +					&& isspace(cmdline[src])) +				; /* 
skip */ +			if (count >= size) { +				size += 16; +				*argv = realloc(*argv, sizeof(char*) * size); +			} +			(*argv)[count++] = cmdline + dst; +		} else if (!quoted && (c == '\'' || c == '"')) { +			quoted = c; +			src++; +		} else if (c == quoted) { +			quoted = 0; +			src++; +		} else { +			if (c == '\\' && quoted != '\'') { +				src++; +				c = cmdline[src]; +				if (!c) { +					free(*argv); +					*argv = NULL; +					return error("cmdline ends with \\"); +				} +			} +			cmdline[dst++] = c; +			src++; +		} +	} + +	cmdline[dst] = 0; + +	if (quoted) { +		free(*argv); +		*argv = NULL; +		return error("unclosed quote"); +	} + +	return count; +} + diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h new file mode 100644 index 00000000000..393d6146d13 --- /dev/null +++ b/tools/perf/util/cache.h @@ -0,0 +1,119 @@ +#ifndef CACHE_H +#define CACHE_H + +#include "util.h" +#include "strbuf.h" + +#define PERF_DIR_ENVIRONMENT "PERF_DIR" +#define PERF_WORK_TREE_ENVIRONMENT "PERF_WORK_TREE" +#define DEFAULT_PERF_DIR_ENVIRONMENT ".perf" +#define DB_ENVIRONMENT "PERF_OBJECT_DIRECTORY" +#define INDEX_ENVIRONMENT "PERF_INDEX_FILE" +#define GRAFT_ENVIRONMENT "PERF_GRAFT_FILE" +#define TEMPLATE_DIR_ENVIRONMENT "PERF_TEMPLATE_DIR" +#define CONFIG_ENVIRONMENT "PERF_CONFIG" +#define EXEC_PATH_ENVIRONMENT "PERF_EXEC_PATH" +#define CEILING_DIRECTORIES_ENVIRONMENT "PERF_CEILING_DIRECTORIES" +#define PERFATTRIBUTES_FILE ".perfattributes" +#define INFOATTRIBUTES_FILE "info/attributes" +#define ATTRIBUTE_MACRO_PREFIX "[attr]" + +typedef int (*config_fn_t)(const char *, const char *, void *); +extern int perf_default_config(const char *, const char *, void *); +extern int perf_config_from_file(config_fn_t fn, const char *, void *); +extern int perf_config(config_fn_t fn, void *); +extern int perf_parse_ulong(const char *, unsigned long *); +extern int perf_config_int(const char *, const char *); +extern unsigned long perf_config_ulong(const char *, const char *); +extern int perf_config_bool_or_int(const char *, const char *, int *); +extern int perf_config_bool(const char *, const char *); +extern int perf_config_string(const char **, const char *, const char *); +extern int perf_config_set(const char *, const char *); +extern int perf_config_set_multivar(const char *, const char *, const char *, int); +extern int perf_config_rename_section(const char *, const char *); +extern const char *perf_etc_perfconfig(void); +extern int check_repository_format_version(const char *var, const char *value, void *cb); +extern int perf_config_system(void); +extern int perf_config_global(void); +extern int config_error_nonbool(const char *); +extern const char *config_exclusive_filename; + +#define MAX_PERFNAME (1000) +extern char perf_default_email[MAX_PERFNAME]; +extern char perf_default_name[MAX_PERFNAME]; +extern int user_ident_explicitly_given; + +extern const char *perf_log_output_encoding; +extern const char *perf_mailmap_file; + +/* IO helper functions */ +extern void maybe_flush_or_die(FILE *, const char *); +extern int copy_fd(int ifd, int ofd); +extern int copy_file(const char *dst, const char *src, int mode); +extern ssize_t read_in_full(int fd, void *buf, size_t count); +extern ssize_t write_in_full(int fd, const void *buf, size_t count); +extern void write_or_die(int fd, const void *buf, size_t count); +extern int write_or_whine(int fd, const void *buf, size_t count, const char *msg); +extern int write_or_whine_pipe(int fd, const void *buf, size_t count, const char *msg); +extern void fsync_or_die(int fd, 
const char *); + +/* pager.c */ +extern void setup_pager(void); +extern const char *pager_program; +extern int pager_in_use(void); +extern int pager_use_color; + +extern const char *editor_program; +extern const char *excludes_file; + +char *alias_lookup(const char *alias); +int split_cmdline(char *cmdline, const char ***argv); + +#define alloc_nr(x) (((x)+16)*3/2) + +/* + * Realloc the buffer pointed at by variable 'x' so that it can hold + * at least 'nr' entries; the number of entries currently allocated + * is 'alloc', using the standard growing factor alloc_nr() macro. + * + * DO NOT USE any expression with side-effect for 'x' or 'alloc'. + */ +#define ALLOC_GROW(x, nr, alloc) \ +	do { \ +		if ((nr) > alloc) { \ +			if (alloc_nr(alloc) < (nr)) \ +				alloc = (nr); \ +			else \ +				alloc = alloc_nr(alloc); \ +			x = xrealloc((x), alloc * sizeof(*(x))); \ +		} \ +	} while(0) + + +static inline int is_absolute_path(const char *path) +{ +	return path[0] == '/'; +} + +const char *make_absolute_path(const char *path); +const char *make_nonrelative_path(const char *path); +const char *make_relative_path(const char *abs, const char *base); +int normalize_path_copy(char *dst, const char *src); +int longest_ancestor_length(const char *path, const char *prefix_list); +char *strip_path_suffix(const char *path, const char *suffix); + +extern char *mkpath(const char *fmt, ...) __attribute__((format (printf, 1, 2))); +extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2))); +/* perf_mkstemp() - create tmp file honoring TMPDIR variable */ +extern int perf_mkstemp(char *path, size_t len, const char *template); + +extern char *mksnpath(char *buf, size_t n, const char *fmt, ...) +	__attribute__((format (printf, 3, 4))); +extern char *perf_snpath(char *buf, size_t n, const char *fmt, ...) +	__attribute__((format (printf, 3, 4))); +extern char *perf_pathdup(const char *fmt, ...) 
+	__attribute__((format (printf, 1, 2))); + +extern size_t strlcpy(char *dest, const char *src, size_t size); + +#endif /* CACHE_H */ diff --git a/tools/perf/util/color.c b/tools/perf/util/color.c new file mode 100644 index 00000000000..9a8c20ccc53 --- /dev/null +++ b/tools/perf/util/color.c @@ -0,0 +1,241 @@ +#include "cache.h" +#include "color.h" + +int perf_use_color_default = -1; + +static int parse_color(const char *name, int len) +{ +	static const char * const color_names[] = { +		"normal", "black", "red", "green", "yellow", +		"blue", "magenta", "cyan", "white" +	}; +	char *end; +	int i; +	for (i = 0; i < ARRAY_SIZE(color_names); i++) { +		const char *str = color_names[i]; +		if (!strncasecmp(name, str, len) && !str[len]) +			return i - 1; +	} +	i = strtol(name, &end, 10); +	if (end - name == len && i >= -1 && i <= 255) +		return i; +	return -2; +} + +static int parse_attr(const char *name, int len) +{ +	static const int attr_values[] = { 1, 2, 4, 5, 7 }; +	static const char * const attr_names[] = { +		"bold", "dim", "ul", "blink", "reverse" +	}; +	int i; +	for (i = 0; i < ARRAY_SIZE(attr_names); i++) { +		const char *str = attr_names[i]; +		if (!strncasecmp(name, str, len) && !str[len]) +			return attr_values[i]; +	} +	return -1; +} + +void color_parse(const char *value, const char *var, char *dst) +{ +	color_parse_mem(value, strlen(value), var, dst); +} + +void color_parse_mem(const char *value, int value_len, const char *var, +		char *dst) +{ +	const char *ptr = value; +	int len = value_len; +	int attr = -1; +	int fg = -2; +	int bg = -2; + +	if (!strncasecmp(value, "reset", len)) { +		strcpy(dst, PERF_COLOR_RESET); +		return; +	} + +	/* [fg [bg]] [attr] */ +	while (len > 0) { +		const char *word = ptr; +		int val, wordlen = 0; + +		while (len > 0 && !isspace(word[wordlen])) { +			wordlen++; +			len--; +		} + +		ptr = word + wordlen; +		while (len > 0 && isspace(*ptr)) { +			ptr++; +			len--; +		} + +		val = parse_color(word, wordlen); +		if (val >= -1) { +			if (fg == -2) { +				fg = val; +				continue; +			} +			if (bg == -2) { +				bg = val; +				continue; +			} +			goto bad; +		} +		val = parse_attr(word, wordlen); +		if (val < 0 || attr != -1) +			goto bad; +		attr = val; +	} + +	if (attr >= 0 || fg >= 0 || bg >= 0) { +		int sep = 0; + +		*dst++ = '\033'; +		*dst++ = '['; +		if (attr >= 0) { +			*dst++ = '0' + attr; +			sep++; +		} +		if (fg >= 0) { +			if (sep++) +				*dst++ = ';'; +			if (fg < 8) { +				*dst++ = '3'; +				*dst++ = '0' + fg; +			} else { +				dst += sprintf(dst, "38;5;%d", fg); +			} +		} +		if (bg >= 0) { +			if (sep++) +				*dst++ = ';'; +			if (bg < 8) { +				*dst++ = '4'; +				*dst++ = '0' + bg; +			} else { +				dst += sprintf(dst, "48;5;%d", bg); +			} +		} +		*dst++ = 'm'; +	} +	*dst = 0; +	return; +bad: +	die("bad color value '%.*s' for variable '%s'", value_len, value, var); +} + +int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty) +{ +	if (value) { +		if (!strcasecmp(value, "never")) +			return 0; +		if (!strcasecmp(value, "always")) +			return 1; +		if (!strcasecmp(value, "auto")) +			goto auto_color; +	} + +	/* Missing or explicit false to turn off colorization */ +	if (!perf_config_bool(var, value)) +		return 0; + +	/* any normal truth value defaults to 'auto' */ + auto_color: +	if (stdout_is_tty < 0) +		stdout_is_tty = isatty(1); +	if (stdout_is_tty || (pager_in_use() && pager_use_color)) { +		char *term = getenv("TERM"); +		if (term && strcmp(term, "dumb")) +			return 1; +	} +	return 0; +} + +int 
perf_color_default_config(const char *var, const char *value, void *cb) +{ +	if (!strcmp(var, "color.ui")) { +		perf_use_color_default = perf_config_colorbool(var, value, -1); +		return 0; +	} + +	return perf_default_config(var, value, cb); +} + +static int color_vfprintf(FILE *fp, const char *color, const char *fmt, +		va_list args, const char *trail) +{ +	int r = 0; + +	/* +	 * Auto-detect: +	 */ +	if (perf_use_color_default < 0) { +		if (isatty(1) || pager_in_use()) +			perf_use_color_default = 1; +		else +			perf_use_color_default = 0; +	} + +	if (perf_use_color_default && *color) +		r += fprintf(fp, "%s", color); +	r += vfprintf(fp, fmt, args); +	if (perf_use_color_default && *color) +		r += fprintf(fp, "%s", PERF_COLOR_RESET); +	if (trail) +		r += fprintf(fp, "%s", trail); +	return r; +} + + + +int color_fprintf(FILE *fp, const char *color, const char *fmt, ...) +{ +	va_list args; +	int r; + +	va_start(args, fmt); +	r = color_vfprintf(fp, color, fmt, args, NULL); +	va_end(args); +	return r; +} + +int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...) +{ +	va_list args; +	int r; +	va_start(args, fmt); +	r = color_vfprintf(fp, color, fmt, args, "\n"); +	va_end(args); +	return r; +} + +/* + * This function splits the buffer by newlines and colors the lines individually. + * + * Returns 0 on success. + */ +int color_fwrite_lines(FILE *fp, const char *color, +		size_t count, const char *buf) +{ +	if (!*color) +		return fwrite(buf, count, 1, fp) != 1; +	while (count) { +		char *p = memchr(buf, '\n', count); +		if (p != buf && (fputs(color, fp) < 0 || +				fwrite(buf, p ? p - buf : count, 1, fp) != 1 || +				fputs(PERF_COLOR_RESET, fp) < 0)) +			return -1; +		if (!p) +			return 0; +		if (fputc('\n', fp) < 0) +			return -1; +		count -= p + 1 - buf; +		buf = p + 1; +	} +	return 0; +} + + diff --git a/tools/perf/util/color.h b/tools/perf/util/color.h new file mode 100644 index 00000000000..5abfd379582 --- /dev/null +++ b/tools/perf/util/color.h @@ -0,0 +1,36 @@ +#ifndef COLOR_H +#define COLOR_H + +/* "\033[1;38;5;2xx;48;5;2xxm\0" is 23 bytes */ +#define COLOR_MAXLEN 24 + +#define PERF_COLOR_NORMAL	"" +#define PERF_COLOR_RESET	"\033[m" +#define PERF_COLOR_BOLD		"\033[1m" +#define PERF_COLOR_RED		"\033[31m" +#define PERF_COLOR_GREEN	"\033[32m" +#define PERF_COLOR_YELLOW	"\033[33m" +#define PERF_COLOR_BLUE		"\033[34m" +#define PERF_COLOR_MAGENTA	"\033[35m" +#define PERF_COLOR_CYAN		"\033[36m" +#define PERF_COLOR_BG_RED	"\033[41m" + +/* + * This variable stores the value of color.ui + */ +extern int perf_use_color_default; + + +/* + * Use this instead of perf_default_config if you need the value of color.ui. 
+ */ +int perf_color_default_config(const char *var, const char *value, void *cb); + +int perf_config_colorbool(const char *var, const char *value, int stdout_is_tty); +void color_parse(const char *value, const char *var, char *dst); +void color_parse_mem(const char *value, int len, const char *var, char *dst); +int color_fprintf(FILE *fp, const char *color, const char *fmt, ...); +int color_fprintf_ln(FILE *fp, const char *color, const char *fmt, ...); +int color_fwrite_lines(FILE *fp, const char *color, size_t count, const char *buf); + +#endif /* COLOR_H */ diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c new file mode 100644 index 00000000000..3dd13faa6a2 --- /dev/null +++ b/tools/perf/util/config.c @@ -0,0 +1,873 @@ +/* + * GIT - The information manager from hell + * + * Copyright (C) Linus Torvalds, 2005 + * Copyright (C) Johannes Schindelin, 2005 + * + */ +#include "util.h" +#include "cache.h" +#include "exec_cmd.h" + +#define MAXNAME (256) + +static FILE *config_file; +static const char *config_file_name; +static int config_linenr; +static int config_file_eof; + +const char *config_exclusive_filename = NULL; + +static int get_next_char(void) +{ +	int c; +	FILE *f; + +	c = '\n'; +	if ((f = config_file) != NULL) { +		c = fgetc(f); +		if (c == '\r') { +			/* DOS like systems */ +			c = fgetc(f); +			if (c != '\n') { +				ungetc(c, f); +				c = '\r'; +			} +		} +		if (c == '\n') +			config_linenr++; +		if (c == EOF) { +			config_file_eof = 1; +			c = '\n'; +		} +	} +	return c; +} + +static char *parse_value(void) +{ +	static char value[1024]; +	int quote = 0, comment = 0, len = 0, space = 0; + +	for (;;) { +		int c = get_next_char(); +		if (len >= sizeof(value) - 1) +			return NULL; +		if (c == '\n') { +			if (quote) +				return NULL; +			value[len] = 0; +			return value; +		} +		if (comment) +			continue; +		if (isspace(c) && !quote) { +			space = 1; +			continue; +		} +		if (!quote) { +			if (c == ';' || c == '#') { +				comment = 1; +				continue; +			} +		} +		if (space) { +			if (len) +				value[len++] = ' '; +			space = 0; +		} +		if (c == '\\') { +			c = get_next_char(); +			switch (c) { +			case '\n': +				continue; +			case 't': +				c = '\t'; +				break; +			case 'b': +				c = '\b'; +				break; +			case 'n': +				c = '\n'; +				break; +			/* Some characters escape as themselves */ +			case '\\': case '"': +				break; +			/* Reject unknown escape sequences */ +			default: +				return NULL; +			} +			value[len++] = c; +			continue; +		} +		if (c == '"') { +			quote = 1-quote; +			continue; +		} +		value[len++] = c; +	} +} + +static inline int iskeychar(int c) +{ +	return isalnum(c) || c == '-'; +} + +static int get_value(config_fn_t fn, void *data, char *name, unsigned int len) +{ +	int c; +	char *value; + +	/* Get the full name */ +	for (;;) { +		c = get_next_char(); +		if (config_file_eof) +			break; +		if (!iskeychar(c)) +			break; +		name[len++] = tolower(c); +		if (len >= MAXNAME) +			return -1; +	} +	name[len] = 0; +	while (c == ' ' || c == '\t') +		c = get_next_char(); + +	value = NULL; +	if (c != '\n') { +		if (c != '=') +			return -1; +		value = parse_value(); +		if (!value) +			return -1; +	} +	return fn(name, value, data); +} + +static int get_extended_base_var(char *name, int baselen, int c) +{ +	do { +		if (c == '\n') +			return -1; +		c = get_next_char(); +	} while (isspace(c)); + +	/* We require the format to be '[base "extension"]' */ +	if (c != '"') +		return -1; +	name[baselen++] = '.'; + +	for (;;) { +		int c = get_next_char(); +		if (c == 
'\n') +			return -1; +		if (c == '"') +			break; +		if (c == '\\') { +			c = get_next_char(); +			if (c == '\n') +				return -1; +		} +		name[baselen++] = c; +		if (baselen > MAXNAME / 2) +			return -1; +	} + +	/* Final ']' */ +	if (get_next_char() != ']') +		return -1; +	return baselen; +} + +static int get_base_var(char *name) +{ +	int baselen = 0; + +	for (;;) { +		int c = get_next_char(); +		if (config_file_eof) +			return -1; +		if (c == ']') +			return baselen; +		if (isspace(c)) +			return get_extended_base_var(name, baselen, c); +		if (!iskeychar(c) && c != '.') +			return -1; +		if (baselen > MAXNAME / 2) +			return -1; +		name[baselen++] = tolower(c); +	} +} + +static int perf_parse_file(config_fn_t fn, void *data) +{ +	int comment = 0; +	int baselen = 0; +	static char var[MAXNAME]; + +	/* U+FEFF Byte Order Mark in UTF8 */ +	static const unsigned char *utf8_bom = (unsigned char *) "\xef\xbb\xbf"; +	const unsigned char *bomptr = utf8_bom; + +	for (;;) { +		int c = get_next_char(); +		if (bomptr && *bomptr) { +			/* We are at the file beginning; skip UTF8-encoded BOM +			 * if present. Sane editors won't put this in on their +			 * own, but e.g. Windows Notepad will do it happily. */ +			if ((unsigned char) c == *bomptr) { +				bomptr++; +				continue; +			} else { +				/* Do not tolerate partial BOM. */ +				if (bomptr != utf8_bom) +					break; +				/* No BOM at file beginning. Cool. */ +				bomptr = NULL; +			} +		} +		if (c == '\n') { +			if (config_file_eof) +				return 0; +			comment = 0; +			continue; +		} +		if (comment || isspace(c)) +			continue; +		if (c == '#' || c == ';') { +			comment = 1; +			continue; +		} +		if (c == '[') { +			baselen = get_base_var(var); +			if (baselen <= 0) +				break; +			var[baselen++] = '.'; +			var[baselen] = 0; +			continue; +		} +		if (!isalpha(c)) +			break; +		var[baselen] = tolower(c); +		if (get_value(fn, data, var, baselen+1) < 0) +			break; +	} +	die("bad config file line %d in %s", config_linenr, config_file_name); +} + +static int parse_unit_factor(const char *end, unsigned long *val) +{ +	if (!*end) +		return 1; +	else if (!strcasecmp(end, "k")) { +		*val *= 1024; +		return 1; +	} +	else if (!strcasecmp(end, "m")) { +		*val *= 1024 * 1024; +		return 1; +	} +	else if (!strcasecmp(end, "g")) { +		*val *= 1024 * 1024 * 1024; +		return 1; +	} +	return 0; +} + +static int perf_parse_long(const char *value, long *ret) +{ +	if (value && *value) { +		char *end; +		long val = strtol(value, &end, 0); +		unsigned long factor = 1; +		if (!parse_unit_factor(end, &factor)) +			return 0; +		*ret = val * factor; +		return 1; +	} +	return 0; +} + +int perf_parse_ulong(const char *value, unsigned long *ret) +{ +	if (value && *value) { +		char *end; +		unsigned long val = strtoul(value, &end, 0); +		if (!parse_unit_factor(end, &val)) +			return 0; +		*ret = val; +		return 1; +	} +	return 0; +} + +static void die_bad_config(const char *name) +{ +	if (config_file_name) +		die("bad config value for '%s' in %s", name, config_file_name); +	die("bad config value for '%s'", name); +} + +int perf_config_int(const char *name, const char *value) +{ +	long ret = 0; +	if (!perf_parse_long(value, &ret)) +		die_bad_config(name); +	return ret; +} + +unsigned long perf_config_ulong(const char *name, const char *value) +{ +	unsigned long ret; +	if (!perf_parse_ulong(value, &ret)) +		die_bad_config(name); +	return ret; +} + +int perf_config_bool_or_int(const char *name, const char *value, int *is_bool) +{ +	*is_bool = 1; +	if (!value) +		return 1; +	if (!*value) +		
return 0; +	if (!strcasecmp(value, "true") || !strcasecmp(value, "yes") || !strcasecmp(value, "on")) +		return 1; +	if (!strcasecmp(value, "false") || !strcasecmp(value, "no") || !strcasecmp(value, "off")) +		return 0; +	*is_bool = 0; +	return perf_config_int(name, value); +} + +int perf_config_bool(const char *name, const char *value) +{ +	int discard; +	return !!perf_config_bool_or_int(name, value, &discard); +} + +int perf_config_string(const char **dest, const char *var, const char *value) +{ +	if (!value) +		return config_error_nonbool(var); +	*dest = strdup(value); +	return 0; +} + +static int perf_default_core_config(const char *var, const char *value) +{ +	/* Add other config variables here and to Documentation/config.txt. */ +	return 0; +} + +int perf_default_config(const char *var, const char *value, void *dummy) +{ +	if (!prefixcmp(var, "core.")) +		return perf_default_core_config(var, value); + +	/* Add other config variables here and to Documentation/config.txt. */ +	return 0; +} + +int perf_config_from_file(config_fn_t fn, const char *filename, void *data) +{ +	int ret; +	FILE *f = fopen(filename, "r"); + +	ret = -1; +	if (f) { +		config_file = f; +		config_file_name = filename; +		config_linenr = 1; +		config_file_eof = 0; +		ret = perf_parse_file(fn, data); +		fclose(f); +		config_file_name = NULL; +	} +	return ret; +} + +const char *perf_etc_perfconfig(void) +{ +	static const char *system_wide; +	if (!system_wide) +		system_wide = system_path(ETC_PERFCONFIG); +	return system_wide; +} + +static int perf_env_bool(const char *k, int def) +{ +	const char *v = getenv(k); +	return v ? perf_config_bool(k, v) : def; +} + +int perf_config_system(void) +{ +	return !perf_env_bool("PERF_CONFIG_NOSYSTEM", 0); +} + +int perf_config_global(void) +{ +	return !perf_env_bool("PERF_CONFIG_NOGLOBAL", 0); +} + +int perf_config(config_fn_t fn, void *data) +{ +	int ret = 0, found = 0; +	char *repo_config = NULL; +	const char *home = NULL; + +	/* Setting $PERF_CONFIG makes perf read _only_ the given config file. */ +	if (config_exclusive_filename) +		return perf_config_from_file(fn, config_exclusive_filename, data); +	if (perf_config_system() && !access(perf_etc_perfconfig(), R_OK)) { +		ret += perf_config_from_file(fn, perf_etc_perfconfig(), +					    data); +		found += 1; +	} + +	home = getenv("HOME"); +	if (perf_config_global() && home) { +		char *user_config = strdup(mkpath("%s/.perfconfig", home)); +		if (!access(user_config, R_OK)) { +			ret += perf_config_from_file(fn, user_config, data); +			found += 1; +		} +		free(user_config); +	} + +	repo_config = perf_pathdup("config"); +	if (!access(repo_config, R_OK)) { +		ret += perf_config_from_file(fn, repo_config, data); +		found += 1; +	} +	free(repo_config); +	if (found == 0) +		return -1; +	return ret; +} + +/* + * Find all the stuff for perf_config_set() below. 
+ */ + +#define MAX_MATCHES 512 + +static struct { +	int baselen; +	char* key; +	int do_not_match; +	regex_t* value_regex; +	int multi_replace; +	size_t offset[MAX_MATCHES]; +	enum { START, SECTION_SEEN, SECTION_END_SEEN, KEY_SEEN } state; +	int seen; +} store; + +static int matches(const char* key, const char* value) +{ +	return !strcmp(key, store.key) && +		(store.value_regex == NULL || +		 (store.do_not_match ^ +		  !regexec(store.value_regex, value, 0, NULL, 0))); +} + +static int store_aux(const char* key, const char* value, void *cb) +{ +	const char *ep; +	size_t section_len; + +	switch (store.state) { +	case KEY_SEEN: +		if (matches(key, value)) { +			if (store.seen == 1 && store.multi_replace == 0) { +				warning("%s has multiple values", key); +			} else if (store.seen >= MAX_MATCHES) { +				error("too many matches for %s", key); +				return 1; +			} + +			store.offset[store.seen] = ftell(config_file); +			store.seen++; +		} +		break; +	case SECTION_SEEN: +		/* +		 * What we are looking for is in store.key (both +		 * section and var), and its section part is baselen +		 * long.  We found key (again, both section and var). +		 * We would want to know if this key is in the same +		 * section as what we are looking for.  We already +		 * know we are in the same section as what should +		 * hold store.key. +		 */ +		ep = strrchr(key, '.'); +		section_len = ep - key; + +		if ((section_len != store.baselen) || +		    memcmp(key, store.key, section_len+1)) { +			store.state = SECTION_END_SEEN; +			break; +		} + +		/* +		 * Do not increment matches: this is no match, but we +		 * just made sure we are in the desired section. +		 */ +		store.offset[store.seen] = ftell(config_file); +		/* fallthru */ +	case SECTION_END_SEEN: +	case START: +		if (matches(key, value)) { +			store.offset[store.seen] = ftell(config_file); +			store.state = KEY_SEEN; +			store.seen++; +		} else { +			if (strrchr(key, '.') - key == store.baselen && +			      !strncmp(key, store.key, store.baselen)) { +					store.state = SECTION_SEEN; +					store.offset[store.seen] = ftell(config_file); +			} +		} +	} +	return 0; +} + +static int store_write_section(int fd, const char* key) +{ +	const char *dot; +	int i, success; +	struct strbuf sb = STRBUF_INIT; + +	dot = memchr(key, '.', store.baselen); +	if (dot) { +		strbuf_addf(&sb, "[%.*s \"", (int)(dot - key), key); +		for (i = dot - key + 1; i < store.baselen; i++) { +			if (key[i] == '"' || key[i] == '\\') +				strbuf_addch(&sb, '\\'); +			strbuf_addch(&sb, key[i]); +		} +		strbuf_addstr(&sb, "\"]\n"); +	} else { +		strbuf_addf(&sb, "[%.*s]\n", store.baselen, key); +	} + +	success = write_in_full(fd, sb.buf, sb.len) == sb.len; +	strbuf_release(&sb); + +	return success; +} + +static int store_write_pair(int fd, const char* key, const char* value) +{ +	int i, success; +	int length = strlen(key + store.baselen + 1); +	const char *quote = ""; +	struct strbuf sb = STRBUF_INIT; + +	/* +	 * Check to see if the value needs to be surrounded with a dq pair. +	 * Note that problematic characters are always backslash-quoted; this +	 * check is about not losing leading or trailing SP and strings that +	 * follow beginning-of-comment characters (i.e. ';' and '#') by the +	 * configuration parser. 
+	 */ +	if (value[0] == ' ') +		quote = "\""; +	for (i = 0; value[i]; i++) +		if (value[i] == ';' || value[i] == '#') +			quote = "\""; +	if (i && value[i - 1] == ' ') +		quote = "\""; + +	strbuf_addf(&sb, "\t%.*s = %s", +		    length, key + store.baselen + 1, quote); + +	for (i = 0; value[i]; i++) +		switch (value[i]) { +		case '\n': +			strbuf_addstr(&sb, "\\n"); +			break; +		case '\t': +			strbuf_addstr(&sb, "\\t"); +			break; +		case '"': +		case '\\': +			strbuf_addch(&sb, '\\'); +		default: +			strbuf_addch(&sb, value[i]); +			break; +		} +	strbuf_addf(&sb, "%s\n", quote); + +	success = write_in_full(fd, sb.buf, sb.len) == sb.len; +	strbuf_release(&sb); + +	return success; +} + +static ssize_t find_beginning_of_line(const char* contents, size_t size, +	size_t offset_, int* found_bracket) +{ +	size_t equal_offset = size, bracket_offset = size; +	ssize_t offset; + +contline: +	for (offset = offset_-2; offset > 0 +			&& contents[offset] != '\n'; offset--) +		switch (contents[offset]) { +			case '=': equal_offset = offset; break; +			case ']': bracket_offset = offset; break; +		} +	if (offset > 0 && contents[offset-1] == '\\') { +		offset_ = offset; +		goto contline; +	} +	if (bracket_offset < equal_offset) { +		*found_bracket = 1; +		offset = bracket_offset+1; +	} else +		offset++; + +	return offset; +} + +int perf_config_set(const char* key, const char* value) +{ +	return perf_config_set_multivar(key, value, NULL, 0); +} + +/* + * If value==NULL, unset in (remove from) config, + * if value_regex!=NULL, disregard key/value pairs where value does not match. + * if multi_replace==0, nothing, or only one matching key/value is replaced, + *     else all matching key/values (regardless how many) are removed, + *     before the new pair is written. + * + * Returns 0 on success. + * + * This function does this: + * + * - it locks the config file by creating ".perf/config.lock" + * + * - it then parses the config using store_aux() as validator to find + *   the position on the key/value pair to replace. If it is to be unset, + *   it must be found exactly once. + * + * - the config file is mmap()ed and the part before the match (if any) is + *   written to the lock file, then the changed part and the rest. + * + * - the config file is removed and the lock file rename()d to it. + * + */ +int perf_config_set_multivar(const char* key, const char* value, +	const char* value_regex, int multi_replace) +{ +	int i, dot; +	int fd = -1, in_fd; +	int ret = 0; +	char* config_filename; +	const char* last_dot = strrchr(key, '.'); + +	if (config_exclusive_filename) +		config_filename = strdup(config_exclusive_filename); +	else +		config_filename = perf_pathdup("config"); + +	/* +	 * Since "key" actually contains the section name and the real +	 * key name separated by a dot, we have to know where the dot is. +	 */ + +	if (last_dot == NULL) { +		error("key does not contain a section: %s", key); +		ret = 2; +		goto out_free; +	} +	store.baselen = last_dot - key; + +	store.multi_replace = multi_replace; + +	/* +	 * Validate the key and while at it, lower case it for matching. +	 */ +	store.key = malloc(strlen(key) + 1); +	dot = 0; +	for (i = 0; key[i]; i++) { +		unsigned char c = key[i]; +		if (c == '.') +			dot = 1; +		/* Leave the extended basename untouched.. 
*/ +		if (!dot || i > store.baselen) { +			if (!iskeychar(c) || (i == store.baselen+1 && !isalpha(c))) { +				error("invalid key: %s", key); +				free(store.key); +				ret = 1; +				goto out_free; +			} +			c = tolower(c); +		} else if (c == '\n') { +			error("invalid key (newline): %s", key); +			free(store.key); +			ret = 1; +			goto out_free; +		} +		store.key[i] = c; +	} +	store.key[i] = 0; + +	/* +	 * If .perf/config does not exist yet, write a minimal version. +	 */ +	in_fd = open(config_filename, O_RDONLY); +	if ( in_fd < 0 ) { +		free(store.key); + +		if ( ENOENT != errno ) { +			error("opening %s: %s", config_filename, +			      strerror(errno)); +			ret = 3; /* same as "invalid config file" */ +			goto out_free; +		} +		/* if nothing to unset, error out */ +		if (value == NULL) { +			ret = 5; +			goto out_free; +		} + +		store.key = (char*)key; +		if (!store_write_section(fd, key) || +		    !store_write_pair(fd, key, value)) +			goto write_err_out; +	} else { +		struct stat st; +		char* contents; +		size_t contents_sz, copy_begin, copy_end; +		int i, new_line = 0; + +		if (value_regex == NULL) +			store.value_regex = NULL; +		else { +			if (value_regex[0] == '!') { +				store.do_not_match = 1; +				value_regex++; +			} else +				store.do_not_match = 0; + +			store.value_regex = (regex_t*)malloc(sizeof(regex_t)); +			if (regcomp(store.value_regex, value_regex, +					REG_EXTENDED)) { +				error("invalid pattern: %s", value_regex); +				free(store.value_regex); +				ret = 6; +				goto out_free; +			} +		} + +		store.offset[0] = 0; +		store.state = START; +		store.seen = 0; + +		/* +		 * After this, store.offset will contain the *end* offset +		 * of the last match, or remain at 0 if no match was found. +		 * As a side effect, we make sure to transform only a valid +		 * existing config file. 
+		 */ +		if (perf_config_from_file(store_aux, config_filename, NULL)) { +			error("invalid config file %s", config_filename); +			free(store.key); +			if (store.value_regex != NULL) { +				regfree(store.value_regex); +				free(store.value_regex); +			} +			ret = 3; +			goto out_free; +		} + +		free(store.key); +		if (store.value_regex != NULL) { +			regfree(store.value_regex); +			free(store.value_regex); +		} + +		/* if nothing to unset, or too many matches, error out */ +		if ((store.seen == 0 && value == NULL) || +				(store.seen > 1 && multi_replace == 0)) { +			ret = 5; +			goto out_free; +		} + +		fstat(in_fd, &st); +		contents_sz = xsize_t(st.st_size); +		contents = mmap(NULL, contents_sz, PROT_READ, +			MAP_PRIVATE, in_fd, 0); +		close(in_fd); + +		if (store.seen == 0) +			store.seen = 1; + +		for (i = 0, copy_begin = 0; i < store.seen; i++) { +			if (store.offset[i] == 0) { +				store.offset[i] = copy_end = contents_sz; +			} else if (store.state != KEY_SEEN) { +				copy_end = store.offset[i]; +			} else +				copy_end = find_beginning_of_line( +					contents, contents_sz, +					store.offset[i]-2, &new_line); + +			if (copy_end > 0 && contents[copy_end-1] != '\n') +				new_line = 1; + +			/* write the first part of the config */ +			if (copy_end > copy_begin) { +				if (write_in_full(fd, contents + copy_begin, +						  copy_end - copy_begin) < +				    copy_end - copy_begin) +					goto write_err_out; +				if (new_line && +				    write_in_full(fd, "\n", 1) != 1) +					goto write_err_out; +			} +			copy_begin = store.offset[i]; +		} + +		/* write the pair (value == NULL means unset) */ +		if (value != NULL) { +			if (store.state == START) { +				if (!store_write_section(fd, key)) +					goto write_err_out; +			} +			if (!store_write_pair(fd, key, value)) +				goto write_err_out; +		} + +		/* write the rest of the config */ +		if (copy_begin < contents_sz) +			if (write_in_full(fd, contents + copy_begin, +					  contents_sz - copy_begin) < +			    contents_sz - copy_begin) +				goto write_err_out; + +		munmap(contents, contents_sz); +	} + +	ret = 0; + +out_free: +	free(config_filename); +	return ret; + +write_err_out: +	goto out_free; + +} + +/* + * Call this to report error for your variable that should not + * get a boolean value (i.e. "[my] var" means "true"). + */ +int config_error_nonbool(const char *var) +{ +	return error("Missing value for '%s'", var); +} diff --git a/tools/perf/util/ctype.c b/tools/perf/util/ctype.c new file mode 100644 index 00000000000..b90ec004f29 --- /dev/null +++ b/tools/perf/util/ctype.c @@ -0,0 +1,26 @@ +/* + * Sane locale-independent, ASCII ctype. + * + * No surprises, and works with signed and unsigned chars. + */ +#include "cache.h" + +enum { +	S = GIT_SPACE, +	A = GIT_ALPHA, +	D = GIT_DIGIT, +	G = GIT_GLOB_SPECIAL,	/* *, ?, [, \\ */ +	R = GIT_REGEX_SPECIAL,	/* $, (, ), +, ., ^, {, | * */ +}; + +unsigned char sane_ctype[256] = { +	0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, 0, S, 0, 0,		/*   0.. 15 */ +	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,		/*  16.. 31 */ +	S, 0, 0, 0, R, 0, 0, 0, R, R, G, R, 0, 0, R, 0,		/*  32.. 47 */ +	D, D, D, D, D, D, D, D, D, D, 0, 0, 0, 0, 0, G,		/*  48.. 63 */ +	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  64.. 79 */ +	A, A, A, A, A, A, A, A, A, A, A, G, G, 0, R, 0,		/*  80.. 95 */ +	0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  96..111 */ +	A, A, A, A, A, A, A, A, A, A, A, R, R, 0, 0, 0,		/* 112..127 */ +	/* Nothing in the 128.. 
range */
+};
diff --git a/tools/perf/util/environment.c b/tools/perf/util/environment.c
new file mode 100644
index 00000000000..275b0ee345f
--- /dev/null
+++ b/tools/perf/util/environment.c
@@ -0,0 +1,9 @@
+/*
+ * We put all the perf config variables in this same object
+ * file, so that programs can link against the config parser
+ * without having to link against all the rest of perf.
+ */
+#include "cache.h"
+
+const char *pager_program;
+int pager_use_color = 1;
diff --git a/tools/perf/util/exec_cmd.c b/tools/perf/util/exec_cmd.c
new file mode 100644
index 00000000000..d3929226315
--- /dev/null
+++ b/tools/perf/util/exec_cmd.c
@@ -0,0 +1,165 @@
+#include "cache.h"
+#include "exec_cmd.h"
+#include "quote.h"
+#define MAX_ARGS	32
+
+extern char **environ;
+static const char *argv_exec_path;
+static const char *argv0_path;
+
+const char *system_path(const char *path)
+{
+#ifdef RUNTIME_PREFIX
+	static const char *prefix;
+#else
+	static const char *prefix = PREFIX;
+#endif
+	struct strbuf d = STRBUF_INIT;
+
+	if (is_absolute_path(path))
+		return path;
+
+#ifdef RUNTIME_PREFIX
+	assert(argv0_path);
+	assert(is_absolute_path(argv0_path));
+
+	if (!prefix &&
+	    !(prefix = strip_path_suffix(argv0_path, PERF_EXEC_PATH)) &&
+	    !(prefix = strip_path_suffix(argv0_path, BINDIR)) &&
+	    !(prefix = strip_path_suffix(argv0_path, "perf"))) {
+		prefix = PREFIX;
+		fprintf(stderr, "RUNTIME_PREFIX requested, "
+				"but prefix computation failed.  "
+				"Using static fallback '%s'.\n", prefix);
+	}
+#endif
+
+	strbuf_addf(&d, "%s/%s", prefix, path);
+	path = strbuf_detach(&d, NULL);
+	return path;
+}
+
+const char *perf_extract_argv0_path(const char *argv0)
+{
+	const char *slash;
+
+	if (!argv0 || !*argv0)
+		return NULL;
+	slash = argv0 + strlen(argv0);
+
+	while (argv0 <= slash && !is_dir_sep(*slash))
+		slash--;
+
+	if (slash >= argv0) {
+		argv0_path = strndup(argv0, slash - argv0);
+		return slash + 1;
+	}
+
+	return argv0;
+}
+
+void perf_set_argv_exec_path(const char *exec_path)
+{
+	argv_exec_path = exec_path;
+	/*
+	 * Propagate this setting to external programs.
+	 */
+	setenv(EXEC_PATH_ENVIRONMENT, exec_path, 1);
+}
+
+
+/* Returns the highest-priority location to look for perf programs.
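The lookup order implemented in perf_exec_path() just below is: a path set programmatically via perf_set_argv_exec_path(), then the EXEC_PATH_ENVIRONMENT variable, then the compiled-in system path. A minimal standalone analogue of that precedence chain (the tool name and paths here are made up):

#include <stdio.h>
#include <stdlib.h>

static const char *argv_exec_path;	/* set by the command line, wins */

static const char *exec_path(void)
{
	const char *env;

	if (argv_exec_path)			/* 1. explicit override */
		return argv_exec_path;
	env = getenv("MYTOOL_EXEC_PATH");	/* 2. environment */
	if (env && *env)
		return env;
	return "/usr/libexec/mytool";		/* 3. compiled-in default */
}

int main(void)
{
	printf("%s\n", exec_path());
	return 0;
}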
*/ +const char *perf_exec_path(void) +{ +	const char *env; + +	if (argv_exec_path) +		return argv_exec_path; + +	env = getenv(EXEC_PATH_ENVIRONMENT); +	if (env && *env) { +		return env; +	} + +	return system_path(PERF_EXEC_PATH); +} + +static void add_path(struct strbuf *out, const char *path) +{ +	if (path && *path) { +		if (is_absolute_path(path)) +			strbuf_addstr(out, path); +		else +			strbuf_addstr(out, make_nonrelative_path(path)); + +		strbuf_addch(out, PATH_SEP); +	} +} + +void setup_path(void) +{ +	const char *old_path = getenv("PATH"); +	struct strbuf new_path = STRBUF_INIT; + +	add_path(&new_path, perf_exec_path()); +	add_path(&new_path, argv0_path); + +	if (old_path) +		strbuf_addstr(&new_path, old_path); +	else +		strbuf_addstr(&new_path, "/usr/local/bin:/usr/bin:/bin"); + +	setenv("PATH", new_path.buf, 1); + +	strbuf_release(&new_path); +} + +const char **prepare_perf_cmd(const char **argv) +{ +	int argc; +	const char **nargv; + +	for (argc = 0; argv[argc]; argc++) +		; /* just counting */ +	nargv = malloc(sizeof(*nargv) * (argc + 2)); + +	nargv[0] = "perf"; +	for (argc = 0; argv[argc]; argc++) +		nargv[argc + 1] = argv[argc]; +	nargv[argc + 1] = NULL; +	return nargv; +} + +int execv_perf_cmd(const char **argv) { +	const char **nargv = prepare_perf_cmd(argv); + +	/* execvp() can only ever return if it fails */ +	execvp("perf", (char **)nargv); + +	free(nargv); +	return -1; +} + + +int execl_perf_cmd(const char *cmd,...) +{ +	int argc; +	const char *argv[MAX_ARGS + 1]; +	const char *arg; +	va_list param; + +	va_start(param, cmd); +	argv[0] = cmd; +	argc = 1; +	while (argc < MAX_ARGS) { +		arg = argv[argc++] = va_arg(param, char *); +		if (!arg) +			break; +	} +	va_end(param); +	if (MAX_ARGS <= argc) +		return error("too many args to run %s", cmd); + +	argv[argc] = NULL; +	return execv_perf_cmd(argv); +} diff --git a/tools/perf/util/exec_cmd.h b/tools/perf/util/exec_cmd.h new file mode 100644 index 00000000000..effe25eb154 --- /dev/null +++ b/tools/perf/util/exec_cmd.h @@ -0,0 +1,13 @@ +#ifndef PERF_EXEC_CMD_H +#define PERF_EXEC_CMD_H + +extern void perf_set_argv_exec_path(const char *exec_path); +extern const char *perf_extract_argv0_path(const char *path); +extern const char *perf_exec_path(void); +extern void setup_path(void); +extern const char **prepare_perf_cmd(const char **argv); +extern int execv_perf_cmd(const char **argv); /* NULL terminated */ +extern int execl_perf_cmd(const char *cmd, ...); +extern const char *system_path(const char *path); + +#endif /* PERF_EXEC_CMD_H */ diff --git a/tools/perf/util/generate-cmdlist.sh b/tools/perf/util/generate-cmdlist.sh new file mode 100755 index 00000000000..f06f6fd148f --- /dev/null +++ b/tools/perf/util/generate-cmdlist.sh @@ -0,0 +1,24 @@ +#!/bin/sh + +echo "/* Automatically generated by $0 */ +struct cmdname_help +{ +    char name[16]; +    char help[80]; +}; + +static struct cmdname_help common_cmds[] = {" + +sed -n -e 's/^perf-\([^ 	]*\)[ 	].* common.*/\1/p' command-list.txt | +sort | +while read cmd +do +     sed -n ' +     /^NAME/,/perf-'"$cmd"'/H +     ${ +            x +            s/.*perf-'"$cmd"' - \(.*\)/  {"'"$cmd"'", "\1"},/ +	    p +     }' "Documentation/perf-$cmd.txt" +done +echo "};" diff --git a/tools/perf/util/help.c b/tools/perf/util/help.c new file mode 100644 index 00000000000..6653f7dd1d7 --- /dev/null +++ b/tools/perf/util/help.c @@ -0,0 +1,367 @@ +#include "cache.h" +#include "../builtin.h" +#include "exec_cmd.h" +#include "levenshtein.h" +#include "help.h" + +/* most GUI terminals set COLUMNS 
(although some don't export it) */ +static int term_columns(void) +{ +	char *col_string = getenv("COLUMNS"); +	int n_cols; + +	if (col_string && (n_cols = atoi(col_string)) > 0) +		return n_cols; + +#ifdef TIOCGWINSZ +	{ +		struct winsize ws; +		if (!ioctl(1, TIOCGWINSZ, &ws)) { +			if (ws.ws_col) +				return ws.ws_col; +		} +	} +#endif + +	return 80; +} + +void add_cmdname(struct cmdnames *cmds, const char *name, int len) +{ +	struct cmdname *ent = malloc(sizeof(*ent) + len + 1); + +	ent->len = len; +	memcpy(ent->name, name, len); +	ent->name[len] = 0; + +	ALLOC_GROW(cmds->names, cmds->cnt + 1, cmds->alloc); +	cmds->names[cmds->cnt++] = ent; +} + +static void clean_cmdnames(struct cmdnames *cmds) +{ +	int i; +	for (i = 0; i < cmds->cnt; ++i) +		free(cmds->names[i]); +	free(cmds->names); +	cmds->cnt = 0; +	cmds->alloc = 0; +} + +static int cmdname_compare(const void *a_, const void *b_) +{ +	struct cmdname *a = *(struct cmdname **)a_; +	struct cmdname *b = *(struct cmdname **)b_; +	return strcmp(a->name, b->name); +} + +static void uniq(struct cmdnames *cmds) +{ +	int i, j; + +	if (!cmds->cnt) +		return; + +	for (i = j = 1; i < cmds->cnt; i++) +		if (strcmp(cmds->names[i]->name, cmds->names[i-1]->name)) +			cmds->names[j++] = cmds->names[i]; + +	cmds->cnt = j; +} + +void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) +{ +	int ci, cj, ei; +	int cmp; + +	ci = cj = ei = 0; +	while (ci < cmds->cnt && ei < excludes->cnt) { +		cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name); +		if (cmp < 0) +			cmds->names[cj++] = cmds->names[ci++]; +		else if (cmp == 0) +			ci++, ei++; +		else if (cmp > 0) +			ei++; +	} + +	while (ci < cmds->cnt) +		cmds->names[cj++] = cmds->names[ci++]; + +	cmds->cnt = cj; +} + +static void pretty_print_string_list(struct cmdnames *cmds, int longest) +{ +	int cols = 1, rows; +	int space = longest + 1; /* min 1 SP between words */ +	int max_cols = term_columns() - 1; /* don't print *on* the edge */ +	int i, j; + +	if (space < max_cols) +		cols = max_cols / space; +	rows = (cmds->cnt + cols - 1) / cols; + +	for (i = 0; i < rows; i++) { +		printf("  "); + +		for (j = 0; j < cols; j++) { +			int n = j * rows + i; +			int size = space; +			if (n >= cmds->cnt) +				break; +			if (j == cols-1 || n + rows >= cmds->cnt) +				size = 1; +			printf("%-*s", size, cmds->names[n]->name); +		} +		putchar('\n'); +	} +} + +static int is_executable(const char *name) +{ +	struct stat st; + +	if (stat(name, &st) || /* stat, not lstat */ +	    !S_ISREG(st.st_mode)) +		return 0; + +#ifdef __MINGW32__ +	/* cannot trust the executable bit, peek into the file instead */ +	char buf[3] = { 0 }; +	int n; +	int fd = open(name, O_RDONLY); +	st.st_mode &= ~S_IXUSR; +	if (fd >= 0) { +		n = read(fd, buf, 2); +		if (n == 2) +			/* DOS executables start with "MZ" */ +			if (!strcmp(buf, "#!") || !strcmp(buf, "MZ")) +				st.st_mode |= S_IXUSR; +		close(fd); +	} +#endif +	return st.st_mode & S_IXUSR; +} + +static void list_commands_in_dir(struct cmdnames *cmds, +					 const char *path, +					 const char *prefix) +{ +	int prefix_len; +	DIR *dir = opendir(path); +	struct dirent *de; +	struct strbuf buf = STRBUF_INIT; +	int len; + +	if (!dir) +		return; +	if (!prefix) +		prefix = "perf-"; +	prefix_len = strlen(prefix); + +	strbuf_addf(&buf, "%s/", path); +	len = buf.len; + +	while ((de = readdir(dir)) != NULL) { +		int entlen; + +		if (prefixcmp(de->d_name, prefix)) +			continue; + +		strbuf_setlen(&buf, len); +		strbuf_addstr(&buf, de->d_name); +		if (!is_executable(buf.buf)) +			
continue; + +		entlen = strlen(de->d_name) - prefix_len; +		if (has_extension(de->d_name, ".exe")) +			entlen -= 4; + +		add_cmdname(cmds, de->d_name + prefix_len, entlen); +	} +	closedir(dir); +	strbuf_release(&buf); +} + +void load_command_list(const char *prefix, +		struct cmdnames *main_cmds, +		struct cmdnames *other_cmds) +{ +	const char *env_path = getenv("PATH"); +	const char *exec_path = perf_exec_path(); + +	if (exec_path) { +		list_commands_in_dir(main_cmds, exec_path, prefix); +		qsort(main_cmds->names, main_cmds->cnt, +		      sizeof(*main_cmds->names), cmdname_compare); +		uniq(main_cmds); +	} + +	if (env_path) { +		char *paths, *path, *colon; +		path = paths = strdup(env_path); +		while (1) { +			if ((colon = strchr(path, PATH_SEP))) +				*colon = 0; +			if (!exec_path || strcmp(path, exec_path)) +				list_commands_in_dir(other_cmds, path, prefix); + +			if (!colon) +				break; +			path = colon + 1; +		} +		free(paths); + +		qsort(other_cmds->names, other_cmds->cnt, +		      sizeof(*other_cmds->names), cmdname_compare); +		uniq(other_cmds); +	} +	exclude_cmds(other_cmds, main_cmds); +} + +void list_commands(const char *title, struct cmdnames *main_cmds, +		   struct cmdnames *other_cmds) +{ +	int i, longest = 0; + +	for (i = 0; i < main_cmds->cnt; i++) +		if (longest < main_cmds->names[i]->len) +			longest = main_cmds->names[i]->len; +	for (i = 0; i < other_cmds->cnt; i++) +		if (longest < other_cmds->names[i]->len) +			longest = other_cmds->names[i]->len; + +	if (main_cmds->cnt) { +		const char *exec_path = perf_exec_path(); +		printf("available %s in '%s'\n", title, exec_path); +		printf("----------------"); +		mput_char('-', strlen(title) + strlen(exec_path)); +		putchar('\n'); +		pretty_print_string_list(main_cmds, longest); +		putchar('\n'); +	} + +	if (other_cmds->cnt) { +		printf("%s available from elsewhere on your $PATH\n", title); +		printf("---------------------------------------"); +		mput_char('-', strlen(title)); +		putchar('\n'); +		pretty_print_string_list(other_cmds, longest); +		putchar('\n'); +	} +} + +int is_in_cmdlist(struct cmdnames *c, const char *s) +{ +	int i; +	for (i = 0; i < c->cnt; i++) +		if (!strcmp(s, c->names[i]->name)) +			return 1; +	return 0; +} + +static int autocorrect; +static struct cmdnames aliases; + +static int perf_unknown_cmd_config(const char *var, const char *value, void *cb) +{ +	if (!strcmp(var, "help.autocorrect")) +		autocorrect = perf_config_int(var,value); +	/* Also use aliases for command lookup */ +	if (!prefixcmp(var, "alias.")) +		add_cmdname(&aliases, var + 6, strlen(var + 6)); + +	return perf_default_config(var, value, cb); +} + +static int levenshtein_compare(const void *p1, const void *p2) +{ +	const struct cmdname *const *c1 = p1, *const *c2 = p2; +	const char *s1 = (*c1)->name, *s2 = (*c2)->name; +	int l1 = (*c1)->len; +	int l2 = (*c2)->len; +	return l1 != l2 ? 
l1 - l2 : strcmp(s1, s2);
+}
+
+static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old)
+{
+	int i;
+	ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc);
+
+	for (i = 0; i < old->cnt; i++)
+		cmds->names[cmds->cnt++] = old->names[i];
+	free(old->names);
+	old->cnt = 0;
+	old->names = NULL;
+}
+
+const char *help_unknown_cmd(const char *cmd)
+{
+	int i, n = 0, best_similarity = 0;
+	struct cmdnames main_cmds, other_cmds;
+
+	memset(&main_cmds, 0, sizeof(main_cmds));
+	memset(&other_cmds, 0, sizeof(other_cmds));
+	memset(&aliases, 0, sizeof(aliases));
+
+	perf_config(perf_unknown_cmd_config, NULL);
+
+	load_command_list("perf-", &main_cmds, &other_cmds);
+
+	add_cmd_list(&main_cmds, &aliases);
+	add_cmd_list(&main_cmds, &other_cmds);
+	qsort(main_cmds.names, main_cmds.cnt,
+	      sizeof(*main_cmds.names), cmdname_compare);
+	uniq(&main_cmds);
+
+	if (main_cmds.cnt) {
+		/* This reuses cmdname->len for similarity index */
+		for (i = 0; i < main_cmds.cnt; ++i)
+			main_cmds.names[i]->len =
+				levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4);
+
+		qsort(main_cmds.names, main_cmds.cnt,
+		      sizeof(*main_cmds.names), levenshtein_compare);
+
+		best_similarity = main_cmds.names[0]->len;
+		n = 1;
+		while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len)
+			++n;
+	}
+
+	if (autocorrect && n == 1) {
+		const char *assumed = main_cmds.names[0]->name;
+
+		main_cmds.names[0] = NULL;
+		clean_cmdnames(&main_cmds);
+		fprintf(stderr, "WARNING: You called a perf program named '%s', "
			"which does not exist.\n"
			"Continuing under the assumption that you meant '%s'\n",
			cmd, assumed);
+		if (autocorrect > 0) {
+			fprintf(stderr, "in %0.1f seconds automatically...\n",
				(float)autocorrect/10.0);
+			poll(NULL, 0, autocorrect * 100);
+		}
+		return assumed;
+	}
+
+	fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd);
+
+	if (main_cmds.cnt && best_similarity < 6) {
+		fprintf(stderr, "\nDid you mean %s?\n",
+			n < 2 ? "this": "one of these");
+
+		for (i = 0; i < n; i++)
+			fprintf(stderr, "\t%s\n", main_cmds.names[i]->name);
+	}
+
+	exit(1);
+}
+
+int cmd_version(int argc, const char **argv, const char *prefix)
+{
+	printf("perf version %s\n", perf_version_string);
+	return 0;
+}
diff --git a/tools/perf/util/help.h b/tools/perf/util/help.h
new file mode 100644
index 00000000000..56bc15406ff
--- /dev/null
+++ b/tools/perf/util/help.h
@@ -0,0 +1,29 @@
+#ifndef HELP_H
+#define HELP_H
+
+struct cmdnames {
+	int alloc;
+	int cnt;
+	struct cmdname {
+		size_t len; /* also used for similarity index in help.c */
+		char name[FLEX_ARRAY];
+	} **names;
+};
+
+static inline void mput_char(char c, unsigned int num)
+{
+	while(num--)
+		putchar(c);
+}
+
+void load_command_list(const char *prefix,
+		struct cmdnames *main_cmds,
+		struct cmdnames *other_cmds);
+void add_cmdname(struct cmdnames *cmds, const char *name, int len);
+/* Here we require that excludes is a sorted list.
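Requiring sorted input is what lets exclude_cmds() (declared below, implemented earlier in help.c) run as a single merge-style pass with two cursors instead of a nested O(n*m) scan. The same shape on plain string arrays (a standalone, illustrative sketch):

#include <stdio.h>
#include <string.h>

/* Drop from 'cmds' every name that also appears in 'excl'.
 * Both arrays must be sorted, exactly as exclude_cmds() requires. */
static int exclude(const char **cmds, int nc, const char **excl, int ne)
{
	int ci = 0, cj = 0, ei = 0;

	while (ci < nc && ei < ne) {
		int cmp = strcmp(cmds[ci], excl[ei]);

		if (cmp < 0)
			cmds[cj++] = cmds[ci++];	/* keep, not excluded */
		else if (cmp == 0)
			ci++, ei++;			/* drop the duplicate */
		else
			ei++;				/* exclusion not present */
	}
	while (ci < nc)
		cmds[cj++] = cmds[ci++];
	return cj;
}

int main(void)
{
	const char *cmds[] = { "annotate", "list", "record", "stat" };
	const char *excl[] = { "list", "record" };
	int i, n = exclude(cmds, 4, excl, 2);

	for (i = 0; i < n; i++)
		printf("%s\n", cmds[i]);	/* annotate, stat */
	return 0;
}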
*/
+void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes);
+int is_in_cmdlist(struct cmdnames *c, const char *s);
+void list_commands(const char *title, struct cmdnames *main_cmds,
+		   struct cmdnames *other_cmds);
+
+#endif /* HELP_H */
diff --git a/tools/perf/util/levenshtein.c b/tools/perf/util/levenshtein.c
new file mode 100644
index 00000000000..e521d1516df
--- /dev/null
+++ b/tools/perf/util/levenshtein.c
@@ -0,0 +1,84 @@
+#include "cache.h"
+#include "levenshtein.h"
+
+/*
+ * This function implements the Damerau-Levenshtein algorithm to
+ * calculate a distance between strings.
+ *
+ * Basically, it says how many letters need to be swapped, substituted,
+ * deleted from, or added to string1, at least, to get string2.
+ *
+ * The idea is to build a distance matrix for the substrings of both
+ * strings.  To avoid a large space complexity, only the last three rows
+ * are kept in memory (if swaps had the same or higher cost as one deletion
+ * plus one insertion, only two rows would be needed).
+ *
+ * At any stage, "i + 1" denotes the length of the current substring of
+ * string1 that the distance is calculated for.
+ *
+ * row2 holds the current row, row1 the previous row (i.e. for the substring
+ * of string1 of length "i"), and row0 the row before that.
+ *
+ * In other words, at the start of the big loop, row2[j + 1] contains the
+ * Damerau-Levenshtein distance between the substring of string1 of length
+ * "i" and the substring of string2 of length "j + 1".
+ *
+ * All the big loop does is determine the partial minimum-cost paths.
+ *
+ * It does so by calculating the costs of the path ending in characters
+ * i (in string1) and j (in string2), respectively, given that the last
+ * operation is a substitution, a swap, a deletion, or an insertion.
+ *
+ * This implementation allows the costs to be weighted:
+ *
+ * - w (as in "sWap")
+ * - s (as in "Substitution")
+ * - a (for insertion, AKA "Add")
+ * - d (as in "Deletion")
+ *
+ * Note that this algorithm calculates a distance _iff_ d == a.
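For a feel of the weights, help_unknown_cmd() above calls levenshtein(cmd, name, 0, 2, 1, 4): swaps are free, a substitution costs 2, adding a letter costs 1, and deleting a letter the user typed costs 4, so transposed or slightly misspelled command names rank as very close matches. With a != d the measure is deliberately asymmetric, per the note above. A hypothetical driver against the function defined below:

#include <stdio.h>

int levenshtein(const char *string1, const char *string2,
		int w, int s, int a, int d);

int main(void)
{
	/* "stta" -> "stat" is one adjacent swap: distance 0 with w == 0 */
	printf("%d\n", levenshtein("stta", "stat", 0, 2, 1, 4));
	/* "stut" -> "stat" is one substitution: distance 2 */
	printf("%d\n", levenshtein("stut", "stat", 0, 2, 1, 4));
	/* "stata" -> "stat" deletes one letter typed in excess: distance 4 */
	printf("%d\n", levenshtein("stata", "stat", 0, 2, 1, 4));
	return 0;
}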
+ */
+int levenshtein(const char *string1, const char *string2,
+		int w, int s, int a, int d)
+{
+	int len1 = strlen(string1), len2 = strlen(string2);
+	int *row0 = malloc(sizeof(int) * (len2 + 1));
+	int *row1 = malloc(sizeof(int) * (len2 + 1));
+	int *row2 = malloc(sizeof(int) * (len2 + 1));
+	int i, j;
+
+	for (j = 0; j <= len2; j++)
+		row1[j] = j * a;
+	for (i = 0; i < len1; i++) {
+		int *dummy;
+
+		row2[0] = (i + 1) * d;
+		for (j = 0; j < len2; j++) {
+			/* substitution */
+			row2[j + 1] = row1[j] + s * (string1[i] != string2[j]);
+			/* swap */
+			if (i > 0 && j > 0 && string1[i - 1] == string2[j] &&
+					string1[i] == string2[j - 1] &&
+					row2[j + 1] > row0[j - 1] + w)
+				row2[j + 1] = row0[j - 1] + w;
+			/* deletion */
+			if (row2[j + 1] > row1[j + 1] + d)
+				row2[j + 1] = row1[j + 1] + d;
+			/* insertion */
+			if (row2[j + 1] > row2[j] + a)
+				row2[j + 1] = row2[j] + a;
+		}
+
+		dummy = row0;
+		row0 = row1;
+		row1 = row2;
+		row2 = dummy;
+	}
+
+	i = row1[len2];
+	free(row0);
+	free(row1);
+	free(row2);
+
+	return i;
+}
diff --git a/tools/perf/util/levenshtein.h b/tools/perf/util/levenshtein.h
new file mode 100644
index 00000000000..0173abeef52
--- /dev/null
+++ b/tools/perf/util/levenshtein.h
@@ -0,0 +1,8 @@
+#ifndef LEVENSHTEIN_H
+#define LEVENSHTEIN_H
+
+int levenshtein(const char *string1, const char *string2,
+	int swap_penalty, int substitution_penalty,
+	int insertion_penalty, int deletion_penalty);
+
+#endif
diff --git a/tools/perf/util/list.h b/tools/perf/util/list.h
new file mode 100644
index 00000000000..e2548e8072c
--- /dev/null
+++ b/tools/perf/util/list.h
@@ -0,0 +1,603 @@
+#ifndef _LINUX_LIST_H
+#define _LINUX_LIST_H
+/*
+  Copyright (C) Cast of dozens, comes from the Linux kernel
+
+  This program is free software; you can redistribute it and/or modify it
+  under the terms of version 2 of the GNU General Public License as
+  published by the Free Software Foundation.
+*/
+
+#include <stddef.h>
+
+/*
+ * These are non-NULL pointers that will result in page faults
+ * under normal circumstances, used to verify that nobody uses
+ * non-initialized list entries.
+ */
+#define LIST_POISON1 ((void *)0x00100100)
+#define LIST_POISON2 ((void *)0x00200200)
+
+/**
+ * container_of - cast a member of a structure out to the containing structure
+ * @ptr:	the pointer to the member.
+ * @type:	the type of the container struct this is embedded in.
+ * @member:	the name of the member within the struct.
+ *
+ */
+#define container_of(ptr, type, member) ({			\
+        const typeof( ((type *)0)->member ) *__mptr = (ptr);	\
+        (type *)( (char *)__mptr - offsetof(type,member) );})
+
+/*
+ * Simple doubly linked list implementation.
+ *
+ * Some of the internal functions ("__xxx") are useful when
+ * manipulating whole lists rather than single entries, as
+ * sometimes we already know the next/prev entries and we can
+ * generate better code by using them directly rather than
+ * using the generic single-entry routines.
+ */
+
+struct list_head {
+	struct list_head *next, *prev;
+};
+
+#define LIST_HEAD_INIT(name) { &(name), &(name) }
+
+#define LIST_HEAD(name) \
+	struct list_head name = LIST_HEAD_INIT(name)
+
+static inline void INIT_LIST_HEAD(struct list_head *list)
+{
+	list->next = list;
+	list->prev = list;
+}
+
+/*
+ * Insert a new entry between two known consecutive entries.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
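Everything generic in this header funnels through container_of(): given a pointer to the struct list_head embedded in an object, it recovers the enclosing object by subtracting the member's offset. A standalone illustration (the type and values are made up):

#include <stddef.h>
#include <stdio.h>

struct item {
	int payload;
	struct { void *next, *prev; } link;	/* stand-in for list_head */
};

/* Same arithmetic as the macro above, minus the typeof() type check. */
#define container_of_(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

int main(void)
{
	struct item it = { .payload = 42 };
	void *member_ptr = &it.link;		/* all a list node hands us */
	struct item *back = container_of_(member_ptr, struct item, link);

	printf("%d\n", back->payload);		/* 42 */
	return 0;
}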
+ */
+static inline void __list_add(struct list_head *new,
+			      struct list_head *prev,
+			      struct list_head *next)
+{
+	next->prev = new;
+	new->next = next;
+	new->prev = prev;
+	prev->next = new;
+}
+
+/**
+ * list_add - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it after
+ *
+ * Insert a new entry after the specified head.
+ * This is good for implementing stacks.
+ */
+static inline void list_add(struct list_head *new, struct list_head *head)
+{
+	__list_add(new, head, head->next);
+}
+
+/**
+ * list_add_tail - add a new entry
+ * @new: new entry to be added
+ * @head: list head to add it before
+ *
+ * Insert a new entry before the specified head.
+ * This is useful for implementing queues.
+ */
+static inline void list_add_tail(struct list_head *new, struct list_head *head)
+{
+	__list_add(new, head->prev, head);
+}
+
+/*
+ * Delete a list entry by making the prev/next entries
+ * point to each other.
+ *
+ * This is only for internal list manipulation where we know
+ * the prev/next entries already!
+ */
+static inline void __list_del(struct list_head * prev, struct list_head * next)
+{
+	next->prev = prev;
+	prev->next = next;
+}
+
+/**
+ * list_del - deletes entry from list.
+ * @entry: the element to delete from the list.
+ * Note: list_empty on entry does not return true after this, the entry is
+ * in an undefined state.
+ */
+static inline void list_del(struct list_head *entry)
+{
+	__list_del(entry->prev, entry->next);
+	entry->next = LIST_POISON1;
+	entry->prev = LIST_POISON2;
+}
+
+/**
+ * list_del_range - deletes range of entries from list.
+ * @begin: first element in the range to delete from the list.
+ * @end: last element in the range to delete from the list.
+ * Note: list_empty on the range of entries does not return true after this,
+ * the entries are in an undefined state.
+ */
+static inline void list_del_range(struct list_head *begin,
+				  struct list_head *end)
+{
+	begin->prev->next = end->next;
+	end->next->prev = begin->prev;
+}
+
+/**
+ * list_replace - replace old entry by new one
+ * @old : the element to be replaced
+ * @new : the new element to insert
+ * Note: if 'old' was empty, it will be overwritten.
+ */
+static inline void list_replace(struct list_head *old,
+				struct list_head *new)
+{
+	new->next = old->next;
+	new->next->prev = new;
+	new->prev = old->prev;
+	new->prev->next = new;
+}
+
+static inline void list_replace_init(struct list_head *old,
+					struct list_head *new)
+{
+	list_replace(old, new);
+	INIT_LIST_HEAD(old);
+}
+
+/**
+ * list_del_init - deletes entry from list and reinitializes it.
+ * @entry: the element to delete from the list.
+ */ +static inline void list_del_init(struct list_head *entry) +{ +	__list_del(entry->prev, entry->next); +	INIT_LIST_HEAD(entry); +} + +/** + * list_move - delete from one list and add as another's head + * @list: the entry to move + * @head: the head that will precede our entry + */ +static inline void list_move(struct list_head *list, struct list_head *head) +{ +        __list_del(list->prev, list->next); +        list_add(list, head); +} + +/** + * list_move_tail - delete from one list and add as another's tail + * @list: the entry to move + * @head: the head that will follow our entry + */ +static inline void list_move_tail(struct list_head *list, +				  struct list_head *head) +{ +        __list_del(list->prev, list->next); +        list_add_tail(list, head); +} + +/** + * list_is_last - tests whether @list is the last entry in list @head + * @list: the entry to test + * @head: the head of the list + */ +static inline int list_is_last(const struct list_head *list, +				const struct list_head *head) +{ +	return list->next == head; +} + +/** + * list_empty - tests whether a list is empty + * @head: the list to test. + */ +static inline int list_empty(const struct list_head *head) +{ +	return head->next == head; +} + +/** + * list_empty_careful - tests whether a list is empty and not being modified + * @head: the list to test + * + * Description: + * tests whether a list is empty _and_ checks that no other CPU might be + * in the process of modifying either member (next or prev) + * + * NOTE: using list_empty_careful() without synchronization + * can only be safe if the only activity that can happen + * to the list entry is list_del_init(). Eg. it cannot be used + * if another CPU could re-list_add() it. + */ +static inline int list_empty_careful(const struct list_head *head) +{ +	struct list_head *next = head->next; +	return (next == head) && (next == head->prev); +} + +static inline void __list_splice(struct list_head *list, +				 struct list_head *head) +{ +	struct list_head *first = list->next; +	struct list_head *last = list->prev; +	struct list_head *at = head->next; + +	first->prev = head; +	head->next = first; + +	last->next = at; +	at->prev = last; +} + +/** + * list_splice - join two lists + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static inline void list_splice(struct list_head *list, struct list_head *head) +{ +	if (!list_empty(list)) +		__list_splice(list, head); +} + +/** + * list_splice_init - join two lists and reinitialise the emptied list. + * @list: the new list to add. + * @head: the place to add it in the first list. + * + * The list at @list is reinitialised + */ +static inline void list_splice_init(struct list_head *list, +				    struct list_head *head) +{ +	if (!list_empty(list)) { +		__list_splice(list, head); +		INIT_LIST_HEAD(list); +	} +} + +/** + * list_entry - get the struct for this entry + * @ptr:	the &struct list_head pointer. + * @type:	the type of the struct this is embedded in. + * @member:	the name of the list_struct within the struct. + */ +#define list_entry(ptr, type, member) \ +	container_of(ptr, type, member) + +/** + * list_first_entry - get the first element from a list + * @ptr:       the list head to take the element from. + * @type:      the type of the struct this is embedded in. + * @member:    the name of the list_struct within the struct. + * + * Note, that list is expected to be not empty. 
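Putting the pieces together, a minimal standalone consumer of this header (the struct and values are illustrative), using list_first_entry as defined just below:

#include <stdio.h>
#include "list.h"

struct sample {
	int ip;
	struct list_head node;	/* embedded linkage, no extra allocation */
};

int main(void)
{
	LIST_HEAD(samples);
	struct sample a = { .ip = 1 }, b = { .ip = 2 };
	struct sample *pos;

	list_add_tail(&a.node, &samples);
	list_add_tail(&b.node, &samples);

	if (!list_empty(&samples))	/* list_first_entry needs this check */
		printf("first: %d\n",
		       list_first_entry(&samples, struct sample, node)->ip);

	list_for_each_entry(pos, &samples, node)
		printf("%d\n", pos->ip);	/* 1, then 2 */
	return 0;
}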
+ */ +#define list_first_entry(ptr, type, member) \ +	list_entry((ptr)->next, type, member) + +/** + * list_for_each	-	iterate over a list + * @pos:	the &struct list_head to use as a loop cursor. + * @head:	the head for your list. + */ +#define list_for_each(pos, head) \ +	for (pos = (head)->next; pos != (head); \ +        	pos = pos->next) + +/** + * __list_for_each	-	iterate over a list + * @pos:	the &struct list_head to use as a loop cursor. + * @head:	the head for your list. + * + * This variant differs from list_for_each() in that it's the + * simplest possible list iteration code, no prefetching is done. + * Use this for code that knows the list to be very short (empty + * or 1 entry) most of the time. + */ +#define __list_for_each(pos, head) \ +	for (pos = (head)->next; pos != (head); pos = pos->next) + +/** + * list_for_each_prev	-	iterate over a list backwards + * @pos:	the &struct list_head to use as a loop cursor. + * @head:	the head for your list. + */ +#define list_for_each_prev(pos, head) \ +	for (pos = (head)->prev; pos != (head); \ +        	pos = pos->prev) + +/** + * list_for_each_safe - iterate over a list safe against removal of list entry + * @pos:	the &struct list_head to use as a loop cursor. + * @n:		another &struct list_head to use as temporary storage + * @head:	the head for your list. + */ +#define list_for_each_safe(pos, n, head) \ +	for (pos = (head)->next, n = pos->next; pos != (head); \ +		pos = n, n = pos->next) + +/** + * list_for_each_entry	-	iterate over list of given type + * @pos:	the type * to use as a loop cursor. + * @head:	the head for your list. + * @member:	the name of the list_struct within the struct. + */ +#define list_for_each_entry(pos, head, member)				\ +	for (pos = list_entry((head)->next, typeof(*pos), member);	\ +	     &pos->member != (head); 	\ +	     pos = list_entry(pos->member.next, typeof(*pos), member)) + +/** + * list_for_each_entry_reverse - iterate backwards over list of given type. + * @pos:	the type * to use as a loop cursor. + * @head:	the head for your list. + * @member:	the name of the list_struct within the struct. + */ +#define list_for_each_entry_reverse(pos, head, member)			\ +	for (pos = list_entry((head)->prev, typeof(*pos), member);	\ +	     &pos->member != (head); 	\ +	     pos = list_entry(pos->member.prev, typeof(*pos), member)) + +/** + * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue + * @pos:	the type * to use as a start point + * @head:	the head of the list + * @member:	the name of the list_struct within the struct. + * + * Prepares a pos entry for use as a start point in list_for_each_entry_continue. + */ +#define list_prepare_entry(pos, head, member) \ +	((pos) ? : list_entry(head, typeof(*pos), member)) + +/** + * list_for_each_entry_continue - continue iteration over list of given type + * @pos:	the type * to use as a loop cursor. + * @head:	the head for your list. + * @member:	the name of the list_struct within the struct. + * + * Continue to iterate over list of given type, continuing after + * the current position. + */ +#define list_for_each_entry_continue(pos, head, member) 		\ +	for (pos = list_entry(pos->member.next, typeof(*pos), member);	\ +	     &pos->member != (head);	\ +	     pos = list_entry(pos->member.next, typeof(*pos), member)) + +/** + * list_for_each_entry_from - iterate over list of given type from the current point + * @pos:	the type * to use as a loop cursor. + * @head:	the head for your list. 
+ * @member:	the name of the list_struct within the struct. + * + * Iterate over list of given type, continuing from current position. + */ +#define list_for_each_entry_from(pos, head, member) 			\ +	for (; &pos->member != (head);	\ +	     pos = list_entry(pos->member.next, typeof(*pos), member)) + +/** + * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos:	the type * to use as a loop cursor. + * @n:		another type * to use as temporary storage + * @head:	the head for your list. + * @member:	the name of the list_struct within the struct. + */ +#define list_for_each_entry_safe(pos, n, head, member)			\ +	for (pos = list_entry((head)->next, typeof(*pos), member),	\ +		n = list_entry(pos->member.next, typeof(*pos), member);	\ +	     &pos->member != (head); 					\ +	     pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +/** + * list_for_each_entry_safe_continue + * @pos:	the type * to use as a loop cursor. + * @n:		another type * to use as temporary storage + * @head:	the head for your list. + * @member:	the name of the list_struct within the struct. + * + * Iterate over list of given type, continuing after current point, + * safe against removal of list entry. + */ +#define list_for_each_entry_safe_continue(pos, n, head, member) 		\ +	for (pos = list_entry(pos->member.next, typeof(*pos), member), 		\ +		n = list_entry(pos->member.next, typeof(*pos), member);		\ +	     &pos->member != (head);						\ +	     pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +/** + * list_for_each_entry_safe_from + * @pos:	the type * to use as a loop cursor. + * @n:		another type * to use as temporary storage + * @head:	the head for your list. + * @member:	the name of the list_struct within the struct. + * + * Iterate over list of given type from current point, safe against + * removal of list entry. + */ +#define list_for_each_entry_safe_from(pos, n, head, member) 			\ +	for (n = list_entry(pos->member.next, typeof(*pos), member);		\ +	     &pos->member != (head);						\ +	     pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +/** + * list_for_each_entry_safe_reverse + * @pos:	the type * to use as a loop cursor. + * @n:		another type * to use as temporary storage + * @head:	the head for your list. + * @member:	the name of the list_struct within the struct. + * + * Iterate backwards over list of given type, safe against removal + * of list entry. + */ +#define list_for_each_entry_safe_reverse(pos, n, head, member)		\ +	for (pos = list_entry((head)->prev, typeof(*pos), member),	\ +		n = list_entry(pos->member.prev, typeof(*pos), member);	\ +	     &pos->member != (head); 					\ +	     pos = n, n = list_entry(n->member.prev, typeof(*n), member)) + +/* + * Double linked lists with a single pointer list head. + * Mostly useful for hash tables where the two pointer list head is + * too wasteful. + * You lose the ability to access the tail in O(1). 
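The unusual "pprev" double pointer below (instead of a plain "prev") is what keeps the head down to a single pointer: each node records the address of whatever pointer points at it, whether that is head->first or the previous node's next field, so hlist_del() needs no reference to the head at all. A small standalone hash-bucket sketch (names are illustrative):

#include <stdio.h>
#include "list.h"

struct entry {
	int key;
	struct hlist_node hash;	/* one per hash chain */
};

int main(void)
{
	struct hlist_head bucket = HLIST_HEAD_INIT;
	struct entry a = { .key = 1 }, b = { .key = 2 };
	struct hlist_node *pos;
	struct entry *e;

	hlist_add_head(&a.hash, &bucket);
	hlist_add_head(&b.hash, &bucket);	/* chain is now: b, a */

	hlist_del(&a.hash);	/* works mid-chain, the head is not needed */

	hlist_for_each_entry(e, pos, &bucket, hash)
		printf("%d\n", e->key);		/* prints just 2 */
	return 0;
}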
+ */ + +struct hlist_head { +	struct hlist_node *first; +}; + +struct hlist_node { +	struct hlist_node *next, **pprev; +}; + +#define HLIST_HEAD_INIT { .first = NULL } +#define HLIST_HEAD(name) struct hlist_head name = {  .first = NULL } +#define INIT_HLIST_HEAD(ptr) ((ptr)->first = NULL) +static inline void INIT_HLIST_NODE(struct hlist_node *h) +{ +	h->next = NULL; +	h->pprev = NULL; +} + +static inline int hlist_unhashed(const struct hlist_node *h) +{ +	return !h->pprev; +} + +static inline int hlist_empty(const struct hlist_head *h) +{ +	return !h->first; +} + +static inline void __hlist_del(struct hlist_node *n) +{ +	struct hlist_node *next = n->next; +	struct hlist_node **pprev = n->pprev; +	*pprev = next; +	if (next) +		next->pprev = pprev; +} + +static inline void hlist_del(struct hlist_node *n) +{ +	__hlist_del(n); +	n->next = LIST_POISON1; +	n->pprev = LIST_POISON2; +} + +static inline void hlist_del_init(struct hlist_node *n) +{ +	if (!hlist_unhashed(n)) { +		__hlist_del(n); +		INIT_HLIST_NODE(n); +	} +} + +static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h) +{ +	struct hlist_node *first = h->first; +	n->next = first; +	if (first) +		first->pprev = &n->next; +	h->first = n; +	n->pprev = &h->first; +} + +/* next must be != NULL */ +static inline void hlist_add_before(struct hlist_node *n, +					struct hlist_node *next) +{ +	n->pprev = next->pprev; +	n->next = next; +	next->pprev = &n->next; +	*(n->pprev) = n; +} + +static inline void hlist_add_after(struct hlist_node *n, +					struct hlist_node *next) +{ +	next->next = n->next; +	n->next = next; +	next->pprev = &n->next; + +	if(next->next) +		next->next->pprev  = &next->next; +} + +#define hlist_entry(ptr, type, member) container_of(ptr,type,member) + +#define hlist_for_each(pos, head) \ +	for (pos = (head)->first; pos; \ +	     pos = pos->next) + +#define hlist_for_each_safe(pos, n, head) \ +	for (pos = (head)->first; pos && ({ n = pos->next; 1; }); \ +	     pos = n) + +/** + * hlist_for_each_entry	- iterate over list of given type + * @tpos:	the type * to use as a loop cursor. + * @pos:	the &struct hlist_node to use as a loop cursor. + * @head:	the head for your list. + * @member:	the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry(tpos, pos, head, member)			 \ +	for (pos = (head)->first;					 \ +	     pos && 			 \ +		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ +	     pos = pos->next) + +/** + * hlist_for_each_entry_continue - iterate over a hlist continuing after current point + * @tpos:	the type * to use as a loop cursor. + * @pos:	the &struct hlist_node to use as a loop cursor. + * @member:	the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_continue(tpos, pos, member)		 \ +	for (pos = (pos)->next;						 \ +	     pos && 			 \ +		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ +	     pos = pos->next) + +/** + * hlist_for_each_entry_from - iterate over a hlist continuing from current point + * @tpos:	the type * to use as a loop cursor. + * @pos:	the &struct hlist_node to use as a loop cursor. + * @member:	the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_from(tpos, pos, member)			 \ +	for (; pos && 			 \ +		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ +	     pos = pos->next) + +/** + * hlist_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @tpos:	the type * to use as a loop cursor. 
+ * @pos:	the &struct hlist_node to use as a loop cursor. + * @n:		another &struct hlist_node to use as temporary storage + * @head:	the head for your list. + * @member:	the name of the hlist_node within the struct. + */ +#define hlist_for_each_entry_safe(tpos, pos, n, head, member) 		 \ +	for (pos = (head)->first;					 \ +	     pos && ({ n = pos->next; 1; }) && 				 \ +		({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ +	     pos = n) + +#endif diff --git a/tools/perf/util/pager.c b/tools/perf/util/pager.c new file mode 100644 index 00000000000..a28bccae545 --- /dev/null +++ b/tools/perf/util/pager.c @@ -0,0 +1,99 @@ +#include "cache.h" +#include "run-command.h" +#include "sigchain.h" + +/* + * This is split up from the rest of git so that we can do + * something different on Windows. + */ + +static int spawned_pager; + +#ifndef __MINGW32__ +static void pager_preexec(void) +{ +	/* +	 * Work around bug in "less" by not starting it until we +	 * have real input +	 */ +	fd_set in; + +	FD_ZERO(&in); +	FD_SET(0, &in); +	select(1, &in, NULL, &in, NULL); + +	setenv("LESS", "FRSX", 0); +} +#endif + +static const char *pager_argv[] = { "sh", "-c", NULL, NULL }; +static struct child_process pager_process; + +static void wait_for_pager(void) +{ +	fflush(stdout); +	fflush(stderr); +	/* signal EOF to pager */ +	close(1); +	close(2); +	finish_command(&pager_process); +} + +static void wait_for_pager_signal(int signo) +{ +	wait_for_pager(); +	sigchain_pop(signo); +	raise(signo); +} + +void setup_pager(void) +{ +	const char *pager = getenv("PERF_PAGER"); + +	if (!isatty(1)) +		return; +	if (!pager) { +		if (!pager_program) +			perf_config(perf_default_config, NULL); +		pager = pager_program; +	} +	if (!pager) +		pager = getenv("PAGER"); +	if (!pager) +		pager = "less"; +	else if (!*pager || !strcmp(pager, "cat")) +		return; + +	spawned_pager = 1; /* means we are emitting to terminal */ + +	/* spawn the pager */ +	pager_argv[2] = pager; +	pager_process.argv = pager_argv; +	pager_process.in = -1; +#ifndef __MINGW32__ +	pager_process.preexec_cb = pager_preexec; +#endif +	if (start_command(&pager_process)) +		return; + +	/* original process continues, but writes to the pipe */ +	dup2(pager_process.in, 1); +	if (isatty(2)) +		dup2(pager_process.in, 2); +	close(pager_process.in); + +	/* this makes sure that the parent terminates after the pager */ +	sigchain_push_common(wait_for_pager_signal); +	atexit(wait_for_pager); +} + +int pager_in_use(void) +{ +	const char *env; + +	if (spawned_pager) +		return 1; + +	env = getenv("PERF_PAGER_IN_USE"); +	return env ? 
perf_config_bool("PERF_PAGER_IN_USE", env) : 0; +} diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c new file mode 100644 index 00000000000..9d5f1ca50e6 --- /dev/null +++ b/tools/perf/util/parse-events.c @@ -0,0 +1,316 @@ + +#include "../perf.h" +#include "util.h" +#include "parse-options.h" +#include "parse-events.h" +#include "exec_cmd.h" +#include "string.h" + +extern char *strcasestr(const char *haystack, const char *needle); + +int					nr_counters; + +struct perf_counter_attr		attrs[MAX_COUNTERS]; + +struct event_symbol { +	__u8	type; +	__u64	config; +	char	*symbol; +}; + +#define C(x, y) .type = PERF_TYPE_##x, .config = PERF_COUNT_##y +#define CR(x, y) .type = PERF_TYPE_##x, .config = y + +static struct event_symbol event_symbols[] = { +  { C(HARDWARE, HW_CPU_CYCLES),		"cpu-cycles",		}, +  { C(HARDWARE, HW_CPU_CYCLES),		"cycles",		}, +  { C(HARDWARE, HW_INSTRUCTIONS),	"instructions",		}, +  { C(HARDWARE, HW_CACHE_REFERENCES),	"cache-references",	}, +  { C(HARDWARE, HW_CACHE_MISSES),	"cache-misses",		}, +  { C(HARDWARE, HW_BRANCH_INSTRUCTIONS),"branch-instructions",	}, +  { C(HARDWARE, HW_BRANCH_INSTRUCTIONS),"branches",		}, +  { C(HARDWARE, HW_BRANCH_MISSES),	"branch-misses",	}, +  { C(HARDWARE, HW_BUS_CYCLES),		"bus-cycles",		}, + +  { C(SOFTWARE, SW_CPU_CLOCK),		"cpu-clock",		}, +  { C(SOFTWARE, SW_TASK_CLOCK),		"task-clock",		}, +  { C(SOFTWARE, SW_PAGE_FAULTS),	"page-faults",		}, +  { C(SOFTWARE, SW_PAGE_FAULTS),	"faults",		}, +  { C(SOFTWARE, SW_PAGE_FAULTS_MIN),	"minor-faults",		}, +  { C(SOFTWARE, SW_PAGE_FAULTS_MAJ),	"major-faults",		}, +  { C(SOFTWARE, SW_CONTEXT_SWITCHES),	"context-switches",	}, +  { C(SOFTWARE, SW_CONTEXT_SWITCHES),	"cs",			}, +  { C(SOFTWARE, SW_CPU_MIGRATIONS),	"cpu-migrations",	}, +  { C(SOFTWARE, SW_CPU_MIGRATIONS),	"migrations",		}, +}; + +#define __PERF_COUNTER_FIELD(config, name) \ +	((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT) + +#define PERF_COUNTER_RAW(config)	__PERF_COUNTER_FIELD(config, RAW) +#define PERF_COUNTER_CONFIG(config)	__PERF_COUNTER_FIELD(config, CONFIG) +#define PERF_COUNTER_TYPE(config)	__PERF_COUNTER_FIELD(config, TYPE) +#define PERF_COUNTER_ID(config)		__PERF_COUNTER_FIELD(config, EVENT) + +static char *hw_event_names[] = { +	"cycles", +	"instructions", +	"cache-references", +	"cache-misses", +	"branches", +	"branch-misses", +	"bus-cycles", +}; + +static char *sw_event_names[] = { +	"cpu-clock-ticks", +	"task-clock-ticks", +	"page-faults", +	"context-switches", +	"CPU-migrations", +	"minor-faults", +	"major-faults", +}; + +#define MAX_ALIASES 8 + +static char *hw_cache [][MAX_ALIASES] = { +	{ "L1-data"		, "l1-d", "l1d", "l1"				}, +	{ "L1-instruction"	, "l1-i", "l1i"					}, +	{ "L2"			, "l2"						}, +	{ "Data-TLB"		, "dtlb", "d-tlb"				}, +	{ "Instruction-TLB"	, "itlb", "i-tlb"				}, +	{ "Branch"		, "bpu" , "btb", "bpc"				}, +}; + +static char *hw_cache_op [][MAX_ALIASES] = { +	{ "Load"		, "read"					}, +	{ "Store"		, "write"					}, +	{ "Prefetch"		, "speculative-read", "speculative-load"	}, +}; + +static char *hw_cache_result [][MAX_ALIASES] = { +	{ "Reference"		, "ops", "access"				}, +	{ "Miss"								}, +}; + +char *event_name(int counter) +{ +	__u64 config = attrs[counter].config; +	int type = attrs[counter].type; +	static char buf[32]; + +	if (attrs[counter].type == PERF_TYPE_RAW) { +		sprintf(buf, "raw 0x%llx", config); +		return buf; +	} + +	switch (type) { +	case PERF_TYPE_HARDWARE: +		if (config < PERF_COUNT_HW_MAX) +			return hw_event_names[config]; +		return 
"unknown-hardware"; + +	case PERF_TYPE_HW_CACHE: { +		__u8 cache_type, cache_op, cache_result; +		static char name[100]; + +		cache_type   = (config >>  0) & 0xff; +		if (cache_type > PERF_COUNT_HW_CACHE_MAX) +			return "unknown-ext-hardware-cache-type"; + +		cache_op     = (config >>  8) & 0xff; +		if (cache_op > PERF_COUNT_HW_CACHE_OP_MAX) +			return "unknown-ext-hardware-cache-op"; + +		cache_result = (config >> 16) & 0xff; +		if (cache_result > PERF_COUNT_HW_CACHE_RESULT_MAX) +			return "unknown-ext-hardware-cache-result"; + +		sprintf(name, "%s-Cache-%s-%ses", +			hw_cache[cache_type][0], +			hw_cache_op[cache_op][0], +			hw_cache_result[cache_result][0]); + +		return name; +	} + +	case PERF_TYPE_SOFTWARE: +		if (config < PERF_COUNT_SW_MAX) +			return sw_event_names[config]; +		return "unknown-software"; + +	default: +		break; +	} + +	return "unknown"; +} + +static int parse_aliases(const char *str, char *names[][MAX_ALIASES], int size) +{ +	int i, j; + +	for (i = 0; i < size; i++) { +		for (j = 0; j < MAX_ALIASES; j++) { +			if (!names[i][j]) +				break; +			if (strcasestr(str, names[i][j])) +				return i; +		} +	} + +	return -1; +} + +static int parse_generic_hw_symbols(const char *str, struct perf_counter_attr *attr) +{ +	int cache_type = -1, cache_op = 0, cache_result = 0; + +	cache_type = parse_aliases(str, hw_cache, PERF_COUNT_HW_CACHE_MAX); +	/* +	 * No fallback - if we cannot get a clear cache type +	 * then bail out: +	 */ +	if (cache_type == -1) +		return -EINVAL; + +	cache_op = parse_aliases(str, hw_cache_op, PERF_COUNT_HW_CACHE_OP_MAX); +	/* +	 * Fall back to reads: +	 */ +	if (cache_op == -1) +		cache_op = PERF_COUNT_HW_CACHE_OP_READ; + +	cache_result = parse_aliases(str, hw_cache_result, +					PERF_COUNT_HW_CACHE_RESULT_MAX); +	/* +	 * Fall back to accesses: +	 */ +	if (cache_result == -1) +		cache_result = PERF_COUNT_HW_CACHE_RESULT_ACCESS; + +	attr->config = cache_type | (cache_op << 8) | (cache_result << 16); +	attr->type = PERF_TYPE_HW_CACHE; + +	return 0; +} + +/* + * Each event can have multiple symbolic names. + * Symbolic names are (almost) exactly matched. 
+ */ +static int parse_event_symbols(const char *str, struct perf_counter_attr *attr) +{ +	__u64 config, id; +	int type; +	unsigned int i; +	const char *sep, *pstr; + +	if (str[0] == 'r' && hex2u64(str + 1, &config) > 0) { +		attr->type = PERF_TYPE_RAW; +		attr->config = config; + +		return 0; +	} + +	pstr = str; +	sep = strchr(pstr, ':'); +	if (sep) { +		type = atoi(pstr); +		pstr = sep + 1; +		id = atoi(pstr); +		sep = strchr(pstr, ':'); +		if (sep) { +			pstr = sep + 1; +			if (strchr(pstr, 'k')) +				attr->exclude_user = 1; +			if (strchr(pstr, 'u')) +				attr->exclude_kernel = 1; +		} +		attr->type = type; +		attr->config = id; + +		return 0; +	} + +	for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { +		if (!strncmp(str, event_symbols[i].symbol, +			     strlen(event_symbols[i].symbol))) { + +			attr->type = event_symbols[i].type; +			attr->config = event_symbols[i].config; + +			return 0; +		} +	} + +	return parse_generic_hw_symbols(str, attr); +} + +int parse_events(const struct option *opt, const char *str, int unset) +{ +	struct perf_counter_attr attr; +	int ret; + +	memset(&attr, 0, sizeof(attr)); +again: +	if (nr_counters == MAX_COUNTERS) +		return -1; + +	ret = parse_event_symbols(str, &attr); +	if (ret < 0) +		return ret; + +	attrs[nr_counters] = attr; +	nr_counters++; + +	str = strstr(str, ","); +	if (str) { +		str++; +		goto again; +	} + +	return 0; +} + +static const char * const event_type_descriptors[] = { +	"", +	"Hardware event", +	"Software event", +	"Tracepoint event", +	"Hardware cache event", +}; + +/* + * Print the help text for the event symbols: + */ +void print_events(void) +{ +	struct event_symbol *syms = event_symbols; +	unsigned int i, type, prev_type = -1; + +	fprintf(stderr, "\n"); +	fprintf(stderr, "List of pre-defined events (to be used in -e):\n"); + +	for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) { +		type = syms->type + 1; +		if (type > ARRAY_SIZE(event_type_descriptors)) +			type = 0; + +		if (type != prev_type) +			fprintf(stderr, "\n"); + +		fprintf(stderr, "  %-30s [%s]\n", syms->symbol, +			event_type_descriptors[type]); + +		prev_type = type; +	} + +	fprintf(stderr, "\n"); +	fprintf(stderr, "  %-30s [raw hardware event descriptor]\n", +		"rNNN"); +	fprintf(stderr, "\n"); + +	exit(129); +} diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h new file mode 100644 index 00000000000..e3d552908e6 --- /dev/null +++ b/tools/perf/util/parse-events.h @@ -0,0 +1,17 @@ + +/* + * Parse symbolic events/counts passed in as options: + */ + +extern int			nr_counters; + +extern struct perf_counter_attr attrs[MAX_COUNTERS]; + +extern char *event_name(int ctr); + +extern int parse_events(const struct option *opt, const char *str, int unset); + +#define EVENTS_HELP_MAX (128*1024) + +extern void print_events(void); + diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c new file mode 100644 index 00000000000..b3affb1658d --- /dev/null +++ b/tools/perf/util/parse-options.c @@ -0,0 +1,508 @@ +#include "util.h" +#include "parse-options.h" +#include "cache.h" + +#define OPT_SHORT 1 +#define OPT_UNSET 2 + +static int opterror(const struct option *opt, const char *reason, int flags) +{ +	if (flags & OPT_SHORT) +		return error("switch `%c' %s", opt->short_name, reason); +	if (flags & OPT_UNSET) +		return error("option `no-%s' %s", opt->long_name, reason); +	return error("option `%s' %s", opt->long_name, reason); +} + +static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt, +		   int flags, const char 
**arg) +{ +	if (p->opt) { +		*arg = p->opt; +		p->opt = NULL; +	} else if (p->argc == 1 && (opt->flags & PARSE_OPT_LASTARG_DEFAULT)) { +		*arg = (const char *)opt->defval; +	} else if (p->argc > 1) { +		p->argc--; +		*arg = *++p->argv; +	} else +		return opterror(opt, "requires a value", flags); +	return 0; +} + +static int get_value(struct parse_opt_ctx_t *p, +		     const struct option *opt, int flags) +{ +	const char *s, *arg = NULL; +	const int unset = flags & OPT_UNSET; + +	if (unset && p->opt) +		return opterror(opt, "takes no value", flags); +	if (unset && (opt->flags & PARSE_OPT_NONEG)) +		return opterror(opt, "isn't available", flags); + +	if (!(flags & OPT_SHORT) && p->opt) { +		switch (opt->type) { +		case OPTION_CALLBACK: +			if (!(opt->flags & PARSE_OPT_NOARG)) +				break; +			/* FALLTHROUGH */ +		case OPTION_BOOLEAN: +		case OPTION_BIT: +		case OPTION_SET_INT: +		case OPTION_SET_PTR: +			return opterror(opt, "takes no value", flags); +		default: +			break; +		} +	} + +	switch (opt->type) { +	case OPTION_BIT: +		if (unset) +			*(int *)opt->value &= ~opt->defval; +		else +			*(int *)opt->value |= opt->defval; +		return 0; + +	case OPTION_BOOLEAN: +		*(int *)opt->value = unset ? 0 : *(int *)opt->value + 1; +		return 0; + +	case OPTION_SET_INT: +		*(int *)opt->value = unset ? 0 : opt->defval; +		return 0; + +	case OPTION_SET_PTR: +		*(void **)opt->value = unset ? NULL : (void *)opt->defval; +		return 0; + +	case OPTION_STRING: +		if (unset) +			*(const char **)opt->value = NULL; +		else if (opt->flags & PARSE_OPT_OPTARG && !p->opt) +			*(const char **)opt->value = (const char *)opt->defval; +		else +			return get_arg(p, opt, flags, (const char **)opt->value); +		return 0; + +	case OPTION_CALLBACK: +		if (unset) +			return (*opt->callback)(opt, NULL, 1) ? (-1) : 0; +		if (opt->flags & PARSE_OPT_NOARG) +			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0; +		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) +			return (*opt->callback)(opt, NULL, 0) ? (-1) : 0; +		if (get_arg(p, opt, flags, &arg)) +			return -1; +		return (*opt->callback)(opt, arg, 0) ? (-1) : 0; + +	case OPTION_INTEGER: +		if (unset) { +			*(int *)opt->value = 0; +			return 0; +		} +		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { +			*(int *)opt->value = opt->defval; +			return 0; +		} +		if (get_arg(p, opt, flags, &arg)) +			return -1; +		*(int *)opt->value = strtol(arg, (char **)&s, 10); +		if (*s) +			return opterror(opt, "expects a numerical value", flags); +		return 0; + +	case OPTION_LONG: +		if (unset) { +			*(long *)opt->value = 0; +			return 0; +		} +		if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { +			*(long *)opt->value = opt->defval; +			return 0; +		} +		if (get_arg(p, opt, flags, &arg)) +			return -1; +		*(long *)opt->value = strtol(arg, (char **)&s, 10); +		if (*s) +			return opterror(opt, "expects a numerical value", flags); +		return 0; + +	default: +		die("should not happen, someone must be hit on the forehead"); +	} +} + +static int parse_short_opt(struct parse_opt_ctx_t *p, const struct option *options) +{ +	for (; options->type != OPTION_END; options++) { +		if (options->short_name == *p->opt) { +			p->opt = p->opt[1] ? 
p->opt + 1 : NULL; +			return get_value(p, options, OPT_SHORT); +		} +	} +	return -2; +} + +static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg, +                          const struct option *options) +{ +	const char *arg_end = strchr(arg, '='); +	const struct option *abbrev_option = NULL, *ambiguous_option = NULL; +	int abbrev_flags = 0, ambiguous_flags = 0; + +	if (!arg_end) +		arg_end = arg + strlen(arg); + +	for (; options->type != OPTION_END; options++) { +		const char *rest; +		int flags = 0; + +		if (!options->long_name) +			continue; + +		rest = skip_prefix(arg, options->long_name); +		if (options->type == OPTION_ARGUMENT) { +			if (!rest) +				continue; +			if (*rest == '=') +				return opterror(options, "takes no value", flags); +			if (*rest) +				continue; +			p->out[p->cpidx++] = arg - 2; +			return 0; +		} +		if (!rest) { +			/* abbreviated? */ +			if (!strncmp(options->long_name, arg, arg_end - arg)) { +is_abbreviated: +				if (abbrev_option) { +					/* +					 * If this is abbreviated, it is +					 * ambiguous. So when there is no +					 * exact match later, we need to +					 * error out. +					 */ +					ambiguous_option = abbrev_option; +					ambiguous_flags = abbrev_flags; +				} +				if (!(flags & OPT_UNSET) && *arg_end) +					p->opt = arg_end + 1; +				abbrev_option = options; +				abbrev_flags = flags; +				continue; +			} +			/* negated and abbreviated very much? */ +			if (!prefixcmp("no-", arg)) { +				flags |= OPT_UNSET; +				goto is_abbreviated; +			} +			/* negated? */ +			if (strncmp(arg, "no-", 3)) +				continue; +			flags |= OPT_UNSET; +			rest = skip_prefix(arg + 3, options->long_name); +			/* abbreviated and negated? */ +			if (!rest && !prefixcmp(options->long_name, arg + 3)) +				goto is_abbreviated; +			if (!rest) +				continue; +		} +		if (*rest) { +			if (*rest != '=') +				continue; +			p->opt = rest + 1; +		} +		return get_value(p, options, flags); +	} + +	if (ambiguous_option) +		return error("Ambiguous option: %s " +			"(could be --%s%s or --%s%s)", +			arg, +			(ambiguous_flags & OPT_UNSET) ?  "no-" : "", +			ambiguous_option->long_name, +			(abbrev_flags & OPT_UNSET) ?  
"no-" : "", +			abbrev_option->long_name); +	if (abbrev_option) +		return get_value(p, abbrev_option, abbrev_flags); +	return -2; +} + +static void check_typos(const char *arg, const struct option *options) +{ +	if (strlen(arg) < 3) +		return; + +	if (!prefixcmp(arg, "no-")) { +		error ("did you mean `--%s` (with two dashes ?)", arg); +		exit(129); +	} + +	for (; options->type != OPTION_END; options++) { +		if (!options->long_name) +			continue; +		if (!prefixcmp(options->long_name, arg)) { +			error ("did you mean `--%s` (with two dashes ?)", arg); +			exit(129); +		} +	} +} + +void parse_options_start(struct parse_opt_ctx_t *ctx, +			 int argc, const char **argv, int flags) +{ +	memset(ctx, 0, sizeof(*ctx)); +	ctx->argc = argc - 1; +	ctx->argv = argv + 1; +	ctx->out  = argv; +	ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0); +	ctx->flags = flags; +	if ((flags & PARSE_OPT_KEEP_UNKNOWN) && +	    (flags & PARSE_OPT_STOP_AT_NON_OPTION)) +		die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together"); +} + +static int usage_with_options_internal(const char * const *, +				       const struct option *, int); + +int parse_options_step(struct parse_opt_ctx_t *ctx, +		       const struct option *options, +		       const char * const usagestr[]) +{ +	int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP); + +	/* we must reset ->opt, unknown short option leave it dangling */ +	ctx->opt = NULL; + +	for (; ctx->argc; ctx->argc--, ctx->argv++) { +		const char *arg = ctx->argv[0]; + +		if (*arg != '-' || !arg[1]) { +			if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION) +				break; +			ctx->out[ctx->cpidx++] = ctx->argv[0]; +			continue; +		} + +		if (arg[1] != '-') { +			ctx->opt = arg + 1; +			if (internal_help && *ctx->opt == 'h') +				return parse_options_usage(usagestr, options); +			switch (parse_short_opt(ctx, options)) { +			case -1: +				return parse_options_usage(usagestr, options); +			case -2: +				goto unknown; +			} +			if (ctx->opt) +				check_typos(arg + 1, options); +			while (ctx->opt) { +				if (internal_help && *ctx->opt == 'h') +					return parse_options_usage(usagestr, options); +				switch (parse_short_opt(ctx, options)) { +				case -1: +					return parse_options_usage(usagestr, options); +				case -2: +					/* fake a short option thing to hide the fact that we may have +					 * started to parse aggregated stuff +					 * +					 * This is leaky, too bad. 
+					 */ +					ctx->argv[0] = strdup(ctx->opt - 1); +					*(char *)ctx->argv[0] = '-'; +					goto unknown; +				} +			} +			continue; +		} + +		if (!arg[2]) { /* "--" */ +			if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) { +				ctx->argc--; +				ctx->argv++; +			} +			break; +		} + +		if (internal_help && !strcmp(arg + 2, "help-all")) +			return usage_with_options_internal(usagestr, options, 1); +		if (internal_help && !strcmp(arg + 2, "help")) +			return parse_options_usage(usagestr, options); +		switch (parse_long_opt(ctx, arg + 2, options)) { +		case -1: +			return parse_options_usage(usagestr, options); +		case -2: +			goto unknown; +		} +		continue; +unknown: +		if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN)) +			return PARSE_OPT_UNKNOWN; +		ctx->out[ctx->cpidx++] = ctx->argv[0]; +		ctx->opt = NULL; +	} +	return PARSE_OPT_DONE; +} + +int parse_options_end(struct parse_opt_ctx_t *ctx) +{ +	memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out)); +	ctx->out[ctx->cpidx + ctx->argc] = NULL; +	return ctx->cpidx + ctx->argc; +} + +int parse_options(int argc, const char **argv, const struct option *options, +		  const char * const usagestr[], int flags) +{ +	struct parse_opt_ctx_t ctx; + +	parse_options_start(&ctx, argc, argv, flags); +	switch (parse_options_step(&ctx, options, usagestr)) { +	case PARSE_OPT_HELP: +		exit(129); +	case PARSE_OPT_DONE: +		break; +	default: /* PARSE_OPT_UNKNOWN */ +		if (ctx.argv[0][1] == '-') { +			error("unknown option `%s'", ctx.argv[0] + 2); +		} else { +			error("unknown switch `%c'", *ctx.opt); +		} +		usage_with_options(usagestr, options); +	} + +	return parse_options_end(&ctx); +} + +#define USAGE_OPTS_WIDTH 24 +#define USAGE_GAP         2 + +int usage_with_options_internal(const char * const *usagestr, +				const struct option *opts, int full) +{ +	if (!usagestr) +		return PARSE_OPT_HELP; + +	fprintf(stderr, "\n usage: %s\n", *usagestr++); +	while (*usagestr && **usagestr) +		fprintf(stderr, "    or: %s\n", *usagestr++); +	while (*usagestr) { +		fprintf(stderr, "%s%s\n", +				**usagestr ? 
"    " : "", +				*usagestr); +		usagestr++; +	} + +	if (opts->type != OPTION_GROUP) +		fputc('\n', stderr); + +	for (; opts->type != OPTION_END; opts++) { +		size_t pos; +		int pad; + +		if (opts->type == OPTION_GROUP) { +			fputc('\n', stderr); +			if (*opts->help) +				fprintf(stderr, "%s\n", opts->help); +			continue; +		} +		if (!full && (opts->flags & PARSE_OPT_HIDDEN)) +			continue; + +		pos = fprintf(stderr, "    "); +		if (opts->short_name) +			pos += fprintf(stderr, "-%c", opts->short_name); +		if (opts->long_name && opts->short_name) +			pos += fprintf(stderr, ", "); +		if (opts->long_name) +			pos += fprintf(stderr, "--%s", opts->long_name); + +		switch (opts->type) { +		case OPTION_ARGUMENT: +			break; +		case OPTION_INTEGER: +			if (opts->flags & PARSE_OPT_OPTARG) +				if (opts->long_name) +					pos += fprintf(stderr, "[=<n>]"); +				else +					pos += fprintf(stderr, "[<n>]"); +			else +				pos += fprintf(stderr, " <n>"); +			break; +		case OPTION_CALLBACK: +			if (opts->flags & PARSE_OPT_NOARG) +				break; +			/* FALLTHROUGH */ +		case OPTION_STRING: +			if (opts->argh) { +				if (opts->flags & PARSE_OPT_OPTARG) +					if (opts->long_name) +						pos += fprintf(stderr, "[=<%s>]", opts->argh); +					else +						pos += fprintf(stderr, "[<%s>]", opts->argh); +				else +					pos += fprintf(stderr, " <%s>", opts->argh); +			} else { +				if (opts->flags & PARSE_OPT_OPTARG) +					if (opts->long_name) +						pos += fprintf(stderr, "[=...]"); +					else +						pos += fprintf(stderr, "[...]"); +				else +					pos += fprintf(stderr, " ..."); +			} +			break; +		default: /* OPTION_{BIT,BOOLEAN,SET_INT,SET_PTR} */ +			break; +		} + +		if (pos <= USAGE_OPTS_WIDTH) +			pad = USAGE_OPTS_WIDTH - pos; +		else { +			fputc('\n', stderr); +			pad = USAGE_OPTS_WIDTH; +		} +		fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help); +	} +	fputc('\n', stderr); + +	return PARSE_OPT_HELP; +} + +void usage_with_options(const char * const *usagestr, +			const struct option *opts) +{ +	usage_with_options_internal(usagestr, opts, 0); +	exit(129); +} + +int parse_options_usage(const char * const *usagestr, +			const struct option *opts) +{ +	return usage_with_options_internal(usagestr, opts, 0); +} + + +int parse_opt_verbosity_cb(const struct option *opt, const char *arg, +			   int unset) +{ +	int *target = opt->value; + +	if (unset) +		/* --no-quiet, --no-verbose */ +		*target = 0; +	else if (opt->short_name == 'v') { +		if (*target >= 0) +			(*target)++; +		else +			*target = 1; +	} else { +		if (*target <= 0) +			(*target)--; +		else +			*target = -1; +	} +	return 0; +} diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h new file mode 100644 index 00000000000..a1039a6ce0e --- /dev/null +++ b/tools/perf/util/parse-options.h @@ -0,0 +1,174 @@ +#ifndef PARSE_OPTIONS_H +#define PARSE_OPTIONS_H + +enum parse_opt_type { +	/* special types */ +	OPTION_END, +	OPTION_ARGUMENT, +	OPTION_GROUP, +	/* options with no arguments */ +	OPTION_BIT, +	OPTION_BOOLEAN, /* _INCR would have been a better name */ +	OPTION_SET_INT, +	OPTION_SET_PTR, +	/* options with arguments (usually) */ +	OPTION_STRING, +	OPTION_INTEGER, +	OPTION_LONG, +	OPTION_CALLBACK, +}; + +enum parse_opt_flags { +	PARSE_OPT_KEEP_DASHDASH = 1, +	PARSE_OPT_STOP_AT_NON_OPTION = 2, +	PARSE_OPT_KEEP_ARGV0 = 4, +	PARSE_OPT_KEEP_UNKNOWN = 8, +	PARSE_OPT_NO_INTERNAL_HELP = 16, +}; + +enum parse_opt_option_flags { +	PARSE_OPT_OPTARG  = 1, +	PARSE_OPT_NOARG   = 2, +	PARSE_OPT_NONEG   = 4, +	PARSE_OPT_HIDDEN  = 8, +	
PARSE_OPT_LASTARG_DEFAULT = 16,
+};
+
+struct option;
+typedef int parse_opt_cb(const struct option *, const char *arg, int unset);
+
+/*
+ * `type`::
+ *   holds the type of the option, you must have an OPTION_END last in your
+ *   array.
+ *
+ * `short_name`::
+ *   the character to use as a short option name, '\0' if none.
+ *
+ * `long_name`::
+ *   the long option name, without the leading dashes, NULL if none.
+ *
+ * `value`::
+ *   stores pointers to the values to be filled.
+ *
+ * `argh`::
+ *   token to explain the kind of argument this option wants. Keep it
+ *   homogeneous across the repository.
+ *
+ * `help`::
+ *   the short help associated with what the option does.
+ *   Must never be NULL (except for OPTION_END).
+ *   OPTION_GROUP uses this pointer to store the group header.
+ *
+ * `flags`::
+ *   mask of parse_opt_option_flags.
+ *   PARSE_OPT_OPTARG: says that the argument is optional (not for BOOLEANs)
+ *   PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs
+ *   PARSE_OPT_NONEG: says that this option cannot be negated
+ *   PARSE_OPT_HIDDEN: this option is skipped in the default usage, and shown
+ *                     only in the long one.
+ *
+ * `callback`::
+ *   pointer to the callback to use for OPTION_CALLBACK.
+ *
+ * `defval`::
+ *   default value to fill (*->value) with for PARSE_OPT_OPTARG.
+ *   OPTION_{BIT,SET_INT,SET_PTR} store the {mask,integer,pointer} to put in
+ *   the value when met.
+ *   CALLBACKS can use it as they want.
+ */
+struct option {
+	enum parse_opt_type type;
+	int short_name;
+	const char *long_name;
+	void *value;
+	const char *argh;
+	const char *help;
+
+	int flags;
+	parse_opt_cb *callback;
+	intptr_t defval;
+};
+
+#define OPT_END()                   { OPTION_END }
+#define OPT_ARGUMENT(l, h)          { OPTION_ARGUMENT, 0, (l), NULL, NULL, (h) }
+#define OPT_GROUP(h)                { OPTION_GROUP, 0, NULL, NULL, NULL, (h) }
+#define OPT_BIT(s, l, v, h, b)      { OPTION_BIT, (s), (l), (v), NULL, (h), 0, NULL, (b) }
+#define OPT_BOOLEAN(s, l, v, h)     { OPTION_BOOLEAN, (s), (l), (v), NULL, (h) }
+#define OPT_SET_INT(s, l, v, h, i)  { OPTION_SET_INT, (s), (l), (v), NULL, (h), 0, NULL, (i) }
+#define OPT_SET_PTR(s, l, v, h, p)  { OPTION_SET_PTR, (s), (l), (v), NULL, (h), 0, NULL, (p) }
+#define OPT_INTEGER(s, l, v, h)     { OPTION_INTEGER, (s), (l), (v), NULL, (h) }
+#define OPT_LONG(s, l, v, h)        { OPTION_LONG, (s), (l), (v), NULL, (h) }
+#define OPT_STRING(s, l, v, a, h)   { OPTION_STRING,  (s), (l), (v), (a), (h) }
+#define OPT_DATE(s, l, v, h) \
+	{ OPTION_CALLBACK, (s), (l), (v), "time", (h), 0, \
+	  parse_opt_approxidate_cb }
+#define OPT_CALLBACK(s, l, v, a, h, f) \
+	{ OPTION_CALLBACK, (s), (l), (v), (a), (h), 0, (f) }
+
+/* parse_options() will filter out the processed options and leave the
+ * non-option arguments in argv[].
+ * Returns the number of arguments left in argv[].
+ */
+extern int parse_options(int argc, const char **argv,
+                         const struct option *options,
+                         const char * const usagestr[], int flags);
+
+extern NORETURN void usage_with_options(const char * const *usagestr,
+                                        const struct option *options);
+
+/*----- incremental advanced APIs -----*/
+
+enum {
+	PARSE_OPT_HELP = -1,
+	PARSE_OPT_DONE,
+	PARSE_OPT_UNKNOWN,
+};
+
+/*
+ * It's okay for the caller to consume argv/argc in the usual way.
+ * Other fields of that structure are private to parse-options and should not
+ * be modified in any way.
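+ *
+ * A minimal sketch of this step-wise flow (hypothetical caller code; it
+ * simply mirrors what parse_options() itself does in parse-options.c):
+ *
+ *   struct parse_opt_ctx_t ctx;
+ *
+ *   parse_options_start(&ctx, argc, argv, 0);
+ *   switch (parse_options_step(&ctx, options, usagestr)) {
+ *   case PARSE_OPT_HELP:
+ *           exit(129);
+ *   case PARSE_OPT_DONE:
+ *           break;
+ *   default:
+ *           handle PARSE_OPT_UNKNOWN here
+ *   }
+ *   argc = parse_options_end(&ctx);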
+ */ +struct parse_opt_ctx_t { +	const char **argv; +	const char **out; +	int argc, cpidx; +	const char *opt; +	int flags; +}; + +extern int parse_options_usage(const char * const *usagestr, +			       const struct option *opts); + +extern void parse_options_start(struct parse_opt_ctx_t *ctx, +				int argc, const char **argv, int flags); + +extern int parse_options_step(struct parse_opt_ctx_t *ctx, +			      const struct option *options, +			      const char * const usagestr[]); + +extern int parse_options_end(struct parse_opt_ctx_t *ctx); + + +/*----- some often used options -----*/ +extern int parse_opt_abbrev_cb(const struct option *, const char *, int); +extern int parse_opt_approxidate_cb(const struct option *, const char *, int); +extern int parse_opt_verbosity_cb(const struct option *, const char *, int); + +#define OPT__VERBOSE(var)  OPT_BOOLEAN('v', "verbose", (var), "be verbose") +#define OPT__QUIET(var)    OPT_BOOLEAN('q', "quiet",   (var), "be quiet") +#define OPT__VERBOSITY(var) \ +	{ OPTION_CALLBACK, 'v', "verbose", (var), NULL, "be more verbose", \ +	  PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }, \ +	{ OPTION_CALLBACK, 'q', "quiet", (var), NULL, "be more quiet", \ +	  PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 } +#define OPT__DRY_RUN(var)  OPT_BOOLEAN('n', "dry-run", (var), "dry run") +#define OPT__ABBREV(var)  \ +	{ OPTION_CALLBACK, 0, "abbrev", (var), "n", \ +	  "use <n> digits to display SHA-1s", \ +	  PARSE_OPT_OPTARG, &parse_opt_abbrev_cb, 0 } + +extern const char *parse_options_fix_filename(const char *prefix, const char *file); + +#endif diff --git a/tools/perf/util/path.c b/tools/perf/util/path.c new file mode 100644 index 00000000000..a501a40dd2c --- /dev/null +++ b/tools/perf/util/path.c @@ -0,0 +1,353 @@ +/* + * I'm tired of doing "vsnprintf()" etc just to open a + * file, so here's a "return static buffer with printf" + * interface for paths. + * + * It's obviously not thread-safe. Sue me. But it's quite + * useful for doing things like + * + *   f = open(mkpath("%s/%s.perf", base, name), O_RDONLY); + * + * which is what it's designed for. + */ +#include "cache.h" + +static char bad_path[] = "/bad-path/"; +/* + * Two hacks: + */ + +static char *get_perf_dir(void) +{ +	return "."; +} + +size_t strlcpy(char *dest, const char *src, size_t size) +{ +	size_t ret = strlen(src); + +	if (size) { +		size_t len = (ret >= size) ? size - 1 : ret; +		memcpy(dest, src, len); +		dest[len] = '\0'; +	} +	return ret; +} + + +static char *get_pathname(void) +{ +	static char pathname_array[4][PATH_MAX]; +	static int index; +	return pathname_array[3 & ++index]; +} + +static char *cleanup_path(char *path) +{ +	/* Clean it up */ +	if (!memcmp(path, "./", 2)) { +		path += 2; +		while (*path == '/') +			path++; +	} +	return path; +} + +char *mksnpath(char *buf, size_t n, const char *fmt, ...) 
+{ +	va_list args; +	unsigned len; + +	va_start(args, fmt); +	len = vsnprintf(buf, n, fmt, args); +	va_end(args); +	if (len >= n) { +		strlcpy(buf, bad_path, n); +		return buf; +	} +	return cleanup_path(buf); +} + +static char *perf_vsnpath(char *buf, size_t n, const char *fmt, va_list args) +{ +	const char *perf_dir = get_perf_dir(); +	size_t len; + +	len = strlen(perf_dir); +	if (n < len + 1) +		goto bad; +	memcpy(buf, perf_dir, len); +	if (len && !is_dir_sep(perf_dir[len-1])) +		buf[len++] = '/'; +	len += vsnprintf(buf + len, n - len, fmt, args); +	if (len >= n) +		goto bad; +	return cleanup_path(buf); +bad: +	strlcpy(buf, bad_path, n); +	return buf; +} + +char *perf_snpath(char *buf, size_t n, const char *fmt, ...) +{ +	va_list args; +	va_start(args, fmt); +	(void)perf_vsnpath(buf, n, fmt, args); +	va_end(args); +	return buf; +} + +char *perf_pathdup(const char *fmt, ...) +{ +	char path[PATH_MAX]; +	va_list args; +	va_start(args, fmt); +	(void)perf_vsnpath(path, sizeof(path), fmt, args); +	va_end(args); +	return xstrdup(path); +} + +char *mkpath(const char *fmt, ...) +{ +	va_list args; +	unsigned len; +	char *pathname = get_pathname(); + +	va_start(args, fmt); +	len = vsnprintf(pathname, PATH_MAX, fmt, args); +	va_end(args); +	if (len >= PATH_MAX) +		return bad_path; +	return cleanup_path(pathname); +} + +char *perf_path(const char *fmt, ...) +{ +	const char *perf_dir = get_perf_dir(); +	char *pathname = get_pathname(); +	va_list args; +	unsigned len; + +	len = strlen(perf_dir); +	if (len > PATH_MAX-100) +		return bad_path; +	memcpy(pathname, perf_dir, len); +	if (len && perf_dir[len-1] != '/') +		pathname[len++] = '/'; +	va_start(args, fmt); +	len += vsnprintf(pathname + len, PATH_MAX - len, fmt, args); +	va_end(args); +	if (len >= PATH_MAX) +		return bad_path; +	return cleanup_path(pathname); +} + + +/* perf_mkstemp() - create tmp file honoring TMPDIR variable */ +int perf_mkstemp(char *path, size_t len, const char *template) +{ +	const char *tmp; +	size_t n; + +	tmp = getenv("TMPDIR"); +	if (!tmp) +		tmp = "/tmp"; +	n = snprintf(path, len, "%s/%s", tmp, template); +	if (len <= n) { +		errno = ENAMETOOLONG; +		return -1; +	} +	return mkstemp(path); +} + + +const char *make_relative_path(const char *abs, const char *base) +{ +	static char buf[PATH_MAX + 1]; +	int baselen; +	if (!base) +		return abs; +	baselen = strlen(base); +	if (prefixcmp(abs, base)) +		return abs; +	if (abs[baselen] == '/') +		baselen++; +	else if (base[baselen - 1] != '/') +		return abs; +	strcpy(buf, abs + baselen); +	return buf; +} + +/* + * It is okay if dst == src, but they should not overlap otherwise. + * + * Performs the following normalizations on src, storing the result in dst: + * - Ensures that components are separated by '/' (Windows only) + * - Squashes sequences of '/'. + * - Removes "." components. + * - Removes ".." components, and the components the precede them. + * Returns failure (non-zero) if a ".." component appears as first path + * component anytime during the normalization. Otherwise, returns success (0). + * + * Note that this function is purely textual.  It does not follow symlinks, + * verify the existence of the path, or make any system calls. + */ +int normalize_path_copy(char *dst, const char *src) +{ +	char *dst0; + +	if (has_dos_drive_prefix(src)) { +		*dst++ = *src++; +		*dst++ = *src++; +	} +	dst0 = dst; + +	if (is_dir_sep(*src)) { +		*dst++ = '/'; +		while (is_dir_sep(*src)) +			src++; +	} + +	for (;;) { +		char c = *src; + +		/* +		 * A path component that begins with . 
could be +		 * special: +		 * (1) "." and ends   -- ignore and terminate. +		 * (2) "./"           -- ignore them, eat slash and continue. +		 * (3) ".." and ends  -- strip one and terminate. +		 * (4) "../"          -- strip one, eat slash and continue. +		 */ +		if (c == '.') { +			if (!src[1]) { +				/* (1) */ +				src++; +			} else if (is_dir_sep(src[1])) { +				/* (2) */ +				src += 2; +				while (is_dir_sep(*src)) +					src++; +				continue; +			} else if (src[1] == '.') { +				if (!src[2]) { +					/* (3) */ +					src += 2; +					goto up_one; +				} else if (is_dir_sep(src[2])) { +					/* (4) */ +					src += 3; +					while (is_dir_sep(*src)) +						src++; +					goto up_one; +				} +			} +		} + +		/* copy up to the next '/', and eat all '/' */ +		while ((c = *src++) != '\0' && !is_dir_sep(c)) +			*dst++ = c; +		if (is_dir_sep(c)) { +			*dst++ = '/'; +			while (is_dir_sep(c)) +				c = *src++; +			src--; +		} else if (!c) +			break; +		continue; + +	up_one: +		/* +		 * dst0..dst is prefix portion, and dst[-1] is '/'; +		 * go up one level. +		 */ +		dst--;	/* go to trailing '/' */ +		if (dst <= dst0) +			return -1; +		/* Windows: dst[-1] cannot be backslash anymore */ +		while (dst0 < dst && dst[-1] != '/') +			dst--; +	} +	*dst = '\0'; +	return 0; +} + +/* + * path = Canonical absolute path + * prefix_list = Colon-separated list of absolute paths + * + * Determines, for each path in prefix_list, whether the "prefix" really + * is an ancestor directory of path.  Returns the length of the longest + * ancestor directory, excluding any trailing slashes, or -1 if no prefix + * is an ancestor.  (Note that this means 0 is returned if prefix_list is + * "/".) "/foo" is not considered an ancestor of "/foobar".  Directories + * are not considered to be their own ancestors.  path must be in a + * canonical form: empty components, or "." or ".." components are not + * allowed.  prefix_list may be null, which is like "". + */ +int longest_ancestor_length(const char *path, const char *prefix_list) +{ +	char buf[PATH_MAX+1]; +	const char *ceil, *colon; +	int len, max_len = -1; + +	if (prefix_list == NULL || !strcmp(path, "/")) +		return -1; + +	for (colon = ceil = prefix_list; *colon; ceil = colon+1) { +		for (colon = ceil; *colon && *colon != PATH_SEP; colon++); +		len = colon - ceil; +		if (len == 0 || len > PATH_MAX || !is_absolute_path(ceil)) +			continue; +		strlcpy(buf, ceil, len+1); +		if (normalize_path_copy(buf, buf) < 0) +			continue; +		len = strlen(buf); +		if (len > 0 && buf[len-1] == '/') +			buf[--len] = '\0'; + +		if (!strncmp(path, buf, len) && +		    path[len] == '/' && +		    len > max_len) { +			max_len = len; +		} +	} + +	return max_len; +} + +/* strip arbitrary amount of directory separators at end of path */ +static inline int chomp_trailing_dir_sep(const char *path, int len) +{ +	while (len && is_dir_sep(path[len - 1])) +		len--; +	return len; +} + +/* + * If path ends with suffix (complete path components), returns the + * part before suffix (sans trailing directory separators). + * Otherwise returns NULL. 
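+ *
+ * Illustrative examples:
+ *   strip_path_suffix("/usr/lib/perf", "lib/perf") returns "/usr";
+ *   strip_path_suffix("/usr/libexec", "exec") returns NULL, because
+ *   "exec" matches only part of the "libexec" component.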
+ */
+char *strip_path_suffix(const char *path, const char *suffix)
+{
+	int path_len = strlen(path), suffix_len = strlen(suffix);
+
+	while (suffix_len) {
+		if (!path_len)
+			return NULL;
+
+		if (is_dir_sep(path[path_len - 1])) {
+			if (!is_dir_sep(suffix[suffix_len - 1]))
+				return NULL;
+			path_len = chomp_trailing_dir_sep(path, path_len);
+			suffix_len = chomp_trailing_dir_sep(suffix, suffix_len);
+		}
+		else if (path[--path_len] != suffix[--suffix_len])
+			return NULL;
+	}
+
+	if (path_len && !is_dir_sep(path[path_len - 1]))
+		return NULL;
+	return xstrndup(path, chomp_trailing_dir_sep(path, path_len));
+}
diff --git a/tools/perf/util/quote.c b/tools/perf/util/quote.c
new file mode 100644
index 00000000000..f18c5212bc9
--- /dev/null
+++ b/tools/perf/util/quote.c
@@ -0,0 +1,481 @@
+#include "cache.h"
+#include "quote.h"
+
+int quote_path_fully = 1;
+
+/* Help to copy the thing properly quoted for the shell safety.
+ * any single quote is replaced with '\'', any exclamation point
+ * is replaced with '\!', and the whole thing is enclosed in a
+ * single quote pair.
+ *
+ * E.g.
+ *  original     sq_quote     result
+ *  name     ==> name      ==> 'name'
+ *  a b      ==> a b       ==> 'a b'
+ *  a'b      ==> a'\''b    ==> 'a'\''b'
+ *  a!b      ==> a'\!'b    ==> 'a'\!'b'
+ */
+static inline int need_bs_quote(char c)
+{
+	return (c == '\'' || c == '!');
+}
+
+void sq_quote_buf(struct strbuf *dst, const char *src)
+{
+	char *to_free = NULL;
+
+	if (dst->buf == src)
+		to_free = strbuf_detach(dst, NULL);
+
+	strbuf_addch(dst, '\'');
+	while (*src) {
+		size_t len = strcspn(src, "'!");
+		strbuf_add(dst, src, len);
+		src += len;
+		while (need_bs_quote(*src)) {
+			strbuf_addstr(dst, "'\\");
+			strbuf_addch(dst, *src++);
+			strbuf_addch(dst, '\'');
+		}
+	}
+	strbuf_addch(dst, '\'');
+	free(to_free);
+}
+
+void sq_quote_print(FILE *stream, const char *src)
+{
+	char c;
+
+	fputc('\'', stream);
+	while ((c = *src++)) {
+		if (need_bs_quote(c)) {
+			fputs("'\\", stream);
+			fputc(c, stream);
+			fputc('\'', stream);
+		} else {
+			fputc(c, stream);
+		}
+	}
+	fputc('\'', stream);
+}
+
+void sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen)
+{
+	int i;
+
+	/* Copy into destination buffer.
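+	 * Each argument is prefixed with a single space; when maxlen is
+	 * non-zero it acts as a safety limit and we die() on over-long
+	 * command lines.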
*/ +	strbuf_grow(dst, 255); +	for (i = 0; argv[i]; ++i) { +		strbuf_addch(dst, ' '); +		sq_quote_buf(dst, argv[i]); +		if (maxlen && dst->len > maxlen) +			die("Too many or long arguments"); +	} +} + +char *sq_dequote_step(char *arg, char **next) +{ +	char *dst = arg; +	char *src = arg; +	char c; + +	if (*src != '\'') +		return NULL; +	for (;;) { +		c = *++src; +		if (!c) +			return NULL; +		if (c != '\'') { +			*dst++ = c; +			continue; +		} +		/* We stepped out of sq */ +		switch (*++src) { +		case '\0': +			*dst = 0; +			if (next) +				*next = NULL; +			return arg; +		case '\\': +			c = *++src; +			if (need_bs_quote(c) && *++src == '\'') { +				*dst++ = c; +				continue; +			} +		/* Fallthrough */ +		default: +			if (!next || !isspace(*src)) +				return NULL; +			do { +				c = *++src; +			} while (isspace(c)); +			*dst = 0; +			*next = src; +			return arg; +		} +	} +} + +char *sq_dequote(char *arg) +{ +	return sq_dequote_step(arg, NULL); +} + +int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc) +{ +	char *next = arg; + +	if (!*arg) +		return 0; +	do { +		char *dequoted = sq_dequote_step(next, &next); +		if (!dequoted) +			return -1; +		ALLOC_GROW(*argv, *nr + 1, *alloc); +		(*argv)[(*nr)++] = dequoted; +	} while (next); + +	return 0; +} + +/* 1 means: quote as octal + * 0 means: quote as octal if (quote_path_fully) + * -1 means: never quote + * c: quote as "\\c" + */ +#define X8(x)   x, x, x, x, x, x, x, x +#define X16(x)  X8(x), X8(x) +static signed char const sq_lookup[256] = { +	/*           0    1    2    3    4    5    6    7 */ +	/* 0x00 */   1,   1,   1,   1,   1,   1,   1, 'a', +	/* 0x08 */ 'b', 't', 'n', 'v', 'f', 'r',   1,   1, +	/* 0x10 */ X16(1), +	/* 0x20 */  -1,  -1, '"',  -1,  -1,  -1,  -1,  -1, +	/* 0x28 */ X16(-1), X16(-1), X16(-1), +	/* 0x58 */  -1,  -1,  -1,  -1,'\\',  -1,  -1,  -1, +	/* 0x60 */ X16(-1), X8(-1), +	/* 0x78 */  -1,  -1,  -1,  -1,  -1,  -1,  -1,   1, +	/* 0x80 */ /* set to 0 */ +}; + +static inline int sq_must_quote(char c) +{ +	return sq_lookup[(unsigned char)c] + quote_path_fully > 0; +} + +/* returns the longest prefix not needing a quote up to maxlen if positive. +   This stops at the first \0 because it's marked as a character needing an +   escape */ +static size_t next_quote_pos(const char *s, ssize_t maxlen) +{ +	size_t len; +	if (maxlen < 0) { +		for (len = 0; !sq_must_quote(s[len]); len++); +	} else { +		for (len = 0; len < maxlen && !sq_must_quote(s[len]); len++); +	} +	return len; +} + +/* + * C-style name quoting. + * + * (1) if sb and fp are both NULL, inspect the input name and counts the + *     number of bytes that are needed to hold c_style quoted version of name, + *     counting the double quotes around it but not terminating NUL, and + *     returns it. + *     However, if name does not need c_style quoting, it returns 0. + * + * (2) if sb or fp are not NULL, it emits the c_style quoted version + *     of name, enclosed with double quotes if asked and needed only. + *     Return value is the same as in (1). 
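+ *
+ * Worked example (illustrative): for a name consisting of 'a', TAB, 'b',
+ * the quoted form is "a\tb" -- opening dquote, 'a', backslash, 't', 'b',
+ * closing dquote -- so mode (1) returns 6; a plain name such as "abc"
+ * needs no quoting at all and yields 0.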
+ */ +static size_t quote_c_style_counted(const char *name, ssize_t maxlen, +                                    struct strbuf *sb, FILE *fp, int no_dq) +{ +#undef EMIT +#define EMIT(c)                                 \ +	do {                                        \ +		if (sb) strbuf_addch(sb, (c));          \ +		if (fp) fputc((c), fp);                 \ +		count++;                                \ +	} while (0) +#define EMITBUF(s, l)                           \ +	do {                                        \ +		int __ret;				\ +		if (sb) strbuf_add(sb, (s), (l));       \ +		if (fp) __ret = fwrite((s), (l), 1, fp);        \ +		count += (l);                           \ +	} while (0) + +	size_t len, count = 0; +	const char *p = name; + +	for (;;) { +		int ch; + +		len = next_quote_pos(p, maxlen); +		if (len == maxlen || !p[len]) +			break; + +		if (!no_dq && p == name) +			EMIT('"'); + +		EMITBUF(p, len); +		EMIT('\\'); +		p += len; +		ch = (unsigned char)*p++; +		if (sq_lookup[ch] >= ' ') { +			EMIT(sq_lookup[ch]); +		} else { +			EMIT(((ch >> 6) & 03) + '0'); +			EMIT(((ch >> 3) & 07) + '0'); +			EMIT(((ch >> 0) & 07) + '0'); +		} +	} + +	EMITBUF(p, len); +	if (p == name)   /* no ending quote needed */ +		return 0; + +	if (!no_dq) +		EMIT('"'); +	return count; +} + +size_t quote_c_style(const char *name, struct strbuf *sb, FILE *fp, int nodq) +{ +	return quote_c_style_counted(name, -1, sb, fp, nodq); +} + +void quote_two_c_style(struct strbuf *sb, const char *prefix, const char *path, int nodq) +{ +	if (quote_c_style(prefix, NULL, NULL, 0) || +	    quote_c_style(path, NULL, NULL, 0)) { +		if (!nodq) +			strbuf_addch(sb, '"'); +		quote_c_style(prefix, sb, NULL, 1); +		quote_c_style(path, sb, NULL, 1); +		if (!nodq) +			strbuf_addch(sb, '"'); +	} else { +		strbuf_addstr(sb, prefix); +		strbuf_addstr(sb, path); +	} +} + +void write_name_quoted(const char *name, FILE *fp, int terminator) +{ +	if (terminator) { +		quote_c_style(name, NULL, fp, 0); +	} else { +		fputs(name, fp); +	} +	fputc(terminator, fp); +} + +extern void write_name_quotedpfx(const char *pfx, size_t pfxlen, +                                 const char *name, FILE *fp, int terminator) +{ +	int needquote = 0; + +	if (terminator) { +		needquote = next_quote_pos(pfx, pfxlen) < pfxlen +			|| name[next_quote_pos(name, -1)]; +	} +	if (needquote) { +		fputc('"', fp); +		quote_c_style_counted(pfx, pfxlen, NULL, fp, 1); +		quote_c_style(name, NULL, fp, 1); +		fputc('"', fp); +	} else { +		int ret; + +		ret = fwrite(pfx, pfxlen, 1, fp); +		fputs(name, fp); +	} +	fputc(terminator, fp); +} + +/* quote path as relative to the given prefix */ +char *quote_path_relative(const char *in, int len, +			  struct strbuf *out, const char *prefix) +{ +	int needquote; + +	if (len < 0) +		len = strlen(in); + +	/* "../" prefix itself does not need quoting, but "in" might. */ +	needquote = next_quote_pos(in, len) < len; +	strbuf_setlen(out, 0); +	strbuf_grow(out, len); + +	if (needquote) +		strbuf_addch(out, '"'); +	if (prefix) { +		int off = 0; +		while (prefix[off] && off < len && prefix[off] == in[off]) +			if (prefix[off] == '/') { +				prefix += off + 1; +				in += off + 1; +				len -= off + 1; +				off = 0; +			} else +				off++; + +		for (; *prefix; prefix++) +			if (*prefix == '/') +				strbuf_addstr(out, "../"); +	} + +	quote_c_style_counted (in, len, out, NULL, 1); + +	if (needquote) +		strbuf_addch(out, '"'); +	if (!out->len) +		strbuf_addstr(out, "./"); + +	return out->buf; +} + +/* + * C-style name unquoting. 
+ *
+ * Quoted should point at the opening double quote.
+ * + Returns 0 if it was able to unquote the string properly, and appends the
+ *   result in the strbuf `sb'.
+ * + Returns -1 in case of error, and doesn't touch the strbuf. Though note
+ *   that this function will allocate memory in the strbuf, so calling
+ *   strbuf_release is mandatory whichever result unquote_c_style returns.
+ *
+ * Updates endp pointer to point at one past the ending double quote if given.
+ */
+int unquote_c_style(struct strbuf *sb, const char *quoted, const char **endp)
+{
+	size_t oldlen = sb->len, len;
+	int ch, ac;
+
+	if (*quoted++ != '"')
+		return -1;
+
+	for (;;) {
+		len = strcspn(quoted, "\"\\");
+		strbuf_add(sb, quoted, len);
+		quoted += len;
+
+		switch (*quoted++) {
+		case '"':
+			if (endp)
+				*endp = quoted;
+			return 0;
+		case '\\':
+			break;
+		default:
+			goto error;
+		}
+
+		switch ((ch = *quoted++)) {
+		case 'a': ch = '\a'; break;
+		case 'b': ch = '\b'; break;
+		case 'f': ch = '\f'; break;
+		case 'n': ch = '\n'; break;
+		case 'r': ch = '\r'; break;
+		case 't': ch = '\t'; break;
+		case 'v': ch = '\v'; break;
+
+		case '\\': case '"':
+			break; /* verbatim */
+
+		/* octal values with first digit over 4 overflow */
+		case '0': case '1': case '2': case '3':
+			ac = ((ch - '0') << 6);
+			if ((ch = *quoted++) < '0' || '7' < ch)
+				goto error;
+			ac |= ((ch - '0') << 3);
+			if ((ch = *quoted++) < '0' || '7' < ch)
+				goto error;
+			ac |= (ch - '0');
+			ch = ac;
+			break;
+		default:
+			goto error;
+		}
+		strbuf_addch(sb, ch);
+	}
+
+ error:
+	strbuf_setlen(sb, oldlen);
+	return -1;
+}
+
+/* quoting as a string literal for other languages */
+
+void perl_quote_print(FILE *stream, const char *src)
+{
+	const char sq = '\'';
+	const char bq = '\\';
+	char c;
+
+	fputc(sq, stream);
+	while ((c = *src++)) {
+		if (c == sq || c == bq)
+			fputc(bq, stream);
+		fputc(c, stream);
+	}
+	fputc(sq, stream);
+}
+
+void python_quote_print(FILE *stream, const char *src)
+{
+	const char sq = '\'';
+	const char bq = '\\';
+	const char nl = '\n';
+	char c;
+
+	fputc(sq, stream);
+	while ((c = *src++)) {
+		if (c == nl) {
+			fputc(bq, stream);
+			fputc('n', stream);
+			continue;
+		}
+		if (c == sq || c == bq)
+			fputc(bq, stream);
+		fputc(c, stream);
+	}
+	fputc(sq, stream);
}
+
+void tcl_quote_print(FILE *stream, const char *src)
+{
+	char c;
+
+	fputc('"', stream);
+	while ((c = *src++)) {
+		switch (c) {
+		case '[': case ']':
+		case '{': case '}':
+		case '$': case '\\': case '"':
+			fputc('\\', stream);
+			/* fallthrough */
+		default:
+			fputc(c, stream);
+			break;
+		case '\f':
+			fputs("\\f", stream);
+			break;
+		case '\r':
+			fputs("\\r", stream);
+			break;
+		case '\n':
+			fputs("\\n", stream);
+			break;
+		case '\t':
+			fputs("\\t", stream);
+			break;
+		case '\v':
+			fputs("\\v", stream);
+			break;
+		}
+	}
+	fputc('"', stream);
+}
diff --git a/tools/perf/util/quote.h b/tools/perf/util/quote.h
new file mode 100644
index 00000000000..5dfad89816d
--- /dev/null
+++ b/tools/perf/util/quote.h
@@ -0,0 +1,68 @@
+#ifndef QUOTE_H
+#define QUOTE_H
+
+#include <stddef.h>
+#include <stdio.h>
+
+/* Help to copy the thing properly quoted for the shell safety.
+ * any single quote is replaced with '\'', any exclamation point
+ * is replaced with '\!', and the whole thing is enclosed in a
+ * single quote pair.
+ *
+ * For example, if you are passing the result to system() as an
+ * argument:
+ *
+ * sprintf(cmd, "foobar %s %s", sq_quote(arg0), sq_quote(arg1))
+ *
+ * would be appropriate.  If the system() is going to call ssh to
+ * run the command on the other side:
+ *
+ * sprintf(cmd, "git-diff-tree %s %s", sq_quote(arg0), sq_quote(arg1));
+ * sprintf(rcmd, "ssh %s %s", sq_quote(host), sq_quote(cmd));
+ *
+ * Note that the above examples leak memory!  Remember to free the result
+ * from sq_quote() in a real application.
+ *
+ * sq_quote_buf() appends the quoted form to an existing strbuf,
+ * growing the buffer as needed.
+ */
+
+extern void sq_quote_print(FILE *stream, const char *src);
+
+extern void sq_quote_buf(struct strbuf *, const char *src);
+extern void sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen);
+
+/* This unwraps what sq_quote() produces in place, but returns
+ * NULL if the input does not look like what sq_quote would have
+ * produced.
+ */
+extern char *sq_dequote(char *);
+
+/*
+ * Same as the above, but can be used to unwrap many arguments in the
+ * same string separated by space. "next" is changed to point to the
+ * next argument that should be passed as first parameter. When there
+ * is no more argument to be dequoted, "next" is updated to point to NULL.
+ */
+extern char *sq_dequote_step(char *arg, char **next);
+extern int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc);
+
+extern int unquote_c_style(struct strbuf *, const char *quoted, const char **endp);
+extern size_t quote_c_style(const char *name, struct strbuf *, FILE *, int no_dq);
+extern void quote_two_c_style(struct strbuf *, const char *, const char *, int);
+
+extern void write_name_quoted(const char *name, FILE *, int terminator);
+extern void write_name_quotedpfx(const char *pfx, size_t pfxlen,
+                                 const char *name, FILE *, int terminator);
+
+/* quote path as relative to the given prefix */
+char *quote_path_relative(const char *in, int len,
+			  struct strbuf *out, const char *prefix);
+
+/* quoting as a string literal for other languages */
+extern void perl_quote_print(FILE *stream, const char *src);
+extern void python_quote_print(FILE *stream, const char *src);
+extern void tcl_quote_print(FILE *stream, const char *src);
+
+#endif
diff --git a/tools/perf/util/rbtree.c b/tools/perf/util/rbtree.c
new file mode 100644
index 00000000000..b15ba9c7cb3
--- /dev/null
+++ b/tools/perf/util/rbtree.c
@@ -0,0 +1,383 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli <andrea@suse.de>
+  (C) 2002  David Woodhouse <dwmw2@infradead.org>
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+ +  You should have received a copy of the GNU General Public License +  along with this program; if not, write to the Free Software +  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA + +  linux/lib/rbtree.c +*/ + +#include "rbtree.h" + +static void __rb_rotate_left(struct rb_node *node, struct rb_root *root) +{ +	struct rb_node *right = node->rb_right; +	struct rb_node *parent = rb_parent(node); + +	if ((node->rb_right = right->rb_left)) +		rb_set_parent(right->rb_left, node); +	right->rb_left = node; + +	rb_set_parent(right, parent); + +	if (parent) +	{ +		if (node == parent->rb_left) +			parent->rb_left = right; +		else +			parent->rb_right = right; +	} +	else +		root->rb_node = right; +	rb_set_parent(node, right); +} + +static void __rb_rotate_right(struct rb_node *node, struct rb_root *root) +{ +	struct rb_node *left = node->rb_left; +	struct rb_node *parent = rb_parent(node); + +	if ((node->rb_left = left->rb_right)) +		rb_set_parent(left->rb_right, node); +	left->rb_right = node; + +	rb_set_parent(left, parent); + +	if (parent) +	{ +		if (node == parent->rb_right) +			parent->rb_right = left; +		else +			parent->rb_left = left; +	} +	else +		root->rb_node = left; +	rb_set_parent(node, left); +} + +void rb_insert_color(struct rb_node *node, struct rb_root *root) +{ +	struct rb_node *parent, *gparent; + +	while ((parent = rb_parent(node)) && rb_is_red(parent)) +	{ +		gparent = rb_parent(parent); + +		if (parent == gparent->rb_left) +		{ +			{ +				register struct rb_node *uncle = gparent->rb_right; +				if (uncle && rb_is_red(uncle)) +				{ +					rb_set_black(uncle); +					rb_set_black(parent); +					rb_set_red(gparent); +					node = gparent; +					continue; +				} +			} + +			if (parent->rb_right == node) +			{ +				register struct rb_node *tmp; +				__rb_rotate_left(parent, root); +				tmp = parent; +				parent = node; +				node = tmp; +			} + +			rb_set_black(parent); +			rb_set_red(gparent); +			__rb_rotate_right(gparent, root); +		} else { +			{ +				register struct rb_node *uncle = gparent->rb_left; +				if (uncle && rb_is_red(uncle)) +				{ +					rb_set_black(uncle); +					rb_set_black(parent); +					rb_set_red(gparent); +					node = gparent; +					continue; +				} +			} + +			if (parent->rb_left == node) +			{ +				register struct rb_node *tmp; +				__rb_rotate_right(parent, root); +				tmp = parent; +				parent = node; +				node = tmp; +			} + +			rb_set_black(parent); +			rb_set_red(gparent); +			__rb_rotate_left(gparent, root); +		} +	} + +	rb_set_black(root->rb_node); +} + +static void __rb_erase_color(struct rb_node *node, struct rb_node *parent, +			     struct rb_root *root) +{ +	struct rb_node *other; + +	while ((!node || rb_is_black(node)) && node != root->rb_node) +	{ +		if (parent->rb_left == node) +		{ +			other = parent->rb_right; +			if (rb_is_red(other)) +			{ +				rb_set_black(other); +				rb_set_red(parent); +				__rb_rotate_left(parent, root); +				other = parent->rb_right; +			} +			if ((!other->rb_left || rb_is_black(other->rb_left)) && +			    (!other->rb_right || rb_is_black(other->rb_right))) +			{ +				rb_set_red(other); +				node = parent; +				parent = rb_parent(node); +			} +			else +			{ +				if (!other->rb_right || rb_is_black(other->rb_right)) +				{ +					rb_set_black(other->rb_left); +					rb_set_red(other); +					__rb_rotate_right(other, root); +					other = parent->rb_right; +				} +				rb_set_color(other, rb_color(parent)); +				rb_set_black(parent); +				rb_set_black(other->rb_right); +				__rb_rotate_left(parent, 
root); +				node = root->rb_node; +				break; +			} +		} +		else +		{ +			other = parent->rb_left; +			if (rb_is_red(other)) +			{ +				rb_set_black(other); +				rb_set_red(parent); +				__rb_rotate_right(parent, root); +				other = parent->rb_left; +			} +			if ((!other->rb_left || rb_is_black(other->rb_left)) && +			    (!other->rb_right || rb_is_black(other->rb_right))) +			{ +				rb_set_red(other); +				node = parent; +				parent = rb_parent(node); +			} +			else +			{ +				if (!other->rb_left || rb_is_black(other->rb_left)) +				{ +					rb_set_black(other->rb_right); +					rb_set_red(other); +					__rb_rotate_left(other, root); +					other = parent->rb_left; +				} +				rb_set_color(other, rb_color(parent)); +				rb_set_black(parent); +				rb_set_black(other->rb_left); +				__rb_rotate_right(parent, root); +				node = root->rb_node; +				break; +			} +		} +	} +	if (node) +		rb_set_black(node); +} + +void rb_erase(struct rb_node *node, struct rb_root *root) +{ +	struct rb_node *child, *parent; +	int color; + +	if (!node->rb_left) +		child = node->rb_right; +	else if (!node->rb_right) +		child = node->rb_left; +	else +	{ +		struct rb_node *old = node, *left; + +		node = node->rb_right; +		while ((left = node->rb_left) != NULL) +			node = left; +		child = node->rb_right; +		parent = rb_parent(node); +		color = rb_color(node); + +		if (child) +			rb_set_parent(child, parent); +		if (parent == old) { +			parent->rb_right = child; +			parent = node; +		} else +			parent->rb_left = child; + +		node->rb_parent_color = old->rb_parent_color; +		node->rb_right = old->rb_right; +		node->rb_left = old->rb_left; + +		if (rb_parent(old)) +		{ +			if (rb_parent(old)->rb_left == old) +				rb_parent(old)->rb_left = node; +			else +				rb_parent(old)->rb_right = node; +		} else +			root->rb_node = node; + +		rb_set_parent(old->rb_left, node); +		if (old->rb_right) +			rb_set_parent(old->rb_right, node); +		goto color; +	} + +	parent = rb_parent(node); +	color = rb_color(node); + +	if (child) +		rb_set_parent(child, parent); +	if (parent) +	{ +		if (parent->rb_left == node) +			parent->rb_left = child; +		else +			parent->rb_right = child; +	} +	else +		root->rb_node = child; + + color: +	if (color == RB_BLACK) +		__rb_erase_color(child, parent, root); +} + +/* + * This function returns the first node (in sort order) of the tree. + */ +struct rb_node *rb_first(const struct rb_root *root) +{ +	struct rb_node	*n; + +	n = root->rb_node; +	if (!n) +		return NULL; +	while (n->rb_left) +		n = n->rb_left; +	return n; +} + +struct rb_node *rb_last(const struct rb_root *root) +{ +	struct rb_node	*n; + +	n = root->rb_node; +	if (!n) +		return NULL; +	while (n->rb_right) +		n = n->rb_right; +	return n; +} + +struct rb_node *rb_next(const struct rb_node *node) +{ +	struct rb_node *parent; + +	if (rb_parent(node) == node) +		return NULL; + +	/* If we have a right-hand child, go down and then left as far +	   as we can. */ +	if (node->rb_right) { +		node = node->rb_right;  +		while (node->rb_left) +			node=node->rb_left; +		return (struct rb_node *)node; +	} + +	/* No right-hand children.  Everything down and left is +	   smaller than us, so any 'next' node must be in the general +	   direction of our parent. Go up the tree; any time the +	   ancestor is a right-hand child of its parent, keep going +	   up. First time it's a left-hand child of its parent, said +	   parent is our 'next' node. 
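+
+	   (Illustrative aside: together with rb_first() this yields a full
+	   in-order walk --
+
+		for (n = rb_first(root); n; n = rb_next(n))
+			item = rb_entry(n, struct mytype, rb_node_field);
+
+	   where `struct mytype' and `rb_node_field' stand for the caller's
+	   own container type and its embedded rb_node member.)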
*/
+	while ((parent = rb_parent(node)) && node == parent->rb_right)
+		node = parent;
+
+	return parent;
+}
+
+struct rb_node *rb_prev(const struct rb_node *node)
+{
+	struct rb_node *parent;
+
+	if (rb_parent(node) == node)
+		return NULL;
+
+	/* If we have a left-hand child, go down and then right as far
+	   as we can. */
+	if (node->rb_left) {
+		node = node->rb_left;
+		while (node->rb_right)
+			node = node->rb_right;
+		return (struct rb_node *)node;
+	}
+
+	/* No left-hand children. Go up till we find an ancestor which
+	   is a right-hand child of its parent */
+	while ((parent = rb_parent(node)) && node == parent->rb_left)
+		node = parent;
+
+	return parent;
+}
+
+void rb_replace_node(struct rb_node *victim, struct rb_node *new,
+		     struct rb_root *root)
+{
+	struct rb_node *parent = rb_parent(victim);
+
+	/* Set the surrounding nodes to point to the replacement */
+	if (parent) {
+		if (victim == parent->rb_left)
+			parent->rb_left = new;
+		else
+			parent->rb_right = new;
+	} else {
+		root->rb_node = new;
+	}
+	if (victim->rb_left)
+		rb_set_parent(victim->rb_left, new);
+	if (victim->rb_right)
+		rb_set_parent(victim->rb_right, new);
+
+	/* Copy the pointers/colour from the victim to the replacement */
+	*new = *victim;
+}
diff --git a/tools/perf/util/rbtree.h b/tools/perf/util/rbtree.h
new file mode 100644
index 00000000000..6bdc488a47f
--- /dev/null
+++ b/tools/perf/util/rbtree.h
@@ -0,0 +1,171 @@
+/*
+  Red Black Trees
+  (C) 1999  Andrea Arcangeli <andrea@suse.de>
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+
+  linux/include/linux/rbtree.h
+
+  To use rbtrees you'll have to implement your own insert and search cores.
+  This avoids the use of callbacks, which would drop performance
+  dramatically.  I know it's not the cleanest way, but in C (not in C++)
+  this is how you get performance and genericity...
+
+  Some examples of insert and search follow here.  The search is a plain
+  normal search over an ordered tree.  The insert instead must be implemented
+  in two steps: first the code must insert the element in order as a red leaf
+  in the tree, then the support library function rb_insert_color() must be
+  called.  That function will do the non-trivial work needed to rebalance
+  the rbtree if necessary.
+ +----------------------------------------------------------------------- +static inline struct page * rb_search_page_cache(struct inode * inode, +						 unsigned long offset) +{ +	struct rb_node * n = inode->i_rb_page_cache.rb_node; +	struct page * page; + +	while (n) +	{ +		page = rb_entry(n, struct page, rb_page_cache); + +		if (offset < page->offset) +			n = n->rb_left; +		else if (offset > page->offset) +			n = n->rb_right; +		else +			return page; +	} +	return NULL; +} + +static inline struct page * __rb_insert_page_cache(struct inode * inode, +						   unsigned long offset, +						   struct rb_node * node) +{ +	struct rb_node ** p = &inode->i_rb_page_cache.rb_node; +	struct rb_node * parent = NULL; +	struct page * page; + +	while (*p) +	{ +		parent = *p; +		page = rb_entry(parent, struct page, rb_page_cache); + +		if (offset < page->offset) +			p = &(*p)->rb_left; +		else if (offset > page->offset) +			p = &(*p)->rb_right; +		else +			return page; +	} + +	rb_link_node(node, parent, p); + +	return NULL; +} + +static inline struct page * rb_insert_page_cache(struct inode * inode, +						 unsigned long offset, +						 struct rb_node * node) +{ +	struct page * ret; +	if ((ret = __rb_insert_page_cache(inode, offset, node))) +		goto out; +	rb_insert_color(node, &inode->i_rb_page_cache); + out: +	return ret; +} +----------------------------------------------------------------------- +*/ + +#ifndef	_LINUX_RBTREE_H +#define	_LINUX_RBTREE_H + +#include <stddef.h> + +/** + * container_of - cast a member of a structure out to the containing structure + * @ptr:	the pointer to the member. + * @type:	the type of the container struct this is embedded in. + * @member:	the name of the member within the struct. + * + */ +#define container_of(ptr, type, member) ({			\ +	const typeof( ((type *)0)->member ) *__mptr = (ptr);	\ +	(type *)( (char *)__mptr - offsetof(type,member) );}) + +struct rb_node +{ +	unsigned long  rb_parent_color; +#define	RB_RED		0 +#define	RB_BLACK	1 +	struct rb_node *rb_right; +	struct rb_node *rb_left; +} __attribute__((aligned(sizeof(long)))); +    /* The alignment might seem pointless, but allegedly CRIS needs it */ + +struct rb_root +{ +	struct rb_node *rb_node; +}; + + +#define rb_parent(r)   ((struct rb_node *)((r)->rb_parent_color & ~3)) +#define rb_color(r)   ((r)->rb_parent_color & 1) +#define rb_is_red(r)   (!rb_color(r)) +#define rb_is_black(r) rb_color(r) +#define rb_set_red(r)  do { (r)->rb_parent_color &= ~1; } while (0) +#define rb_set_black(r)  do { (r)->rb_parent_color |= 1; } while (0) + +static inline void rb_set_parent(struct rb_node *rb, struct rb_node *p) +{ +	rb->rb_parent_color = (rb->rb_parent_color & 3) | (unsigned long)p; +} +static inline void rb_set_color(struct rb_node *rb, int color) +{ +	rb->rb_parent_color = (rb->rb_parent_color & ~1) | color; +} + +#define RB_ROOT	(struct rb_root) { NULL, } +#define	rb_entry(ptr, type, member) container_of(ptr, type, member) + +#define RB_EMPTY_ROOT(root)	((root)->rb_node == NULL) +#define RB_EMPTY_NODE(node)	(rb_parent(node) == node) +#define RB_CLEAR_NODE(node)	(rb_set_parent(node, node)) + +extern void rb_insert_color(struct rb_node *, struct rb_root *); +extern void rb_erase(struct rb_node *, struct rb_root *); + +/* Find logical next and previous nodes in a tree */ +extern struct rb_node *rb_next(const struct rb_node *); +extern struct rb_node *rb_prev(const struct rb_node *); +extern struct rb_node *rb_first(const struct rb_root *); +extern struct rb_node *rb_last(const struct rb_root *); + +/* Fast 
replacement of a single node without remove/rebalance/add/rebalance */ +extern void rb_replace_node(struct rb_node *victim, struct rb_node *new,  +			    struct rb_root *root); + +static inline void rb_link_node(struct rb_node * node, struct rb_node * parent, +				struct rb_node ** rb_link) +{ +	node->rb_parent_color = (unsigned long )parent; +	node->rb_left = node->rb_right = NULL; + +	*rb_link = node; +} + +#endif	/* _LINUX_RBTREE_H */ diff --git a/tools/perf/util/run-command.c b/tools/perf/util/run-command.c new file mode 100644 index 00000000000..b2f5e854f40 --- /dev/null +++ b/tools/perf/util/run-command.c @@ -0,0 +1,395 @@ +#include "cache.h" +#include "run-command.h" +#include "exec_cmd.h" + +static inline void close_pair(int fd[2]) +{ +	close(fd[0]); +	close(fd[1]); +} + +static inline void dup_devnull(int to) +{ +	int fd = open("/dev/null", O_RDWR); +	dup2(fd, to); +	close(fd); +} + +int start_command(struct child_process *cmd) +{ +	int need_in, need_out, need_err; +	int fdin[2], fdout[2], fderr[2]; + +	/* +	 * In case of errors we must keep the promise to close FDs +	 * that have been passed in via ->in and ->out. +	 */ + +	need_in = !cmd->no_stdin && cmd->in < 0; +	if (need_in) { +		if (pipe(fdin) < 0) { +			if (cmd->out > 0) +				close(cmd->out); +			return -ERR_RUN_COMMAND_PIPE; +		} +		cmd->in = fdin[1]; +	} + +	need_out = !cmd->no_stdout +		&& !cmd->stdout_to_stderr +		&& cmd->out < 0; +	if (need_out) { +		if (pipe(fdout) < 0) { +			if (need_in) +				close_pair(fdin); +			else if (cmd->in) +				close(cmd->in); +			return -ERR_RUN_COMMAND_PIPE; +		} +		cmd->out = fdout[0]; +	} + +	need_err = !cmd->no_stderr && cmd->err < 0; +	if (need_err) { +		if (pipe(fderr) < 0) { +			if (need_in) +				close_pair(fdin); +			else if (cmd->in) +				close(cmd->in); +			if (need_out) +				close_pair(fdout); +			else if (cmd->out) +				close(cmd->out); +			return -ERR_RUN_COMMAND_PIPE; +		} +		cmd->err = fderr[0]; +	} + +#ifndef __MINGW32__ +	fflush(NULL); +	cmd->pid = fork(); +	if (!cmd->pid) { +		if (cmd->no_stdin) +			dup_devnull(0); +		else if (need_in) { +			dup2(fdin[0], 0); +			close_pair(fdin); +		} else if (cmd->in) { +			dup2(cmd->in, 0); +			close(cmd->in); +		} + +		if (cmd->no_stderr) +			dup_devnull(2); +		else if (need_err) { +			dup2(fderr[1], 2); +			close_pair(fderr); +		} + +		if (cmd->no_stdout) +			dup_devnull(1); +		else if (cmd->stdout_to_stderr) +			dup2(2, 1); +		else if (need_out) { +			dup2(fdout[1], 1); +			close_pair(fdout); +		} else if (cmd->out > 1) { +			dup2(cmd->out, 1); +			close(cmd->out); +		} + +		if (cmd->dir && chdir(cmd->dir)) +			die("exec %s: cd to %s failed (%s)", cmd->argv[0], +			    cmd->dir, strerror(errno)); +		if (cmd->env) { +			for (; *cmd->env; cmd->env++) { +				if (strchr(*cmd->env, '=')) +					putenv((char*)*cmd->env); +				else +					unsetenv(*cmd->env); +			} +		} +		if (cmd->preexec_cb) +			cmd->preexec_cb(); +		if (cmd->perf_cmd) { +			execv_perf_cmd(cmd->argv); +		} else { +			execvp(cmd->argv[0], (char *const*) cmd->argv); +		} +		exit(127); +	} +#else +	int s0 = -1, s1 = -1, s2 = -1;	/* backups of stdin, stdout, stderr */ +	const char **sargv = cmd->argv; +	char **env = environ; + +	if (cmd->no_stdin) { +		s0 = dup(0); +		dup_devnull(0); +	} else if (need_in) { +		s0 = dup(0); +		dup2(fdin[0], 0); +	} else if (cmd->in) { +		s0 = dup(0); +		dup2(cmd->in, 0); +	} + +	if (cmd->no_stderr) { +		s2 = dup(2); +		dup_devnull(2); +	} else if (need_err) { +		s2 = dup(2); +		dup2(fderr[1], 2); +	} + +	if (cmd->no_stdout) { +		s1 = dup(1); +		
dup_devnull(1); +	} else if (cmd->stdout_to_stderr) { +		s1 = dup(1); +		dup2(2, 1); +	} else if (need_out) { +		s1 = dup(1); +		dup2(fdout[1], 1); +	} else if (cmd->out > 1) { +		s1 = dup(1); +		dup2(cmd->out, 1); +	} + +	if (cmd->dir) +		die("chdir in start_command() not implemented"); +	if (cmd->env) { +		env = copy_environ(); +		for (; *cmd->env; cmd->env++) +			env = env_setenv(env, *cmd->env); +	} + +	if (cmd->perf_cmd) { +		cmd->argv = prepare_perf_cmd(cmd->argv); +	} + +	cmd->pid = mingw_spawnvpe(cmd->argv[0], cmd->argv, env); + +	if (cmd->env) +		free_environ(env); +	if (cmd->perf_cmd) +		free(cmd->argv); + +	cmd->argv = sargv; +	if (s0 >= 0) +		dup2(s0, 0), close(s0); +	if (s1 >= 0) +		dup2(s1, 1), close(s1); +	if (s2 >= 0) +		dup2(s2, 2), close(s2); +#endif + +	if (cmd->pid < 0) { +		int err = errno; +		if (need_in) +			close_pair(fdin); +		else if (cmd->in) +			close(cmd->in); +		if (need_out) +			close_pair(fdout); +		else if (cmd->out) +			close(cmd->out); +		if (need_err) +			close_pair(fderr); +		return err == ENOENT ? +			-ERR_RUN_COMMAND_EXEC : +			-ERR_RUN_COMMAND_FORK; +	} + +	if (need_in) +		close(fdin[0]); +	else if (cmd->in) +		close(cmd->in); + +	if (need_out) +		close(fdout[1]); +	else if (cmd->out) +		close(cmd->out); + +	if (need_err) +		close(fderr[1]); + +	return 0; +} + +static int wait_or_whine(pid_t pid) +{ +	for (;;) { +		int status, code; +		pid_t waiting = waitpid(pid, &status, 0); + +		if (waiting < 0) { +			if (errno == EINTR) +				continue; +			error("waitpid failed (%s)", strerror(errno)); +			return -ERR_RUN_COMMAND_WAITPID; +		} +		if (waiting != pid) +			return -ERR_RUN_COMMAND_WAITPID_WRONG_PID; +		if (WIFSIGNALED(status)) +			return -ERR_RUN_COMMAND_WAITPID_SIGNAL; + +		if (!WIFEXITED(status)) +			return -ERR_RUN_COMMAND_WAITPID_NOEXIT; +		code = WEXITSTATUS(status); +		switch (code) { +		case 127: +			return -ERR_RUN_COMMAND_EXEC; +		case 0: +			return 0; +		default: +			return -code; +		} +	} +} + +int finish_command(struct child_process *cmd) +{ +	return wait_or_whine(cmd->pid); +} + +int run_command(struct child_process *cmd) +{ +	int code = start_command(cmd); +	if (code) +		return code; +	return finish_command(cmd); +} + +static void prepare_run_command_v_opt(struct child_process *cmd, +				      const char **argv, +				      int opt) +{ +	memset(cmd, 0, sizeof(*cmd)); +	cmd->argv = argv; +	cmd->no_stdin = opt & RUN_COMMAND_NO_STDIN ? 1 : 0; +	cmd->perf_cmd = opt & RUN_PERF_CMD ? 1 : 0; +	cmd->stdout_to_stderr = opt & RUN_COMMAND_STDOUT_TO_STDERR ? 
1 : 0; +} + +int run_command_v_opt(const char **argv, int opt) +{ +	struct child_process cmd; +	prepare_run_command_v_opt(&cmd, argv, opt); +	return run_command(&cmd); +} + +int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env) +{ +	struct child_process cmd; +	prepare_run_command_v_opt(&cmd, argv, opt); +	cmd.dir = dir; +	cmd.env = env; +	return run_command(&cmd); +} + +#ifdef __MINGW32__ +static __stdcall unsigned run_thread(void *data) +{ +	struct async *async = data; +	return async->proc(async->fd_for_proc, async->data); +} +#endif + +int start_async(struct async *async) +{ +	int pipe_out[2]; + +	if (pipe(pipe_out) < 0) +		return error("cannot create pipe: %s", strerror(errno)); +	async->out = pipe_out[0]; + +#ifndef __MINGW32__ +	/* Flush stdio before fork() to avoid cloning buffers */ +	fflush(NULL); + +	async->pid = fork(); +	if (async->pid < 0) { +		error("fork (async) failed: %s", strerror(errno)); +		close_pair(pipe_out); +		return -1; +	} +	if (!async->pid) { +		close(pipe_out[0]); +		exit(!!async->proc(pipe_out[1], async->data)); +	} +	close(pipe_out[1]); +#else +	async->fd_for_proc = pipe_out[1]; +	async->tid = (HANDLE) _beginthreadex(NULL, 0, run_thread, async, 0, NULL); +	if (!async->tid) { +		error("cannot create thread: %s", strerror(errno)); +		close_pair(pipe_out); +		return -1; +	} +#endif +	return 0; +} + +int finish_async(struct async *async) +{ +#ifndef __MINGW32__ +	int ret = 0; + +	if (wait_or_whine(async->pid)) +		ret = error("waitpid (async) failed"); +#else +	DWORD ret = 0; +	if (WaitForSingleObject(async->tid, INFINITE) != WAIT_OBJECT_0) +		ret = error("waiting for thread failed: %lu", GetLastError()); +	else if (!GetExitCodeThread(async->tid, &ret)) +		ret = error("cannot get thread exit code: %lu", GetLastError()); +	CloseHandle(async->tid); +#endif +	return ret; +} + +int run_hook(const char *index_file, const char *name, ...) 
+{
+	struct child_process hook;
+	const char **argv = NULL, *env[2];
+	char index[PATH_MAX];
+	va_list args;
+	int ret;
+	size_t i = 0, alloc = 0;
+
+	if (access(perf_path("hooks/%s", name), X_OK) < 0)
+		return 0;
+
+	va_start(args, name);
+	ALLOC_GROW(argv, i + 1, alloc);
+	argv[i++] = perf_path("hooks/%s", name);
+	while (argv[i-1]) {
+		ALLOC_GROW(argv, i + 1, alloc);
+		argv[i++] = va_arg(args, const char *);
+	}
+	va_end(args);
+
+	memset(&hook, 0, sizeof(hook));
+	hook.argv = argv;
+	hook.no_stdin = 1;
+	hook.stdout_to_stderr = 1;
+	if (index_file) {
+		snprintf(index, sizeof(index), "PERF_INDEX_FILE=%s", index_file);
+		env[0] = index;
+		env[1] = NULL;
+		hook.env = env;
+	}
+
+	ret = start_command(&hook);
+	if (ret) {
+		warning("Could not spawn %s", argv[0]);
+		free(argv);
+		return ret;
+	}
+	ret = finish_command(&hook);
+	if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL)
+		warning("%s exited due to uncaught signal", argv[0]);
+
+	/* free argv only after its last use in the warnings above */
+	free(argv);
+	return ret;
+}
diff --git a/tools/perf/util/run-command.h b/tools/perf/util/run-command.h
new file mode 100644
index 00000000000..328289f2366
--- /dev/null
+++ b/tools/perf/util/run-command.h
@@ -0,0 +1,93 @@
+#ifndef RUN_COMMAND_H
+#define RUN_COMMAND_H
+
+enum {
+	ERR_RUN_COMMAND_FORK = 10000,
+	ERR_RUN_COMMAND_EXEC,
+	ERR_RUN_COMMAND_PIPE,
+	ERR_RUN_COMMAND_WAITPID,
+	ERR_RUN_COMMAND_WAITPID_WRONG_PID,
+	ERR_RUN_COMMAND_WAITPID_SIGNAL,
+	ERR_RUN_COMMAND_WAITPID_NOEXIT,
+};
+#define IS_RUN_COMMAND_ERR(x) (-(x) >= ERR_RUN_COMMAND_FORK)
+
+struct child_process {
+	const char **argv;
+	pid_t pid;
+	/*
+	 * Using .in, .out, .err:
+	 * - Specify 0 for no redirections (child inherits stdin, stdout,
+	 *   stderr from parent).
+	 * - Specify -1 to have a pipe allocated as follows:
+	 *     .in: returns the writable pipe end; parent writes to it,
+	 *          the readable pipe end becomes child's stdin
+	 *     .out, .err: returns the readable pipe end; parent reads from
+	 *          it, the writable pipe end becomes child's stdout/stderr
+	 *   The caller of start_command() must close the returned FDs
+	 *   after it has completed reading from/writing to it!
+	 * - Specify > 0 to set a channel to a particular FD as follows:
+	 *     .in: a readable FD, becomes child's stdin
+	 *     .out: a writable FD, becomes child's stdout/stderr
+	 *     .err > 0 not supported
+	 *   The specified FD is closed by start_command(), even in case
+	 *   of errors!
+	 */
+	int in;
+	int out;
+	int err;
+	const char *dir;
+	const char *const *env;
+	unsigned no_stdin:1;
+	unsigned no_stdout:1;
+	unsigned no_stderr:1;
+	unsigned perf_cmd:1; /* if this is to be perf sub-command */
+	unsigned stdout_to_stderr:1;
+	void (*preexec_cb)(void);
+};
+
+int start_command(struct child_process *);
+int finish_command(struct child_process *);
+int run_command(struct child_process *);
+
+extern int run_hook(const char *index_file, const char *name, ...);
+
+#define RUN_COMMAND_NO_STDIN 1
+#define RUN_PERF_CMD	     2	/* if this is to be a perf sub-command */
+#define RUN_COMMAND_STDOUT_TO_STDERR 4
+int run_command_v_opt(const char **argv, int opt);
+
+/*
+ * env (the environment) is to be formatted like environ: "VAR=VALUE".
+ * To unset an environment variable use just "VAR".
+ */
+int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env);
+
+/*
+ * The purpose of the following functions is to feed a pipe by running
+ * a function asynchronously and providing output that the caller reads.
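+ *
+ * A sketch of intended use (illustrative, error handling omitted):
+ *
+ *	static int feed(int fd, void *data)
+ *	{
+ *		write(fd, "hello\n", 6);
+ *		close(fd);
+ *		return 0;
+ *	}
+ *
+ *	struct async as = { .proc = feed };
+ *
+ *	start_async(&as);
+ *	read from as.out until EOF, close(as.out),
+ *	then reap the worker with finish_async(&as).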
+ * + * It is expected that no synchronization and mutual exclusion between + * the caller and the feed function is necessary so that the function + * can run in a thread without interfering with the caller. + */ +struct async { +	/* +	 * proc writes to fd and closes it; +	 * returns 0 on success, non-zero on failure +	 */ +	int (*proc)(int fd, void *data); +	void *data; +	int out;	/* caller reads from here and closes it */ +#ifndef __MINGW32__ +	pid_t pid; +#else +	HANDLE tid; +	int fd_for_proc; +#endif +}; + +int start_async(struct async *async); +int finish_async(struct async *async); + +#endif diff --git a/tools/perf/util/sigchain.c b/tools/perf/util/sigchain.c new file mode 100644 index 00000000000..1118b99e57d --- /dev/null +++ b/tools/perf/util/sigchain.c @@ -0,0 +1,52 @@ +#include "sigchain.h" +#include "cache.h" + +#define SIGCHAIN_MAX_SIGNALS 32 + +struct sigchain_signal { +	sigchain_fun *old; +	int n; +	int alloc; +}; +static struct sigchain_signal signals[SIGCHAIN_MAX_SIGNALS]; + +static void check_signum(int sig) +{ +	if (sig < 1 || sig >= SIGCHAIN_MAX_SIGNALS) +		die("BUG: signal out of range: %d", sig); +} + +int sigchain_push(int sig, sigchain_fun f) +{ +	struct sigchain_signal *s = signals + sig; +	check_signum(sig); + +	ALLOC_GROW(s->old, s->n + 1, s->alloc); +	s->old[s->n] = signal(sig, f); +	if (s->old[s->n] == SIG_ERR) +		return -1; +	s->n++; +	return 0; +} + +int sigchain_pop(int sig) +{ +	struct sigchain_signal *s = signals + sig; +	check_signum(sig); +	if (s->n < 1) +		return 0; + +	if (signal(sig, s->old[s->n - 1]) == SIG_ERR) +		return -1; +	s->n--; +	return 0; +} + +void sigchain_push_common(sigchain_fun f) +{ +	sigchain_push(SIGINT, f); +	sigchain_push(SIGHUP, f); +	sigchain_push(SIGTERM, f); +	sigchain_push(SIGQUIT, f); +	sigchain_push(SIGPIPE, f); +} diff --git a/tools/perf/util/sigchain.h b/tools/perf/util/sigchain.h new file mode 100644 index 00000000000..618083bce0c --- /dev/null +++ b/tools/perf/util/sigchain.h @@ -0,0 +1,11 @@ +#ifndef SIGCHAIN_H +#define SIGCHAIN_H + +typedef void (*sigchain_fun)(int); + +int sigchain_push(int sig, sigchain_fun f); +int sigchain_pop(int sig); + +void sigchain_push_common(sigchain_fun f); + +#endif /* SIGCHAIN_H */ diff --git a/tools/perf/util/strbuf.c b/tools/perf/util/strbuf.c new file mode 100644 index 00000000000..eaba0930680 --- /dev/null +++ b/tools/perf/util/strbuf.c @@ -0,0 +1,359 @@ +#include "cache.h" + +int prefixcmp(const char *str, const char *prefix) +{ +	for (; ; str++, prefix++) +		if (!*prefix) +			return 0; +		else if (*str != *prefix) +			return (unsigned char)*prefix - (unsigned char)*str; +} + +/* + * Used as the default ->buf value, so that people can always assume + * buf is non NULL and ->buf is NUL terminated even for a freshly + * initialized strbuf. + */ +char strbuf_slopbuf[1]; + +void strbuf_init(struct strbuf *sb, size_t hint) +{ +	sb->alloc = sb->len = 0; +	sb->buf = strbuf_slopbuf; +	if (hint) +		strbuf_grow(sb, hint); +} + +void strbuf_release(struct strbuf *sb) +{ +	if (sb->alloc) { +		free(sb->buf); +		strbuf_init(sb, 0); +	} +} + +char *strbuf_detach(struct strbuf *sb, size_t *sz) +{ +	char *res = sb->alloc ? 
sb->buf : NULL; +	if (sz) +		*sz = sb->len; +	strbuf_init(sb, 0); +	return res; +} + +void strbuf_attach(struct strbuf *sb, void *buf, size_t len, size_t alloc) +{ +	strbuf_release(sb); +	sb->buf   = buf; +	sb->len   = len; +	sb->alloc = alloc; +	strbuf_grow(sb, 0); +	sb->buf[sb->len] = '\0'; +} + +void strbuf_grow(struct strbuf *sb, size_t extra) +{ +	if (sb->len + extra + 1 <= sb->len) +		die("you want to use way too much memory"); +	if (!sb->alloc) +		sb->buf = NULL; +	ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc); +} + +void strbuf_trim(struct strbuf *sb) +{ +	char *b = sb->buf; +	while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1])) +		sb->len--; +	while (sb->len > 0 && isspace(*b)) { +		b++; +		sb->len--; +	} +	memmove(sb->buf, b, sb->len); +	sb->buf[sb->len] = '\0'; +} +void strbuf_rtrim(struct strbuf *sb) +{ +	while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1])) +		sb->len--; +	sb->buf[sb->len] = '\0'; +} + +void strbuf_ltrim(struct strbuf *sb) +{ +	char *b = sb->buf; +	while (sb->len > 0 && isspace(*b)) { +		b++; +		sb->len--; +	} +	memmove(sb->buf, b, sb->len); +	sb->buf[sb->len] = '\0'; +} + +void strbuf_tolower(struct strbuf *sb) +{ +	int i; +	for (i = 0; i < sb->len; i++) +		sb->buf[i] = tolower(sb->buf[i]); +} + +struct strbuf **strbuf_split(const struct strbuf *sb, int delim) +{ +	int alloc = 2, pos = 0; +	char *n, *p; +	struct strbuf **ret; +	struct strbuf *t; + +	ret = calloc(alloc, sizeof(struct strbuf *)); +	p = n = sb->buf; +	while (n < sb->buf + sb->len) { +		int len; +		n = memchr(n, delim, sb->len - (n - sb->buf)); +		if (pos + 1 >= alloc) { +			alloc = alloc * 2; +			ret = realloc(ret, sizeof(struct strbuf *) * alloc); +		} +		if (!n) +			n = sb->buf + sb->len - 1; +		len = n - p + 1; +		t = malloc(sizeof(struct strbuf)); +		strbuf_init(t, len); +		strbuf_add(t, p, len); +		ret[pos] = t; +		ret[++pos] = NULL; +		p = ++n; +	} +	return ret; +} + +void strbuf_list_free(struct strbuf **sbs) +{ +	struct strbuf **s = sbs; + +	while (*s) { +		strbuf_release(*s); +		free(*s++); +	} +	free(sbs); +} + +int strbuf_cmp(const struct strbuf *a, const struct strbuf *b) +{ +	int len = a->len < b->len ? a->len: b->len; +	int cmp = memcmp(a->buf, b->buf, len); +	if (cmp) +		return cmp; +	return a->len < b->len ? -1: a->len != b->len; +} + +void strbuf_splice(struct strbuf *sb, size_t pos, size_t len, +				   const void *data, size_t dlen) +{ +	if (pos + len < pos) +		die("you want to use way too much memory"); +	if (pos > sb->len) +		die("`pos' is too far after the end of the buffer"); +	if (pos + len > sb->len) +		die("`pos + len' is too far after the end of the buffer"); + +	if (dlen >= len) +		strbuf_grow(sb, dlen - len); +	memmove(sb->buf + pos + dlen, +			sb->buf + pos + len, +			sb->len - pos - len); +	memcpy(sb->buf + pos, data, dlen); +	strbuf_setlen(sb, sb->len + dlen - len); +} + +void strbuf_insert(struct strbuf *sb, size_t pos, const void *data, size_t len) +{ +	strbuf_splice(sb, pos, 0, data, len); +} + +void strbuf_remove(struct strbuf *sb, size_t pos, size_t len) +{ +	strbuf_splice(sb, pos, len, NULL, 0); +} + +void strbuf_add(struct strbuf *sb, const void *data, size_t len) +{ +	strbuf_grow(sb, len); +	memcpy(sb->buf + sb->len, data, len); +	strbuf_setlen(sb, sb->len + len); +} + +void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len) +{ +	strbuf_grow(sb, len); +	memcpy(sb->buf + sb->len, sb->buf + pos, len); +	strbuf_setlen(sb, sb->len + len); +} + +void strbuf_addf(struct strbuf *sb, const char *fmt, ...) 
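/* ----------------------------------------------------------------------
 * Editorial note (not part of the patch): a minimal strbuf usage sketch,
 * assuming the declarations from strbuf.h below (STRBUF_INIT, strbuf_addstr,
 * strbuf_addf, strbuf_rtrim, strbuf_release).  demo_strbuf() is a
 * hypothetical name; the buffer grows on demand and always stays
 * NUL-terminated.
 *
 *	#include <stdio.h>
 *	#include "cache.h"		// pulls in strbuf.h in this tree
 *
 *	static void demo_strbuf(void)
 *	{
 *		struct strbuf sb = STRBUF_INIT;
 *
 *		strbuf_addstr(&sb, "perf");
 *		strbuf_addf(&sb, " pid=%d ", 123);
 *		strbuf_rtrim(&sb);	// drops the trailing blank
 *		printf("%s (len=%zu)\n", sb.buf, sb.len);
 *		strbuf_release(&sb);	// frees ->buf, restores the invariants
 *	}
 * ---------------------------------------------------------------------- */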
+{ +	int len; +	va_list ap; + +	if (!strbuf_avail(sb)) +		strbuf_grow(sb, 64); +	va_start(ap, fmt); +	len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap); +	va_end(ap); +	if (len < 0) +		die("your vsnprintf is broken"); +	if (len > strbuf_avail(sb)) { +		strbuf_grow(sb, len); +		va_start(ap, fmt); +		len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap); +		va_end(ap); +		if (len > strbuf_avail(sb)) { +			die("this should not happen, your snprintf is broken"); +		} +	} +	strbuf_setlen(sb, sb->len + len); +} + +void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn, +		   void *context) +{ +	for (;;) { +		const char *percent; +		size_t consumed; + +		percent = strchrnul(format, '%'); +		strbuf_add(sb, format, percent - format); +		if (!*percent) +			break; +		format = percent + 1; + +		consumed = fn(sb, format, context); +		if (consumed) +			format += consumed; +		else +			strbuf_addch(sb, '%'); +	} +} + +size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder, +		void *context) +{ +	struct strbuf_expand_dict_entry *e = context; +	size_t len; + +	for (; e->placeholder && (len = strlen(e->placeholder)); e++) { +		if (!strncmp(placeholder, e->placeholder, len)) { +			if (e->value) +				strbuf_addstr(sb, e->value); +			return len; +		} +	} +	return 0; +} + +size_t strbuf_fread(struct strbuf *sb, size_t size, FILE *f) +{ +	size_t res; +	size_t oldalloc = sb->alloc; + +	strbuf_grow(sb, size); +	res = fread(sb->buf + sb->len, 1, size, f); +	if (res > 0) +		strbuf_setlen(sb, sb->len + res); +	else if (res < 0 && oldalloc == 0) +		strbuf_release(sb); +	return res; +} + +ssize_t strbuf_read(struct strbuf *sb, int fd, size_t hint) +{ +	size_t oldlen = sb->len; +	size_t oldalloc = sb->alloc; + +	strbuf_grow(sb, hint ? hint : 8192); +	for (;;) { +		ssize_t cnt; + +		cnt = read(fd, sb->buf + sb->len, sb->alloc - sb->len - 1); +		if (cnt < 0) { +			if (oldalloc == 0) +				strbuf_release(sb); +			else +				strbuf_setlen(sb, oldlen); +			return -1; +		} +		if (!cnt) +			break; +		sb->len += cnt; +		strbuf_grow(sb, 8192); +	} + +	sb->buf[sb->len] = '\0'; +	return sb->len - oldlen; +} + +#define STRBUF_MAXLINK (2*PATH_MAX) + +int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint) +{ +	size_t oldalloc = sb->alloc; + +	if (hint < 32) +		hint = 32; + +	while (hint < STRBUF_MAXLINK) { +		int len; + +		strbuf_grow(sb, hint); +		len = readlink(path, sb->buf, hint); +		if (len < 0) { +			if (errno != ERANGE) +				break; +		} else if (len < hint) { +			strbuf_setlen(sb, len); +			return 0; +		} + +		/* .. 
the buffer was too small - try again */
+		hint *= 2;
+	}
+	if (oldalloc == 0)
+		strbuf_release(sb);
+	return -1;
+}
+
+int strbuf_getline(struct strbuf *sb, FILE *fp, int term)
+{
+	int ch;
+
+	strbuf_grow(sb, 0);
+	if (feof(fp))
+		return EOF;
+
+	strbuf_reset(sb);
+	while ((ch = fgetc(fp)) != EOF) {
+		if (ch == term)
+			break;
+		strbuf_grow(sb, 1);
+		sb->buf[sb->len++] = ch;
+	}
+	if (ch == EOF && sb->len == 0)
+		return EOF;
+
+	sb->buf[sb->len] = '\0';
+	return 0;
+}
+
+int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint)
+{
+	int fd, len;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return -1;
+	len = strbuf_read(sb, fd, hint);
+	close(fd);
+	if (len < 0)
+		return -1;
+
+	return len;
+} diff --git a/tools/perf/util/strbuf.h b/tools/perf/util/strbuf.h new file mode 100644 index 00000000000..9ee908a3ec5 --- /dev/null +++ b/tools/perf/util/strbuf.h @@ -0,0 +1,137 @@ +#ifndef STRBUF_H
+#define STRBUF_H
+
+/*
+ * Strbufs can be used in many ways: as a byte array, or to store
+ * arbitrarily long, overflow-safe strings.
+ *
+ * Strbufs have some invariants that are very important to keep in mind:
+ *
+ * 1. the ->buf member is always malloc-ed, hence strbufs can be used to
+ *    build complex strings/buffers whose final size isn't easily known.
+ *
+ *    It is NOT legal to copy the ->buf pointer away.
+ *    `strbuf_detach' is the operation that detaches a buffer from its shell
+ *    while keeping the shell valid wrt its invariants.
+ *
+ * 2. the ->buf member is a byte array that has at least ->len + 1 bytes
+ *    allocated. The extra byte is used to store a '\0', allowing the ->buf
+ *    member to be a valid C-string. Every strbuf function ensures this
+ *    invariant is preserved.
+ *
+ *    Note that it is OK to "play" with the buffer directly if you work it
+ *    that way:
+ *
+ *    strbuf_grow(sb, SOME_SIZE);
+ *       ... Here, the memory array starting at sb->buf, and of length
+ *       ... strbuf_avail(sb) is all yours, and you are sure that
+ *       ... strbuf_avail(sb) is at least SOME_SIZE.
+ *    strbuf_setlen(sb, sb->len + SOME_OTHER_SIZE);
+ *
+ *    Of course, SOME_OTHER_SIZE must be smaller than or equal to
+ *    strbuf_avail(sb).
+ *
+ *    Doing so is safe, though if it has to be done in many places, adding the
+ *    missing API to the strbuf module is the way to go.
+ *
+ *    XXX: do _not_ assume that the area that is yours is of size ->alloc - 1
+ *         even if it's true in the current implementation. Alloc is somewhat
+ *         of a "private" member that should not be messed with.
+ */
+
+#include <assert.h>
+
+extern char strbuf_slopbuf[];
+struct strbuf {
+	size_t alloc;
+	size_t len;
+	char *buf;
+};
+
+#define STRBUF_INIT  { 0, 0, strbuf_slopbuf }
+
+/*----- strbuf life cycle -----*/
+extern void strbuf_init(struct strbuf *, size_t);
+extern void strbuf_release(struct strbuf *);
+extern char *strbuf_detach(struct strbuf *, size_t *);
+extern void strbuf_attach(struct strbuf *, void *, size_t, size_t);
+static inline void strbuf_swap(struct strbuf *a, struct strbuf *b) {
+	struct strbuf tmp = *a;
+	*a = *b;
+	*b = tmp;
+}
+
+/*----- strbuf size related -----*/
+static inline size_t strbuf_avail(const struct strbuf *sb) {
+	return sb->alloc ?
sb->alloc - sb->len - 1 : 0; +} + +extern void strbuf_grow(struct strbuf *, size_t); + +static inline void strbuf_setlen(struct strbuf *sb, size_t len) { +	if (!sb->alloc) +		strbuf_grow(sb, 0); +	assert(len < sb->alloc); +	sb->len = len; +	sb->buf[len] = '\0'; +} +#define strbuf_reset(sb)  strbuf_setlen(sb, 0) + +/*----- content related -----*/ +extern void strbuf_trim(struct strbuf *); +extern void strbuf_rtrim(struct strbuf *); +extern void strbuf_ltrim(struct strbuf *); +extern int strbuf_cmp(const struct strbuf *, const struct strbuf *); +extern void strbuf_tolower(struct strbuf *); + +extern struct strbuf **strbuf_split(const struct strbuf *, int delim); +extern void strbuf_list_free(struct strbuf **); + +/*----- add data in your buffer -----*/ +static inline void strbuf_addch(struct strbuf *sb, int c) { +	strbuf_grow(sb, 1); +	sb->buf[sb->len++] = c; +	sb->buf[sb->len] = '\0'; +} + +extern void strbuf_insert(struct strbuf *, size_t pos, const void *, size_t); +extern void strbuf_remove(struct strbuf *, size_t pos, size_t len); + +/* splice pos..pos+len with given data */ +extern void strbuf_splice(struct strbuf *, size_t pos, size_t len, +                          const void *, size_t); + +extern void strbuf_add(struct strbuf *, const void *, size_t); +static inline void strbuf_addstr(struct strbuf *sb, const char *s) { +	strbuf_add(sb, s, strlen(s)); +} +static inline void strbuf_addbuf(struct strbuf *sb, const struct strbuf *sb2) { +	strbuf_add(sb, sb2->buf, sb2->len); +} +extern void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len); + +typedef size_t (*expand_fn_t) (struct strbuf *sb, const char *placeholder, void *context); +extern void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn, void *context); +struct strbuf_expand_dict_entry { +	const char *placeholder; +	const char *value; +}; +extern size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder, void *context); + +__attribute__((format(printf,2,3))) +extern void strbuf_addf(struct strbuf *sb, const char *fmt, ...); + +extern size_t strbuf_fread(struct strbuf *, size_t, FILE *); +/* XXX: if read fails, any partial read is undone */ +extern ssize_t strbuf_read(struct strbuf *, int fd, size_t hint); +extern int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint); +extern int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint); + +extern int strbuf_getline(struct strbuf *, FILE *, int); + +extern void stripspace(struct strbuf *buf, int skip_comments); +extern int launch_editor(const char *path, struct strbuf *buffer, const char *const *env); + +extern int strbuf_branchname(struct strbuf *sb, const char *name); +extern int strbuf_check_branch_ref(struct strbuf *sb, const char *name); + +#endif /* STRBUF_H */ diff --git a/tools/perf/util/string.c b/tools/perf/util/string.c new file mode 100644 index 00000000000..ec33c0c7f4e --- /dev/null +++ b/tools/perf/util/string.c @@ -0,0 +1,34 @@ +#include "string.h" + +static int hex(char ch) +{ +	if ((ch >= '0') && (ch <= '9')) +		return ch - '0'; +	if ((ch >= 'a') && (ch <= 'f')) +		return ch - 'a' + 10; +	if ((ch >= 'A') && (ch <= 'F')) +		return ch - 'A' + 10; +	return -1; +} + +/* + * While we find nice hex chars, build a long_val. + * Return number of chars processed. 
+ */ +int hex2u64(const char *ptr, __u64 *long_val) +{ +	const char *p = ptr; +	*long_val = 0; + +	while (*p) { +		const int hex_val = hex(*p); + +		if (hex_val < 0) +			break; + +		*long_val = (*long_val << 4) | hex_val; +		p++; +	} + +	return p - ptr; +} diff --git a/tools/perf/util/string.h b/tools/perf/util/string.h new file mode 100644 index 00000000000..72812c1c9a7 --- /dev/null +++ b/tools/perf/util/string.h @@ -0,0 +1,8 @@ +#ifndef _PERF_STRING_H_ +#define _PERF_STRING_H_ + +#include <linux/types.h> + +int hex2u64(const char *ptr, __u64 *val); + +#endif diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c new file mode 100644 index 00000000000..49a55f81371 --- /dev/null +++ b/tools/perf/util/symbol.c @@ -0,0 +1,641 @@ +#include "util.h" +#include "../perf.h" +#include "string.h" +#include "symbol.h" + +#include <libelf.h> +#include <gelf.h> +#include <elf.h> + +const char *sym_hist_filter; + +static struct symbol *symbol__new(__u64 start, __u64 len, +				  const char *name, unsigned int priv_size, +				  __u64 obj_start, int verbose) +{ +	size_t namelen = strlen(name) + 1; +	struct symbol *self = calloc(1, priv_size + sizeof(*self) + namelen); + +	if (!self) +		return NULL; + +	if (verbose >= 2) +		printf("new symbol: %016Lx [%08lx]: %s, hist: %p, obj_start: %p\n", +			(__u64)start, (unsigned long)len, name, self->hist, (void *)(unsigned long)obj_start); + +	self->obj_start= obj_start; +	self->hist = NULL; +	self->hist_sum = 0; + +	if (sym_hist_filter && !strcmp(name, sym_hist_filter)) +		self->hist = calloc(sizeof(__u64), len); + +	if (priv_size) { +		memset(self, 0, priv_size); +		self = ((void *)self) + priv_size; +	} +	self->start = start; +	self->end   = start + len - 1; +	memcpy(self->name, name, namelen); + +	return self; +} + +static void symbol__delete(struct symbol *self, unsigned int priv_size) +{ +	free(((void *)self) - priv_size); +} + +static size_t symbol__fprintf(struct symbol *self, FILE *fp) +{ +	return fprintf(fp, " %llx-%llx %s\n", +		       self->start, self->end, self->name); +} + +struct dso *dso__new(const char *name, unsigned int sym_priv_size) +{ +	struct dso *self = malloc(sizeof(*self) + strlen(name) + 1); + +	if (self != NULL) { +		strcpy(self->name, name); +		self->syms = RB_ROOT; +		self->sym_priv_size = sym_priv_size; +		self->find_symbol = dso__find_symbol; +	} + +	return self; +} + +static void dso__delete_symbols(struct dso *self) +{ +	struct symbol *pos; +	struct rb_node *next = rb_first(&self->syms); + +	while (next) { +		pos = rb_entry(next, struct symbol, rb_node); +		next = rb_next(&pos->rb_node); +		rb_erase(&pos->rb_node, &self->syms); +		symbol__delete(pos, self->sym_priv_size); +	} +} + +void dso__delete(struct dso *self) +{ +	dso__delete_symbols(self); +	free(self); +} + +static void dso__insert_symbol(struct dso *self, struct symbol *sym) +{ +	struct rb_node **p = &self->syms.rb_node; +	struct rb_node *parent = NULL; +	const __u64 ip = sym->start; +	struct symbol *s; + +	while (*p != NULL) { +		parent = *p; +		s = rb_entry(parent, struct symbol, rb_node); +		if (ip < s->start) +			p = &(*p)->rb_left; +		else +			p = &(*p)->rb_right; +	} +	rb_link_node(&sym->rb_node, parent, p); +	rb_insert_color(&sym->rb_node, &self->syms); +} + +struct symbol *dso__find_symbol(struct dso *self, __u64 ip) +{ +	struct rb_node *n; + +	if (self == NULL) +		return NULL; + +	n = self->syms.rb_node; + +	while (n) { +		struct symbol *s = rb_entry(n, struct symbol, rb_node); + +		if (ip < s->start) +			n = n->rb_left; +		else if (ip > s->end) +			n 
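/* ----------------------------------------------------------------------
 * Editorial note (not part of the patch): hex2u64() above consumes the
 * leading hex digits and returns how many characters it used, which is
 * how the kallsyms and perf-map parsers below split "<start> <type> <name>"
 * lines.  A hypothetical standalone use:
 *
 *	#include <stdio.h>
 *	#include "string.h"
 *
 *	int main(void)
 *	{
 *		__u64 addr;
 *		const char *line = "c0400000 T _stext";
 *		int used = hex2u64(line, &addr);
 *
 *		// used == 8, addr == 0xc0400000, line + used + 1 == "T _stext"
 *		printf("%#llx '%s'\n", (unsigned long long)addr, line + used + 1);
 *		return 0;
 *	}
 * ---------------------------------------------------------------------- */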
= n->rb_right; +		else +			return s; +	} + +	return NULL; +} + +size_t dso__fprintf(struct dso *self, FILE *fp) +{ +	size_t ret = fprintf(fp, "dso: %s\n", self->name); + +	struct rb_node *nd; +	for (nd = rb_first(&self->syms); nd; nd = rb_next(nd)) { +		struct symbol *pos = rb_entry(nd, struct symbol, rb_node); +		ret += symbol__fprintf(pos, fp); +	} + +	return ret; +} + +static int dso__load_kallsyms(struct dso *self, symbol_filter_t filter, int verbose) +{ +	struct rb_node *nd, *prevnd; +	char *line = NULL; +	size_t n; +	FILE *file = fopen("/proc/kallsyms", "r"); + +	if (file == NULL) +		goto out_failure; + +	while (!feof(file)) { +		__u64 start; +		struct symbol *sym; +		int line_len, len; +		char symbol_type; + +		line_len = getline(&line, &n, file); +		if (line_len < 0) +			break; + +		if (!line) +			goto out_failure; + +		line[--line_len] = '\0'; /* \n */ + +		len = hex2u64(line, &start); + +		len++; +		if (len + 2 >= line_len) +			continue; + +		symbol_type = toupper(line[len]); +		/* +		 * We're interested only in code ('T'ext) +		 */ +		if (symbol_type != 'T' && symbol_type != 'W') +			continue; +		/* +		 * Well fix up the end later, when we have all sorted. +		 */ +		sym = symbol__new(start, 0xdead, line + len + 2, +				  self->sym_priv_size, 0, verbose); + +		if (sym == NULL) +			goto out_delete_line; + +		if (filter && filter(self, sym)) +			symbol__delete(sym, self->sym_priv_size); +		else +			dso__insert_symbol(self, sym); +	} + +	/* +	 * Now that we have all sorted out, just set the ->end of all +	 * symbols +	 */ +	prevnd = rb_first(&self->syms); + +	if (prevnd == NULL) +		goto out_delete_line; + +	for (nd = rb_next(prevnd); nd; nd = rb_next(nd)) { +		struct symbol *prev = rb_entry(prevnd, struct symbol, rb_node), +			      *curr = rb_entry(nd, struct symbol, rb_node); + +		prev->end = curr->start - 1; +		prevnd = nd; +	} + +	free(line); +	fclose(file); + +	return 0; + +out_delete_line: +	free(line); +out_failure: +	return -1; +} + +static int dso__load_perf_map(struct dso *self, symbol_filter_t filter, int verbose) +{ +	char *line = NULL; +	size_t n; +	FILE *file; +	int nr_syms = 0; + +	file = fopen(self->name, "r"); +	if (file == NULL) +		goto out_failure; + +	while (!feof(file)) { +		__u64 start, size; +		struct symbol *sym; +		int line_len, len; + +		line_len = getline(&line, &n, file); +		if (line_len < 0) +			break; + +		if (!line) +			goto out_failure; + +		line[--line_len] = '\0'; /* \n */ + +		len = hex2u64(line, &start); + +		len++; +		if (len + 2 >= line_len) +			continue; + +		len += hex2u64(line + len, &size); + +		len++; +		if (len + 2 >= line_len) +			continue; + +		sym = symbol__new(start, size, line + len, +				  self->sym_priv_size, start, verbose); + +		if (sym == NULL) +			goto out_delete_line; + +		if (filter && filter(self, sym)) +			symbol__delete(sym, self->sym_priv_size); +		else { +			dso__insert_symbol(self, sym); +			nr_syms++; +		} +	} + +	free(line); +	fclose(file); + +	return nr_syms; + +out_delete_line: +	free(line); +out_failure: +	return -1; +} + +/** + * elf_symtab__for_each_symbol - iterate thru all the symbols + * + * @self: struct elf_symtab instance to iterate + * @index: uint32_t index + * @sym: GElf_Sym iterator + */ +#define elf_symtab__for_each_symbol(syms, nr_syms, index, sym) \ +	for (index = 0, gelf_getsym(syms, index, &sym);\ +	     index < nr_syms; \ +	     index++, gelf_getsym(syms, index, &sym)) + +static inline uint8_t elf_sym__type(const GElf_Sym *sym) +{ +	return GELF_ST_TYPE(sym->st_info); +} + +static inline int 
elf_sym__is_function(const GElf_Sym *sym) +{ +	return elf_sym__type(sym) == STT_FUNC && +	       sym->st_name != 0 && +	       sym->st_shndx != SHN_UNDEF && +	       sym->st_size != 0; +} + +static inline const char *elf_sym__name(const GElf_Sym *sym, +					const Elf_Data *symstrs) +{ +	return symstrs->d_buf + sym->st_name; +} + +static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep, +				    GElf_Shdr *shp, const char *name, +				    size_t *index) +{ +	Elf_Scn *sec = NULL; +	size_t cnt = 1; + +	while ((sec = elf_nextscn(elf, sec)) != NULL) { +		char *str; + +		gelf_getshdr(sec, shp); +		str = elf_strptr(elf, ep->e_shstrndx, shp->sh_name); +		if (!strcmp(name, str)) { +			if (index) +				*index = cnt; +			break; +		} +		++cnt; +	} + +	return sec; +} + +#define elf_section__for_each_rel(reldata, pos, pos_mem, idx, nr_entries) \ +	for (idx = 0, pos = gelf_getrel(reldata, 0, &pos_mem); \ +	     idx < nr_entries; \ +	     ++idx, pos = gelf_getrel(reldata, idx, &pos_mem)) + +#define elf_section__for_each_rela(reldata, pos, pos_mem, idx, nr_entries) \ +	for (idx = 0, pos = gelf_getrela(reldata, 0, &pos_mem); \ +	     idx < nr_entries; \ +	     ++idx, pos = gelf_getrela(reldata, idx, &pos_mem)) + +static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf, +				       GElf_Ehdr *ehdr, Elf_Scn *scn_dynsym, +				       GElf_Shdr *shdr_dynsym, +				       size_t dynsym_idx, int verbose) +{ +	uint32_t nr_rel_entries, idx; +	GElf_Sym sym; +	__u64 plt_offset; +	GElf_Shdr shdr_plt; +	struct symbol *f; +	GElf_Shdr shdr_rel_plt; +	Elf_Data *reldata, *syms, *symstrs; +	Elf_Scn *scn_plt_rel, *scn_symstrs; +	char sympltname[1024]; +	int nr = 0, symidx; + +	scn_plt_rel = elf_section_by_name(elf, ehdr, &shdr_rel_plt, +					  ".rela.plt", NULL); +	if (scn_plt_rel == NULL) { +		scn_plt_rel = elf_section_by_name(elf, ehdr, &shdr_rel_plt, +						  ".rel.plt", NULL); +		if (scn_plt_rel == NULL) +			return 0; +	} + +	if (shdr_rel_plt.sh_link != dynsym_idx) +		return 0; + +	if (elf_section_by_name(elf, ehdr, &shdr_plt, ".plt", NULL) == NULL) +		return 0; + +	/* +	 * Fetch the relocation section to find the indexes to the GOT +	 * and the symbols in the .dynsym they refer to. 
+ */
+	reldata = elf_getdata(scn_plt_rel, NULL);
+	if (reldata == NULL)
+		return -1;
+
+	syms = elf_getdata(scn_dynsym, NULL);
+	if (syms == NULL)
+		return -1;
+
+	scn_symstrs = elf_getscn(elf, shdr_dynsym->sh_link);
+	if (scn_symstrs == NULL)
+		return -1;
+
+	symstrs = elf_getdata(scn_symstrs, NULL);
+	if (symstrs == NULL)
+		return -1;
+
+	nr_rel_entries = shdr_rel_plt.sh_size / shdr_rel_plt.sh_entsize;
+	plt_offset = shdr_plt.sh_offset;
+
+	if (shdr_rel_plt.sh_type == SHT_RELA) {
+		GElf_Rela pos_mem, *pos;
+
+		elf_section__for_each_rela(reldata, pos, pos_mem, idx,
+					   nr_rel_entries) {
+			symidx = GELF_R_SYM(pos->r_info);
+			plt_offset += shdr_plt.sh_entsize;
+			gelf_getsym(syms, symidx, &sym);
+			snprintf(sympltname, sizeof(sympltname),
+				 "%s@plt", elf_sym__name(&sym, symstrs));
+
+			f = symbol__new(plt_offset, shdr_plt.sh_entsize,
+					sympltname, self->sym_priv_size, 0, verbose);
+			if (!f)
+				return -1;
+
+			dso__insert_symbol(self, f);
+			++nr;
+		}
+	} else if (shdr_rel_plt.sh_type == SHT_REL) {
+		GElf_Rel pos_mem, *pos;
+		elf_section__for_each_rel(reldata, pos, pos_mem, idx,
+					  nr_rel_entries) {
+			symidx = GELF_R_SYM(pos->r_info);
+			plt_offset += shdr_plt.sh_entsize;
+			gelf_getsym(syms, symidx, &sym);
+			snprintf(sympltname, sizeof(sympltname),
+				 "%s@plt", elf_sym__name(&sym, symstrs));
+
+			f = symbol__new(plt_offset, shdr_plt.sh_entsize,
+					sympltname, self->sym_priv_size, 0, verbose);
+			if (!f)
+				return -1;
+
+			dso__insert_symbol(self, f);
+			++nr;
+		}
+	} else {
+		/*
+		 * TODO: There is still one more shdr_rel_plt.sh_type
+		 * to investigate; it probably should be ignored.
+		 */
+	}
+
+	return nr;
+}
+
+static int dso__load_sym(struct dso *self, int fd, const char *name,
+			 symbol_filter_t filter, int verbose)
+{
+	Elf_Data *symstrs;
+	uint32_t nr_syms;
+	int err = -1;
+	uint32_t index;
+	GElf_Ehdr ehdr;
+	GElf_Shdr shdr;
+	Elf_Data *syms;
+	GElf_Sym sym;
+	Elf_Scn *sec, *sec_dynsym;
+	Elf *elf;
+	size_t dynsym_idx;
+	int nr = 0;
+
+	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
+	if (elf == NULL) {
+		if (verbose)
+			fprintf(stderr, "%s: cannot read %s ELF file.\n",
+				__func__, name);
+		goto out_close;
+	}
+
+	if (gelf_getehdr(elf, &ehdr) == NULL) {
+		if (verbose)
+			fprintf(stderr, "%s: cannot get elf header.\n", __func__);
+		goto out_elf_end;
+	}
+
+	/*
+	 * We need to check if we have a .dynsym, so that we can handle the
+	 * .plt, synthesizing its symbols, which aren't in the symtabs (be it
+	 * .dynsym or .symtab)
+	 */
+	sec_dynsym = elf_section_by_name(elf, &ehdr, &shdr,
+					 ".dynsym", &dynsym_idx);
+	if (sec_dynsym != NULL) {
+		nr = dso__synthesize_plt_symbols(self, elf, &ehdr,
+						 sec_dynsym, &shdr,
+						 dynsym_idx, verbose);
+		if (nr < 0)
+			goto out_elf_end;
+	}
+
+	/*
+	 * But if we have a full .symtab (that is a superset of .dynsym) we
+	 * should add the symbols not in the .dynsym
+	 */
+	sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
+	if (sec == NULL) {
+		if (sec_dynsym == NULL)
+			goto out_elf_end;
+
+		sec = sec_dynsym;
+		gelf_getshdr(sec, &shdr);
+	}
+
+	syms = elf_getdata(sec, NULL);
+	if (syms == NULL)
+		goto out_elf_end;
+
+	sec = elf_getscn(elf, shdr.sh_link);
+	if (sec == NULL)
+		goto out_elf_end;
+
+	symstrs = elf_getdata(sec, NULL);
+	if (symstrs == NULL)
+		goto out_elf_end;
+
+	nr_syms = shdr.sh_size / shdr.sh_entsize;
+
+	memset(&sym, 0, sizeof(sym));
+
+	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
+		struct symbol *f;
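		/*
		 * Editorial note (not part of the patch): for every STT_FUNC
		 * entry, the loop below remembers the raw st_value as obj_start
		 * (used later for annotation) and then rebases st_value from a
		 * virtual address to a file offset by subtracting the owning
		 * section's (sh_addr - sh_offset) before creating the symbol.
		 */
+		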
__u64 obj_start; + +		if (!elf_sym__is_function(&sym)) +			continue; + +		sec = elf_getscn(elf, sym.st_shndx); +		if (!sec) +			goto out_elf_end; + +		gelf_getshdr(sec, &shdr); +		obj_start = sym.st_value; + +		sym.st_value -= shdr.sh_addr - shdr.sh_offset; + +		f = symbol__new(sym.st_value, sym.st_size, +				elf_sym__name(&sym, symstrs), +				self->sym_priv_size, obj_start, verbose); +		if (!f) +			goto out_elf_end; + +		if (filter && filter(self, f)) +			symbol__delete(f, self->sym_priv_size); +		else { +			dso__insert_symbol(self, f); +			nr++; +		} +	} + +	err = nr; +out_elf_end: +	elf_end(elf); +out_close: +	return err; +} + +int dso__load(struct dso *self, symbol_filter_t filter, int verbose) +{ +	int size = strlen(self->name) + sizeof("/usr/lib/debug%s.debug"); +	char *name = malloc(size); +	int variant = 0; +	int ret = -1; +	int fd; + +	if (!name) +		return -1; + +	if (strncmp(self->name, "/tmp/perf-", 10) == 0) +		return dso__load_perf_map(self, filter, verbose); + +more: +	do { +		switch (variant) { +		case 0: /* Fedora */ +			snprintf(name, size, "/usr/lib/debug%s.debug", self->name); +			break; +		case 1: /* Ubuntu */ +			snprintf(name, size, "/usr/lib/debug%s", self->name); +			break; +		case 2: /* Sane people */ +			snprintf(name, size, "%s", self->name); +			break; + +		default: +			goto out; +		} +		variant++; + +		fd = open(name, O_RDONLY); +	} while (fd < 0); + +	ret = dso__load_sym(self, fd, name, filter, verbose); +	close(fd); + +	/* +	 * Some people seem to have debuginfo files _WITHOUT_ debug info!?!? +	 */ +	if (!ret) +		goto more; + +out: +	free(name); +	return ret; +} + +static int dso__load_vmlinux(struct dso *self, const char *vmlinux, +			     symbol_filter_t filter, int verbose) +{ +	int err, fd = open(vmlinux, O_RDONLY); + +	if (fd < 0) +		return -1; + +	err = dso__load_sym(self, fd, vmlinux, filter, verbose); +	close(fd); + +	return err; +} + +int dso__load_kernel(struct dso *self, const char *vmlinux, +		     symbol_filter_t filter, int verbose) +{ +	int err = -1; + +	if (vmlinux) +		err = dso__load_vmlinux(self, vmlinux, filter, verbose); + +	if (err) +		err = dso__load_kallsyms(self, filter, verbose); + +	return err; +} + +void symbol__init(void) +{ +	elf_version(EV_CURRENT); +} diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h new file mode 100644 index 00000000000..0d1292bd827 --- /dev/null +++ b/tools/perf/util/symbol.h @@ -0,0 +1,47 @@ +#ifndef _PERF_SYMBOL_ +#define _PERF_SYMBOL_ 1 + +#include <linux/types.h> +#include "list.h" +#include "rbtree.h" + +struct symbol { +	struct rb_node	rb_node; +	__u64		start; +	__u64		end; +	__u64		obj_start; +	__u64		hist_sum; +	__u64		*hist; +	char		name[0]; +}; + +struct dso { +	struct list_head node; +	struct rb_root	 syms; +	unsigned int	 sym_priv_size; +	struct symbol    *(*find_symbol)(struct dso *, __u64 ip); +	char		 name[0]; +}; + +const char *sym_hist_filter; + +typedef int (*symbol_filter_t)(struct dso *self, struct symbol *sym); + +struct dso *dso__new(const char *name, unsigned int sym_priv_size); +void dso__delete(struct dso *self); + +static inline void *dso__sym_priv(struct dso *self, struct symbol *sym) +{ +	return ((void *)sym) - self->sym_priv_size; +} + +struct symbol *dso__find_symbol(struct dso *self, __u64 ip); + +int dso__load_kernel(struct dso *self, const char *vmlinux, +		     symbol_filter_t filter, int verbose); +int dso__load(struct dso *self, symbol_filter_t filter, int verbose); + +size_t dso__fprintf(struct dso *self, FILE *fp); + +void symbol__init(void); +#endif /* 
_PERF_SYMBOL_ */ diff --git a/tools/perf/util/usage.c b/tools/perf/util/usage.c new file mode 100644 index 00000000000..e16bf9a707e --- /dev/null +++ b/tools/perf/util/usage.c @@ -0,0 +1,80 @@ +/* + * GIT - The information manager from hell + * + * Copyright (C) Linus Torvalds, 2005 + */ +#include "util.h" + +static void report(const char *prefix, const char *err, va_list params) +{ +	char msg[1024]; +	vsnprintf(msg, sizeof(msg), err, params); +	fprintf(stderr, " %s%s\n", prefix, msg); +} + +static NORETURN void usage_builtin(const char *err) +{ +	fprintf(stderr, "\n Usage: %s\n", err); +	exit(129); +} + +static NORETURN void die_builtin(const char *err, va_list params) +{ +	report(" Fatal: ", err, params); +	exit(128); +} + +static void error_builtin(const char *err, va_list params) +{ +	report(" Error: ", err, params); +} + +static void warn_builtin(const char *warn, va_list params) +{ +	report(" Warning: ", warn, params); +} + +/* If we are in a dlopen()ed .so write to a global variable would segfault + * (ugh), so keep things static. */ +static void (*usage_routine)(const char *err) NORETURN = usage_builtin; +static void (*die_routine)(const char *err, va_list params) NORETURN = die_builtin; +static void (*error_routine)(const char *err, va_list params) = error_builtin; +static void (*warn_routine)(const char *err, va_list params) = warn_builtin; + +void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN) +{ +	die_routine = routine; +} + +void usage(const char *err) +{ +	usage_routine(err); +} + +void die(const char *err, ...) +{ +	va_list params; + +	va_start(params, err); +	die_routine(err, params); +	va_end(params); +} + +int error(const char *err, ...) +{ +	va_list params; + +	va_start(params, err); +	error_routine(err, params); +	va_end(params); +	return -1; +} + +void warning(const char *warn, ...) +{ +	va_list params; + +	va_start(params, warn); +	warn_routine(warn, params); +	va_end(params); +} diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h new file mode 100644 index 00000000000..76590a16c27 --- /dev/null +++ b/tools/perf/util/util.h @@ -0,0 +1,410 @@ +#ifndef GIT_COMPAT_UTIL_H +#define GIT_COMPAT_UTIL_H + +#define _FILE_OFFSET_BITS 64 + +#ifndef FLEX_ARRAY +/* + * See if our compiler is known to support flexible array members. + */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) +# define FLEX_ARRAY /* empty */ +#elif defined(__GNUC__) +# if (__GNUC__ >= 3) +#  define FLEX_ARRAY /* empty */ +# else +#  define FLEX_ARRAY 0 /* older GNU extension */ +# endif +#endif + +/* + * Otherwise, default to safer but a bit wasteful traditional style + */ +#ifndef FLEX_ARRAY +# define FLEX_ARRAY 1 +#endif +#endif + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) + +#ifdef __GNUC__ +#define TYPEOF(x) (__typeof__(x)) +#else +#define TYPEOF(x) +#endif + +#define MSB(x, bits) ((x) & TYPEOF(x)(~0ULL << (sizeof(x) * 8 - (bits)))) +#define HAS_MULTI_BITS(i)  ((i) & ((i) - 1))  /* checks if an integer has more than 1 bit set */ + +/* Approximation of the length of the decimal representation of this type. 
*/ +#define decimal_length(x)	((int)(sizeof(x) * 2.56 + 0.5) + 1) + +#if !defined(__APPLE__) && !defined(__FreeBSD__)  && !defined(__USLC__) && !defined(_M_UNIX) +#define _XOPEN_SOURCE 600 /* glibc2 and AIX 5.3L need 500, OpenBSD needs 600 for S_ISLNK() */ +#define _XOPEN_SOURCE_EXTENDED 1 /* AIX 5.3L needs this */ +#endif +#define _ALL_SOURCE 1 +#define _GNU_SOURCE 1 +#define _BSD_SOURCE 1 + +#include <unistd.h> +#include <stdio.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <stddef.h> +#include <stdlib.h> +#include <stdarg.h> +#include <string.h> +#include <errno.h> +#include <limits.h> +#include <sys/param.h> +#include <sys/types.h> +#include <dirent.h> +#include <sys/time.h> +#include <time.h> +#include <signal.h> +#include <fnmatch.h> +#include <assert.h> +#include <regex.h> +#include <utime.h> +#ifndef __MINGW32__ +#include <sys/wait.h> +#include <sys/poll.h> +#include <sys/socket.h> +#include <sys/ioctl.h> +#ifndef NO_SYS_SELECT_H +#include <sys/select.h> +#endif +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <arpa/inet.h> +#include <netdb.h> +#include <pwd.h> +#include <inttypes.h> +#if defined(__CYGWIN__) +#undef _XOPEN_SOURCE +#include <grp.h> +#define _XOPEN_SOURCE 600 +#include "compat/cygwin.h" +#else +#undef _ALL_SOURCE /* AIX 5.3L defines a struct list with _ALL_SOURCE. */ +#include <grp.h> +#define _ALL_SOURCE 1 +#endif +#else 	/* __MINGW32__ */ +/* pull in Windows compatibility stuff */ +#include "compat/mingw.h" +#endif	/* __MINGW32__ */ + +#ifndef NO_ICONV +#include <iconv.h> +#endif + +#ifndef NO_OPENSSL +#include <openssl/ssl.h> +#include <openssl/err.h> +#endif + +/* On most systems <limits.h> would have given us this, but + * not on some systems (e.g. GNU/Hurd). + */ +#ifndef PATH_MAX +#define PATH_MAX 4096 +#endif + +#ifndef PRIuMAX +#define PRIuMAX "llu" +#endif + +#ifndef PRIu32 +#define PRIu32 "u" +#endif + +#ifndef PRIx32 +#define PRIx32 "x" +#endif + +#ifndef PATH_SEP +#define PATH_SEP ':' +#endif + +#ifndef STRIP_EXTENSION +#define STRIP_EXTENSION "" +#endif + +#ifndef has_dos_drive_prefix +#define has_dos_drive_prefix(path) 0 +#endif + +#ifndef is_dir_sep +#define is_dir_sep(c) ((c) == '/') +#endif + +#ifdef __GNUC__ +#define NORETURN __attribute__((__noreturn__)) +#else +#define NORETURN +#ifndef __attribute__ +#define __attribute__(x) +#endif +#endif + +/* General helper functions */ +extern void usage(const char *err) NORETURN; +extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2))); +extern int error(const char *err, ...) __attribute__((format (printf, 1, 2))); +extern void warning(const char *err, ...) __attribute__((format (printf, 1, 2))); + +extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN); + +extern int prefixcmp(const char *str, const char *prefix); +extern time_t tm_to_time_t(const struct tm *tm); + +static inline const char *skip_prefix(const char *str, const char *prefix) +{ +	size_t len = strlen(prefix); +	return strncmp(str, prefix, len) ? 
NULL : str + len; +} + +#if defined(NO_MMAP) || defined(USE_WIN32_MMAP) + +#ifndef PROT_READ +#define PROT_READ 1 +#define PROT_WRITE 2 +#define MAP_PRIVATE 1 +#define MAP_FAILED ((void*)-1) +#endif + +#define mmap git_mmap +#define munmap git_munmap +extern void *git_mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset); +extern int git_munmap(void *start, size_t length); + +#else /* NO_MMAP || USE_WIN32_MMAP */ + +#include <sys/mman.h> + +#endif /* NO_MMAP || USE_WIN32_MMAP */ + +#ifdef NO_MMAP + +/* This value must be multiple of (pagesize * 2) */ +#define DEFAULT_PACKED_GIT_WINDOW_SIZE (1 * 1024 * 1024) + +#else /* NO_MMAP */ + +/* This value must be multiple of (pagesize * 2) */ +#define DEFAULT_PACKED_GIT_WINDOW_SIZE \ +	(sizeof(void*) >= 8 \ +		?  1 * 1024 * 1024 * 1024 \ +		: 32 * 1024 * 1024) + +#endif /* NO_MMAP */ + +#ifdef NO_ST_BLOCKS_IN_STRUCT_STAT +#define on_disk_bytes(st) ((st).st_size) +#else +#define on_disk_bytes(st) ((st).st_blocks * 512) +#endif + +#define DEFAULT_PACKED_GIT_LIMIT \ +	((1024L * 1024L) * (sizeof(void*) >= 8 ? 8192 : 256)) + +#ifdef NO_PREAD +#define pread git_pread +extern ssize_t git_pread(int fd, void *buf, size_t count, off_t offset); +#endif +/* + * Forward decl that will remind us if its twin in cache.h changes. + * This function is used in compat/pread.c.  But we can't include + * cache.h there. + */ +extern ssize_t read_in_full(int fd, void *buf, size_t count); + +#ifdef NO_SETENV +#define setenv gitsetenv +extern int gitsetenv(const char *, const char *, int); +#endif + +#ifdef NO_MKDTEMP +#define mkdtemp gitmkdtemp +extern char *gitmkdtemp(char *); +#endif + +#ifdef NO_UNSETENV +#define unsetenv gitunsetenv +extern void gitunsetenv(const char *); +#endif + +#ifdef NO_STRCASESTR +#define strcasestr gitstrcasestr +extern char *gitstrcasestr(const char *haystack, const char *needle); +#endif + +#ifdef NO_STRLCPY +#define strlcpy gitstrlcpy +extern size_t gitstrlcpy(char *, const char *, size_t); +#endif + +#ifdef NO_STRTOUMAX +#define strtoumax gitstrtoumax +extern uintmax_t gitstrtoumax(const char *, char **, int); +#endif + +#ifdef NO_HSTRERROR +#define hstrerror githstrerror +extern const char *githstrerror(int herror); +#endif + +#ifdef NO_MEMMEM +#define memmem gitmemmem +void *gitmemmem(const void *haystack, size_t haystacklen, +                const void *needle, size_t needlelen); +#endif + +#ifdef FREAD_READS_DIRECTORIES +#ifdef fopen +#undef fopen +#endif +#define fopen(a,b) git_fopen(a,b) +extern FILE *git_fopen(const char*, const char*); +#endif + +#ifdef SNPRINTF_RETURNS_BOGUS +#define snprintf git_snprintf +extern int git_snprintf(char *str, size_t maxsize, +			const char *format, ...); +#define vsnprintf git_vsnprintf +extern int git_vsnprintf(char *str, size_t maxsize, +			 const char *format, va_list ap); +#endif + +#ifdef __GLIBC_PREREQ +#if __GLIBC_PREREQ(2, 1) +#define HAVE_STRCHRNUL +#endif +#endif + +#ifndef HAVE_STRCHRNUL +#define strchrnul gitstrchrnul +static inline char *gitstrchrnul(const char *s, int c) +{ +	while (*s && *s != c) +		s++; +	return (char *)s; +} +#endif + +/* + * Wrappers: + */ +extern char *xstrdup(const char *str); +extern void *xmalloc(size_t size); +extern void *xmemdupz(const void *data, size_t len); +extern char *xstrndup(const char *str, size_t len); +extern void *xrealloc(void *ptr, size_t size); +extern void *xcalloc(size_t nmemb, size_t size); +extern void *xmmap(void *start, size_t length, int prot, int flags, int fd, off_t offset); +extern ssize_t xread(int fd, void 
*buf, size_t len); +extern ssize_t xwrite(int fd, const void *buf, size_t len); +extern int xdup(int fd); +extern FILE *xfdopen(int fd, const char *mode); +extern int xmkstemp(char *template); + +static inline size_t xsize_t(off_t len) +{ +	return (size_t)len; +} + +static inline int has_extension(const char *filename, const char *ext) +{ +	size_t len = strlen(filename); +	size_t extlen = strlen(ext); +	return len > extlen && !memcmp(filename + len - extlen, ext, extlen); +} + +/* Sane ctype - no locale, and works with signed chars */ +#undef isascii +#undef isspace +#undef isdigit +#undef isalpha +#undef isalnum +#undef tolower +#undef toupper +extern unsigned char sane_ctype[256]; +#define GIT_SPACE 0x01 +#define GIT_DIGIT 0x02 +#define GIT_ALPHA 0x04 +#define GIT_GLOB_SPECIAL 0x08 +#define GIT_REGEX_SPECIAL 0x10 +#define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0) +#define isascii(x) (((x) & ~0x7f) == 0) +#define isspace(x) sane_istest(x,GIT_SPACE) +#define isdigit(x) sane_istest(x,GIT_DIGIT) +#define isalpha(x) sane_istest(x,GIT_ALPHA) +#define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT) +#define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL) +#define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL) +#define tolower(x) sane_case((unsigned char)(x), 0x20) +#define toupper(x) sane_case((unsigned char)(x), 0) + +static inline int sane_case(int x, int high) +{ +	if (sane_istest(x, GIT_ALPHA)) +		x = (x & ~0x20) | high; +	return x; +} + +static inline int strtoul_ui(char const *s, int base, unsigned int *result) +{ +	unsigned long ul; +	char *p; + +	errno = 0; +	ul = strtoul(s, &p, base); +	if (errno || *p || p == s || (unsigned int) ul != ul) +		return -1; +	*result = ul; +	return 0; +} + +static inline int strtol_i(char const *s, int base, int *result) +{ +	long ul; +	char *p; + +	errno = 0; +	ul = strtol(s, &p, base); +	if (errno || *p || p == s || (int) ul != ul) +		return -1; +	*result = ul; +	return 0; +} + +#ifdef INTERNAL_QSORT +void git_qsort(void *base, size_t nmemb, size_t size, +	       int(*compar)(const void *, const void *)); +#define qsort git_qsort +#endif + +#ifndef DIR_HAS_BSD_GROUP_SEMANTICS +# define FORCE_DIR_SET_GID S_ISGID +#else +# define FORCE_DIR_SET_GID 0 +#endif + +#ifdef NO_NSEC +#undef USE_NSEC +#define ST_CTIME_NSEC(st) 0 +#define ST_MTIME_NSEC(st) 0 +#else +#ifdef USE_ST_TIMESPEC +#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctimespec.tv_nsec)) +#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtimespec.tv_nsec)) +#else +#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctim.tv_nsec)) +#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtim.tv_nsec)) +#endif +#endif + +#endif diff --git a/tools/perf/util/wrapper.c b/tools/perf/util/wrapper.c new file mode 100644 index 00000000000..6350d65f6d9 --- /dev/null +++ b/tools/perf/util/wrapper.c @@ -0,0 +1,206 @@ +/* + * Various trivial helper wrappers around standard functions + */ +#include "cache.h" + +/* + * There's no pack memory to release - but stay close to the Git + * version so wrap this away: + */ +static inline void release_pack_memory(size_t size, int flag) +{ +} + +char *xstrdup(const char *str) +{ +	char *ret = strdup(str); +	if (!ret) { +		release_pack_memory(strlen(str) + 1, -1); +		ret = strdup(str); +		if (!ret) +			die("Out of memory, strdup failed"); +	} +	return ret; +} + +void *xmalloc(size_t size) +{ +	void *ret = malloc(size); +	if (!ret && !size) +		ret = malloc(1); +	if (!ret) { +		release_pack_memory(size, -1); +		ret = 
malloc(size);
+		if (!ret && !size)
+			ret = malloc(1);
+		if (!ret)
+			die("Out of memory, malloc failed");
+	}
+#ifdef XMALLOC_POISON
+	memset(ret, 0xA5, size);
+#endif
+	return ret;
+}
+
+/*
+ * xmemdupz() allocates (len + 1) bytes of memory, duplicates "len" bytes of
+ * "data" to the allocated memory, zero terminates the allocated memory,
+ * and returns a pointer to the allocated memory. If the allocation fails,
+ * the program dies.
+ */
+void *xmemdupz(const void *data, size_t len)
+{
+	char *p = xmalloc(len + 1);
+	memcpy(p, data, len);
+	p[len] = '\0';
+	return p;
+}
+
+char *xstrndup(const char *str, size_t len)
+{
+	char *p = memchr(str, '\0', len);
+	return xmemdupz(str, p ? p - str : len);
+}
+
+void *xrealloc(void *ptr, size_t size)
+{
+	void *ret = realloc(ptr, size);
+	if (!ret && !size)
+		ret = realloc(ptr, 1);
+	if (!ret) {
+		release_pack_memory(size, -1);
+		ret = realloc(ptr, size);
+		if (!ret && !size)
+			ret = realloc(ptr, 1);
+		if (!ret)
+			die("Out of memory, realloc failed");
+	}
+	return ret;
+}
+
+void *xcalloc(size_t nmemb, size_t size)
+{
+	void *ret = calloc(nmemb, size);
+	if (!ret && (!nmemb || !size))
+		ret = calloc(1, 1);
+	if (!ret) {
+		release_pack_memory(nmemb * size, -1);
+		ret = calloc(nmemb, size);
+		if (!ret && (!nmemb || !size))
+			ret = calloc(1, 1);
+		if (!ret)
+			die("Out of memory, calloc failed");
+	}
+	return ret;
+}
+
+void *xmmap(void *start, size_t length,
+	int prot, int flags, int fd, off_t offset)
+{
+	void *ret = mmap(start, length, prot, flags, fd, offset);
+	if (ret == MAP_FAILED) {
+		if (!length)
+			return NULL;
+		release_pack_memory(length, fd);
+		ret = mmap(start, length, prot, flags, fd, offset);
+		if (ret == MAP_FAILED)
+			die("Out of memory? mmap failed: %s", strerror(errno));
+	}
+	return ret;
+}
+
+/*
+ * xread() is the same as read(), but it automatically restarts read()
+ * operations after a recoverable error (EAGAIN and EINTR). xread()
+ * DOES NOT GUARANTEE that "len" bytes are read even if the data is available.
+ */
+ssize_t xread(int fd, void *buf, size_t len)
+{
+	ssize_t nr;
+	while (1) {
+		nr = read(fd, buf, len);
+		if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
+			continue;
+		return nr;
+	}
+}
+
+/*
+ * xwrite() is the same as write(), but it automatically restarts write()
+ * operations after a recoverable error (EAGAIN and EINTR). xwrite() DOES NOT
+ * GUARANTEE that "len" bytes are written even if the operation is successful.
+ */
+ssize_t xwrite(int fd, const void *buf, size_t len)
+{
+	ssize_t nr;
+	while (1) {
+		nr = write(fd, buf, len);
+		if ((nr < 0) && (errno == EAGAIN || errno == EINTR))
+			continue;
+		return nr;
+	}
+}
+
+ssize_t read_in_full(int fd, void *buf, size_t count)
+{
+	char *p = buf;
+	ssize_t total = 0;
+
+	while (count > 0) {
+		ssize_t loaded = xread(fd, p, count);
+		if (loaded <= 0)
+			return total ?
total : loaded; +		count -= loaded; +		p += loaded; +		total += loaded; +	} + +	return total; +} + +ssize_t write_in_full(int fd, const void *buf, size_t count) +{ +	const char *p = buf; +	ssize_t total = 0; + +	while (count > 0) { +		ssize_t written = xwrite(fd, p, count); +		if (written < 0) +			return -1; +		if (!written) { +			errno = ENOSPC; +			return -1; +		} +		count -= written; +		p += written; +		total += written; +	} + +	return total; +} + +int xdup(int fd) +{ +	int ret = dup(fd); +	if (ret < 0) +		die("dup failed: %s", strerror(errno)); +	return ret; +} + +FILE *xfdopen(int fd, const char *mode) +{ +	FILE *stream = fdopen(fd, mode); +	if (stream == NULL) +		die("Out of memory? fdopen failed: %s", strerror(errno)); +	return stream; +} + +int xmkstemp(char *template) +{ +	int fd; + +	fd = mkstemp(template); +	if (fd < 0) +		die("Unable to create temporary file: %s", strerror(errno)); +	return fd; +}  |
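
Editorial sketch to close the listing (not part of the patch): read_in_full() above layers on xread() so that EINTR restarts and short reads stay invisible to callers that need an exact byte count. copy_fd_to_file() below is a hypothetical helper built only on the wrappers shown above, assuming cache.h pulls in their declarations as the other files in this directory do.

#include <stdio.h>
#include <errno.h>
#include <string.h>
#include "cache.h"	/* declares xread(), read_in_full(), error() in this tree */

/* hypothetical: drain up to 'len' bytes from fd into a stdio stream */
static int copy_fd_to_file(int fd, FILE *out, size_t len)
{
	char buf[4096];

	while (len > 0) {
		size_t want = len < sizeof(buf) ? len : sizeof(buf);
		ssize_t got = read_in_full(fd, buf, want);	/* hides EINTR and short reads */

		if (got < 0)
			return error("read failed: %s", strerror(errno));
		if (!got)
			break;				/* EOF before 'len' bytes */
		if (fwrite(buf, 1, got, out) != (size_t)got)
			return error("fwrite failed");
		len -= got;
	}
	return 0;
}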