Diffstat (limited to 'arch/x86/kernel/ds.c')
 -rw-r--r--   arch/x86/kernel/ds.c   921
 1 file changed, 663 insertions, 258 deletions
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index 87b67e3a765..48bfe138603 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -19,45 +19,61 @@
  * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
  */
-
-#include <asm/ds.h>
-
-#include <linux/errno.h>
+#include <linux/kernel.h>
 #include <linux/string.h>
-#include <linux/slab.h>
+#include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/mm.h>
-#include <linux/kernel.h>
+#include <linux/trace_clock.h>
+
+#include <asm/ds.h>
+#include "ds_selftest.h"
 
 /*
- * The configuration for a particular DS hardware implementation.
+ * The configuration for a particular DS hardware implementation:
  */
 struct ds_configuration {
-	/* the name of the configuration */
-	const char *name;
-	/* the size of one pointer-typed field in the DS structure and
-	   in the BTS and PEBS buffers in bytes;
-	   this covers the first 8 DS fields related to buffer management. */
-	unsigned char  sizeof_field;
-	/* the size of a BTS/PEBS record in bytes */
-	unsigned char  sizeof_rec[2];
-	/* a series of bit-masks to control various features indexed
-	 * by enum ds_feature */
-	unsigned long ctl[dsf_ctl_max];
+	/* The name of the configuration: */
+	const char		*name;
+
+	/* The size of pointer-typed fields in DS, BTS, and PEBS: */
+	unsigned char		sizeof_ptr_field;
+
+	/* The size of a BTS/PEBS record in bytes: */
+	unsigned char		sizeof_rec[2];
+
+	/* The number of pebs counter reset values in the DS structure. */
+	unsigned char		nr_counter_reset;
+
+	/* Control bit-masks indexed by enum ds_feature: */
+	unsigned long		ctl[dsf_ctl_max];
 };
-static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
+static struct ds_configuration ds_cfg __read_mostly;
+
+
+/* Maximal size of a DS configuration: */
+#define MAX_SIZEOF_DS		0x80
 
-#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
+/* Maximal size of a BTS record: */
+#define MAX_SIZEOF_BTS		(3 * 8)
 
-#define MAX_SIZEOF_DS (12 * 8)	/* maximal size of a DS configuration */
-#define MAX_SIZEOF_BTS (3 * 8)	/* maximal size of a BTS record */
-#define DS_ALIGNMENT (1 << 3)	/* BTS and PEBS buffer alignment */
+/* BTS and PEBS buffer alignment: */
+#define DS_ALIGNMENT		(1 << 3)
 
-#define BTS_CONTROL \
- (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
-  ds_cfg.ctl[dsf_bts_overflow])
+/* Number of buffer pointers in DS: */
+#define NUM_DS_PTR_FIELDS	8
+/* Size of a pebs reset value in DS: */
+#define PEBS_RESET_FIELD_SIZE	8
+
+/* Mask of control bits in the DS MSR register: */
+#define BTS_CONTROL				  \
+	( ds_cfg.ctl[dsf_bts]			| \
+	  ds_cfg.ctl[dsf_bts_kernel]		| \
+	  ds_cfg.ctl[dsf_bts_user]		| \
+	  ds_cfg.ctl[dsf_bts_overflow] )
 
 /*
  * A BTS or PEBS tracer.
@@ -66,29 +82,36 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
  * to identify tracers.
  */
 struct ds_tracer {
-	/* the DS context (partially) owned by this tracer */
-	struct ds_context *context;
-	/* the buffer provided on ds_request() and its size in bytes */
-	void *buffer;
-	size_t size;
+	/* The DS context (partially) owned by this tracer. */
+	struct ds_context	*context;
+	/* The buffer provided on ds_request() and its size in bytes. */
+	void			*buffer;
+	size_t			size;
 };
 
 struct bts_tracer {
-	/* the common DS part */
-	struct ds_tracer ds;
-	/* the trace including the DS configuration */
-	struct bts_trace trace;
-	/* buffer overflow notification function */
-	bts_ovfl_callback_t ovfl;
+	/* The common DS part: */
+	struct ds_tracer	ds;
+
+	/* The trace including the DS configuration: */
+	struct bts_trace	trace;
+
+	/* Buffer overflow notification function: */
+	bts_ovfl_callback_t	ovfl;
+
+	/* Active flags affecting trace collection. */
+	unsigned int		flags;
 };
 
 struct pebs_tracer {
-	/* the common DS part */
-	struct ds_tracer ds;
-	/* the trace including the DS configuration */
-	struct pebs_trace trace;
-	/* buffer overflow notification function */
-	pebs_ovfl_callback_t ovfl;
+	/* The common DS part: */
+	struct ds_tracer	ds;
+
+	/* The trace including the DS configuration: */
+	struct pebs_trace	trace;
+
+	/* Buffer overflow notification function: */
+	pebs_ovfl_callback_t	ovfl;
 };
 
 /*
@@ -97,6 +120,7 @@ struct pebs_tracer {
  *
  * The DS configuration consists of the following fields; different
  * architectures vary in the size of those fields.
+ *
  * - double-word aligned base linear address of the BTS buffer
  * - write pointer into the BTS buffer
  * - end linear address of the BTS buffer (one byte beyond the end of
@@ -135,21 +159,22 @@
 };
 
 enum ds_qualifier {
-	ds_bts  = 0,
+	ds_bts = 0,
 	ds_pebs
 };
 
-static inline unsigned long ds_get(const unsigned char *base,
-				   enum ds_qualifier qual, enum ds_field field)
+static inline unsigned long
+ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
 {
-	base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+	base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
 	return *(unsigned long *)base;
 }
 
-static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
-			  enum ds_field field, unsigned long value)
+static inline void
+ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
+       unsigned long value)
 {
-	base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+	base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
 	(*(unsigned long *)base) = value;
 }
 
@@ -159,7 +184,6 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
  */
 static DEFINE_SPINLOCK(ds_lock);
 
-
 /*
  * We either support (system-wide) per-cpu or per-thread allocation.
  * We distinguish the two based on the task_struct pointer, where a
@@ -178,12 +202,28 @@ static DEFINE_SPINLOCK(ds_lock);
  */
 static atomic_t tracers = ATOMIC_INIT(0);
 
-static inline void get_tracer(struct task_struct *task)
+static inline int get_tracer(struct task_struct *task)
 {
-	if (task)
+	int error;
+
+	spin_lock_irq(&ds_lock);
+
+	if (task) {
+		error = -EPERM;
+		if (atomic_read(&tracers) < 0)
+			goto out;
 		atomic_inc(&tracers);
-	else
+	} else {
+		error = -EPERM;
+		if (atomic_read(&tracers) > 0)
+			goto out;
 		atomic_dec(&tracers);
+	}
+
+	error = 0;
+out:
+	spin_unlock_irq(&ds_lock);
+	return error;
 }
 
 static inline void put_tracer(struct task_struct *task)
@@ -194,14 +234,6 @@ static inline void put_tracer(struct task_struct *task)
 		atomic_inc(&tracers);
 }
 
-static inline int check_tracer(struct task_struct *task)
-{
-	return task ?
-		(atomic_read(&tracers) >= 0) :
-		(atomic_read(&tracers) <= 0);
-}
-
-
 /*
  * The DS context is either attached to a thread or to a cpu:
  * - in the former case, the thread_struct contains a pointer to the
@@ -213,61 +245,58 @@ static inline int check_tracer(struct task_struct *task)
  * deallocated when the last user puts the context.
  */
 struct ds_context {
-	/* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
-	unsigned char ds[MAX_SIZEOF_DS];
-	/* the owner of the BTS and PEBS configuration, respectively */
-	struct bts_tracer *bts_master;
-	struct pebs_tracer *pebs_master;
-	/* use count */
-	unsigned long count;
-	/* a pointer to the context location inside the thread_struct
-	 * or the per_cpu context array */
-	struct ds_context **this;
-	/* a pointer to the task owning this context, or NULL, if the
-	 * context is owned by a cpu */
-	struct task_struct *task;
-};
+	/* The DS configuration; goes into MSR_IA32_DS_AREA: */
+	unsigned char		ds[MAX_SIZEOF_DS];
+
+	/* The owner of the BTS and PEBS configuration, respectively: */
+	struct bts_tracer	*bts_master;
+	struct pebs_tracer	*pebs_master;
 
-static DEFINE_PER_CPU(struct ds_context *, system_context_array);
+	/* Use count: */
+	unsigned long		count;
 
-#define system_context per_cpu(system_context_array, smp_processor_id())
+	/* Pointer to the context pointer field: */
+	struct ds_context	**this;
+
+	/* The traced task; NULL for cpu tracing: */
+	struct task_struct	*task;
+
+	/* The traced cpu; only valid if task is NULL: */
+	int			cpu;
+};
+static DEFINE_PER_CPU(struct ds_context *, cpu_context);
 
-static inline struct ds_context *ds_get_context(struct task_struct *task)
+
+static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
 {
 	struct ds_context **p_context =
-		(task ? &task->thread.ds_ctx : &system_context);
+		(task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu));
 	struct ds_context *context = NULL;
 	struct ds_context *new_context = NULL;
-	unsigned long irq;
 
 	/* Chances are small that we already have a context. */
 	new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
 	if (!new_context)
 		return NULL;
 
-	spin_lock_irqsave(&ds_lock, irq);
+	spin_lock_irq(&ds_lock);
 
 	context = *p_context;
-	if (!context) {
+	if (likely(!context)) {
 		context = new_context;
 
 		context->this = p_context;
 		context->task = task;
+		context->cpu = cpu;
 		context->count = 0;
 
-		if (task)
-			set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
-
-		if (!task || (task == current))
-			wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
-
 		*p_context = context;
 	}
 
 	context->count++;
 
-	spin_unlock_irqrestore(&ds_lock, irq);
+	spin_unlock_irq(&ds_lock);
 
 	if (context != new_context)
 		kfree(new_context);
@@ -275,8 +304,9 @@ static inline struct ds_context *ds_get_context(struct task_struct *task)
 	return context;
 }
 
-static inline void ds_put_context(struct ds_context *context)
+static void ds_put_context(struct ds_context *context)
 {
+	struct task_struct *task;
 	unsigned long irq;
 
 	if (!context)
@@ -291,17 +321,55 @@ static inline void ds_put_context(struct ds_context *context)
 
 	*(context->this) = NULL;
 
-	if (context->task)
-		clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+	task = context->task;
+
+	if (task)
+		clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
 
-	if (!context->task || (context->task == current))
-		wrmsrl(MSR_IA32_DS_AREA, 0);
+	/*
+	 * We leave the (now dangling) pointer to the DS configuration in
+	 * the DS_AREA msr. This is as good or as bad as replacing it with
+	 * NULL - the hardware would crash if we enabled tracing.
+	 *
+	 * This saves us some problems with having to write an msr on a
+	 * different cpu while preventing others from doing the same for the
+	 * next context for that same cpu.
+	 */
 
 	spin_unlock_irqrestore(&ds_lock, irq);
 
+	/* The context might still be in use for context switching. */
+	if (task && (task != current))
+		wait_task_context_switch(task);
+
 	kfree(context);
 }
 
+static void ds_install_ds_area(struct ds_context *context)
+{
+	unsigned long ds;
+
+	ds = (unsigned long)context->ds;
+
+	/*
+	 * There is a race between the bts master and the pebs master.
+	 *
+	 * The thread/cpu access is synchronized via get/put_cpu() for
+	 * task tracing and via wrmsr_on_cpu for cpu tracing.
+	 *
+	 * If bts and pebs are collected for the same task or same cpu,
+	 * the same configuration is written twice.
+	 */
+	if (context->task) {
+		get_cpu();
+		if (context->task == current)
+			wrmsrl(MSR_IA32_DS_AREA, ds);
+		set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+		put_cpu();
+	} else
+		wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
+			     (u32)((u64)ds), (u32)((u64)ds >> 32));
+}
 
 /*
  * Call the tracer's callback on a buffer overflow.
@@ -332,9 +400,9 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
  * The remainder of any partially written record is zeroed out.
  *
  * context: the DS context
- * qual: the buffer type
- * record: the data to write
- * size: the size of the data
+ * qual:    the buffer type
+ * record:  the data to write
+ * size:    the size of the data
  */
 static int ds_write(struct ds_context *context, enum ds_qualifier qual,
 		    const void *record, size_t size)
@@ -349,14 +417,14 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
 		unsigned long write_size, adj_write_size;
 
 		/*
-		 * write as much as possible without producing an
+		 * Write as much as possible without producing an
 		 * overflow interrupt.
 		 *
-		 * interrupt_threshold must either be
+		 * Interrupt_threshold must either be
 		 * - bigger than absolute_maximum or
 		 * - point to a record between buffer_base and absolute_maximum
 		 *
-		 * index points to a valid record.
+		 * Index points to a valid record.
 		 */
 		base   = ds_get(context->ds, qual, ds_buffer_base);
 		index  = ds_get(context->ds, qual, ds_index);
@@ -365,8 +433,10 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
 
 		write_end = min(end, int_th);
 
-		/* if we are already beyond the interrupt threshold,
-		 * we fill the entire buffer */
+		/*
+		 * If we are already beyond the interrupt threshold,
+		 * we fill the entire buffer.
+		 */
 		if (write_end <= index)
 			write_end = end;
 
@@ -383,7 +453,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
 		adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
 		adj_write_size *= ds_cfg.sizeof_rec[qual];
 
-		/* zero out trailing bytes */
+		/* Zero out trailing bytes. */
 		memset((char *)index + write_size, 0,
 		       adj_write_size - write_size);
 		index += adj_write_size;
@@ -410,7 +480,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
  * Later architectures use 64bit pointers throughout, whereas earlier
  * architectures use 32bit pointers in 32bit mode.
  *
- * We compute the base address for the first 8 fields based on:
+ * We compute the base address for the fields based on:
  * - the field size stored in the DS configuration
  * - the relative field position
  *
@@ -431,23 +501,23 @@ enum bts_field {
 	bts_to,
 	bts_flags,
 
-	bts_qual = bts_from,
-	bts_jiffies = bts_to,
-	bts_pid = bts_flags,
+	bts_qual		= bts_from,
+	bts_clock		= bts_to,
+	bts_pid			= bts_flags,
 
-	bts_qual_mask = (bts_qual_max - 1),
-	bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
+	bts_qual_mask		= (bts_qual_max - 1),
+	bts_escape		= ((unsigned long)-1 & ~bts_qual_mask)
 };
 
 static inline unsigned long bts_get(const char *base, enum bts_field field)
 {
-	base += (ds_cfg.sizeof_field * field);
+	base += (ds_cfg.sizeof_ptr_field * field);
 	return *(unsigned long *)base;
 }
 
 static inline void bts_set(char *base, enum bts_field field, unsigned long val)
 {
-	base += (ds_cfg.sizeof_field * field);;
+	base += (ds_cfg.sizeof_ptr_field * field);
 	(*(unsigned long *)base) = val;
 }
 
@@ -463,8 +533,8 @@ static inline void bts_set(char *base, enum bts_field field, unsigned long val)
  *
  * return: bytes read/written on success; -Eerrno, otherwise
  */
-static int bts_read(struct bts_tracer *tracer, const void *at,
-		    struct bts_struct *out)
+static int
+bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
 {
 	if (!tracer)
 		return -EINVAL;
@@ -478,8 +548,8 @@ static int bts_read(struct bts_tracer *tracer, const void *at,
 	memset(out, 0, sizeof(*out));
 	if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
 		out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
-		out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
-		out->variant.timestamp.pid = bts_get(at, bts_pid);
+		out->variant.event.clock = bts_get(at, bts_clock);
+		out->variant.event.pid = bts_get(at, bts_pid);
 	} else {
 		out->qualifier = bts_branch;
 		out->variant.lbr.from = bts_get(at, bts_from);
@@ -516,8 +586,8 @@ static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
 	case bts_task_arrives:
 	case bts_task_departs:
 		bts_set(raw, bts_qual, (bts_escape | in->qualifier));
-		bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
-		bts_set(raw, bts_pid, in->variant.timestamp.pid);
+		bts_set(raw, bts_clock, in->variant.event.clock);
+		bts_set(raw, bts_pid, in->variant.event.pid);
 		break;
 	default:
 		return -EINVAL;
@@ -555,7 +625,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
 			     unsigned int flags) {
 	unsigned long buffer, adj;
 
-	/* adjust the buffer address and size to meet alignment
+	/*
+	 * Adjust the buffer address and size to meet alignment
 	 * constraints:
 	 * - buffer is double-word aligned
 	 * - size is multiple of record size
@@ -577,9 +648,11 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
 	trace->begin = (void *)buffer;
 	trace->top = trace->begin;
 	trace->end = (void *)(buffer + size);
-	/* The value for 'no threshold' is -1, which will set the
+	/*
+	 * The value for 'no threshold' is -1, which will set the
 	 * threshold outside of the buffer, just like we want it.
 	 */
+	ith *= ds_cfg.sizeof_rec[qual];
 	trace->ith = (void *)(buffer + size - ith);
 
 	trace->flags = flags;
 }
 
@@ -588,18 +661,27 @@
 static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
 		      enum ds_qualifier qual, struct task_struct *task,
-		      void *base, size_t size, size_t th, unsigned int flags)
+		      int cpu, void *base, size_t size, size_t th)
 {
 	struct ds_context *context;
 	int error;
+	size_t req_size;
+
+	error = -EOPNOTSUPP;
+	if (!ds_cfg.sizeof_rec[qual])
+		goto out;
 
 	error = -EINVAL;
 	if (!base)
 		goto out;
 
-	/* we require some space to do alignment adjustments below */
+	req_size = ds_cfg.sizeof_rec[qual];
+	/* We might need space for alignment adjustments. */
+	if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
+		req_size += DS_ALIGNMENT;
+
 	error = -EINVAL;
-	if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
+	if (size < req_size)
 		goto out;
 
 	if (th != (size_t)-1) {
@@ -614,182 +696,318 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
 	tracer->size = size;
 
 	error = -ENOMEM;
-	context = ds_get_context(task);
+	context = ds_get_context(task, cpu);
 	if (!context)
 		goto out;
 	tracer->context = context;
 
-	ds_init_ds_trace(trace, qual, base, size, th, flags);
+	/*
+	 * Defer any tracer-specific initialization work for the context until
+	 * context ownership has been clarified.
+	 */
 
 	error = 0;
  out:
 	return error;
 }
 
-struct bts_tracer *ds_request_bts(struct task_struct *task,
-				  void *base, size_t size,
-				  bts_ovfl_callback_t ovfl, size_t th,
-				  unsigned int flags)
+static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
+					 void *base, size_t size,
					 bts_ovfl_callback_t ovfl, size_t th,
+					 unsigned int flags)
 {
 	struct bts_tracer *tracer;
-	unsigned long irq;
 	int error;
 
+	/* Buffer overflow notification is not yet implemented. */
 	error = -EOPNOTSUPP;
-	if (!ds_cfg.ctl[dsf_bts])
+	if (ovfl)
 		goto out;
 
-	/* buffer overflow notification is not yet implemented */
-	error = -EOPNOTSUPP;
-	if (ovfl)
+	error = get_tracer(task);
+	if (error < 0)
 		goto out;
 
 	error = -ENOMEM;
 	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
 	if (!tracer)
-		goto out;
+		goto out_put_tracer;
 	tracer->ovfl = ovfl;
 
+	/* Do some more error checking and acquire a tracing context. */
 	error = ds_request(&tracer->ds, &tracer->trace.ds,
-			   ds_bts, task, base, size, th, flags);
+			   ds_bts, task, cpu, base, size, th);
 	if (error < 0)
 		goto out_tracer;
 
-
-	spin_lock_irqsave(&ds_lock, irq);
-
-	error = -EPERM;
-	if (!check_tracer(task))
-		goto out_unlock;
-	get_tracer(task);
+	/* Claim the bts part of the tracing context we acquired above. */
+	spin_lock_irq(&ds_lock);
 
 	error = -EPERM;
 	if (tracer->ds.context->bts_master)
-		goto out_put_tracer;
+		goto out_unlock;
 	tracer->ds.context->bts_master = tracer;
 
-	spin_unlock_irqrestore(&ds_lock, irq);
+	spin_unlock_irq(&ds_lock);
 
+	/*
+	 * Now that we own the bts part of the context, let's complete the
+	 * initialization for that part.
+	 */
+	ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
+	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+	ds_install_ds_area(tracer->ds.context);
 
 	tracer->trace.read  = bts_read;
 	tracer->trace.write = bts_write;
 
-	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+	/* Start tracing. */
 	ds_resume_bts(tracer);
 
 	return tracer;
 
- out_put_tracer:
-	put_tracer(task);
  out_unlock:
-	spin_unlock_irqrestore(&ds_lock, irq);
+	spin_unlock_irq(&ds_lock);
 	ds_put_context(tracer->ds.context);
  out_tracer:
 	kfree(tracer);
+ out_put_tracer:
+	put_tracer(task);
  out:
 	return ERR_PTR(error);
 }
 
-struct pebs_tracer *ds_request_pebs(struct task_struct *task,
-				    void *base, size_t size,
-				    pebs_ovfl_callback_t ovfl, size_t th,
-				    unsigned int flags)
+struct bts_tracer *ds_request_bts_task(struct task_struct *task,
+				       void *base, size_t size,
+				       bts_ovfl_callback_t ovfl,
+				       size_t th, unsigned int flags)
+{
+	return ds_request_bts(task, 0, base, size, ovfl, th, flags);
+}
+
+struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
+				      bts_ovfl_callback_t ovfl,
+				      size_t th, unsigned int flags)
+{
+	return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
+}
+
+static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
+					   void *base, size_t size,
+					   pebs_ovfl_callback_t ovfl, size_t th,
+					   unsigned int flags)
 {
 	struct pebs_tracer *tracer;
-	unsigned long irq;
 	int error;
 
-	/* buffer overflow notification is not yet implemented */
+	/* Buffer overflow notification is not yet implemented. */
 	error = -EOPNOTSUPP;
 	if (ovfl)
 		goto out;
 
+	error = get_tracer(task);
+	if (error < 0)
+		goto out;
+
 	error = -ENOMEM;
 	tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
 	if (!tracer)
-		goto out;
+		goto out_put_tracer;
 	tracer->ovfl = ovfl;
 
+	/* Do some more error checking and acquire a tracing context. */
 	error = ds_request(&tracer->ds, &tracer->trace.ds,
-			   ds_pebs, task, base, size, th, flags);
+			   ds_pebs, task, cpu, base, size, th);
 	if (error < 0)
 		goto out_tracer;
 
-	spin_lock_irqsave(&ds_lock, irq);
-
-	error = -EPERM;
-	if (!check_tracer(task))
-		goto out_unlock;
-	get_tracer(task);
+	/* Claim the pebs part of the tracing context we acquired above. */
+	spin_lock_irq(&ds_lock);
 
 	error = -EPERM;
 	if (tracer->ds.context->pebs_master)
-		goto out_put_tracer;
+		goto out_unlock;
 	tracer->ds.context->pebs_master = tracer;
 
-	spin_unlock_irqrestore(&ds_lock, irq);
+	spin_unlock_irq(&ds_lock);
 
+	/*
+	 * Now that we own the pebs part of the context, let's complete the
+	 * initialization for that part.
+	 */
+	ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
 	ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
+	ds_install_ds_area(tracer->ds.context);
+
+	/* Start tracing. */
 	ds_resume_pebs(tracer);
 
 	return tracer;
 
- out_put_tracer:
-	put_tracer(task);
  out_unlock:
-	spin_unlock_irqrestore(&ds_lock, irq);
+	spin_unlock_irq(&ds_lock);
 	ds_put_context(tracer->ds.context);
  out_tracer:
 	kfree(tracer);
+ out_put_tracer:
+	put_tracer(task);
  out:
 	return ERR_PTR(error);
 }
 
-void ds_release_bts(struct bts_tracer *tracer)
+struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
+					 void *base, size_t size,
+					 pebs_ovfl_callback_t ovfl,
+					 size_t th, unsigned int flags)
 {
-	if (!tracer)
-		return;
+	return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
+}
 
-	ds_suspend_bts(tracer);
+struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
+					pebs_ovfl_callback_t ovfl,
+					size_t th, unsigned int flags)
+{
+	return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
+}
+
+static void ds_free_bts(struct bts_tracer *tracer)
+{
+	struct task_struct *task;
+
+	task = tracer->ds.context->task;
 
 	WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
 	tracer->ds.context->bts_master = NULL;
 
-	put_tracer(tracer->ds.context->task);
+	/* Make sure tracing stopped and the tracer is not in use. */
+	if (task && (task != current))
+		wait_task_context_switch(task);
+
 	ds_put_context(tracer->ds.context);
+	put_tracer(task);
 
 	kfree(tracer);
 }
 
+void ds_release_bts(struct bts_tracer *tracer)
+{
+	might_sleep();
+
+	if (!tracer)
+		return;
+
+	ds_suspend_bts(tracer);
+	ds_free_bts(tracer);
+}
+
+int ds_release_bts_noirq(struct bts_tracer *tracer)
+{
+	struct task_struct *task;
+	unsigned long irq;
+	int error;
+
+	if (!tracer)
+		return 0;
+
+	task = tracer->ds.context->task;
+
+	local_irq_save(irq);
+
+	error = -EPERM;
+	if (!task &&
+	    (tracer->ds.context->cpu != smp_processor_id()))
+		goto out;
+
+	error = -EPERM;
+	if (task && (task != current))
+		goto out;
+
+	ds_suspend_bts_noirq(tracer);
+	ds_free_bts(tracer);
+
+	error = 0;
+ out:
+	local_irq_restore(irq);
+	return error;
+}
+
+static void update_task_debugctlmsr(struct task_struct *task,
+				    unsigned long debugctlmsr)
+{
+	task->thread.debugctlmsr = debugctlmsr;
+
+	get_cpu();
+	if (task == current)
+		update_debugctlmsr(debugctlmsr);
+	put_cpu();
+}
+
 void ds_suspend_bts(struct bts_tracer *tracer)
 {
 	struct task_struct *task;
+	unsigned long debugctlmsr;
+	int cpu;
 
 	if (!tracer)
 		return;
 
+	tracer->flags = 0;
+
 	task = tracer->ds.context->task;
+	cpu  = tracer->ds.context->cpu;
 
-	if (!task || (task == current))
-		update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
+	WARN_ON(!task && irqs_disabled());
 
-	if (task) {
-		task->thread.debugctlmsr &= ~BTS_CONTROL;
+	debugctlmsr = (task ?
+		       task->thread.debugctlmsr :
+		       get_debugctlmsr_on_cpu(cpu));
+	debugctlmsr &= ~BTS_CONTROL;
 
-		if (!task->thread.debugctlmsr)
-			clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
-	}
+	if (task)
+		update_task_debugctlmsr(task, debugctlmsr);
+	else
+		update_debugctlmsr_on_cpu(cpu, debugctlmsr);
 }
 
-void ds_resume_bts(struct bts_tracer *tracer)
+int ds_suspend_bts_noirq(struct bts_tracer *tracer)
 {
 	struct task_struct *task;
-	unsigned long control;
+	unsigned long debugctlmsr, irq;
+	int cpu, error = 0;
 
 	if (!tracer)
-		return;
+		return 0;
+
+	tracer->flags = 0;
 
 	task = tracer->ds.context->task;
+	cpu  = tracer->ds.context->cpu;
+
+	local_irq_save(irq);
+
+	error = -EPERM;
+	if (!task && (cpu != smp_processor_id()))
+		goto out;
+
+	debugctlmsr = (task ?
+		       task->thread.debugctlmsr :
+		       get_debugctlmsr());
+	debugctlmsr &= ~BTS_CONTROL;
+
+	if (task)
+		update_task_debugctlmsr(task, debugctlmsr);
+	else
+		update_debugctlmsr(debugctlmsr);
+
+	error = 0;
+ out:
+	local_irq_restore(irq);
+	return error;
+}
+
+static unsigned long ds_bts_control(struct bts_tracer *tracer)
+{
+	unsigned long control;
 
 	control = ds_cfg.ctl[dsf_bts];
 	if (!(tracer->trace.ds.flags & BTS_KERNEL))
@@ -797,41 +1015,149 @@ void ds_resume_bts(struct bts_tracer *tracer)
 	if (!(tracer->trace.ds.flags & BTS_USER))
 		control |= ds_cfg.ctl[dsf_bts_user];
 
-	if (task) {
-		task->thread.debugctlmsr |= control;
-		set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
-	}
-
-	if (!task || (task == current))
-		update_debugctlmsr(get_debugctlmsr() | control);
+	return control;
 }
 
-void ds_release_pebs(struct pebs_tracer *tracer)
+void ds_resume_bts(struct bts_tracer *tracer)
 {
+	struct task_struct *task;
+	unsigned long debugctlmsr;
+	int cpu;
+
 	if (!tracer)
 		return;
 
-	ds_suspend_pebs(tracer);
+	tracer->flags = tracer->trace.ds.flags;
+
+	task = tracer->ds.context->task;
+	cpu  = tracer->ds.context->cpu;
+
+	WARN_ON(!task && irqs_disabled());
+
+	debugctlmsr = (task ?
+		       task->thread.debugctlmsr :
+		       get_debugctlmsr_on_cpu(cpu));
+	debugctlmsr |= ds_bts_control(tracer);
+
+	if (task)
+		update_task_debugctlmsr(task, debugctlmsr);
+	else
+		update_debugctlmsr_on_cpu(cpu, debugctlmsr);
+}
+
+int ds_resume_bts_noirq(struct bts_tracer *tracer)
+{
+	struct task_struct *task;
+	unsigned long debugctlmsr, irq;
+	int cpu, error = 0;
+
+	if (!tracer)
+		return 0;
+
+	tracer->flags = tracer->trace.ds.flags;
+
+	task = tracer->ds.context->task;
+	cpu  = tracer->ds.context->cpu;
+
+	local_irq_save(irq);
+
+	error = -EPERM;
+	if (!task && (cpu != smp_processor_id()))
+		goto out;
+
+	debugctlmsr = (task ?
+		       task->thread.debugctlmsr :
+		       get_debugctlmsr());
+	debugctlmsr |= ds_bts_control(tracer);
+
+	if (task)
+		update_task_debugctlmsr(task, debugctlmsr);
+	else
+		update_debugctlmsr(debugctlmsr);
+
+	error = 0;
+ out:
+	local_irq_restore(irq);
+	return error;
+}
+
+static void ds_free_pebs(struct pebs_tracer *tracer)
+{
+	struct task_struct *task;
+
+	task = tracer->ds.context->task;
 
 	WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
 	tracer->ds.context->pebs_master = NULL;
 
-	put_tracer(tracer->ds.context->task);
 	ds_put_context(tracer->ds.context);
+	put_tracer(task);
 
 	kfree(tracer);
 }
 
+void ds_release_pebs(struct pebs_tracer *tracer)
+{
+	might_sleep();
+
+	if (!tracer)
+		return;
+
+	ds_suspend_pebs(tracer);
+	ds_free_pebs(tracer);
+}
+
+int ds_release_pebs_noirq(struct pebs_tracer *tracer)
+{
+	struct task_struct *task;
+	unsigned long irq;
+	int error;
+
+	if (!tracer)
+		return 0;
+
+	task = tracer->ds.context->task;
+
+	local_irq_save(irq);
+
+	error = -EPERM;
+	if (!task &&
+	    (tracer->ds.context->cpu != smp_processor_id()))
+		goto out;
+
+	error = -EPERM;
+	if (task && (task != current))
+		goto out;
+
+	ds_suspend_pebs_noirq(tracer);
+	ds_free_pebs(tracer);
+
+	error = 0;
+ out:
+	local_irq_restore(irq);
+	return error;
+}
+
 void ds_suspend_pebs(struct pebs_tracer *tracer)
 {
 }
 
+int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
+{
+	return 0;
+}
+
 void ds_resume_pebs(struct pebs_tracer *tracer)
 {
 }
 
+int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
+{
+	return 0;
+}
+
 const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
 {
 	if (!tracer)
@@ -847,8 +1173,12 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
 		return NULL;
 
 	ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
-	tracer->trace.reset_value =
-		*(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
+
+	tracer->trace.counters = ds_cfg.nr_counter_reset;
+	memcpy(tracer->trace.counter_reset,
+	       tracer->ds.context->ds +
+	       (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
+	       ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
 
 	return &tracer->trace;
 }
@@ -873,18 +1203,24 @@ int ds_reset_pebs(struct pebs_tracer *tracer)
 
 	tracer->trace.ds.top = tracer->trace.ds.begin;
 
-	ds_set(tracer->ds.context->ds, ds_bts, ds_index,
+	ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
 	       (unsigned long)tracer->trace.ds.top);
 
 	return 0;
 }
 
-int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
+int ds_set_pebs_reset(struct pebs_tracer *tracer,
+		      unsigned int counter, u64 value)
 {
 	if (!tracer)
 		return -EINVAL;
 
-	*(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
+	if (ds_cfg.nr_counter_reset < counter)
+		return -EINVAL;
+
+	*(u64 *)(tracer->ds.context->ds +
+		 (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
+		 (counter * PEBS_RESET_FIELD_SIZE)) = value;
 
 	return 0;
 }
@@ -894,73 +1230,117 @@ static const struct ds_configuration ds_cfg_netburst = {
 	.ctl[dsf_bts]		= (1 << 2) | (1 << 3),
 	.ctl[dsf_bts_kernel]	= (1 << 5),
 	.ctl[dsf_bts_user]	= (1 << 6),
-
-	.sizeof_field		= sizeof(long),
-	.sizeof_rec[ds_bts]	= sizeof(long) * 3,
-#ifdef __i386__
-	.sizeof_rec[ds_pebs]	= sizeof(long) * 10,
-#else
-	.sizeof_rec[ds_pebs]	= sizeof(long) * 18,
-#endif
+	.nr_counter_reset	= 1,
 };
 
 static const struct ds_configuration ds_cfg_pentium_m = {
 	.name = "Pentium M",
 	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
-
-	.sizeof_field		= sizeof(long),
-	.sizeof_rec[ds_bts]	= sizeof(long) * 3,
-#ifdef __i386__
-	.sizeof_rec[ds_pebs]	= sizeof(long) * 10,
-#else
-	.sizeof_rec[ds_pebs]	= sizeof(long) * 18,
-#endif
+	.nr_counter_reset	= 1,
 };
 
 static const struct ds_configuration ds_cfg_core2_atom = {
 	.name = "Core 2/Atom",
 	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
 	.ctl[dsf_bts_kernel]	= (1 << 9),
 	.ctl[dsf_bts_user]	= (1 << 10),
-
-	.sizeof_field		= 8,
-	.sizeof_rec[ds_bts]	= 8 * 3,
-	.sizeof_rec[ds_pebs]	= 8 * 18,
+	.nr_counter_reset	= 1,
+};
+static const struct ds_configuration ds_cfg_core_i7 = {
+	.name = "Core i7",
+	.ctl[dsf_bts]		= (1 << 6) | (1 << 7),
+	.ctl[dsf_bts_kernel]	= (1 << 9),
+	.ctl[dsf_bts_user]	= (1 << 10),
+	.nr_counter_reset	= 4,
 };
 
 static void
-ds_configure(const struct ds_configuration *cfg)
+ds_configure(const struct ds_configuration *cfg,
+	     struct cpuinfo_x86 *cpu)
 {
+	unsigned long nr_pebs_fields = 0;
+
+	printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
+
+#ifdef __i386__
+	nr_pebs_fields = 10;
+#else
+	nr_pebs_fields = 18;
+#endif
+
+	/*
+	 * Starting with version 2, architectural performance
+	 * monitoring supports a format specifier.
+	 */
+	if ((cpuid_eax(0xa) & 0xff) > 1) {
+		unsigned long perf_capabilities, format;
+
+		rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
+
+		format = (perf_capabilities >> 8) & 0xf;
+
+		switch (format) {
+		case 0:
+			nr_pebs_fields = 18;
+			break;
+		case 1:
+			nr_pebs_fields = 22;
+			break;
+		default:
+			printk(KERN_INFO
+			       "[ds] unknown PEBS format: %lu\n", format);
+			nr_pebs_fields = 0;
+			break;
+		}
+	}
+
 	memset(&ds_cfg, 0, sizeof(ds_cfg));
 	ds_cfg = *cfg;
 
-	printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name);
+	ds_cfg.sizeof_ptr_field =
+		(cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
+
+	ds_cfg.sizeof_rec[ds_bts]  = ds_cfg.sizeof_ptr_field * 3;
+	ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
 
-	if (!cpu_has_bts) {
-		ds_cfg.ctl[dsf_bts] = 0;
+	if (!cpu_has(cpu, X86_FEATURE_BTS)) {
+		ds_cfg.sizeof_rec[ds_bts] = 0;
 		printk(KERN_INFO "[ds] bts not available\n");
 	}
-	if (!cpu_has_pebs)
+	if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
+		ds_cfg.sizeof_rec[ds_pebs] = 0;
 		printk(KERN_INFO "[ds] pebs not available\n");
+	}
+
+	printk(KERN_INFO "[ds] sizes: address: %u bit, ",
+	       8 * ds_cfg.sizeof_ptr_field);
+	printk("bts/pebs record: %u/%u bytes\n",
+	       ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
 
-	WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
+	WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
 }
 
 void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 {
+	/* Only configure the first cpu. Others are identical. */
+	if (ds_cfg.name)
+		return;
+
 	switch (c->x86) {
 	case 0x6:
 		switch (c->x86_model) {
 		case 0x9:
 		case 0xd: /* Pentium M */
-			ds_configure(&ds_cfg_pentium_m);
+			ds_configure(&ds_cfg_pentium_m, c);
 			break;
 		case 0xf:
 		case 0x17: /* Core2 */
 		case 0x1c: /* Atom */
-			ds_configure(&ds_cfg_core2_atom);
+			ds_configure(&ds_cfg_core2_atom, c);
+			break;
+		case 0x1a: /* Core i7 */
+			ds_configure(&ds_cfg_core_i7, c);
 			break;
-		case 0x1a: /* i7 */
 		default:
-			/* sorry, don't know about them */
+			/* Sorry, don't know about them. */
 			break;
 		}
 		break;
@@ -969,64 +1349,89 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 		case 0x0:
 		case 0x1:
 		case 0x2: /* Netburst */
-			ds_configure(&ds_cfg_netburst);
+			ds_configure(&ds_cfg_netburst, c);
 			break;
 		default:
-			/* sorry, don't know about them */
+			/* Sorry, don't know about them. */
 			break;
 		}
 		break;
 	default:
-		/* sorry, don't know about them */
+		/* Sorry, don't know about them. */
 		break;
 	}
 }
 
+static inline void ds_take_timestamp(struct ds_context *context,
+				     enum bts_qualifier qualifier,
+				     struct task_struct *task)
+{
+	struct bts_tracer *tracer = context->bts_master;
+	struct bts_struct ts;
+
+	/* Prevent compilers from reading the tracer pointer twice. */
+	barrier();
+
+	if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
+		return;
+
+	memset(&ts, 0, sizeof(ts));
+	ts.qualifier		= qualifier;
+	ts.variant.event.clock	= trace_clock_global();
+	ts.variant.event.pid	= task->pid;
+
+	bts_write(tracer, &ts);
+}
+
 /*
  * Change the DS configuration from tracing prev to tracing next.
  */
 void ds_switch_to(struct task_struct *prev, struct task_struct *next)
 {
-	struct ds_context *prev_ctx = prev->thread.ds_ctx;
-	struct ds_context *next_ctx = next->thread.ds_ctx;
+	struct ds_context *prev_ctx	= prev->thread.ds_ctx;
+	struct ds_context *next_ctx	= next->thread.ds_ctx;
+	unsigned long debugctlmsr	= next->thread.debugctlmsr;
+
+	/* Make sure all data is read before we start. */
+	barrier();
 
 	if (prev_ctx) {
 		update_debugctlmsr(0);
 
-		if (prev_ctx->bts_master &&
-		    (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
-			struct bts_struct ts = {
-				.qualifier = bts_task_departs,
-				.variant.timestamp.jiffies = jiffies_64,
-				.variant.timestamp.pid = prev->pid
-			};
-			bts_write(prev_ctx->bts_master, &ts);
-		}
+		ds_take_timestamp(prev_ctx, bts_task_departs, prev);
 	}
 
 	if (next_ctx) {
-		if (next_ctx->bts_master &&
-		    (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
-			struct bts_struct ts = {
-				.qualifier = bts_task_arrives,
-				.variant.timestamp.jiffies = jiffies_64,
-				.variant.timestamp.pid = next->pid
-			};
-			bts_write(next_ctx->bts_master, &ts);
-		}
+		ds_take_timestamp(next_ctx, bts_task_arrives, next);
 
 		wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
 	}
 
-	update_debugctlmsr(next->thread.debugctlmsr);
+	update_debugctlmsr(debugctlmsr);
 }
 
-void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
+static __init int ds_selftest(void)
 {
-	clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR);
-	tsk->thread.ds_ctx = NULL;
-}
+	if (ds_cfg.sizeof_rec[ds_bts]) {
+		int error;
 
-void ds_exit_thread(struct task_struct *tsk)
-{
+		error = ds_selftest_bts();
+		if (error) {
+			WARN(1, "[ds] selftest failed. disabling bts.\n");
+			ds_cfg.sizeof_rec[ds_bts] = 0;
+		}
+	}
+
+	if (ds_cfg.sizeof_rec[ds_pebs]) {
+		int error;
+
+		error = ds_selftest_pebs();
+		if (error) {
+			WARN(1, "[ds] selftest failed. disabling pebs.\n");
+			ds_cfg.sizeof_rec[ds_pebs] = 0;
+		}
+	}
+
+	return 0;
+}
+device_initcall(ds_selftest);
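The arithmetic behind MAX_SIZEOF_DS and the reset-value slots can be checked in isolation. A minimal userspace sketch, not part of the patch; the constants mirror those introduced above:

/*
 * Illustrative size check. With 64-bit pointer fields (DTES64) and the
 * Core i7 maximum of four counter reset values, the DS area needs
 * 8*8 + 4*8 = 96 bytes, within MAX_SIZEOF_DS (0x80). The same sum gives
 * the slot offset used by ds_set_pebs_reset().
 */
#include <assert.h>
#include <stdio.h>

#define NUM_DS_PTR_FIELDS	8	/* 4 BTS + 4 PEBS buffer pointers */
#define PEBS_RESET_FIELD_SIZE	8
#define MAX_SIZEOF_DS		0x80

int main(void)
{
	unsigned int sizeof_ptr_field = 8;	/* DTES64 case */
	unsigned int nr_counter_reset = 4;	/* Core i7 case */
	unsigned int counter = 2;		/* third PEBS counter */

	unsigned int ds_size = NUM_DS_PTR_FIELDS * sizeof_ptr_field +
			       nr_counter_reset * PEBS_RESET_FIELD_SIZE;
	unsigned int reset_offset = NUM_DS_PTR_FIELDS * sizeof_ptr_field +
				    counter * PEBS_RESET_FIELD_SIZE;

	printf("DS area: %u bytes, reset slot %u at offset %u\n",
	       ds_size, counter, reset_offset);	/* 96, 2, 80 */
	assert(ds_size <= MAX_SIZEOF_DS);
	return 0;
}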
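ds_get()/ds_set() address the DS area as two blocks of four pointer-sized fields, BTS first, PEBS second. The same offset arithmetic as a standalone sketch, illustrative only:

#include <stdio.h>

enum ds_field { ds_buffer_base = 0, ds_index, ds_absolute_maximum,
		ds_interrupt_threshold };
enum ds_qualifier { ds_bts = 0, ds_pebs };

static unsigned long ds_offset(unsigned int sizeof_ptr_field,
			       enum ds_qualifier qual, enum ds_field field)
{
	/* Same computation as in ds_get()/ds_set() above. */
	return sizeof_ptr_field * (field + (4 * qual));
}

int main(void)
{
	/* PEBS index on a DTES64 cpu: 8 * (1 + 4*1) = 40 bytes in. */
	printf("pebs index at offset %lu\n", ds_offset(8, ds_pebs, ds_index));
	printf("bts threshold at offset %lu\n",
	       ds_offset(8, ds_bts, ds_interrupt_threshold));
	return 0;
}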
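ds_get_context() allocates the new context before taking ds_lock and discards it again if another caller installed one first. The same pattern in a self-contained userspace analogue, illustrative only, with a pthread mutex standing in for ds_lock:

#include <pthread.h>
#include <stdlib.h>

struct ctx { unsigned long count; };

static struct ctx *slot;	/* stands in for *p_context */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

static struct ctx *get_ctx(void)
{
	struct ctx *new_ctx = calloc(1, sizeof(*new_ctx));
	struct ctx *ctx;

	if (!new_ctx)
		return NULL;

	pthread_mutex_lock(&lock);
	ctx = slot;
	if (!ctx)		/* We won the race: install our copy. */
		ctx = slot = new_ctx;
	ctx->count++;
	pthread_mutex_unlock(&lock);

	if (ctx != new_ctx)	/* Lost the race: free the spare copy. */
		free(new_ctx);
	return ctx;
}

int main(void)
{
	return get_ctx() ? 0 : 1;
}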
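bts_read() distinguishes the two record kinds by the from-field: escape records carry bts_escape with the qualifier in the low bits. A standalone sketch of that decoding; the qualifier values here are assumed for illustration, the real enum lives in asm/ds.h:

#include <assert.h>

enum { bts_branch = 1, bts_task_arrives = 2, bts_task_departs = 3,
       bts_qual_max = 4 };	/* values assumed for illustration */

#define BTS_QUAL_MASK	((unsigned long)(bts_qual_max - 1))
#define BTS_ESCAPE	((unsigned long)-1 & ~BTS_QUAL_MASK)

int main(void)
{
	unsigned long from = BTS_ESCAPE | bts_task_arrives;

	/* The same tests bts_read() applies to the from-field: */
	assert((from & ~BTS_QUAL_MASK) == BTS_ESCAPE);
	assert((from & BTS_QUAL_MASK) == bts_task_arrives);
	return 0;
}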
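The threshold fix in ds_init_ds_trace() converts th from records to bytes before placing the interrupt threshold relative to the buffer end. Worked through in a standalone sketch with illustrative values:

#include <stdio.h>

int main(void)
{
	unsigned long buffer = 0x1000;		/* example base address */
	unsigned long sizeof_rec = 24;		/* 64-bit BTS record */
	unsigned long size = 40 * sizeof_rec;	/* room for 40 records */
	unsigned long th = 2;			/* trigger 2 records early */

	unsigned long ith = th * sizeof_rec;	/* the added scaling step */
	unsigned long threshold = buffer + size - ith;

	printf("threshold at record %lu of 40\n",
	       (threshold - buffer) / sizeof_rec);	/* record 38 */
	return 0;
}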
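ds_bts_control() composes suppress bits: the hardware bits in ctl[dsf_bts_kernel] and ctl[dsf_bts_user] disable tracing for a privilege level, so they are OR'ed in when the corresponding BTS_* flag is absent. A standalone sketch using the Core 2/Atom bit positions from the configurations above; the BTS_KERNEL/BTS_USER flag values are assumed for illustration:

#include <stdio.h>

#define DSF_BTS		((1UL << 6) | (1UL << 7))
#define DSF_BTS_KERNEL	(1UL << 9)	/* suppresses kernel branches */
#define DSF_BTS_USER	(1UL << 10)	/* suppresses user branches */
#define BTS_KERNEL	0x1		/* flag values assumed */
#define BTS_USER	0x2

static unsigned long bts_control(unsigned int flags)
{
	unsigned long control = DSF_BTS;

	/* The hardware bits disable a level, hence the inversion: */
	if (!(flags & BTS_KERNEL))
		control |= DSF_BTS_KERNEL;
	if (!(flags & BTS_USER))
		control |= DSF_BTS_USER;
	return control;
}

int main(void)
{
	printf("user-only tracing: %#lx\n", bts_control(BTS_USER));
	return 0;
}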
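ds_configure() now derives the PEBS record size at runtime from the architectural perfmon version (CPUID leaf 0xa, EAX[7:0]) and the PEBS record format (IA32_PERF_CAPABILITIES[11:8]). The field extraction in isolation, with made-up register values; the real code reads CPUID and the MSR:

#include <stdio.h>

int main(void)
{
	unsigned long cpuid_a_eax = 0x07300403;	/* example: version 3 */
	unsigned long perf_caps   = 0x00000120;	/* example: format 1 */

	unsigned long version = cpuid_a_eax & 0xff;	/* EAX[7:0] */
	unsigned long format  = (perf_caps >> 8) & 0xf;	/* bits 11:8 */

	if (version > 1)
		printf("PEBS record format %lu -> 22 fields\n", format);
	return 0;
}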
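A sketch of how a client might use the new cpu-tracing entry point; illustrative only, not part of the patch, with error handling trimmed and buffer size and flags chosen arbitrarily:

static struct bts_tracer *example_tracer;
static unsigned char example_buffer[PAGE_SIZE];

static int example_start_cpu_tracing(int cpu)
{
	/* No overflow callback, no threshold, kernel and user branches. */
	example_tracer = ds_request_bts_cpu(cpu, example_buffer,
					    sizeof(example_buffer),
					    NULL, (size_t)-1,
					    BTS_KERNEL | BTS_USER);
	if (IS_ERR(example_tracer))
		return PTR_ERR(example_tracer);

	return 0;
}

static void example_stop_cpu_tracing(void)
{
	ds_release_bts(example_tracer);	/* may sleep, see might_sleep() */
}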
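The _noirq variants demand to run on the traced cpu (or in the traced task) with interrupts off; a natural caller is an smp function call, which satisfies both. Illustrative sketch, not part of the patch:

static void example_suspend_on_cpu(void *data)
{
	struct bts_tracer *tracer = data;

	/* Runs on the traced cpu with irqs disabled, as the check requires. */
	if (ds_suspend_bts_noirq(tracer))
		pr_warning("[ds] example: bts suspend failed\n");
}

static void example_suspend(struct bts_tracer *tracer, int cpu)
{
	smp_call_function_single(cpu, example_suspend_on_cpu, tracer, 1);
}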
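Finally, a sketch of a consumer pairing the bts_task_departs/bts_task_arrives events that ds_take_timestamp() injects; illustrative only and not part of the patch, with rec_size standing in for the configured BTS record size:

static void example_dump_sched_events(struct bts_tracer *tracer,
				      size_t rec_size)
{
	const struct bts_trace *trace = ds_read_bts(tracer);
	const unsigned char *at;

	for (at = trace->ds.begin; (void *)at < trace->ds.top;
	     at += rec_size) {
		struct bts_struct bts;

		if (trace->read(tracer, at, &bts) < 0)
			continue;

		/* Scheduling events carry the global trace clock and pid. */
		if (bts.qualifier == bts_task_arrives ||
		    bts.qualifier == bts_task_departs)
			printk(KERN_DEBUG "[ds] pid %d, clock %llu\n",
			       (int)bts.variant.event.pid,
			       (unsigned long long)bts.variant.event.clock);
	}
}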