Diffstat (limited to 'arch/x86/include/asm/percpu.h')
-rw-r--r--	arch/x86/include/asm/percpu.h	187
1 file changed, 150 insertions, 37 deletions
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 38f9e965ff9..8ee45167e81 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -177,39 +177,6 @@ do {									\
 	}								\
 } while (0)
 
-/*
- * Add return operation
- */
-#define percpu_add_return_op(var, val)					\
-({									\
-	typeof(var) paro_ret__ = val;					\
-	switch (sizeof(var)) {						\
-	case 1:								\
-		asm("xaddb %0, "__percpu_arg(1)				\
-			    : "+q" (paro_ret__), "+m" (var)		\
-			    : : "memory");				\
-		break;							\
-	case 2:								\
-		asm("xaddw %0, "__percpu_arg(1)				\
-			    : "+r" (paro_ret__), "+m" (var)		\
-			    : : "memory");				\
-		break;							\
-	case 4:								\
-		asm("xaddl %0, "__percpu_arg(1)				\
-			    : "+r" (paro_ret__), "+m" (var)		\
-			    : : "memory");				\
-		break;							\
-	case 8:								\
-		asm("xaddq %0, "__percpu_arg(1)				\
-			    : "+re" (paro_ret__), "+m" (var)		\
-			    : : "memory");				\
-		break;							\
-	default: __bad_percpu_size();					\
-	}								\
-	paro_ret__ += val;						\
-	paro_ret__;							\
-})
-
 #define percpu_from_op(op, var, constraint)		\
 ({							\
 	typeof(var) pfo_ret__;				\
@@ -263,6 +230,125 @@ do {									\
 })
 
 /*
+ * Add return operation
+ */
+#define percpu_add_return_op(var, val)					\
+({									\
+	typeof(var) paro_ret__ = val;					\
+	switch (sizeof(var)) {						\
+	case 1:								\
+		asm("xaddb %0, "__percpu_arg(1)				\
+			    : "+q" (paro_ret__), "+m" (var)		\
+			    : : "memory");				\
+		break;							\
+	case 2:								\
+		asm("xaddw %0, "__percpu_arg(1)				\
+			    : "+r" (paro_ret__), "+m" (var)		\
+			    : : "memory");				\
+		break;							\
+	case 4:								\
+		asm("xaddl %0, "__percpu_arg(1)				\
+			    : "+r" (paro_ret__), "+m" (var)		\
+			    : : "memory");				\
+		break;							\
+	case 8:								\
+		asm("xaddq %0, "__percpu_arg(1)				\
+			    : "+re" (paro_ret__), "+m" (var)		\
+			    : : "memory");				\
+		break;							\
+	default: __bad_percpu_size();					\
+	}								\
+	paro_ret__ += val;						\
+	paro_ret__;							\
+})
+
+/*
+ * xchg is implemented using cmpxchg without a lock prefix. xchg is
+ * expensive due to the implied lock prefix.  The processor cannot prefetch
+ * cachelines if xchg is used.
+ */
+#define percpu_xchg_op(var, nval)					\
+({									\
+	typeof(var) pxo_ret__;						\
+	typeof(var) pxo_new__ = (nval);					\
+	switch (sizeof(var)) {						\
+	case 1:								\
+		asm("\n1:mov "__percpu_arg(1)",%%al"			\
+		    "\n\tcmpxchgb %2, "__percpu_arg(1)			\
+		    "\n\tjnz 1b"					\
+			    : "=a" (pxo_ret__), "+m" (var)		\
+			    : "q" (pxo_new__)				\
+			    : "memory");				\
+		break;							\
+	case 2:								\
+		asm("\n1:mov "__percpu_arg(1)",%%ax"			\
+		    "\n\tcmpxchgw %2, "__percpu_arg(1)			\
+		    "\n\tjnz 1b"					\
+			    : "=a" (pxo_ret__), "+m" (var)		\
+			    : "r" (pxo_new__)				\
+			    : "memory");				\
+		break;							\
+	case 4:								\
+		asm("\n1:mov "__percpu_arg(1)",%%eax"			\
+		    "\n\tcmpxchgl %2, "__percpu_arg(1)			\
+		    "\n\tjnz 1b"					\
+			    : "=a" (pxo_ret__), "+m" (var)		\
+			    : "r" (pxo_new__)				\
+			    : "memory");				\
+		break;							\
+	case 8:								\
+		asm("\n1:mov "__percpu_arg(1)",%%rax"			\
+		    "\n\tcmpxchgq %2, "__percpu_arg(1)			\
+		    "\n\tjnz 1b"					\
+			    : "=a" (pxo_ret__), "+m" (var)		\
+			    : "r" (pxo_new__)				\
+			    : "memory");				\
+		break;							\
+	default: __bad_percpu_size();					\
+	}								\
+	pxo_ret__;							\
+})
+
+/*
+ * cmpxchg has no such implied lock semantics as a result it is much
+ * more efficient for cpu local operations.
+ */
+#define percpu_cmpxchg_op(var, oval, nval)				\
+({									\
+	typeof(var) pco_ret__;						\
+	typeof(var) pco_old__ = (oval);					\
+	typeof(var) pco_new__ = (nval);					\
+	switch (sizeof(var)) {						\
+	case 1:								\
+		asm("cmpxchgb %2, "__percpu_arg(1)			\
+			    : "=a" (pco_ret__), "+m" (var)		\
+			    : "q" (pco_new__), "0" (pco_old__)		\
+			    : "memory");				\
+		break;							\
+	case 2:								\
+		asm("cmpxchgw %2, "__percpu_arg(1)			\
+			    : "=a" (pco_ret__), "+m" (var)		\
+			    : "r" (pco_new__), "0" (pco_old__)		\
+			    : "memory");				\
+		break;							\
+	case 4:								\
+		asm("cmpxchgl %2, "__percpu_arg(1)			\
+			    : "=a" (pco_ret__), "+m" (var)		\
+			    : "r" (pco_new__), "0" (pco_old__)		\
+			    : "memory");				\
+		break;							\
+	case 8:								\
+		asm("cmpxchgq %2, "__percpu_arg(1)			\
+			    : "=a" (pco_ret__), "+m" (var)		\
+			    : "r" (pco_new__), "0" (pco_old__)		\
+			    : "memory");				\
+		break;							\
+	default: __bad_percpu_size();					\
+	}								\
+	pco_ret__;							\
+})
+
+/*
  * percpu_read() makes gcc load the percpu variable every time it is
  * accessed while percpu_read_stable() allows the value to be cached.
  * percpu_read_stable() is more efficient and can be used if its value
@@ -300,6 +386,12 @@ do {									\
 #define __this_cpu_xor_1(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define __this_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define __this_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
+/*
+ * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much
+ * faster than an xchg with forced lock semantics.
+ */
+#define __this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define __this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 
 #define this_cpu_read_1(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
 #define this_cpu_read_2(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
@@ -319,6 +411,11 @@ do {									\
 #define this_cpu_xor_1(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define this_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define this_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
+#define this_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define this_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 
 #define irqsafe_cpu_add_1(pcp, val)	percpu_add_op((pcp), val)
 #define irqsafe_cpu_add_2(pcp, val)	percpu_add_op((pcp), val)
@@ -332,15 +429,32 @@ do {									\
 #define irqsafe_cpu_xor_1(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define irqsafe_cpu_xor_2(pcp, val)	percpu_to_op("xor", (pcp), val)
 #define irqsafe_cpu_xor_4(pcp, val)	percpu_to_op("xor", (pcp), val)
+#define irqsafe_cpu_xchg_1(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_2(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_4(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_xchg_8(pcp, nval)	percpu_xchg_op(pcp, nval)
+#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
 
 #ifndef CONFIG_M386
 #define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
 #define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
 #define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
+#define __this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define __this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+
 #define this_cpu_add_return_1(pcp, val)	percpu_add_return_op(pcp, val)
 #define this_cpu_add_return_2(pcp, val)	percpu_add_return_op(pcp, val)
 #define this_cpu_add_return_4(pcp, val)	percpu_add_return_op(pcp, val)
-#endif
+#define this_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define this_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+
+#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval)	percpu_cmpxchg_op(pcp, oval, nval)
+#endif /* !CONFIG_M386 */
+
 /*
  * Per cpu atomic 64 bit operations are only available under 64 bit.
  * 32 bit must fall back to generic operations.
@@ -352,6 +466,7 @@ do {									\
 #define __this_cpu_and_8(pcp, val)	percpu_to_op("and", (pcp), val)
 #define __this_cpu_or_8(pcp, val)	percpu_to_op("or", (pcp), val)
 #define __this_cpu_xor_8(pcp, val)	percpu_to_op("xor", (pcp), val)
+#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
 
 #define this_cpu_read_8(pcp)		percpu_from_op("mov", (pcp), "m"(pcp))
 #define this_cpu_write_8(pcp, val)	percpu_to_op("mov", (pcp), val)
@@ -359,14 +474,12 @@ do {									\
 #define this_cpu_and_8(pcp, val)	percpu_to_op("and", (pcp), val)
 #define this_cpu_or_8(pcp, val)		percpu_to_op("or", (pcp), val)
 #define this_cpu_xor_8(pcp, val)	percpu_to_op("xor", (pcp), val)
+#define this_cpu_add_return_8(pcp, val)	percpu_add_return_op(pcp, val)
 
 #define irqsafe_cpu_add_8(pcp, val)	percpu_add_op((pcp), val)
 #define irqsafe_cpu_and_8(pcp, val)	percpu_to_op("and", (pcp), val)
 #define irqsafe_cpu_or_8(pcp, val)	percpu_to_op("or", (pcp), val)
 #define irqsafe_cpu_xor_8(pcp, val)	percpu_to_op("xor", (pcp), val)
-
-#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
-#define this_cpu_add_return_8(pcp, val)	percpu_add_return_op(pcp, val)
 #endif
 
 /* This is not atomic against other CPUs -- CPU preemption needs to be off */
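
For context, a minimal usage sketch of what this patch enables; it is not part of the diff. It assumes the size-generic this_cpu_*() wrappers in <linux/percpu.h>, which dispatch on sizeof() to the this_cpu_xchg_*/this_cpu_cmpxchg_*/this_cpu_add_return_* macros defined above. The per-cpu variables and function names are hypothetical.

	#include <linux/types.h>
	#include <linux/percpu.h>

	/* Hypothetical per-cpu state, for illustration only. */
	static DEFINE_PER_CPU(unsigned int, demo_events);
	static DEFINE_PER_CPU(int, demo_flush_pending);

	/* Bump this cpu's event count; xadd returns the updated value. */
	static unsigned int demo_count_event(void)
	{
		return this_cpu_add_return(demo_events, 1);
	}

	/*
	 * Mark a flush pending on this cpu only if it was not already
	 * pending.  Resolves to the unlocked cmpxchg operation added by
	 * this patch, so no lock prefix and no explicit
	 * preempt_disable()/preempt_enable() pair is required.
	 */
	static bool demo_try_mark_pending(void)
	{
		return this_cpu_cmpxchg(demo_flush_pending, 0, 1) == 0;
	}

The design point mirrors the comments in the patch itself: because these operations only need to be atomic with respect to the local cpu, an unlocked cmpxchg (or a cmpxchg retry loop in place of xchg) avoids the implied lock prefix and its cost.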