Diffstat (limited to 'arch/x86')
136 files changed, 3291 insertions, 2060 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 8ec3a1aa4ab..7f9a395c525 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -7,11 +7,13 @@ config 64BIT  	  Say no to build a 32-bit kernel - formerly known as i386  config X86_32 -	def_bool !64BIT +	def_bool y +	depends on !64BIT  	select CLKSRC_I8253  config X86_64 -	def_bool 64BIT +	def_bool y +	depends on 64BIT  	select X86_DEV_DMA_OPS  ### Arch settings @@ -36,6 +38,7 @@ config X86  	select HAVE_KRETPROBES  	select HAVE_OPTPROBES  	select HAVE_FTRACE_MCOUNT_RECORD +	select HAVE_FENTRY if X86_64  	select HAVE_C_RECORDMCOUNT  	select HAVE_DYNAMIC_FTRACE  	select HAVE_FUNCTION_TRACER @@ -60,6 +63,8 @@ config X86  	select HAVE_MIXED_BREAKPOINTS_REGS  	select PERF_EVENTS  	select HAVE_PERF_EVENTS_NMI +	select HAVE_PERF_REGS +	select HAVE_PERF_USER_STACK_DUMP  	select ANON_INODES  	select HAVE_ALIGNED_STRUCT_PAGE if SLUB && !M386  	select HAVE_CMPXCHG_LOCAL if !M386 @@ -97,9 +102,12 @@ config X86  	select KTIME_SCALAR if X86_32  	select GENERIC_STRNCPY_FROM_USER  	select GENERIC_STRNLEN_USER +	select HAVE_RCU_USER_QS if X86_64 +	select HAVE_IRQ_TIME_ACCOUNTING  config INSTRUCTION_DECODER -	def_bool (KPROBES || PERF_EVENTS || UPROBES) +	def_bool y +	depends on KPROBES || PERF_EVENTS || UPROBES  config OUTPUT_FORMAT  	string @@ -127,13 +135,15 @@ config SBUS  	bool  config NEED_DMA_MAP_STATE -       def_bool (X86_64 || INTEL_IOMMU || DMA_API_DEBUG) +	def_bool y +	depends on X86_64 || INTEL_IOMMU || DMA_API_DEBUG  config NEED_SG_DMA_LENGTH  	def_bool y  config GENERIC_ISA_DMA -	def_bool ISA_DMA_API +	def_bool y +	depends on ISA_DMA_API  config GENERIC_BUG  	def_bool y @@ -150,13 +160,16 @@ config GENERIC_GPIO  	bool  config ARCH_MAY_HAVE_PC_FDC -	def_bool ISA_DMA_API +	def_bool y +	depends on ISA_DMA_API  config RWSEM_GENERIC_SPINLOCK -	def_bool !X86_XADD +	def_bool y +	depends on !X86_XADD  config RWSEM_XCHGADD_ALGORITHM -	def_bool X86_XADD +	def_bool y +	depends on X86_XADD  config GENERIC_CALIBRATE_DELAY  	def_bool y @@ -746,13 +759,14 @@ config SWIOTLB  	def_bool y if X86_64  	---help---  	  Support for software bounce buffers used on x86-64 systems -	  which don't have a hardware IOMMU (e.g. the current generation -	  of Intel's x86-64 CPUs). Using this PCI devices which can only -	  access 32-bits of memory can be used on systems with more than -	  3 GB of memory. If unsure, say Y. +	  which don't have a hardware IOMMU. Using this PCI devices +	  which can only access 32-bits of memory can be used on systems +	  with more than 3 GB of memory. +	  If unsure, say Y.  config IOMMU_HELPER -	def_bool (CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU) +	def_bool y +	depends on CALGARY_IOMMU || GART_IOMMU || SWIOTLB || AMD_IOMMU  config MAXSMP  	bool "Enable Maximum number of SMP Processors and NUMA Nodes" @@ -796,17 +810,6 @@ config SCHED_MC  	  making when dealing with multi-core CPU chips at a cost of slightly  	  increased overhead in some places. If unsure say N here. -config IRQ_TIME_ACCOUNTING -	bool "Fine granularity task level IRQ time accounting" -	default n -	---help--- -	  Select this option to enable fine granularity task irq time -	  accounting. This is done by reading a timestamp on each -	  transitions between softirq and hardirq state, so there can be a -	  small performance impact. - -	  If in doubt, say N here. 
-  source "kernel/Kconfig.preempt"  config X86_UP_APIC @@ -871,6 +874,7 @@ config X86_REROUTE_FOR_BROKEN_BOOT_IRQS  config X86_MCE  	bool "Machine Check / overheating reporting" +	default y  	---help---  	  Machine Check support allows the processor to notify the  	  kernel if it detects a problem (e.g. overheating, data corruption). @@ -982,25 +986,25 @@ config X86_REBOOTFIXUPS  	  Say N otherwise.  config MICROCODE -	tristate "/dev/cpu/microcode - microcode support" +	tristate "CPU microcode loading support"  	select FW_LOADER  	---help--- +  	  If you say Y here, you will be able to update the microcode on  	  certain Intel and AMD processors. The Intel support is for the -	  IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, -	  Pentium 4, Xeon etc. The AMD support is for family 0x10 and -	  0x11 processors, e.g. Opteron, Phenom and Turion 64 Ultra. -	  You will obviously need the actual microcode binary data itself -	  which is not shipped with the Linux kernel. +	  IA32 family, e.g. Pentium Pro, Pentium II, Pentium III, Pentium 4, +	  Xeon etc. The AMD support is for families 0x10 and later. You will +	  obviously need the actual microcode binary data itself which is not +	  shipped with the Linux kernel.  	  This option selects the general module only, you need to select  	  at least one vendor specific module as well. -	  To compile this driver as a module, choose M here: the -	  module will be called microcode. +	  To compile this driver as a module, choose M here: the module +	  will be called microcode.  config MICROCODE_INTEL -	bool "Intel microcode patch loading support" +	bool "Intel microcode loading support"  	depends on MICROCODE  	default MICROCODE  	select FW_LOADER @@ -1013,7 +1017,7 @@ config MICROCODE_INTEL  	  <http://www.urbanmyth.org/microcode/>.  config MICROCODE_AMD -	bool "AMD microcode patch loading support" +	bool "AMD microcode loading support"  	depends on MICROCODE  	select FW_LOADER  	---help--- @@ -1159,10 +1163,12 @@ config X86_PAE  	  consumes more pagetable space per process.  config ARCH_PHYS_ADDR_T_64BIT -	def_bool X86_64 || X86_PAE +	def_bool y +	depends on X86_64 || X86_PAE  config ARCH_DMA_ADDR_T_64BIT -	def_bool X86_64 || HIGHMEM64G +	def_bool y +	depends on X86_64 || HIGHMEM64G  config DIRECT_GBPAGES  	bool "Enable 1GB pages for kernel pagetables" if EXPERT @@ -1285,8 +1291,8 @@ config ARCH_SELECT_MEMORY_MODEL  	depends on ARCH_SPARSEMEM_ENABLE  config ARCH_MEMORY_PROBE -	def_bool X86_64 -	depends on MEMORY_HOTPLUG +	def_bool y +	depends on X86_64 && MEMORY_HOTPLUG  config ARCH_PROC_KCORE_TEXT  	def_bool y @@ -1487,6 +1493,17 @@ config ARCH_RANDOM  	  If supported, this is a high bandwidth, cryptographically  	  secure hardware random number generator. +config X86_SMAP +	def_bool y +	prompt "Supervisor Mode Access Prevention" if EXPERT +	---help--- +	  Supervisor Mode Access Prevention (SMAP) is a security +	  feature in newer Intel processors.  There is a small +	  performance cost if this enabled and turned on; there is +	  also a small increase in the kernel size if this is enabled. + +	  If unsure, say Y. +  config EFI  	bool "EFI runtime service support"  	depends on ACPI @@ -1975,7 +1992,6 @@ config PCI_MMCONFIG  config PCI_CNB20LE_QUIRK  	bool "Read CNB20LE Host Bridge Windows" if EXPERT -	default n  	depends on PCI && EXPERIMENTAL  	help  	  Read the PCI windows out of the CNB20LE host bridge. 
This allows @@ -2186,18 +2202,18 @@ config COMPAT  	depends on IA32_EMULATION || X86_X32  	select ARCH_WANT_OLD_COMPAT_IPC +if COMPAT  config COMPAT_FOR_U64_ALIGNMENT -	def_bool COMPAT -	depends on X86_64 +	def_bool y  config SYSVIPC_COMPAT  	def_bool y -	depends on COMPAT && SYSVIPC +	depends on SYSVIPC  config KEYS_COMPAT -	bool -	depends on COMPAT && KEYS -	default y +	def_bool y +	depends on KEYS +endif  endmenu diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 706e12e9984..f3b86d0df44 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -306,7 +306,8 @@ config X86_INTERNODE_CACHE_SHIFT  	default X86_L1_CACHE_SHIFT  config X86_CMPXCHG -	def_bool X86_64 || (X86_32 && !M386) +	def_bool y +	depends on X86_64 || (X86_32 && !M386)  config X86_L1_CACHE_SHIFT  	int @@ -317,7 +318,7 @@ config X86_L1_CACHE_SHIFT  config X86_XADD  	def_bool y -	depends on X86_64 || !M386 +	depends on !M386  config X86_PPRO_FENCE  	bool "PentiumPro memory ordering errata workaround" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 682e9c210ba..474ca35b1bc 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -142,7 +142,7 @@ KBUILD_CFLAGS += $(call cc-option,-mno-avx,)  KBUILD_CFLAGS += $(mflags-y)  KBUILD_AFLAGS += $(mflags-y) -archscripts: +archscripts: scripts_basic  	$(Q)$(MAKE) $(build)=arch/x86/tools relocs  ### diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index e398bb5d63b..8a84501acb1 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -28,6 +28,9 @@ VMLINUX_OBJS = $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o \  	$(obj)/string.o $(obj)/cmdline.o $(obj)/early_serial_console.o \  	$(obj)/piggy.o +$(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone +$(obj)/efi_stub_$(BITS).o: KBUILD_CLFAGS += -fshort-wchar -mno-red-zone +  ifeq ($(CONFIG_EFI_STUB), y)  	VMLINUX_OBJS += $(obj)/eboot.o $(obj)/efi_stub_$(BITS).o  endif diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index b3e0227df2c..c760e073963 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -276,8 +276,9 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,  	nr_gops = size / sizeof(void *);  	for (i = 0; i < nr_gops; i++) {  		struct efi_graphics_output_mode_info *info; -		efi_guid_t pciio_proto = EFI_PCI_IO_PROTOCOL_GUID; -		void *pciio; +		efi_guid_t conout_proto = EFI_CONSOLE_OUT_DEVICE_GUID; +		bool conout_found = false; +		void *dummy;  		void *h = gop_handle[i];  		status = efi_call_phys3(sys_table->boottime->handle_protocol, @@ -285,19 +286,21 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,  		if (status != EFI_SUCCESS)  			continue; -		efi_call_phys3(sys_table->boottime->handle_protocol, -			       h, &pciio_proto, &pciio); +		status = efi_call_phys3(sys_table->boottime->handle_protocol, +					h, &conout_proto, &dummy); + +		if (status == EFI_SUCCESS) +			conout_found = true;  		status = efi_call_phys4(gop->query_mode, gop,  					gop->mode->mode, &size, &info); -		if (status == EFI_SUCCESS && (!first_gop || pciio)) { +		if (status == EFI_SUCCESS && (!first_gop || conout_found)) {  			/* -			 * Apple provide GOPs that are not backed by -			 * real hardware (they're used to handle -			 * multiple displays). The workaround is to -			 * search for a GOP implementing the PCIIO -			 * protocol, and if one isn't found, to just -			 * fallback to the first GOP. 
+			 * Systems that use the UEFI Console Splitter may +			 * provide multiple GOP devices, not all of which are +			 * backed by real hardware. The workaround is to search +			 * for a GOP implementing the ConOut protocol, and if +			 * one isn't found, to just fall back to the first GOP.  			 */  			width = info->horizontal_resolution;  			height = info->vertical_resolution; @@ -308,10 +311,10 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,  			pixels_per_scan_line = info->pixels_per_scan_line;  			/* -			 * Once we've found a GOP supporting PCIIO, +			 * Once we've found a GOP supporting ConOut,  			 * don't bother looking any further.  			 */ -			if (pciio) +			if (conout_found)  				break;  			first_gop = gop; @@ -328,7 +331,6 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,  	si->lfb_width = width;  	si->lfb_height = height;  	si->lfb_base = fb_base; -	si->lfb_size = fb_size;  	si->pages = 1;  	if (pixel_format == PIXEL_RGB_RESERVED_8BIT_PER_COLOR) { @@ -376,6 +378,10 @@ static efi_status_t setup_gop(struct screen_info *si, efi_guid_t *proto,  		si->rsvd_pos = 0;  	} +	si->lfb_size = si->lfb_linelength * si->lfb_height; + +	si->capabilities |= VIDEO_CAPABILITY_SKIP_QUIRKS; +  free_handle:  	efi_call_phys1(sys_table->boottime->free_pool, gop_handle);  	return status; diff --git a/arch/x86/boot/compressed/eboot.h b/arch/x86/boot/compressed/eboot.h index 3b6e15627c5..e5b0a8f91c5 100644 --- a/arch/x86/boot/compressed/eboot.h +++ b/arch/x86/boot/compressed/eboot.h @@ -14,6 +14,10 @@  #define EFI_PAGE_SIZE		(1UL << EFI_PAGE_SHIFT)  #define EFI_READ_CHUNK_SIZE	(1024 * 1024) +#define EFI_CONSOLE_OUT_DEVICE_GUID    \ +	EFI_GUID(0xd3b36f2c, 0xd551, 0x11d4, 0x9a, 0x46, 0x0, 0x90, 0x27, \ +		  0x3f, 0xc1, 0x4d) +  #define PIXEL_RGB_RESERVED_8BIT_PER_COLOR		0  #define PIXEL_BGR_RESERVED_8BIT_PER_COLOR		1  #define PIXEL_BIT_MASK					2 diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index b4e15dd6786..2a017441b8b 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -32,10 +32,6 @@ SYSSEG		= 0x1000		/* historical load address >> 4 */  #define SVGA_MODE ASK_VGA  #endif -#ifndef RAMDISK -#define RAMDISK 0 -#endif -  #ifndef ROOT_RDONLY  #define ROOT_RDONLY 1  #endif diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 119db67dcb0..5598547281a 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -8,6 +8,8 @@ CONFIG_TASK_DELAY_ACCT=y  CONFIG_TASK_XACCT=y  CONFIG_TASK_IO_ACCOUNTING=y  CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y  CONFIG_LOG_BUF_SHIFT=18  CONFIG_CGROUPS=y  CONFIG_CGROUP_FREEZER=y @@ -34,8 +36,6 @@ CONFIG_SGI_PARTITION=y  CONFIG_SUN_PARTITION=y  CONFIG_KARMA_PARTITION=y  CONFIG_EFI_PARTITION=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y  CONFIG_SMP=y  CONFIG_X86_GENERIC=y  CONFIG_HPET_TIMER=y @@ -144,8 +144,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"  CONFIG_DEBUG_DEVRES=y  CONFIG_CONNECTOR=y  CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=16384  CONFIG_BLK_DEV_SD=y  CONFIG_BLK_DEV_SR=y  CONFIG_BLK_DEV_SR_VENDOR=y @@ -231,8 +229,6 @@ CONFIG_SND_HRTIMER=y  CONFIG_SND_HDA_INTEL=y  CONFIG_SND_HDA_HWDEP=y  CONFIG_HIDRAW=y -CONFIG_HID_PID=y -CONFIG_USB_HIDDEV=y  CONFIG_HID_GYRATION=y  CONFIG_LOGITECH_FF=y  CONFIG_HID_NTRIG=y @@ -243,11 +239,11 @@ CONFIG_HID_SAMSUNG=y  CONFIG_HID_SONY=y  CONFIG_HID_SUNPLUS=y  CONFIG_HID_TOPSEED=y +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y  CONFIG_USB=y  CONFIG_USB_DEBUG=y  
CONFIG_USB_ANNOUNCE_NEW_DEVICES=y -CONFIG_USB_DEVICEFS=y -# CONFIG_USB_DEVICE_CLASS is not set  CONFIG_USB_MON=y  CONFIG_USB_EHCI_HCD=y  # CONFIG_USB_EHCI_TT_NEWSCHED is not set @@ -262,10 +258,9 @@ CONFIG_RTC_CLASS=y  CONFIG_DMADEVICES=y  CONFIG_EEEPC_LAPTOP=y  CONFIG_EFI_VARS=y -CONFIG_EXT3_FS=y -# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set -CONFIG_EXT3_FS_POSIX_ACL=y -CONFIG_EXT3_FS_SECURITY=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y  CONFIG_QUOTA=y  CONFIG_QUOTA_NETLINK_INTERFACE=y  # CONFIG_PRINT_QUOTA_WARNING is not set @@ -280,7 +275,6 @@ CONFIG_PROC_KCORE=y  CONFIG_TMPFS_POSIX_ACL=y  CONFIG_HUGETLBFS=y  CONFIG_NFS_FS=y -CONFIG_NFS_V3=y  CONFIG_NFS_V3_ACL=y  CONFIG_NFS_V4=y  CONFIG_ROOT_NFS=y @@ -299,13 +293,11 @@ CONFIG_DEBUG_KERNEL=y  CONFIG_SCHEDSTATS=y  CONFIG_TIMER_STATS=y  CONFIG_DEBUG_STACK_USAGE=y -CONFIG_SYSCTL_SYSCALL_CHECK=y  CONFIG_BLK_DEV_IO_TRACE=y  CONFIG_PROVIDE_OHCI1394_DMA_INIT=y  CONFIG_EARLY_PRINTK_DBGP=y  CONFIG_DEBUG_STACKOVERFLOW=y  # CONFIG_DEBUG_RODATA_TEST is not set -CONFIG_DEBUG_NX_TEST=m  CONFIG_DEBUG_BOOT_PARAMS=y  CONFIG_OPTIMIZE_INLINING=y  CONFIG_KEYS_DEBUG_PROC_KEYS=y @@ -316,4 +308,3 @@ CONFIG_SECURITY_SELINUX_BOOTPARAM=y  CONFIG_SECURITY_SELINUX_DISABLE=y  CONFIG_CRYPTO_AES_586=y  # CONFIG_CRYPTO_ANSI_CPRNG is not set -CONFIG_CRC_T10DIF=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 76eb2903809..671524d0f6c 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -8,6 +8,8 @@ CONFIG_TASK_DELAY_ACCT=y  CONFIG_TASK_XACCT=y  CONFIG_TASK_IO_ACCOUNTING=y  CONFIG_AUDIT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y  CONFIG_LOG_BUF_SHIFT=18  CONFIG_CGROUPS=y  CONFIG_CGROUP_FREEZER=y @@ -34,8 +36,6 @@ CONFIG_SGI_PARTITION=y  CONFIG_SUN_PARTITION=y  CONFIG_KARMA_PARTITION=y  CONFIG_EFI_PARTITION=y -CONFIG_NO_HZ=y -CONFIG_HIGH_RES_TIMERS=y  CONFIG_SMP=y  CONFIG_CALGARY_IOMMU=y  CONFIG_NR_CPUS=64 @@ -144,8 +144,6 @@ CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"  CONFIG_DEBUG_DEVRES=y  CONFIG_CONNECTOR=y  CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=16384  CONFIG_BLK_DEV_SD=y  CONFIG_BLK_DEV_SR=y  CONFIG_BLK_DEV_SR_VENDOR=y @@ -227,8 +225,6 @@ CONFIG_SND_HRTIMER=y  CONFIG_SND_HDA_INTEL=y  CONFIG_SND_HDA_HWDEP=y  CONFIG_HIDRAW=y -CONFIG_HID_PID=y -CONFIG_USB_HIDDEV=y  CONFIG_HID_GYRATION=y  CONFIG_LOGITECH_FF=y  CONFIG_HID_NTRIG=y @@ -239,11 +235,11 @@ CONFIG_HID_SAMSUNG=y  CONFIG_HID_SONY=y  CONFIG_HID_SUNPLUS=y  CONFIG_HID_TOPSEED=y +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y  CONFIG_USB=y  CONFIG_USB_DEBUG=y  CONFIG_USB_ANNOUNCE_NEW_DEVICES=y -CONFIG_USB_DEVICEFS=y -# CONFIG_USB_DEVICE_CLASS is not set  CONFIG_USB_MON=y  CONFIG_USB_EHCI_HCD=y  # CONFIG_USB_EHCI_TT_NEWSCHED is not set @@ -262,10 +258,9 @@ CONFIG_AMD_IOMMU_STATS=y  CONFIG_INTEL_IOMMU=y  # CONFIG_INTEL_IOMMU_DEFAULT_ON is not set  CONFIG_EFI_VARS=y -CONFIG_EXT3_FS=y -# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set -CONFIG_EXT3_FS_POSIX_ACL=y -CONFIG_EXT3_FS_SECURITY=y +CONFIG_EXT4_FS=y +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y  CONFIG_QUOTA=y  CONFIG_QUOTA_NETLINK_INTERFACE=y  # CONFIG_PRINT_QUOTA_WARNING is not set @@ -280,7 +275,6 @@ CONFIG_PROC_KCORE=y  CONFIG_TMPFS_POSIX_ACL=y  CONFIG_HUGETLBFS=y  CONFIG_NFS_FS=y -CONFIG_NFS_V3=y  CONFIG_NFS_V3_ACL=y  CONFIG_NFS_V4=y  CONFIG_ROOT_NFS=y @@ -298,13 +292,11 @@ CONFIG_DEBUG_KERNEL=y  CONFIG_SCHEDSTATS=y  CONFIG_TIMER_STATS=y  CONFIG_DEBUG_STACK_USAGE=y -CONFIG_SYSCTL_SYSCALL_CHECK=y  CONFIG_BLK_DEV_IO_TRACE=y  
CONFIG_PROVIDE_OHCI1394_DMA_INIT=y  CONFIG_EARLY_PRINTK_DBGP=y  CONFIG_DEBUG_STACKOVERFLOW=y  # CONFIG_DEBUG_RODATA_TEST is not set -CONFIG_DEBUG_NX_TEST=m  CONFIG_DEBUG_BOOT_PARAMS=y  CONFIG_OPTIMIZE_INLINING=y  CONFIG_KEYS_DEBUG_PROC_KEYS=y @@ -314,4 +306,3 @@ CONFIG_SECURITY_SELINUX=y  CONFIG_SECURITY_SELINUX_BOOTPARAM=y  CONFIG_SECURITY_SELINUX_DISABLE=y  # CONFIG_CRYPTO_ANSI_CPRNG is not set -CONFIG_CRC_T10DIF=y diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index 673ac9b63d6..efc6a958b71 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -32,6 +32,7 @@  #include <asm/sigframe.h>  #include <asm/sighandling.h>  #include <asm/sys_ia32.h> +#include <asm/smap.h>  #define FIX_EFLAGS	__FIX_EFLAGS @@ -162,7 +163,8 @@ asmlinkage long sys32_sigaltstack(const stack_ia32_t __user *uss_ptr,  	}  	seg = get_fs();  	set_fs(KERNEL_DS); -	ret = do_sigaltstack(uss_ptr ? &uss : NULL, &uoss, regs->sp); +	ret = do_sigaltstack((stack_t __force __user *) (uss_ptr ? &uss : NULL), +			     (stack_t __force __user *) &uoss, regs->sp);  	set_fs(seg);  	if (ret >= 0 && uoss_ptr)  {  		if (!access_ok(VERIFY_WRITE, uoss_ptr, sizeof(stack_ia32_t))) @@ -250,11 +252,12 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,  		get_user_ex(tmp, &sc->fpstate);  		buf = compat_ptr(tmp); -		err |= restore_i387_xstate_ia32(buf);  		get_user_ex(*pax, &sc->ax);  	} get_user_catch(err); +	err |= restore_xstate_sig(buf, 1); +  	return err;  } @@ -361,7 +364,7 @@ static int ia32_setup_sigcontext(struct sigcontext_ia32 __user *sc,   */  static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,  				 size_t frame_size, -				 void **fpstate) +				 void __user **fpstate)  {  	unsigned long sp; @@ -381,9 +384,12 @@ static void __user *get_sigframe(struct k_sigaction *ka, struct pt_regs *regs,  		sp = (unsigned long) ka->sa.sa_restorer;  	if (used_math()) { -		sp = sp - sig_xstate_ia32_size; -		*fpstate = (struct _fpstate_ia32 *) sp; -		if (save_i387_xstate_ia32(*fpstate) < 0) +		unsigned long fx_aligned, math_size; + +		sp = alloc_mathframe(sp, 1, &fx_aligned, &math_size); +		*fpstate = (struct _fpstate_ia32 __user *) sp; +		if (save_xstate_sig(*fpstate, (void __user *)fx_aligned, +				    math_size) < 0)  			return (void __user *) -1L;  	} @@ -448,7 +454,7 @@ int ia32_setup_frame(int sig, struct k_sigaction *ka,  		 * These are actually not used anymore, but left because some  		 * gdb versions depend on them as a marker.  		 */ -		put_user_ex(*((u64 *)&code), (u64 *)frame->retcode); +		put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode);  	} put_user_catch(err);  	if (err) @@ -502,7 +508,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,  		put_user_ex(sig, &frame->sig);  		put_user_ex(ptr_to_compat(&frame->info), &frame->pinfo);  		put_user_ex(ptr_to_compat(&frame->uc), &frame->puc); -		err |= copy_siginfo_to_user32(&frame->info, info);  		/* Create the ucontext.  
*/  		if (cpu_has_xsave) @@ -514,9 +519,6 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,  		put_user_ex(sas_ss_flags(regs->sp),  			    &frame->uc.uc_stack.ss_flags);  		put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); -		err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, -					     regs, set->sig[0]); -		err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));  		if (ka->sa.sa_flags & SA_RESTORER)  			restorer = ka->sa.sa_restorer; @@ -529,9 +531,14 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,  		 * Not actually used anymore, but left because some gdb  		 * versions need it.  		 */ -		put_user_ex(*((u64 *)&code), (u64 *)frame->retcode); +		put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode);  	} put_user_catch(err); +	err |= copy_siginfo_to_user32(&frame->info, info); +	err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate, +				     regs, set->sig[0]); +	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); +  	if (err)  		return -EFAULT; diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 20e5f7ba0e6..9c289504e68 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -14,6 +14,7 @@  #include <asm/segment.h>  #include <asm/irqflags.h>  #include <asm/asm.h> +#include <asm/smap.h>  #include <linux/linkage.h>  #include <linux/err.h> @@ -146,8 +147,10 @@ ENTRY(ia32_sysenter_target)  	SAVE_ARGS 0,1,0   	/* no need to do an access_ok check here because rbp has been   	   32bit zero extended */  +	ASM_STAC  1:	movl	(%rbp),%ebp  	_ASM_EXTABLE(1b,ia32_badarg) +	ASM_CLAC  	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)  	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)  	CFI_REMEMBER_STATE @@ -301,8 +304,10 @@ ENTRY(ia32_cstar_target)  	/* no need to do an access_ok check here because r8 has been  	   32bit zero extended */   	/* hardware stack frame is complete now */	 +	ASM_STAC  1:	movl	(%r8),%r9d  	_ASM_EXTABLE(1b,ia32_badarg) +	ASM_CLAC  	orl     $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)  	testl   $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)  	CFI_REMEMBER_STATE @@ -365,6 +370,7 @@ cstar_tracesys:  END(ia32_cstar_target)  ia32_badarg: +	ASM_CLAC  	movq $-EFAULT,%rax  	jmp ia32_sysret  	CFI_ENDPROC diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 4540bece094..c5b938d92ea 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -287,7 +287,7 @@ asmlinkage long sys32_sigaction(int sig, struct old_sigaction32 __user *act,  	return ret;  } -asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, +asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int __user *stat_addr,  			      int options)  {  	return compat_sys_wait4(pid, stat_addr, options, NULL); diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 952bd0100c5..372231c22a4 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -1,3 +1,6 @@ +#ifndef _ASM_X86_ALTERNATIVE_ASM_H +#define _ASM_X86_ALTERNATIVE_ASM_H +  #ifdef __ASSEMBLY__  #include <asm/asm.h> @@ -5,10 +8,10 @@  #ifdef CONFIG_SMP  	.macro LOCK_PREFIX  672:	lock -	.section .smp_locks,"a" +	.pushsection .smp_locks,"a"  	.balign 4  	.long 672b - . 
-	.previous +	.popsection  	.endm  #else  	.macro LOCK_PREFIX @@ -24,3 +27,5 @@  .endm  #endif  /*  __ASSEMBLY__  */ + +#endif /* _ASM_X86_ALTERNATIVE_ASM_H */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 70780689599..58ed6d96a6a 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -29,10 +29,10 @@  #ifdef CONFIG_SMP  #define LOCK_PREFIX_HERE \ -		".section .smp_locks,\"a\"\n"	\ -		".balign 4\n"			\ -		".long 671f - .\n" /* offset */	\ -		".previous\n"			\ +		".pushsection .smp_locks,\"a\"\n"	\ +		".balign 4\n"				\ +		".long 671f - .\n" /* offset */		\ +		".popsection\n"				\  		"671:"  #define LOCK_PREFIX LOCK_PREFIX_HERE "\n\tlock; " @@ -60,7 +60,7 @@ extern void alternatives_smp_module_add(struct module *mod, char *name,  					void *locks, void *locks_end,  					void *text, void *text_end);  extern void alternatives_smp_module_del(struct module *mod); -extern void alternatives_smp_switch(int smp); +extern void alternatives_enable_smp(void);  extern int alternatives_text_reserved(void *start, void *end);  extern bool skip_smp_alternatives;  #else @@ -68,7 +68,7 @@ static inline void alternatives_smp_module_add(struct module *mod, char *name,  					       void *locks, void *locks_end,  					       void *text, void *text_end) {}  static inline void alternatives_smp_module_del(struct module *mod) {} -static inline void alternatives_smp_switch(int smp) {} +static inline void alternatives_enable_smp(void) {}  static inline int alternatives_text_reserved(void *start, void *end)  {  	return 0; @@ -99,30 +99,30 @@ static inline int alternatives_text_reserved(void *start, void *end)  /* alternative assembly primitive: */  #define ALTERNATIVE(oldinstr, newinstr, feature)			\  	OLDINSTR(oldinstr)						\ -	".section .altinstructions,\"a\"\n"				\ +	".pushsection .altinstructions,\"a\"\n"				\  	ALTINSTR_ENTRY(feature, 1)					\ -	".previous\n"							\ -	".section .discard,\"aw\",@progbits\n"				\ +	".popsection\n"							\ +	".pushsection .discard,\"aw\",@progbits\n"			\  	DISCARD_ENTRY(1)						\ -	".previous\n"							\ -	".section .altinstr_replacement, \"ax\"\n"			\ +	".popsection\n"							\ +	".pushsection .altinstr_replacement, \"ax\"\n"			\  	ALTINSTR_REPLACEMENT(newinstr, feature, 1)			\ -	".previous" +	".popsection"  #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\  	OLDINSTR(oldinstr)						\ -	".section .altinstructions,\"a\"\n"				\ +	".pushsection .altinstructions,\"a\"\n"				\  	ALTINSTR_ENTRY(feature1, 1)					\  	ALTINSTR_ENTRY(feature2, 2)					\ -	".previous\n"							\ -	".section .discard,\"aw\",@progbits\n"				\ +	".popsection\n"							\ +	".pushsection .discard,\"aw\",@progbits\n"			\  	DISCARD_ENTRY(1)						\  	DISCARD_ENTRY(2)						\ -	".previous\n"							\ -	".section .altinstr_replacement, \"ax\"\n"			\ +	".popsection\n"							\ +	".pushsection .altinstr_replacement, \"ax\"\n"			\  	ALTINSTR_REPLACEMENT(newinstr1, feature1, 1)			\  	ALTINSTR_REPLACEMENT(newinstr2, feature2, 2)			\ -	".previous" +	".popsection"  /*   * This must be included *after* the definition of ALTERNATIVE due to diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 72f5009deb5..6dfd0195bb5 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -355,7 +355,7 @@ static int test_bit(int nr, const volatile unsigned long *addr);   */  static inline unsigned long __ffs(unsigned long word)  { -	asm("bsf %1,%0" +	asm("rep; bsf %1,%0"  		: "=r" (word)  		: 
"rm" (word));  	return word; @@ -369,7 +369,7 @@ static inline unsigned long __ffs(unsigned long word)   */  static inline unsigned long ffz(unsigned long word)  { -	asm("bsf %1,%0" +	asm("rep; bsf %1,%0"  		: "=r" (word)  		: "r" (~word));  	return word; @@ -417,10 +417,9 @@ static inline int ffs(int x)  	 * We cannot do this on 32 bits because at the very least some  	 * 486 CPUs did not behave this way.  	 */ -	long tmp = -1;  	asm("bsfl %1,%0"  	    : "=r" (r) -	    : "rm" (x), "0" (tmp)); +	    : "rm" (x), "0" (-1));  #elif defined(CONFIG_X86_CMOV)  	asm("bsfl %1,%0\n\t"  	    "cmovzl %2,%0" @@ -459,10 +458,9 @@ static inline int fls(int x)  	 * We cannot do this on 32 bits because at the very least some  	 * 486 CPUs did not behave this way.  	 */ -	long tmp = -1;  	asm("bsrl %1,%0"  	    : "=r" (r) -	    : "rm" (x), "0" (tmp)); +	    : "rm" (x), "0" (-1));  #elif defined(CONFIG_X86_CMOV)  	asm("bsrl %1,%0\n\t"  	    "cmovzl %2,%0" @@ -490,13 +488,13 @@ static inline int fls(int x)  #ifdef CONFIG_X86_64  static __always_inline int fls64(__u64 x)  { -	long bitpos = -1; +	int bitpos = -1;  	/*  	 * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the  	 * dest reg is undefined if x==0, but their CPU architect says its  	 * value is written to set it to the same as before.  	 */ -	asm("bsrq %1,%0" +	asm("bsrq %1,%q0"  	    : "+r" (bitpos)  	    : "rm" (x));  	return bitpos + 1; diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index a9e3a740f69..7f8422a28a4 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -49,38 +49,36 @@ For 32-bit we have the following conventions - kernel is built with  #include "dwarf2.h"  /* - * 64-bit system call stack frame layout defines and helpers, for - * assembly code (note that the seemingly unnecessary parentheses - * are to prevent cpp from inserting spaces in expressions that get - * passed to macros): + * 64-bit system call stack frame layout defines and helpers, + * for assembly code:   */ -#define R15		  (0) -#define R14		  (8) -#define R13		 (16) -#define R12		 (24) -#define RBP		 (32) -#define RBX		 (40) +#define R15		  0 +#define R14		  8 +#define R13		 16 +#define R12		 24 +#define RBP		 32 +#define RBX		 40  /* arguments: interrupts/non tracing syscalls only save up to here: */ -#define R11		 (48) -#define R10		 (56) -#define R9		 (64) -#define R8		 (72) -#define RAX		 (80) -#define RCX		 (88) -#define RDX		 (96) -#define RSI		(104) -#define RDI		(112) -#define ORIG_RAX	(120)       /* + error_code */ +#define R11		 48 +#define R10		 56 +#define R9		 64 +#define R8		 72 +#define RAX		 80 +#define RCX		 88 +#define RDX		 96 +#define RSI		104 +#define RDI		112 +#define ORIG_RAX	120       /* + error_code */  /* end of arguments */  /* cpu exception frame or undefined in case of fast syscall: */ -#define RIP		(128) -#define CS		(136) -#define EFLAGS		(144) -#define RSP		(152) -#define SS		(160) +#define RIP		128 +#define CS		136 +#define EFLAGS		144 +#define RSP		152 +#define SS		160  #define ARGOFFSET	R11  #define SWFRAME		ORIG_RAX diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 6b7ee5ff682..16cae425d1f 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -97,6 +97,7 @@  #define X86_FEATURE_EXTD_APICID	(3*32+26) /* has extended APICID (8 bits) */  #define X86_FEATURE_AMD_DCM     (3*32+27) /* multi-node processor */  #define X86_FEATURE_APERFMPERF	(3*32+28) /* APERFMPERF */ +#define 
X86_FEATURE_EAGER_FPU	(3*32+29) /* "eagerfpu" Non lazy FPU restore */  /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */  #define X86_FEATURE_XMM3	(4*32+ 0) /* "pni" SSE-3 */ @@ -209,6 +210,7 @@  #define X86_FEATURE_RTM		(9*32+11) /* Restricted Transactional Memory */  #define X86_FEATURE_RDSEED	(9*32+18) /* The RDSEED instruction */  #define X86_FEATURE_ADX		(9*32+19) /* The ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP	(9*32+20) /* Supervisor Mode Access Prevention */  #if defined(__KERNEL__) && !defined(__ASSEMBLY__) @@ -299,12 +301,14 @@ extern const char * const x86_power_flags[32];  #define cpu_has_xmm4_2		boot_cpu_has(X86_FEATURE_XMM4_2)  #define cpu_has_x2apic		boot_cpu_has(X86_FEATURE_X2APIC)  #define cpu_has_xsave		boot_cpu_has(X86_FEATURE_XSAVE) +#define cpu_has_xsaveopt	boot_cpu_has(X86_FEATURE_XSAVEOPT)  #define cpu_has_osxsave		boot_cpu_has(X86_FEATURE_OSXSAVE)  #define cpu_has_hypervisor	boot_cpu_has(X86_FEATURE_HYPERVISOR)  #define cpu_has_pclmulqdq	boot_cpu_has(X86_FEATURE_PCLMULQDQ)  #define cpu_has_perfctr_core	boot_cpu_has(X86_FEATURE_PERFCTR_CORE)  #define cpu_has_cx8		boot_cpu_has(X86_FEATURE_CX8)  #define cpu_has_cx16		boot_cpu_has(X86_FEATURE_CX16) +#define cpu_has_eager_fpu	boot_cpu_has(X86_FEATURE_EAGER_FPU)  #if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)  # define cpu_has_invlpg		1 diff --git a/arch/x86/include/asm/fpu-internal.h b/arch/x86/include/asm/fpu-internal.h index 75f4c6d6a33..831dbb9c6c0 100644 --- a/arch/x86/include/asm/fpu-internal.h +++ b/arch/x86/include/asm/fpu-internal.h @@ -12,6 +12,7 @@  #include <linux/kernel_stat.h>  #include <linux/regset.h> +#include <linux/compat.h>  #include <linux/slab.h>  #include <asm/asm.h>  #include <asm/cpufeature.h> @@ -20,43 +21,76 @@  #include <asm/user.h>  #include <asm/uaccess.h>  #include <asm/xsave.h> +#include <asm/smap.h> -extern unsigned int sig_xstate_size; +#ifdef CONFIG_X86_64 +# include <asm/sigcontext32.h> +# include <asm/user32.h> +int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, +			compat_sigset_t *set, struct pt_regs *regs); +int ia32_setup_frame(int sig, struct k_sigaction *ka, +		     compat_sigset_t *set, struct pt_regs *regs); +#else +# define user_i387_ia32_struct	user_i387_struct +# define user32_fxsr_struct	user_fxsr_struct +# define ia32_setup_frame	__setup_frame +# define ia32_setup_rt_frame	__setup_rt_frame +#endif + +extern unsigned int mxcsr_feature_mask;  extern void fpu_init(void); +extern void eager_fpu_init(void);  DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); +extern void convert_from_fxsr(struct user_i387_ia32_struct *env, +			      struct task_struct *tsk); +extern void convert_to_fxsr(struct task_struct *tsk, +			    const struct user_i387_ia32_struct *env); +  extern user_regset_active_fn fpregs_active, xfpregs_active;  extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,  				xstateregs_get;  extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,  				 xstateregs_set; -  /*   * xstateregs_active == fpregs_active. Please refer to the comment   * at the definition of fpregs_active.   
*/  #define xstateregs_active	fpregs_active -extern struct _fpx_sw_bytes fx_sw_reserved; -#ifdef CONFIG_IA32_EMULATION -extern unsigned int sig_xstate_ia32_size; -extern struct _fpx_sw_bytes fx_sw_reserved_ia32; -struct _fpstate_ia32; -struct _xstate_ia32; -extern int save_i387_xstate_ia32(void __user *buf); -extern int restore_i387_xstate_ia32(void __user *buf); -#endif -  #ifdef CONFIG_MATH_EMULATION +# define HAVE_HWFP		(boot_cpu_data.hard_math)  extern void finit_soft_fpu(struct i387_soft_struct *soft);  #else +# define HAVE_HWFP		1  static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}  #endif +static inline int is_ia32_compat_frame(void) +{ +	return config_enabled(CONFIG_IA32_EMULATION) && +	       test_thread_flag(TIF_IA32); +} + +static inline int is_ia32_frame(void) +{ +	return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame(); +} + +static inline int is_x32_frame(void) +{ +	return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32); +} +  #define X87_FSW_ES (1 << 7)	/* Exception Summary */ +static __always_inline __pure bool use_eager_fpu(void) +{ +	return static_cpu_has(X86_FEATURE_EAGER_FPU); +} +  static __always_inline __pure bool use_xsaveopt(void)  {  	return static_cpu_has(X86_FEATURE_XSAVEOPT); @@ -72,6 +106,13 @@ static __always_inline __pure bool use_fxsr(void)          return static_cpu_has(X86_FEATURE_FXSR);  } +static inline void fx_finit(struct i387_fxsave_struct *fx) +{ +	memset(fx, 0, xstate_size); +	fx->cwd = 0x37f; +	fx->mxcsr = MXCSR_DEFAULT; +} +  extern void __sanitize_i387_state(struct task_struct *);  static inline void sanitize_i387_state(struct task_struct *tsk) @@ -81,131 +122,121 @@ static inline void sanitize_i387_state(struct task_struct *tsk)  	__sanitize_i387_state(tsk);  } -#ifdef CONFIG_X86_64 -static inline int fxrstor_checking(struct i387_fxsave_struct *fx) -{ -	int err; +#define user_insn(insn, output, input...)				\ +({									\ +	int err;							\ +	asm volatile(ASM_STAC "\n"					\ +		     "1:" #insn "\n\t"					\ +		     "2: " ASM_CLAC "\n"				\ +		     ".section .fixup,\"ax\"\n"				\ +		     "3:  movl $-1,%[err]\n"				\ +		     "    jmp  2b\n"					\ +		     ".previous\n"					\ +		     _ASM_EXTABLE(1b, 3b)				\ +		     : [err] "=r" (err), output				\ +		     : "0"(0), input);					\ +	err;								\ +}) -	/* See comment in fxsave() below. */ -#ifdef CONFIG_AS_FXSAVEQ -	asm volatile("1:  fxrstorq %[fx]\n\t" -		     "2:\n" -		     ".section .fixup,\"ax\"\n" -		     "3:  movl $-1,%[err]\n" -		     "    jmp  2b\n" -		     ".previous\n" -		     _ASM_EXTABLE(1b, 3b) -		     : [err] "=r" (err) -		     : [fx] "m" (*fx), "0" (0)); -#else -	asm volatile("1:  rex64/fxrstor (%[fx])\n\t" -		     "2:\n" -		     ".section .fixup,\"ax\"\n" -		     "3:  movl $-1,%[err]\n" -		     "    jmp  2b\n" -		     ".previous\n" -		     _ASM_EXTABLE(1b, 3b) -		     : [err] "=r" (err) -		     : [fx] "R" (fx), "m" (*fx), "0" (0)); -#endif -	return err; +#define check_insn(insn, output, input...)				
\ +({									\ +	int err;							\ +	asm volatile("1:" #insn "\n\t"					\ +		     "2:\n"						\ +		     ".section .fixup,\"ax\"\n"				\ +		     "3:  movl $-1,%[err]\n"				\ +		     "    jmp  2b\n"					\ +		     ".previous\n"					\ +		     _ASM_EXTABLE(1b, 3b)				\ +		     : [err] "=r" (err), output				\ +		     : "0"(0), input);					\ +	err;								\ +}) + +static inline int fsave_user(struct i387_fsave_struct __user *fx) +{ +	return user_insn(fnsave %[fx]; fwait,  [fx] "=m" (*fx), "m" (*fx));  }  static inline int fxsave_user(struct i387_fxsave_struct __user *fx)  { -	int err; - -	/* -	 * Clear the bytes not touched by the fxsave and reserved -	 * for the SW usage. -	 */ -	err = __clear_user(&fx->sw_reserved, -			   sizeof(struct _fpx_sw_bytes)); -	if (unlikely(err)) -		return -EFAULT; +	if (config_enabled(CONFIG_X86_32)) +		return user_insn(fxsave %[fx], [fx] "=m" (*fx), "m" (*fx)); +	else if (config_enabled(CONFIG_AS_FXSAVEQ)) +		return user_insn(fxsaveq %[fx], [fx] "=m" (*fx), "m" (*fx)); -	/* See comment in fxsave() below. */ -#ifdef CONFIG_AS_FXSAVEQ -	asm volatile("1:  fxsaveq %[fx]\n\t" -		     "2:\n" -		     ".section .fixup,\"ax\"\n" -		     "3:  movl $-1,%[err]\n" -		     "    jmp  2b\n" -		     ".previous\n" -		     _ASM_EXTABLE(1b, 3b) -		     : [err] "=r" (err), [fx] "=m" (*fx) -		     : "0" (0)); -#else -	asm volatile("1:  rex64/fxsave (%[fx])\n\t" -		     "2:\n" -		     ".section .fixup,\"ax\"\n" -		     "3:  movl $-1,%[err]\n" -		     "    jmp  2b\n" -		     ".previous\n" -		     _ASM_EXTABLE(1b, 3b) -		     : [err] "=r" (err), "=m" (*fx) -		     : [fx] "R" (fx), "0" (0)); -#endif -	if (unlikely(err) && -	    __clear_user(fx, sizeof(struct i387_fxsave_struct))) -		err = -EFAULT; -	/* No need to clear here because the caller clears USED_MATH */ -	return err; +	/* See comment in fpu_fxsave() below. */ +	return user_insn(rex64/fxsave (%[fx]), "=m" (*fx), [fx] "R" (fx));  } -static inline void fpu_fxsave(struct fpu *fpu) +static inline int fxrstor_checking(struct i387_fxsave_struct *fx)  { -	/* Using "rex64; fxsave %0" is broken because, if the memory operand -	   uses any extended registers for addressing, a second REX prefix -	   will be generated (to the assembler, rex64 followed by semicolon -	   is a separate instruction), and hence the 64-bitness is lost. */ +	if (config_enabled(CONFIG_X86_32)) +		return check_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +	else if (config_enabled(CONFIG_AS_FXSAVEQ)) +		return check_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -#ifdef CONFIG_AS_FXSAVEQ -	/* Using "fxsaveq %0" would be the ideal choice, but is only supported -	   starting with gas 2.16. */ -	__asm__ __volatile__("fxsaveq %0" -			     : "=m" (fpu->state->fxsave)); -#else -	/* Using, as a workaround, the properly prefixed form below isn't -	   accepted by any binutils version so far released, complaining that -	   the same type of prefix is used twice if an extended register is -	   needed for addressing (fix submitted to mainline 2005-11-21). -	asm volatile("rex64/fxsave %0" -		     : "=m" (fpu->state->fxsave)); -	   This, however, we can work around by forcing the compiler to select -	   an addressing mode that doesn't require extended registers. */ -	asm volatile("rex64/fxsave (%[fx])" -		     : "=m" (fpu->state->fxsave) -		     : [fx] "R" (&fpu->state->fxsave)); -#endif +	/* See comment in fpu_fxsave() below. 
*/ +	return check_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), +			  "m" (*fx));  } -#else  /* CONFIG_X86_32 */ +static inline int fxrstor_user(struct i387_fxsave_struct __user *fx) +{ +	if (config_enabled(CONFIG_X86_32)) +		return user_insn(fxrstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +	else if (config_enabled(CONFIG_AS_FXSAVEQ)) +		return user_insn(fxrstorq %[fx], "=m" (*fx), [fx] "m" (*fx)); -/* perform fxrstor iff the processor has extended states, otherwise frstor */ -static inline int fxrstor_checking(struct i387_fxsave_struct *fx) +	/* See comment in fpu_fxsave() below. */ +	return user_insn(rex64/fxrstor (%[fx]), "=m" (*fx), [fx] "R" (fx), +			  "m" (*fx)); +} + +static inline int frstor_checking(struct i387_fsave_struct *fx)  { -	/* -	 * The "nop" is needed to make the instructions the same -	 * length. -	 */ -	alternative_input( -		"nop ; frstor %1", -		"fxrstor %1", -		X86_FEATURE_FXSR, -		"m" (*fx)); +	return check_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx)); +} -	return 0; +static inline int frstor_user(struct i387_fsave_struct __user *fx) +{ +	return user_insn(frstor %[fx], "=m" (*fx), [fx] "m" (*fx));  }  static inline void fpu_fxsave(struct fpu *fpu)  { -	asm volatile("fxsave %[fx]" -		     : [fx] "=m" (fpu->state->fxsave)); +	if (config_enabled(CONFIG_X86_32)) +		asm volatile( "fxsave %[fx]" : [fx] "=m" (fpu->state->fxsave)); +	else if (config_enabled(CONFIG_AS_FXSAVEQ)) +		asm volatile("fxsaveq %0" : "=m" (fpu->state->fxsave)); +	else { +		/* Using "rex64; fxsave %0" is broken because, if the memory +		 * operand uses any extended registers for addressing, a second +		 * REX prefix will be generated (to the assembler, rex64 +		 * followed by semicolon is a separate instruction), and hence +		 * the 64-bitness is lost. +		 * +		 * Using "fxsaveq %0" would be the ideal choice, but is only +		 * supported starting with gas 2.16. +		 * +		 * Using, as a workaround, the properly prefixed form below +		 * isn't accepted by any binutils version so far released, +		 * complaining that the same type of prefix is used twice if +		 * an extended register is needed for addressing (fix submitted +		 * to mainline 2005-11-21). +		 * +		 *  asm volatile("rex64/fxsave %0" : "=m" (fpu->state->fxsave)); +		 * +		 * This, however, we can work around by forcing the compiler to +		 * select an addressing mode that doesn't require extended +		 * registers. +		 */ +		asm volatile( "rex64/fxsave (%[fx])" +			     : "=m" (fpu->state->fxsave) +			     : [fx] "R" (&fpu->state->fxsave)); +	}  } -#endif	/* CONFIG_X86_64 */ -  /*   * These must be called with preempt disabled. Returns   * 'true' if the FPU state is still intact. 
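The fpu-internal.h hunks above replace the old #ifdef CONFIG_X86_64 / CONFIG_X86_32 blocks with plain C branches on config_enabled(), so both the FXSAVE and FXSAVEQ paths stay visible to the compiler and only the dead branch is discarded. The stand-alone sketch below is illustrative only: CONFIG_FOO and the userspace main() are made up, and the macros mirror the kernel's include/linux/kconfig.h implementation, showing the placeholder-argument trick that lets config_enabled() collapse to a compile-time 1 or 0.

/*
 * Minimal re-creation of the config_enabled() idiom relied on by the
 * new fxsave_user()/fxrstor_checking() helpers in this patch.
 * CONFIG_FOO is a made-up option for the demo.
 */
#include <stdio.h>

#define __ARG_PLACEHOLDER_1			0,
#define config_enabled(cfg)			_config_enabled(cfg)
#define _config_enabled(value)			__config_enabled(__ARG_PLACEHOLDER_##value)
#define __config_enabled(arg1_or_junk)		___config_enabled(arg1_or_junk 1, 0)
#define ___config_enabled(__ignored, val, ...)	val

#define CONFIG_FOO 1	/* remove this line and config_enabled() yields 0 */

int main(void)
{
	/* Both branches are parsed and type-checked; the untaken one is
	 * dropped at compile time, unlike an #ifdef'd-out block. */
	if (config_enabled(CONFIG_FOO))
		printf("CONFIG_FOO=y path\n");
	else
		printf("CONFIG_FOO unset path\n");
	return 0;
}
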
@@ -248,17 +279,14 @@ static inline int __save_init_fpu(struct task_struct *tsk)  	return fpu_save_init(&tsk->thread.fpu);  } -static inline int fpu_fxrstor_checking(struct fpu *fpu) -{ -	return fxrstor_checking(&fpu->state->fxsave); -} -  static inline int fpu_restore_checking(struct fpu *fpu)  {  	if (use_xsave()) -		return fpu_xrstor_checking(fpu); +		return fpu_xrstor_checking(&fpu->state->xsave); +	else if (use_fxsr()) +		return fxrstor_checking(&fpu->state->fxsave);  	else -		return fpu_fxrstor_checking(fpu); +		return frstor_checking(&fpu->state->fsave);  }  static inline int restore_fpu_checking(struct task_struct *tsk) @@ -310,15 +338,52 @@ static inline void __thread_set_has_fpu(struct task_struct *tsk)  static inline void __thread_fpu_end(struct task_struct *tsk)  {  	__thread_clear_has_fpu(tsk); -	stts(); +	if (!use_eager_fpu()) +		stts();  }  static inline void __thread_fpu_begin(struct task_struct *tsk)  { -	clts(); +	if (!use_eager_fpu()) +		clts();  	__thread_set_has_fpu(tsk);  } +static inline void __drop_fpu(struct task_struct *tsk) +{ +	if (__thread_has_fpu(tsk)) { +		/* Ignore delayed exceptions from user space */ +		asm volatile("1: fwait\n" +			     "2:\n" +			     _ASM_EXTABLE(1b, 2b)); +		__thread_fpu_end(tsk); +	} +} + +static inline void drop_fpu(struct task_struct *tsk) +{ +	/* +	 * Forget coprocessor state.. +	 */ +	preempt_disable(); +	tsk->fpu_counter = 0; +	__drop_fpu(tsk); +	clear_used_math(); +	preempt_enable(); +} + +static inline void drop_init_fpu(struct task_struct *tsk) +{ +	if (!use_eager_fpu()) +		drop_fpu(tsk); +	else { +		if (use_xsave()) +			xrstor_state(init_xstate_buf, -1); +		else +			fxrstor_checking(&init_xstate_buf->i387); +	} +} +  /*   * FPU state switching for scheduling.   * @@ -352,7 +417,12 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta  {  	fpu_switch_t fpu; -	fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; +	/* +	 * If the task has used the math, pre-load the FPU on xsave processors +	 * or if the past 5 consecutive context-switches used math. +	 */ +	fpu.preload = tsk_used_math(new) && (use_eager_fpu() || +					     new->fpu_counter > 5);  	if (__thread_has_fpu(old)) {  		if (!__save_init_fpu(old))  			cpu = ~0; @@ -364,14 +434,14 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta  			new->fpu_counter++;  			__thread_set_has_fpu(new);  			prefetch(new->thread.fpu.state); -		} else +		} else if (!use_eager_fpu())  			stts();  	} else {  		old->fpu_counter = 0;  		old->thread.fpu.last_cpu = ~0;  		if (fpu.preload) {  			new->fpu_counter++; -			if (fpu_lazy_restore(new, cpu)) +			if (!use_eager_fpu() && fpu_lazy_restore(new, cpu))  				fpu.preload = 0;  			else  				prefetch(new->thread.fpu.state); @@ -391,44 +461,40 @@ static inline void switch_fpu_finish(struct task_struct *new, fpu_switch_t fpu)  {  	if (fpu.preload) {  		if (unlikely(restore_fpu_checking(new))) -			__thread_fpu_end(new); +			drop_init_fpu(new);  	}  }  /*   * Signal frame handlers...   
*/ -extern int save_i387_xstate(void __user *buf); -extern int restore_i387_xstate(void __user *buf); +extern int save_xstate_sig(void __user *buf, void __user *fx, int size); +extern int __restore_xstate_sig(void __user *buf, void __user *fx, int size); -static inline void __clear_fpu(struct task_struct *tsk) +static inline int xstate_sigframe_size(void)  { -	if (__thread_has_fpu(tsk)) { -		/* Ignore delayed exceptions from user space */ -		asm volatile("1: fwait\n" -			     "2:\n" -			     _ASM_EXTABLE(1b, 2b)); -		__thread_fpu_end(tsk); +	return use_xsave() ? xstate_size + FP_XSTATE_MAGIC2_SIZE : xstate_size; +} + +static inline int restore_xstate_sig(void __user *buf, int ia32_frame) +{ +	void __user *buf_fx = buf; +	int size = xstate_sigframe_size(); + +	if (ia32_frame && use_fxsr()) { +		buf_fx = buf + sizeof(struct i387_fsave_struct); +		size += sizeof(struct i387_fsave_struct);  	} + +	return __restore_xstate_sig(buf, buf_fx, size);  }  /* - * The actual user_fpu_begin/end() functions - * need to be preemption-safe. + * Need to be preemption-safe.   * - * NOTE! user_fpu_end() must be used only after you - * have saved the FP state, and user_fpu_begin() must - * be used only immediately before restoring it. - * These functions do not do any save/restore on - * their own. + * NOTE! user_fpu_begin() must be used only immediately before restoring + * it. This function does not do any save/restore on their own.   */ -static inline void user_fpu_end(void) -{ -	preempt_disable(); -	__thread_fpu_end(current); -	preempt_enable(); -} -  static inline void user_fpu_begin(void)  {  	preempt_disable(); @@ -437,25 +503,32 @@ static inline void user_fpu_begin(void)  	preempt_enable();  } +static inline void __save_fpu(struct task_struct *tsk) +{ +	if (use_xsave()) +		xsave_state(&tsk->thread.fpu.state->xsave, -1); +	else +		fpu_fxsave(&tsk->thread.fpu); +} +  /*   * These disable preemption on their own and are safe   */  static inline void save_init_fpu(struct task_struct *tsk)  {  	WARN_ON_ONCE(!__thread_has_fpu(tsk)); + +	if (use_eager_fpu()) { +		__save_fpu(tsk); +		return; +	} +  	preempt_disable();  	__save_init_fpu(tsk);  	__thread_fpu_end(tsk);  	preempt_enable();  } -static inline void clear_fpu(struct task_struct *tsk) -{ -	preempt_disable(); -	__clear_fpu(tsk); -	preempt_enable(); -} -  /*   * i387 state interaction   */ @@ -510,11 +583,34 @@ static inline void fpu_free(struct fpu *fpu)  	}  } -static inline void fpu_copy(struct fpu *dst, struct fpu *src) +static inline void fpu_copy(struct task_struct *dst, struct task_struct *src)  { -	memcpy(dst->state, src->state, xstate_size); +	if (use_eager_fpu()) { +		memset(&dst->thread.fpu.state->xsave, 0, xstate_size); +		__save_fpu(dst); +	} else { +		struct fpu *dfpu = &dst->thread.fpu; +		struct fpu *sfpu = &src->thread.fpu; + +		unlazy_fpu(src); +		memcpy(dfpu->state, sfpu->state, xstate_size); +	}  } -extern void fpu_finit(struct fpu *fpu); +static inline unsigned long +alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, +		unsigned long *size) +{ +	unsigned long frame_size = xstate_sigframe_size(); + +	*buf_fx = sp = round_down(sp - frame_size, 64); +	if (ia32_frame && use_fxsr()) { +		frame_size += sizeof(struct i387_fsave_struct); +		sp -= sizeof(struct i387_fsave_struct); +	} + +	*size = frame_size; +	return sp; +}  #endif diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index b0767bc0874..9a25b522d37 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ 
-3,38 +3,54 @@  #ifdef __ASSEMBLY__ -	.macro MCOUNT_SAVE_FRAME -	/* taken from glibc */ -	subq $0x38, %rsp -	movq %rax, (%rsp) -	movq %rcx, 8(%rsp) -	movq %rdx, 16(%rsp) -	movq %rsi, 24(%rsp) -	movq %rdi, 32(%rsp) -	movq %r8, 40(%rsp) -	movq %r9, 48(%rsp) +	/* skip is set if the stack was already partially adjusted */ +	.macro MCOUNT_SAVE_FRAME skip=0 +	 /* +	  * We add enough stack to save all regs. +	  */ +	subq $(SS+8-\skip), %rsp +	movq %rax, RAX(%rsp) +	movq %rcx, RCX(%rsp) +	movq %rdx, RDX(%rsp) +	movq %rsi, RSI(%rsp) +	movq %rdi, RDI(%rsp) +	movq %r8, R8(%rsp) +	movq %r9, R9(%rsp) +	 /* Move RIP to its proper location */ +	movq SS+8(%rsp), %rdx +	movq %rdx, RIP(%rsp)  	.endm -	.macro MCOUNT_RESTORE_FRAME -	movq 48(%rsp), %r9 -	movq 40(%rsp), %r8 -	movq 32(%rsp), %rdi -	movq 24(%rsp), %rsi -	movq 16(%rsp), %rdx -	movq 8(%rsp), %rcx -	movq (%rsp), %rax -	addq $0x38, %rsp +	.macro MCOUNT_RESTORE_FRAME skip=0 +	movq R9(%rsp), %r9 +	movq R8(%rsp), %r8 +	movq RDI(%rsp), %rdi +	movq RSI(%rsp), %rsi +	movq RDX(%rsp), %rdx +	movq RCX(%rsp), %rcx +	movq RAX(%rsp), %rax +	addq $(SS+8-\skip), %rsp  	.endm  #endif  #ifdef CONFIG_FUNCTION_TRACER -#define MCOUNT_ADDR		((long)(mcount)) +#ifdef CC_USING_FENTRY +# define MCOUNT_ADDR		((long)(__fentry__)) +#else +# define MCOUNT_ADDR		((long)(mcount)) +#endif  #define MCOUNT_INSN_SIZE	5 /* sizeof mcount call */ +#ifdef CONFIG_DYNAMIC_FTRACE +#define ARCH_SUPPORTS_FTRACE_OPS 1 +#define ARCH_SUPPORTS_FTRACE_SAVE_REGS +#endif +  #ifndef __ASSEMBLY__  extern void mcount(void);  extern atomic_t modifying_ftrace_code; +extern void __fentry__(void);  static inline unsigned long ftrace_call_adjust(unsigned long addr)  { diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index 71ecbcba1a4..f373046e63e 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -9,10 +9,13 @@  #include <asm/asm.h>  #include <asm/errno.h>  #include <asm/processor.h> +#include <asm/smap.h>  #define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg)	\ -	asm volatile("1:\t" insn "\n"				\ -		     "2:\t.section .fixup,\"ax\"\n"		\ +	asm volatile("\t" ASM_STAC "\n"				\ +		     "1:\t" insn "\n"				\ +		     "2:\t" ASM_CLAC "\n"			\ +		     "\t.section .fixup,\"ax\"\n"		\  		     "3:\tmov\t%3, %1\n"			\  		     "\tjmp\t2b\n"				\  		     "\t.previous\n"				\ @@ -21,12 +24,14 @@  		     : "i" (-EFAULT), "0" (oparg), "1" (0))  #define __futex_atomic_op2(insn, ret, oldval, uaddr, oparg)	\ -	asm volatile("1:\tmovl	%2, %0\n"			\ +	asm volatile("\t" ASM_STAC "\n"				\ +		     "1:\tmovl	%2, %0\n"			\  		     "\tmovl\t%0, %3\n"				\  		     "\t" insn "\n"				\  		     "2:\t" LOCK_PREFIX "cmpxchgl %3, %2\n"	\  		     "\tjnz\t1b\n"				\ -		     "3:\t.section .fixup,\"ax\"\n"		\ +		     "3:\t" ASM_CLAC "\n"			\ +		     "\t.section .fixup,\"ax\"\n"		\  		     "4:\tmov\t%5, %1\n"			\  		     "\tjmp\t3b\n"				\  		     "\t.previous\n"				\ @@ -122,8 +127,10 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,  	if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))  		return -EFAULT; -	asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" -		     "2:\t.section .fixup, \"ax\"\n" +	asm volatile("\t" ASM_STAC "\n" +		     "1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n" +		     "2:\t" ASM_CLAC "\n" +		     "\t.section .fixup, \"ax\"\n"  		     "3:\tmov     %3, %0\n"  		     "\tjmp     2b\n"  		     "\t.previous\n" diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index d3895dbf4dd..81f04cee5f7 100644 --- 
a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -18,6 +18,10 @@ typedef struct {  #ifdef CONFIG_SMP  	unsigned int irq_resched_count;  	unsigned int irq_call_count; +	/* +	 * irq_tlb_count is double-counted in irq_call_count, so it must be +	 * subtracted from irq_call_count when displaying irq_call_count +	 */  	unsigned int irq_tlb_count;  #endif  #ifdef CONFIG_X86_THERMAL_VECTOR diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h index 2c392d663dc..434e2106cc8 100644 --- a/arch/x86/include/asm/hpet.h +++ b/arch/x86/include/asm/hpet.h @@ -35,8 +35,6 @@  #define	HPET_ID_NUMBER_SHIFT	8  #define HPET_ID_VENDOR_SHIFT	16 -#define HPET_ID_VENDOR_8086	0x8086 -  #define HPET_CFG_ENABLE		0x001  #define HPET_CFG_LEGACY		0x002  #define	HPET_LEGACY_8254	2 diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 257d9cca214..ed8089d6909 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -19,12 +19,37 @@ struct pt_regs;  struct user_i387_struct;  extern int init_fpu(struct task_struct *child); +extern void fpu_finit(struct fpu *fpu);  extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);  extern void math_state_restore(void);  extern bool irq_fpu_usable(void); -extern void kernel_fpu_begin(void); -extern void kernel_fpu_end(void); + +/* + * Careful: __kernel_fpu_begin/end() must be called with preempt disabled + * and they don't touch the preempt state on their own. + * If you enable preemption after __kernel_fpu_begin(), preempt notifier + * should call the __kernel_fpu_end() to prevent the kernel/user FPU + * state from getting corrupted. KVM for example uses this model. + * + * All other cases use kernel_fpu_begin/end() which disable preemption + * during kernel FPU usage. + */ +extern void __kernel_fpu_begin(void); +extern void __kernel_fpu_end(void); + +static inline void kernel_fpu_begin(void) +{ +	WARN_ON_ONCE(!irq_fpu_usable()); +	preempt_disable(); +	__kernel_fpu_begin(); +} + +static inline void kernel_fpu_end(void) +{ +	__kernel_fpu_end(); +	preempt_enable(); +}  /*   * Some instructions like VIA's padlock instructions generate a spurious diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h index f229b13a5f3..f42a04735a0 100644 --- a/arch/x86/include/asm/iommu_table.h +++ b/arch/x86/include/asm/iommu_table.h @@ -48,7 +48,7 @@ struct iommu_table_entry {  #define __IOMMU_INIT(_detect, _depend, _early_init, _late_init, _finish)\ -	static const struct iommu_table_entry const			\ +	static const struct iommu_table_entry				\  		__iommu_entry_##_detect __used				\  	__attribute__ ((unused, __section__(".iommu_table"),		\  			aligned((sizeof(void *)))))	\ @@ -63,10 +63,10 @@ struct iommu_table_entry {   * to stop detecting the other IOMMUs after yours has been detected.   */  #define IOMMU_INIT_POST(_detect)					\ -	__IOMMU_INIT(_detect, pci_swiotlb_detect_4gb,  0, 0, 0) +	__IOMMU_INIT(_detect, pci_swiotlb_detect_4gb,  NULL, NULL, 0)  #define IOMMU_INIT_POST_FINISH(detect)					\ -	__IOMMU_INIT(_detect, pci_swiotlb_detect_4gb,  0, 0, 1) +	__IOMMU_INIT(_detect, pci_swiotlb_detect_4gb,  NULL, NULL, 1)  /*   * A more sophisticated version of IOMMU_INIT. 
This variant requires: diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h index 54788253915..d3ddd17405d 100644 --- a/arch/x86/include/asm/kprobes.h +++ b/arch/x86/include/asm/kprobes.h @@ -27,6 +27,7 @@  #include <asm/insn.h>  #define  __ARCH_WANT_KPROBES_INSN_SLOT +#define  ARCH_SUPPORTS_KPROBES_ON_FTRACE  struct pt_regs;  struct kprobe; diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 246617efd67..41e08cb6a09 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -9,6 +9,22 @@  #include <linux/types.h>  #include <linux/ioctl.h> +#define DE_VECTOR 0 +#define DB_VECTOR 1 +#define BP_VECTOR 3 +#define OF_VECTOR 4 +#define BR_VECTOR 5 +#define UD_VECTOR 6 +#define NM_VECTOR 7 +#define DF_VECTOR 8 +#define TS_VECTOR 10 +#define NP_VECTOR 11 +#define SS_VECTOR 12 +#define GP_VECTOR 13 +#define PF_VECTOR 14 +#define MF_VECTOR 16 +#define MC_VECTOR 18 +  /* Select x86 specific features in <linux/kvm.h> */  #define __KVM_HAVE_PIT  #define __KVM_HAVE_IOAPIC diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 09155d64cf7..1eaa6b05667 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -75,22 +75,6 @@  #define KVM_HPAGE_MASK(x)	(~(KVM_HPAGE_SIZE(x) - 1))  #define KVM_PAGES_PER_HPAGE(x)	(KVM_HPAGE_SIZE(x) / PAGE_SIZE) -#define DE_VECTOR 0 -#define DB_VECTOR 1 -#define BP_VECTOR 3 -#define OF_VECTOR 4 -#define BR_VECTOR 5 -#define UD_VECTOR 6 -#define NM_VECTOR 7 -#define DF_VECTOR 8 -#define TS_VECTOR 10 -#define NP_VECTOR 11 -#define SS_VECTOR 12 -#define GP_VECTOR 13 -#define PF_VECTOR 14 -#define MF_VECTOR 16 -#define MC_VECTOR 18 -  #define SELECTOR_TI_MASK (1 << 2)  #define SELECTOR_RPL_MASK 0x03 diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index a3ac52b29cb..54d73b1f00a 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -116,19 +116,9 @@ struct mce_log {  /* Software defined banks */  #define MCE_EXTENDED_BANK	128  #define MCE_THERMAL_BANK	MCE_EXTENDED_BANK + 0 - -#define K8_MCE_THRESHOLD_BASE      (MCE_EXTENDED_BANK + 1)      /* MCE_AMD */ -#define K8_MCE_THRESHOLD_BANK_0    (MCE_THRESHOLD_BASE + 0 * 9) -#define K8_MCE_THRESHOLD_BANK_1    (MCE_THRESHOLD_BASE + 1 * 9) -#define K8_MCE_THRESHOLD_BANK_2    (MCE_THRESHOLD_BASE + 2 * 9) -#define K8_MCE_THRESHOLD_BANK_3    (MCE_THRESHOLD_BASE + 3 * 9) -#define K8_MCE_THRESHOLD_BANK_4    (MCE_THRESHOLD_BASE + 4 * 9) -#define K8_MCE_THRESHOLD_BANK_5    (MCE_THRESHOLD_BASE + 5 * 9) -#define K8_MCE_THRESHOLD_DRAM_ECC  (MCE_THRESHOLD_BANK_4 + 0) - +#define K8_MCE_THRESHOLD_BASE      (MCE_EXTENDED_BANK + 1)  #ifdef __KERNEL__ -  extern void mce_register_decode_chain(struct notifier_block *nb);  extern void mce_unregister_decode_chain(struct notifier_block *nb); @@ -171,6 +161,7 @@ DECLARE_PER_CPU(struct device *, mce_device);  #ifdef CONFIG_X86_MCE_INTEL  extern int mce_cmci_disabled;  extern int mce_ignore_ce; +extern int mce_bios_cmci_threshold;  void mce_intel_feature_init(struct cpuinfo_x86 *c);  void cmci_clear(void);  void cmci_reenable(void); diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 4ebe157bf73..43d921b4752 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -15,8 +15,8 @@ struct microcode_ops {  	enum ucode_state (*request_microcode_user) (int cpu,  				const void __user *buf, size_t size); -	enum ucode_state (*request_microcode_fw) (int cpu, -				struct device *device); +	enum 
ucode_state (*request_microcode_fw) (int cpu, struct device *, +						  bool refresh_fw);  	void (*microcode_fini_cpu) (int cpu); @@ -49,12 +49,6 @@ static inline struct microcode_ops * __init init_intel_microcode(void)  #ifdef CONFIG_MICROCODE_AMD  extern struct microcode_ops * __init init_amd_microcode(void);  extern void __exit exit_amd_microcode(void); - -static inline void get_ucode_data(void *to, const u8 *from, size_t n) -{ -	memcpy(to, from, n); -} -  #else  static inline struct microcode_ops * __init init_amd_microcode(void)  { diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index cb4e43bce98..4fabcdf1cfa 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -262,4 +262,6 @@ static inline void perf_check_microcode(void) { }   static inline void amd_pmu_disable_virt(void) { }  #endif +#define arch_perf_out_copy_user copy_from_user_nmi +  #endif /* _ASM_X86_PERF_EVENT_H */ diff --git a/arch/x86/include/asm/perf_regs.h b/arch/x86/include/asm/perf_regs.h new file mode 100644 index 00000000000..3f2207bfd17 --- /dev/null +++ b/arch/x86/include/asm/perf_regs.h @@ -0,0 +1,33 @@ +#ifndef _ASM_X86_PERF_REGS_H +#define _ASM_X86_PERF_REGS_H + +enum perf_event_x86_regs { +	PERF_REG_X86_AX, +	PERF_REG_X86_BX, +	PERF_REG_X86_CX, +	PERF_REG_X86_DX, +	PERF_REG_X86_SI, +	PERF_REG_X86_DI, +	PERF_REG_X86_BP, +	PERF_REG_X86_SP, +	PERF_REG_X86_IP, +	PERF_REG_X86_FLAGS, +	PERF_REG_X86_CS, +	PERF_REG_X86_SS, +	PERF_REG_X86_DS, +	PERF_REG_X86_ES, +	PERF_REG_X86_FS, +	PERF_REG_X86_GS, +	PERF_REG_X86_R8, +	PERF_REG_X86_R9, +	PERF_REG_X86_R10, +	PERF_REG_X86_R11, +	PERF_REG_X86_R12, +	PERF_REG_X86_R13, +	PERF_REG_X86_R14, +	PERF_REG_X86_R15, + +	PERF_REG_X86_32_MAX = PERF_REG_X86_GS + 1, +	PERF_REG_X86_64_MAX = PERF_REG_X86_R15 + 1, +}; +#endif /* _ASM_X86_PERF_REGS_H */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 013286a10c2..db8fec6d295 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -303,11 +303,9 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pte);  extern void native_pagetable_reserve(u64 start, u64 end);  #ifdef CONFIG_X86_32 -extern void native_pagetable_setup_start(pgd_t *base); -extern void native_pagetable_setup_done(pgd_t *base); +extern void native_pagetable_init(void);  #else -#define native_pagetable_setup_start x86_init_pgd_noop -#define native_pagetable_setup_done  x86_init_pgd_noop +#define native_pagetable_init        paging_init  #endif  struct seq_file; diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index aea1d1d848c..680cf09ed10 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -65,6 +65,7 @@  #define X86_CR4_PCIDE	0x00020000 /* enable PCID support */  #define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */  #define X86_CR4_SMEP	0x00100000 /* enable SMEP support */ +#define X86_CR4_SMAP	0x00200000 /* enable SMAP support */  /*   * x86-64 Task Priority Register, CR8 diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index d048cad9bca..b98c0d958eb 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -423,7 +423,6 @@ DECLARE_INIT_PER_CPU(irq_stack_union);  DECLARE_PER_CPU(char *, irq_stack_ptr);  DECLARE_PER_CPU(unsigned int, irq_count); -extern unsigned long kernel_eflags;  extern asmlinkage void ignore_sysret(void);  #else	/* X86_64 */  #ifdef 
CONFIG_CC_STACKPROTECTOR @@ -759,6 +758,8 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)  	wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);  } +extern void set_task_blockstep(struct task_struct *task, bool on); +  /*   * from system description table in BIOS. Mostly for MCA use, but   * others may find it useful: diff --git a/arch/x86/include/asm/rcu.h b/arch/x86/include/asm/rcu.h new file mode 100644 index 00000000000..d1ac07a2397 --- /dev/null +++ b/arch/x86/include/asm/rcu.h @@ -0,0 +1,32 @@ +#ifndef _ASM_X86_RCU_H +#define _ASM_X86_RCU_H + +#ifndef __ASSEMBLY__ + +#include <linux/rcupdate.h> +#include <asm/ptrace.h> + +static inline void exception_enter(struct pt_regs *regs) +{ +	rcu_user_exit(); +} + +static inline void exception_exit(struct pt_regs *regs) +{ +#ifdef CONFIG_RCU_USER_QS +	if (user_mode(regs)) +		rcu_user_enter(); +#endif +} + +#else /* __ASSEMBLY__ */ + +#ifdef CONFIG_RCU_USER_QS +# define SCHEDULE_USER call schedule_user +#else +# define SCHEDULE_USER call schedule +#endif + +#endif /* !__ASSEMBLY__ */ + +#endif diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 598457cbd0f..323973f4abf 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -31,6 +31,10 @@ typedef struct {  	unsigned long sig[_NSIG_WORDS];  } sigset_t; +#ifndef CONFIG_COMPAT +typedef sigset_t compat_sigset_t; +#endif +  #else  /* Here we must cater to libcs that poke about in kernel headers.  */ diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h new file mode 100644 index 00000000000..8d3120f4e27 --- /dev/null +++ b/arch/x86/include/asm/smap.h @@ -0,0 +1,91 @@ +/* + * Supervisor Mode Access Prevention support + * + * Copyright (C) 2012 Intel Corporation + * Author: H. Peter Anvin <hpa@linux.intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. 
+ */ + +#ifndef _ASM_X86_SMAP_H +#define _ASM_X86_SMAP_H + +#include <linux/stringify.h> +#include <asm/nops.h> +#include <asm/cpufeature.h> + +/* "Raw" instruction opcodes */ +#define __ASM_CLAC	.byte 0x0f,0x01,0xca +#define __ASM_STAC	.byte 0x0f,0x01,0xcb + +#ifdef __ASSEMBLY__ + +#include <asm/alternative-asm.h> + +#ifdef CONFIG_X86_SMAP + +#define ASM_CLAC							\ +	661: ASM_NOP3 ;							\ +	.pushsection .altinstr_replacement, "ax" ;			\ +	662: __ASM_CLAC ;						\ +	.popsection ;							\ +	.pushsection .altinstructions, "a" ;				\ +	altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ;	\ +	.popsection + +#define ASM_STAC							\ +	661: ASM_NOP3 ;							\ +	.pushsection .altinstr_replacement, "ax" ;			\ +	662: __ASM_STAC ;						\ +	.popsection ;							\ +	.pushsection .altinstructions, "a" ;				\ +	altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ;	\ +	.popsection + +#else /* CONFIG_X86_SMAP */ + +#define ASM_CLAC +#define ASM_STAC + +#endif /* CONFIG_X86_SMAP */ + +#else /* __ASSEMBLY__ */ + +#include <asm/alternative.h> + +#ifdef CONFIG_X86_SMAP + +static __always_inline void clac(void) +{ +	/* Note: a barrier is implicit in alternative() */ +	alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP); +} + +static __always_inline void stac(void) +{ +	/* Note: a barrier is implicit in alternative() */ +	alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP); +} + +/* These macros can be used in asm() statements */ +#define ASM_CLAC \ +	ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP) +#define ASM_STAC \ +	ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP) + +#else /* CONFIG_X86_SMAP */ + +static inline void clac(void) { } +static inline void stac(void) { } + +#define ASM_CLAC +#define ASM_STAC + +#endif /* CONFIG_X86_SMAP */ + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_SMAP_H */ diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h index f2b83bc7d78..cdf5674dd23 100644 --- a/arch/x86/include/asm/svm.h +++ b/arch/x86/include/asm/svm.h @@ -1,6 +1,135 @@  #ifndef __SVM_H  #define __SVM_H +#define SVM_EXIT_READ_CR0      0x000 +#define SVM_EXIT_READ_CR3      0x003 +#define SVM_EXIT_READ_CR4      0x004 +#define SVM_EXIT_READ_CR8      0x008 +#define SVM_EXIT_WRITE_CR0     0x010 +#define SVM_EXIT_WRITE_CR3     0x013 +#define SVM_EXIT_WRITE_CR4     0x014 +#define SVM_EXIT_WRITE_CR8     0x018 +#define SVM_EXIT_READ_DR0      0x020 +#define SVM_EXIT_READ_DR1      0x021 +#define SVM_EXIT_READ_DR2      0x022 +#define SVM_EXIT_READ_DR3      0x023 +#define SVM_EXIT_READ_DR4      0x024 +#define SVM_EXIT_READ_DR5      0x025 +#define SVM_EXIT_READ_DR6      0x026 +#define SVM_EXIT_READ_DR7      0x027 +#define SVM_EXIT_WRITE_DR0     0x030 +#define SVM_EXIT_WRITE_DR1     0x031 +#define SVM_EXIT_WRITE_DR2     0x032 +#define SVM_EXIT_WRITE_DR3     0x033 +#define SVM_EXIT_WRITE_DR4     0x034 +#define SVM_EXIT_WRITE_DR5     0x035 +#define SVM_EXIT_WRITE_DR6     0x036 +#define SVM_EXIT_WRITE_DR7     0x037 +#define SVM_EXIT_EXCP_BASE     0x040 +#define SVM_EXIT_INTR          0x060 +#define SVM_EXIT_NMI           0x061 +#define SVM_EXIT_SMI           0x062 +#define SVM_EXIT_INIT          0x063 +#define SVM_EXIT_VINTR         0x064 +#define SVM_EXIT_CR0_SEL_WRITE 0x065 +#define SVM_EXIT_IDTR_READ     0x066 +#define SVM_EXIT_GDTR_READ     0x067 +#define SVM_EXIT_LDTR_READ     0x068 +#define SVM_EXIT_TR_READ       0x069 +#define SVM_EXIT_IDTR_WRITE    0x06a +#define SVM_EXIT_GDTR_WRITE    0x06b +#define SVM_EXIT_LDTR_WRITE    0x06c 
+#define SVM_EXIT_TR_WRITE      0x06d +#define SVM_EXIT_RDTSC         0x06e +#define SVM_EXIT_RDPMC         0x06f +#define SVM_EXIT_PUSHF         0x070 +#define SVM_EXIT_POPF          0x071 +#define SVM_EXIT_CPUID         0x072 +#define SVM_EXIT_RSM           0x073 +#define SVM_EXIT_IRET          0x074 +#define SVM_EXIT_SWINT         0x075 +#define SVM_EXIT_INVD          0x076 +#define SVM_EXIT_PAUSE         0x077 +#define SVM_EXIT_HLT           0x078 +#define SVM_EXIT_INVLPG        0x079 +#define SVM_EXIT_INVLPGA       0x07a +#define SVM_EXIT_IOIO          0x07b +#define SVM_EXIT_MSR           0x07c +#define SVM_EXIT_TASK_SWITCH   0x07d +#define SVM_EXIT_FERR_FREEZE   0x07e +#define SVM_EXIT_SHUTDOWN      0x07f +#define SVM_EXIT_VMRUN         0x080 +#define SVM_EXIT_VMMCALL       0x081 +#define SVM_EXIT_VMLOAD        0x082 +#define SVM_EXIT_VMSAVE        0x083 +#define SVM_EXIT_STGI          0x084 +#define SVM_EXIT_CLGI          0x085 +#define SVM_EXIT_SKINIT        0x086 +#define SVM_EXIT_RDTSCP        0x087 +#define SVM_EXIT_ICEBP         0x088 +#define SVM_EXIT_WBINVD        0x089 +#define SVM_EXIT_MONITOR       0x08a +#define SVM_EXIT_MWAIT         0x08b +#define SVM_EXIT_MWAIT_COND    0x08c +#define SVM_EXIT_XSETBV        0x08d +#define SVM_EXIT_NPF           0x400 + +#define SVM_EXIT_ERR           -1 + +#define SVM_EXIT_REASONS \ +	{ SVM_EXIT_READ_CR0,    "read_cr0" }, \ +	{ SVM_EXIT_READ_CR3,    "read_cr3" }, \ +	{ SVM_EXIT_READ_CR4,    "read_cr4" }, \ +	{ SVM_EXIT_READ_CR8,    "read_cr8" }, \ +	{ SVM_EXIT_WRITE_CR0,   "write_cr0" }, \ +	{ SVM_EXIT_WRITE_CR3,   "write_cr3" }, \ +	{ SVM_EXIT_WRITE_CR4,   "write_cr4" }, \ +	{ SVM_EXIT_WRITE_CR8,   "write_cr8" }, \ +	{ SVM_EXIT_READ_DR0,    "read_dr0" }, \ +	{ SVM_EXIT_READ_DR1,    "read_dr1" }, \ +	{ SVM_EXIT_READ_DR2,    "read_dr2" }, \ +	{ SVM_EXIT_READ_DR3,    "read_dr3" }, \ +	{ SVM_EXIT_WRITE_DR0,   "write_dr0" }, \ +	{ SVM_EXIT_WRITE_DR1,   "write_dr1" }, \ +	{ SVM_EXIT_WRITE_DR2,   "write_dr2" }, \ +	{ SVM_EXIT_WRITE_DR3,   "write_dr3" }, \ +	{ SVM_EXIT_WRITE_DR5,   "write_dr5" }, \ +	{ SVM_EXIT_WRITE_DR7,   "write_dr7" }, \ +	{ SVM_EXIT_EXCP_BASE + DB_VECTOR,       "DB excp" }, \ +	{ SVM_EXIT_EXCP_BASE + BP_VECTOR,       "BP excp" }, \ +	{ SVM_EXIT_EXCP_BASE + UD_VECTOR,       "UD excp" }, \ +	{ SVM_EXIT_EXCP_BASE + PF_VECTOR,       "PF excp" }, \ +	{ SVM_EXIT_EXCP_BASE + NM_VECTOR,       "NM excp" }, \ +	{ SVM_EXIT_EXCP_BASE + MC_VECTOR,       "MC excp" }, \ +	{ SVM_EXIT_INTR,        "interrupt" }, \ +	{ SVM_EXIT_NMI,         "nmi" }, \ +	{ SVM_EXIT_SMI,         "smi" }, \ +	{ SVM_EXIT_INIT,        "init" }, \ +	{ SVM_EXIT_VINTR,       "vintr" }, \ +	{ SVM_EXIT_CPUID,       "cpuid" }, \ +	{ SVM_EXIT_INVD,        "invd" }, \ +	{ SVM_EXIT_HLT,         "hlt" }, \ +	{ SVM_EXIT_INVLPG,      "invlpg" }, \ +	{ SVM_EXIT_INVLPGA,     "invlpga" }, \ +	{ SVM_EXIT_IOIO,        "io" }, \ +	{ SVM_EXIT_MSR,         "msr" }, \ +	{ SVM_EXIT_TASK_SWITCH, "task_switch" }, \ +	{ SVM_EXIT_SHUTDOWN,    "shutdown" }, \ +	{ SVM_EXIT_VMRUN,       "vmrun" }, \ +	{ SVM_EXIT_VMMCALL,     "hypercall" }, \ +	{ SVM_EXIT_VMLOAD,      "vmload" }, \ +	{ SVM_EXIT_VMSAVE,      "vmsave" }, \ +	{ SVM_EXIT_STGI,        "stgi" }, \ +	{ SVM_EXIT_CLGI,        "clgi" }, \ +	{ SVM_EXIT_SKINIT,      "skinit" }, \ +	{ SVM_EXIT_WBINVD,      "wbinvd" }, \ +	{ SVM_EXIT_MONITOR,     "monitor" }, \ +	{ SVM_EXIT_MWAIT,       "mwait" }, \ +	{ SVM_EXIT_XSETBV,      "xsetbv" }, \ +	{ SVM_EXIT_NPF,         "npf" } + +#ifdef __KERNEL__ +  enum {  	INTERCEPT_INTR,  	INTERCEPT_NMI, 
@@ -264,81 +393,6 @@ struct __attribute__ ((__packed__)) vmcb {  #define SVM_EXITINFO_REG_MASK 0x0F -#define	SVM_EXIT_READ_CR0 	0x000 -#define	SVM_EXIT_READ_CR3 	0x003 -#define	SVM_EXIT_READ_CR4 	0x004 -#define	SVM_EXIT_READ_CR8 	0x008 -#define	SVM_EXIT_WRITE_CR0 	0x010 -#define	SVM_EXIT_WRITE_CR3 	0x013 -#define	SVM_EXIT_WRITE_CR4 	0x014 -#define	SVM_EXIT_WRITE_CR8 	0x018 -#define	SVM_EXIT_READ_DR0 	0x020 -#define	SVM_EXIT_READ_DR1 	0x021 -#define	SVM_EXIT_READ_DR2 	0x022 -#define	SVM_EXIT_READ_DR3 	0x023 -#define	SVM_EXIT_READ_DR4 	0x024 -#define	SVM_EXIT_READ_DR5 	0x025 -#define	SVM_EXIT_READ_DR6 	0x026 -#define	SVM_EXIT_READ_DR7 	0x027 -#define	SVM_EXIT_WRITE_DR0 	0x030 -#define	SVM_EXIT_WRITE_DR1 	0x031 -#define	SVM_EXIT_WRITE_DR2 	0x032 -#define	SVM_EXIT_WRITE_DR3 	0x033 -#define	SVM_EXIT_WRITE_DR4 	0x034 -#define	SVM_EXIT_WRITE_DR5 	0x035 -#define	SVM_EXIT_WRITE_DR6 	0x036 -#define	SVM_EXIT_WRITE_DR7 	0x037 -#define SVM_EXIT_EXCP_BASE      0x040 -#define SVM_EXIT_INTR		0x060 -#define SVM_EXIT_NMI		0x061 -#define SVM_EXIT_SMI		0x062 -#define SVM_EXIT_INIT		0x063 -#define SVM_EXIT_VINTR		0x064 -#define SVM_EXIT_CR0_SEL_WRITE	0x065 -#define SVM_EXIT_IDTR_READ	0x066 -#define SVM_EXIT_GDTR_READ	0x067 -#define SVM_EXIT_LDTR_READ	0x068 -#define SVM_EXIT_TR_READ	0x069 -#define SVM_EXIT_IDTR_WRITE	0x06a -#define SVM_EXIT_GDTR_WRITE	0x06b -#define SVM_EXIT_LDTR_WRITE	0x06c -#define SVM_EXIT_TR_WRITE	0x06d -#define SVM_EXIT_RDTSC		0x06e -#define SVM_EXIT_RDPMC		0x06f -#define SVM_EXIT_PUSHF		0x070 -#define SVM_EXIT_POPF		0x071 -#define SVM_EXIT_CPUID		0x072 -#define SVM_EXIT_RSM		0x073 -#define SVM_EXIT_IRET		0x074 -#define SVM_EXIT_SWINT		0x075 -#define SVM_EXIT_INVD		0x076 -#define SVM_EXIT_PAUSE		0x077 -#define SVM_EXIT_HLT		0x078 -#define SVM_EXIT_INVLPG		0x079 -#define SVM_EXIT_INVLPGA	0x07a -#define SVM_EXIT_IOIO		0x07b -#define SVM_EXIT_MSR		0x07c -#define SVM_EXIT_TASK_SWITCH	0x07d -#define SVM_EXIT_FERR_FREEZE	0x07e -#define SVM_EXIT_SHUTDOWN	0x07f -#define SVM_EXIT_VMRUN		0x080 -#define SVM_EXIT_VMMCALL	0x081 -#define SVM_EXIT_VMLOAD		0x082 -#define SVM_EXIT_VMSAVE		0x083 -#define SVM_EXIT_STGI		0x084 -#define SVM_EXIT_CLGI		0x085 -#define SVM_EXIT_SKINIT		0x086 -#define SVM_EXIT_RDTSCP		0x087 -#define SVM_EXIT_ICEBP		0x088 -#define SVM_EXIT_WBINVD		0x089 -#define SVM_EXIT_MONITOR	0x08a -#define SVM_EXIT_MWAIT		0x08b -#define SVM_EXIT_MWAIT_COND	0x08c -#define SVM_EXIT_XSETBV		0x08d -#define SVM_EXIT_NPF  		0x400 - -#define SVM_EXIT_ERR		-1 -  #define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)  #define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" @@ -350,3 +404,4 @@ struct __attribute__ ((__packed__)) vmcb {  #endif +#endif diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h index 3fda9db4881..4ca1c611b55 100644 --- a/arch/x86/include/asm/sys_ia32.h +++ b/arch/x86/include/asm/sys_ia32.h @@ -40,7 +40,7 @@ asmlinkage long sys32_sigaction(int, struct old_sigaction32 __user *,  				struct old_sigaction32 __user *);  asmlinkage long sys32_alarm(unsigned int); -asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int); +asmlinkage long sys32_waitpid(compat_pid_t, unsigned int __user *, int);  asmlinkage long sys32_sysfs(int, u32, u32);  asmlinkage long sys32_sched_rr_get_interval(compat_pid_t, diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 89f794f007e..c535d847e3b 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -89,6 +89,7 @@ struct thread_info {  #define TIF_NOTSC		16	/* 
TSC is not accessible in userland */  #define TIF_IA32		17	/* IA32 compatibility process */  #define TIF_FORK		18	/* ret_from_fork */ +#define TIF_NOHZ		19	/* in adaptive nohz mode */  #define TIF_MEMDIE		20	/* is terminating due to OOM killer */  #define TIF_DEBUG		21	/* uses debug registers */  #define TIF_IO_BITMAP		22	/* uses I/O bitmap */ @@ -114,6 +115,7 @@ struct thread_info {  #define _TIF_NOTSC		(1 << TIF_NOTSC)  #define _TIF_IA32		(1 << TIF_IA32)  #define _TIF_FORK		(1 << TIF_FORK) +#define _TIF_NOHZ		(1 << TIF_NOHZ)  #define _TIF_DEBUG		(1 << TIF_DEBUG)  #define _TIF_IO_BITMAP		(1 << TIF_IO_BITMAP)  #define _TIF_FORCED_TF		(1 << TIF_FORCED_TF) @@ -126,12 +128,13 @@ struct thread_info {  /* work to do in syscall_trace_enter() */  #define _TIF_WORK_SYSCALL_ENTRY	\  	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_EMU | _TIF_SYSCALL_AUDIT |	\ -	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) +	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT |	\ +	 _TIF_NOHZ)  /* work to do in syscall_trace_leave() */  #define _TIF_WORK_SYSCALL_EXIT	\  	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP |	\ -	 _TIF_SYSCALL_TRACEPOINT) +	 _TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ)  /* work to do on interrupt/exception return */  #define _TIF_WORK_MASK							\ @@ -141,7 +144,8 @@ struct thread_info {  /* work to do on any return to user space */  #define _TIF_ALLWORK_MASK						\ -	((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT) +	((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT |	\ +	_TIF_NOHZ)  /* Only used for 64 bit */  #define _TIF_DO_NOTIFY_MASK						\ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index e1f3a17034f..a91acfbb1a9 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -9,6 +9,7 @@  #include <linux/string.h>  #include <asm/asm.h>  #include <asm/page.h> +#include <asm/smap.h>  #define VERIFY_READ 0  #define VERIFY_WRITE 1 @@ -192,9 +193,10 @@ extern int __get_user_bad(void);  #ifdef CONFIG_X86_32  #define __put_user_asm_u64(x, addr, err, errret)			\ -	asm volatile("1:	movl %%eax,0(%2)\n"			\ +	asm volatile(ASM_STAC "\n"					\ +		     "1:	movl %%eax,0(%2)\n"			\  		     "2:	movl %%edx,4(%2)\n"			\ -		     "3:\n"						\ +		     "3: " ASM_CLAC "\n"				\  		     ".section .fixup,\"ax\"\n"				\  		     "4:	movl %3,%0\n"				\  		     "	jmp 3b\n"					\ @@ -205,9 +207,10 @@ extern int __get_user_bad(void);  		     : "A" (x), "r" (addr), "i" (errret), "0" (err))  #define __put_user_asm_ex_u64(x, addr)					\ -	asm volatile("1:	movl %%eax,0(%1)\n"			\ +	asm volatile(ASM_STAC "\n"					\ +		     "1:	movl %%eax,0(%1)\n"			\  		     "2:	movl %%edx,4(%1)\n"			\ -		     "3:\n"						\ +		     "3: " ASM_CLAC "\n"				\  		     _ASM_EXTABLE_EX(1b, 2b)				\  		     _ASM_EXTABLE_EX(2b, 3b)				\  		     : : "A" (x), "r" (addr)) @@ -379,8 +382,9 @@ do {									\  } while (0)  #define __get_user_asm(x, addr, err, itype, rtype, ltype, errret)	\ -	asm volatile("1:	mov"itype" %2,%"rtype"1\n"		\ -		     "2:\n"						\ +	asm volatile(ASM_STAC "\n"					\ +		     "1:	mov"itype" %2,%"rtype"1\n"		\ +		     "2: " ASM_CLAC "\n"				\  		     ".section .fixup,\"ax\"\n"				\  		     "3:	mov %3,%0\n"				\  		     "	xor"itype" %"rtype"1,%"rtype"1\n"		\ @@ -443,8 +447,9 @@ struct __large_struct { unsigned long buf[100]; };   * aliasing issues.   
*/  #define __put_user_asm(x, addr, err, itype, rtype, ltype, errret)	\ -	asm volatile("1:	mov"itype" %"rtype"1,%2\n"		\ -		     "2:\n"						\ +	asm volatile(ASM_STAC "\n"					\ +		     "1:	mov"itype" %"rtype"1,%2\n"		\ +		     "2: " ASM_CLAC "\n"				\  		     ".section .fixup,\"ax\"\n"				\  		     "3:	mov %3,%0\n"				\  		     "	jmp 2b\n"					\ @@ -463,13 +468,13 @@ struct __large_struct { unsigned long buf[100]; };   * uaccess_try and catch   */  #define uaccess_try	do {						\ -	int prev_err = current_thread_info()->uaccess_err;		\  	current_thread_info()->uaccess_err = 0;				\ +	stac();								\  	barrier();  #define uaccess_catch(err)						\ +	clac();								\  	(err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0);	\ -	current_thread_info()->uaccess_err = prev_err;			\  } while (0)  /** @@ -569,6 +574,9 @@ strncpy_from_user(char *dst, const char __user *src, long count);  extern __must_check long strlen_user(const char __user *str);  extern __must_check long strnlen_user(const char __user *str, long n); +unsigned long __must_check clear_user(void __user *mem, unsigned long len); +unsigned long __must_check __clear_user(void __user *mem, unsigned long len); +  /*   * movsl can be slow when source and dest are not both 8-byte aligned   */ diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 576e39bca6a..7f760a9f1f6 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -213,7 +213,4 @@ static inline unsigned long __must_check copy_from_user(void *to,  	return n;  } -unsigned long __must_check clear_user(void __user *mem, unsigned long len); -unsigned long __must_check __clear_user(void __user *mem, unsigned long len); -  #endif /* _ASM_X86_UACCESS_32_H */ diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index d8def8b3dba..142810c457d 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -217,9 +217,6 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)  	}  } -__must_check unsigned long clear_user(void __user *mem, unsigned long len); -__must_check unsigned long __clear_user(void __user *mem, unsigned long len); -  static __must_check __always_inline int  __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)  { diff --git a/arch/x86/include/asm/uprobes.h b/arch/x86/include/asm/uprobes.h index f3971bbcd1d..8ff8be7835a 100644 --- a/arch/x86/include/asm/uprobes.h +++ b/arch/x86/include/asm/uprobes.h @@ -42,10 +42,11 @@ struct arch_uprobe {  };  struct arch_uprobe_task { -	unsigned long			saved_trap_nr;  #ifdef CONFIG_X86_64  	unsigned long			saved_scratch_register;  #endif +	unsigned int			saved_trap_nr; +	unsigned int			saved_tf;  };  extern int  arch_uprobe_analyze_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long addr); diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index bb0522850b7..fddb53d6391 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -11,7 +11,8 @@ extern const char VDSO32_PRELINK[];  #define VDSO32_SYMBOL(base, name)					\  ({									\  	extern const char VDSO32_##name[];				\ -	(void *)(VDSO32_##name - VDSO32_PRELINK + (unsigned long)(base)); \ +	(void __user *)(VDSO32_##name - VDSO32_PRELINK +		\ +			(unsigned long)(base));				\  })  #endif diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 74fcb963595..36ec21c36d6 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h 
@@ -25,6 +25,88 @@   *   */ +#define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000 + +#define EXIT_REASON_EXCEPTION_NMI       0 +#define EXIT_REASON_EXTERNAL_INTERRUPT  1 +#define EXIT_REASON_TRIPLE_FAULT        2 + +#define EXIT_REASON_PENDING_INTERRUPT   7 +#define EXIT_REASON_NMI_WINDOW          8 +#define EXIT_REASON_TASK_SWITCH         9 +#define EXIT_REASON_CPUID               10 +#define EXIT_REASON_HLT                 12 +#define EXIT_REASON_INVD                13 +#define EXIT_REASON_INVLPG              14 +#define EXIT_REASON_RDPMC               15 +#define EXIT_REASON_RDTSC               16 +#define EXIT_REASON_VMCALL              18 +#define EXIT_REASON_VMCLEAR             19 +#define EXIT_REASON_VMLAUNCH            20 +#define EXIT_REASON_VMPTRLD             21 +#define EXIT_REASON_VMPTRST             22 +#define EXIT_REASON_VMREAD              23 +#define EXIT_REASON_VMRESUME            24 +#define EXIT_REASON_VMWRITE             25 +#define EXIT_REASON_VMOFF               26 +#define EXIT_REASON_VMON                27 +#define EXIT_REASON_CR_ACCESS           28 +#define EXIT_REASON_DR_ACCESS           29 +#define EXIT_REASON_IO_INSTRUCTION      30 +#define EXIT_REASON_MSR_READ            31 +#define EXIT_REASON_MSR_WRITE           32 +#define EXIT_REASON_INVALID_STATE       33 +#define EXIT_REASON_MWAIT_INSTRUCTION   36 +#define EXIT_REASON_MONITOR_INSTRUCTION 39 +#define EXIT_REASON_PAUSE_INSTRUCTION   40 +#define EXIT_REASON_MCE_DURING_VMENTRY  41 +#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 +#define EXIT_REASON_APIC_ACCESS         44 +#define EXIT_REASON_EPT_VIOLATION       48 +#define EXIT_REASON_EPT_MISCONFIG       49 +#define EXIT_REASON_WBINVD              54 +#define EXIT_REASON_XSETBV              55 +#define EXIT_REASON_INVPCID             58 + +#define VMX_EXIT_REASONS \ +	{ EXIT_REASON_EXCEPTION_NMI,         "EXCEPTION_NMI" }, \ +	{ EXIT_REASON_EXTERNAL_INTERRUPT,    "EXTERNAL_INTERRUPT" }, \ +	{ EXIT_REASON_TRIPLE_FAULT,          "TRIPLE_FAULT" }, \ +	{ EXIT_REASON_PENDING_INTERRUPT,     "PENDING_INTERRUPT" }, \ +	{ EXIT_REASON_NMI_WINDOW,            "NMI_WINDOW" }, \ +	{ EXIT_REASON_TASK_SWITCH,           "TASK_SWITCH" }, \ +	{ EXIT_REASON_CPUID,                 "CPUID" }, \ +	{ EXIT_REASON_HLT,                   "HLT" }, \ +	{ EXIT_REASON_INVLPG,                "INVLPG" }, \ +	{ EXIT_REASON_RDPMC,                 "RDPMC" }, \ +	{ EXIT_REASON_RDTSC,                 "RDTSC" }, \ +	{ EXIT_REASON_VMCALL,                "VMCALL" }, \ +	{ EXIT_REASON_VMCLEAR,               "VMCLEAR" }, \ +	{ EXIT_REASON_VMLAUNCH,              "VMLAUNCH" }, \ +	{ EXIT_REASON_VMPTRLD,               "VMPTRLD" }, \ +	{ EXIT_REASON_VMPTRST,               "VMPTRST" }, \ +	{ EXIT_REASON_VMREAD,                "VMREAD" }, \ +	{ EXIT_REASON_VMRESUME,              "VMRESUME" }, \ +	{ EXIT_REASON_VMWRITE,               "VMWRITE" }, \ +	{ EXIT_REASON_VMOFF,                 "VMOFF" }, \ +	{ EXIT_REASON_VMON,                  "VMON" }, \ +	{ EXIT_REASON_CR_ACCESS,             "CR_ACCESS" }, \ +	{ EXIT_REASON_DR_ACCESS,             "DR_ACCESS" }, \ +	{ EXIT_REASON_IO_INSTRUCTION,        "IO_INSTRUCTION" }, \ +	{ EXIT_REASON_MSR_READ,              "MSR_READ" }, \ +	{ EXIT_REASON_MSR_WRITE,             "MSR_WRITE" }, \ +	{ EXIT_REASON_MWAIT_INSTRUCTION,     "MWAIT_INSTRUCTION" }, \ +	{ EXIT_REASON_MONITOR_INSTRUCTION,   "MONITOR_INSTRUCTION" }, \ +	{ EXIT_REASON_PAUSE_INSTRUCTION,     "PAUSE_INSTRUCTION" }, \ +	{ EXIT_REASON_MCE_DURING_VMENTRY,    "MCE_DURING_VMENTRY" }, \ +	{ 
EXIT_REASON_TPR_BELOW_THRESHOLD,   "TPR_BELOW_THRESHOLD" }, \ +	{ EXIT_REASON_APIC_ACCESS,           "APIC_ACCESS" }, \ +	{ EXIT_REASON_EPT_VIOLATION,         "EPT_VIOLATION" }, \ +	{ EXIT_REASON_EPT_MISCONFIG,         "EPT_MISCONFIG" }, \ +	{ EXIT_REASON_WBINVD,                "WBINVD" } + +#ifdef __KERNEL__ +  #include <linux/types.h>  /* @@ -241,49 +323,6 @@ enum vmcs_field {  	HOST_RIP                        = 0x00006c16,  }; -#define VMX_EXIT_REASONS_FAILED_VMENTRY         0x80000000 - -#define EXIT_REASON_EXCEPTION_NMI       0 -#define EXIT_REASON_EXTERNAL_INTERRUPT  1 -#define EXIT_REASON_TRIPLE_FAULT        2 - -#define EXIT_REASON_PENDING_INTERRUPT   7 -#define EXIT_REASON_NMI_WINDOW		8 -#define EXIT_REASON_TASK_SWITCH         9 -#define EXIT_REASON_CPUID               10 -#define EXIT_REASON_HLT                 12 -#define EXIT_REASON_INVD                13 -#define EXIT_REASON_INVLPG              14 -#define EXIT_REASON_RDPMC               15 -#define EXIT_REASON_RDTSC               16 -#define EXIT_REASON_VMCALL              18 -#define EXIT_REASON_VMCLEAR             19 -#define EXIT_REASON_VMLAUNCH            20 -#define EXIT_REASON_VMPTRLD             21 -#define EXIT_REASON_VMPTRST             22 -#define EXIT_REASON_VMREAD              23 -#define EXIT_REASON_VMRESUME            24 -#define EXIT_REASON_VMWRITE             25 -#define EXIT_REASON_VMOFF               26 -#define EXIT_REASON_VMON                27 -#define EXIT_REASON_CR_ACCESS           28 -#define EXIT_REASON_DR_ACCESS           29 -#define EXIT_REASON_IO_INSTRUCTION      30 -#define EXIT_REASON_MSR_READ            31 -#define EXIT_REASON_MSR_WRITE           32 -#define EXIT_REASON_INVALID_STATE	33 -#define EXIT_REASON_MWAIT_INSTRUCTION   36 -#define EXIT_REASON_MONITOR_INSTRUCTION 39 -#define EXIT_REASON_PAUSE_INSTRUCTION   40 -#define EXIT_REASON_MCE_DURING_VMENTRY	 41 -#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 -#define EXIT_REASON_APIC_ACCESS         44 -#define EXIT_REASON_EPT_VIOLATION       48 -#define EXIT_REASON_EPT_MISCONFIG       49 -#define EXIT_REASON_WBINVD		54 -#define EXIT_REASON_XSETBV		55 -#define EXIT_REASON_INVPCID		58 -  /*   * Interruption-information format   */ @@ -488,3 +527,5 @@ enum vm_instruction_error_number {  };  #endif + +#endif diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 38155f66714..57693498519 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -81,12 +81,13 @@ struct x86_init_mapping {  /**   * struct x86_init_paging - platform specific paging functions - * @pagetable_setup_start:	platform specific pre paging_init() call - * @pagetable_setup_done:	platform specific post paging_init() call + * @pagetable_init:	platform specific paging initialization call to setup + *			the kernel pagetables and prepare accessors functions. + *			Callback must call paging_init(). Called once after the + *			direct mapping for phys memory is available.   
*/  struct x86_init_paging { -	void (*pagetable_setup_start)(pgd_t *base); -	void (*pagetable_setup_done)(pgd_t *base); +	void (*pagetable_init)(void);  };  /** diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 93971e841dd..472b9b78301 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -51,7 +51,8 @@ extern unsigned long set_phys_range_identity(unsigned long pfn_s,  extern int m2p_add_override(unsigned long mfn, struct page *page,  			    struct gnttab_map_grant_ref *kmap_op); -extern int m2p_remove_override(struct page *page, bool clear_pte); +extern int m2p_remove_override(struct page *page, +				struct gnttab_map_grant_ref *kmap_op);  extern struct page *m2p_find_override(unsigned long mfn);  extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); diff --git a/arch/x86/include/asm/xor_32.h b/arch/x86/include/asm/xor_32.h index 454570891bd..aabd5850bdb 100644 --- a/arch/x86/include/asm/xor_32.h +++ b/arch/x86/include/asm/xor_32.h @@ -534,38 +534,6 @@ static struct xor_block_template xor_block_p5_mmx = {   * Copyright (C) 1999 Zach Brown (with obvious credit due Ingo)   */ -#define XMMS_SAVE				\ -do {						\ -	preempt_disable();			\ -	cr0 = read_cr0();			\ -	clts();					\ -	asm volatile(				\ -		"movups %%xmm0,(%0)	;\n\t"	\ -		"movups %%xmm1,0x10(%0)	;\n\t"	\ -		"movups %%xmm2,0x20(%0)	;\n\t"	\ -		"movups %%xmm3,0x30(%0)	;\n\t"	\ -		:				\ -		: "r" (xmm_save) 		\ -		: "memory");			\ -} while (0) - -#define XMMS_RESTORE				\ -do {						\ -	asm volatile(				\ -		"sfence			;\n\t"	\ -		"movups (%0),%%xmm0	;\n\t"	\ -		"movups 0x10(%0),%%xmm1	;\n\t"	\ -		"movups 0x20(%0),%%xmm2	;\n\t"	\ -		"movups 0x30(%0),%%xmm3	;\n\t"	\ -		:				\ -		: "r" (xmm_save)		\ -		: "memory");			\ -	write_cr0(cr0);				\ -	preempt_enable();			\ -} while (0) - -#define ALIGN16 __attribute__((aligned(16))) -  #define OFFS(x)		"16*("#x")"  #define PF_OFFS(x)	"256+16*("#x")"  #define	PF0(x)		"	prefetchnta "PF_OFFS(x)"(%1)		;\n" @@ -587,10 +555,8 @@ static void  xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)  {  	unsigned long lines = bytes >> 8; -	char xmm_save[16*4] ALIGN16; -	int cr0; -	XMMS_SAVE; +	kernel_fpu_begin();  	asm volatile(  #undef BLOCK @@ -633,7 +599,7 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)  	:  	: "memory"); -	XMMS_RESTORE; +	kernel_fpu_end();  }  static void @@ -641,10 +607,8 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,  	  unsigned long *p3)  {  	unsigned long lines = bytes >> 8; -	char xmm_save[16*4] ALIGN16; -	int cr0; -	XMMS_SAVE; +	kernel_fpu_begin();  	asm volatile(  #undef BLOCK @@ -694,7 +658,7 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,  	:  	: "memory" ); -	XMMS_RESTORE; +	kernel_fpu_end();  }  static void @@ -702,10 +666,8 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,  	  unsigned long *p3, unsigned long *p4)  {  	unsigned long lines = bytes >> 8; -	char xmm_save[16*4] ALIGN16; -	int cr0; -	XMMS_SAVE; +	kernel_fpu_begin();  	asm volatile(  #undef BLOCK @@ -762,7 +724,7 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,  	:  	: "memory" ); -	XMMS_RESTORE; +	kernel_fpu_end();  }  static void @@ -770,10 +732,8 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,  	  unsigned long *p3, unsigned long *p4, unsigned long *p5)  {  	unsigned long lines = bytes >> 8; -	char xmm_save[16*4] ALIGN16; -	int cr0; -	XMMS_SAVE; +	
kernel_fpu_begin();  	/* Make sure GCC forgets anything it knows about p4 or p5,  	   such that it won't pass to the asm volatile below a @@ -850,7 +810,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,  	   like assuming they have some legal value.  */  	asm("" : "=r" (p4), "=r" (p5)); -	XMMS_RESTORE; +	kernel_fpu_end();  }  static struct xor_block_template xor_block_pIII_sse = { diff --git a/arch/x86/include/asm/xor_64.h b/arch/x86/include/asm/xor_64.h index b9b2323e90f..5fc06d0b7eb 100644 --- a/arch/x86/include/asm/xor_64.h +++ b/arch/x86/include/asm/xor_64.h @@ -34,41 +34,7 @@   * no advantages to be gotten from x86-64 here anyways.   */ -typedef struct { -	unsigned long a, b; -} __attribute__((aligned(16))) xmm_store_t; - -/* Doesn't use gcc to save the XMM registers, because there is no easy way to -   tell it to do a clts before the register saving. */ -#define XMMS_SAVE				\ -do {						\ -	preempt_disable();			\ -	asm volatile(				\ -		"movq %%cr0,%0		;\n\t"	\ -		"clts			;\n\t"	\ -		"movups %%xmm0,(%1)	;\n\t"	\ -		"movups %%xmm1,0x10(%1)	;\n\t"	\ -		"movups %%xmm2,0x20(%1)	;\n\t"	\ -		"movups %%xmm3,0x30(%1)	;\n\t"	\ -		: "=&r" (cr0)			\ -		: "r" (xmm_save) 		\ -		: "memory");			\ -} while (0) - -#define XMMS_RESTORE				\ -do {						\ -	asm volatile(				\ -		"sfence			;\n\t"	\ -		"movups (%1),%%xmm0	;\n\t"	\ -		"movups 0x10(%1),%%xmm1	;\n\t"	\ -		"movups 0x20(%1),%%xmm2	;\n\t"	\ -		"movups 0x30(%1),%%xmm3	;\n\t"	\ -		"movq 	%0,%%cr0	;\n\t"	\ -		:				\ -		: "r" (cr0), "r" (xmm_save)	\ -		: "memory");			\ -	preempt_enable();			\ -} while (0) +#include <asm/i387.h>  #define OFFS(x)		"16*("#x")"  #define PF_OFFS(x)	"256+16*("#x")" @@ -91,10 +57,8 @@ static void  xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)  {  	unsigned int lines = bytes >> 8; -	unsigned long cr0; -	xmm_store_t xmm_save[4]; -	XMMS_SAVE; +	kernel_fpu_begin();  	asm volatile(  #undef BLOCK @@ -135,7 +99,7 @@ xor_sse_2(unsigned long bytes, unsigned long *p1, unsigned long *p2)  	: [inc] "r" (256UL)  	: "memory"); -	XMMS_RESTORE; +	kernel_fpu_end();  }  static void @@ -143,11 +107,8 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,  	  unsigned long *p3)  {  	unsigned int lines = bytes >> 8; -	xmm_store_t xmm_save[4]; -	unsigned long cr0; - -	XMMS_SAVE; +	kernel_fpu_begin();  	asm volatile(  #undef BLOCK  #define BLOCK(i) \ @@ -194,7 +155,7 @@ xor_sse_3(unsigned long bytes, unsigned long *p1, unsigned long *p2,  	  [p1] "+r" (p1), [p2] "+r" (p2), [p3] "+r" (p3)  	: [inc] "r" (256UL)  	: "memory"); -	XMMS_RESTORE; +	kernel_fpu_end();  }  static void @@ -202,10 +163,8 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,  	  unsigned long *p3, unsigned long *p4)  {  	unsigned int lines = bytes >> 8; -	xmm_store_t xmm_save[4]; -	unsigned long cr0; -	XMMS_SAVE; +	kernel_fpu_begin();  	asm volatile(  #undef BLOCK @@ -261,7 +220,7 @@ xor_sse_4(unsigned long bytes, unsigned long *p1, unsigned long *p2,  	: [inc] "r" (256UL)  	: "memory" ); -	XMMS_RESTORE; +	kernel_fpu_end();  }  static void @@ -269,10 +228,8 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,  	  unsigned long *p3, unsigned long *p4, unsigned long *p5)  {  	unsigned int lines = bytes >> 8; -	xmm_store_t xmm_save[4]; -	unsigned long cr0; -	XMMS_SAVE; +	kernel_fpu_begin();  	asm volatile(  #undef BLOCK @@ -336,7 +293,7 @@ xor_sse_5(unsigned long bytes, unsigned long *p1, unsigned long *p2,  	: [inc] "r" (256UL)  	: "memory"); -	XMMS_RESTORE; +	
kernel_fpu_end();  }  static struct xor_block_template xor_block_sse = { diff --git a/arch/x86/include/asm/xor_avx.h b/arch/x86/include/asm/xor_avx.h index 2510d35f480..7ea79c5fa1f 100644 --- a/arch/x86/include/asm/xor_avx.h +++ b/arch/x86/include/asm/xor_avx.h @@ -20,32 +20,6 @@  #include <linux/compiler.h>  #include <asm/i387.h> -#define ALIGN32 __aligned(32) - -#define YMM_SAVED_REGS 4 - -#define YMMS_SAVE \ -do { \ -	preempt_disable(); \ -	cr0 = read_cr0(); \ -	clts(); \ -	asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \ -	asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \ -	asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \ -	asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \ -} while (0); - -#define YMMS_RESTORE \ -do { \ -	asm volatile("sfence" : : : "memory"); \ -	asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \ -	asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \ -	asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \ -	asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \ -	write_cr0(cr0); \ -	preempt_enable(); \ -} while (0); -  #define BLOCK4(i) \  		BLOCK(32 * i, 0) \  		BLOCK(32 * (i + 1), 1) \ @@ -60,10 +34,9 @@ do { \  static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)  { -	unsigned long cr0, lines = bytes >> 9; -	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; +	unsigned long lines = bytes >> 9; -	YMMS_SAVE +	kernel_fpu_begin();  	while (lines--) {  #undef BLOCK @@ -82,16 +55,15 @@ do { \  		p1 = (unsigned long *)((uintptr_t)p1 + 512);  	} -	YMMS_RESTORE +	kernel_fpu_end();  }  static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,  	unsigned long *p2)  { -	unsigned long cr0, lines = bytes >> 9; -	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; +	unsigned long lines = bytes >> 9; -	YMMS_SAVE +	kernel_fpu_begin();  	while (lines--) {  #undef BLOCK @@ -113,16 +85,15 @@ do { \  		p2 = (unsigned long *)((uintptr_t)p2 + 512);  	} -	YMMS_RESTORE +	kernel_fpu_end();  }  static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,  	unsigned long *p2, unsigned long *p3)  { -	unsigned long cr0, lines = bytes >> 9; -	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; +	unsigned long lines = bytes >> 9; -	YMMS_SAVE +	kernel_fpu_begin();  	while (lines--) {  #undef BLOCK @@ -147,16 +118,15 @@ do { \  		p3 = (unsigned long *)((uintptr_t)p3 + 512);  	} -	YMMS_RESTORE +	kernel_fpu_end();  }  static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,  	unsigned long *p2, unsigned long *p3, unsigned long *p4)  { -	unsigned long cr0, lines = bytes >> 9; -	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32; +	unsigned long lines = bytes >> 9; -	YMMS_SAVE +	kernel_fpu_begin();  	while (lines--) {  #undef BLOCK @@ -184,7 +154,7 @@ do { \  		p4 = (unsigned long *)((uintptr_t)p4 + 512);  	} -	YMMS_RESTORE +	kernel_fpu_end();  }  static struct xor_block_template xor_block_avx = { diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h index 8a1b6f9b594..0415cdabb5a 100644 --- a/arch/x86/include/asm/xsave.h +++ b/arch/x86/include/asm/xsave.h @@ -34,17 +34,14 @@  extern unsigned int xstate_size;  extern u64 pcntxt_mask;  extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; +extern struct xsave_struct *init_xstate_buf;  extern void xsave_init(void);  extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);  extern int init_fpu(struct task_struct *child); -extern int 
check_for_xstate(struct i387_fxsave_struct __user *buf, -			    void __user *fpstate, -			    struct _fpx_sw_bytes *sw); -static inline int fpu_xrstor_checking(struct fpu *fpu) +static inline int fpu_xrstor_checking(struct xsave_struct *fx)  { -	struct xsave_struct *fx = &fpu->state->xsave;  	int err;  	asm volatile("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n\t" @@ -69,13 +66,13 @@ static inline int xsave_user(struct xsave_struct __user *buf)  	 * Clear the xsave header first, so that reserved fields are  	 * initialized to zero.  	 */ -	err = __clear_user(&buf->xsave_hdr, -			   sizeof(struct xsave_hdr_struct)); +	err = __clear_user(&buf->xsave_hdr, sizeof(buf->xsave_hdr));  	if (unlikely(err))  		return -EFAULT; -	__asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x27\n" -			     "2:\n" +	__asm__ __volatile__(ASM_STAC "\n" +			     "1: .byte " REX_PREFIX "0x0f,0xae,0x27\n" +			     "2: " ASM_CLAC "\n"  			     ".section .fixup,\"ax\"\n"  			     "3:  movl $-1,%[err]\n"  			     "    jmp  2b\n" @@ -84,9 +81,6 @@ static inline int xsave_user(struct xsave_struct __user *buf)  			     : [err] "=r" (err)  			     : "D" (buf), "a" (-1), "d" (-1), "0" (0)  			     : "memory"); -	if (unlikely(err) && __clear_user(buf, xstate_size)) -		err = -EFAULT; -	/* No need to clear here because the caller clears USED_MATH */  	return err;  } @@ -97,8 +91,9 @@ static inline int xrestore_user(struct xsave_struct __user *buf, u64 mask)  	u32 lmask = mask;  	u32 hmask = mask >> 32; -	__asm__ __volatile__("1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n" -			     "2:\n" +	__asm__ __volatile__(ASM_STAC "\n" +			     "1: .byte " REX_PREFIX "0x0f,0xae,0x2f\n" +			     "2: " ASM_CLAC "\n"  			     ".section .fixup,\"ax\"\n"  			     "3:  movl $-1,%[err]\n"  			     "    jmp  2b\n" diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 8215e5652d9..8d7a619718b 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -100,6 +100,8 @@ obj-$(CONFIG_SWIOTLB)			+= pci-swiotlb.o  obj-$(CONFIG_OF)			+= devicetree.o  obj-$(CONFIG_UPROBES)			+= uprobes.o +obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o +  ###  # 64 bit specific files  ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index b2297e58c6e..e651f7a589a 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -656,7 +656,7 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)  	acpi_register_lapic(physid, ACPI_MADT_ENABLED);  	/* -	 * If mp_register_lapic successfully generates a new logical cpu +	 * If acpi_register_lapic successfully generates a new logical cpu  	 * number, then the following will get us exactly what was mapped  	 */  	cpumask_andnot(new_map, cpu_present_mask, tmp_map); diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 1b8e5a03d94..11676cf65ae 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -43,17 +43,22 @@ int acpi_suspend_lowlevel(void)  	header->video_mode = saved_video_mode; +	header->pmode_behavior = 0; +  #ifndef CONFIG_64BIT  	store_gdt((struct desc_ptr *)&header->pmode_gdt); -	if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low, -		       &header->pmode_efer_high)) -		header->pmode_efer_low = header->pmode_efer_high = 0; +	if (!rdmsr_safe(MSR_EFER, +			&header->pmode_efer_low, +			&header->pmode_efer_high)) +		header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_EFER);  #endif /* !CONFIG_64BIT */  	header->pmode_cr0 = read_cr0(); -	header->pmode_cr4 = read_cr4_safe(); -	
header->pmode_behavior = 0; +	if (__this_cpu_read(cpu_info.cpuid_level) >= 0) { +		header->pmode_cr4 = read_cr4(); +		header->pmode_behavior |= (1 << WAKEUP_BEHAVIOR_RESTORE_CR4); +	}  	if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,  			&header->pmode_misc_en_low,  			&header->pmode_misc_en_high)) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index ced4534baed..ef5ccca79a6 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -23,19 +23,6 @@  #define MAX_PATCH_LEN (255-1) -#ifdef CONFIG_HOTPLUG_CPU -static int smp_alt_once; - -static int __init bootonly(char *str) -{ -	smp_alt_once = 1; -	return 1; -} -__setup("smp-alt-boot", bootonly); -#else -#define smp_alt_once 1 -#endif -  static int __initdata_or_module debug_alternative;  static int __init debug_alt(char *str) @@ -317,7 +304,7 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end,  		/* turn DS segment override prefix into lock prefix */  		if (*ptr == 0x3e)  			text_poke(ptr, ((unsigned char []){0xf0}), 1); -	}; +	}  	mutex_unlock(&text_mutex);  } @@ -326,9 +313,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,  {  	const s32 *poff; -	if (noreplace_smp) -		return; -  	mutex_lock(&text_mutex);  	for (poff = start; poff < end; poff++) {  		u8 *ptr = (u8 *)poff + *poff; @@ -338,7 +322,7 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,  		/* turn lock prefix into DS segment override prefix */  		if (*ptr == 0xf0)  			text_poke(ptr, ((unsigned char []){0x3E}), 1); -	}; +	}  	mutex_unlock(&text_mutex);  } @@ -359,7 +343,7 @@ struct smp_alt_module {  };  static LIST_HEAD(smp_alt_modules);  static DEFINE_MUTEX(smp_alt); -static int smp_mode = 1;	/* protected by smp_alt */ +static bool uniproc_patched = false;	/* protected by smp_alt */  void __init_or_module alternatives_smp_module_add(struct module *mod,  						  char *name, @@ -368,19 +352,18 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,  {  	struct smp_alt_module *smp; -	if (noreplace_smp) -		return; +	mutex_lock(&smp_alt); +	if (!uniproc_patched) +		goto unlock; -	if (smp_alt_once) { -		if (boot_cpu_has(X86_FEATURE_UP)) -			alternatives_smp_unlock(locks, locks_end, -						text, text_end); -		return; -	} +	if (num_possible_cpus() == 1) +		/* Don't bother remembering, we'll never have to undo it. */ +		goto smp_unlock;  	smp = kzalloc(sizeof(*smp), GFP_KERNEL);  	if (NULL == smp) -		return; /* we'll run the (safe but slow) SMP code then ... */ +		/* we'll run the (safe but slow) SMP code then ... 
*/ +		goto unlock;  	smp->mod	= mod;  	smp->name	= name; @@ -392,11 +375,10 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,  		__func__, smp->locks, smp->locks_end,  		smp->text, smp->text_end, smp->name); -	mutex_lock(&smp_alt);  	list_add_tail(&smp->next, &smp_alt_modules); -	if (boot_cpu_has(X86_FEATURE_UP)) -		alternatives_smp_unlock(smp->locks, smp->locks_end, -					smp->text, smp->text_end); +smp_unlock: +	alternatives_smp_unlock(locks, locks_end, text, text_end); +unlock:  	mutex_unlock(&smp_alt);  } @@ -404,24 +386,18 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)  {  	struct smp_alt_module *item; -	if (smp_alt_once || noreplace_smp) -		return; -  	mutex_lock(&smp_alt);  	list_for_each_entry(item, &smp_alt_modules, next) {  		if (mod != item->mod)  			continue;  		list_del(&item->next); -		mutex_unlock(&smp_alt); -		DPRINTK("%s: %s\n", __func__, item->name);  		kfree(item); -		return; +		break;  	}  	mutex_unlock(&smp_alt);  } -bool skip_smp_alternatives; -void alternatives_smp_switch(int smp) +void alternatives_enable_smp(void)  {  	struct smp_alt_module *mod; @@ -436,34 +412,21 @@ void alternatives_smp_switch(int smp)  	pr_info("lockdep: fixing up alternatives\n");  #endif -	if (noreplace_smp || smp_alt_once || skip_smp_alternatives) -		return; -	BUG_ON(!smp && (num_online_cpus() > 1)); +	/* Why bother if there are no other CPUs? */ +	BUG_ON(num_possible_cpus() == 1);  	mutex_lock(&smp_alt); -	/* -	 * Avoid unnecessary switches because it forces JIT based VMs to -	 * throw away all cached translations, which can be quite costly. -	 */ -	if (smp == smp_mode) { -		/* nothing */ -	} else if (smp) { +	if (uniproc_patched) {  		pr_info("switching to SMP code\n"); +		BUG_ON(num_online_cpus() != 1);  		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);  		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);  		list_for_each_entry(mod, &smp_alt_modules, next)  			alternatives_smp_lock(mod->locks, mod->locks_end,  					      mod->text, mod->text_end); -	} else { -		pr_info("switching to UP code\n"); -		set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); -		set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); -		list_for_each_entry(mod, &smp_alt_modules, next) -			alternatives_smp_unlock(mod->locks, mod->locks_end, -						mod->text, mod->text_end); +		uniproc_patched = false;  	} -	smp_mode = smp;  	mutex_unlock(&smp_alt);  } @@ -540,40 +503,22 @@ void __init alternative_instructions(void)  	apply_alternatives(__alt_instructions, __alt_instructions_end); -	/* switch to patch-once-at-boottime-only mode and free the -	 * tables in case we know the number of CPUs will never ever -	 * change */ -#ifdef CONFIG_HOTPLUG_CPU -	if (num_possible_cpus() < 2) -		smp_alt_once = 1; -#endif -  #ifdef CONFIG_SMP -	if (smp_alt_once) { -		if (1 == num_possible_cpus()) { -			pr_info("switching to UP code\n"); -			set_cpu_cap(&boot_cpu_data, X86_FEATURE_UP); -			set_cpu_cap(&cpu_data(0), X86_FEATURE_UP); - -			alternatives_smp_unlock(__smp_locks, __smp_locks_end, -						_text, _etext); -		} -	} else { +	/* Patch to UP if other cpus not imminent. 
*/ +	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { +		uniproc_patched = true;  		alternatives_smp_module_add(NULL, "core kernel",  					    __smp_locks, __smp_locks_end,  					    _text, _etext); - -		/* Only switch to UP mode if we don't immediately boot others */ -		if (num_present_cpus() == 1 || setup_max_cpus <= 1) -			alternatives_smp_switch(0);  	} -#endif - 	apply_paravirt(__parainstructions, __parainstructions_end); -	if (smp_alt_once) +	if (!uniproc_patched || num_possible_cpus() == 1)  		free_init_pages("SMP alternatives",  				(unsigned long)__smp_locks,  				(unsigned long)__smp_locks_end); +#endif + +	apply_paravirt(__parainstructions, __parainstructions_end);  	restart_nmi();  } diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 24deb308232..b17416e72fb 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1934,7 +1934,7 @@ void smp_error_interrupt(struct pt_regs *regs)  			apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);  		i++;  		v1 >>= 1; -	}; +	}  	apic_printk(APIC_DEBUG, KERN_CONT "\n"); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 9d92e19039f..f7e98a2c0d1 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -737,6 +737,72 @@ static unsigned int __cpuinit amd_size_cache(struct cpuinfo_x86 *c,  }  #endif +static void __cpuinit cpu_set_tlb_flushall_shift(struct cpuinfo_x86 *c) +{ +	if (!cpu_has_invlpg) +		return; + +	tlb_flushall_shift = 5; + +	if (c->x86 <= 0x11) +		tlb_flushall_shift = 4; +} + +static void __cpuinit cpu_detect_tlb_amd(struct cpuinfo_x86 *c) +{ +	u32 ebx, eax, ecx, edx; +	u16 mask = 0xfff; + +	if (c->x86 < 0xf) +		return; + +	if (c->extended_cpuid_level < 0x80000006) +		return; + +	cpuid(0x80000006, &eax, &ebx, &ecx, &edx); + +	tlb_lld_4k[ENTRIES] = (ebx >> 16) & mask; +	tlb_lli_4k[ENTRIES] = ebx & mask; + +	/* +	 * K8 doesn't have 2M/4M entries in the L2 TLB so read out the L1 TLB +	 * characteristics from the CPUID function 0x80000005 instead. 
+	 */ +	if (c->x86 == 0xf) { +		cpuid(0x80000005, &eax, &ebx, &ecx, &edx); +		mask = 0xff; +	} + +	/* Handle DTLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ +	if (!((eax >> 16) & mask)) { +		u32 a, b, c, d; + +		cpuid(0x80000005, &a, &b, &c, &d); +		tlb_lld_2m[ENTRIES] = (a >> 16) & 0xff; +	} else { +		tlb_lld_2m[ENTRIES] = (eax >> 16) & mask; +	} + +	/* a 4M entry uses two 2M entries */ +	tlb_lld_4m[ENTRIES] = tlb_lld_2m[ENTRIES] >> 1; + +	/* Handle ITLB 2M and 4M sizes, fall back to L1 if L2 is disabled */ +	if (!(eax & mask)) { +		/* Erratum 658 */ +		if (c->x86 == 0x15 && c->x86_model <= 0x1f) { +			tlb_lli_2m[ENTRIES] = 1024; +		} else { +			cpuid(0x80000005, &eax, &ebx, &ecx, &edx); +			tlb_lli_2m[ENTRIES] = eax & 0xff; +		} +	} else +		tlb_lli_2m[ENTRIES] = eax & mask; + +	tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; + +	cpu_set_tlb_flushall_shift(c); +} +  static const struct cpu_dev __cpuinitconst amd_cpu_dev = {  	.c_vendor	= "AMD",  	.c_ident	= { "AuthenticAMD" }, @@ -756,6 +822,7 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {  	.c_size_cache	= amd_size_cache,  #endif  	.c_early_init   = early_init_amd, +	.c_detect_tlb	= cpu_detect_tlb_amd,  	.c_bsp_init	= bsp_init_amd,  	.c_init		= init_amd,  	.c_x86_vendor	= X86_VENDOR_AMD, diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index c97bb7b5a9f..d0e910da16c 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -165,10 +165,15 @@ void __init check_bugs(void)  	print_cpu_info(&boot_cpu_data);  #endif  	check_config(); -	check_fpu();  	check_hlt();  	check_popad();  	init_utsname()->machine[1] =  		'0' + (boot_cpu_data.x86 > 6 ? 6 : boot_cpu_data.x86);  	alternative_instructions(); + +	/* +	 * kernel_fpu_begin/end() in check_fpu() relies on the patched +	 * alternative instructions. 
+	 */ +	check_fpu();  } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a5fbc3c5fcc..7505f7b13e7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -259,23 +259,36 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)  }  #endif -static int disable_smep __cpuinitdata;  static __init int setup_disable_smep(char *arg)  { -	disable_smep = 1; +	setup_clear_cpu_cap(X86_FEATURE_SMEP);  	return 1;  }  __setup("nosmep", setup_disable_smep); -static __cpuinit void setup_smep(struct cpuinfo_x86 *c) +static __always_inline void setup_smep(struct cpuinfo_x86 *c)  { -	if (cpu_has(c, X86_FEATURE_SMEP)) { -		if (unlikely(disable_smep)) { -			setup_clear_cpu_cap(X86_FEATURE_SMEP); -			clear_in_cr4(X86_CR4_SMEP); -		} else -			set_in_cr4(X86_CR4_SMEP); -	} +	if (cpu_has(c, X86_FEATURE_SMEP)) +		set_in_cr4(X86_CR4_SMEP); +} + +static __init int setup_disable_smap(char *arg) +{ +	setup_clear_cpu_cap(X86_FEATURE_SMAP); +	return 1; +} +__setup("nosmap", setup_disable_smap); + +static __always_inline void setup_smap(struct cpuinfo_x86 *c) +{ +	unsigned long eflags; + +	/* This should have been cleared long ago */ +	raw_local_save_flags(eflags); +	BUG_ON(eflags & X86_EFLAGS_AC); + +	if (cpu_has(c, X86_FEATURE_SMAP)) +		set_in_cr4(X86_CR4_SMAP);  }  /* @@ -476,7 +489,7 @@ void __cpuinit cpu_detect_tlb(struct cpuinfo_x86 *c)  	printk(KERN_INFO "Last level iTLB entries: 4KB %d, 2MB %d, 4MB %d\n" \  		"Last level dTLB entries: 4KB %d, 2MB %d, 4MB %d\n"	     \ -		"tlb_flushall_shift is 0x%x\n", +		"tlb_flushall_shift: %d\n",  		tlb_lli_4k[ENTRIES], tlb_lli_2m[ENTRIES],  		tlb_lli_4m[ENTRIES], tlb_lld_4k[ENTRIES],  		tlb_lld_2m[ENTRIES], tlb_lld_4m[ENTRIES], @@ -712,8 +725,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)  	c->cpu_index = 0;  	filter_cpuid_features(c, false); -	setup_smep(c); -  	if (this_cpu->c_bsp_init)  		this_cpu->c_bsp_init(c);  } @@ -798,8 +809,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)  		c->phys_proc_id = c->initial_apicid;  	} -	setup_smep(c); -  	get_model_name(c); /* Default name */  	detect_nopl(c); @@ -864,6 +873,10 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)  	/* Disable the PN if appropriate */  	squash_the_stupid_serial_number(c); +	/* Set up SMEP/SMAP */ +	setup_smep(c); +	setup_smap(c); +  	/*  	 * The vendor-specific functions might have changed features.  	 * Now we do "generic changes." 
@@ -942,8 +955,7 @@ void __init identify_boot_cpu(void)  #else  	vgetcpu_set_mode();  #endif -	if (boot_cpu_data.cpuid_level >= 2) -		cpu_detect_tlb(&boot_cpu_data); +	cpu_detect_tlb(&boot_cpu_data);  }  void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) @@ -1023,14 +1035,16 @@ void __cpuinit print_cpu_info(struct cpuinfo_x86 *c)  		printk(KERN_CONT "%s ", vendor);  	if (c->x86_model_id[0]) -		printk(KERN_CONT "%s", c->x86_model_id); +		printk(KERN_CONT "%s", strim(c->x86_model_id));  	else  		printk(KERN_CONT "%d86", c->x86); +	printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model); +  	if (c->x86_mask || c->cpuid_level >= 0) -		printk(KERN_CONT " stepping %02x\n", c->x86_mask); +		printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask);  	else -		printk(KERN_CONT "\n"); +		printk(KERN_CONT ")\n");  	print_cpu_msr(c);  } @@ -1113,11 +1127,10 @@ void syscall_init(void)  	/* Flags to clear on syscall */  	wrmsrl(MSR_SYSCALL_MASK, -	       X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF|X86_EFLAGS_IOPL); +	       X86_EFLAGS_TF|X86_EFLAGS_DF|X86_EFLAGS_IF| +	       X86_EFLAGS_IOPL|X86_EFLAGS_AC);  } -unsigned long kernel_eflags; -  /*   * Copies of the original ist values from the tss are only accessed during   * debugging, no special alignment required. @@ -1297,9 +1310,6 @@ void __cpuinit cpu_init(void)  	dbg_restore_debug_regs();  	fpu_init(); -	xsave_init(); - -	raw_local_save_flags(kernel_eflags);  	if (is_uv_system())  		uv_cpu_init(); @@ -1352,6 +1362,5 @@ void __cpuinit cpu_init(void)  	dbg_restore_debug_regs();  	fpu_init(); -	xsave_init();  }  #endif diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 0a4ce2980a5..198e019a531 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -648,6 +648,10 @@ static void __cpuinit intel_detect_tlb(struct cpuinfo_x86 *c)  	int i, j, n;  	unsigned int regs[4];  	unsigned char *desc = (unsigned char *)regs; + +	if (c->cpuid_level < 2) +		return; +  	/* Number of times to iterate */  	n = cpuid_eax(2) & 0xFF; diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index fc4beb39357..ddc72f83933 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -78,6 +78,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)  }  static cpumask_var_t mce_inject_cpumask; +static DEFINE_MUTEX(mce_inject_mutex);  static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs)  { @@ -194,7 +195,11 @@ static void raise_mce(struct mce *m)  		put_online_cpus();  	} else  #endif +	{ +		preempt_disable();  		raise_local(); +		preempt_enable(); +	}  }  /* Error injection interface */ @@ -225,7 +230,10 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,  	 * so do it a jiffie or two later everywhere.  	 
*/  	schedule_timeout(2); + +	mutex_lock(&mce_inject_mutex);  	raise_mce(&m); +	mutex_unlock(&mce_inject_mutex);  	return usize;  } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index ed44c8a6585..6a05c1d327a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -28,6 +28,18 @@ extern int mce_ser;  extern struct mce_bank *mce_banks; +#ifdef CONFIG_X86_MCE_INTEL +unsigned long mce_intel_adjust_timer(unsigned long interval); +void mce_intel_cmci_poll(void); +void mce_intel_hcpu_update(unsigned long cpu); +#else +# define mce_intel_adjust_timer mce_adjust_timer_default +static inline void mce_intel_cmci_poll(void) { } +static inline void mce_intel_hcpu_update(unsigned long cpu) { } +#endif + +void mce_timer_kick(unsigned long interval); +  #ifdef CONFIG_ACPI_APEI  int apei_write_mce(struct mce *m);  ssize_t apei_read_mce(struct mce *m, u64 *record_id); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 292d0258311..29e87d3b284 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -83,6 +83,7 @@ static int			mce_dont_log_ce		__read_mostly;  int				mce_cmci_disabled	__read_mostly;  int				mce_ignore_ce		__read_mostly;  int				mce_ser			__read_mostly; +int				mce_bios_cmci_threshold	__read_mostly;  struct mce_bank                *mce_banks		__read_mostly; @@ -1266,6 +1267,14 @@ static unsigned long check_interval = 5 * 60; /* 5 minutes */  static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */  static DEFINE_PER_CPU(struct timer_list, mce_timer); +static unsigned long mce_adjust_timer_default(unsigned long interval) +{ +	return interval; +} + +static unsigned long (*mce_adjust_timer)(unsigned long interval) = +	mce_adjust_timer_default; +  static void mce_timer_fn(unsigned long data)  {  	struct timer_list *t = &__get_cpu_var(mce_timer); @@ -1276,6 +1285,7 @@ static void mce_timer_fn(unsigned long data)  	if (mce_available(__this_cpu_ptr(&cpu_info))) {  		machine_check_poll(MCP_TIMESTAMP,  				&__get_cpu_var(mce_poll_banks)); +		mce_intel_cmci_poll();  	}  	/* @@ -1283,14 +1293,38 @@ static void mce_timer_fn(unsigned long data)  	 * polling interval, otherwise increase the polling interval.  	 */  	iv = __this_cpu_read(mce_next_interval); -	if (mce_notify_irq()) +	if (mce_notify_irq()) {  		iv = max(iv / 2, (unsigned long) HZ/100); -	else +	} else {  		iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); +		iv = mce_adjust_timer(iv); +	}  	__this_cpu_write(mce_next_interval, iv); +	/* Might have become 0 after CMCI storm subsided */ +	if (iv) { +		t->expires = jiffies + iv; +		add_timer_on(t, smp_processor_id()); +	} +} -	t->expires = jiffies + iv; -	add_timer_on(t, smp_processor_id()); +/* + * Ensure that the timer is firing in @interval from now. 
+ */ +void mce_timer_kick(unsigned long interval) +{ +	struct timer_list *t = &__get_cpu_var(mce_timer); +	unsigned long when = jiffies + interval; +	unsigned long iv = __this_cpu_read(mce_next_interval); + +	if (timer_pending(t)) { +		if (time_before(when, t->expires)) +			mod_timer_pinned(t, when); +	} else { +		t->expires = round_jiffies(when); +		add_timer_on(t, smp_processor_id()); +	} +	if (interval < iv) +		__this_cpu_write(mce_next_interval, interval);  }  /* Must not be called in IRQ context where del_timer_sync() can deadlock */ @@ -1585,6 +1619,7 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)  	switch (c->x86_vendor) {  	case X86_VENDOR_INTEL:  		mce_intel_feature_init(c); +		mce_adjust_timer = mce_intel_adjust_timer;  		break;  	case X86_VENDOR_AMD:  		mce_amd_feature_init(c); @@ -1594,23 +1629,28 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)  	}  } -static void __mcheck_cpu_init_timer(void) +static void mce_start_timer(unsigned int cpu, struct timer_list *t)  { -	struct timer_list *t = &__get_cpu_var(mce_timer); -	unsigned long iv = check_interval * HZ; +	unsigned long iv = mce_adjust_timer(check_interval * HZ); -	setup_timer(t, mce_timer_fn, smp_processor_id()); +	__this_cpu_write(mce_next_interval, iv); -	if (mce_ignore_ce) +	if (mce_ignore_ce || !iv)  		return; -	__this_cpu_write(mce_next_interval, iv); -	if (!iv) -		return;  	t->expires = round_jiffies(jiffies + iv);  	add_timer_on(t, smp_processor_id());  } +static void __mcheck_cpu_init_timer(void) +{ +	struct timer_list *t = &__get_cpu_var(mce_timer); +	unsigned int cpu = smp_processor_id(); + +	setup_timer(t, mce_timer_fn, cpu); +	mce_start_timer(cpu, t); +} +  /* Handle unconfigured int18 (should never happen) */  static void unexpected_machine_check(struct pt_regs *regs, long error_code)  { @@ -1907,6 +1947,7 @@ static struct miscdevice mce_chrdev_device = {   *	check, or 0 to not wait   * mce=bootlog Log MCEs from before booting. Disabled by default on AMD.   * mce=nobootlog Don't log MCEs from before booting. 
+ * mce=bios_cmci_threshold Don't program the CMCI threshold   */  static int __init mcheck_enable(char *str)  { @@ -1926,6 +1967,8 @@ static int __init mcheck_enable(char *str)  		mce_ignore_ce = 1;  	else if (!strcmp(str, "bootlog") || !strcmp(str, "nobootlog"))  		mce_bootlog = (str[0] == 'b'); +	else if (!strcmp(str, "bios_cmci_threshold")) +		mce_bios_cmci_threshold = 1;  	else if (isdigit(str[0])) {  		get_option(&str, &tolerant);  		if (*str == ',') { @@ -2166,6 +2209,11 @@ static struct dev_ext_attribute dev_attr_cmci_disabled = {  	&mce_cmci_disabled  }; +static struct dev_ext_attribute dev_attr_bios_cmci_threshold = { +	__ATTR(bios_cmci_threshold, 0444, device_show_int, NULL), +	&mce_bios_cmci_threshold +}; +  static struct device_attribute *mce_device_attrs[] = {  	&dev_attr_tolerant.attr,  	&dev_attr_check_interval.attr, @@ -2174,6 +2222,7 @@ static struct device_attribute *mce_device_attrs[] = {  	&dev_attr_dont_log_ce.attr,  	&dev_attr_ignore_ce.attr,  	&dev_attr_cmci_disabled.attr, +	&dev_attr_bios_cmci_threshold.attr,  	NULL  }; @@ -2294,38 +2343,33 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)  	unsigned int cpu = (unsigned long)hcpu;  	struct timer_list *t = &per_cpu(mce_timer, cpu); -	switch (action) { +	switch (action & ~CPU_TASKS_FROZEN) {  	case CPU_ONLINE: -	case CPU_ONLINE_FROZEN:  		mce_device_create(cpu);  		if (threshold_cpu_callback)  			threshold_cpu_callback(action, cpu);  		break;  	case CPU_DEAD: -	case CPU_DEAD_FROZEN:  		if (threshold_cpu_callback)  			threshold_cpu_callback(action, cpu);  		mce_device_remove(cpu); +		mce_intel_hcpu_update(cpu);  		break;  	case CPU_DOWN_PREPARE: -	case CPU_DOWN_PREPARE_FROZEN: -		del_timer_sync(t);  		smp_call_function_single(cpu, mce_disable_cpu, &action, 1); +		del_timer_sync(t);  		break;  	case CPU_DOWN_FAILED: -	case CPU_DOWN_FAILED_FROZEN: -		if (!mce_ignore_ce && check_interval) { -			t->expires = round_jiffies(jiffies + -					per_cpu(mce_next_interval, cpu)); -			add_timer_on(t, cpu); -		}  		smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); +		mce_start_timer(cpu, t);  		break; -	case CPU_POST_DEAD: +	} + +	if (action == CPU_POST_DEAD) {  		/* intentionally ignoring frozen here */  		cmci_rediscover(cpu); -		break;  	} +  	return NOTIFY_OK;  } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 38e49bc95ff..5f88abf07e9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -15,6 +15,8 @@  #include <asm/msr.h>  #include <asm/mce.h> +#include "mce-internal.h" +  /*   * Support for Intel Correct Machine Check Interrupts. This allows   * the CPU to raise an interrupt when a corrected machine check happened. 
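The hunk below adds the per-CPU CMCI storm bookkeeping; further down in this file, cmci_discover() is also reworked so that, with the new mce=bios_cmci_threshold option, a BIOS-programmed corrected-error threshold in MCi_CTL2 is kept rather than overwritten. A minimal sketch of that register composition (illustrative only; the helper name is made up, constants as in the kernel's MCE headers):

/* Illustrative sketch of how cmci_discover() builds the MCi_CTL2 value:
 * bit 30 enables CMCI, bits 14:0 hold the corrected-error threshold. */
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define MCI_CTL2_CMCI_EN		(1ULL << 30)
#define MCI_CTL2_CMCI_THRESHOLD_MASK	0x7fffULL
#define CMCI_THRESHOLD			1

static uint64_t compose_ctl2(uint64_t val, bool bios_cmci_threshold)
{
	if (!bios_cmci_threshold) {
		/* default behaviour: force a threshold of 1 */
		val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
		val |= CMCI_THRESHOLD;
	} else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) {
		/* honour the BIOS value, but never leave it at zero */
		val |= CMCI_THRESHOLD;
	}
	return val | MCI_CTL2_CMCI_EN;
}

int main(void)
{
	printf("default:           %#llx\n", (unsigned long long)compose_ctl2(0, false));
	printf("bios, threshold 0: %#llx\n", (unsigned long long)compose_ctl2(0, true));
	printf("bios, threshold 5: %#llx\n", (unsigned long long)compose_ctl2(5, true));
	return 0;
}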
@@ -30,7 +32,22 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);   */  static DEFINE_RAW_SPINLOCK(cmci_discover_lock); -#define CMCI_THRESHOLD 1 +#define CMCI_THRESHOLD		1 +#define CMCI_POLL_INTERVAL	(30 * HZ) +#define CMCI_STORM_INTERVAL	(1 * HZ) +#define CMCI_STORM_THRESHOLD	15 + +static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); +static DEFINE_PER_CPU(unsigned int, cmci_storm_cnt); +static DEFINE_PER_CPU(unsigned int, cmci_storm_state); + +enum { +	CMCI_STORM_NONE, +	CMCI_STORM_ACTIVE, +	CMCI_STORM_SUBSIDED, +}; + +static atomic_t cmci_storm_on_cpus;  static int cmci_supported(int *banks)  { @@ -53,6 +70,93 @@ static int cmci_supported(int *banks)  	return !!(cap & MCG_CMCI_P);  } +void mce_intel_cmci_poll(void) +{ +	if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) +		return; +	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); +} + +void mce_intel_hcpu_update(unsigned long cpu) +{ +	if (per_cpu(cmci_storm_state, cpu) == CMCI_STORM_ACTIVE) +		atomic_dec(&cmci_storm_on_cpus); + +	per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; +} + +unsigned long mce_intel_adjust_timer(unsigned long interval) +{ +	int r; + +	if (interval < CMCI_POLL_INTERVAL) +		return interval; + +	switch (__this_cpu_read(cmci_storm_state)) { +	case CMCI_STORM_ACTIVE: +		/* +		 * We switch back to interrupt mode once the poll timer has +		 * silenced itself. That means no events recorded and the +		 * timer interval is back to our poll interval. +		 */ +		__this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); +		r = atomic_sub_return(1, &cmci_storm_on_cpus); +		if (r == 0) +			pr_notice("CMCI storm subsided: switching to interrupt mode\n"); +		/* FALLTHROUGH */ + +	case CMCI_STORM_SUBSIDED: +		/* +		 * We wait for all cpus to go back to SUBSIDED +		 * state. When that happens we switch back to +		 * interrupt mode. +		 */ +		if (!atomic_read(&cmci_storm_on_cpus)) { +			__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); +			cmci_reenable(); +			cmci_recheck(); +		} +		return CMCI_POLL_INTERVAL; +	default: +		/* +		 * We have shiny weather. Let the poll do whatever it +		 * thinks. +		 */ +		return interval; +	} +} + +static bool cmci_storm_detect(void) +{ +	unsigned int cnt = __this_cpu_read(cmci_storm_cnt); +	unsigned long ts = __this_cpu_read(cmci_time_stamp); +	unsigned long now = jiffies; +	int r; + +	if (__this_cpu_read(cmci_storm_state) != CMCI_STORM_NONE) +		return true; + +	if (time_before_eq(now, ts + CMCI_STORM_INTERVAL)) { +		cnt++; +	} else { +		cnt = 1; +		__this_cpu_write(cmci_time_stamp, now); +	} +	__this_cpu_write(cmci_storm_cnt, cnt); + +	if (cnt <= CMCI_STORM_THRESHOLD) +		return false; + +	cmci_clear(); +	__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); +	r = atomic_add_return(1, &cmci_storm_on_cpus); +	mce_timer_kick(CMCI_POLL_INTERVAL); + +	if (r == 1) +		pr_notice("CMCI storm detected: switching to poll mode\n"); +	return true; +} +  /*   * The interrupt handler. This is called on every event.   * Just call the poller directly to log any events. 
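cmci_storm_detect() above is a per-CPU rate limiter: interrupts are counted inside a CMCI_STORM_INTERVAL (one second) window and, once more than CMCI_STORM_THRESHOLD of them arrive, CMCI is switched off and the poll timer takes over. A userspace sketch of the same windowed-count idea (hypothetical names, jiffies replaced by a monotonic clock; illustrative only):

/* Illustrative sketch of the storm detector's sliding one-second window. */
#include <stdio.h>
#include <stdbool.h>
#include <time.h>

#define STORM_INTERVAL_NS	1000000000ULL	/* 1 second */
#define STORM_THRESHOLD		15

struct storm_state {
	unsigned long long	window_start;
	unsigned int		count;
	bool			active;
};

static unsigned long long now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (unsigned long long)ts.tv_sec * 1000000000ULL + ts.tv_nsec;
}

/* Returns true once the event rate has crossed the threshold. */
static bool storm_detect(struct storm_state *s)
{
	unsigned long long now = now_ns();

	if (s->active)
		return true;

	if (now - s->window_start <= STORM_INTERVAL_NS) {
		s->count++;
	} else {
		s->count = 1;
		s->window_start = now;
	}

	if (s->count <= STORM_THRESHOLD)
		return false;

	s->active = true;	/* the kernel also disables CMCI and kicks the poll timer here */
	return true;
}

int main(void)
{
	struct storm_state s = { .window_start = now_ns() };

	for (int i = 1; i <= 20; i++)
		printf("event %2d -> storm=%d\n", i, storm_detect(&s));
	return 0;
}

Once a storm is active, the real code only backs out via mce_intel_adjust_timer(): when a CPU's poll interval has grown back to CMCI_POLL_INTERVAL it moves to SUBSIDED, and once every CPU has subsided CMCI is re-enabled and rechecked.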
@@ -61,33 +165,28 @@ static int cmci_supported(int *banks)   */  static void intel_threshold_interrupt(void)  { +	if (cmci_storm_detect()) +		return;  	machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));  	mce_notify_irq();  } -static void print_update(char *type, int *hdr, int num) -{ -	if (*hdr == 0) -		printk(KERN_INFO "CPU %d MCA banks", smp_processor_id()); -	*hdr = 1; -	printk(KERN_CONT " %s:%d", type, num); -} -  /*   * Enable CMCI (Corrected Machine Check Interrupt) for available MCE banks   * on this CPU. Use the algorithm recommended in the SDM to discover shared   * banks.   */ -static void cmci_discover(int banks, int boot) +static void cmci_discover(int banks)  {  	unsigned long *owned = (void *)&__get_cpu_var(mce_banks_owned);  	unsigned long flags; -	int hdr = 0;  	int i; +	int bios_wrong_thresh = 0;  	raw_spin_lock_irqsave(&cmci_discover_lock, flags);  	for (i = 0; i < banks; i++) {  		u64 val; +		int bios_zero_thresh = 0;  		if (test_bit(i, owned))  			continue; @@ -96,29 +195,52 @@ static void cmci_discover(int banks, int boot)  		/* Already owned by someone else? */  		if (val & MCI_CTL2_CMCI_EN) { -			if (test_and_clear_bit(i, owned) && !boot) -				print_update("SHD", &hdr, i); +			clear_bit(i, owned);  			__clear_bit(i, __get_cpu_var(mce_poll_banks));  			continue;  		} -		val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; -		val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; +		if (!mce_bios_cmci_threshold) { +			val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; +			val |= CMCI_THRESHOLD; +		} else if (!(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) { +			/* +			 * If bios_cmci_threshold boot option was specified +			 * but the threshold is zero, we'll try to initialize +			 * it to 1. +			 */ +			bios_zero_thresh = 1; +			val |= CMCI_THRESHOLD; +		} + +		val |= MCI_CTL2_CMCI_EN;  		wrmsrl(MSR_IA32_MCx_CTL2(i), val);  		rdmsrl(MSR_IA32_MCx_CTL2(i), val);  		/* Did the enable bit stick? -- the bank supports CMCI */  		if (val & MCI_CTL2_CMCI_EN) { -			if (!test_and_set_bit(i, owned) && !boot) -				print_update("CMCI", &hdr, i); +			set_bit(i, owned);  			__clear_bit(i, __get_cpu_var(mce_poll_banks)); +			/* +			 * We are able to set thresholds for some banks that +			 * had a threshold of 0. This means the BIOS has not +			 * set the thresholds properly or does not work with +			 * this boot option. Note down now and report later. 
+			 */ +			if (mce_bios_cmci_threshold && bios_zero_thresh && +					(val & MCI_CTL2_CMCI_THRESHOLD_MASK)) +				bios_wrong_thresh = 1;  		} else {  			WARN_ON(!test_bit(i, __get_cpu_var(mce_poll_banks)));  		}  	}  	raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); -	if (hdr) -		printk(KERN_CONT "\n"); +	if (mce_bios_cmci_threshold && bios_wrong_thresh) { +		pr_info_once( +			"bios_cmci_threshold: Some banks do not have valid thresholds set\n"); +		pr_info_once( +			"bios_cmci_threshold: Make sure your BIOS supports this boot option\n"); +	}  }  /* @@ -156,7 +278,7 @@ void cmci_clear(void)  			continue;  		/* Disable CMCI */  		rdmsrl(MSR_IA32_MCx_CTL2(i), val); -		val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK); +		val &= ~MCI_CTL2_CMCI_EN;  		wrmsrl(MSR_IA32_MCx_CTL2(i), val);  		__clear_bit(i, __get_cpu_var(mce_banks_owned));  	} @@ -186,7 +308,7 @@ void cmci_rediscover(int dying)  			continue;  		/* Recheck banks in case CPUs don't all have the same */  		if (cmci_supported(&banks)) -			cmci_discover(banks, 0); +			cmci_discover(banks);  	}  	set_cpus_allowed_ptr(current, old); @@ -200,7 +322,7 @@ void cmci_reenable(void)  {  	int banks;  	if (cmci_supported(&banks)) -		cmci_discover(banks, 0); +		cmci_discover(banks);  }  static void intel_init_cmci(void) @@ -211,7 +333,7 @@ static void intel_init_cmci(void)  		return;  	mce_threshold_vector = intel_threshold_interrupt; -	cmci_discover(banks, 1); +	cmci_discover(banks);  	/*  	 * For CPU #0 this runs with still disabled APIC, but that's  	 * ok because only the vector is set up. We still do another diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 6605a81ba33..8b6defe7eef 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -586,6 +586,8 @@ extern struct event_constraint intel_westmere_pebs_event_constraints[];  extern struct event_constraint intel_snb_pebs_event_constraints[]; +extern struct event_constraint intel_ivb_pebs_event_constraints[]; +  struct event_constraint *intel_pebs_constraints(struct perf_event *event);  void intel_pmu_pebs_enable(struct perf_event *event); diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index 7bfb5bec863..eebd5ffe1bb 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -209,6 +209,15 @@ static int perf_ibs_precise_event(struct perf_event *event, u64 *config)  	return -EOPNOTSUPP;  } +static const struct perf_event_attr ibs_notsupp = { +	.exclude_user	= 1, +	.exclude_kernel	= 1, +	.exclude_hv	= 1, +	.exclude_idle	= 1, +	.exclude_host	= 1, +	.exclude_guest	= 1, +}; +  static int perf_ibs_init(struct perf_event *event)  {  	struct hw_perf_event *hwc = &event->hw; @@ -229,6 +238,9 @@ static int perf_ibs_init(struct perf_event *event)  	if (event->pmu != &perf_ibs->pmu)  		return -ENOENT; +	if (perf_flags(&event->attr) & perf_flags(&ibs_notsupp)) +		return -EINVAL; +  	if (config & ~perf_ibs->config_mask)  		return -EINVAL; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 7f2739e03e7..6bca492b854 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2008,6 +2008,7 @@ __init int intel_pmu_init(void)  		break;  	case 28: /* Atom */ +	case 54: /* Cedariew */  		memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,  		       sizeof(hw_cache_event_ids)); @@ -2047,7 +2048,6 @@ __init int intel_pmu_init(void)  	case 42: /* SandyBridge 
*/  	case 45: /* SandyBridge, "Romely-EP" */  		x86_add_quirk(intel_sandybridge_quirk); -	case 58: /* IvyBridge */  		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,  		       sizeof(hw_cache_event_ids));  		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, @@ -2072,6 +2072,29 @@ __init int intel_pmu_init(void)  		pr_cont("SandyBridge events, ");  		break; +	case 58: /* IvyBridge */ +		memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, +		       sizeof(hw_cache_event_ids)); +		memcpy(hw_cache_extra_regs, snb_hw_cache_extra_regs, +		       sizeof(hw_cache_extra_regs)); + +		intel_pmu_lbr_init_snb(); + +		x86_pmu.event_constraints = intel_snb_event_constraints; +		x86_pmu.pebs_constraints = intel_ivb_pebs_event_constraints; +		x86_pmu.pebs_aliases = intel_pebs_aliases_snb; +		x86_pmu.extra_regs = intel_snb_extra_regs; +		/* all extra regs are per-cpu when HT is on */ +		x86_pmu.er_flags |= ERF_HAS_RSP_1; +		x86_pmu.er_flags |= ERF_NO_HT_SHARING; + +		/* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */ +		intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = +			X86_CONFIG(.event=0x0e, .umask=0x01, .inv=1, .cmask=1); + +		pr_cont("IvyBridge events, "); +		break; +  	default:  		switch (x86_pmu.version) { diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index e38d97bf425..826054a4f2e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -407,6 +407,20 @@ struct event_constraint intel_snb_pebs_event_constraints[] = {  	EVENT_CONSTRAINT_END  }; +struct event_constraint intel_ivb_pebs_event_constraints[] = { +        INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */ +        INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */ +        INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */ +        INTEL_EVENT_CONSTRAINT(0xc4, 0xf),    /* BR_INST_RETIRED.* */ +        INTEL_EVENT_CONSTRAINT(0xc5, 0xf),    /* BR_MISP_RETIRED.* */ +        INTEL_EVENT_CONSTRAINT(0xcd, 0x8),    /* MEM_TRANS_RETIRED.* */ +        INTEL_EVENT_CONSTRAINT(0xd0, 0xf),    /* MEM_UOP_RETIRED.* */ +        INTEL_EVENT_CONSTRAINT(0xd1, 0xf),    /* MEM_LOAD_UOPS_RETIRED.* */ +        INTEL_EVENT_CONSTRAINT(0xd2, 0xf),    /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */ +        INTEL_EVENT_CONSTRAINT(0xd3, 0xf),    /* MEM_LOAD_UOPS_LLC_MISS_RETIRED.* */ +        EVENT_CONSTRAINT_END +}; +  struct event_constraint *intel_pebs_constraints(struct perf_event *event)  {  	struct event_constraint *c; diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 520b4265fcd..da02e9cc375 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -686,7 +686,8 @@ void intel_pmu_lbr_init_atom(void)  	 * to have an operational LBR which can freeze  	 * on PMU interrupt  	 */ -	if (boot_cpu_data.x86_mask < 10) { +	if (boot_cpu_data.x86_model == 28 +	    && boot_cpu_data.x86_mask < 10) {  		pr_cont("LBR disabled due to erratum");  		return;  	} diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 0a5571080e7..99d96a4978b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -661,6 +661,11 @@ static void snb_uncore_msr_init_box(struct intel_uncore_box *box)  	}  } +static struct uncore_event_desc snb_uncore_events[] = { +	INTEL_UNCORE_EVENT_DESC(clockticks, "event=0xff,umask=0x00"), +	
{ /* end: all zeroes */ }, +}; +  static struct attribute *snb_uncore_formats_attr[] = {  	&format_attr_event.attr,  	&format_attr_umask.attr, @@ -704,6 +709,7 @@ static struct intel_uncore_type snb_uncore_cbox = {  	.constraints	= snb_uncore_cbox_constraints,  	.ops		= &snb_uncore_msr_ops,  	.format_group	= &snb_uncore_format_group, +	.event_descs	= snb_uncore_events,  };  static struct intel_uncore_type *snb_msr_uncores[] = { @@ -1944,7 +1950,7 @@ struct intel_uncore_box *uncore_alloc_box(struct intel_uncore_type *type, int cp  static struct intel_uncore_box *  uncore_pmu_to_box(struct intel_uncore_pmu *pmu, int cpu)  { -	static struct intel_uncore_box *box; +	struct intel_uncore_box *box;  	box = *per_cpu_ptr(pmu->box, cpu);  	if (box) @@ -2341,6 +2347,27 @@ int uncore_pmu_event_init(struct perf_event *event)  	return ret;  } +static ssize_t uncore_get_attr_cpumask(struct device *dev, +				struct device_attribute *attr, char *buf) +{ +	int n = cpulist_scnprintf(buf, PAGE_SIZE - 2, &uncore_cpu_mask); + +	buf[n++] = '\n'; +	buf[n] = '\0'; +	return n; +} + +static DEVICE_ATTR(cpumask, S_IRUGO, uncore_get_attr_cpumask, NULL); + +static struct attribute *uncore_pmu_attrs[] = { +	&dev_attr_cpumask.attr, +	NULL, +}; + +static struct attribute_group uncore_pmu_attr_group = { +	.attrs = uncore_pmu_attrs, +}; +  static int __init uncore_pmu_register(struct intel_uncore_pmu *pmu)  {  	int ret; @@ -2378,8 +2405,8 @@ static void __init uncore_type_exit(struct intel_uncore_type *type)  		free_percpu(type->pmus[i].box);  	kfree(type->pmus);  	type->pmus = NULL; -	kfree(type->attr_groups[1]); -	type->attr_groups[1] = NULL; +	kfree(type->events_group); +	type->events_group = NULL;  }  static void __init uncore_types_exit(struct intel_uncore_type **types) @@ -2431,9 +2458,10 @@ static int __init uncore_type_init(struct intel_uncore_type *type)  		for (j = 0; j < i; j++)  			attrs[j] = &type->event_descs[j].attr.attr; -		type->attr_groups[1] = events_group; +		type->events_group = events_group;  	} +	type->pmu_group = &uncore_pmu_attr_group;  	type->pmus = pmus;  	return 0;  fail: diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index 5b81c1856aa..e68a4550e95 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -369,10 +369,12 @@ struct intel_uncore_type {  	struct intel_uncore_pmu *pmus;  	struct intel_uncore_ops *ops;  	struct uncore_event_desc *event_descs; -	const struct attribute_group *attr_groups[3]; +	const struct attribute_group *attr_groups[4];  }; -#define format_group attr_groups[0] +#define pmu_group attr_groups[0] +#define format_group attr_groups[1] +#define events_group attr_groups[2]  struct intel_uncore_ops {  	void (*init_box)(struct intel_uncore_box *); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 8022c668148..fbd89556229 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -140,10 +140,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)  static void *c_start(struct seq_file *m, loff_t *pos)  { -	if (*pos == 0)	/* just in case, cpu 0 is not the first */ -		*pos = cpumask_first(cpu_online_mask); -	else -		*pos = cpumask_next(*pos - 1, cpu_online_mask); +	*pos = cpumask_next(*pos - 1, cpu_online_mask);  	if ((*pos) < nr_cpu_ids)  		return &cpu_data(*pos);  	return NULL; diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 39472dd2323..60c78917190 100644 --- a/arch/x86/kernel/cpuid.c +++ 
b/arch/x86/kernel/cpuid.c @@ -199,12 +199,14 @@ static int __init cpuid_init(void)  		goto out_chrdev;  	}  	cpuid_class->devnode = cpuid_devnode; +	get_online_cpus();  	for_each_online_cpu(i) {  		err = cpuid_device_create(i);  		if (err != 0)  			goto out_class;  	}  	register_hotcpu_notifier(&cpuid_class_cpu_notifier); +	put_online_cpus();  	err = 0;  	goto out; @@ -214,6 +216,7 @@ out_class:  	for_each_online_cpu(i) {  		cpuid_device_destroy(i);  	} +	put_online_cpus();  	class_destroy(cpuid_class);  out_chrdev:  	__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid"); @@ -225,11 +228,13 @@ static void __exit cpuid_exit(void)  {  	int cpu = 0; +	get_online_cpus();  	for_each_online_cpu(cpu)  		cpuid_device_destroy(cpu);  	class_destroy(cpuid_class);  	__unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");  	unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); +	put_online_cpus();  }  module_init(cpuid_init); diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 3ae2ced4a87..b1581527a23 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -342,6 +342,47 @@ const struct irq_domain_ops ioapic_irq_domain_ops = {  	.xlate = ioapic_xlate,  }; +static void dt_add_ioapic_domain(unsigned int ioapic_num, +		struct device_node *np) +{ +	struct irq_domain *id; +	struct mp_ioapic_gsi *gsi_cfg; +	int ret; +	int num; + +	gsi_cfg = mp_ioapic_gsi_routing(ioapic_num); +	num = gsi_cfg->gsi_end - gsi_cfg->gsi_base + 1; + +	id = irq_domain_add_linear(np, num, &ioapic_irq_domain_ops, +			(void *)ioapic_num); +	BUG_ON(!id); +	if (gsi_cfg->gsi_base == 0) { +		/* +		 * The first NR_IRQS_LEGACY irq descs are allocated in +		 * early_irq_init() and need just a mapping. The +		 * remaining irqs need both. All of them are preallocated +		 * and assigned so we can keep the 1:1 mapping which the ioapic +		 * is having. +		 */ +		ret = irq_domain_associate_many(id, 0, 0, NR_IRQS_LEGACY); +		if (ret) +			pr_err("Error mapping legacy IRQs: %d\n", ret); + +		if (num > NR_IRQS_LEGACY) { +			ret = irq_create_strict_mappings(id, NR_IRQS_LEGACY, +					NR_IRQS_LEGACY, num - NR_IRQS_LEGACY); +			if (ret) +				pr_err("Error creating mapping for the " +						"remaining IRQs: %d\n", ret); +		} +		irq_set_default_host(id); +	} else { +		ret = irq_create_strict_mappings(id, gsi_cfg->gsi_base, 0, num); +		if (ret) +			pr_err("Error creating IRQ mapping: %d\n", ret); +	} +} +  static void __init ioapic_add_ofnode(struct device_node *np)  {  	struct resource r; @@ -356,15 +397,7 @@ static void __init ioapic_add_ofnode(struct device_node *np)  	for (i = 0; i < nr_ioapics; i++) {  		if (r.start == mpc_ioapic_addr(i)) { -			struct irq_domain *id; -			struct mp_ioapic_gsi *gsi_cfg; - -			gsi_cfg = mp_ioapic_gsi_routing(i); - -			id = irq_domain_add_legacy(np, 32, gsi_cfg->gsi_base, 0, -						   &ioapic_irq_domain_ops, -						   (void*)i); -			BUG_ON(!id); +			dt_add_ioapic_domain(i, np);  			return;  		}  	} diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 623f2883747..0750e3ba87c 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -57,6 +57,7 @@  #include <asm/cpufeature.h>  #include <asm/alternative-asm.h>  #include <asm/asm.h> +#include <asm/smap.h>  /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  
*/  #include <linux/elf-em.h> @@ -407,7 +408,9 @@ sysenter_past_esp:   */  	cmpl $__PAGE_OFFSET-3,%ebp  	jae syscall_fault +	ASM_STAC  1:	movl (%ebp),%ebp +	ASM_CLAC  	movl %ebp,PT_EBP(%esp)  	_ASM_EXTABLE(1b,syscall_fault) @@ -488,6 +491,7 @@ ENDPROC(ia32_sysenter_target)  	# system call handler stub  ENTRY(system_call)  	RING0_INT_FRAME			# can't unwind into user space anyway +	ASM_CLAC  	pushl_cfi %eax			# save orig_eax  	SAVE_ALL  	GET_THREAD_INFO(%ebp) @@ -670,6 +674,7 @@ END(syscall_exit_work)  	RING0_INT_FRAME			# can't unwind into user space anyway  syscall_fault: +	ASM_CLAC  	GET_THREAD_INFO(%ebp)  	movl $-EFAULT,PT_EAX(%esp)  	jmp resume_userspace @@ -825,6 +830,7 @@ END(interrupt)   */  	.p2align CONFIG_X86_L1_CACHE_SHIFT  common_interrupt: +	ASM_CLAC  	addl $-0x80,(%esp)	/* Adjust vector into the [-256,-1] range */  	SAVE_ALL  	TRACE_IRQS_OFF @@ -841,6 +847,7 @@ ENDPROC(common_interrupt)  #define BUILD_INTERRUPT3(name, nr, fn)	\  ENTRY(name)				\  	RING0_INT_FRAME;		\ +	ASM_CLAC;			\  	pushl_cfi $~(nr);		\  	SAVE_ALL;			\  	TRACE_IRQS_OFF			\ @@ -857,6 +864,7 @@ ENDPROC(name)  ENTRY(coprocessor_error)  	RING0_INT_FRAME +	ASM_CLAC  	pushl_cfi $0  	pushl_cfi $do_coprocessor_error  	jmp error_code @@ -865,6 +873,7 @@ END(coprocessor_error)  ENTRY(simd_coprocessor_error)  	RING0_INT_FRAME +	ASM_CLAC  	pushl_cfi $0  #ifdef CONFIG_X86_INVD_BUG  	/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ @@ -886,6 +895,7 @@ END(simd_coprocessor_error)  ENTRY(device_not_available)  	RING0_INT_FRAME +	ASM_CLAC  	pushl_cfi $-1			# mark this as an int  	pushl_cfi $do_device_not_available  	jmp error_code @@ -906,6 +916,7 @@ END(native_irq_enable_sysexit)  ENTRY(overflow)  	RING0_INT_FRAME +	ASM_CLAC  	pushl_cfi $0  	pushl_cfi $do_overflow  	jmp error_code @@ -914,6 +925,7 @@ END(overflow)  ENTRY(bounds)  	RING0_INT_FRAME +	ASM_CLAC  	pushl_cfi $0  	pushl_cfi $do_bounds  	jmp error_code @@ -922,6 +934,7 @@ END(bounds)  ENTRY(invalid_op)  	RING0_INT_FRAME +	ASM_CLAC  	pushl_cfi $0  	pushl_cfi $do_invalid_op  	jmp error_code @@ -930,6 +943,7 @@ END(invalid_op)  ENTRY(coprocessor_segment_overrun)  	RING0_INT_FRAME +	ASM_CLAC  	pushl_cfi $0  	pushl_cfi $do_coprocessor_segment_overrun  	jmp error_code @@ -938,6 +952,7 @@ END(coprocessor_segment_overrun)  ENTRY(invalid_TSS)  	RING0_EC_FRAME +	ASM_CLAC  	pushl_cfi $do_invalid_TSS  	jmp error_code  	CFI_ENDPROC @@ -945,6 +960,7 @@ END(invalid_TSS)  ENTRY(segment_not_present)  	RING0_EC_FRAME +	ASM_CLAC  	pushl_cfi $do_segment_not_present  	jmp error_code  	CFI_ENDPROC @@ -952,6 +968,7 @@ END(segment_not_present)  ENTRY(stack_segment)  	RING0_EC_FRAME +	ASM_CLAC  	pushl_cfi $do_stack_segment  	jmp error_code  	CFI_ENDPROC @@ -959,6 +976,7 @@ END(stack_segment)  ENTRY(alignment_check)  	RING0_EC_FRAME +	ASM_CLAC  	pushl_cfi $do_alignment_check  	jmp error_code  	CFI_ENDPROC @@ -966,6 +984,7 @@ END(alignment_check)  ENTRY(divide_error)  	RING0_INT_FRAME +	ASM_CLAC  	pushl_cfi $0			# no error code  	pushl_cfi $do_divide_error  	jmp error_code @@ -975,6 +994,7 @@ END(divide_error)  #ifdef CONFIG_X86_MCE  ENTRY(machine_check)  	RING0_INT_FRAME +	ASM_CLAC  	pushl_cfi $0  	pushl_cfi machine_check_vector  	jmp error_code @@ -984,6 +1004,7 @@ END(machine_check)  ENTRY(spurious_interrupt_bug)  	RING0_INT_FRAME +	ASM_CLAC  	pushl_cfi $0  	pushl_cfi $do_spurious_interrupt_bug  	jmp error_code @@ -1109,17 +1130,21 @@ ENTRY(ftrace_caller)  	pushl %eax  	pushl %ecx  	pushl %edx -	movl 0xc(%esp), %eax +	pushl $0	/* Pass NULL as regs pointer */ +	movl 
4*4(%esp), %eax  	movl 0x4(%ebp), %edx +	leal function_trace_op, %ecx  	subl $MCOUNT_INSN_SIZE, %eax  .globl ftrace_call  ftrace_call:  	call ftrace_stub +	addl $4,%esp	/* skip NULL pointer */  	popl %edx  	popl %ecx  	popl %eax +ftrace_ret:  #ifdef CONFIG_FUNCTION_GRAPH_TRACER  .globl ftrace_graph_call  ftrace_graph_call: @@ -1131,6 +1156,71 @@ ftrace_stub:  	ret  END(ftrace_caller) +ENTRY(ftrace_regs_caller) +	pushf	/* push flags before compare (in cs location) */ +	cmpl $0, function_trace_stop +	jne ftrace_restore_flags + +	/* +	 * i386 does not save SS and ESP when coming from kernel. +	 * Instead, to get sp, &regs->sp is used (see ptrace.h). +	 * Unfortunately, that means eflags must be at the same location +	 * as the current return ip is. We move the return ip into the +	 * ip location, and move flags into the return ip location. +	 */ +	pushl 4(%esp)	/* save return ip into ip slot */ + +	pushl $0	/* Load 0 into orig_ax */ +	pushl %gs +	pushl %fs +	pushl %es +	pushl %ds +	pushl %eax +	pushl %ebp +	pushl %edi +	pushl %esi +	pushl %edx +	pushl %ecx +	pushl %ebx + +	movl 13*4(%esp), %eax	/* Get the saved flags */ +	movl %eax, 14*4(%esp)	/* Move saved flags into regs->flags location */ +				/* clobbering return ip */ +	movl $__KERNEL_CS,13*4(%esp) + +	movl 12*4(%esp), %eax	/* Load ip (1st parameter) */ +	subl $MCOUNT_INSN_SIZE, %eax	/* Adjust ip */ +	movl 0x4(%ebp), %edx	/* Load parent ip (2nd parameter) */ +	leal function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ +	pushl %esp		/* Save pt_regs as 4th parameter */ + +GLOBAL(ftrace_regs_call) +	call ftrace_stub + +	addl $4, %esp		/* Skip pt_regs */ +	movl 14*4(%esp), %eax	/* Move flags back into cs */ +	movl %eax, 13*4(%esp)	/* Needed to keep addl from modifying flags */ +	movl 12*4(%esp), %eax	/* Get return ip from regs->ip */ +	movl %eax, 14*4(%esp)	/* Put return ip back for ret */ + +	popl %ebx +	popl %ecx +	popl %edx +	popl %esi +	popl %edi +	popl %ebp +	popl %eax +	popl %ds +	popl %es +	popl %fs +	popl %gs +	addl $8, %esp		/* Skip orig_ax and ip */ +	popf			/* Pop flags at end (no addl to corrupt flags) */ +	jmp ftrace_ret + +ftrace_restore_flags: +	popf +	jmp  ftrace_stub  #else /* !
CONFIG_DYNAMIC_FTRACE */  ENTRY(mcount) @@ -1171,9 +1261,6 @@ END(mcount)  #ifdef CONFIG_FUNCTION_GRAPH_TRACER  ENTRY(ftrace_graph_caller) -	cmpl $0, function_trace_stop -	jne ftrace_stub -  	pushl %eax  	pushl %ecx  	pushl %edx @@ -1207,6 +1294,7 @@ return_to_handler:  ENTRY(page_fault)  	RING0_EC_FRAME +	ASM_CLAC  	pushl_cfi $do_page_fault  	ALIGN  error_code: @@ -1279,6 +1367,7 @@ END(page_fault)  ENTRY(debug)  	RING0_INT_FRAME +	ASM_CLAC  	cmpl $ia32_sysenter_target,(%esp)  	jne debug_stack_correct  	FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn @@ -1303,6 +1392,7 @@ END(debug)   */  ENTRY(nmi)  	RING0_INT_FRAME +	ASM_CLAC  	pushl_cfi %eax  	movl %ss, %eax  	cmpw $__ESPFIX_SS, %ax @@ -1373,6 +1463,7 @@ END(nmi)  ENTRY(int3)  	RING0_INT_FRAME +	ASM_CLAC  	pushl_cfi $-1			# mark this as an int  	SAVE_ALL  	TRACE_IRQS_OFF @@ -1393,6 +1484,7 @@ END(general_protection)  #ifdef CONFIG_KVM_GUEST  ENTRY(async_page_fault)  	RING0_EC_FRAME +	ASM_CLAC  	pushl_cfi $do_async_page_fault  	jmp error_code  	CFI_ENDPROC diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 69babd8c834..44531acd9a8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -56,6 +56,8 @@  #include <asm/ftrace.h>  #include <asm/percpu.h>  #include <asm/asm.h> +#include <asm/rcu.h> +#include <asm/smap.h>  #include <linux/err.h>  /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */ @@ -68,25 +70,51 @@  	.section .entry.text, "ax"  #ifdef CONFIG_FUNCTION_TRACER + +#ifdef CC_USING_FENTRY +# define function_hook	__fentry__ +#else +# define function_hook	mcount +#endif +  #ifdef CONFIG_DYNAMIC_FTRACE -ENTRY(mcount) + +ENTRY(function_hook)  	retq -END(mcount) +END(function_hook) + +/* skip is set if stack has been adjusted */ +.macro ftrace_caller_setup skip=0 +	MCOUNT_SAVE_FRAME \skip + +	/* Load the ftrace_ops into the 3rd parameter */ +	leaq function_trace_op, %rdx + +	/* Load ip into the first parameter */ +	movq RIP(%rsp), %rdi +	subq $MCOUNT_INSN_SIZE, %rdi +	/* Load the parent_ip into the second parameter */ +#ifdef CC_USING_FENTRY +	movq SS+16(%rsp), %rsi +#else +	movq 8(%rbp), %rsi +#endif +.endm  ENTRY(ftrace_caller) +	/* Check if tracing was disabled (quick check) */  	cmpl $0, function_trace_stop  	jne  ftrace_stub -	MCOUNT_SAVE_FRAME - -	movq 0x38(%rsp), %rdi -	movq 8(%rbp), %rsi -	subq $MCOUNT_INSN_SIZE, %rdi +	ftrace_caller_setup +	/* regs go into 4th parameter (but make it NULL) */ +	movq $0, %rcx  GLOBAL(ftrace_call)  	call ftrace_stub  	MCOUNT_RESTORE_FRAME +ftrace_return:  #ifdef CONFIG_FUNCTION_GRAPH_TRACER  GLOBAL(ftrace_graph_call) @@ -97,8 +125,78 @@ GLOBAL(ftrace_stub)  	retq  END(ftrace_caller) +ENTRY(ftrace_regs_caller) +	/* Save the current flags before compare (in SS location)*/ +	pushfq + +	/* Check if tracing was disabled (quick check) */ +	cmpl $0, function_trace_stop +	jne  ftrace_restore_flags + +	/* skip=8 to skip flags saved in SS */ +	ftrace_caller_setup 8 + +	/* Save the rest of pt_regs */ +	movq %r15, R15(%rsp) +	movq %r14, R14(%rsp) +	movq %r13, R13(%rsp) +	movq %r12, R12(%rsp) +	movq %r11, R11(%rsp) +	movq %r10, R10(%rsp) +	movq %rbp, RBP(%rsp) +	movq %rbx, RBX(%rsp) +	/* Copy saved flags */ +	movq SS(%rsp), %rcx +	movq %rcx, EFLAGS(%rsp) +	/* Kernel segments */ +	movq $__KERNEL_DS, %rcx +	movq %rcx, SS(%rsp) +	movq $__KERNEL_CS, %rcx +	movq %rcx, CS(%rsp) +	/* Stack - skipping return address */ +	leaq SS+16(%rsp), %rcx +	movq %rcx, RSP(%rsp) + +	/* regs go into 4th parameter */ +	leaq (%rsp), %rcx + +GLOBAL(ftrace_regs_call) +	
call ftrace_stub + +	/* Copy flags back to SS, to restore them */ +	movq EFLAGS(%rsp), %rax +	movq %rax, SS(%rsp) + +	/* Handlers can change the RIP */ +	movq RIP(%rsp), %rax +	movq %rax, SS+8(%rsp) + +	/* restore the rest of pt_regs */ +	movq R15(%rsp), %r15 +	movq R14(%rsp), %r14 +	movq R13(%rsp), %r13 +	movq R12(%rsp), %r12 +	movq R10(%rsp), %r10 +	movq RBP(%rsp), %rbp +	movq RBX(%rsp), %rbx + +	/* skip=8 to skip flags saved in SS */ +	MCOUNT_RESTORE_FRAME 8 + +	/* Restore flags */ +	popfq + +	jmp ftrace_return +ftrace_restore_flags: +	popfq +	jmp  ftrace_stub + +END(ftrace_regs_caller) + +  #else /* ! CONFIG_DYNAMIC_FTRACE */ -ENTRY(mcount) + +ENTRY(function_hook)  	cmpl $0, function_trace_stop  	jne  ftrace_stub @@ -119,8 +217,12 @@ GLOBAL(ftrace_stub)  trace:  	MCOUNT_SAVE_FRAME -	movq 0x38(%rsp), %rdi +	movq RIP(%rsp), %rdi +#ifdef CC_USING_FENTRY +	movq SS+16(%rsp), %rsi +#else  	movq 8(%rbp), %rsi +#endif  	subq $MCOUNT_INSN_SIZE, %rdi  	call   *ftrace_trace_function @@ -128,20 +230,22 @@ trace:  	MCOUNT_RESTORE_FRAME  	jmp ftrace_stub -END(mcount) +END(function_hook)  #endif /* CONFIG_DYNAMIC_FTRACE */  #endif /* CONFIG_FUNCTION_TRACER */  #ifdef CONFIG_FUNCTION_GRAPH_TRACER  ENTRY(ftrace_graph_caller) -	cmpl $0, function_trace_stop -	jne ftrace_stub -  	MCOUNT_SAVE_FRAME +#ifdef CC_USING_FENTRY +	leaq SS+16(%rsp), %rdi +	movq $0, %rdx	/* No framepointers needed */ +#else  	leaq 8(%rbp), %rdi -	movq 0x38(%rsp), %rsi  	movq (%rbp), %rdx +#endif +	movq RIP(%rsp), %rsi  	subq $MCOUNT_INSN_SIZE, %rsi  	call	prepare_ftrace_return @@ -342,15 +446,15 @@ ENDPROC(native_usergs_sysret64)  	.macro SAVE_ARGS_IRQ  	cld  	/* start from rbp in pt_regs and jump over */ -	movq_cfi rdi, RDI-RBP -	movq_cfi rsi, RSI-RBP -	movq_cfi rdx, RDX-RBP -	movq_cfi rcx, RCX-RBP -	movq_cfi rax, RAX-RBP -	movq_cfi  r8,  R8-RBP -	movq_cfi  r9,  R9-RBP -	movq_cfi r10, R10-RBP -	movq_cfi r11, R11-RBP +	movq_cfi rdi, (RDI-RBP) +	movq_cfi rsi, (RSI-RBP) +	movq_cfi rdx, (RDX-RBP) +	movq_cfi rcx, (RCX-RBP) +	movq_cfi rax, (RAX-RBP) +	movq_cfi  r8,  (R8-RBP) +	movq_cfi  r9,  (R9-RBP) +	movq_cfi r10, (R10-RBP) +	movq_cfi r11, (R11-RBP)  	/* Save rbp so that we can unwind from get_irq_regs() */  	movq_cfi rbp, 0 @@ -384,7 +488,7 @@ ENDPROC(native_usergs_sysret64)  	.endm  ENTRY(save_rest) -	PARTIAL_FRAME 1 REST_SKIP+8 +	PARTIAL_FRAME 1 (REST_SKIP+8)  	movq 5*8+16(%rsp), %r11	/* save return address */  	movq_cfi rbx, RBX+16  	movq_cfi rbp, RBP+16 @@ -440,7 +544,7 @@ ENTRY(ret_from_fork)  	LOCK ; btr $TIF_FORK,TI_flags(%r8) -	pushq_cfi kernel_eflags(%rip) +	pushq_cfi $0x0002  	popfq_cfi				# reset kernel eflags  	call schedule_tail			# rdi: 'prev' task parameter @@ -465,7 +569,8 @@ END(ret_from_fork)   * System call entry. Up to 6 arguments in registers are supported.   *   * SYSCALL does not save anything on the stack and does not change the - * stack pointer. + * stack pointer.  However, it does mask the flags register for us, so + * CLD and CLAC are not needed.   
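Because syscall_init() (updated earlier in this diff to include X86_EFLAGS_AC in MSR_SYSCALL_MASK) makes the SYSCALL instruction itself clear AC on entry, the 64-bit fast path needs no explicit ASM_CLAC. A trivial illustration of the resulting mask value (flag constants as architecturally defined; not part of the patch):

/* Illustrative only: the EFLAGS bits cleared by SYSCALL once AC is
 * added to MSR_SYSCALL_MASK. */
#include <stdio.h>

#define X86_EFLAGS_TF	0x00000100	/* trap flag */
#define X86_EFLAGS_IF	0x00000200	/* interrupt enable */
#define X86_EFLAGS_DF	0x00000400	/* direction flag */
#define X86_EFLAGS_IOPL	0x00003000	/* I/O privilege level */
#define X86_EFLAGS_AC	0x00040000	/* alignment check / SMAP access flag */

int main(void)
{
	unsigned long mask = X86_EFLAGS_TF | X86_EFLAGS_DF | X86_EFLAGS_IF |
			     X86_EFLAGS_IOPL | X86_EFLAGS_AC;

	printf("MSR_SYSCALL_MASK: %#lx\n", mask);	/* 0x43700 */
	return 0;
}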
*/  /* @@ -565,7 +670,7 @@ sysret_careful:  	TRACE_IRQS_ON  	ENABLE_INTERRUPTS(CLBR_NONE)  	pushq_cfi %rdi -	call schedule +	SCHEDULE_USER  	popq_cfi %rdi  	jmp sysret_check @@ -678,7 +783,7 @@ int_careful:  	TRACE_IRQS_ON  	ENABLE_INTERRUPTS(CLBR_NONE)  	pushq_cfi %rdi -	call schedule +	SCHEDULE_USER  	popq_cfi %rdi  	DISABLE_INTERRUPTS(CLBR_NONE)  	TRACE_IRQS_OFF @@ -884,6 +989,7 @@ END(interrupt)  	 */  	.p2align CONFIG_X86_L1_CACHE_SHIFT  common_interrupt: +	ASM_CLAC  	XCPT_FRAME  	addq $-0x80,(%rsp)		/* Adjust vector to [-256,-1] range */  	interrupt do_IRQ @@ -974,7 +1080,7 @@ retint_careful:  	TRACE_IRQS_ON  	ENABLE_INTERRUPTS(CLBR_NONE)  	pushq_cfi %rdi -	call  schedule +	SCHEDULE_USER  	popq_cfi %rdi  	GET_THREAD_INFO(%rcx)  	DISABLE_INTERRUPTS(CLBR_NONE) @@ -1023,6 +1129,7 @@ END(common_interrupt)   */  .macro apicinterrupt num sym do_sym  ENTRY(\sym) +	ASM_CLAC  	INTR_FRAME  	pushq_cfi $~(\num)  .Lcommon_\sym: @@ -1077,6 +1184,7 @@ apicinterrupt IRQ_WORK_VECTOR \   */  .macro zeroentry sym do_sym  ENTRY(\sym) +	ASM_CLAC  	INTR_FRAME  	PARAVIRT_ADJUST_EXCEPTION_FRAME  	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */ @@ -1094,6 +1202,7 @@ END(\sym)  .macro paranoidzeroentry sym do_sym  ENTRY(\sym) +	ASM_CLAC  	INTR_FRAME  	PARAVIRT_ADJUST_EXCEPTION_FRAME  	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */ @@ -1112,6 +1221,7 @@ END(\sym)  #define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)  .macro paranoidzeroentry_ist sym do_sym ist  ENTRY(\sym) +	ASM_CLAC  	INTR_FRAME  	PARAVIRT_ADJUST_EXCEPTION_FRAME  	pushq_cfi $-1		/* ORIG_RAX: no syscall to restart */ @@ -1131,6 +1241,7 @@ END(\sym)  .macro errorentry sym do_sym  ENTRY(\sym) +	ASM_CLAC  	XCPT_FRAME  	PARAVIRT_ADJUST_EXCEPTION_FRAME  	subq $ORIG_RAX-R15, %rsp @@ -1149,6 +1260,7 @@ END(\sym)  	/* error code is on the stack already */  .macro paranoiderrorentry sym do_sym  ENTRY(\sym) +	ASM_CLAC  	XCPT_FRAME  	PARAVIRT_ADJUST_EXCEPTION_FRAME  	subq $ORIG_RAX-R15, %rsp @@ -1449,7 +1561,7 @@ paranoid_userspace:  paranoid_schedule:  	TRACE_IRQS_ON  	ENABLE_INTERRUPTS(CLBR_ANY) -	call schedule +	SCHEDULE_USER  	DISABLE_INTERRUPTS(CLBR_ANY)  	TRACE_IRQS_OFF  	jmp paranoid_userspace diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index c3a7cb4bf6e..1d414029f1d 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -206,6 +206,21 @@ static int  ftrace_modify_code(unsigned long ip, unsigned const char *old_code,  		   unsigned const char *new_code); +/* + * Should never be called: + *  As it is only called by __ftrace_replace_code() which is called by + *  ftrace_replace_code() that x86 overrides, and by ftrace_update_code() + *  which is called to turn mcount into nops or nops into function calls + *  but not to convert a function from not using regs to one that uses + *  regs, which ftrace_modify_call() is for. 
+ */ +int ftrace_modify_call(struct dyn_ftrace *rec, unsigned long old_addr, +				 unsigned long addr) +{ +	WARN_ON(1); +	return -EINVAL; +} +  int ftrace_update_ftrace_func(ftrace_func_t func)  {  	unsigned long ip = (unsigned long)(&ftrace_call); @@ -220,6 +235,14 @@ int ftrace_update_ftrace_func(ftrace_func_t func)  	ret = ftrace_modify_code(ip, old, new); +	/* Also update the regs callback function */ +	if (!ret) { +		ip = (unsigned long)(&ftrace_regs_call); +		memcpy(old, &ftrace_regs_call, MCOUNT_INSN_SIZE); +		new = ftrace_call_replace(ip, (unsigned long)func); +		ret = ftrace_modify_code(ip, old, new); +	} +  	atomic_dec(&modifying_ftrace_code);  	return ret; @@ -299,6 +322,32 @@ static int add_brk_on_nop(struct dyn_ftrace *rec)  	return add_break(rec->ip, old);  } +/* + * If the record has the FTRACE_FL_REGS set, that means that it + * wants to convert to a callback that saves all regs. If FTRACE_FL_REGS + * is not not set, then it wants to convert to the normal callback. + */ +static unsigned long get_ftrace_addr(struct dyn_ftrace *rec) +{ +	if (rec->flags & FTRACE_FL_REGS) +		return (unsigned long)FTRACE_REGS_ADDR; +	else +		return (unsigned long)FTRACE_ADDR; +} + +/* + * The FTRACE_FL_REGS_EN is set when the record already points to + * a function that saves all the regs. Basically the '_EN' version + * represents the current state of the function. + */ +static unsigned long get_ftrace_old_addr(struct dyn_ftrace *rec) +{ +	if (rec->flags & FTRACE_FL_REGS_EN) +		return (unsigned long)FTRACE_REGS_ADDR; +	else +		return (unsigned long)FTRACE_ADDR; +} +  static int add_breakpoints(struct dyn_ftrace *rec, int enable)  {  	unsigned long ftrace_addr; @@ -306,7 +355,7 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable)  	ret = ftrace_test_record(rec, enable); -	ftrace_addr = (unsigned long)FTRACE_ADDR; +	ftrace_addr = get_ftrace_addr(rec);  	switch (ret) {  	case FTRACE_UPDATE_IGNORE: @@ -316,6 +365,10 @@ static int add_breakpoints(struct dyn_ftrace *rec, int enable)  		/* converting nop to call */  		return add_brk_on_nop(rec); +	case FTRACE_UPDATE_MODIFY_CALL_REGS: +	case FTRACE_UPDATE_MODIFY_CALL: +		ftrace_addr = get_ftrace_old_addr(rec); +		/* fall through */  	case FTRACE_UPDATE_MAKE_NOP:  		/* converting a call to a nop */  		return add_brk_on_call(rec, ftrace_addr); @@ -360,13 +413,21 @@ static int remove_breakpoint(struct dyn_ftrace *rec)  		 * If not, don't touch the breakpoint, we make just create  		 * a disaster.  		 
*/ -		ftrace_addr = (unsigned long)FTRACE_ADDR; +		ftrace_addr = get_ftrace_addr(rec); +		nop = ftrace_call_replace(ip, ftrace_addr); + +		if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) == 0) +			goto update; + +		/* Check both ftrace_addr and ftrace_old_addr */ +		ftrace_addr = get_ftrace_old_addr(rec);  		nop = ftrace_call_replace(ip, ftrace_addr);  		if (memcmp(&ins[1], &nop[1], MCOUNT_INSN_SIZE - 1) != 0)  			return -EINVAL;  	} + update:  	return probe_kernel_write((void *)ip, &nop[0], 1);  } @@ -405,12 +466,14 @@ static int add_update(struct dyn_ftrace *rec, int enable)  	ret = ftrace_test_record(rec, enable); -	ftrace_addr = (unsigned long)FTRACE_ADDR; +	ftrace_addr  = get_ftrace_addr(rec);  	switch (ret) {  	case FTRACE_UPDATE_IGNORE:  		return 0; +	case FTRACE_UPDATE_MODIFY_CALL_REGS: +	case FTRACE_UPDATE_MODIFY_CALL:  	case FTRACE_UPDATE_MAKE_CALL:  		/* converting nop to call */  		return add_update_call(rec, ftrace_addr); @@ -455,12 +518,14 @@ static int finish_update(struct dyn_ftrace *rec, int enable)  	ret = ftrace_update_record(rec, enable); -	ftrace_addr = (unsigned long)FTRACE_ADDR; +	ftrace_addr = get_ftrace_addr(rec);  	switch (ret) {  	case FTRACE_UPDATE_IGNORE:  		return 0; +	case FTRACE_UPDATE_MODIFY_CALL_REGS: +	case FTRACE_UPDATE_MODIFY_CALL:  	case FTRACE_UPDATE_MAKE_CALL:  		/* converting nop to call */  		return finish_update_call(rec, ftrace_addr); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index d42ab17b739..957a47aec64 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -287,27 +287,28 @@ ENTRY(startup_32_smp)  	leal -__PAGE_OFFSET(%ecx),%esp  default_entry: -  /*   *	New page tables may be in 4Mbyte page mode and may   *	be using the global pages.    *   *	NOTE! If we are on a 486 we may have no cr4 at all! - *	So we do not try to touch it unless we really have - *	some bits in it to set.  This won't work if the BSP - *	implements cr4 but this AP does not -- very unlikely - *	but be warned!  The same applies to the pse feature - *	if not equally supported. --macro - * - *	NOTE! We have to correct for the fact that we're - *	not yet offset PAGE_OFFSET.. + *	Specifically, cr4 exists if and only if CPUID exists, + *	which in turn exists if and only if EFLAGS.ID exists.   */ -#define cr4_bits pa(mmu_cr4_features) -	movl cr4_bits,%edx -	andl %edx,%edx -	jz 6f -	movl %cr4,%eax		# Turn on paging options (PSE,PAE,..) -	orl %edx,%eax +	movl $X86_EFLAGS_ID,%ecx +	pushl %ecx +	popfl +	pushfl +	popl %eax +	pushl $0 +	popfl +	pushfl +	popl %edx +	xorl %edx,%eax +	testl %ecx,%eax +	jz 6f			# No ID flag = no CPUID = no CR4 + +	movl pa(mmu_cr4_features),%eax  	movl %eax,%cr4  	testb $X86_CR4_PAE, %al		# check if PAE is enabled diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index f250431fb50..675a0501244 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -19,24 +19,17 @@  #include <asm/fpu-internal.h>  #include <asm/user.h> -#ifdef CONFIG_X86_64 -# include <asm/sigcontext32.h> -# include <asm/user32.h> -#else -# define save_i387_xstate_ia32		save_i387_xstate -# define restore_i387_xstate_ia32	restore_i387_xstate -# define _fpstate_ia32		_fpstate -# define _xstate_ia32		_xstate -# define sig_xstate_ia32_size   sig_xstate_size -# define fx_sw_reserved_ia32	fx_sw_reserved -# define user_i387_ia32_struct	user_i387_struct -# define user32_fxsr_struct	user_fxsr_struct -#endif -  /*   * Were we in an interrupt that interrupted kernel mode?   
* - * We can do a kernel_fpu_begin/end() pair *ONLY* if that + * For now, with eagerfpu we will return interrupted kernel FPU + * state as not-idle. TBD: Ideally we can change the return value + * to something like __thread_has_fpu(current). But we need to + * be careful of doing __thread_clear_has_fpu() before saving + * the FPU etc for supporting nested uses etc. For now, take + * the simple route! + * + * On others, we can do a kernel_fpu_begin/end() pair *ONLY* if that   * pair does nothing at all: the thread must not have fpu (so   * that we don't try to save the FPU state), and TS must   * be set (so that the clts/stts pair does nothing that is @@ -44,6 +37,9 @@   */  static inline bool interrupted_kernel_fpu_idle(void)  { +	if (use_eager_fpu()) +		return 0; +  	return !__thread_has_fpu(current) &&  		(read_cr0() & X86_CR0_TS);  } @@ -77,29 +73,29 @@ bool irq_fpu_usable(void)  }  EXPORT_SYMBOL(irq_fpu_usable); -void kernel_fpu_begin(void) +void __kernel_fpu_begin(void)  {  	struct task_struct *me = current; -	WARN_ON_ONCE(!irq_fpu_usable()); -	preempt_disable();  	if (__thread_has_fpu(me)) {  		__save_init_fpu(me);  		__thread_clear_has_fpu(me); -		/* We do 'stts()' in kernel_fpu_end() */ -	} else { +		/* We do 'stts()' in __kernel_fpu_end() */ +	} else if (!use_eager_fpu()) {  		this_cpu_write(fpu_owner_task, NULL);  		clts();  	}  } -EXPORT_SYMBOL(kernel_fpu_begin); +EXPORT_SYMBOL(__kernel_fpu_begin); -void kernel_fpu_end(void) +void __kernel_fpu_end(void)  { -	stts(); -	preempt_enable(); +	if (use_eager_fpu()) +		math_state_restore(); +	else +		stts();  } -EXPORT_SYMBOL(kernel_fpu_end); +EXPORT_SYMBOL(__kernel_fpu_end);  void unlazy_fpu(struct task_struct *tsk)  { @@ -113,23 +109,15 @@ void unlazy_fpu(struct task_struct *tsk)  }  EXPORT_SYMBOL(unlazy_fpu); -#ifdef CONFIG_MATH_EMULATION -# define HAVE_HWFP		(boot_cpu_data.hard_math) -#else -# define HAVE_HWFP		1 -#endif - -static unsigned int		mxcsr_feature_mask __read_mostly = 0xffffffffu; +unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu;  unsigned int xstate_size;  EXPORT_SYMBOL_GPL(xstate_size); -unsigned int sig_xstate_ia32_size = sizeof(struct _fpstate_ia32);  static struct i387_fxsave_struct fx_scratch __cpuinitdata;  static void __cpuinit mxcsr_feature_mask_init(void)  {  	unsigned long mask = 0; -	clts();  	if (cpu_has_fxsr) {  		memset(&fx_scratch, 0, sizeof(struct i387_fxsave_struct));  		asm volatile("fxsave %0" : : "m" (fx_scratch)); @@ -138,7 +126,6 @@ static void __cpuinit mxcsr_feature_mask_init(void)  			mask = 0x0000ffbf;  	}  	mxcsr_feature_mask &= mask; -	stts();  }  static void __cpuinit init_thread_xstate(void) @@ -192,9 +179,8 @@ void __cpuinit fpu_init(void)  		init_thread_xstate();  	mxcsr_feature_mask_init(); -	/* clean state in init */ -	current_thread_info()->status = 0; -	clear_used_math(); +	xsave_init(); +	eager_fpu_init();  }  void fpu_finit(struct fpu *fpu) @@ -205,12 +191,7 @@ void fpu_finit(struct fpu *fpu)  	}  	if (cpu_has_fxsr) { -		struct i387_fxsave_struct *fx = &fpu->state->fxsave; - -		memset(fx, 0, xstate_size); -		fx->cwd = 0x37f; -		if (cpu_has_xmm) -			fx->mxcsr = MXCSR_DEFAULT; +		fx_finit(&fpu->state->fxsave);  	} else {  		struct i387_fsave_struct *fp = &fpu->state->fsave;  		memset(fp, 0, xstate_size); @@ -454,7 +435,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)   * FXSR floating point environment conversions.   
*/ -static void +void  convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)  {  	struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; @@ -491,8 +472,8 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)  		memcpy(&to[i], &from[i], sizeof(to[0]));  } -static void convert_to_fxsr(struct task_struct *tsk, -			    const struct user_i387_ia32_struct *env) +void convert_to_fxsr(struct task_struct *tsk, +		     const struct user_i387_ia32_struct *env)  {  	struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; @@ -589,223 +570,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,  }  /* - * Signal frame handlers. - */ - -static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) -{ -	struct task_struct *tsk = current; -	struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave; - -	fp->status = fp->swd; -	if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) -		return -1; -	return 1; -} - -static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) -{ -	struct task_struct *tsk = current; -	struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; -	struct user_i387_ia32_struct env; -	int err = 0; - -	convert_from_fxsr(&env, tsk); -	if (__copy_to_user(buf, &env, sizeof(env))) -		return -1; - -	err |= __put_user(fx->swd, &buf->status); -	err |= __put_user(X86_FXSR_MAGIC, &buf->magic); -	if (err) -		return -1; - -	if (__copy_to_user(&buf->_fxsr_env[0], fx, xstate_size)) -		return -1; -	return 1; -} - -static int save_i387_xsave(void __user *buf) -{ -	struct task_struct *tsk = current; -	struct _fpstate_ia32 __user *fx = buf; -	int err = 0; - - -	sanitize_i387_state(tsk); - -	/* -	 * For legacy compatible, we always set FP/SSE bits in the bit -	 * vector while saving the state to the user context. -	 * This will enable us capturing any changes(during sigreturn) to -	 * the FP/SSE bits by the legacy applications which don't touch -	 * xstate_bv in the xsave header. -	 * -	 * xsave aware applications can change the xstate_bv in the xsave -	 * header as well as change any contents in the memory layout. -	 * xrestore as part of sigreturn will capture all the changes. -	 */ -	tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; - -	if (save_i387_fxsave(fx) < 0) -		return -1; - -	err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved_ia32, -			     sizeof(struct _fpx_sw_bytes)); -	err |= __put_user(FP_XSTATE_MAGIC2, -			  (__u32 __user *) (buf + sig_xstate_ia32_size -					    - FP_XSTATE_MAGIC2_SIZE)); -	if (err) -		return -1; - -	return 1; -} - -int save_i387_xstate_ia32(void __user *buf) -{ -	struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf; -	struct task_struct *tsk = current; - -	if (!used_math()) -		return 0; - -	if (!access_ok(VERIFY_WRITE, buf, sig_xstate_ia32_size)) -		return -EACCES; -	/* -	 * This will cause a "finit" to be triggered by the next -	 * attempted FPU operation by the 'current' process. -	 */ -	clear_used_math(); - -	if (!HAVE_HWFP) { -		return fpregs_soft_get(current, NULL, -				       0, sizeof(struct user_i387_ia32_struct), -				       NULL, fp) ? 
-1 : 1; -	} - -	unlazy_fpu(tsk); - -	if (cpu_has_xsave) -		return save_i387_xsave(fp); -	if (cpu_has_fxsr) -		return save_i387_fxsave(fp); -	else -		return save_i387_fsave(fp); -} - -static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) -{ -	struct task_struct *tsk = current; - -	return __copy_from_user(&tsk->thread.fpu.state->fsave, buf, -				sizeof(struct i387_fsave_struct)); -} - -static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf, -			       unsigned int size) -{ -	struct task_struct *tsk = current; -	struct user_i387_ia32_struct env; -	int err; - -	err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0], -			       size); -	/* mxcsr reserved bits must be masked to zero for security reasons */ -	tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; -	if (err || __copy_from_user(&env, buf, sizeof(env))) -		return 1; -	convert_to_fxsr(tsk, &env); - -	return 0; -} - -static int restore_i387_xsave(void __user *buf) -{ -	struct _fpx_sw_bytes fx_sw_user; -	struct _fpstate_ia32 __user *fx_user = -			((struct _fpstate_ia32 __user *) buf); -	struct i387_fxsave_struct __user *fx = -		(struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0]; -	struct xsave_hdr_struct *xsave_hdr = -				¤t->thread.fpu.state->xsave.xsave_hdr; -	u64 mask; -	int err; - -	if (check_for_xstate(fx, buf, &fx_sw_user)) -		goto fx_only; - -	mask = fx_sw_user.xstate_bv; - -	err = restore_i387_fxsave(buf, fx_sw_user.xstate_size); - -	xsave_hdr->xstate_bv &= pcntxt_mask; -	/* -	 * These bits must be zero. -	 */ -	xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; - -	/* -	 * Init the state that is not present in the memory layout -	 * and enabled by the OS. -	 */ -	mask = ~(pcntxt_mask & ~mask); -	xsave_hdr->xstate_bv &= mask; - -	return err; -fx_only: -	/* -	 * Couldn't find the extended state information in the memory -	 * layout. Restore the FP/SSE and init the other extended state -	 * enabled by the OS. -	 */ -	xsave_hdr->xstate_bv = XSTATE_FPSSE; -	return restore_i387_fxsave(buf, sizeof(struct i387_fxsave_struct)); -} - -int restore_i387_xstate_ia32(void __user *buf) -{ -	int err; -	struct task_struct *tsk = current; -	struct _fpstate_ia32 __user *fp = (struct _fpstate_ia32 __user *) buf; - -	if (HAVE_HWFP) -		clear_fpu(tsk); - -	if (!buf) { -		if (used_math()) { -			clear_fpu(tsk); -			clear_used_math(); -		} - -		return 0; -	} else -		if (!access_ok(VERIFY_READ, buf, sig_xstate_ia32_size)) -			return -EACCES; - -	if (!used_math()) { -		err = init_fpu(tsk); -		if (err) -			return err; -	} - -	if (HAVE_HWFP) { -		if (cpu_has_xsave) -			err = restore_i387_xsave(buf); -		else if (cpu_has_fxsr) -			err = restore_i387_fxsave(fp, sizeof(struct -							   i387_fxsave_struct)); -		else -			err = restore_i387_fsave(fp); -	} else { -		err = fpregs_soft_set(current, NULL, -				      0, sizeof(struct user_i387_ia32_struct), -				      NULL, fp) != 0; -	} -	set_used_math(); - -	return err; -} - -/*   * FPU state for core dumps.   * This is only used for a.out dumps now.   * It is declared generically using elf_fpregset_t (which is diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 36d1853e91a..9a5c460404d 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -263,7 +263,7 @@ static void i8259A_shutdown(void)  	 * out of.  	 
*/  	outb(0xff, PIC_MASTER_IMR);	/* mask all of 8259A-1 */ -	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-1 */ +	outb(0xff, PIC_SLAVE_IMR);	/* mask all of 8259A-2 */  }  static struct syscore_ops i8259_syscore_ops = { diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d44f7829968..e4595f10591 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -92,7 +92,8 @@ int arch_show_interrupts(struct seq_file *p, int prec)  	seq_printf(p, "  Rescheduling interrupts\n");  	seq_printf(p, "%*s: ", prec, "CAL");  	for_each_online_cpu(j) -		seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); +		seq_printf(p, "%10u ", irq_stats(j)->irq_call_count - +					irq_stats(j)->irq_tlb_count);  	seq_printf(p, "  Function call interrupts\n");  	seq_printf(p, "%*s: ", prec, "TLB");  	for_each_online_cpu(j) @@ -147,7 +148,6 @@ u64 arch_irq_stat_cpu(unsigned int cpu)  #ifdef CONFIG_SMP  	sum += irq_stats(cpu)->irq_resched_count;  	sum += irq_stats(cpu)->irq_call_count; -	sum += irq_stats(cpu)->irq_tlb_count;  #endif  #ifdef CONFIG_X86_THERMAL_VECTOR  	sum += irq_stats(cpu)->irq_thermal_count; diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index e2f751efb7b..57916c0d3cf 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -541,6 +541,23 @@ reenter_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb  	return 1;  } +#ifdef KPROBES_CAN_USE_FTRACE +static void __kprobes skip_singlestep(struct kprobe *p, struct pt_regs *regs, +				      struct kprobe_ctlblk *kcb) +{ +	/* +	 * Emulate singlestep (and also recover regs->ip) +	 * as if there is a 5byte nop +	 */ +	regs->ip = (unsigned long)p->addr + MCOUNT_INSN_SIZE; +	if (unlikely(p->post_handler)) { +		kcb->kprobe_status = KPROBE_HIT_SSDONE; +		p->post_handler(p, regs, 0); +	} +	__this_cpu_write(current_kprobe, NULL); +} +#endif +  /*   * Interrupts are disabled on entry as trap3 is an interrupt gate and they   * remain disabled throughout this function. @@ -599,6 +616,12 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)  	} else if (kprobe_running()) {  		p = __this_cpu_read(current_kprobe);  		if (p->break_handler && p->break_handler(p, regs)) { +#ifdef KPROBES_CAN_USE_FTRACE +			if (kprobe_ftrace(p)) { +				skip_singlestep(p, regs, kcb); +				return 1; +			} +#endif  			setup_singlestep(p, regs, kcb, 0);  			return 1;  		} @@ -1052,6 +1075,50 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)  	return 0;  } +#ifdef KPROBES_CAN_USE_FTRACE +/* Ftrace callback handler for kprobes */ +void __kprobes kprobe_ftrace_handler(unsigned long ip, unsigned long parent_ip, +				     struct ftrace_ops *ops, struct pt_regs *regs) +{ +	struct kprobe *p; +	struct kprobe_ctlblk *kcb; +	unsigned long flags; + +	/* Disable irq for emulating a breakpoint and avoiding preempt */ +	local_irq_save(flags); + +	p = get_kprobe((kprobe_opcode_t *)ip); +	if (unlikely(!p) || kprobe_disabled(p)) +		goto end; + +	kcb = get_kprobe_ctlblk(); +	if (kprobe_running()) { +		kprobes_inc_nmissed_count(p); +	} else { +		/* Kprobe handler expects regs->ip = ip + 1 as breakpoint hit */ +		regs->ip = ip + sizeof(kprobe_opcode_t); + +		__this_cpu_write(current_kprobe, p); +		kcb->kprobe_status = KPROBE_HIT_ACTIVE; +		if (!p->pre_handler || !p->pre_handler(p, regs)) +			skip_singlestep(p, regs, kcb); +		/* +		 * If pre_handler returns !0, it sets regs->ip and +		 * resets current kprobe. 
+		 */ +	} +end: +	local_irq_restore(flags); +} + +int __kprobes arch_prepare_kprobe_ftrace(struct kprobe *p) +{ +	p->ainsn.insn = NULL; +	p->ainsn.boostable = -1; +	return 0; +} +#endif +  int __init arch_init_kprobes(void)  {  	return arch_init_optprobes(); diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 82746f942cd..7720ff5a9ee 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -75,20 +75,113 @@ struct microcode_amd {  static struct equiv_cpu_entry *equiv_cpu_table; -/* page-sized ucode patch buffer */ -void *patch; +struct ucode_patch { +	struct list_head plist; +	void *data; +	u32 patch_id; +	u16 equiv_cpu; +}; + +static LIST_HEAD(pcache); + +static u16 find_equiv_id(unsigned int cpu) +{ +	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; +	int i = 0; + +	if (!equiv_cpu_table) +		return 0; + +	while (equiv_cpu_table[i].installed_cpu != 0) { +		if (uci->cpu_sig.sig == equiv_cpu_table[i].installed_cpu) +			return equiv_cpu_table[i].equiv_cpu; + +		i++; +	} +	return 0; +} + +static u32 find_cpu_family_by_equiv_cpu(u16 equiv_cpu) +{ +	int i = 0; + +	BUG_ON(!equiv_cpu_table); + +	while (equiv_cpu_table[i].equiv_cpu != 0) { +		if (equiv_cpu == equiv_cpu_table[i].equiv_cpu) +			return equiv_cpu_table[i].installed_cpu; +		i++; +	} +	return 0; +} + +/* + * a small, trivial cache of per-family ucode patches + */ +static struct ucode_patch *cache_find_patch(u16 equiv_cpu) +{ +	struct ucode_patch *p; + +	list_for_each_entry(p, &pcache, plist) +		if (p->equiv_cpu == equiv_cpu) +			return p; +	return NULL; +} + +static void update_cache(struct ucode_patch *new_patch) +{ +	struct ucode_patch *p; + +	list_for_each_entry(p, &pcache, plist) { +		if (p->equiv_cpu == new_patch->equiv_cpu) { +			if (p->patch_id >= new_patch->patch_id) +				/* we already have the latest patch */ +				return; + +			list_replace(&p->plist, &new_patch->plist); +			kfree(p->data); +			kfree(p); +			return; +		} +	} +	/* no patch found, add it */ +	list_add_tail(&new_patch->plist, &pcache); +} + +static void free_cache(void) +{ +	struct ucode_patch *p, *tmp; + +	list_for_each_entry_safe(p, tmp, &pcache, plist) { +		__list_del(p->plist.prev, p->plist.next); +		kfree(p->data); +		kfree(p); +	} +} + +static struct ucode_patch *find_patch(unsigned int cpu) +{ +	u16 equiv_id; + +	equiv_id = find_equiv_id(cpu); +	if (!equiv_id) +		return NULL; + +	return cache_find_patch(equiv_id); +}  static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)  {  	struct cpuinfo_x86 *c = &cpu_data(cpu); +	csig->sig = cpuid_eax(0x00000001);  	csig->rev = c->microcode;  	pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);  	return 0;  } -static unsigned int verify_ucode_size(int cpu, u32 patch_size, +static unsigned int verify_patch_size(int cpu, u32 patch_size,  				      unsigned int size)  {  	struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -118,95 +211,37 @@ static unsigned int verify_ucode_size(int cpu, u32 patch_size,  	return patch_size;  } -static u16 find_equiv_id(void) +static int apply_microcode_amd(int cpu)  { -	unsigned int current_cpu_id, i = 0; - -	BUG_ON(equiv_cpu_table == NULL); - -	current_cpu_id = cpuid_eax(0x00000001); - -	while (equiv_cpu_table[i].installed_cpu != 0) { -		if (current_cpu_id == equiv_cpu_table[i].installed_cpu) -			return equiv_cpu_table[i].equiv_cpu; - -		i++; -	} -	return 0; -} +	struct cpuinfo_x86 *c = &cpu_data(cpu); +	struct microcode_amd *mc_amd; +	struct ucode_cpu_info *uci; +	struct ucode_patch *p; +	u32 rev, dummy; -/* - * we 
signal a good patch is found by returning its size > 0 - */ -static int get_matching_microcode(int cpu, const u8 *ucode_ptr, -				  unsigned int leftover_size, int rev, -				  unsigned int *current_size) -{ -	struct microcode_header_amd *mc_hdr; -	unsigned int actual_size, patch_size; -	u16 equiv_cpu_id; +	BUG_ON(raw_smp_processor_id() != cpu); -	/* size of the current patch we're staring at */ -	patch_size = *(u32 *)(ucode_ptr + 4); -	*current_size = patch_size + SECTION_HDR_SIZE; +	uci = ucode_cpu_info + cpu; -	equiv_cpu_id = find_equiv_id(); -	if (!equiv_cpu_id) +	p = find_patch(cpu); +	if (!p)  		return 0; -	/* -	 * let's look at the patch header itself now -	 */ -	mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE); +	mc_amd  = p->data; +	uci->mc = p->data; -	if (mc_hdr->processor_rev_id != equiv_cpu_id) -		return 0; +	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); -	/* ucode might be chipset specific -- currently we don't support this */ -	if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { -		pr_err("CPU%d: chipset specific code not yet supported\n", -		       cpu); +	/* need to apply patch? */ +	if (rev >= mc_amd->hdr.patch_id) { +		c->microcode = rev;  		return 0;  	} -	if (mc_hdr->patch_id <= rev) -		return 0; - -	/* -	 * now that the header looks sane, verify its size -	 */ -	actual_size = verify_ucode_size(cpu, patch_size, leftover_size); -	if (!actual_size) -		return 0; - -	/* clear the patch buffer */ -	memset(patch, 0, PAGE_SIZE); - -	/* all looks ok, get the binary patch */ -	get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size); - -	return actual_size; -} - -static int apply_microcode_amd(int cpu) -{ -	u32 rev, dummy; -	int cpu_num = raw_smp_processor_id(); -	struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; -	struct microcode_amd *mc_amd = uci->mc; -	struct cpuinfo_x86 *c = &cpu_data(cpu); - -	/* We should bind the task to the CPU */ -	BUG_ON(cpu_num != cpu); - -	if (mc_amd == NULL) -		return 0; -  	wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); -	/* get patch id after patching */ -	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); -	/* check current patch id and patch's id for match */ +	/* verify patch application was successful */ +	rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);  	if (rev != mc_amd->hdr.patch_id) {  		pr_err("CPU%d: update failed for patch_level=0x%08x\n",  		       cpu, mc_amd->hdr.patch_id); @@ -238,7 +273,7 @@ static int install_equiv_cpu_table(const u8 *buf)  		return -ENOMEM;  	} -	get_ucode_data(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size); +	memcpy(equiv_cpu_table, buf + CONTAINER_HDR_SZ, size);  	/* add header length */  	return size + CONTAINER_HDR_SZ; @@ -250,61 +285,113 @@ static void free_equiv_cpu_table(void)  	equiv_cpu_table = NULL;  } -static enum ucode_state -generic_load_microcode(int cpu, const u8 *data, size_t size) +static void cleanup(void)  { -	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; -	struct microcode_header_amd *mc_hdr = NULL; -	unsigned int mc_size, leftover, current_size = 0; +	free_equiv_cpu_table(); +	free_cache(); +} + +/* + * We return the current size even if some of the checks failed so that + * we can skip over the next patch. If we return a negative value, we + * signal a grave error like a memory allocation has failed and the + * driver cannot continue functioning normally. In such cases, we tear + * down everything we've used up so far and exit. 
+ */ +static int verify_and_add_patch(unsigned int cpu, u8 *fw, unsigned int leftover) +{ +	struct cpuinfo_x86 *c = &cpu_data(cpu); +	struct microcode_header_amd *mc_hdr; +	struct ucode_patch *patch; +	unsigned int patch_size, crnt_size, ret; +	u32 proc_fam; +	u16 proc_id; + +	patch_size  = *(u32 *)(fw + 4); +	crnt_size   = patch_size + SECTION_HDR_SIZE; +	mc_hdr	    = (struct microcode_header_amd *)(fw + SECTION_HDR_SIZE); +	proc_id	    = mc_hdr->processor_rev_id; + +	proc_fam = find_cpu_family_by_equiv_cpu(proc_id); +	if (!proc_fam) { +		pr_err("No patch family for equiv ID: 0x%04x\n", proc_id); +		return crnt_size; +	} + +	/* check if patch is for the current family */ +	proc_fam = ((proc_fam >> 8) & 0xf) + ((proc_fam >> 20) & 0xff); +	if (proc_fam != c->x86) +		return crnt_size; + +	if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) { +		pr_err("Patch-ID 0x%08x: chipset-specific code unsupported.\n", +			mc_hdr->patch_id); +		return crnt_size; +	} + +	ret = verify_patch_size(cpu, patch_size, leftover); +	if (!ret) { +		pr_err("Patch-ID 0x%08x: size mismatch.\n", mc_hdr->patch_id); +		return crnt_size; +	} + +	patch = kzalloc(sizeof(*patch), GFP_KERNEL); +	if (!patch) { +		pr_err("Patch allocation failure.\n"); +		return -EINVAL; +	} + +	patch->data = kzalloc(patch_size, GFP_KERNEL); +	if (!patch->data) { +		pr_err("Patch data allocation failure.\n"); +		kfree(patch); +		return -EINVAL; +	} + +	/* All looks ok, copy patch... */ +	memcpy(patch->data, fw + SECTION_HDR_SIZE, patch_size); +	INIT_LIST_HEAD(&patch->plist); +	patch->patch_id  = mc_hdr->patch_id; +	patch->equiv_cpu = proc_id; + +	/* ... and add to cache. */ +	update_cache(patch); + +	return crnt_size; +} + +static enum ucode_state load_microcode_amd(int cpu, const u8 *data, size_t size) +{ +	enum ucode_state ret = UCODE_ERROR; +	unsigned int leftover; +	u8 *fw = (u8 *)data; +	int crnt_size = 0; +	int offset; -	const u8 *ucode_ptr = data; -	void *new_mc = NULL; -	unsigned int new_rev = uci->cpu_sig.rev; -	enum ucode_state state = UCODE_ERROR; -	offset = install_equiv_cpu_table(ucode_ptr); +	offset = install_equiv_cpu_table(data);  	if (offset < 0) {  		pr_err("failed to create equivalent cpu table\n"); -		goto out; +		return ret;  	} -	ucode_ptr += offset; +	fw += offset;  	leftover = size - offset; -	if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) { +	if (*(u32 *)fw != UCODE_UCODE_TYPE) {  		pr_err("invalid type field in container file section header\n"); -		goto free_table; +		free_equiv_cpu_table(); +		return ret;  	}  	while (leftover) { -		mc_size = get_matching_microcode(cpu, ucode_ptr, leftover, -						 new_rev, &current_size); -		if (mc_size) { -			mc_hdr  = patch; -			new_mc  = patch; -			new_rev = mc_hdr->patch_id; -			goto out_ok; -		} - -		ucode_ptr += current_size; -		leftover  -= current_size; -	} +		crnt_size = verify_and_add_patch(cpu, fw, leftover); +		if (crnt_size < 0) +			return ret; -	if (!new_mc) { -		state = UCODE_NFOUND; -		goto free_table; +		fw	 += crnt_size; +		leftover -= crnt_size;  	} -out_ok: -	uci->mc = new_mc; -	state = UCODE_OK; -	pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", -		 cpu, uci->cpu_sig.rev, new_rev); - -free_table: -	free_equiv_cpu_table(); - -out: -	return state; +	return UCODE_OK;  }  /* @@ -315,7 +402,7 @@ out:   *   * This legacy file is always smaller than 2K in size.
* - * Starting at family 15h they are in family specific firmware files: + * Beginning with family 15h, they are in family-specific firmware files:   *   *    amd-ucode/microcode_amd_fam15h.bin   *    amd-ucode/microcode_amd_fam16h.bin @@ -323,12 +410,17 @@ out:   *   * These might be larger than 2K.   */ -static enum ucode_state request_microcode_amd(int cpu, struct device *device) +static enum ucode_state request_microcode_amd(int cpu, struct device *device, +					      bool refresh_fw)  {  	char fw_name[36] = "amd-ucode/microcode_amd.bin"; -	const struct firmware *fw; -	enum ucode_state ret = UCODE_NFOUND;  	struct cpuinfo_x86 *c = &cpu_data(cpu); +	enum ucode_state ret = UCODE_NFOUND; +	const struct firmware *fw; + +	/* reload ucode container only on the boot cpu */ +	if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index) +		return UCODE_OK;  	if (c->x86 >= 0x15)  		snprintf(fw_name, sizeof(fw_name), "amd-ucode/microcode_amd_fam%.2xh.bin", c->x86); @@ -344,12 +436,17 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device)  		goto fw_release;  	} -	ret = generic_load_microcode(cpu, fw->data, fw->size); +	/* free old equiv table */ +	free_equiv_cpu_table(); + +	ret = load_microcode_amd(cpu, fw->data, fw->size); +	if (ret != UCODE_OK) +		cleanup(); -fw_release: + fw_release:  	release_firmware(fw); -out: + out:  	return ret;  } @@ -383,14 +480,10 @@ struct microcode_ops * __init init_amd_microcode(void)  		return NULL;  	} -	patch = (void *)get_zeroed_page(GFP_KERNEL); -	if (!patch) -		return NULL; -  	return &microcode_amd_ops;  }  void __exit exit_amd_microcode(void)  { -	free_page((unsigned long)patch); +	cleanup();  } diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 4873e62db6a..3a04b224d0c 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -225,6 +225,9 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,  	if (do_microcode_update(buf, len) == 0)  		ret = (ssize_t)len; +	if (ret > 0) +		perf_check_microcode(); +  	mutex_unlock(&microcode_mutex);  	put_online_cpus(); @@ -276,19 +279,18 @@ static struct platform_device	*microcode_pdev;  static int reload_for_cpu(int cpu)  {  	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; +	enum ucode_state ustate;  	int err = 0; -	if (uci->valid) { -		enum ucode_state ustate; - -		ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev); -		if (ustate == UCODE_OK) -			apply_microcode_on_target(cpu); -		else -			if (ustate == UCODE_ERROR) -				err = -EINVAL; -	} +	if (!uci->valid) +		return err; +	ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, true); +	if (ustate == UCODE_OK) +		apply_microcode_on_target(cpu); +	else +		if (ustate == UCODE_ERROR) +			err = -EINVAL;  	return err;  } @@ -370,18 +372,15 @@ static void microcode_fini_cpu(int cpu)  static enum ucode_state microcode_resume_cpu(int cpu)  { -	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - -	if (!uci->mc) -		return UCODE_NFOUND; -  	pr_debug("CPU%d updated upon resume\n", cpu); -	apply_microcode_on_target(cpu); + +	if (apply_microcode_on_target(cpu)) +		return UCODE_ERROR;  	return UCODE_OK;  } -static enum ucode_state microcode_init_cpu(int cpu) +static enum ucode_state microcode_init_cpu(int cpu, bool refresh_fw)  {  	enum ucode_state ustate; @@ -392,7 +391,8 @@ static enum ucode_state microcode_init_cpu(int cpu)  	if (system_state != SYSTEM_RUNNING)  		return UCODE_NFOUND; -	ustate = microcode_ops->request_microcode_fw(cpu, 
&microcode_pdev->dev); +	ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev, +						     refresh_fw);  	if (ustate == UCODE_OK) {  		pr_debug("CPU%d updated upon init\n", cpu); @@ -405,14 +405,11 @@ static enum ucode_state microcode_init_cpu(int cpu)  static enum ucode_state microcode_update_cpu(int cpu)  {  	struct ucode_cpu_info *uci = ucode_cpu_info + cpu; -	enum ucode_state ustate;  	if (uci->valid) -		ustate = microcode_resume_cpu(cpu); -	else -		ustate = microcode_init_cpu(cpu); +		return microcode_resume_cpu(cpu); -	return ustate; +	return microcode_init_cpu(cpu, false);  }  static int mc_device_add(struct device *dev, struct subsys_interface *sif) @@ -428,7 +425,7 @@ static int mc_device_add(struct device *dev, struct subsys_interface *sif)  	if (err)  		return err; -	if (microcode_init_cpu(cpu) == UCODE_ERROR) +	if (microcode_init_cpu(cpu, true) == UCODE_ERROR)  		return -EINVAL;  	return err; @@ -477,34 +474,41 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)  	struct device *dev;  	dev = get_cpu_device(cpu); -	switch (action) { + +	switch (action & ~CPU_TASKS_FROZEN) {  	case CPU_ONLINE: -	case CPU_ONLINE_FROZEN:  		microcode_update_cpu(cpu); -	case CPU_DOWN_FAILED: -	case CPU_DOWN_FAILED_FROZEN:  		pr_debug("CPU%d added\n", cpu); +		/* +		 * "break" is missing on purpose here because we want to fall +		 * through in order to create the sysfs group. +		 */ + +	case CPU_DOWN_FAILED:  		if (sysfs_create_group(&dev->kobj, &mc_attr_group))  			pr_err("Failed to create group for CPU%d\n", cpu);  		break; +  	case CPU_DOWN_PREPARE: -	case CPU_DOWN_PREPARE_FROZEN:  		/* Suspend is in progress, only remove the interface */  		sysfs_remove_group(&dev->kobj, &mc_attr_group);  		pr_debug("CPU%d removed\n", cpu);  		break;  	/* +	 * case CPU_DEAD: +	 *  	 * When a CPU goes offline, don't free up or invalidate the copy of  	 * the microcode in kernel memory, so that we can reuse it when the  	 * CPU comes back online without unnecessarily requesting the userspace  	 * for it again.
*/ -	case CPU_UP_CANCELED_FROZEN: -		/* The CPU refused to come up during a system resume */ -		microcode_fini_cpu(cpu); -		break;  	} + +	/* The CPU refused to come up during a system resume */ +	if (action == CPU_UP_CANCELED_FROZEN) +		microcode_fini_cpu(cpu); +  	return NOTIFY_OK;  } diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 0327e2b3c40..3544aed3933 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -405,7 +405,8 @@ static int get_ucode_fw(void *to, const void *from, size_t n)  	return 0;  } -static enum ucode_state request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device, +					     bool refresh_fw)  {  	char name[30];  	struct cpuinfo_x86 *c = &cpu_data(cpu); diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index eb113693f04..a7c5661f849 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -257,12 +257,14 @@ static int __init msr_init(void)  		goto out_chrdev;  	}  	msr_class->devnode = msr_devnode; +	get_online_cpus();  	for_each_online_cpu(i) {  		err = msr_device_create(i);  		if (err != 0)  			goto out_class;  	}  	register_hotcpu_notifier(&msr_class_cpu_notifier); +	put_online_cpus();  	err = 0;  	goto out; @@ -271,6 +273,7 @@ out_class:  	i = 0;  	for_each_online_cpu(i)  		msr_device_destroy(i); +	put_online_cpus();  	class_destroy(msr_class);  out_chrdev:  	__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr"); @@ -281,11 +284,13 @@ out:  static void __exit msr_exit(void)  {  	int cpu = 0; +	get_online_cpus();  	for_each_online_cpu(cpu)  		msr_device_destroy(cpu);  	class_destroy(msr_class);  	__unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");  	unregister_hotcpu_notifier(&msr_class_cpu_notifier); +	put_online_cpus();  }  module_init(msr_init); diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c new file mode 100644 index 00000000000..e309cc5c276 --- /dev/null +++ b/arch/x86/kernel/perf_regs.c @@ -0,0 +1,105 @@ +#include <linux/errno.h> +#include <linux/kernel.h> +#include <linux/sched.h> +#include <linux/perf_event.h> +#include <linux/bug.h> +#include <linux/stddef.h> +#include <asm/perf_regs.h> +#include <asm/ptrace.h> + +#ifdef CONFIG_X86_32 +#define PERF_REG_X86_MAX PERF_REG_X86_32_MAX +#else +#define PERF_REG_X86_MAX PERF_REG_X86_64_MAX +#endif + +#define PT_REGS_OFFSET(id, r) [id] = offsetof(struct pt_regs, r) + +static unsigned int pt_regs_offset[PERF_REG_X86_MAX] = { +	PT_REGS_OFFSET(PERF_REG_X86_AX, ax), +	PT_REGS_OFFSET(PERF_REG_X86_BX, bx), +	PT_REGS_OFFSET(PERF_REG_X86_CX, cx), +	PT_REGS_OFFSET(PERF_REG_X86_DX, dx), +	PT_REGS_OFFSET(PERF_REG_X86_SI, si), +	PT_REGS_OFFSET(PERF_REG_X86_DI, di), +	PT_REGS_OFFSET(PERF_REG_X86_BP, bp), +	PT_REGS_OFFSET(PERF_REG_X86_SP, sp), +	PT_REGS_OFFSET(PERF_REG_X86_IP, ip), +	PT_REGS_OFFSET(PERF_REG_X86_FLAGS, flags), +	PT_REGS_OFFSET(PERF_REG_X86_CS, cs), +	PT_REGS_OFFSET(PERF_REG_X86_SS, ss), +#ifdef CONFIG_X86_32 +	PT_REGS_OFFSET(PERF_REG_X86_DS, ds), +	PT_REGS_OFFSET(PERF_REG_X86_ES, es), +	PT_REGS_OFFSET(PERF_REG_X86_FS, fs), +	PT_REGS_OFFSET(PERF_REG_X86_GS, gs), +#else +	/* +	 * The pt_regs struct does not store +	 * ds, es, fs, gs in 64 bit mode. 
+	 */ +	(unsigned int) -1, +	(unsigned int) -1, +	(unsigned int) -1, +	(unsigned int) -1, +#endif +#ifdef CONFIG_X86_64 +	PT_REGS_OFFSET(PERF_REG_X86_R8, r8), +	PT_REGS_OFFSET(PERF_REG_X86_R9, r9), +	PT_REGS_OFFSET(PERF_REG_X86_R10, r10), +	PT_REGS_OFFSET(PERF_REG_X86_R11, r11), +	PT_REGS_OFFSET(PERF_REG_X86_R12, r12), +	PT_REGS_OFFSET(PERF_REG_X86_R13, r13), +	PT_REGS_OFFSET(PERF_REG_X86_R14, r14), +	PT_REGS_OFFSET(PERF_REG_X86_R15, r15), +#endif +}; + +u64 perf_reg_value(struct pt_regs *regs, int idx) +{ +	if (WARN_ON_ONCE(idx >= ARRAY_SIZE(pt_regs_offset))) +		return 0; + +	return regs_get_register(regs, pt_regs_offset[idx]); +} + +#define REG_RESERVED (~((1ULL << PERF_REG_X86_MAX) - 1ULL)) + +#ifdef CONFIG_X86_32 +int perf_reg_validate(u64 mask) +{ +	if (!mask || mask & REG_RESERVED) +		return -EINVAL; + +	return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ +	return PERF_SAMPLE_REGS_ABI_32; +} +#else /* CONFIG_X86_64 */ +#define REG_NOSUPPORT ((1ULL << PERF_REG_X86_DS) | \ +		       (1ULL << PERF_REG_X86_ES) | \ +		       (1ULL << PERF_REG_X86_FS) | \ +		       (1ULL << PERF_REG_X86_GS)) + +int perf_reg_validate(u64 mask) +{ +	if (!mask || mask & REG_RESERVED) +		return -EINVAL; + +	if (mask & REG_NOSUPPORT) +		return -EINVAL; + +	return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ +	if (test_tsk_thread_flag(task, TIF_IA32)) +		return PERF_SAMPLE_REGS_ABI_32; +	else +		return PERF_SAMPLE_REGS_ABI_64; +} +#endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 0bc72e2069e..d5f15c3f7b2 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -150,7 +150,7 @@ static struct resource *find_oprom(struct pci_dev *pdev)  	return oprom;  } -void *pci_map_biosrom(struct pci_dev *pdev) +void __iomem *pci_map_biosrom(struct pci_dev *pdev)  {  	struct resource *oprom = find_oprom(pdev); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ef6a8456f71..dc3567e083f 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -66,15 +66,13 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)  {  	int ret; -	unlazy_fpu(src); -  	*dst = *src;  	if (fpu_allocated(&src->thread.fpu)) {  		memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));  		ret = fpu_alloc(&dst->thread.fpu);  		if (ret)  			return ret; -		fpu_copy(&dst->thread.fpu, &src->thread.fpu); +		fpu_copy(dst, src);  	}  	return 0;  } @@ -97,16 +95,6 @@ void arch_task_cache_init(void)  				  SLAB_PANIC | SLAB_NOTRACK, NULL);  } -static inline void drop_fpu(struct task_struct *tsk) -{ -	/* -	 * Forget coprocessor state.. -	 */ -	tsk->fpu_counter = 0; -	clear_fpu(tsk); -	clear_used_math(); -} -  /*   * Free current thread data structures etc..   */ @@ -163,7 +151,13 @@ void flush_thread(void)  	flush_ptrace_hw_breakpoint(tsk);  	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); -	drop_fpu(tsk); +	drop_init_fpu(tsk); +	/* +	 * Free the FPU state for non xsave platforms. They get reallocated +	 * lazily at the first use. 
+	 */ +	if (!use_eager_fpu()) +		free_thread_xstate(tsk);  }  static void hard_disable_TSC(void) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 516fa186121..b9ff83c7135 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -190,10 +190,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)  	regs->cs		= __USER_CS;  	regs->ip		= new_ip;  	regs->sp		= new_sp; -	/* -	 * Free the old FP and other extended state -	 */ -	free_thread_xstate(current);  }  EXPORT_SYMBOL_GPL(start_thread); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 0a980c9d7cb..8a6d20ce197 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -232,10 +232,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,  	regs->cs		= _cs;  	regs->ss		= _ss;  	regs->flags		= X86_EFLAGS_IF; -	/* -	 * Free the old FP and other extended state -	 */ -	free_thread_xstate(current);  }  void diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index c4c6a5c2bf0..b00b33a1839 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -21,6 +21,7 @@  #include <linux/signal.h>  #include <linux/perf_event.h>  #include <linux/hw_breakpoint.h> +#include <linux/rcupdate.h>  #include <asm/uaccess.h>  #include <asm/pgtable.h> @@ -1332,9 +1333,6 @@ static const struct user_regset_view user_x86_64_view = {  #define genregs32_get		genregs_get  #define genregs32_set		genregs_set -#define user_i387_ia32_struct	user_i387_struct -#define user32_fxsr_struct	user_fxsr_struct -  #endif	/* CONFIG_X86_64 */  #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION @@ -1463,6 +1461,8 @@ long syscall_trace_enter(struct pt_regs *regs)  {  	long ret = 0; +	rcu_user_exit(); +  	/*  	 * If we stepped into a sysenter/syscall insn, it trapped in  	 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. 
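/*
 * Illustrative sketch, not part of the patch above or below: the pattern the
 * new rcu_user_exit()/rcu_user_enter() calls follow.  When a CPU that RCU
 * considers quiescent in userspace (an extended quiescent state) enters the
 * kernel -- here via the syscall tracing slow path, further down via signal
 * delivery in do_notify_resume() and via the exception_enter()/exception_exit()
 * pairs added to traps.c -- RCU is told the CPU is active again, and told once
 * more before the CPU returns to user mode.  The wrapper name below is
 * hypothetical; only rcu_user_exit() and rcu_user_enter() are the interfaces
 * these hunks actually use.
 */
static inline void example_traced_kernel_entry(void)
{
	rcu_user_exit();	/* user -> kernel: leave RCU's extended quiescent state */

	/* ... the real entry work: tracing hooks, signal delivery, etc. ... */

	rcu_user_enter();	/* kernel -> user: re-enter the quiescent state */
}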
@@ -1526,4 +1526,6 @@ void syscall_trace_leave(struct pt_regs *regs)  			!test_thread_flag(TIF_SYSCALL_EMU);  	if (step || test_thread_flag(TIF_SYSCALL_TRACE))  		tracehook_report_syscall_exit(regs, step); + +	rcu_user_enter();  } diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index f4b9b80e1b9..4f165479c45 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -961,9 +961,7 @@ void __init setup_arch(char **cmdline_p)  	kvmclock_init();  #endif -	x86_init.paging.pagetable_setup_start(swapper_pg_dir); -	paging_init(); -	x86_init.paging.pagetable_setup_done(swapper_pg_dir); +	x86_init.paging.pagetable_init();  	if (boot_cpu_data.cpuid_level >= 0) {  		/* A CPU has %cr4 if and only if it has CPUID */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b280908a376..b33144c8b30 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -114,11 +114,12 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,  		regs->orig_ax = -1;		/* disable syscall checks */  		get_user_ex(buf, &sc->fpstate); -		err |= restore_i387_xstate(buf);  		get_user_ex(*pax, &sc->ax);  	} get_user_catch(err); +	err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); +  	return err;  } @@ -206,35 +207,32 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,  	     void __user **fpstate)  {  	/* Default to using normal stack */ +	unsigned long math_size = 0;  	unsigned long sp = regs->sp; +	unsigned long buf_fx = 0;  	int onsigstack = on_sig_stack(sp); -#ifdef CONFIG_X86_64  	/* redzone */ -	sp -= 128; -#endif /* CONFIG_X86_64 */ +	if (config_enabled(CONFIG_X86_64)) +		sp -= 128;  	if (!onsigstack) {  		/* This is the X/Open sanctioned signal stack switching.  */  		if (ka->sa.sa_flags & SA_ONSTACK) {  			if (current->sas_ss_size)  				sp = current->sas_ss_sp + current->sas_ss_size; -		} else { -#ifdef CONFIG_X86_32 -			/* This is the legacy signal stack switching. */ -			if ((regs->ss & 0xffff) != __USER_DS && -				!(ka->sa.sa_flags & SA_RESTORER) && -					ka->sa.sa_restorer) +		} else if (config_enabled(CONFIG_X86_32) && +			   (regs->ss & 0xffff) != __USER_DS && +			   !(ka->sa.sa_flags & SA_RESTORER) && +			   ka->sa.sa_restorer) { +				/* This is the legacy signal stack switching. */  				sp = (unsigned long) ka->sa.sa_restorer; -#endif /* CONFIG_X86_32 */  		}  	}  	if (used_math()) { -		sp -= sig_xstate_size; -#ifdef CONFIG_X86_64 -		sp = round_down(sp, 64); -#endif /* CONFIG_X86_64 */ +		sp = alloc_mathframe(sp, config_enabled(CONFIG_X86_32), +				     &buf_fx, &math_size);  		*fpstate = (void __user *)sp;  	} @@ -247,8 +245,9 @@ get_sigframe(struct k_sigaction *ka, struct pt_regs *regs, size_t frame_size,  	if (onsigstack && !likely(on_sig_stack(sp)))  		return (void __user *)-1L; -	/* save i387 state */ -	if (used_math() && save_i387_xstate(*fpstate) < 0) +	/* save i387 and extended state */ +	if (used_math() && +	    save_xstate_sig(*fpstate, (void __user *)buf_fx, math_size) < 0)  		return (void __user *)-1L;  	return (void __user *)sp; @@ -357,7 +356,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,  		put_user_ex(sig, &frame->sig);  		put_user_ex(&frame->info, &frame->pinfo);  		put_user_ex(&frame->uc, &frame->puc); -		err |= copy_siginfo_to_user(&frame->info, info);  		/* Create the ucontext.  
*/  		if (cpu_has_xsave) @@ -369,9 +367,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,  		put_user_ex(sas_ss_flags(regs->sp),  			    &frame->uc.uc_stack.ss_flags);  		put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); -		err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, -					regs, set->sig[0]); -		err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));  		/* Set up to return from userspace.  */  		restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); @@ -388,6 +383,11 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,  		 */  		put_user_ex(*((u64 *)&rt_retcode), (u64 *)frame->retcode);  	} put_user_catch(err); +	 +	err |= copy_siginfo_to_user(&frame->info, info); +	err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, +				regs, set->sig[0]); +	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));  	if (err)  		return -EFAULT; @@ -436,8 +436,6 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,  		put_user_ex(sas_ss_flags(regs->sp),  			    &frame->uc.uc_stack.ss_flags);  		put_user_ex(me->sas_ss_size, &frame->uc.uc_stack.ss_size); -		err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); -		err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));  		/* Set up to return from userspace.  If provided, use a stub  		   already in userspace.  */ @@ -450,6 +448,9 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,  		}  	} put_user_catch(err); +	err |= setup_sigcontext(&frame->uc.uc_mcontext, fp, regs, set->sig[0]); +	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); +  	if (err)  		return -EFAULT; @@ -474,6 +475,75 @@ static int __setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,  }  #endif /* CONFIG_X86_32 */ +static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, +			      siginfo_t *info, compat_sigset_t *set, +			      struct pt_regs *regs) +{ +#ifdef CONFIG_X86_X32_ABI +	struct rt_sigframe_x32 __user *frame; +	void __user *restorer; +	int err = 0; +	void __user *fpstate = NULL; + +	frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); + +	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) +		return -EFAULT; + +	if (ka->sa.sa_flags & SA_SIGINFO) { +		if (copy_siginfo_to_user32(&frame->info, info)) +			return -EFAULT; +	} + +	put_user_try { +		/* Create the ucontext.  */ +		if (cpu_has_xsave) +			put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); +		else +			put_user_ex(0, &frame->uc.uc_flags); +		put_user_ex(0, &frame->uc.uc_link); +		put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); +		put_user_ex(sas_ss_flags(regs->sp), +			    &frame->uc.uc_stack.ss_flags); +		put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); +		put_user_ex(0, &frame->uc.uc__pad0); + +		if (ka->sa.sa_flags & SA_RESTORER) { +			restorer = ka->sa.sa_restorer; +		} else { +			/* could use a vstub here */ +			restorer = NULL; +			err |= -EFAULT; +		} +		put_user_ex(restorer, &frame->pretcode); +	} put_user_catch(err); + +	err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, +				regs, set->sig[0]); +	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); + +	if (err) +		return -EFAULT; + +	/* Set up registers for signal handler */ +	regs->sp = (unsigned long) frame; +	regs->ip = (unsigned long) ka->sa.sa_handler; + +	/* We use the x32 calling convention here... 
*/ +	regs->di = sig; +	regs->si = (unsigned long) &frame->info; +	regs->dx = (unsigned long) &frame->uc; + +	loadsegment(ds, __USER_DS); +	loadsegment(es, __USER_DS); + +	regs->cs = __USER_CS; +	regs->ss = __USER_DS; +#endif	/* CONFIG_X86_X32_ABI */ + +	return 0; +} +  #ifdef CONFIG_X86_32  /*   * Atomically swap in the new signal mask, and wait for a signal. @@ -612,55 +682,22 @@ static int signr_convert(int sig)  	return sig;  } -#ifdef CONFIG_X86_32 - -#define is_ia32	1 -#define ia32_setup_frame	__setup_frame -#define ia32_setup_rt_frame	__setup_rt_frame - -#else /* !CONFIG_X86_32 */ - -#ifdef CONFIG_IA32_EMULATION -#define is_ia32	test_thread_flag(TIF_IA32) -#else /* !CONFIG_IA32_EMULATION */ -#define is_ia32	0 -#endif /* CONFIG_IA32_EMULATION */ - -#ifdef CONFIG_X86_X32_ABI -#define is_x32	test_thread_flag(TIF_X32) - -static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, -			      siginfo_t *info, compat_sigset_t *set, -			      struct pt_regs *regs); -#else /* !CONFIG_X86_X32_ABI */ -#define is_x32	0 -#endif /* CONFIG_X86_X32_ABI */ - -int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, -		sigset_t *set, struct pt_regs *regs); -int ia32_setup_frame(int sig, struct k_sigaction *ka, -		sigset_t *set, struct pt_regs *regs); - -#endif /* CONFIG_X86_32 */ -  static int  setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info,  		struct pt_regs *regs)  {  	int usig = signr_convert(sig);  	sigset_t *set = sigmask_to_save(); +	compat_sigset_t *cset = (compat_sigset_t *) set;  	/* Set up the stack frame */ -	if (is_ia32) { +	if (is_ia32_frame()) {  		if (ka->sa.sa_flags & SA_SIGINFO) -			return ia32_setup_rt_frame(usig, ka, info, set, regs); +			return ia32_setup_rt_frame(usig, ka, info, cset, regs);  		else -			return ia32_setup_frame(usig, ka, set, regs); -#ifdef CONFIG_X86_X32_ABI -	} else if (is_x32) { -		return x32_setup_rt_frame(usig, ka, info, -					 (compat_sigset_t *)set, regs); -#endif +			return ia32_setup_frame(usig, ka, cset, regs); +	} else if (is_x32_frame()) { +		return x32_setup_rt_frame(usig, ka, info, cset, regs);  	} else {  		return __setup_rt_frame(sig, ka, info, set, regs);  	} @@ -779,6 +816,8 @@ static void do_signal(struct pt_regs *regs)  void  do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)  { +	rcu_user_exit(); +  #ifdef CONFIG_X86_MCE  	/* notify userspace of pending MCEs */  	if (thread_info_flags & _TIF_MCE_NOTIFY) @@ -804,6 +843,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)  #ifdef CONFIG_X86_32  	clear_thread_flag(TIF_IRET);  #endif /* CONFIG_X86_32 */ + +	rcu_user_enter();  }  void signal_fault(struct pt_regs *regs, void __user *frame, char *where) @@ -824,72 +865,6 @@ void signal_fault(struct pt_regs *regs, void __user *frame, char *where)  }  #ifdef CONFIG_X86_X32_ABI -static int x32_setup_rt_frame(int sig, struct k_sigaction *ka, -			      siginfo_t *info, compat_sigset_t *set, -			      struct pt_regs *regs) -{ -	struct rt_sigframe_x32 __user *frame; -	void __user *restorer; -	int err = 0; -	void __user *fpstate = NULL; - -	frame = get_sigframe(ka, regs, sizeof(*frame), &fpstate); - -	if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) -		return -EFAULT; - -	if (ka->sa.sa_flags & SA_SIGINFO) { -		if (copy_siginfo_to_user32(&frame->info, info)) -			return -EFAULT; -	} - -	put_user_try { -		/* Create the ucontext.  
*/ -		if (cpu_has_xsave) -			put_user_ex(UC_FP_XSTATE, &frame->uc.uc_flags); -		else -			put_user_ex(0, &frame->uc.uc_flags); -		put_user_ex(0, &frame->uc.uc_link); -		put_user_ex(current->sas_ss_sp, &frame->uc.uc_stack.ss_sp); -		put_user_ex(sas_ss_flags(regs->sp), -			    &frame->uc.uc_stack.ss_flags); -		put_user_ex(current->sas_ss_size, &frame->uc.uc_stack.ss_size); -		put_user_ex(0, &frame->uc.uc__pad0); -		err |= setup_sigcontext(&frame->uc.uc_mcontext, fpstate, -					regs, set->sig[0]); -		err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set)); - -		if (ka->sa.sa_flags & SA_RESTORER) { -			restorer = ka->sa.sa_restorer; -		} else { -			/* could use a vstub here */ -			restorer = NULL; -			err |= -EFAULT; -		} -		put_user_ex(restorer, &frame->pretcode); -	} put_user_catch(err); - -	if (err) -		return -EFAULT; - -	/* Set up registers for signal handler */ -	regs->sp = (unsigned long) frame; -	regs->ip = (unsigned long) ka->sa.sa_handler; - -	/* We use the x32 calling convention here... */ -	regs->di = sig; -	regs->si = (unsigned long) &frame->info; -	regs->dx = (unsigned long) &frame->uc; - -	loadsegment(ds, __USER_DS); -	loadsegment(es, __USER_DS); - -	regs->cs = __USER_CS; -	regs->ss = __USER_DS; - -	return 0; -} -  asmlinkage long sys32_x32_rt_sigreturn(struct pt_regs *regs)  {  	struct rt_sigframe_x32 __user *frame; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 7c5a8c314c0..c80a33bc528 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -665,7 +665,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu, struct task_struct *idle)  	unsigned long boot_error = 0;  	int timeout; -	alternatives_smp_switch(1); +	/* Just in case we booted with a single CPU. */ +	alternatives_enable_smp();  	idle->thread.sp = (unsigned long) (((struct pt_regs *)  			  (THREAD_SIZE +  task_stack_page(idle))) - 1); @@ -1053,20 +1054,6 @@ out:  	preempt_enable();  } -void arch_disable_nonboot_cpus_begin(void) -{ -	/* -	 * Avoid the smp alternatives switch during the disable_nonboot_cpus(). -	 * In the suspend path, we will be back in the SMP mode shortly anyways. -	 */ -	skip_smp_alternatives = true; -} - -void arch_disable_nonboot_cpus_end(void) -{ -	skip_smp_alternatives = false; -} -  void arch_enable_nonboot_cpus_begin(void)  {  	set_mtrr_aps_delayed_init(); @@ -1256,9 +1243,6 @@ void native_cpu_die(unsigned int cpu)  		if (per_cpu(cpu_state, cpu) == CPU_DEAD) {  			if (system_state == SYSTEM_RUNNING)  				pr_info("CPU %u is now offline\n", cpu); - -			if (1 == num_online_cpus()) -				alternatives_smp_switch(0);  			return;  		}  		msleep(100); diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index c346d116148..cd3b2438a98 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -157,6 +157,33 @@ static int enable_single_step(struct task_struct *child)  	return 1;  } +void set_task_blockstep(struct task_struct *task, bool on) +{ +	unsigned long debugctl; + +	/* +	 * Ensure irq/preemption can't change debugctl in between. +	 * Note also that both TIF_BLOCKSTEP and debugctl should +	 * be changed atomically wrt preemption. +	 * FIXME: this means that set/clear TIF_BLOCKSTEP is simply +	 * wrong if task != current, SIGKILL can wakeup the stopped +	 * tracee and set/clear can play with the running task, this +	 * can confuse the next __switch_to_xtra(). 
+	 */ +	local_irq_disable(); +	debugctl = get_debugctlmsr(); +	if (on) { +		debugctl |= DEBUGCTLMSR_BTF; +		set_tsk_thread_flag(task, TIF_BLOCKSTEP); +	} else { +		debugctl &= ~DEBUGCTLMSR_BTF; +		clear_tsk_thread_flag(task, TIF_BLOCKSTEP); +	} +	if (task == current) +		update_debugctlmsr(debugctl); +	local_irq_enable(); +} +  /*   * Enable single or block step.   */ @@ -169,19 +196,10 @@ static void enable_step(struct task_struct *child, bool block)  	 * So no one should try to use debugger block stepping in a program  	 * that uses user-mode single stepping itself.  	 */ -	if (enable_single_step(child) && block) { -		unsigned long debugctl = get_debugctlmsr(); - -		debugctl |= DEBUGCTLMSR_BTF; -		update_debugctlmsr(debugctl); -		set_tsk_thread_flag(child, TIF_BLOCKSTEP); -	} else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) { -		unsigned long debugctl = get_debugctlmsr(); - -		debugctl &= ~DEBUGCTLMSR_BTF; -		update_debugctlmsr(debugctl); -		clear_tsk_thread_flag(child, TIF_BLOCKSTEP); -	} +	if (enable_single_step(child) && block) +		set_task_blockstep(child, true); +	else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) +		set_task_blockstep(child, false);  }  void user_enable_single_step(struct task_struct *child) @@ -199,13 +217,8 @@ void user_disable_single_step(struct task_struct *child)  	/*  	 * Make sure block stepping (BTF) is disabled.  	 */ -	if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) { -		unsigned long debugctl = get_debugctlmsr(); - -		debugctl &= ~DEBUGCTLMSR_BTF; -		update_debugctlmsr(debugctl); -		clear_tsk_thread_flag(child, TIF_BLOCKSTEP); -	} +	if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) +		set_task_blockstep(child, false);  	/* Always clear TIF_SINGLESTEP... */  	clear_tsk_thread_flag(child, TIF_SINGLESTEP); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b481341c936..8276dc6794c 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -55,6 +55,7 @@  #include <asm/i387.h>  #include <asm/fpu-internal.h>  #include <asm/mce.h> +#include <asm/rcu.h>  #include <asm/mach_traps.h> @@ -107,30 +108,45 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)  	dec_preempt_count();  } -static void __kprobes -do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, -	long error_code, siginfo_t *info) +static int __kprobes +do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, +		  struct pt_regs *regs,	long error_code)  { -	struct task_struct *tsk = current; -  #ifdef CONFIG_X86_32  	if (regs->flags & X86_VM_MASK) {  		/* -		 * traps 0, 1, 3, 4, and 5 should be forwarded to vm86. +		 * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.  		 * On nmi (interrupt 2), do_trap should not be called.  		 
*/ -		if (trapnr < X86_TRAP_UD) -			goto vm86_trap; -		goto trap_signal; +		if (trapnr < X86_TRAP_UD) { +			if (!handle_vm86_trap((struct kernel_vm86_regs *) regs, +						error_code, trapnr)) +				return 0; +		} +		return -1;  	}  #endif +	if (!user_mode(regs)) { +		if (!fixup_exception(regs)) { +			tsk->thread.error_code = error_code; +			tsk->thread.trap_nr = trapnr; +			die(str, regs, error_code); +		} +		return 0; +	} -	if (!user_mode(regs)) -		goto kernel_trap; +	return -1; +} -#ifdef CONFIG_X86_32 -trap_signal: -#endif +static void __kprobes +do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, +	long error_code, siginfo_t *info) +{ +	struct task_struct *tsk = current; + + +	if (!do_trap_no_signal(tsk, trapnr, str, regs, error_code)) +		return;  	/*  	 * We want error_code and trap_nr set for userspace faults and  	 * kernelspace faults which result in die(), but not @@ -158,33 +174,20 @@ trap_signal:  		force_sig_info(signr, info, tsk);  	else  		force_sig(signr, tsk); -	return; - -kernel_trap: -	if (!fixup_exception(regs)) { -		tsk->thread.error_code = error_code; -		tsk->thread.trap_nr = trapnr; -		die(str, regs, error_code); -	} -	return; - -#ifdef CONFIG_X86_32 -vm86_trap: -	if (handle_vm86_trap((struct kernel_vm86_regs *) regs, -						error_code, trapnr)) -		goto trap_signal; -	return; -#endif  }  #define DO_ERROR(trapnr, signr, str, name)				\  dotraplinkage void do_##name(struct pt_regs *regs, long error_code)	\  {									\ -	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)	\ -							== NOTIFY_STOP)	\ +	exception_enter(regs);						\ +	if (notify_die(DIE_TRAP, str, regs, error_code,			\ +			trapnr, signr) == NOTIFY_STOP) {		\ +		exception_exit(regs);					\  		return;							\ +	}								\  	conditional_sti(regs);						\  	do_trap(trapnr, signr, str, regs, error_code, NULL);		\ +	exception_exit(regs);						\  }  #define DO_ERROR_INFO(trapnr, signr, str, name, sicode, siaddr)		\ @@ -195,11 +198,15 @@ dotraplinkage void do_##name(struct pt_regs *regs, long error_code)	\  	info.si_errno = 0;						\  	info.si_code = sicode;						\  	info.si_addr = (void __user *)siaddr;				\ -	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr)	\ -							== NOTIFY_STOP)	\ +	exception_enter(regs);						\ +	if (notify_die(DIE_TRAP, str, regs, error_code,			\ +			trapnr, signr) == NOTIFY_STOP) {		\ +		exception_exit(regs);					\  		return;							\ +	}								\  	conditional_sti(regs);						\  	do_trap(trapnr, signr, str, regs, error_code, &info);		\ +	exception_exit(regs);						\  }  DO_ERROR_INFO(X86_TRAP_DE, SIGFPE, "divide error", divide_error, FPE_INTDIV, @@ -222,12 +229,14 @@ DO_ERROR_INFO(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check,  /* Runs on IST stack */  dotraplinkage void do_stack_segment(struct pt_regs *regs, long error_code)  { +	exception_enter(regs);  	if (notify_die(DIE_TRAP, "stack segment", regs, error_code, -			X86_TRAP_SS, SIGBUS) == NOTIFY_STOP) -		return; -	preempt_conditional_sti(regs); -	do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); -	preempt_conditional_cli(regs); +		       X86_TRAP_SS, SIGBUS) != NOTIFY_STOP) { +		preempt_conditional_sti(regs); +		do_trap(X86_TRAP_SS, SIGBUS, "stack segment", regs, error_code, NULL); +		preempt_conditional_cli(regs); +	} +	exception_exit(regs);  }  dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) @@ -235,6 +244,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)  	static const char str[] = "double fault";  	struct 
task_struct *tsk = current; +	exception_enter(regs);  	/* Return not checked because double check cannot be ignored */  	notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); @@ -255,16 +265,29 @@ do_general_protection(struct pt_regs *regs, long error_code)  {  	struct task_struct *tsk; +	exception_enter(regs);  	conditional_sti(regs);  #ifdef CONFIG_X86_32 -	if (regs->flags & X86_VM_MASK) -		goto gp_in_vm86; +	if (regs->flags & X86_VM_MASK) { +		local_irq_enable(); +		handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); +		goto exit; +	}  #endif  	tsk = current; -	if (!user_mode(regs)) -		goto gp_in_kernel; +	if (!user_mode(regs)) { +		if (fixup_exception(regs)) +			goto exit; + +		tsk->thread.error_code = error_code; +		tsk->thread.trap_nr = X86_TRAP_GP; +		if (notify_die(DIE_GPF, "general protection fault", regs, error_code, +			       X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP) +			die("general protection fault", regs, error_code); +		goto exit; +	}  	tsk->thread.error_code = error_code;  	tsk->thread.trap_nr = X86_TRAP_GP; @@ -279,25 +302,8 @@ do_general_protection(struct pt_regs *regs, long error_code)  	}  	force_sig(SIGSEGV, tsk); -	return; - -#ifdef CONFIG_X86_32 -gp_in_vm86: -	local_irq_enable(); -	handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); -	return; -#endif - -gp_in_kernel: -	if (fixup_exception(regs)) -		return; - -	tsk->thread.error_code = error_code; -	tsk->thread.trap_nr = X86_TRAP_GP; -	if (notify_die(DIE_GPF, "general protection fault", regs, error_code, -			X86_TRAP_GP, SIGSEGV) == NOTIFY_STOP) -		return; -	die("general protection fault", regs, error_code); +exit: +	exception_exit(regs);  }  /* May run on IST stack. */ @@ -312,15 +318,16 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co  	    ftrace_int3_handler(regs))  		return;  #endif +	exception_enter(regs);  #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP  	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,  				SIGTRAP) == NOTIFY_STOP) -		return; +		goto exit;  #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */  	if (notify_die(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,  			SIGTRAP) == NOTIFY_STOP) -		return; +		goto exit;  	/*  	 * Let others (NMI) know that the debug stack is in use @@ -331,6 +338,8 @@ dotraplinkage void __kprobes notrace do_int3(struct pt_regs *regs, long error_co  	do_trap(X86_TRAP_BP, SIGTRAP, "int3", regs, error_code, NULL);  	preempt_conditional_cli(regs);  	debug_stack_usage_dec(); +exit: +	exception_exit(regs);  }  #ifdef CONFIG_X86_64 @@ -391,6 +400,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)  	unsigned long dr6;  	int si_code; +	exception_enter(regs); +  	get_debugreg(dr6, 6);  	/* Filter out all the reserved bits which are preset to 1 */ @@ -406,7 +417,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)  	/* Catch kmemcheck conditions first of all! 
*/  	if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) -		return; +		goto exit;  	/* DR6 may or may not be cleared by the CPU */  	set_debugreg(0, 6); @@ -421,7 +432,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)  	if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,  							SIGTRAP) == NOTIFY_STOP) -		return; +		goto exit;  	/*  	 * Let others (NMI) know that the debug stack is in use @@ -437,7 +448,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)  					X86_TRAP_DB);  		preempt_conditional_cli(regs);  		debug_stack_usage_dec(); -		return; +		goto exit;  	}  	/* @@ -458,7 +469,8 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)  	preempt_conditional_cli(regs);  	debug_stack_usage_dec(); -	return; +exit: +	exception_exit(regs);  }  /* @@ -555,14 +567,17 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)  #ifdef CONFIG_X86_32  	ignore_fpu_irq = 1;  #endif - +	exception_enter(regs);  	math_error(regs, error_code, X86_TRAP_MF); +	exception_exit(regs);  }  dotraplinkage void  do_simd_coprocessor_error(struct pt_regs *regs, long error_code)  { +	exception_enter(regs);  	math_error(regs, error_code, X86_TRAP_XF); +	exception_exit(regs);  }  dotraplinkage void @@ -613,11 +628,12 @@ void math_state_restore(void)  	}  	__thread_fpu_begin(tsk); +  	/*  	 * Paranoid restore. send a SIGSEGV if we fail to restore the state.  	 */  	if (unlikely(restore_fpu_checking(tsk))) { -		__thread_fpu_end(tsk); +		drop_init_fpu(tsk);  		force_sig(SIGSEGV, tsk);  		return;  	} @@ -629,6 +645,9 @@ EXPORT_SYMBOL_GPL(math_state_restore);  dotraplinkage void __kprobes  do_device_not_available(struct pt_regs *regs, long error_code)  { +	exception_enter(regs); +	BUG_ON(use_eager_fpu()); +  #ifdef CONFIG_MATH_EMULATION  	if (read_cr0() & X86_CR0_EM) {  		struct math_emu_info info = { }; @@ -637,6 +656,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)  		info.regs = regs;  		math_emulate(&info); +		exception_exit(regs);  		return;  	}  #endif @@ -644,12 +664,15 @@ do_device_not_available(struct pt_regs *regs, long error_code)  #ifdef CONFIG_X86_32  	conditional_sti(regs);  #endif +	exception_exit(regs);  }  #ifdef CONFIG_X86_32  dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)  {  	siginfo_t info; + +	exception_enter(regs);  	local_irq_enable();  	info.si_signo = SIGILL; @@ -657,10 +680,11 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)  	info.si_code = ILL_BADSTK;  	info.si_addr = NULL;  	if (notify_die(DIE_TRAP, "iret exception", regs, error_code, -			X86_TRAP_IRET, SIGILL) == NOTIFY_STOP) -		return; -	do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, -		&info); +			X86_TRAP_IRET, SIGILL) != NOTIFY_STOP) { +		do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, +			&info); +	} +	exception_exit(regs);  }  #endif diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 36fd42091fa..9538f00827a 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -41,6 +41,9 @@  /* Adjust the return address of a call insn */  #define UPROBE_FIX_CALL	0x2 +/* Instruction will modify TF, don't change it */ +#define UPROBE_FIX_SETF	0x4 +  #define UPROBE_FIX_RIP_AX	0x8000  #define UPROBE_FIX_RIP_CX	0x4000 @@ -239,6 +242,10 @@ static void prepare_fixups(struct arch_uprobe *auprobe, struct insn *insn)  	insn_get_opcode(insn);	/* should be a nop */  	switch (OPCODE1(insn)) { +	
case 0x9d: +		/* popf */ +		auprobe->fixups |= UPROBE_FIX_SETF; +		break;  	case 0xc3:		/* ret/lret */  	case 0xcb:  	case 0xc2: @@ -646,7 +653,7 @@ void arch_uprobe_abort_xol(struct arch_uprobe *auprobe, struct pt_regs *regs)   * Skip these instructions as per the currently known x86 ISA.   * 0x66* { 0x90 | 0x0f 0x1f | 0x0f 0x19 | 0x87 0xc0 }   */ -bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +static bool __skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)  {  	int i; @@ -673,3 +680,46 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)  	}  	return false;  } + +bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs) +{ +	bool ret = __skip_sstep(auprobe, regs); +	if (ret && (regs->flags & X86_EFLAGS_TF)) +		send_sig(SIGTRAP, current, 0); +	return ret; +} + +void arch_uprobe_enable_step(struct arch_uprobe *auprobe) +{ +	struct task_struct *task = current; +	struct arch_uprobe_task	*autask	= &task->utask->autask; +	struct pt_regs *regs = task_pt_regs(task); + +	autask->saved_tf = !!(regs->flags & X86_EFLAGS_TF); + +	regs->flags |= X86_EFLAGS_TF; +	if (test_tsk_thread_flag(task, TIF_BLOCKSTEP)) +		set_task_blockstep(task, false); +} + +void arch_uprobe_disable_step(struct arch_uprobe *auprobe) +{ +	struct task_struct *task = current; +	struct arch_uprobe_task	*autask	= &task->utask->autask; +	bool trapped = (task->utask->state == UTASK_SSTEP_TRAPPED); +	struct pt_regs *regs = task_pt_regs(task); +	/* +	 * The state of TIF_BLOCKSTEP was not saved so we can get an extra +	 * SIGTRAP if we do not clear TF. We need to examine the opcode to +	 * make it right. +	 */ +	if (unlikely(trapped)) { +		if (!autask->saved_tf) +			regs->flags &= ~X86_EFLAGS_TF; +	} else { +		if (autask->saved_tf) +			send_sig(SIGTRAP, task, 0); +		else if (!(auprobe->fixups & UPROBE_FIX_SETF)) +			regs->flags &= ~X86_EFLAGS_TF; +	} +} diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 6020f6f5927..1330dd10295 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -13,9 +13,13 @@  #include <asm/ftrace.h>  #ifdef CONFIG_FUNCTION_TRACER -/* mcount is defined in assembly */ +/* mcount and __fentry__ are defined in assembly */ +#ifdef CC_USING_FENTRY +EXPORT_SYMBOL(__fentry__); +#else  EXPORT_SYMBOL(mcount);  #endif +#endif  EXPORT_SYMBOL(__get_user_1);  EXPORT_SYMBOL(__get_user_2); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 9f3167e891e..7a3d075a814 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -26,7 +26,6 @@  void __cpuinit x86_init_noop(void) { }  void __init x86_init_uint_noop(unsigned int unused) { } -void __init x86_init_pgd_noop(pgd_t *unused) { }  int __init iommu_init_noop(void) { return 0; }  void iommu_shutdown_noop(void) { } @@ -68,8 +67,7 @@ struct x86_init_ops x86_init __initdata = {  	},  	.paging = { -		.pagetable_setup_start	= native_pagetable_setup_start, -		.pagetable_setup_done	= native_pagetable_setup_done, +		.pagetable_init		= native_pagetable_init,  	},  	.timers = { diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 3d3e2070911..ada87a329ed 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -10,9 +10,7 @@  #include <linux/compat.h>  #include <asm/i387.h>  #include <asm/fpu-internal.h> -#ifdef CONFIG_IA32_EMULATION -#include <asm/sigcontext32.h> -#endif +#include <asm/sigframe.h>  #include <asm/xcr.h>  /* @@ -23,13 +21,9 @@ u64 pcntxt_mask;  
/*   * Represents init state for the supported extended state.   */ -static struct xsave_struct *init_xstate_buf; - -struct _fpx_sw_bytes fx_sw_reserved; -#ifdef CONFIG_IA32_EMULATION -struct _fpx_sw_bytes fx_sw_reserved_ia32; -#endif +struct xsave_struct *init_xstate_buf; +static struct _fpx_sw_bytes fx_sw_reserved, fx_sw_reserved_ia32;  static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;  /* @@ -44,9 +38,9 @@ static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;   */  void __sanitize_i387_state(struct task_struct *tsk)  { -	u64 xstate_bv; -	int feature_bit = 0x2;  	struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; +	int feature_bit = 0x2; +	u64 xstate_bv;  	if (!fx)  		return; @@ -104,213 +98,326 @@ void __sanitize_i387_state(struct task_struct *tsk)   * Check for the presence of extended state information in the   * user fpstate pointer in the sigcontext.   */ -int check_for_xstate(struct i387_fxsave_struct __user *buf, -		     void __user *fpstate, -		     struct _fpx_sw_bytes *fx_sw_user) +static inline int check_for_xstate(struct i387_fxsave_struct __user *buf, +				   void __user *fpstate, +				   struct _fpx_sw_bytes *fx_sw)  {  	int min_xstate_size = sizeof(struct i387_fxsave_struct) +  			      sizeof(struct xsave_hdr_struct);  	unsigned int magic2; -	int err; -	err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], -			       sizeof(struct _fpx_sw_bytes)); -	if (err) -		return -EFAULT; +	if (__copy_from_user(fx_sw, &buf->sw_reserved[0], sizeof(*fx_sw))) +		return -1; -	/* -	 * First Magic check failed. -	 */ -	if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) -		return -EINVAL; - -	/* -	 * Check for error scenarios. -	 */ -	if (fx_sw_user->xstate_size < min_xstate_size || -	    fx_sw_user->xstate_size > xstate_size || -	    fx_sw_user->xstate_size > fx_sw_user->extended_size) -		return -EINVAL; +	/* Check for the first magic field and other error scenarios. */ +	if (fx_sw->magic1 != FP_XSTATE_MAGIC1 || +	    fx_sw->xstate_size < min_xstate_size || +	    fx_sw->xstate_size > xstate_size || +	    fx_sw->xstate_size > fx_sw->extended_size) +		return -1; -	err = __get_user(magic2, (__u32 *) (((void *)fpstate) + -					    fx_sw_user->extended_size - -					    FP_XSTATE_MAGIC2_SIZE)); -	if (err) -		return err;  	/*  	 * Check for the presence of second magic word at the end of memory  	 * layout. This detects the case where the user just copied the legacy  	 * fpstate layout with out copying the extended state information  	 * in the memory layout.  	 */ -	if (magic2 != FP_XSTATE_MAGIC2) -		return -EFAULT; +	if (__get_user(magic2, (__u32 __user *)(fpstate + fx_sw->xstate_size)) +	    || magic2 != FP_XSTATE_MAGIC2) +		return -1;  	return 0;  } -#ifdef CONFIG_X86_64  /*   * Signal frame handlers.   
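A quick sketch of the user-space frame that check_for_xstate() validates (the layout is inferred from the checks above and from prepare_fx_sw_frame() further down; the helper below is illustrative and not part of the patch): magic1 sits in the software-reserved bytes of the legacy fxsave area, the xsave image follows, and FP_XSTATE_MAGIC2 is appended immediately after fx_sw->xstate_size bytes, so a signal frame copied back without its extended state fails the second check.

/*
 * Illustrative only: where the trailing magic word read back by
 * check_for_xstate() lives, relative to the start of the user fpstate.
 *
 *   fpstate + 0                      legacy i387_fxsave_struct
 *     .sw_reserved (_fpx_sw_bytes)   .magic1 = FP_XSTATE_MAGIC1,
 *                                    .xstate_size, .xstate_bv, .extended_size
 *   fpstate + 512                    xsave_hdr_struct + extended state
 *   fpstate + fx_sw->xstate_size     __u32 FP_XSTATE_MAGIC2
 */
static inline __u32 __user *xstate_magic2_addr(void __user *fpstate,
					       struct _fpx_sw_bytes *fx_sw)
{
	return (__u32 __user *)(fpstate + fx_sw->xstate_size);
}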
*/ - -int save_i387_xstate(void __user *buf) +static inline int save_fsave_header(struct task_struct *tsk, void __user *buf)  { -	struct task_struct *tsk = current; -	int err = 0; - -	if (!access_ok(VERIFY_WRITE, buf, sig_xstate_size)) -		return -EACCES; - -	BUG_ON(sig_xstate_size < xstate_size); +	if (use_fxsr()) { +		struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; +		struct user_i387_ia32_struct env; +		struct _fpstate_ia32 __user *fp = buf; -	if ((unsigned long)buf % 64) -		pr_err("%s: bad fpstate %p\n", __func__, buf); +		convert_from_fxsr(&env, tsk); -	if (!used_math()) -		return 0; - -	if (user_has_fpu()) { -		if (use_xsave()) -			err = xsave_user(buf); -		else -			err = fxsave_user(buf); - -		if (err) -			return err; -		user_fpu_end(); +		if (__copy_to_user(buf, &env, sizeof(env)) || +		    __put_user(xsave->i387.swd, &fp->status) || +		    __put_user(X86_FXSR_MAGIC, &fp->magic)) +			return -1;  	} else { -		sanitize_i387_state(tsk); -		if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, -				   xstate_size)) +		struct i387_fsave_struct __user *fp = buf; +		u32 swd; +		if (__get_user(swd, &fp->swd) || __put_user(swd, &fp->status))  			return -1;  	} -	clear_used_math(); /* trigger finit */ +	return 0; +} -	if (use_xsave()) { -		struct _fpstate __user *fx = buf; -		struct _xstate __user *x = buf; -		u64 xstate_bv; +static inline int save_xstate_epilog(void __user *buf, int ia32_frame) +{ +	struct xsave_struct __user *x = buf; +	struct _fpx_sw_bytes *sw_bytes; +	u32 xstate_bv; +	int err; -		err = __copy_to_user(&fx->sw_reserved, &fx_sw_reserved, -				     sizeof(struct _fpx_sw_bytes)); +	/* Setup the bytes not touched by the [f]xsave and reserved for SW. */ +	sw_bytes = ia32_frame ? &fx_sw_reserved_ia32 : &fx_sw_reserved; +	err = __copy_to_user(&x->i387.sw_reserved, sw_bytes, sizeof(*sw_bytes)); -		err |= __put_user(FP_XSTATE_MAGIC2, -				  (__u32 __user *) (buf + sig_xstate_size -						    - FP_XSTATE_MAGIC2_SIZE)); +	if (!use_xsave()) +		return err; -		/* -		 * Read the xstate_bv which we copied (directly from the cpu or -		 * from the state in task struct) to the user buffers and -		 * set the FP/SSE bits. -		 */ -		err |= __get_user(xstate_bv, &x->xstate_hdr.xstate_bv); +	err |= __put_user(FP_XSTATE_MAGIC2, (__u32 *)(buf + xstate_size)); -		/* -		 * For legacy compatible, we always set FP/SSE bits in the bit -		 * vector while saving the state to the user context. This will -		 * enable us capturing any changes(during sigreturn) to -		 * the FP/SSE bits by the legacy applications which don't touch -		 * xstate_bv in the xsave header. -		 * -		 * xsave aware apps can change the xstate_bv in the xsave -		 * header as well as change any contents in the memory layout. -		 * xrestore as part of sigreturn will capture all the changes. -		 */ -		xstate_bv |= XSTATE_FPSSE; +	/* +	 * Read the xstate_bv which we copied (directly from the cpu or +	 * from the state in task struct) to the user buffers. +	 */ +	err |= __get_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv); -		err |= __put_user(xstate_bv, &x->xstate_hdr.xstate_bv); +	/* +	 * For legacy compatible, we always set FP/SSE bits in the bit +	 * vector while saving the state to the user context. This will +	 * enable us capturing any changes(during sigreturn) to +	 * the FP/SSE bits by the legacy applications which don't touch +	 * xstate_bv in the xsave header. +	 * +	 * xsave aware apps can change the xstate_bv in the xsave +	 * header as well as change any contents in the memory layout. 
+	 * xrestore as part of sigreturn will capture all the changes. +	 */ +	xstate_bv |= XSTATE_FPSSE; -		if (err) -			return err; -	} +	err |= __put_user(xstate_bv, (__u32 *)&x->xsave_hdr.xstate_bv); -	return 1; +	return err; +} + +static inline int save_user_xstate(struct xsave_struct __user *buf) +{ +	int err; + +	if (use_xsave()) +		err = xsave_user(buf); +	else if (use_fxsr()) +		err = fxsave_user((struct i387_fxsave_struct __user *) buf); +	else +		err = fsave_user((struct i387_fsave_struct __user *) buf); + +	if (unlikely(err) && __clear_user(buf, xstate_size)) +		err = -EFAULT; +	return err;  }  /* - * Restore the extended state if present. Otherwise, restore the FP/SSE - * state. + * Save the fpu, extended register state to the user signal frame. + * + * 'buf_fx' is the 64-byte aligned pointer at which the [f|fx|x]save + *  state is copied. + *  'buf' points to the 'buf_fx' or to the fsave header followed by 'buf_fx'. + * + *	buf == buf_fx for 64-bit frames and 32-bit fsave frame. + *	buf != buf_fx for 32-bit frames with fxstate. + * + * If the fpu, extended register state is live, save the state directly + * to the user frame pointed by the aligned pointer 'buf_fx'. Otherwise, + * copy the thread's fpu state to the user frame starting at 'buf_fx'. + * + * If this is a 32-bit frame with fxstate, put a fsave header before + * the aligned state at 'buf_fx'. + * + * For [f]xsave state, update the SW reserved fields in the [f]xsave frame + * indicating the absence/presence of the extended state to the user.   */ -static int restore_user_xstate(void __user *buf) +int save_xstate_sig(void __user *buf, void __user *buf_fx, int size)  { -	struct _fpx_sw_bytes fx_sw_user; -	u64 mask; -	int err; +	struct xsave_struct *xsave = ¤t->thread.fpu.state->xsave; +	struct task_struct *tsk = current; +	int ia32_fxstate = (buf != buf_fx); -	if (((unsigned long)buf % 64) || -	     check_for_xstate(buf, buf, &fx_sw_user)) -		goto fx_only; +	ia32_fxstate &= (config_enabled(CONFIG_X86_32) || +			 config_enabled(CONFIG_IA32_EMULATION)); -	mask = fx_sw_user.xstate_bv; +	if (!access_ok(VERIFY_WRITE, buf, size)) +		return -EACCES; -	/* -	 * restore the state passed by the user. -	 */ -	err = xrestore_user(buf, mask); -	if (err) -		return err; +	if (!HAVE_HWFP) +		return fpregs_soft_get(current, NULL, 0, +			sizeof(struct user_i387_ia32_struct), NULL, +			(struct _fpstate_ia32 __user *) buf) ? -1 : 1; -	/* -	 * init the state skipped by the user. -	 */ -	mask = pcntxt_mask & ~mask; -	if (unlikely(mask)) -		xrstor_state(init_xstate_buf, mask); +	if (user_has_fpu()) { +		/* Save the live register state to the user directly. */ +		if (save_user_xstate(buf_fx)) +			return -1; +		/* Update the thread's fxstate to save the fsave header. */ +		if (ia32_fxstate) +			fpu_fxsave(&tsk->thread.fpu); +	} else { +		sanitize_i387_state(tsk); +		if (__copy_to_user(buf_fx, xsave, xstate_size)) +			return -1; +	} + +	/* Save the fsave header for the 32-bit frames. */ +	if ((ia32_fxstate || !use_fxsr()) && save_fsave_header(tsk, buf)) +		return -1; + +	if (use_fxsr() && save_xstate_epilog(buf_fx, ia32_fxstate)) +		return -1; + +	drop_init_fpu(tsk);	/* trigger finit */  	return 0; +} -fx_only: -	/* -	 * couldn't find the extended state information in the -	 * memory layout. Restore just the FP/SSE and init all -	 * the other extended state. 
-	 */ -	xrstor_state(init_xstate_buf, pcntxt_mask & ~XSTATE_FPSSE); -	return fxrstor_checking((__force struct i387_fxsave_struct *)buf); +static inline void +sanitize_restored_xstate(struct task_struct *tsk, +			 struct user_i387_ia32_struct *ia32_env, +			 u64 xstate_bv, int fx_only) +{ +	struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; +	struct xsave_hdr_struct *xsave_hdr = &xsave->xsave_hdr; + +	if (use_xsave()) { +		/* These bits must be zero. */ +		xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0; + +		/* +		 * Init the state that is not present in the memory +		 * layout and not enabled by the OS. +		 */ +		if (fx_only) +			xsave_hdr->xstate_bv = XSTATE_FPSSE; +		else +			xsave_hdr->xstate_bv &= (pcntxt_mask & xstate_bv); +	} + +	if (use_fxsr()) { +		/* +		 * mscsr reserved bits must be masked to zero for security +		 * reasons. +		 */ +		xsave->i387.mxcsr &= mxcsr_feature_mask; + +		convert_to_fxsr(tsk, ia32_env); +	}  }  /* - * This restores directly out of user space. Exceptions are handled. + * Restore the extended state if present. Otherwise, restore the FP/SSE state.   */ -int restore_i387_xstate(void __user *buf) +static inline int restore_user_xstate(void __user *buf, u64 xbv, int fx_only)  { +	if (use_xsave()) { +		if ((unsigned long)buf % 64 || fx_only) { +			u64 init_bv = pcntxt_mask & ~XSTATE_FPSSE; +			xrstor_state(init_xstate_buf, init_bv); +			return fxrstor_user(buf); +		} else { +			u64 init_bv = pcntxt_mask & ~xbv; +			if (unlikely(init_bv)) +				xrstor_state(init_xstate_buf, init_bv); +			return xrestore_user(buf, xbv); +		} +	} else if (use_fxsr()) { +		return fxrstor_user(buf); +	} else +		return frstor_user(buf); +} + +int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) +{ +	int ia32_fxstate = (buf != buf_fx);  	struct task_struct *tsk = current; -	int err = 0; +	int state_size = xstate_size; +	u64 xstate_bv = 0; +	int fx_only = 0; + +	ia32_fxstate &= (config_enabled(CONFIG_X86_32) || +			 config_enabled(CONFIG_IA32_EMULATION));  	if (!buf) { -		if (used_math()) -			goto clear; +		drop_init_fpu(tsk);  		return 0; -	} else -		if (!access_ok(VERIFY_READ, buf, sig_xstate_size)) -			return -EACCES; +	} -	if (!used_math()) { -		err = init_fpu(tsk); -		if (err) -			return err; +	if (!access_ok(VERIFY_READ, buf, size)) +		return -EACCES; + +	if (!used_math() && init_fpu(tsk)) +		return -1; + +	if (!HAVE_HWFP) { +		return fpregs_soft_set(current, NULL, +				       0, sizeof(struct user_i387_ia32_struct), +				       NULL, buf) != 0;  	} -	user_fpu_begin(); -	if (use_xsave()) -		err = restore_user_xstate(buf); -	else -		err = fxrstor_checking((__force struct i387_fxsave_struct *) -				       buf); -	if (unlikely(err)) { +	if (use_xsave()) { +		struct _fpx_sw_bytes fx_sw_user; +		if (unlikely(check_for_xstate(buf_fx, buf_fx, &fx_sw_user))) { +			/* +			 * Couldn't find the extended state information in the +			 * memory layout. Restore just the FP/SSE and init all +			 * the other extended state. +			 */ +			state_size = sizeof(struct i387_fxsave_struct); +			fx_only = 1; +		} else { +			state_size = fx_sw_user.xstate_size; +			xstate_bv = fx_sw_user.xstate_bv; +		} +	} + +	if (ia32_fxstate) { +		/* +		 * For 32-bit frames with fxstate, copy the user state to the +		 * thread's fpu state, reconstruct fxstate from the fsave +		 * header. Sanitize the copied state etc. 
+		 */ +		struct xsave_struct *xsave = &tsk->thread.fpu.state->xsave; +		struct user_i387_ia32_struct env; +		int err = 0; + +		/* +		 * Drop the current fpu which clears used_math(). This ensures +		 * that any context-switch during the copy of the new state, +		 * avoids the intermediate state from getting restored/saved. +		 * Thus avoiding the new restored state from getting corrupted. +		 * We will be ready to restore/save the state only after +		 * set_used_math() is again set. +		 */ +		drop_fpu(tsk); + +		if (__copy_from_user(xsave, buf_fx, state_size) || +		    __copy_from_user(&env, buf, sizeof(env))) { +			err = -1; +		} else { +			sanitize_restored_xstate(tsk, &env, xstate_bv, fx_only); +			set_used_math(); +		} + +		if (use_eager_fpu()) +			math_state_restore(); + +		return err; +	} else {  		/* -		 * Encountered an error while doing the restore from the -		 * user buffer, clear the fpu state. +		 * For 64-bit frames and 32-bit fsave frames, restore the user +		 * state to the registers directly (with exceptions handled).  		 */ -clear: -		clear_fpu(tsk); -		clear_used_math(); +		user_fpu_begin(); +		if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) { +			drop_init_fpu(tsk); +			return -1; +		}  	} -	return err; + +	return 0;  } -#endif  /*   * Prepare the SW reserved portion of the fxsave memory layout, indicating @@ -321,31 +428,22 @@ clear:   */  static void prepare_fx_sw_frame(void)  { -	int size_extended = (xstate_size - sizeof(struct i387_fxsave_struct)) + -			     FP_XSTATE_MAGIC2_SIZE; +	int fsave_header_size = sizeof(struct i387_fsave_struct); +	int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; -	sig_xstate_size = sizeof(struct _fpstate) + size_extended; - -#ifdef CONFIG_IA32_EMULATION -	sig_xstate_ia32_size = sizeof(struct _fpstate_ia32) + size_extended; -#endif - -	memset(&fx_sw_reserved, 0, sizeof(fx_sw_reserved)); +	if (config_enabled(CONFIG_X86_32)) +		size += fsave_header_size;  	fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; -	fx_sw_reserved.extended_size = sig_xstate_size; +	fx_sw_reserved.extended_size = size;  	fx_sw_reserved.xstate_bv = pcntxt_mask;  	fx_sw_reserved.xstate_size = xstate_size; -#ifdef CONFIG_IA32_EMULATION -	memcpy(&fx_sw_reserved_ia32, &fx_sw_reserved, -	       sizeof(struct _fpx_sw_bytes)); -	fx_sw_reserved_ia32.extended_size = sig_xstate_ia32_size; -#endif -} -#ifdef CONFIG_X86_64 -unsigned int sig_xstate_size = sizeof(struct _fpstate); -#endif +	if (config_enabled(CONFIG_IA32_EMULATION)) { +		fx_sw_reserved_ia32 = fx_sw_reserved; +		fx_sw_reserved_ia32.extended_size += fsave_header_size; +	} +}  /*   * Enable the extended processor state save/restore feature @@ -384,19 +482,21 @@ static void __init setup_xstate_features(void)  /*   * setup the xstate image representing the init state   */ -static void __init setup_xstate_init(void) +static void __init setup_init_fpu_buf(void)  { -	setup_xstate_features(); -  	/*  	 * Setup init_xstate_buf to represent the init state of  	 * all the features managed by the xsave  	 */  	init_xstate_buf = alloc_bootmem_align(xstate_size,  					      __alignof__(struct xsave_struct)); -	init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; +	fx_finit(&init_xstate_buf->i387); + +	if (!cpu_has_xsave) +		return; + +	setup_xstate_features(); -	clts();  	/*  	 * Init all the features state with header_bv being 0x0  	 */ @@ -406,9 +506,21 @@ static void __init setup_xstate_init(void)  	 * of any feature which is not represented by all zero's.  	 
*/  	xsave_state(init_xstate_buf, -1); -	stts();  } +static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO; +static int __init eager_fpu_setup(char *s) +{ +	if (!strcmp(s, "on")) +		eagerfpu = ENABLE; +	else if (!strcmp(s, "off")) +		eagerfpu = DISABLE; +	else if (!strcmp(s, "auto")) +		eagerfpu = AUTO; +	return 1; +} +__setup("eagerfpu=", eager_fpu_setup); +  /*   * Enable and initialize the xsave feature.   */ @@ -445,8 +557,11 @@ static void __init xstate_enable_boot_cpu(void)  	update_regset_xstate_info(xstate_size, pcntxt_mask);  	prepare_fx_sw_frame(); +	setup_init_fpu_buf(); -	setup_xstate_init(); +	/* Auto enable eagerfpu for xsaveopt */ +	if (cpu_has_xsaveopt && eagerfpu != DISABLE) +		eagerfpu = ENABLE;  	pr_info("enabled xstate_bv 0x%llx, cntxt size 0x%x\n",  		pcntxt_mask, xstate_size); @@ -471,3 +586,43 @@ void __cpuinit xsave_init(void)  	next_func = xstate_enable;  	this_func();  } + +static inline void __init eager_fpu_init_bp(void) +{ +	current->thread.fpu.state = +	    alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct)); +	if (!init_xstate_buf) +		setup_init_fpu_buf(); +} + +void __cpuinit eager_fpu_init(void) +{ +	static __refdata void (*boot_func)(void) = eager_fpu_init_bp; + +	clear_used_math(); +	current_thread_info()->status = 0; + +	if (eagerfpu == ENABLE) +		setup_force_cpu_cap(X86_FEATURE_EAGER_FPU); + +	if (!cpu_has_eager_fpu) { +		stts(); +		return; +	} + +	if (boot_func) { +		boot_func(); +		boot_func = NULL; +	} + +	/* +	 * This is same as math_state_restore(). But use_xsave() is +	 * not yet patched to use math_state_restore(). +	 */ +	init_fpu(current); +	__thread_fpu_begin(current); +	if (cpu_has_xsave) +		xrstor_state(init_xstate_buf, -1); +	else +		fxrstor_checking(&init_xstate_buf->i387); +} diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index e498b18f010..9fc9aa7ac70 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -318,7 +318,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)  		if (val & 0x10) {  			u8 edge_irr = s->irr & ~s->elcr;  			int i; -			bool found; +			bool found = false;  			struct kvm_vcpu *vcpu;  			s->init4 = val & 1; diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index a71faf727ff..bca63f04dcc 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -183,95 +183,6 @@ TRACE_EVENT(kvm_apic,  #define KVM_ISA_VMX   1  #define KVM_ISA_SVM   2 -#define VMX_EXIT_REASONS \ -	{ EXIT_REASON_EXCEPTION_NMI,		"EXCEPTION_NMI" }, \ -	{ EXIT_REASON_EXTERNAL_INTERRUPT,	"EXTERNAL_INTERRUPT" }, \ -	{ EXIT_REASON_TRIPLE_FAULT,		"TRIPLE_FAULT" }, \ -	{ EXIT_REASON_PENDING_INTERRUPT,	"PENDING_INTERRUPT" }, \ -	{ EXIT_REASON_NMI_WINDOW,		"NMI_WINDOW" }, \ -	{ EXIT_REASON_TASK_SWITCH,		"TASK_SWITCH" }, \ -	{ EXIT_REASON_CPUID,			"CPUID" }, \ -	{ EXIT_REASON_HLT,			"HLT" }, \ -	{ EXIT_REASON_INVLPG,			"INVLPG" }, \ -	{ EXIT_REASON_RDPMC,			"RDPMC" }, \ -	{ EXIT_REASON_RDTSC,			"RDTSC" }, \ -	{ EXIT_REASON_VMCALL,			"VMCALL" }, \ -	{ EXIT_REASON_VMCLEAR,			"VMCLEAR" }, \ -	{ EXIT_REASON_VMLAUNCH,			"VMLAUNCH" }, \ -	{ EXIT_REASON_VMPTRLD,			"VMPTRLD" }, \ -	{ EXIT_REASON_VMPTRST,			"VMPTRST" }, \ -	{ EXIT_REASON_VMREAD,			"VMREAD" }, \ -	{ EXIT_REASON_VMRESUME,			"VMRESUME" }, \ -	{ EXIT_REASON_VMWRITE,			"VMWRITE" }, \ -	{ EXIT_REASON_VMOFF,			"VMOFF" }, \ -	{ EXIT_REASON_VMON,			"VMON" }, \ -	{ EXIT_REASON_CR_ACCESS,		"CR_ACCESS" }, \ -	{ EXIT_REASON_DR_ACCESS,		"DR_ACCESS" }, \ -	{ EXIT_REASON_IO_INSTRUCTION,		"IO_INSTRUCTION" }, \ -	{ EXIT_REASON_MSR_READ,			"MSR_READ" }, \ -	{ 
EXIT_REASON_MSR_WRITE,		"MSR_WRITE" }, \ -	{ EXIT_REASON_MWAIT_INSTRUCTION,	"MWAIT_INSTRUCTION" }, \ -	{ EXIT_REASON_MONITOR_INSTRUCTION,	"MONITOR_INSTRUCTION" }, \ -	{ EXIT_REASON_PAUSE_INSTRUCTION,	"PAUSE_INSTRUCTION" }, \ -	{ EXIT_REASON_MCE_DURING_VMENTRY,	"MCE_DURING_VMENTRY" }, \ -	{ EXIT_REASON_TPR_BELOW_THRESHOLD,	"TPR_BELOW_THRESHOLD" },	\ -	{ EXIT_REASON_APIC_ACCESS,		"APIC_ACCESS" }, \ -	{ EXIT_REASON_EPT_VIOLATION,		"EPT_VIOLATION" }, \ -	{ EXIT_REASON_EPT_MISCONFIG,		"EPT_MISCONFIG" }, \ -	{ EXIT_REASON_WBINVD,			"WBINVD" } - -#define SVM_EXIT_REASONS \ -	{ SVM_EXIT_READ_CR0,			"read_cr0" }, \ -	{ SVM_EXIT_READ_CR3,			"read_cr3" }, \ -	{ SVM_EXIT_READ_CR4,			"read_cr4" }, \ -	{ SVM_EXIT_READ_CR8,			"read_cr8" }, \ -	{ SVM_EXIT_WRITE_CR0,			"write_cr0" }, \ -	{ SVM_EXIT_WRITE_CR3,			"write_cr3" }, \ -	{ SVM_EXIT_WRITE_CR4,			"write_cr4" }, \ -	{ SVM_EXIT_WRITE_CR8,			"write_cr8" }, \ -	{ SVM_EXIT_READ_DR0,			"read_dr0" }, \ -	{ SVM_EXIT_READ_DR1,			"read_dr1" }, \ -	{ SVM_EXIT_READ_DR2,			"read_dr2" }, \ -	{ SVM_EXIT_READ_DR3,			"read_dr3" }, \ -	{ SVM_EXIT_WRITE_DR0,			"write_dr0" }, \ -	{ SVM_EXIT_WRITE_DR1,			"write_dr1" }, \ -	{ SVM_EXIT_WRITE_DR2,			"write_dr2" }, \ -	{ SVM_EXIT_WRITE_DR3,			"write_dr3" }, \ -	{ SVM_EXIT_WRITE_DR5,			"write_dr5" }, \ -	{ SVM_EXIT_WRITE_DR7,			"write_dr7" }, \ -	{ SVM_EXIT_EXCP_BASE + DB_VECTOR,	"DB excp" }, \ -	{ SVM_EXIT_EXCP_BASE + BP_VECTOR,	"BP excp" }, \ -	{ SVM_EXIT_EXCP_BASE + UD_VECTOR,	"UD excp" }, \ -	{ SVM_EXIT_EXCP_BASE + PF_VECTOR,	"PF excp" }, \ -	{ SVM_EXIT_EXCP_BASE + NM_VECTOR,	"NM excp" }, \ -	{ SVM_EXIT_EXCP_BASE + MC_VECTOR,	"MC excp" }, \ -	{ SVM_EXIT_INTR,			"interrupt" }, \ -	{ SVM_EXIT_NMI,				"nmi" }, \ -	{ SVM_EXIT_SMI,				"smi" }, \ -	{ SVM_EXIT_INIT,			"init" }, \ -	{ SVM_EXIT_VINTR,			"vintr" }, \ -	{ SVM_EXIT_CPUID,			"cpuid" }, \ -	{ SVM_EXIT_INVD,			"invd" }, \ -	{ SVM_EXIT_HLT,				"hlt" }, \ -	{ SVM_EXIT_INVLPG,			"invlpg" }, \ -	{ SVM_EXIT_INVLPGA,			"invlpga" }, \ -	{ SVM_EXIT_IOIO,			"io" }, \ -	{ SVM_EXIT_MSR,				"msr" }, \ -	{ SVM_EXIT_TASK_SWITCH,			"task_switch" }, \ -	{ SVM_EXIT_SHUTDOWN,			"shutdown" }, \ -	{ SVM_EXIT_VMRUN,			"vmrun" }, \ -	{ SVM_EXIT_VMMCALL,			"hypercall" }, \ -	{ SVM_EXIT_VMLOAD,			"vmload" }, \ -	{ SVM_EXIT_VMSAVE,			"vmsave" }, \ -	{ SVM_EXIT_STGI,			"stgi" }, \ -	{ SVM_EXIT_CLGI,			"clgi" }, \ -	{ SVM_EXIT_SKINIT,			"skinit" }, \ -	{ SVM_EXIT_WBINVD,			"wbinvd" }, \ -	{ SVM_EXIT_MONITOR,			"monitor" }, \ -	{ SVM_EXIT_MWAIT,			"mwait" }, \ -	{ SVM_EXIT_XSETBV,			"xsetbv" }, \ -	{ SVM_EXIT_NPF,				"npf" } -  /*   * Tracepoint for kvm guest exit:   */ diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c00f03de1b7..851aa7c3b89 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1493,8 +1493,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)  #ifdef CONFIG_X86_64  	wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);  #endif -	if (user_has_fpu()) -		clts(); +	/* +	 * If the FPU is not active (through the host task or +	 * the guest vcpu), then restore the cr0.TS bit. 
+	 */ +	if (!user_has_fpu() && !vmx->vcpu.guest_fpu_loaded) +		stts();  	load_gdt(&__get_cpu_var(host_gdt));  } @@ -3619,6 +3623,7 @@ static void seg_setup(int seg)  static int alloc_apic_access_page(struct kvm *kvm)  { +	struct page *page;  	struct kvm_userspace_memory_region kvm_userspace_mem;  	int r = 0; @@ -3633,7 +3638,13 @@ static int alloc_apic_access_page(struct kvm *kvm)  	if (r)  		goto out; -	kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); +	page = gfn_to_page(kvm, 0xfee00); +	if (is_error_page(page)) { +		r = -EFAULT; +		goto out; +	} + +	kvm->arch.apic_access_page = page;  out:  	mutex_unlock(&kvm->slots_lock);  	return r; @@ -3641,6 +3652,7 @@ out:  static int alloc_identity_pagetable(struct kvm *kvm)  { +	struct page *page;  	struct kvm_userspace_memory_region kvm_userspace_mem;  	int r = 0; @@ -3656,8 +3668,13 @@ static int alloc_identity_pagetable(struct kvm *kvm)  	if (r)  		goto out; -	kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, -			kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); +	page = gfn_to_page(kvm, kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); +	if (is_error_page(page)) { +		r = -EFAULT; +		goto out; +	} + +	kvm->arch.ept_identity_pagetable = page;  out:  	mutex_unlock(&kvm->slots_lock);  	return r; @@ -3730,7 +3747,7 @@ static void vmx_set_constant_host_state(void)  	unsigned long tmpl;  	struct desc_ptr dt; -	vmcs_writel(HOST_CR0, read_cr0() | X86_CR0_TS);  /* 22.2.3 */ +	vmcs_writel(HOST_CR0, read_cr0() & ~X86_CR0_TS);  /* 22.2.3 */  	vmcs_writel(HOST_CR4, read_cr4());  /* 22.2.3, 22.2.5 */  	vmcs_writel(HOST_CR3, read_cr3());  /* 22.2.3  FIXME: shadow tables */ @@ -4530,7 +4547,7 @@ static int handle_cr(struct kvm_vcpu *vcpu)  				vcpu->run->exit_reason = KVM_EXIT_SET_TPR;  				return 0;  			} -		}; +		}  		break;  	case 2: /* clts */  		handle_clts(vcpu); @@ -6575,7 +6592,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)  	/* Exposing INVPCID only when PCID is exposed */  	best = kvm_find_cpuid_entry(vcpu, 0x7, 0);  	if (vmx_invpcid_supported() && -	    best && (best->ecx & bit(X86_FEATURE_INVPCID)) && +	    best && (best->ebx & bit(X86_FEATURE_INVPCID)) &&  	    guest_cpuid_has_pcid(vcpu)) {  		exec_control |= SECONDARY_EXEC_ENABLE_INVPCID;  		vmcs_write32(SECONDARY_VM_EXEC_CONTROL, @@ -6585,7 +6602,7 @@ static void vmx_cpuid_update(struct kvm_vcpu *vcpu)  		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,  			     exec_control);  		if (best) -			best->ecx &= ~bit(X86_FEATURE_INVPCID); +			best->ebx &= ~bit(X86_FEATURE_INVPCID);  	}  } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 148ed666e31..1f09552572f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5113,17 +5113,20 @@ static void post_kvm_run_save(struct kvm_vcpu *vcpu)  			!kvm_event_needs_reinjection(vcpu);  } -static void vapic_enter(struct kvm_vcpu *vcpu) +static int vapic_enter(struct kvm_vcpu *vcpu)  {  	struct kvm_lapic *apic = vcpu->arch.apic;  	struct page *page;  	if (!apic || !apic->vapic_addr) -		return; +		return 0;  	page = gfn_to_page(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); +	if (is_error_page(page)) +		return -EFAULT;  	vcpu->arch.apic->vapic_page = page; +	return 0;  }  static void vapic_exit(struct kvm_vcpu *vcpu) @@ -5430,7 +5433,11 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)  	}  	vcpu->srcu_idx = srcu_read_lock(&kvm->srcu); -	vapic_enter(vcpu); +	r = vapic_enter(vcpu); +	if (r) { +		srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); +		return r; +	}  	r = 1;  	while (r > 0) { @@ -5972,7 +5979,7 @@ void kvm_load_guest_fpu(struct kvm_vcpu 
*vcpu)  	 */  	kvm_put_guest_xcr0(vcpu);  	vcpu->guest_fpu_loaded = 1; -	unlazy_fpu(current); +	__kernel_fpu_begin();  	fpu_restore_checking(&vcpu->arch.guest_fpu);  	trace_kvm_fpu(1);  } @@ -5986,6 +5993,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)  	vcpu->guest_fpu_loaded = 0;  	fpu_save_init(&vcpu->arch.guest_fpu); +	__kernel_fpu_end();  	++vcpu->stat.fpu_reload;  	kvm_make_request(KVM_REQ_DEACTIVATE_FPU, vcpu);  	trace_kvm_fpu(0); diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 5b2995f4557..a30ca15be21 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -17,6 +17,7 @@  #include <asm/cpufeature.h>  #include <asm/alternative-asm.h>  #include <asm/asm.h> +#include <asm/smap.h>  /*   * By placing feature2 after feature1 in altinstructions section, we logically @@ -130,6 +131,7 @@ ENDPROC(bad_from_user)   */  ENTRY(copy_user_generic_unrolled)  	CFI_STARTPROC +	ASM_STAC  	cmpl $8,%edx  	jb 20f		/* less then 8 bytes, go to byte copy loop */  	ALIGN_DESTINATION @@ -177,6 +179,7 @@ ENTRY(copy_user_generic_unrolled)  	decl %ecx  	jnz 21b  23:	xor %eax,%eax +	ASM_CLAC  	ret  	.section .fixup,"ax" @@ -232,6 +235,7 @@ ENDPROC(copy_user_generic_unrolled)   */  ENTRY(copy_user_generic_string)  	CFI_STARTPROC +	ASM_STAC  	andl %edx,%edx  	jz 4f  	cmpl $8,%edx @@ -246,6 +250,7 @@ ENTRY(copy_user_generic_string)  3:	rep  	movsb  4:	xorl %eax,%eax +	ASM_CLAC  	ret  	.section .fixup,"ax" @@ -273,12 +278,14 @@ ENDPROC(copy_user_generic_string)   */  ENTRY(copy_user_enhanced_fast_string)  	CFI_STARTPROC +	ASM_STAC  	andl %edx,%edx  	jz 2f  	movl %edx,%ecx  1:	rep  	movsb  2:	xorl %eax,%eax +	ASM_CLAC  	ret  	.section .fixup,"ax" diff --git a/arch/x86/lib/copy_user_nocache_64.S b/arch/x86/lib/copy_user_nocache_64.S index cacddc7163e..6a4f43c2d9e 100644 --- a/arch/x86/lib/copy_user_nocache_64.S +++ b/arch/x86/lib/copy_user_nocache_64.S @@ -15,6 +15,7 @@  #include <asm/asm-offsets.h>  #include <asm/thread_info.h>  #include <asm/asm.h> +#include <asm/smap.h>  	.macro ALIGN_DESTINATION  #ifdef FIX_ALIGNMENT @@ -48,6 +49,7 @@   */  ENTRY(__copy_user_nocache)  	CFI_STARTPROC +	ASM_STAC  	cmpl $8,%edx  	jb 20f		/* less then 8 bytes, go to byte copy loop */  	ALIGN_DESTINATION @@ -95,6 +97,7 @@ ENTRY(__copy_user_nocache)  	decl %ecx  	jnz 21b  23:	xorl %eax,%eax +	ASM_CLAC  	sfence  	ret diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index b33b1fb1e6d..156b9c80467 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -33,6 +33,7 @@  #include <asm/asm-offsets.h>  #include <asm/thread_info.h>  #include <asm/asm.h> +#include <asm/smap.h>  	.text  ENTRY(__get_user_1) @@ -40,8 +41,10 @@ ENTRY(__get_user_1)  	GET_THREAD_INFO(%_ASM_DX)  	cmp TI_addr_limit(%_ASM_DX),%_ASM_AX  	jae bad_get_user +	ASM_STAC  1:	movzb (%_ASM_AX),%edx  	xor %eax,%eax +	ASM_CLAC  	ret  	CFI_ENDPROC  ENDPROC(__get_user_1) @@ -53,8 +56,10 @@ ENTRY(__get_user_2)  	GET_THREAD_INFO(%_ASM_DX)  	cmp TI_addr_limit(%_ASM_DX),%_ASM_AX  	jae bad_get_user +	ASM_STAC  2:	movzwl -1(%_ASM_AX),%edx  	xor %eax,%eax +	ASM_CLAC  	ret  	CFI_ENDPROC  ENDPROC(__get_user_2) @@ -66,8 +71,10 @@ ENTRY(__get_user_4)  	GET_THREAD_INFO(%_ASM_DX)  	cmp TI_addr_limit(%_ASM_DX),%_ASM_AX  	jae bad_get_user +	ASM_STAC  3:	mov -3(%_ASM_AX),%edx  	xor %eax,%eax +	ASM_CLAC  	ret  	CFI_ENDPROC  ENDPROC(__get_user_4) @@ -80,8 +87,10 @@ ENTRY(__get_user_8)  	GET_THREAD_INFO(%_ASM_DX)  	cmp TI_addr_limit(%_ASM_DX),%_ASM_AX  	jae	bad_get_user +	ASM_STAC  4:	movq -7(%_ASM_AX),%_ASM_DX  	xor %eax,%eax 
+	ASM_CLAC  	ret  	CFI_ENDPROC  ENDPROC(__get_user_8) @@ -91,6 +100,7 @@ bad_get_user:  	CFI_STARTPROC  	xor %edx,%edx  	mov $(-EFAULT),%_ASM_AX +	ASM_CLAC  	ret  	CFI_ENDPROC  END(bad_get_user) diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index 7f951c8f76c..fc6ba17a7ee 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -15,6 +15,7 @@  #include <asm/thread_info.h>  #include <asm/errno.h>  #include <asm/asm.h> +#include <asm/smap.h>  /* @@ -31,7 +32,8 @@  #define ENTER	CFI_STARTPROC ; \  		GET_THREAD_INFO(%_ASM_BX) -#define EXIT	ret ; \ +#define EXIT	ASM_CLAC ;	\ +		ret ;		\  		CFI_ENDPROC  .text @@ -39,6 +41,7 @@ ENTRY(__put_user_1)  	ENTER  	cmp TI_addr_limit(%_ASM_BX),%_ASM_CX  	jae bad_put_user +	ASM_STAC  1:	movb %al,(%_ASM_CX)  	xor %eax,%eax  	EXIT @@ -50,6 +53,7 @@ ENTRY(__put_user_2)  	sub $1,%_ASM_BX  	cmp %_ASM_BX,%_ASM_CX  	jae bad_put_user +	ASM_STAC  2:	movw %ax,(%_ASM_CX)  	xor %eax,%eax  	EXIT @@ -61,6 +65,7 @@ ENTRY(__put_user_4)  	sub $3,%_ASM_BX  	cmp %_ASM_BX,%_ASM_CX  	jae bad_put_user +	ASM_STAC  3:	movl %eax,(%_ASM_CX)  	xor %eax,%eax  	EXIT @@ -72,6 +77,7 @@ ENTRY(__put_user_8)  	sub $7,%_ASM_BX  	cmp %_ASM_BX,%_ASM_CX  	jae bad_put_user +	ASM_STAC  4:	mov %_ASM_AX,(%_ASM_CX)  #ifdef CONFIG_X86_32  5:	movl %edx,4(%_ASM_CX) diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c index 1781b2f950e..98f6d6b68f5 100644 --- a/arch/x86/lib/usercopy_32.c +++ b/arch/x86/lib/usercopy_32.c @@ -42,10 +42,11 @@ do {									\  	int __d0;							\  	might_fault();							\  	__asm__ __volatile__(						\ +		ASM_STAC "\n"						\  		"0:	rep; stosl\n"					\  		"	movl %2,%0\n"					\  		"1:	rep; stosb\n"					\ -		"2:\n"							\ +		"2: " ASM_CLAC "\n"					\  		".section .fixup,\"ax\"\n"				\  		"3:	lea 0(%2,%0,4),%0\n"				\  		"	jmp 2b\n"					\ @@ -626,10 +627,12 @@ survive:  		return n;  	}  #endif +	stac();  	if (movsl_is_ok(to, from, n))  		__copy_user(to, from, n);  	else  		n = __copy_user_intel(to, from, n); +	clac();  	return n;  }  EXPORT_SYMBOL(__copy_to_user_ll); @@ -637,10 +640,12 @@ EXPORT_SYMBOL(__copy_to_user_ll);  unsigned long __copy_from_user_ll(void *to, const void __user *from,  					unsigned long n)  { +	stac();  	if (movsl_is_ok(to, from, n))  		__copy_user_zeroing(to, from, n);  	else  		n = __copy_user_zeroing_intel(to, from, n); +	clac();  	return n;  }  EXPORT_SYMBOL(__copy_from_user_ll); @@ -648,11 +653,13 @@ EXPORT_SYMBOL(__copy_from_user_ll);  unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from,  					 unsigned long n)  { +	stac();  	if (movsl_is_ok(to, from, n))  		__copy_user(to, from, n);  	else  		n = __copy_user_intel((void __user *)to,  				      (const void *)from, n); +	clac();  	return n;  }  EXPORT_SYMBOL(__copy_from_user_ll_nozero); @@ -660,6 +667,7 @@ EXPORT_SYMBOL(__copy_from_user_ll_nozero);  unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,  					unsigned long n)  { +	stac();  #ifdef CONFIG_X86_INTEL_USERCOPY  	if (n > 64 && cpu_has_xmm2)  		n = __copy_user_zeroing_intel_nocache(to, from, n); @@ -668,6 +676,7 @@ unsigned long __copy_from_user_ll_nocache(void *to, const void __user *from,  #else  	__copy_user_zeroing(to, from, n);  #endif +	clac();  	return n;  }  EXPORT_SYMBOL(__copy_from_user_ll_nocache); @@ -675,6 +684,7 @@ EXPORT_SYMBOL(__copy_from_user_ll_nocache);  unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *from,  					unsigned long n)  { +	stac();  #ifdef CONFIG_X86_INTEL_USERCOPY  	if (n > 64 && cpu_has_xmm2)  	
	n = __copy_user_intel_nocache(to, from, n); @@ -683,6 +693,7 @@ unsigned long __copy_from_user_ll_nocache_nozero(void *to, const void __user *fr  #else  	__copy_user(to, from, n);  #endif +	clac();  	return n;  }  EXPORT_SYMBOL(__copy_from_user_ll_nocache_nozero); diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index e5b130bc2d0..05928aae911 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -18,6 +18,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size)  	might_fault();  	/* no memory constraint because it doesn't change any memory gcc knows  	   about */ +	stac();  	asm volatile(  		"	testq  %[size8],%[size8]\n"  		"	jz     4f\n" @@ -40,6 +41,7 @@ unsigned long __clear_user(void __user *addr, unsigned long size)  		: [size8] "=&c"(size), [dst] "=&D" (__d0)  		: [size1] "r"(size & 7), "[size8]" (size / 8), "[dst]"(addr),  		  [zero] "r" (0UL), [eight] "r" (8UL)); +	clac();  	return size;  }  EXPORT_SYMBOL(__clear_user); @@ -82,5 +84,6 @@ copy_user_handle_tail(char *to, char *from, unsigned len, unsigned zerorest)  	for (c = 0, zero_len = len; zerorest && zero_len; --zero_len)  		if (__put_user_nocheck(c, to++, sizeof(char)))  			break; +	clac();  	return len;  } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 76dcd9d8e0b..a530b230e7d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -18,6 +18,7 @@  #include <asm/pgalloc.h>		/* pgd_*(), ...			*/  #include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/  #include <asm/fixmap.h>			/* VSYSCALL_START		*/ +#include <asm/rcu.h>			/* exception_enter(), ...	*/  /*   * Page fault error code bits: @@ -995,13 +996,24 @@ static int fault_in_kernel_space(unsigned long address)  	return address >= TASK_SIZE_MAX;  } +static inline bool smap_violation(int error_code, struct pt_regs *regs) +{ +	if (error_code & PF_USER) +		return false; + +	if (!user_mode_vm(regs) && (regs->flags & X86_EFLAGS_AC)) +		return false; + +	return true; +} +  /*   * This routine handles page faults.  It determines the address,   * and the problem, and then passes it off to one of the appropriate   * routines.   
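For context on the ASM_STAC/ASM_CLAC and stac()/clac() pairs added to the user-access paths above, here is a rough reconstruction of the C helpers from the new <asm/smap.h> (the header itself is not in this section, so treat the exact bodies as an assumption). They toggle EFLAGS.AC around the deliberate user access, and the alternatives machinery patches them out on CPUs without X86_FEATURE_SMAP, so the annotation costs close to nothing there. The fault-side counterpart is smap_violation() just above: a kernel-mode fault on a user address with AC clear is reported as a bad area instead of being serviced.

static __always_inline void stac(void)
{
	/* becomes a real STAC only when X86_FEATURE_SMAP is set */
	alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP);
}

static __always_inline void clac(void)
{
	/* drop EFLAGS.AC again once the user access is done */
	alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
}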
*/ -dotraplinkage void __kprobes -do_page_fault(struct pt_regs *regs, unsigned long error_code) +static void __kprobes +__do_page_fault(struct pt_regs *regs, unsigned long error_code)  {  	struct vm_area_struct *vma;  	struct task_struct *tsk; @@ -1088,6 +1100,13 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)  	if (unlikely(error_code & PF_RSVD))  		pgtable_bad(regs, error_code, address); +	if (static_cpu_has(X86_FEATURE_SMAP)) { +		if (unlikely(smap_violation(error_code, regs))) { +			bad_area_nosemaphore(regs, error_code, address); +			return; +		} +	} +  	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);  	/* @@ -1209,3 +1228,11 @@ good_area:  	up_read(&mm->mmap_sem);  } + +dotraplinkage void __kprobes +do_page_fault(struct pt_regs *regs, unsigned long error_code) +{ +	exception_enter(regs); +	__do_page_fault(regs, error_code); +	exception_exit(regs); +} diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index e0e6990723e..ab1f6a93b52 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -319,7 +319,7 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,   */  int devmem_is_allowed(unsigned long pagenr)  { -	if (pagenr <= 256) +	if (pagenr < 256)  		return 1;  	if (iomem_is_exclusive(pagenr << PAGE_SHIFT))  		return 0; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 575d86f85ce..11a58001b4c 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -445,10 +445,10 @@ static inline void permanent_kmaps_init(pgd_t *pgd_base)  }  #endif /* CONFIG_HIGHMEM */ -void __init native_pagetable_setup_start(pgd_t *base) +void __init native_pagetable_init(void)  {  	unsigned long pfn, va; -	pgd_t *pgd; +	pgd_t *pgd, *base = swapper_pg_dir;  	pud_t *pud;  	pmd_t *pmd;  	pte_t *pte; @@ -475,10 +475,7 @@ void __init native_pagetable_setup_start(pgd_t *base)  		pte_clear(NULL, va, pte);  	}  	paravirt_alloc_pmd(&init_mm, __pa(base) >> PAGE_SHIFT); -} - -void __init native_pagetable_setup_done(pgd_t *base) -{ +	paging_init();  }  /* @@ -493,7 +490,7 @@ void __init native_pagetable_setup_done(pgd_t *base)   * If we're booting paravirtualized under a hypervisor, then there are   * more options: we may already be running PAE, and the pagetable may   * or may not be based in swapper_pg_dir.  In any case, - * paravirt_pagetable_setup_start() will set up swapper_pg_dir + * paravirt_pagetable_init() will set up swapper_pg_dir   * appropriately for the rest of the initialization to work.   
*   * In general, pagetable_init() assumes that the pagetable may already @@ -712,7 +709,7 @@ static void __init test_wp_bit(void)    "Checking if this processor honours the WP bit even in supervisor mode...");  	/* Any page-aligned address will do, the test is non-destructive */ -	__set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_READONLY); +	__set_fixmap(FIX_WP_TEST, __pa(&swapper_pg_dir), PAGE_KERNEL_RO);  	boot_cpu_data.wp_works_ok = do_test_wp_bit();  	clear_fixmap(FIX_WP_TEST); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 613cd83e8c0..0777f042e40 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -98,6 +98,8 @@ static void flush_tlb_func(void *info)  {  	struct flush_tlb_info *f = info; +	inc_irq_stat(irq_tlb_count); +  	if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm))  		return; @@ -320,7 +322,7 @@ static ssize_t tlbflush_write_file(struct file *file,  	if (kstrtos8(buf, 0, &shift))  		return -EINVAL; -	if (shift > 64) +	if (shift < -1 || shift >= BITS_PER_LONG)  		return -EINVAL;  	tlb_flushall_shift = shift; diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 505acdd6d60..192397c9860 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -305,7 +305,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)  	res->flags = flags;  	res->start = start;  	res->end = end; -	res->child = NULL;  	if (!pci_use_crs) {  		dev_printk(KERN_DEBUG, &info->bridge->dev, @@ -434,7 +433,7 @@ probe_pci_root_info(struct pci_root_info *info, struct acpi_device *device,  	size = sizeof(*info->res) * info->res_num;  	info->res_num = 0; -	info->res = kmalloc(size, GFP_KERNEL); +	info->res = kzalloc(size, GFP_KERNEL);  	if (!info->res)  		return; diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index 937bcece700..704b9ec043d 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -585,7 +585,7 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)  	while (i >= sizeof(struct acpi_mcfg_allocation)) {  		entries++;  		i -= sizeof(struct acpi_mcfg_allocation); -	}; +	}  	if (entries == 0) {  		pr_err(PREFIX "MMCONFIG has no entries\n");  		return -ENODEV; diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c index 6f2f8eeed17..3e6d2a6db86 100644 --- a/arch/x86/pci/visws.c +++ b/arch/x86/pci/visws.c @@ -62,11 +62,6 @@ out:  	return irq;  } -void __init pcibios_update_irq(struct pci_dev *dev, int irq) -{ -	pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); -} -  int __init pci_visws_init(void)  {  	pcibios_enable_irq = &pci_visws_enable_irq; diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile index 73b8be0f367..6db1cc4c753 100644 --- a/arch/x86/platform/efi/Makefile +++ b/arch/x86/platform/efi/Makefile @@ -1 +1,2 @@  obj-$(CONFIG_EFI) 		+= efi.o efi_$(BITS).o efi_stub_$(BITS).o +obj-$(CONFIG_ACPI_BGRT) += efi-bgrt.o diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c new file mode 100644 index 00000000000..f6a0c1b8e51 --- /dev/null +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -0,0 +1,76 @@ +/* + * Copyright 2012 Intel Corporation + * Author: Josh Triplett <josh@joshtriplett.org> + * + * Based on the bgrt driver: + * Copyright 2012 Red Hat, Inc <mjg@redhat.com> + * Author: Matthew Garrett + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ +#include <linux/kernel.h> +#include <linux/acpi.h> +#include <linux/efi.h> +#include <linux/efi-bgrt.h> + +struct acpi_table_bgrt *bgrt_tab; +void *bgrt_image; +size_t bgrt_image_size; + +struct bmp_header { +	u16 id; +	u32 size; +} __packed; + +void efi_bgrt_init(void) +{ +	acpi_status status; +	void __iomem *image; +	bool ioremapped = false; +	struct bmp_header bmp_header; + +	if (acpi_disabled) +		return; + +	status = acpi_get_table("BGRT", 0, +	                        (struct acpi_table_header **)&bgrt_tab); +	if (ACPI_FAILURE(status)) +		return; + +	if (bgrt_tab->version != 1) +		return; +	if (bgrt_tab->image_type != 0 || !bgrt_tab->image_address) +		return; + +	image = efi_lookup_mapped_addr(bgrt_tab->image_address); +	if (!image) { +		image = ioremap(bgrt_tab->image_address, sizeof(bmp_header)); +		ioremapped = true; +		if (!image) +			return; +	} + +	memcpy_fromio(&bmp_header, image, sizeof(bmp_header)); +	if (ioremapped) +		iounmap(image); +	bgrt_image_size = bmp_header.size; + +	bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL); +	if (!bgrt_image) +		return; + +	if (ioremapped) { +		image = ioremap(bgrt_tab->image_address, bmp_header.size); +		if (!image) { +			kfree(bgrt_image); +			bgrt_image = NULL; +			return; +		} +	} + +	memcpy_fromio(bgrt_image, image, bgrt_image_size); +	if (ioremapped) +		iounmap(image); +} diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 92660edaa1e..aded2a91162 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -31,6 +31,7 @@  #include <linux/kernel.h>  #include <linux/init.h>  #include <linux/efi.h> +#include <linux/efi-bgrt.h>  #include <linux/export.h>  #include <linux/bootmem.h>  #include <linux/memblock.h> @@ -419,10 +420,21 @@ void __init efi_reserve_boot_services(void)  	}  } -static void __init efi_free_boot_services(void) +static void __init efi_unmap_memmap(void) +{ +	if (memmap.map) { +		early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); +		memmap.map = NULL; +	} +} + +void __init efi_free_boot_services(void)  {  	void *p; +	if (!efi_native) +		return; +  	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {  		efi_memory_desc_t *md = p;  		unsigned long long start = md->phys_addr; @@ -438,6 +450,8 @@ static void __init efi_free_boot_services(void)  		free_bootmem_late(start, size);  	} + +	efi_unmap_memmap();  }  static int __init efi_systab_init(void *phys) @@ -732,6 +746,11 @@ void __init efi_init(void)  #endif  } +void __init efi_late_init(void) +{ +	efi_bgrt_init(); +} +  void __init efi_set_executable(efi_memory_desc_t *md, bool executable)  {  	u64 addr, npages; @@ -764,6 +783,34 @@ static void __init runtime_code_page_mkexec(void)  }  /* + * We can't ioremap data in EFI boot services RAM, because we've already mapped + * it as RAM.  So, look it up in the existing EFI memory map instead.  Only + * callable after efi_enter_virtual_mode and before efi_free_boot_services. 
+ */ +void __iomem *efi_lookup_mapped_addr(u64 phys_addr) +{ +	void *p; +	if (WARN_ON(!memmap.map)) +		return NULL; +	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { +		efi_memory_desc_t *md = p; +		u64 size = md->num_pages << EFI_PAGE_SHIFT; +		u64 end = md->phys_addr + size; +		if (!(md->attribute & EFI_MEMORY_RUNTIME) && +		    md->type != EFI_BOOT_SERVICES_CODE && +		    md->type != EFI_BOOT_SERVICES_DATA) +			continue; +		if (!md->virt_addr) +			continue; +		if (phys_addr >= md->phys_addr && phys_addr < end) { +			phys_addr += md->virt_addr - md->phys_addr; +			return (__force void __iomem *)(unsigned long)phys_addr; +		} +	} +	return NULL; +} + +/*   * This function will switch the EFI runtime services to virtual mode.   * Essentially, look through the EFI memmap and map every region that   * has the runtime attribute bit set in its memory descriptor and update @@ -787,8 +834,10 @@ void __init efi_enter_virtual_mode(void)  	 * non-native EFI  	 */ -	if (!efi_native) -		goto out; +	if (!efi_native) { +		efi_unmap_memmap(); +		return; +	}  	/* Merge contiguous regions of the same type and attribute */  	for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { @@ -878,18 +927,12 @@ void __init efi_enter_virtual_mode(void)  	}  	/* -	 * Thankfully, it does seem that no runtime services other than -	 * SetVirtualAddressMap() will touch boot services code, so we can -	 * get rid of it all at this point -	 */ -	efi_free_boot_services(); - -	/*  	 * Now that EFI is in virtual mode, update the function  	 * pointers in the runtime service table to the new virtual addresses.  	 *  	 * Call EFI services through wrapper functions.  	 */ +	efi.runtime_version = efi_systab.fw_revision;  	efi.get_time = virt_efi_get_time;  	efi.set_time = virt_efi_set_time;  	efi.get_wakeup_time = virt_efi_get_wakeup_time; @@ -906,9 +949,6 @@ void __init efi_enter_virtual_mode(void)  	if (__supported_pte_mask & _PAGE_NX)  		runtime_code_page_mkexec(); -out: -	early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); -	memmap.map = NULL;  	kfree(new_memmap);  } diff --git a/arch/x86/realmode/rm/wakeup.h b/arch/x86/realmode/rm/wakeup.h index 9317e0042f2..7dd86a419f5 100644 --- a/arch/x86/realmode/rm/wakeup.h +++ b/arch/x86/realmode/rm/wakeup.h @@ -36,5 +36,7 @@ extern struct wakeup_header wakeup_header;  /* Wakeup behavior bits */  #define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE     0 +#define WAKEUP_BEHAVIOR_RESTORE_CR4		1 +#define WAKEUP_BEHAVIOR_RESTORE_EFER		2  #endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */ diff --git a/arch/x86/realmode/rm/wakeup_asm.S b/arch/x86/realmode/rm/wakeup_asm.S index 8905166b0bb..e56479e5805 100644 --- a/arch/x86/realmode/rm/wakeup_asm.S +++ b/arch/x86/realmode/rm/wakeup_asm.S @@ -74,9 +74,18 @@ ENTRY(wakeup_start)  	lidtl	wakeup_idt -	/* Clear the EFLAGS */ -	pushl	$0 +	/* Clear the EFLAGS but remember if we have EFLAGS.ID */ +	movl $X86_EFLAGS_ID, %ecx +	pushl %ecx  	popfl +	pushfl +	popl %edi +	pushl $0 +	popfl +	pushfl +	popl %edx +	xorl %edx, %edi +	andl %ecx, %edi		/* %edi is zero iff CPUID & %cr4 are missing */  	/* Check header signature... */  	movl	signature, %eax @@ -93,8 +102,8 @@ ENTRY(wakeup_start)  	/* Restore MISC_ENABLE before entering protected mode, in case  	   BIOS decided to clear XD_DISABLE during S3. 
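The pushfl/popfl dance added to wakeup_start above is the usual CPUID-presence probe: try to flip EFLAGS.ID, read the flags back, and XOR the two snapshots, so %edi ends up non-zero only if the bit actually toggled. The same test written in C (a sketch in the style of the 32-bit flag_is_changeable_p() helper; the asm constraints here are an assumption, not taken from this patch):

static int eflags_id_is_changeable(void)
{
	unsigned long f1, f2;

	asm volatile("pushfl		\n\t"	/* save the original flags */
		     "pushfl		\n\t"
		     "popl %0		\n\t"	/* f1 = current flags */
		     "movl %0, %1	\n\t"	/* f2 = copy for comparison */
		     "xorl %2, %0	\n\t"	/* try to flip EFLAGS.ID */
		     "pushl %0		\n\t"
		     "popfl		\n\t"
		     "pushfl		\n\t"
		     "popl %0		\n\t"	/* f1 = flags after the write */
		     "popfl		\n\t"	/* restore the original flags */
		     : "=&r" (f1), "=&r" (f2)
		     : "ir" (X86_EFLAGS_ID));

	return (f1 ^ f2) & X86_EFLAGS_ID;	/* non-zero iff CPUID exists */
}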
*/ -	movl	pmode_behavior, %eax -	btl	$WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax +	movl	pmode_behavior, %edi +	btl	$WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %edi  	jnc	1f  	movl	pmode_misc_en, %eax @@ -110,15 +119,15 @@ ENTRY(wakeup_start)  	movl	pmode_cr3, %eax  	movl	%eax, %cr3 -	movl	pmode_cr4, %ecx -	jecxz	1f -	movl	%ecx, %cr4 +	btl	$WAKEUP_BEHAVIOR_RESTORE_CR4, %edi +	jz	1f +	movl	pmode_cr4, %eax +	movl	%eax, %cr4  1: +	btl	$WAKEUP_BEHAVIOR_RESTORE_EFER, %edi +	jz	1f  	movl	pmode_efer, %eax  	movl	pmode_efer + 4, %edx -	movl	%eax, %ecx -	orl	%edx, %ecx -	jz	1f  	movl	$MSR_EFER, %ecx  	wrmsr  1: diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index 9926e11a772..aeaff8bef2f 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -21,6 +21,7 @@ config 64BIT  config X86_32  	def_bool !64BIT  	select HAVE_AOUT +	select ARCH_WANT_IPC_PARSE_VERSION  config X86_64  	def_bool 64BIT diff --git a/arch/x86/um/shared/sysdep/kernel-offsets.h b/arch/x86/um/shared/sysdep/kernel-offsets.h index 5868526b5ee..46a9df99f3c 100644 --- a/arch/x86/um/shared/sysdep/kernel-offsets.h +++ b/arch/x86/um/shared/sysdep/kernel-offsets.h @@ -7,9 +7,6 @@  #define DEFINE(sym, val) \  	asm volatile("\n->" #sym " %0 " #val : : "i" (val)) -#define STR(x) #x -#define DEFINE_STR(sym, val) asm volatile("\n->" #sym " " STR(val) " " #val: : ) -  #define BLANK() asm volatile("\n->" : : )  #define OFFSET(sym, str, mem) \ diff --git a/arch/x86/um/shared/sysdep/syscalls.h b/arch/x86/um/shared/sysdep/syscalls.h index bd9a89b67e4..ca255a805ed 100644 --- a/arch/x86/um/shared/sysdep/syscalls.h +++ b/arch/x86/um/shared/sysdep/syscalls.h @@ -1,3 +1,5 @@ +extern long sys_clone(unsigned long clone_flags, unsigned long newsp, +	       void __user *parent_tid, void __user *child_tid);  #ifdef __i386__  #include "syscalls_32.h"  #else diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c index a508cea1350..ba7363ecf89 100644 --- a/arch/x86/um/signal.c +++ b/arch/x86/um/signal.c @@ -416,9 +416,6 @@ int setup_signal_stack_sc(unsigned long stack_top, int sig,  	PT_REGS_AX(regs) = (unsigned long) sig;  	PT_REGS_DX(regs) = (unsigned long) 0;  	PT_REGS_CX(regs) = (unsigned long) 0; - -	if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) -		ptrace_notify(SIGTRAP);  	return 0;  } @@ -466,9 +463,6 @@ int setup_signal_stack_si(unsigned long stack_top, int sig,  	PT_REGS_AX(regs) = (unsigned long) sig;  	PT_REGS_DX(regs) = (unsigned long) &frame->info;  	PT_REGS_CX(regs) = (unsigned long) &frame->uc; - -	if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) -		ptrace_notify(SIGTRAP);  	return 0;  } diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index 68d1dc91b37..b5408cecac6 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -28,7 +28,7 @@  #define ptregs_execve sys_execve  #define ptregs_iopl sys_iopl  #define ptregs_vm86old sys_vm86old -#define ptregs_clone sys_clone +#define ptregs_clone i386_clone  #define ptregs_vm86 sys_vm86  #define ptregs_sigaltstack sys_sigaltstack  #define ptregs_vfork sys_vfork diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c index b853e8600b9..db444c7218f 100644 --- a/arch/x86/um/syscalls_32.c +++ b/arch/x86/um/syscalls_32.c @@ -3,37 +3,24 @@   * Licensed under the GPL   */ -#include "linux/sched.h" -#include "linux/shm.h" -#include "linux/ipc.h" -#include "linux/syscalls.h" -#include "asm/mman.h" -#include "asm/uaccess.h" -#include "asm/unistd.h" +#include <linux/syscalls.h> +#include 
<sysdep/syscalls.h>  /*   * The prototype on i386 is:   * - *     int clone(int flags, void * child_stack, int * parent_tidptr, struct user_desc * newtls, int * child_tidptr) + *     int clone(int flags, void * child_stack, int * parent_tidptr, struct user_desc * newtls   *   * and the "newtls" arg. on i386 is read by copy_thread directly from the   * register saved on the stack.   */ -long sys_clone(unsigned long clone_flags, unsigned long newsp, -	       int __user *parent_tid, void *newtls, int __user *child_tid) +long i386_clone(unsigned long clone_flags, unsigned long newsp, +		int __user *parent_tid, void *newtls, int __user *child_tid)  { -	long ret; - -	if (!newsp) -		newsp = UPT_SP(¤t->thread.regs.regs); - -	current->thread.forking = 1; -	ret = do_fork(clone_flags, newsp, ¤t->thread.regs, 0, parent_tid, -		      child_tid); -	current->thread.forking = 0; -	return ret; +	return sys_clone(clone_flags, newsp, parent_tid, child_tid);  } +  long sys_sigaction(int sig, const struct old_sigaction __user *act,  			 struct old_sigaction __user *oact)  { diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c index f3d82bb6e15..adb08eb5c22 100644 --- a/arch/x86/um/syscalls_64.c +++ b/arch/x86/um/syscalls_64.c @@ -5,12 +5,9 @@   * Licensed under the GPL   */ -#include "linux/linkage.h" -#include "linux/personality.h" -#include "linux/utsname.h" -#include "asm/prctl.h" /* XXX This should get the constants from libc */ -#include "asm/uaccess.h" -#include "os.h" +#include <linux/sched.h> +#include <asm/prctl.h> /* XXX This should get the constants from libc */ +#include <os.h>  long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr)  { @@ -79,20 +76,6 @@ long sys_arch_prctl(int code, unsigned long addr)  	return arch_prctl(current, code, (unsigned long __user *) addr);  } -long sys_clone(unsigned long clone_flags, unsigned long newsp, -	       void __user *parent_tid, void __user *child_tid) -{ -	long ret; - -	if (!newsp) -		newsp = UPT_SP(¤t->thread.regs.regs); -	current->thread.forking = 1; -	ret = do_fork(clone_flags, newsp, ¤t->thread.regs, 0, parent_tid, -		      child_tid); -	current->thread.forking = 0; -	return ret; -} -  void arch_switch_to(struct task_struct *to)  {  	if ((to->thread.arch.fs == 0) || (to->mm == NULL)) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9642d4a3860..1fbe75a95f1 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1452,6 +1452,10 @@ asmlinkage void __init xen_start_kernel(void)  		pci_request_acs();  		xen_acpi_sleep_register(); + +		/* Avoid searching for BIOS MP tables */ +		x86_init.mpparse.find_smp_config = x86_init_noop; +		x86_init.mpparse.get_smp_config = x86_init_uint_noop;  	}  #ifdef CONFIG_PCI  	/* PCI BIOS service won't work from a PV guest. 
*/ diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index b65a76133f4..7a769b7526c 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1174,8 +1174,13 @@ static void xen_exit_mmap(struct mm_struct *mm)  	spin_unlock(&mm->page_table_lock);  } -static void __init xen_pagetable_setup_start(pgd_t *base) +static void xen_post_allocator_init(void); + +static void __init xen_pagetable_init(void)  { +	paging_init(); +	xen_setup_shared_info(); +	xen_post_allocator_init();  }  static __init void xen_mapping_pagetable_reserve(u64 start, u64 end) @@ -1192,14 +1197,6 @@ static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)  	}  } -static void xen_post_allocator_init(void); - -static void __init xen_pagetable_setup_done(pgd_t *base) -{ -	xen_setup_shared_info(); -	xen_post_allocator_init(); -} -  static void xen_write_cr2(unsigned long cr2)  {  	this_cpu_read(xen_vcpu)->arch.cr2 = cr2; @@ -1283,7 +1280,7 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,  	cpumask_clear_cpu(smp_processor_id(), to_cpumask(args->mask));  	args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; -	if (start != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) { +	if (end != TLB_FLUSH_ALL && (end - start) <= PAGE_SIZE) {  		args->op.cmd = MMUEXT_INVLPG_MULTI;  		args->op.arg1.linear_addr = start;  	} @@ -2068,8 +2065,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {  void __init xen_init_mmu_ops(void)  {  	x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve; -	x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; -	x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; +	x86_init.paging.pagetable_init = xen_pagetable_init;  	pv_mmu_ops = xen_mmu_ops;  	memset(dummy_mapping, 0xff, PAGE_SIZE); diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c index d4b25546325..72213da605f 100644 --- a/arch/x86/xen/p2m.c +++ b/arch/x86/xen/p2m.c @@ -599,7 +599,7 @@ bool __init early_can_reuse_p2m_middle(unsigned long set_pfn, unsigned long set_  	if (p2m_index(set_pfn))  		return false; -	for (pfn = 0; pfn <= MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) { +	for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_PER_PAGE) {  		topidx = p2m_top_index(pfn);  		if (!p2m_top[topidx]) @@ -828,9 +828,6 @@ int m2p_add_override(unsigned long mfn, struct page *page,  			xen_mc_issue(PARAVIRT_LAZY_MMU);  		} -		/* let's use dev_bus_addr to record the old mfn instead */ -		kmap_op->dev_bus_addr = page->index; -		page->index = (unsigned long) kmap_op;  	}  	spin_lock_irqsave(&m2p_override_lock, flags);  	list_add(&page->lru,  &m2p_overrides[mfn_hash(mfn)]); @@ -857,7 +854,8 @@ int m2p_add_override(unsigned long mfn, struct page *page,  	return 0;  }  EXPORT_SYMBOL_GPL(m2p_add_override); -int m2p_remove_override(struct page *page, bool clear_pte) +int m2p_remove_override(struct page *page, +		struct gnttab_map_grant_ref *kmap_op)  {  	unsigned long flags;  	unsigned long mfn; @@ -887,10 +885,8 @@ int m2p_remove_override(struct page *page, bool clear_pte)  	WARN_ON(!PagePrivate(page));  	ClearPagePrivate(page); -	if (clear_pte) { -		struct gnttab_map_grant_ref *map_op = -			(struct gnttab_map_grant_ref *) page->index; -		set_phys_to_machine(pfn, map_op->dev_bus_addr); +	set_phys_to_machine(pfn, page->index); +	if (kmap_op != NULL) {  		if (!PageHighMem(page)) {  			struct multicall_space mcs;  			struct gnttab_unmap_grant_ref *unmap_op; @@ -902,13 +898,13 @@ int m2p_remove_override(struct page *page, bool clear_pte)  			 * issued. 
In this case handle is going to -1 because  			 * it hasn't been modified yet.  			 */ -			if (map_op->handle == -1) +			if (kmap_op->handle == -1)  				xen_mc_flush();  			/* -			 * Now if map_op->handle is negative it means that the +			 * Now if kmap_op->handle is negative it means that the  			 * hypercall actually returned an error.  			 */ -			if (map_op->handle == GNTST_general_error) { +			if (kmap_op->handle == GNTST_general_error) {  				printk(KERN_WARNING "m2p_remove_override: "  						"pfn %lx mfn %lx, failed to modify kernel mappings",  						pfn, mfn); @@ -918,8 +914,8 @@ int m2p_remove_override(struct page *page, bool clear_pte)  			mcs = xen_mc_entry(  					sizeof(struct gnttab_unmap_grant_ref));  			unmap_op = mcs.args; -			unmap_op->host_addr = map_op->host_addr; -			unmap_op->handle = map_op->handle; +			unmap_op->host_addr = kmap_op->host_addr; +			unmap_op->handle = kmap_op->handle;  			unmap_op->dev_bus_addr = 0;  			MULTI_grant_table_op(mcs.mc, @@ -930,10 +926,9 @@ int m2p_remove_override(struct page *page, bool clear_pte)  			set_pte_at(&init_mm, address, ptep,  					pfn_pte(pfn, PAGE_KERNEL));  			__flush_tlb_single(address); -			map_op->host_addr = 0; +			kmap_op->host_addr = 0;  		} -	} else -		set_phys_to_machine(pfn, page->index); +	}  	/* p2m(m2p(mfn)) == FOREIGN_FRAME(mfn): the mfn is already present  	 * somewhere in this domain, even before being added to the diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index d11ca11d14f..e2d62d697b5 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -17,6 +17,7 @@  #include <asm/e820.h>  #include <asm/setup.h>  #include <asm/acpi.h> +#include <asm/numa.h>  #include <asm/xen/hypervisor.h>  #include <asm/xen/hypercall.h> @@ -544,4 +545,7 @@ void __init xen_arch_setup(void)  	disable_cpufreq();  	WARN_ON(set_pm_idle_to_default());  	fiddle_vdso(); +#ifdef CONFIG_NUMA +	numa_off = 1; +#endif  } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index f58dca7a6e5..353c50f1870 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -377,7 +377,8 @@ static int __cpuinit xen_cpu_up(unsigned int cpu, struct task_struct *idle)  		return rc;  	if (num_online_cpus() == 1) -		alternatives_smp_switch(1); +		/* Just in case we booted with a single CPU. */ +		alternatives_enable_smp();  	rc = xen_smp_intr_init(cpu);  	if (rc) @@ -424,9 +425,6 @@ static void xen_cpu_die(unsigned int cpu)  	unbind_from_irqhandler(per_cpu(xen_irq_work, cpu), NULL);  	xen_uninit_lock_cpu(cpu);  	xen_teardown_timer(cpu); - -	if (num_online_cpus() == 1) -		alternatives_smp_switch(0);  }  static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */  |
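As a usage sketch for the reworked m2p override interface above (the caller shown is hypothetical; the real callers live in the grant-table and backend drivers, outside this diff): the code that sets up a kernel mapping now keeps its gnttab_map_grant_ref and hands the same descriptor back on teardown, instead of m2p_remove_override() recovering it from page->index.

static int demo_override_cycle(unsigned long mfn, struct page *page,
			       struct gnttab_map_grant_ref *kmap_op)
{
	int err;

	/* kmap_op may be NULL when no kernel mapping was installed */
	err = m2p_add_override(mfn, page, kmap_op);
	if (err)
		return err;

	/* ... the foreign frame is used here ... */

	/* pass the same descriptor back so the unmap multicall can be built */
	return m2p_remove_override(page, kmap_op);
}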