summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/amd_nb.c8
-rw-r--r--arch/x86/kernel/apic/apic.c2
-rw-r--r--arch/x86/kernel/apic/io_apic.c16
-rw-r--r--arch/x86/kernel/apic/msi.c18
-rw-r--r--arch/x86/kernel/apic/vector.c27
-rw-r--r--arch/x86/kernel/cpu/common.c28
-rw-r--r--arch/x86/kernel/cpu/mce/core.c72
-rw-r--r--arch/x86/kernel/cpu/mce/inject.c2
-rw-r--r--arch/x86/kernel/cpu/mce/internal.h10
-rw-r--r--arch/x86/kernel/cpu/mce/severity.c28
-rw-r--r--arch/x86/kernel/cpu/resctrl/core.c2
-rw-r--r--arch/x86/kernel/cpu/resctrl/internal.h3
-rw-r--r--arch/x86/kernel/cpu/resctrl/monitor.c3
-rw-r--r--arch/x86/kernel/cpu/resctrl/rdtgroup.c1
-rw-r--r--arch/x86/kernel/cpu/umwait.c6
-rw-r--r--arch/x86/kernel/crash.c15
-rw-r--r--arch/x86/kernel/fpu/core.c6
-rw-r--r--arch/x86/kernel/fpu/init.c30
-rw-r--r--arch/x86/kernel/fpu/xstate.c11
-rw-r--r--arch/x86/kernel/i8259.c2
-rw-r--r--arch/x86/kernel/idt.c6
-rw-r--r--arch/x86/kernel/irq_64.c2
-rw-r--r--arch/x86/kernel/kprobes/core.c16
-rw-r--r--arch/x86/kernel/nmi.c5
-rw-r--r--arch/x86/kernel/process_64.c2
-rw-r--r--arch/x86/kernel/smpboot.c26
-rw-r--r--arch/x86/kernel/stacktrace.c5
-rw-r--r--arch/x86/kernel/tsc_msr.c9
-rw-r--r--arch/x86/kernel/unwind_orc.c8
-rw-r--r--arch/x86/kernel/vmlinux.lds.S1
30 files changed, 258 insertions, 112 deletions
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 251c795b4eb3..c4bc01da820e 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -18,10 +18,13 @@
#define PCI_DEVICE_ID_AMD_17H_ROOT 0x1450
#define PCI_DEVICE_ID_AMD_17H_M10H_ROOT 0x15d0
#define PCI_DEVICE_ID_AMD_17H_M30H_ROOT 0x1480
+#define PCI_DEVICE_ID_AMD_17H_M60H_ROOT 0x1630
#define PCI_DEVICE_ID_AMD_17H_DF_F4 0x1464
#define PCI_DEVICE_ID_AMD_17H_M10H_DF_F4 0x15ec
#define PCI_DEVICE_ID_AMD_17H_M30H_DF_F4 0x1494
+#define PCI_DEVICE_ID_AMD_17H_M60H_DF_F4 0x144c
#define PCI_DEVICE_ID_AMD_17H_M70H_DF_F4 0x1444
+#define PCI_DEVICE_ID_AMD_19H_DF_F4 0x1654
/* Protect the PCI config register pairs used for SMN and DF indirect access. */
static DEFINE_MUTEX(smn_mutex);
@@ -32,6 +35,7 @@ static const struct pci_device_id amd_root_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_ROOT) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_ROOT) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_ROOT) },
{}
};
@@ -50,8 +54,10 @@ const struct pci_device_id amd_nb_misc_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F3) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F3) },
{}
};
EXPORT_SYMBOL_GPL(amd_nb_misc_ids);
@@ -65,7 +71,9 @@ static const struct pci_device_id amd_nb_link_ids[] = {
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M10H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M30H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M60H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_M70H_DF_F4) },
+ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_19H_DF_F4) },
{ PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) },
{}
};
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 25b8c45467fc..fce94c799f01 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -2099,7 +2099,7 @@ void __init init_apic_mappings(void)
unsigned int new_apicid;
if (apic_validate_deadline_timer())
- pr_debug("TSC deadline timer available\n");
+ pr_info("TSC deadline timer available\n");
if (x2apic_mode) {
boot_cpu_physical_apicid = read_apic_id();
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index f0262cb5657a..4b6301946f45 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -2256,6 +2256,7 @@ static inline void __init check_timer(void)
legacy_pic->init(0);
legacy_pic->make_irq(0);
apic_write(APIC_LVT0, APIC_DM_EXTINT);
+ legacy_pic->unmask(0);
unlock_ExtINT_logic();
@@ -2329,12 +2330,12 @@ static int mp_irqdomain_create(int ioapic)
ip->irqdomain = irq_domain_create_linear(fn, hwirqs, cfg->ops,
(void *)(long)ioapic);
- /* Release fw handle if it was allocated above */
- if (!cfg->dev)
- irq_domain_free_fwnode(fn);
-
- if (!ip->irqdomain)
+ if (!ip->irqdomain) {
+ /* Release fw handle if it was allocated above */
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
return -ENOMEM;
+ }
ip->irqdomain->parent = parent;
@@ -2348,8 +2349,13 @@ static int mp_irqdomain_create(int ioapic)
static void ioapic_destroy_irqdomain(int idx)
{
+ struct ioapic_domain_cfg *cfg = &ioapics[idx].irqdomain_cfg;
+ struct fwnode_handle *fn = ioapics[idx].irqdomain->fwnode;
+
if (ioapics[idx].irqdomain) {
irq_domain_remove(ioapics[idx].irqdomain);
+ if (!cfg->dev)
+ irq_domain_free_fwnode(fn);
ioapics[idx].irqdomain = NULL;
}
}
diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c
index 159bd0cb8548..a20873bbbed6 100644
--- a/arch/x86/kernel/apic/msi.c
+++ b/arch/x86/kernel/apic/msi.c
@@ -262,12 +262,13 @@ void __init arch_init_msi_domain(struct irq_domain *parent)
msi_default_domain =
pci_msi_create_irq_domain(fn, &pci_msi_domain_info,
parent);
- irq_domain_free_fwnode(fn);
}
- if (!msi_default_domain)
+ if (!msi_default_domain) {
+ irq_domain_free_fwnode(fn);
pr_warn("failed to initialize irqdomain for MSI/MSI-x.\n");
- else
+ } else {
msi_default_domain->flags |= IRQ_DOMAIN_MSI_NOMASK_QUIRK;
+ }
}
#ifdef CONFIG_IRQ_REMAP
@@ -300,7 +301,8 @@ struct irq_domain *arch_create_remap_msi_irq_domain(struct irq_domain *parent,
if (!fn)
return NULL;
d = pci_msi_create_irq_domain(fn, &pci_msi_ir_domain_info, parent);
- irq_domain_free_fwnode(fn);
+ if (!d)
+ irq_domain_free_fwnode(fn);
return d;
}
#endif
@@ -363,7 +365,8 @@ static struct irq_domain *dmar_get_irq_domain(void)
if (fn) {
dmar_domain = msi_create_irq_domain(fn, &dmar_msi_domain_info,
x86_vector_domain);
- irq_domain_free_fwnode(fn);
+ if (!dmar_domain)
+ irq_domain_free_fwnode(fn);
}
out:
mutex_unlock(&dmar_lock);
@@ -488,7 +491,10 @@ struct irq_domain *hpet_create_irq_domain(int hpet_id)
}
d = msi_create_irq_domain(fn, domain_info, parent);
- irq_domain_free_fwnode(fn);
+ if (!d) {
+ irq_domain_free_fwnode(fn);
+ kfree(domain_info);
+ }
return d;
}
diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c
index 2c5676b0a6e7..c8203694d9ce 100644
--- a/arch/x86/kernel/apic/vector.c
+++ b/arch/x86/kernel/apic/vector.c
@@ -446,12 +446,10 @@ static int x86_vector_activate(struct irq_domain *dom, struct irq_data *irqd,
trace_vector_activate(irqd->irq, apicd->is_managed,
apicd->can_reserve, reserve);
- /* Nothing to do for fixed assigned vectors */
- if (!apicd->can_reserve && !apicd->is_managed)
- return 0;
-
raw_spin_lock_irqsave(&vector_lock, flags);
- if (reserve || irqd_is_managed_and_shutdown(irqd))
+ if (!apicd->can_reserve && !apicd->is_managed)
+ assign_irq_vector_any_locked(irqd);
+ else if (reserve || irqd_is_managed_and_shutdown(irqd))
vector_assign_managed_shutdown(irqd);
else if (apicd->is_managed)
ret = activate_managed(irqd);
@@ -556,6 +554,10 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq,
irqd->chip_data = apicd;
irqd->hwirq = virq + i;
irqd_set_single_target(irqd);
+
+ /* Don't invoke affinity setter on deactivated interrupts */
+ irqd_set_affinity_on_activate(irqd);
+
/*
* Legacy vectors are already assigned when the IOAPIC
* takes them over. They stay on the same vector. This is
@@ -703,7 +705,6 @@ int __init arch_early_irq_init(void)
x86_vector_domain = irq_domain_create_tree(fn, &x86_vector_domain_ops,
NULL);
BUG_ON(x86_vector_domain == NULL);
- irq_domain_free_fwnode(fn);
irq_set_default_host(x86_vector_domain);
arch_init_msi_domain(x86_vector_domain);
@@ -769,20 +770,10 @@ void lapic_offline(void)
static int apic_set_affinity(struct irq_data *irqd,
const struct cpumask *dest, bool force)
{
- struct apic_chip_data *apicd = apic_chip_data(irqd);
int err;
- /*
- * Core code can call here for inactive interrupts. For inactive
- * interrupts which use managed or reservation mode there is no
- * point in going through the vector assignment right now as the
- * activation will assign a vector which fits the destination
- * cpumask. Let the core code store the destination mask and be
- * done with it.
- */
- if (!irqd_is_activated(irqd) &&
- (apicd->is_managed || apicd->can_reserve))
- return IRQ_SET_MASK_OK;
+ if (WARN_ON_ONCE(!irqd_is_activated(irqd)))
+ return -EIO;
raw_spin_lock(&vector_lock);
cpumask_and(vector_searchmask, dest, cpu_online_mask);
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 650df6d21049..8a85c2e144a6 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -366,6 +366,9 @@ out:
cr4_clear_bits(X86_CR4_UMIP);
}
+/* These bits should not change their value after CPU init is finished. */
+static const unsigned long cr4_pinned_mask =
+ X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP | X86_CR4_FSGSBASE;
static DEFINE_STATIC_KEY_FALSE_RO(cr_pinning);
static unsigned long cr4_pinned_bits __ro_after_init;
@@ -374,7 +377,7 @@ void native_write_cr0(unsigned long val)
unsigned long bits_missing = 0;
set_register:
- asm volatile("mov %0,%%cr0": "+r" (val), "+m" (__force_order));
+ asm volatile("mov %0,%%cr0": "+r" (val) : : "memory");
if (static_branch_likely(&cr_pinning)) {
if (unlikely((val & X86_CR0_WP) != X86_CR0_WP)) {
@@ -390,20 +393,20 @@ EXPORT_SYMBOL(native_write_cr0);
void native_write_cr4(unsigned long val)
{
- unsigned long bits_missing = 0;
+ unsigned long bits_changed = 0;
set_register:
- asm volatile("mov %0,%%cr4": "+r" (val), "+m" (cr4_pinned_bits));
+ asm volatile("mov %0,%%cr4": "+r" (val) : : "memory");
if (static_branch_likely(&cr_pinning)) {
- if (unlikely((val & cr4_pinned_bits) != cr4_pinned_bits)) {
- bits_missing = ~val & cr4_pinned_bits;
- val |= bits_missing;
+ if (unlikely((val & cr4_pinned_mask) != cr4_pinned_bits)) {
+ bits_changed = (val & cr4_pinned_mask) ^ cr4_pinned_bits;
+ val = (val & ~cr4_pinned_mask) | cr4_pinned_bits;
goto set_register;
}
- /* Warn after we've set the missing bits. */
- WARN_ONCE(bits_missing, "CR4 bits went missing: %lx!?\n",
- bits_missing);
+ /* Warn after we've corrected the changed bits. */
+ WARN_ONCE(bits_changed, "pinned CR4 bits changed: 0x%lx!?\n",
+ bits_changed);
}
}
EXPORT_SYMBOL(native_write_cr4);
@@ -415,7 +418,7 @@ void cr4_init(void)
if (boot_cpu_has(X86_FEATURE_PCID))
cr4 |= X86_CR4_PCIDE;
if (static_branch_likely(&cr_pinning))
- cr4 |= cr4_pinned_bits;
+ cr4 = (cr4 & ~cr4_pinned_mask) | cr4_pinned_bits;
__write_cr4(cr4);
@@ -430,10 +433,7 @@ void cr4_init(void)
*/
static void __init setup_cr_pinning(void)
{
- unsigned long mask;
-
- mask = (X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_UMIP);
- cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & mask;
+ cr4_pinned_bits = this_cpu_read(cpu_tlbstate.cr4) & cr4_pinned_mask;
static_key_enable(&cr_pinning.key);
}
diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c
index fd76e3733dd3..92331de16d70 100644
--- a/arch/x86/kernel/cpu/mce/core.c
+++ b/arch/x86/kernel/cpu/mce/core.c
@@ -388,10 +388,28 @@ static int msr_to_offset(u32 msr)
return -1;
}
+__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
+{
+ pr_emerg("MSR access error: RDMSR from 0x%x at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, regs->ip, (void *)regs->ip);
+
+ show_stack_regs(regs);
+
+ panic("MCA architectural violation!\n");
+
+ while (true)
+ cpu_relax();
+
+ return true;
+}
+
/* MSR access wrappers used for error injection */
static u64 mce_rdmsrl(u32 msr)
{
- u64 v;
+ DECLARE_ARGS(val, low, high);
if (__this_cpu_read(injectm.finished)) {
int offset = msr_to_offset(msr);
@@ -401,21 +419,43 @@ static u64 mce_rdmsrl(u32 msr)
return *(u64 *)((char *)this_cpu_ptr(&injectm) + offset);
}
- if (rdmsrl_safe(msr, &v)) {
- WARN_ONCE(1, "mce: Unable to read MSR 0x%x!\n", msr);
- /*
- * Return zero in case the access faulted. This should
- * not happen normally but can happen if the CPU does
- * something weird, or if the code is buggy.
- */
- v = 0;
- }
+ /*
+ * RDMSR on MCA MSRs should not fault. If they do, this is very much an
+ * architectural violation and needs to be reported to hw vendor. Panic
+ * the box to not allow any further progress.
+ */
+ asm volatile("1: rdmsr\n"
+ "2:\n"
+ _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_rdmsr_fault)
+ : EAX_EDX_RET(val, low, high) : "c" (msr));
- return v;
+
+ return EAX_EDX_VAL(val, low, high);
+}
+
+__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr)
+{
+ pr_emerg("MSR access error: WRMSR to 0x%x (tried to write 0x%08x%08x) at rIP: 0x%lx (%pS)\n",
+ (unsigned int)regs->cx, (unsigned int)regs->dx, (unsigned int)regs->ax,
+ regs->ip, (void *)regs->ip);
+
+ show_stack_regs(regs);
+
+ panic("MCA architectural violation!\n");
+
+ while (true)
+ cpu_relax();
+
+ return true;
}
static void mce_wrmsrl(u32 msr, u64 v)
{
+ u32 low, high;
+
if (__this_cpu_read(injectm.finished)) {
int offset = msr_to_offset(msr);
@@ -423,7 +463,15 @@ static void mce_wrmsrl(u32 msr, u64 v)
*(u64 *)((char *)this_cpu_ptr(&injectm) + offset) = v;
return;
}
- wrmsrl(msr, v);
+
+ low = (u32)v;
+ high = (u32)(v >> 32);
+
+ /* See comment in mce_rdmsrl() */
+ asm volatile("1: wrmsr\n"
+ "2:\n"
+ _ASM_EXTABLE_HANDLE(1b, 2b, ex_handler_wrmsr_fault)
+ : : "c" (msr), "a"(low), "d" (high) : "memory");
}
/*
diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c
index 1f30117b24ba..eb2d41c1816d 100644
--- a/arch/x86/kernel/cpu/mce/inject.c
+++ b/arch/x86/kernel/cpu/mce/inject.c
@@ -511,7 +511,7 @@ static void do_inject(void)
*/
if (inj_type == DFR_INT_INJ) {
i_mce.status |= MCI_STATUS_DEFERRED;
- i_mce.status |= (i_mce.status & ~MCI_STATUS_UC);
+ i_mce.status &= ~MCI_STATUS_UC;
}
/*
diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h
index 43031db429d2..231954fe5b4e 100644
--- a/arch/x86/kernel/cpu/mce/internal.h
+++ b/arch/x86/kernel/cpu/mce/internal.h
@@ -172,4 +172,14 @@ extern bool amd_filter_mce(struct mce *m);
static inline bool amd_filter_mce(struct mce *m) { return false; };
#endif
+__visible bool ex_handler_rdmsr_fault(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr);
+
+__visible bool ex_handler_wrmsr_fault(const struct exception_table_entry *fixup,
+ struct pt_regs *regs, int trapnr,
+ unsigned long error_code,
+ unsigned long fault_addr);
+
#endif /* __X86_MCE_INTERNAL_H__ */
diff --git a/arch/x86/kernel/cpu/mce/severity.c b/arch/x86/kernel/cpu/mce/severity.c
index 87bcdc6dc2f0..0d09eb13743b 100644
--- a/arch/x86/kernel/cpu/mce/severity.c
+++ b/arch/x86/kernel/cpu/mce/severity.c
@@ -9,9 +9,11 @@
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/debugfs.h>
-#include <asm/mce.h>
#include <linux/uaccess.h>
+#include <asm/mce.h>
+#include <asm/intel-family.h>
+
#include "internal.h"
/*
@@ -40,9 +42,14 @@ static struct severity {
unsigned char context;
unsigned char excp;
unsigned char covered;
+ unsigned char cpu_model;
+ unsigned char cpu_minstepping;
+ unsigned char bank_lo, bank_hi;
char *msg;
} severities[] = {
#define MCESEV(s, m, c...) { .sev = MCE_ ## s ## _SEVERITY, .msg = m, ## c }
+#define BANK_RANGE(l, h) .bank_lo = l, .bank_hi = h
+#define MODEL_STEPPING(m, s) .cpu_model = m, .cpu_minstepping = s
#define KERNEL .context = IN_KERNEL
#define USER .context = IN_USER
#define KERNEL_RECOV .context = IN_KERNEL_RECOV
@@ -97,7 +104,6 @@ static struct severity {
KEEP, "Corrected error",
NOSER, BITCLR(MCI_STATUS_UC)
),
-
/*
* known AO MCACODs reported via MCE or CMC:
*
@@ -113,6 +119,18 @@ static struct severity {
AO, "Action optional: last level cache writeback error",
SER, MASK(MCI_UC_AR|MCACOD, MCI_STATUS_UC|MCACOD_L3WB)
),
+ /*
+ * Quirk for Skylake/Cascade Lake. Patrol scrubber may be configured
+ * to report uncorrected errors using CMCI with a special signature.
+ * UC=0, MSCOD=0x0010, MCACOD=binary(000X 0000 1100 XXXX) reported
+ * in one of the memory controller banks.
+ * Set severity to "AO" for same action as normal patrol scrub error.
+ */
+ MCESEV(
+ AO, "Uncorrected Patrol Scrub Error",
+ SER, MASK(MCI_STATUS_UC|MCI_ADDR|0xffffeff0, MCI_ADDR|0x001000c0),
+ MODEL_STEPPING(INTEL_FAM6_SKYLAKE_X, 4), BANK_RANGE(13, 18)
+ ),
/* ignore OVER for UCNA */
MCESEV(
@@ -320,6 +338,12 @@ static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_e
continue;
if (s->excp && excp != s->excp)
continue;
+ if (s->cpu_model && boot_cpu_data.x86_model != s->cpu_model)
+ continue;
+ if (s->cpu_minstepping && boot_cpu_data.x86_stepping < s->cpu_minstepping)
+ continue;
+ if (s->bank_lo && (m->bank < s->bank_lo || m->bank > s->bank_hi))
+ continue;
if (msg)
*msg = s->msg;
s->covered = 1;
diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c
index d8cc5223b7ce..87a34b6e06a2 100644
--- a/arch/x86/kernel/cpu/resctrl/core.c
+++ b/arch/x86/kernel/cpu/resctrl/core.c
@@ -260,6 +260,7 @@ static bool __get_mem_config_intel(struct rdt_resource *r)
r->num_closid = edx.split.cos_max + 1;
r->membw.max_delay = eax.split.max_delay + 1;
r->default_ctrl = MAX_MBA_BW;
+ r->membw.mbm_width = MBM_CNTR_WIDTH;
if (ecx & MBA_IS_LINEAR) {
r->membw.delay_linear = true;
r->membw.min_bw = MAX_MBA_BW - r->membw.max_delay;
@@ -289,6 +290,7 @@ static bool __rdt_get_mem_config_amd(struct rdt_resource *r)
/* AMD does not use delay */
r->membw.delay_linear = false;
+ r->membw.mbm_width = MBM_CNTR_WIDTH_AMD;
r->membw.min_bw = 0;
r->membw.bw_gran = 1;
/* Max value is 2048, Data width should be 4 in decimal */
diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h
index 3dd13f3a8b23..17095435c875 100644
--- a/arch/x86/kernel/cpu/resctrl/internal.h
+++ b/arch/x86/kernel/cpu/resctrl/internal.h
@@ -32,6 +32,7 @@
#define CQM_LIMBOCHECK_INTERVAL 1000
#define MBM_CNTR_WIDTH 24
+#define MBM_CNTR_WIDTH_AMD 44
#define MBM_OVERFLOW_INTERVAL 1000
#define MAX_MBA_BW 100u
#define MBA_IS_LINEAR 0x4
@@ -368,6 +369,7 @@ struct rdt_cache {
* @min_bw: Minimum memory bandwidth percentage user can request
* @bw_gran: Granularity at which the memory bandwidth is allocated
* @delay_linear: True if memory B/W delay is in linear scale
+ * @mbm_width: memory B/W monitor counter width
* @mba_sc: True if MBA software controller(mba_sc) is enabled
* @mb_map: Mapping of memory B/W percentage to memory B/W delay
*/
@@ -376,6 +378,7 @@ struct rdt_membw {
u32 min_bw;
u32 bw_gran;
u32 delay_linear;
+ u32 mbm_width;
bool mba_sc;
u32 *mb_map;
};
diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c
index 773124b0e18a..0cf4f87f6012 100644
--- a/arch/x86/kernel/cpu/resctrl/monitor.c
+++ b/arch/x86/kernel/cpu/resctrl/monitor.c
@@ -216,8 +216,9 @@ void free_rmid(u32 rmid)
static u64 mbm_overflow_count(u64 prev_msr, u64 cur_msr)
{
- u64 shift = 64 - MBM_CNTR_WIDTH, chunks;
+ u64 shift, chunks;
+ shift = 64 - rdt_resources_all[RDT_RESOURCE_MBA].membw.mbm_width;
chunks = (cur_msr << shift) - (prev_msr << shift);
return chunks >>= shift;
}
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index 20856d80dce3..54b711bc0607 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -1027,6 +1027,7 @@ static int rdt_cdp_peer_get(struct rdt_resource *r, struct rdt_domain *d,
_d_cdp = rdt_find_domain(_r_cdp, d->id, NULL);
if (WARN_ON(IS_ERR_OR_NULL(_d_cdp))) {
_r_cdp = NULL;
+ _d_cdp = NULL;
ret = -EINVAL;
}
diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c
index c222f283b456..32b4dc9030aa 100644
--- a/arch/x86/kernel/cpu/umwait.c
+++ b/arch/x86/kernel/cpu/umwait.c
@@ -17,12 +17,6 @@
*/
static u32 umwait_control_cached = UMWAIT_CTRL_VAL(100000, UMWAIT_C02_ENABLE);
-u32 get_umwait_control_msr(void)
-{
- return umwait_control_cached;
-}
-EXPORT_SYMBOL_GPL(get_umwait_control_msr);
-
/*
* Cache the original IA32_UMWAIT_CONTROL MSR value which is configured by
* hardware or BIOS before kernel boot.
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index eb651fbde92a..ff25a2ea271c 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -24,6 +24,7 @@
#include <linux/export.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
+#include <linux/memblock.h>
#include <asm/processor.h>
#include <asm/hardirq.h>
@@ -39,6 +40,7 @@
#include <asm/virtext.h>
#include <asm/intel_pt.h>
#include <asm/crash.h>
+#include <asm/cmdline.h>
/* Used while preparing memory map entries for second kernel */
struct crash_memmap_data {
@@ -68,6 +70,19 @@ static inline void cpu_crash_vmclear_loaded_vmcss(void)
rcu_read_unlock();
}
+/*
+ * When the crashkernel option is specified, only use the low
+ * 1M for the real mode trampoline.
+ */
+void __init crash_reserve_low_1M(void)
+{
+ if (cmdline_find_option(boot_command_line, "crashkernel", NULL, 0) < 0)
+ return;
+
+ memblock_reserve(0, 1<<20);
+ pr_info("Reserving the low 1M of memory for crashkernel\n");
+}
+
#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c
index 12c70840980e..cd8839027f66 100644
--- a/arch/x86/kernel/fpu/core.c
+++ b/arch/x86/kernel/fpu/core.c
@@ -101,6 +101,12 @@ void kernel_fpu_begin(void)
copy_fpregs_to_fpstate(&current->thread.fpu);
}
__cpu_invalidate_fpregs_state();
+
+ if (boot_cpu_has(X86_FEATURE_XMM))
+ ldmxcsr(MXCSR_DEFAULT);
+
+ if (boot_cpu_has(X86_FEATURE_FPU))
+ asm volatile ("fninit");
}
EXPORT_SYMBOL_GPL(kernel_fpu_begin);
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 6ce7e0a23268..b271da0fa219 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -242,9 +242,9 @@ static void __init fpu__init_system_ctx_switch(void)
*/
static void __init fpu__init_parse_early_param(void)
{
- char arg[32];
+ char arg[128];
char *argptr = arg;
- int bit;
+ int arglen, res, bit;
#ifdef CONFIG_X86_32
if (cmdline_find_option_bool(boot_command_line, "no387"))
@@ -267,12 +267,26 @@ static void __init fpu__init_parse_early_param(void)
if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
setup_clear_cpu_cap(X86_FEATURE_XSAVES);
- if (cmdline_find_option(boot_command_line, "clearcpuid", arg,
- sizeof(arg)) &&
- get_option(&argptr, &bit) &&
- bit >= 0 &&
- bit < NCAPINTS * 32)
- setup_clear_cpu_cap(bit);
+ arglen = cmdline_find_option(boot_command_line, "clearcpuid", arg, sizeof(arg));
+ if (arglen <= 0)
+ return;
+
+ pr_info("Clearing CPUID bits:");
+ do {
+ res = get_option(&argptr, &bit);
+ if (res == 0 || res == 3)
+ break;
+
+ /* If the argument was too long, the last bit may be cut off */
+ if (res == 1 && arglen >= sizeof(arg))
+ break;
+
+ if (bit >= 0 && bit < NCAPINTS * 32) {
+ pr_cont(" " X86_CAP_FMT, x86_cap_flag(bit));
+ setup_clear_cpu_cap(bit);
+ }
+ } while (res == 2);
+ pr_cont("\n");
}
/*
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index c94fec268ef2..735d1f1bbabc 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -895,8 +895,6 @@ const void *get_xsave_field_ptr(int xfeature_nr)
#ifdef CONFIG_ARCH_HAS_PKEYS
-#define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
-#define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
/*
* This will go out and modify PKRU register to set the access
* rights for @pkey to @init_val.
@@ -915,6 +913,13 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
if (!boot_cpu_has(X86_FEATURE_OSPKE))
return -EINVAL;
+ /*
+ * This code should only be called with valid 'pkey'
+ * values originating from in-kernel users. Complain
+ * if a bad value is observed.
+ */
+ WARN_ON_ONCE(pkey >= arch_max_pkey());
+
/* Set the bits we need in PKRU: */
if (init_val & PKEY_DISABLE_ACCESS)
new_pkru_bits |= PKRU_AD_BIT;
@@ -1017,7 +1022,7 @@ int copy_xstate_to_kernel(void *kbuf, struct xregs_state *xsave, unsigned int of
copy_part(offsetof(struct fxregs_state, st_space), 128,
&xsave->i387.st_space, &kbuf, &offset_start, &count);
if (header.xfeatures & XFEATURE_MASK_SSE)
- copy_part(xstate_offsets[XFEATURE_MASK_SSE], 256,
+ copy_part(xstate_offsets[XFEATURE_SSE], 256,
&xsave->i387.xmm_space, &kbuf, &offset_start, &count);
/*
* Fill xsave->i387.sw_reserved value for ptrace frame:
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 519649ddf100..fe522691ac71 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -207,7 +207,7 @@ spurious_8259A_irq:
* lets ACK and report it. [once per IRQ]
*/
if (!(spurious_irq_mask & irqmask)) {
- printk(KERN_DEBUG
+ printk_deferred(KERN_DEBUG
"spurious 8259A interrupt: IRQ%d.\n", irq);
spurious_irq_mask |= irqmask;
}
diff --git a/arch/x86/kernel/idt.c b/arch/x86/kernel/idt.c
index 87ef69a72c52..7bb4c3cbf4dc 100644
--- a/arch/x86/kernel/idt.c
+++ b/arch/x86/kernel/idt.c
@@ -318,7 +318,11 @@ void __init idt_setup_apic_and_irq_gates(void)
#ifdef CONFIG_X86_LOCAL_APIC
for_each_clear_bit_from(i, system_vectors, NR_VECTORS) {
- set_bit(i, system_vectors);
+ /*
+ * Don't set the non assigned system vectors in the
+ * system_vectors bitmap. Otherwise they show up in
+ * /proc/interrupts.
+ */
entry = spurious_entries_start + 8 * (i - FIRST_SYSTEM_VECTOR);
set_intr_gate(i, entry);
}
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 12df3a4abfdd..6b32ab009c19 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -43,7 +43,7 @@ static int map_irq_stack(unsigned int cpu)
pages[i] = pfn_to_page(pa >> PAGE_SHIFT);
}
- va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL);
+ va = vmap(pages, IRQ_STACK_SIZE / PAGE_SIZE, VM_MAP, PAGE_KERNEL);
if (!va)
return -ENOMEM;
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 43fc13c831af..62c39baea39e 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -746,16 +746,11 @@ asm(
NOKPROBE_SYMBOL(kretprobe_trampoline);
STACK_FRAME_NON_STANDARD(kretprobe_trampoline);
-static struct kprobe kretprobe_kprobe = {
- .addr = (void *)kretprobe_trampoline,
-};
-
/*
* Called from kretprobe_trampoline
*/
__used __visible void *trampoline_handler(struct pt_regs *regs)
{
- struct kprobe_ctlblk *kcb;
struct kretprobe_instance *ri = NULL;
struct hlist_head *head, empty_rp;
struct hlist_node *tmp;
@@ -765,16 +760,12 @@ __used __visible void *trampoline_handler(struct pt_regs *regs)
void *frame_pointer;
bool skipped = false;
- preempt_disable();
-
/*
* Set a dummy kprobe for avoiding kretprobe recursion.
* Since kretprobe never run in kprobe handler, kprobe must not
* be running at this point.
*/
- kcb = get_kprobe_ctlblk();
- __this_cpu_write(current_kprobe, &kretprobe_kprobe);
- kcb->kprobe_status = KPROBE_HIT_ACTIVE;
+ kprobe_busy_begin();
INIT_HLIST_HEAD(&empty_rp);
kretprobe_hash_lock(current, &head, &flags);
@@ -850,7 +841,7 @@ __used __visible void *trampoline_handler(struct pt_regs *regs)
__this_cpu_write(current_kprobe, &ri->rp->kp);
ri->ret_addr = correct_ret_addr;
ri->rp->handler(ri, regs);
- __this_cpu_write(current_kprobe, &kretprobe_kprobe);
+ __this_cpu_write(current_kprobe, &kprobe_busy);
}
recycle_rp_inst(ri, &empty_rp);
@@ -866,8 +857,7 @@ __used __visible void *trampoline_handler(struct pt_regs *regs)
kretprobe_hash_unlock(current, &flags);
- __this_cpu_write(current_kprobe, NULL);
- preempt_enable();
+ kprobe_busy_end();
hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) {
hlist_del(&ri->hlist);
diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index 54c21d6abd5a..5bb001c0c771 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -106,7 +106,6 @@ fs_initcall(nmi_warning_debugfs);
static void nmi_check_duration(struct nmiaction *action, u64 duration)
{
- u64 whole_msecs = READ_ONCE(action->max_duration);
int remainder_ns, decimal_msecs;
if (duration < nmi_longest_ns || duration < action->max_duration)
@@ -114,12 +113,12 @@ static void nmi_check_duration(struct nmiaction *action, u64 duration)
action->max_duration = duration;
- remainder_ns = do_div(whole_msecs, (1000 * 1000));
+ remainder_ns = do_div(duration, (1000 * 1000));
decimal_msecs = remainder_ns / 1000;
printk_ratelimited(KERN_INFO
"INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n",
- action->handler, whole_msecs, decimal_msecs);
+ action->handler, duration, decimal_msecs);
}
static int nmi_handle(unsigned int type, struct pt_regs *regs)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index af64519b2695..da3cc3a10d63 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -316,7 +316,7 @@ static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
*/
mutex_lock(&task->mm->context.lock);
ldt = task->mm->context.ldt;
- if (unlikely(idx >= ldt->nr_entries))
+ if (unlikely(!ldt || idx >= ldt->nr_entries))
base = 0;
else
base = get_desc_base(ldt->entries + idx);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 9674321ce3a3..8367bd7a9a81 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1599,14 +1599,28 @@ int native_cpu_disable(void)
if (ret)
return ret;
- /*
- * Disable the local APIC. Otherwise IPI broadcasts will reach
- * it. It still responds normally to INIT, NMI, SMI, and SIPI
- * messages.
- */
- apic_soft_disable();
cpu_disable_common();
+ /*
+ * Disable the local APIC. Otherwise IPI broadcasts will reach
+ * it. It still responds normally to INIT, NMI, SMI, and SIPI
+ * messages.
+ *
+ * Disabling the APIC must happen after cpu_disable_common()
+ * which invokes fixup_irqs().
+ *
+ * Disabling the APIC preserves already set bits in IRR, but
+ * an interrupt arriving after disabling the local APIC does not
+ * set the corresponding IRR bit.
+ *
+ * fixup_irqs() scans IRR for set bits so it can raise a not
+ * yet handled interrupt on the new destination CPU via an IPI
+ * but obviously it can't do so for IRR bits which are not set.
+ * IOW, interrupts arriving after disabling the local APIC will
+ * be lost.
+ */
+ apic_soft_disable();
+
return 0;
}
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 2d6898c2cb64..6d83b4b857e6 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -58,7 +58,6 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
* or a page fault), which can make frame pointers
* unreliable.
*/
-
if (IS_ENABLED(CONFIG_FRAME_POINTER))
return -EINVAL;
}
@@ -81,10 +80,6 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry,
if (unwind_error(&state))
return -EINVAL;
- /* Success path for non-user tasks, i.e. kthreads and idle tasks */
- if (!(task->flags & (PF_KTHREAD | PF_IDLE)))
- return -EINVAL;
-
return 0;
}
diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c
index c65adaf81384..41200706e6da 100644
--- a/arch/x86/kernel/tsc_msr.c
+++ b/arch/x86/kernel/tsc_msr.c
@@ -133,10 +133,15 @@ static const struct freq_desc freq_desc_ann = {
.mask = 0x0f,
};
-/* 24 MHz crystal? : 24 * 13 / 4 = 78 MHz */
+/*
+ * 24 MHz crystal? : 24 * 13 / 4 = 78 MHz
+ * Frequency step for Lightning Mountain SoC is fixed to 78 MHz,
+ * so all the frequency entries are 78000.
+ */
static const struct freq_desc freq_desc_lgm = {
.use_msr_plat = true,
- .freqs = { 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000 },
+ .freqs = { 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000,
+ 78000, 78000, 78000, 78000, 78000, 78000, 78000, 78000 },
.mask = 0x0f,
};
diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c
index aa0f39dc8129..187a86e0e753 100644
--- a/arch/x86/kernel/unwind_orc.c
+++ b/arch/x86/kernel/unwind_orc.c
@@ -431,8 +431,11 @@ bool unwind_next_frame(struct unwind_state *state)
/*
* Find the orc_entry associated with the text address.
*
- * Decrement call return addresses by one so they work for sibling
- * calls and calls to noreturn functions.
+ * For a call frame (as opposed to a signal frame), state->ip points to
+ * the instruction after the call. That instruction's stack layout
+ * could be different from the call instruction's layout, for example
+ * if the call was to a noreturn function. So get the ORC data for the
+ * call instruction itself.
*/
orc = orc_find(state->signal ? state->ip : state->ip - 1);
if (!orc) {
@@ -653,6 +656,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task,
state->sp = task->thread.sp;
state->bp = READ_ONCE_NOCHECK(frame->bp);
state->ip = READ_ONCE_NOCHECK(frame->ret_addr);
+ state->signal = (void *)state->ip == ret_from_fork;
}
if (get_stack_info((unsigned long *)state->sp, state->task,
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index bac1a65a9d39..1afe211d7a7c 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -362,6 +362,7 @@ SECTIONS
.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
__bss_start = .;
*(.bss..page_aligned)
+ . = ALIGN(PAGE_SIZE);
*(BSS_MAIN)
BSS_DECRYPTED
. = ALIGN(PAGE_SIZE);