Diffstat (limited to 'arch/x86/mm')
-rw-r--r--  arch/x86/mm/fault.c     43
-rw-r--r--  arch/x86/mm/gup.c       41
-rw-r--r--  arch/x86/mm/init.c       2
-rw-r--r--  arch/x86/mm/kaiser.c     4
-rw-r--r--  arch/x86/mm/pgtable.c   10
-rw-r--r--  arch/x86/mm/tlb.c      114
6 files changed, 167 insertions, 47 deletions
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 5c419b8f99a0..102b4e78f4e6 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -273,18 +273,19 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
pmd = pmd_offset(pud, address);
pmd_k = pmd_offset(pud_k, address);
- if (!pmd_present(*pmd_k))
- return NULL;
- if (!pmd_present(*pmd))
+ if (pmd_present(*pmd) != pmd_present(*pmd_k))
set_pmd(pmd, *pmd_k);
+
+ if (!pmd_present(*pmd_k))
+ return NULL;
else
- BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
+ BUG_ON(pmd_pfn(*pmd) != pmd_pfn(*pmd_k));
return pmd_k;
}
-void vmalloc_sync_all(void)
+static void vmalloc_sync(void)
{
unsigned long address;
@@ -299,22 +300,28 @@ void vmalloc_sync_all(void)
spin_lock(&pgd_lock);
list_for_each_entry(page, &pgd_list, lru) {
spinlock_t *pgt_lock;
- pmd_t *ret;
/* the pgt_lock only for Xen */
pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
spin_lock(pgt_lock);
- ret = vmalloc_sync_one(page_address(page), address);
+ vmalloc_sync_one(page_address(page), address);
spin_unlock(pgt_lock);
-
- if (!ret)
- break;
}
spin_unlock(&pgd_lock);
}
}
+void vmalloc_sync_mappings(void)
+{
+ vmalloc_sync();
+}
+
+void vmalloc_sync_unmappings(void)
+{
+ vmalloc_sync();
+}
+
/*
* 32-bit:
*
@@ -409,11 +416,23 @@ out:
#else /* CONFIG_X86_64: */
-void vmalloc_sync_all(void)
+void vmalloc_sync_mappings(void)
{
+ /*
+ * 64-bit mappings might allocate new p4d/pud pages
+ * that need to be propagated to all tasks' PGDs.
+ */
sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END, 0);
}
+void vmalloc_sync_unmappings(void)
+{
+ /*
+ * Unmappings never allocate or free p4d/pud pages.
+ * No work is required here.
+ */
+}
+
/*
* 64-bit:
*
@@ -430,8 +449,6 @@ static noinline int vmalloc_fault(unsigned long address)
if (!(address >= VMALLOC_START && address < VMALLOC_END))
return -1;
- WARN_ON_ONCE(in_nmi());
-
/*
* Copy kernel mappings over when needed. This can also
* happen within a race in page table update. In the later
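The fault.c hunks above change vmalloc_sync_one() to copy the reference (init_mm) entry whenever the two PMDs disagree on presence, so unmappings now propagate as well as new mappings, and they split vmalloc_sync_all() into vmalloc_sync_mappings() and vmalloc_sync_unmappings(). A minimal standalone sketch of that presence rule follows; it is userspace C, not kernel code, with a plain unsigned long standing in for a pmd entry and sync_one() as a made-up name.

/*
 * Standalone userspace sketch (not kernel code) of the presence rule in
 * the vmalloc_sync_one() hunk above: copy the reference entry whenever
 * the two entries disagree on presence, so unmappings are propagated as
 * well as new mappings.  A plain unsigned long stands in for a pmd.
 */
#include <stdio.h>

static void sync_one(unsigned long *pmd, const unsigned long *pmd_k)
{
	int present   = (*pmd   != 0);
	int present_k = (*pmd_k != 0);

	if (present != present_k)	/* mapping appeared or vanished */
		*pmd = *pmd_k;		/* copy from the reference table */
}

int main(void)
{
	unsigned long ref = 0x1000, local = 0;

	sync_one(&local, &ref);			/* propagate a new mapping */
	printf("after map:   %#lx\n", local);

	ref = 0;
	sync_one(&local, &ref);			/* propagate an unmapping */
	printf("after unmap: %#lx\n", local);
	return 0;
}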
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 1680768d392c..82f727fbbbd2 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -98,6 +98,20 @@ static inline int pte_allows_gup(unsigned long pteval, int write)
}
/*
+ * Return the compound head page with ref appropriately incremented,
+ * or NULL if that failed.
+ */
+static inline struct page *try_get_compound_head(struct page *page, int refs)
+{
+ struct page *head = compound_head(page);
+ if (WARN_ON_ONCE(page_ref_count(head) < 0))
+ return NULL;
+ if (unlikely(!page_cache_add_speculative(head, refs)))
+ return NULL;
+ return head;
+}
+
+/*
* The performance critical leaf functions are made noinline otherwise gcc
* inlines everything into a single function which results in too much
* register pressure.
@@ -112,7 +126,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
ptep = pte_offset_map(&pmd, addr);
do {
pte_t pte = gup_get_pte(ptep);
- struct page *page;
+ struct page *head, *page;
/* Similar to the PMD case, NUMA hinting must take slow path */
if (pte_protnone(pte)) {
@@ -138,7 +152,21 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
}
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
page = pte_page(pte);
- get_page(page);
+
+ head = try_get_compound_head(page, 1);
+ if (!head) {
+ put_dev_pagemap(pgmap);
+ pte_unmap(ptep);
+ return 0;
+ }
+
+ if (unlikely(pte_val(pte) != pte_val(*ptep))) {
+ put_page(head);
+ put_dev_pagemap(pgmap);
+ pte_unmap(ptep);
+ return 0;
+ }
+
put_dev_pagemap(pgmap);
SetPageReferenced(page);
pages[*nr] = page;
@@ -174,9 +202,12 @@ static int __gup_device_huge_pmd(pmd_t pmd, unsigned long addr,
undo_dev_pagemap(nr, nr_start, pages);
return 0;
}
+ if (unlikely(!try_get_page(page))) {
+ put_dev_pagemap(pgmap);
+ return 0;
+ }
SetPageReferenced(page);
pages[*nr] = page;
- get_page(page);
put_dev_pagemap(pgmap);
(*nr)++;
pfn++;
@@ -202,6 +233,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
refs = 0;
head = pmd_page(pmd);
+ if (WARN_ON_ONCE(page_ref_count(head) <= 0))
+ return 0;
page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
@@ -261,6 +294,8 @@ static noinline int gup_huge_pud(pud_t pud, unsigned long addr,
refs = 0;
head = pud_page(pud);
+ if (WARN_ON_ONCE(page_ref_count(head) <= 0))
+ return 0;
page = head + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
do {
VM_BUG_ON_PAGE(compound_head(page) != head, page);
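The gup.c hunks above guard against reference-count overflow: gup_pte_range() now takes its reference on the compound head via try_get_compound_head() and re-checks the PTE afterwards, and the huge-page paths bail out if the head page's count is already non-positive. The standalone sketch below mirrors that check-then-increment shape with a plain atomic counter; the names (fake_page, try_get_fake_page) are made up for illustration and this is not the kernel's struct page API.

/*
 * Standalone userspace sketch (not the kernel API) of the overflow-safe
 * "try get" pattern the gup.c hunks introduce: refuse to take a
 * reference once the counter has gone non-positive and report failure,
 * instead of incrementing unconditionally.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_page {
	atomic_int refcount;		/* stands in for the page refcount */
};

static bool try_get_fake_page(struct fake_page *page)
{
	if (atomic_load(&page->refcount) <= 0)	/* wrapped or freed: bail */
		return false;
	atomic_fetch_add(&page->refcount, 1);
	return true;
}

int main(void)
{
	struct fake_page ok  = { .refcount = 2 };
	struct fake_page bad = { .refcount = -1 };	/* simulated overflow */

	printf("ok:  %d\n", try_get_fake_page(&ok));	/* 1: ref taken */
	printf("bad: %d\n", try_get_fake_page(&bad));	/* 0: ref refused */
	return 0;
}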
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 90801a8f19c9..ce092a62fc5d 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -790,7 +790,7 @@ unsigned long max_swapfile_size(void)
pages = generic_max_swapfile_size();
- if (boot_cpu_has_bug(X86_BUG_L1TF)) {
+ if (boot_cpu_has_bug(X86_BUG_L1TF) && l1tf_mitigation != L1TF_MITIGATION_OFF) {
/* Limit the swap file size to MAX_PA/2 for L1TF workaround */
unsigned long long l1tf_limit = l1tf_pfn_limit();
/*
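The init.c hunk above only applies the L1TF swapfile clamp when the CPU actually has the bug and the mitigation has not been disabled. A standalone sketch of that decision follows; it is not kernel code, the 46-bit physical address width is just an example value, and clamp_swap_pages() is a made-up name.

/*
 * Standalone sketch (not kernel code) of the clamp logic in the init.c
 * hunk above: the MAX_PA/2 swapfile limit is applied only when the CPU
 * has the L1TF bug and the mitigation has not been switched off.
 */
#include <stdbool.h>
#include <stdio.h>

static unsigned long long clamp_swap_pages(unsigned long long pages,
					    bool has_l1tf_bug,
					    bool mitigation_off)
{
	const unsigned int phys_bits = 46;		/* example MAX_PA */
	unsigned long long limit =
		(1ULL << (phys_bits - 1)) >> 12;	/* MAX_PA/2, in 4K pages */

	if (has_l1tf_bug && !mitigation_off && pages > limit)
		pages = limit;
	return pages;
}

int main(void)
{
	unsigned long long requested = 1ULL << 40;	/* pages */

	printf("mitigation on:  %llu\n", clamp_swap_pages(requested, true, false));
	printf("mitigation off: %llu\n", clamp_swap_pages(requested, true, true));
	return 0;
}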
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
index 3f729e20f0e3..12522dbae615 100644
--- a/arch/x86/mm/kaiser.c
+++ b/arch/x86/mm/kaiser.c
@@ -9,6 +9,7 @@
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
+#include <linux/cpu.h>
#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt
@@ -297,7 +298,8 @@ void __init kaiser_check_boottime_disable(void)
goto skip;
}
- if (cmdline_find_option_bool(boot_command_line, "nopti"))
+ if (cmdline_find_option_bool(boot_command_line, "nopti") ||
+ cpu_mitigations_off())
goto disable;
skip:
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index e30baa8ad94f..08e0380414a9 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -251,7 +251,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
if (pgd_val(pgd) != 0) {
pmd_t *pmd = (pmd_t *)pgd_page_vaddr(pgd);
- pgdp[i] = native_make_pgd(0);
+ pgd_clear(&pgdp[i]);
paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
pmd_free(mm, pmd);
@@ -419,7 +419,7 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
int changed = !pte_same(*ptep, entry);
if (changed && dirty) {
- *ptep = entry;
+ set_pte(ptep, entry);
pte_update(vma->vm_mm, address, ptep);
}
@@ -436,7 +436,7 @@ int pmdp_set_access_flags(struct vm_area_struct *vma,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
if (changed && dirty) {
- *pmdp = entry;
+ set_pmd(pmdp, entry);
/*
* We had a write-protection fault here and changed the pmd
* to a more permissive one. No need to flush the TLB for that,
@@ -544,8 +544,8 @@ void __native_set_fixmap(enum fixed_addresses idx, pte_t pte)
fixmaps_set++;
}
-void native_set_fixmap(enum fixed_addresses idx, phys_addr_t phys,
- pgprot_t flags)
+void native_set_fixmap(unsigned /* enum fixed_addresses */ idx,
+ phys_addr_t phys, pgprot_t flags)
{
__native_set_fixmap(idx, pfn_pte(phys >> PAGE_SHIFT, flags));
}
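The pgtable.c hunks above replace direct stores to page-table entries with set_pte()/set_pmd()/pgd_clear(). The standalone sketch below illustrates the general point of routing every update through a single setter (in the kernel, that is where the paravirt/PTI machinery hooks in); the fake_pte_t type and set_fake_pte() are made up for illustration and the counter merely stands in for those side effects.

/*
 * Standalone sketch (not the kernel API): routing every update through
 * one setter gives a single point where extra work always runs, which a
 * raw store would silently skip.
 */
#include <stdio.h>

typedef struct { unsigned long val; } fake_pte_t;

static unsigned long hook_calls;	/* stands in for the setter's side effects */

static void set_fake_pte(fake_pte_t *ptep, fake_pte_t entry)
{
	hook_calls++;			/* hook point a raw store would bypass */
	*ptep = entry;
}

int main(void)
{
	fake_pte_t pte = { 0 };

	set_fake_pte(&pte, (fake_pte_t){ 0x63 });	/* example bit pattern */
	printf("pte=%#lx, hook ran %lu time(s)\n", pte.val, hook_calls);
	return 0;
}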
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index eac92e2d171b..a112bb175dd4 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -30,6 +30,12 @@
* Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
*/
+/*
+ * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
+ * stored in cpu_tlb_state.last_user_mm_ibpb.
+ */
+#define LAST_USER_MM_IBPB 0x1UL
+
atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
struct flush_tlb_info {
@@ -101,33 +107,101 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
local_irq_restore(flags);
}
+static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+{
+ unsigned long next_tif = task_thread_info(next)->flags;
+ unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
+
+ return (unsigned long)next->mm | ibpb;
+}
+
+static void cond_ibpb(struct task_struct *next)
+{
+ if (!next || !next->mm)
+ return;
+
+ /*
+ * Both the conditional and the always IBPB mode use the mm
+ * pointer to avoid the IBPB when switching between tasks of the
+ * same process. Using the mm pointer instead of mm->context.ctx_id
+ * opens a hypothetical hole vs. mm_struct reuse, which is more or
+ * less impossible to control by an attacker. Aside from that, it
+ * would only affect the first schedule so the theoretically
+ * exposed data is not really interesting.
+ */
+ if (static_branch_likely(&switch_mm_cond_ibpb)) {
+ unsigned long prev_mm, next_mm;
+
+ /*
+ * This is a bit more complex than the always mode because
+ * it has to handle two cases:
+ *
+ * 1) Switch from a user space task (potential attacker)
+ * which has TIF_SPEC_IB set to a user space task
+ * (potential victim) which has TIF_SPEC_IB not set.
+ *
+ * 2) Switch from a user space task (potential attacker)
+ * which has TIF_SPEC_IB not set to a user space task
+ * (potential victim) which has TIF_SPEC_IB set.
+ *
+ * This could be done by unconditionally issuing IBPB when
+ * a task which has TIF_SPEC_IB set is either scheduled in
+ * or out. Though that results in two flushes when:
+ *
+ * - the same user space task is scheduled out and later
+ * scheduled in again and only a kernel thread ran in
+ * between.
+ *
+ * - a user space task belonging to the same process is
+ * scheduled in after a kernel thread ran in between
+ *
+ * - a user space task belonging to the same process is
+ * scheduled in immediately.
+ *
+ * Optimize this with reasonably small overhead for the
+ * above cases. Mangle the TIF_SPEC_IB bit into the mm
+ * pointer of the incoming task which is stored in
+ * cpu_tlbstate.last_user_mm_ibpb for comparison.
+ */
+ next_mm = mm_mangle_tif_spec_ib(next);
+ prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
+
+ /*
+ * Issue IBPB only if the mm's are different and one or
+ * both have the IBPB bit set.
+ */
+ if (next_mm != prev_mm &&
+ (next_mm | prev_mm) & LAST_USER_MM_IBPB)
+ indirect_branch_prediction_barrier();
+
+ this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
+ }
+
+ if (static_branch_unlikely(&switch_mm_always_ibpb)) {
+ /*
+ * Only flush when switching to a user space task with a
+ * different context than the user space task which ran
+ * last on this CPU.
+ */
+ if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
+ indirect_branch_prediction_barrier();
+ this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
+ }
+ }
+}
+
void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
struct task_struct *tsk)
{
unsigned cpu = smp_processor_id();
if (likely(prev != next)) {
- u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
-
/*
* Avoid user/user BTB poisoning by flushing the branch
* predictor when switching between processes. This stops
* one process from doing Spectre-v2 attacks on another.
- *
- * As an optimization, flush indirect branches only when
- * switching into processes that disable dumping. This
- * protects high value processes like gpg, without having
- * too high performance overhead. IBPB is *expensive*!
- *
- * This will not flush branches when switching into kernel
- * threads. It will also not flush if we switch to idle
- * thread and back to the same process. It will flush if we
- * switch to a different non-dumpable process.
*/
- if (tsk && tsk->mm &&
- tsk->mm->context.ctx_id != last_ctx_id &&
- get_dumpable(tsk->mm) != SUID_DUMP_USER)
- indirect_branch_prediction_barrier();
+ cond_ibpb(tsk);
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
/*
@@ -143,14 +217,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
}
- /*
- * Record last user mm's context id, so we can avoid
- * flushing branch buffer with IBPB if we switch back
- * to the same user.
- */
- if (next != &init_mm)
- this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
-
this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
this_cpu_write(cpu_tlbstate.active_mm, next);
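The tlb.c hunks above replace the dumpability-based IBPB heuristic with cond_ibpb(): the TIF_SPEC_IB bit of the incoming task is folded into bit 0 of its mm pointer, the result is compared with the value cached in cpu_tlbstate.last_user_mm_ibpb, and the barrier is issued only when they differ and at least one side has the bit set. The standalone userspace sketch below demonstrates only that comparison rule; the stand-in mm objects and the helper names (mangle, need_ibpb) are invented for illustration and this is not the kernel code path.

/*
 * Standalone userspace sketch (not kernel code) of the cond_ibpb()
 * comparison rule: mangle the "IBPB wanted" bit into bit 0 of the mm
 * pointer and fire the barrier only when the mangled values differ and
 * at least one of them carries the bit.
 */
#include <stdbool.h>
#include <stdio.h>

#define IBPB_BIT 0x1UL		/* mirrors LAST_USER_MM_IBPB above */

static unsigned long mangle(const void *mm, bool spec_ib)
{
	return (unsigned long)mm | (spec_ib ? IBPB_BIT : 0);
}

static bool need_ibpb(unsigned long prev_mm, unsigned long next_mm)
{
	return next_mm != prev_mm && ((next_mm | prev_mm) & IBPB_BIT);
}

int main(void)
{
	static int mm_a, mm_b;	/* stand-ins for two mm_structs */

	/* same process, both sides flagged: values match, no barrier */
	printf("%d\n", need_ibpb(mangle(&mm_a, true), mangle(&mm_a, true)));
	/* different process, incoming task asked for protection: barrier */
	printf("%d\n", need_ibpb(mangle(&mm_a, false), mangle(&mm_b, true)));
	/* different process, neither side opted in: no barrier */
	printf("%d\n", need_ibpb(mangle(&mm_a, false), mangle(&mm_b, false)));
	return 0;
}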