#include <linux/bug.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/string.h>
#include <linux/types.h>
#include <linux/bug.h>
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/spinlock.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
#include <linux/ftrace.h>

#undef pr_fmt
#define pr_fmt(fmt)	"Kernel/User page tables isolation: " fmt

#include <asm/kaiser.h>
#include <asm/tlbflush.h>	/* to verify its kaiser declarations */
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/desc.h>
#include <asm/cmdline.h>
#include <asm/vsyscall.h>

int kaiser_enabled __read_mostly = 1;
EXPORT_SYMBOL(kaiser_enabled);	/* for inlined TLB flush functions */

__visible
DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);

/*
 * These can have bit 63 set, so we can not just use a plain "or"
 * instruction to get their value or'd into CR3.  It would take
 * another register.  So, we use a memory reference to these instead.
 *
 * This is also handy because systems that do not support PCIDs
 * just end up or'ing a 0 into their CR3, which does no harm.
 */
DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user);
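/*
 * Illustrative sketch only -- not part of the original file.  The real
 * kernel->user CR3 switch is hand-written assembly in the entry code; in
 * C it is roughly the following, assuming the pre-4.12
 * native_read_cr3()/native_write_cr3() helpers.  The per-cpu
 * x86_cr3_pcid_user value supplies both the shadow-pgd offset and the
 * user PCID/NOFLUSH bits prepared by kaiser_setup_pcid() and
 * kaiser_flush_tlb_on_return_to_user() later in this file.  (The real
 * assembly also restores the NOFLUSH bit after a flushing switch; that
 * detail is omitted here.)
 */
static inline void example_switch_to_user_cr3(void)	/* hypothetical */
{
	unsigned long cr3 = native_read_cr3();

	/* Point CR3 at the shadow pgd and select the user PCID bits. */
	cr3 |= this_cpu_read(x86_cr3_pcid_user);
	native_write_cr3(cr3);
}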
/*
 * At runtime, the only things we map are some things for CPU
 * hotplug, and stacks for new processes.  No two CPUs will ever
 * be populating the same addresses, so we only need to ensure
 * that we protect between two CPUs trying to allocate and
 * populate the same page table page.
 *
 * Only take this lock when doing a set_p[4um]d(), but it is not
 * needed for doing a set_pte().  We assume that only the *owner*
 * of a given allocation will be doing this for _their_
 * allocation.
 *
 * This ensures that once a system has been running for a while
 * and there have been stacks all over and these page tables
 * are fully populated, there will be no further acquisitions of
 * this lock.
 */
static DEFINE_SPINLOCK(shadow_table_allocation_lock);

/*
 * Returns -1 on error.
 */
static inline unsigned long get_pa_from_mapping(unsigned long vaddr)
{
	pgd_t *pgd;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	pgd = pgd_offset_k(vaddr);
	/*
	 * We made all the kernel PGDs present in kaiser_init().
	 * We expect them to stay that way.
	 */
	BUG_ON(pgd_none(*pgd));
	/*
	 * PGDs are either 512GB or 128TB on all x86_64
	 * configurations.  We don't handle these.
	 */
	BUG_ON(pgd_large(*pgd));

	pud = pud_offset(pgd, vaddr);
	if (pud_none(*pud)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pud_large(*pud))
		return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK);

	pmd = pmd_offset(pud, vaddr);
	if (pmd_none(*pmd)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	if (pmd_large(*pmd))
		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK);

	pte = pte_offset_kernel(pmd, vaddr);
	if (pte_none(*pte)) {
		WARN_ON_ONCE(1);
		return -1;
	}

	return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
}

/*
 * This is a relatively normal page table walk, except that it
 * also tries to allocate page table pages along the way.
 *
 * Returns a pointer to a PTE on success, or NULL on failure.
 */
static pte_t *kaiser_pagetable_walk(unsigned long address, bool user)
{
	pmd_t *pmd;
	pud_t *pud;
	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address));
	gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO);
	unsigned long prot = _KERNPG_TABLE;

	if (pgd_none(*pgd)) {
		WARN_ONCE(1, "All shadow pgds should have been populated");
		return NULL;
	}
	BUILD_BUG_ON(pgd_large(*pgd) != 0);

	if (user) {
		/*
		 * The vsyscall page is the only page that will have
		 * _PAGE_USER set.  Catch everything else.
		 */
		BUG_ON(address != VSYSCALL_ADDR);
		set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER));
		prot = _PAGE_TABLE;
	}

	pud = pud_offset(pgd, address);
	/* The shadow page tables do not use large mappings: */
	if (pud_large(*pud)) {
		WARN_ON(1);
		return NULL;
	}
	if (pud_none(*pud)) {
		unsigned long new_pmd_page = __get_free_page(gfp);
		if (!new_pmd_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pud_none(*pud)) {
			set_pud(pud, __pud(prot | __pa(new_pmd_page)));
			__inc_zone_page_state(virt_to_page((void *)
						new_pmd_page), NR_KAISERTABLE);
		} else
			free_page(new_pmd_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	pmd = pmd_offset(pud, address);
	/* The shadow page tables do not use large mappings: */
	if (pmd_large(*pmd)) {
		WARN_ON(1);
		return NULL;
	}
	if (pmd_none(*pmd)) {
		unsigned long new_pte_page = __get_free_page(gfp);
		if (!new_pte_page)
			return NULL;
		spin_lock(&shadow_table_allocation_lock);
		if (pmd_none(*pmd)) {
			set_pmd(pmd, __pmd(prot | __pa(new_pte_page)));
			__inc_zone_page_state(virt_to_page((void *)
						new_pte_page), NR_KAISERTABLE);
		} else
			free_page(new_pte_page);
		spin_unlock(&shadow_table_allocation_lock);
	}

	return pte_offset_kernel(pmd, address);
}
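/*
 * Note (editorial, an assumption about code outside this file):
 * native_get_shadow_pgd() used above is defined in the arch headers.
 * With KAISER the top-level pgd is allocated as an order-1 (two page)
 * block: the kernel copy occupies the first page and the shadow/user
 * copy sits in the page immediately after it, so the helper is roughly
 *
 *	static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp)
 *	{
 *		return (pgd_t *)((unsigned long)pgdp | PAGE_SIZE);
 *	}
 *
 * which is why kaiser_set_shadow_pgd() below can sanity-check a pgd
 * pointer with "(unsigned long)pgdp & PAGE_SIZE".
 */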
static int kaiser_add_user_map(const void *__start_addr, unsigned long size,
			       unsigned long flags)
{
	int ret = 0;
	pte_t *pte;
	unsigned long start_addr = (unsigned long )__start_addr;
	unsigned long address = start_addr & PAGE_MASK;
	unsigned long end_addr = PAGE_ALIGN(start_addr + size);
	unsigned long target_address;

	/*
	 * It is convenient for callers to pass in __PAGE_KERNEL etc,
	 * and there is no actual harm from setting _PAGE_GLOBAL, so
	 * long as CR4.PGE is not set.  But it is nonetheless troubling
	 * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser"
	 * requires that not to be #defined to 0): so mask it off here.
	 */
	flags &= ~_PAGE_GLOBAL;
	if (!(__supported_pte_mask & _PAGE_NX))
		flags &= ~_PAGE_NX;

	for (; address < end_addr; address += PAGE_SIZE) {
		target_address = get_pa_from_mapping(address);
		if (target_address == -1) {
			ret = -EIO;
			break;
		}
		pte = kaiser_pagetable_walk(address, flags & _PAGE_USER);
		if (!pte) {
			ret = -ENOMEM;
			break;
		}
		if (pte_none(*pte)) {
			set_pte(pte, __pte(flags | target_address));
		} else {
			pte_t tmp;
			set_pte(&tmp, __pte(flags | target_address));
			WARN_ON_ONCE(!pte_same(*pte, tmp));
		}
	}
	return ret;
}

static int kaiser_add_user_map_ptrs(const void *start, const void *end,
				    unsigned long flags)
{
	unsigned long size = end - start;

	return kaiser_add_user_map(start, size, flags);
}

/*
 * Ensure that the top level of the (shadow) page tables is
 * entirely populated.  This ensures that all processes that get
 * forked have the same entries.  This way, we do not have to
 * ever go set up new entries in older processes.
 *
 * Note: we never free these, so there are no updates to them
 * after this.
 */
static void __init kaiser_init_all_pgds(void)
{
	pgd_t *pgd;
	int i = 0;

	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
		pgd_t new_pgd;
		pud_t *pud = pud_alloc_one(&init_mm,
					   PAGE_OFFSET + i * PGDIR_SIZE);
		if (!pud) {
			WARN_ON(1);
			break;
		}
		inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE);
		new_pgd = __pgd(_KERNPG_TABLE | __pa(pud));
		/*
		 * Make sure not to stomp on some other pgd entry.
		 */
		if (!pgd_none(pgd[i])) {
			WARN_ON(1);
			continue;
		}
		set_pgd(pgd + i, new_pgd);
	}
}

#define kaiser_add_user_map_early(start, size, flags) do {	\
	int __ret = kaiser_add_user_map(start, size, flags);	\
	WARN_ON(__ret);						\
} while (0)

#define kaiser_add_user_map_ptrs_early(start, end, flags) do {		\
	int __ret = kaiser_add_user_map_ptrs(start, end, flags);	\
	WARN_ON(__ret);							\
} while (0)

void __init kaiser_check_boottime_disable(void)
{
	bool enable = true;
	char arg[5];
	int ret;

	if (boot_cpu_has(X86_FEATURE_XENPV))
		goto silent_disable;

	ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg));
	if (ret > 0) {
		if (!strncmp(arg, "on", 2))
			goto enable;

		if (!strncmp(arg, "off", 3))
			goto disable;

		if (!strncmp(arg, "auto", 4))
			goto skip;
	}

	if (cmdline_find_option_bool(boot_command_line, "nopti"))
		goto disable;

skip:
	if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
		goto disable;

enable:
	if (enable)
		setup_force_cpu_cap(X86_FEATURE_KAISER);

	return;

disable:
	pr_info("disabled\n");

silent_disable:
	kaiser_enabled = 0;
	setup_clear_cpu_cap(X86_FEATURE_KAISER);
}

/*
 * If anything in here fails, we will likely die on one of the
 * first kernel->user transitions and init will die.  But, we
 * will have most of the kernel up by then and should be able to
 * get a clean warning out of it.  If we BUG_ON() here, we run
 * the risk of dying before we have good console output.
 */
void __init kaiser_init(void)
{
	int cpu;

	if (!kaiser_enabled)
		return;

	kaiser_init_all_pgds();

	/*
	 * Note that this sets _PAGE_USER and it needs to happen when the
	 * pagetable hierarchy gets created, i.e., early.  Otherwise
	 * kaiser_pagetable_walk() will encounter already-initialized PTEs
	 * in the hierarchy and not set the proper permissions, leading to
	 * page faults with page-protection violations when, for example,
	 * trying to read the vsyscall page.
	 */
	if (vsyscall_enabled())
		kaiser_add_user_map_early((void *)VSYSCALL_ADDR,
					  PAGE_SIZE,
					  vsyscall_pgprot);

	for_each_possible_cpu(cpu) {
		void *percpu_vaddr = __per_cpu_user_mapped_start +
				     per_cpu_offset(cpu);
		unsigned long percpu_sz = __per_cpu_user_mapped_end -
					  __per_cpu_user_mapped_start;
		kaiser_add_user_map_early(percpu_vaddr, percpu_sz,
					  __PAGE_KERNEL);
	}

	/*
	 * Map the entry/exit text section, which is needed at
	 * switches from user to kernel and back.
	 */
	kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end,
				       __PAGE_KERNEL_RX);

#ifdef CONFIG_FUNCTION_GRAPH_TRACER
	kaiser_add_user_map_ptrs_early(__irqentry_text_start,
				       __irqentry_text_end,
				       __PAGE_KERNEL_RX);
#endif
	kaiser_add_user_map_early((void *)idt_descr.address,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL_RO);
#ifdef CONFIG_TRACING
	kaiser_add_user_map_early(&trace_idt_descr,
				  sizeof(trace_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&trace_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);
#endif
	kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr),
				  __PAGE_KERNEL);
	kaiser_add_user_map_early(&debug_idt_table,
				  sizeof(gate_desc) * NR_VECTORS,
				  __PAGE_KERNEL);

	pr_info("enabled\n");
}

/* Add a mapping to the shadow mapping, and synchronize the mappings */
int kaiser_add_mapping(unsigned long addr, unsigned long size,
		       unsigned long flags)
{
	if (!kaiser_enabled)
		return 0;
	return kaiser_add_user_map((const void *)addr, size, flags);
}

void kaiser_remove_mapping(unsigned long start, unsigned long size)
{
	extern void unmap_pud_range_nofree(pgd_t *pgd,
				unsigned long start, unsigned long end);
	unsigned long end = start + size;
	unsigned long addr, next;
	pgd_t *pgd;

	if (!kaiser_enabled)
		return;
	pgd = native_get_shadow_pgd(pgd_offset_k(start));
	for (addr = start; addr < end; pgd++, addr = next) {
		next = pgd_addr_end(addr, end);
		unmap_pud_range_nofree(pgd, addr, next);
	}
}
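/*
 * Usage sketch -- the two functions below are hypothetical and not part
 * of the original file.  Per the comment above
 * shadow_table_allocation_lock, the main runtime additions to the shadow
 * tables are CPU-hotplug data and stacks for new processes.  A caller
 * mirroring a freshly allocated kernel stack, and removing it again when
 * the stack is freed, would look roughly like this:
 */
static inline int example_map_new_stack(void *stack)	/* hypothetical */
{
	/* Mirror the stack pages into the user/shadow page tables. */
	return kaiser_add_mapping((unsigned long)stack, THREAD_SIZE,
				  __PAGE_KERNEL);
}

static inline void example_unmap_stack(void *stack)	/* hypothetical */
{
	/* Drop the shadow PTEs again when the stack goes away. */
	kaiser_remove_mapping((unsigned long)stack, THREAD_SIZE);
}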
/*
 * Page table pages are page-aligned.  The lower half of the top
 * level is used for userspace and the top half for the kernel.
 * This returns true for user pages that need to get copied into
 * both the user and kernel copies of the page tables, and false
 * for kernel pages that should only be in the kernel copy.
 */
static inline bool is_userspace_pgd(pgd_t *pgdp)
{
	return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2);
}

pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
{
	if (!kaiser_enabled)
		return pgd;
	/*
	 * Do we need to also populate the shadow pgd?  Check _PAGE_USER to
	 * skip cases like kexec and EFI which make temporary low mappings.
	 */
	if (pgd.pgd & _PAGE_USER) {
		if (is_userspace_pgd(pgdp)) {
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
			/*
			 * Even if the entry is *mapping* userspace, ensure
			 * that userspace can not use it.  This way, if we
			 * get out to userspace running on the kernel CR3,
			 * userspace will crash instead of running.
			 */
			if (__supported_pte_mask & _PAGE_NX)
				pgd.pgd |= _PAGE_NX;
		}
	} else if (!pgd.pgd) {
		/*
		 * pgd_clear() cannot check _PAGE_USER, and is even used to
		 * clear corrupted pgd entries: so just rely on cases like
		 * kexec and EFI never to be using pgd_clear().
		 */
		if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
		     is_userspace_pgd(pgdp))
			native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
	}
	return pgd;
}

void kaiser_setup_pcid(void)
{
	unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET;

	if (this_cpu_has(X86_FEATURE_PCID))
		user_cr3 |= X86_CR3_PCID_USER_NOFLUSH;
	/*
	 * These variables are used by the entry/exit
	 * code to change PCID and pgd and TLB flushing.
	 */
	this_cpu_write(x86_cr3_pcid_user, user_cr3);
}

/*
 * Make a note that this cpu will need to flush USER tlb on return to user.
 * If cpu does not have PCID, then the NOFLUSH bit will never have been set.
 */
void kaiser_flush_tlb_on_return_to_user(void)
{
	if (this_cpu_has(X86_FEATURE_PCID))
		this_cpu_write(x86_cr3_pcid_user,
			X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);
}
EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user);
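/*
 * Note (editorial, an assumption about code outside this file): the
 * kaiser_set_shadow_pgd() hook above is not called from within this
 * file; the KAISER patches wire it into the top-level pgd setter in the
 * arch headers, roughly:
 *
 *	static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 *	{
 *		*pgdp = kaiser_set_shadow_pgd(pgdp, pgd);
 *	}
 *
 * so that every write of a top-level entry is either mirrored into the
 * shadow pgd or deliberately left kernel-only.
 */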