From 8a43ddfb93a0c6ae1a6e1f5c25705ec5d1843c40 Mon Sep 17 00:00:00 2001
From: Richard Fellner
Date: Thu, 4 May 2017 14:26:50 +0200
Subject: KAISER: Kernel Address Isolation

This patch introduces our implementation of KAISER (Kernel Address
Isolation to have Side-channels Efficiently Removed), a kernel
isolation technique to close hardware side channels on kernel address
information.

More information about the patch can be found on:

        https://github.com/IAIK/KAISER

From: Richard Fellner
From: Daniel Gruss
X-Subject: [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode
Date: Thu, 4 May 2017 14:26:50 +0200
Link: http://marc.info/?l=linux-kernel&m=149390087310405&w=2
Kaiser-4.10-SHA1: c4b1831d44c6144d3762ccc72f0c4e71a0c713e5
Cc: Michael Schwarz
Cc: Richard Fellner
Cc: Ingo Molnar

After several recent works [1,2,3], KASLR on x86_64 was considered
basically dead by many researchers. We have been working on an
efficient but effective fix for this problem and found that not mapping
the kernel space when running in user mode is the solution to this
problem [4] (the corresponding paper [5] will be presented at ESSoS17).

With this RFC patch we allow anybody to configure their kernel with the
flag CONFIG_KAISER to add our defense mechanism.

If there are any questions we would love to answer them. We also
appreciate any comments!

Cheers,
Daniel (+ the KAISER team from Graz University of Technology)

[1] http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf
[2] https://www.blackhat.com/docs/us-16/materials/us-16-Fogh-Using-Undocumented-CPU-Behaviour-To-See-Into-Kernel-Mode-And-Break-KASLR-In-The-Process.pdf
[3] https://www.blackhat.com/docs/us-16/materials/us-16-Jang-Breaking-Kernel-Address-Space-Layout-Randomization-KASLR-With-Intel-TSX.pdf
[4] https://github.com/IAIK/KAISER
[5] https://gruss.cc/files/kaiser.pdf

[patch based also on
https://raw.githubusercontent.com/IAIK/KAISER/master/KAISER/0001-KAISER-Kernel-Address-Isolation.patch]

Signed-off-by: Richard Fellner
Signed-off-by: Moritz Lipp
Signed-off-by: Daniel Gruss
Signed-off-by: Michael Schwarz
Acked-by: Jiri Kosina
Signed-off-by: Hugh Dickins
Signed-off-by: Greg Kroah-Hartman
---
 arch/x86/mm/pgtable.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'arch/x86/mm/pgtable.c')

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index fb0a9dd1d6e4..087d3e1e7125 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -342,12 +342,38 @@ static inline void _pgd_free(pgd_t *pgd)
 #else
 static inline pgd_t *_pgd_alloc(void)
 {
+#ifdef CONFIG_KAISER
+	// Instead of one PML4, we acquire two PML4s and, thus, an 8kb-aligned memory
+	// block. Therefore, we have to allocate at least 3 pages. However, the
+	// __get_free_pages returns us 4 pages. Hence, we store the base pointer at
+	// the beginning of the page of our 8kb-aligned memory block in order to
+	// correctly free it afterwards.
+
+	unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE));
+
+	if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages)
+	{
+		*((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages;
+		return (pgd_t *) pages;
+	}
+	else
+	{
+		*((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages;
+		return (pgd_t *) (pages + PAGE_SIZE);
+	}
+#else
 	return (pgd_t *)__get_free_page(PGALLOC_GFP);
+#endif
 }
 
 static inline void _pgd_free(pgd_t *pgd)
 {
+#ifdef CONFIG_KAISER
+	unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE));
+	free_pages(pages, get_order(4*PAGE_SIZE));
+#else
 	free_page((unsigned long)pgd);
+#endif
 }
 #endif /* CONFIG_X86_PAE */
-- 
cgit v1.2.3
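The allocation scheme above is easier to follow outside the kernel.
Below is a minimal standalone C sketch of the same idea: over-allocate
four pages, return whichever pair of pages is 8kb-aligned, and stash
the block's true base pointer two pages past the returned pgd so that
the free path can recover it. PAGE_SIZE, aligned_alloc() and the
*_sketch names are illustrative stand-ins, not the kernel's API.

/* Standalone sketch of the over-allocate-and-align trick (not kernel code). */
#include <stdint.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL

static void *pgd_alloc_sketch(void)
{
	/* Four page-aligned pages guarantee an 8kb-aligned pair inside. */
	uintptr_t pages = (uintptr_t)aligned_alloc(PAGE_SIZE, 4 * PAGE_SIZE);

	if (!pages)
		return NULL;

	if ((pages & (2 * PAGE_SIZE - 1)) == 0) {
		/* Block is already 8kb-aligned: pages 0-1 hold the two pgds,
		   page 2 stores the base pointer for the free path. */
		*(uintptr_t *)(pages + 2 * PAGE_SIZE) = pages;
		return (void *)pages;
	}
	/* Otherwise pages 1-2 are the 8kb-aligned pair; page 3 stores the base. */
	*(uintptr_t *)(pages + 3 * PAGE_SIZE) = pages;
	return (void *)(pages + PAGE_SIZE);
}

static void pgd_free_sketch(void *pgd)
{
	/* Either way, the stashed base sits two pages past the returned pgd. */
	free((void *)*(uintptr_t *)((char *)pgd + 2 * PAGE_SIZE));
}

Either branch leaves the base pointer at pgd + 2 * PAGE_SIZE, which is
why the kernel's _pgd_free() above reads from that one fixed offset.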
From bed9bb7f3e6d4045013d2bb9e4004896de57f02b Mon Sep 17 00:00:00 2001
From: Dave Hansen
Date: Wed, 30 Aug 2017 16:23:00 -0700
Subject: kaiser: merged update

Merged fixes and cleanups, rebased to 4.4.89 tree (no 5-level paging).

Signed-off-by: Dave Hansen
Signed-off-by: Hugh Dickins
Acked-by: Jiri Kosina
Signed-off-by: Greg Kroah-Hartman
---
 arch/x86/mm/pgtable.c | 40 +++++++++++++---------------------------
 1 file changed, 13 insertions(+), 27 deletions(-)

(limited to 'arch/x86/mm/pgtable.c')

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 087d3e1e7125..e2bd5c81279e 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -340,40 +340,26 @@ static inline void _pgd_free(pgd_t *pgd)
 	kmem_cache_free(pgd_cache, pgd);
 }
 #else
-static inline pgd_t *_pgd_alloc(void)
-{
+
 #ifdef CONFIG_KAISER
-	// Instead of one PML4, we acquire two PML4s and, thus, an 8kb-aligned memory
-	// block. Therefore, we have to allocate at least 3 pages. However, the
-	// __get_free_pages returns us 4 pages. Hence, we store the base pointer at
-	// the beginning of the page of our 8kb-aligned memory block in order to
-	// correctly free it afterwards.
-
-	unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE));
-
-	if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages)
-	{
-		*((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages;
-		return (pgd_t *) pages;
-	}
-	else
-	{
-		*((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages;
-		return (pgd_t *) (pages + PAGE_SIZE);
-	}
+/*
+ * Instead of one pmd, we acquire two pmds. Being order-1, it is
+ * both 8k in size and 8k-aligned. That lets us just flip bit 12
+ * in a pointer to swap between the two 4k halves.
+ */
+#define PGD_ALLOCATION_ORDER 1
 #else
-	return (pgd_t *)__get_free_page(PGALLOC_GFP);
+#define PGD_ALLOCATION_ORDER 0
 #endif
+
+static inline pgd_t *_pgd_alloc(void)
+{
+	return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
 }
 
 static inline void _pgd_free(pgd_t *pgd)
 {
-#ifdef CONFIG_KAISER
-	unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE));
-	free_pages(pages, get_order(4*PAGE_SIZE));
-#else
-	free_page((unsigned long)pgd);
-#endif
+	free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
 }
 #endif /* CONFIG_X86_PAE */
-- 
cgit v1.2.3
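The order-1 allocation is what makes the two pgds addressable by a
single bit: bit 12 is the 4kb-page bit, so an 8kb-aligned 8kb block has
its kernel half at bit-12 = 0 and its shadow half at bit-12 = 1. A
standalone C sketch of that pointer arithmetic follows; the helper
names are illustrative (of the tree's real helpers, only
native_get_normal_pgd() appears in this log):

#include <stdint.h>

#define PAGE_SIZE 4096UL	/* bit 12 */

typedef struct { uint64_t pgd; } pgd_t;

/* Within an 8kb-aligned order-1 block, select the shadow (user) half. */
static inline pgd_t *get_shadow_pgd_sketch(pgd_t *pgdp)
{
	return (pgd_t *)((uintptr_t)pgdp | PAGE_SIZE);	/* set bit 12 */
}

/* ...and the normal (kernel) half. */
static inline pgd_t *get_normal_pgd_sketch(pgd_t *pgdp)
{
	return (pgd_t *)((uintptr_t)pgdp & ~PAGE_SIZE);	/* clear bit 12 */
}

Compared with the first patch's four-page version, this drops the
stashed base pointer entirely: free_pages() can take the pgd itself,
because the allocation now starts at the pgd.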
From d41f46f778951b0ea851ca52b88b2549c6336b47 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Fri, 13 Oct 2017 12:10:00 -0700
Subject: kaiser: _pgd_alloc() without __GFP_REPEAT to avoid stalls

Synthetic filesystem mempressure testing has shown softlockups, with
hour-long page allocation stalls, and pgd_alloc() trying for order:1
with __GFP_REPEAT in one of the backtraces each time.

That's _pgd_alloc() going for a Kaiser double-pgd, using the
__GFP_REPEAT common to all page table allocations, but actually having
no effect on order:0 (see should_alloc_oom() and
should_continue_reclaim() in this tree, but beware that ports to
another tree might behave differently).

Order:1 stack allocation has been working satisfactorily without
__GFP_REPEAT forever, and page table allocation only asks __GFP_REPEAT
for awkward occasions in a long-running process: it's not appropriate
at fork or exec time, and seems to be doing much more harm than good:
getting those contiguous pages under very heavy mempressure can be
hard (though even without it, Kaiser does generate more mempressure).

Mask out that __GFP_REPEAT inside _pgd_alloc(). Why not take it out of
the PGALLOC_GFP altogether, as v4.7 commit a3a9a59d2067 ("x86: get rid
of superfluous __GFP_REPEAT") did? Because I think that might make a
difference to our page table memcg charging, which I'd prefer not to
interfere with at this time.

hughd adds: __alloc_pages_slowpath() in the 4.4.89-stable tree handles
__GFP_REPEAT a little differently than in prod kernel or 3.18.72-stable,
so it may not always be exactly a no-op on order:0 pages, as said
above; but I think still appropriate to omit it from Kaiser or
non-Kaiser pgd.

Signed-off-by: Hugh Dickins
Acked-by: Jiri Kosina
Signed-off-by: Greg Kroah-Hartman
---
 arch/x86/mm/pgtable.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'arch/x86/mm/pgtable.c')

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index e2bd5c81279e..d0a424988f82 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -6,7 +6,7 @@
 #include
 #include
 
-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
 
 #ifdef CONFIG_HIGHPTE
 #define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -354,7 +354,9 @@ static inline void _pgd_free(pgd_t *pgd)
 
 static inline pgd_t *_pgd_alloc(void)
 {
-	return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER);
+	/* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */
+	return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT,
+					 PGD_ALLOCATION_ORDER);
 }
 
 static inline void _pgd_free(pgd_t *pgd)
-- 
cgit v1.2.3
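A detail worth noting in the first hunk: the parentheses added around
PGALLOC_GFP are what make the & ~__GFP_REPEAT masking in the second
hunk safe. Since & binds tighter than |, masking the unparenthesized
macro would only have masked the final __GFP_ZERO term and left
__GFP_REPEAT set. A standalone C illustration, with made-up flag values
rather than the kernel's real gfp bits:

#include <stdio.h>

/* Made-up flag values for illustration; not the kernel's gfp bits. */
#define GFP_KERNEL    0x01u
#define __GFP_NOTRACK 0x02u
#define __GFP_REPEAT  0x04u
#define __GFP_ZERO    0x08u

/* Unparenthesized, as before this patch: */
#define PGALLOC_GFP_OLD GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
/* Parenthesized, as after this patch: */
#define PGALLOC_GFP_NEW (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)

int main(void)
{
	/* Expands so that only __GFP_ZERO is masked: prints 0x0f,
	   with __GFP_REPEAT (0x04) still set. */
	printf("old: 0x%02x\n", PGALLOC_GFP_OLD & ~__GFP_REPEAT);
	/* Masks the whole set: prints 0x0b, __GFP_REPEAT cleared. */
	printf("new: 0x%02x\n", PGALLOC_GFP_NEW & ~__GFP_REPEAT);
	return 0;
}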
From e345dcc9481543edf4a0a5df4c4c2f9597b0a997 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Sun, 24 Sep 2017 16:59:49 -0700
Subject: kaiser: add "nokaiser" boot option, using ALTERNATIVE

Added "nokaiser" boot option: an early param like "noinvpcid". Most
places now check int kaiser_enabled (#defined 0 when not CONFIG_KAISER)
instead of #ifdef CONFIG_KAISER; but entry_64.S and entry_64_compat.S
are using the ALTERNATIVE technique, which patches in the preferred
instructions at runtime. That technique is tied to x86 cpu features, so
X86_FEATURE_KAISER is fabricated.

Prior to "nokaiser", Kaiser #defined _PAGE_GLOBAL 0: revert that, but
be careful with both _PAGE_GLOBAL and CR4.PGE: set them when running
nokaiser, just as when built without CONFIG_KAISER, but set neither
when kaiser is enabled. Neither matters on its own, but it's hard to
be sure that _PAGE_GLOBAL won't get set in some obscure corner, or
that something will add PGE into CR4. By omitting _PAGE_GLOBAL from
__supported_pte_mask when kaiser_enabled, all page table setup which
uses pte_pfn() masks it out of the ptes.

It's slightly shameful that the same declaration versus definition of
kaiser_enabled appears in not one, not two, but in three header files
(asm/kaiser.h, asm/pgtable.h, asm/tlbflush.h). I felt safer that way
than with #including any of those in any of the others; and did not
feel it worth an asm/kaiser_enabled.h - kernel/cpu/common.c includes
them all, so we shall hear about it if they get out of synch.

Cleanups while in the area: removed the silly #ifdef CONFIG_KAISER
from kaiser.c; removed the unused native_get_normal_pgd(); removed the
spurious reg clutter from SWITCH_*_CR3 macro stubs; corrected some
comments. But more interestingly, set CR4.PSE in secondary_startup_64:
the manual is clear that it does not matter whether it's 0 or 1 when
4-level-pts are enabled, but I was distracted to find cr4 different on
BSP and auxiliaries - BSP alone was adding PSE, in probe_page_size_mask().

Signed-off-by: Hugh Dickins
Acked-by: Jiri Kosina
Signed-off-by: Greg Kroah-Hartman
---
 arch/x86/mm/pgtable.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

(limited to 'arch/x86/mm/pgtable.c')

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index d0a424988f82..dbc27a2b4ad5 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -341,16 +341,12 @@ static inline void _pgd_free(pgd_t *pgd)
 }
 #else
 
-#ifdef CONFIG_KAISER
 /*
- * Instead of one pmd, we acquire two pmds. Being order-1, it is
+ * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is
  * both 8k in size and 8k-aligned. That lets us just flip bit 12
  * in a pointer to swap between the two 4k halves.
  */
-#define PGD_ALLOCATION_ORDER 1
-#else
-#define PGD_ALLOCATION_ORDER 0
-#endif
+#define PGD_ALLOCATION_ORDER kaiser_enabled
 
 static inline pgd_t *_pgd_alloc(void)
 {
-- 
cgit v1.2.3
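The #define PGD_ALLOCATION_ORDER kaiser_enabled line works because
kaiser_enabled is always 0 or 1, so one expression serves as both the
runtime switch and the allocation order: order-0 (one 4kb pgd) under
"nokaiser", order-1 (the 8kb kernel+shadow pair) otherwise. A
standalone sketch of the pattern, with aligned_alloc() standing in for
the kernel's page allocator and the *_sketch names purely illustrative:

#include <stdlib.h>

#define PAGE_SIZE 4096UL

int kaiser_enabled = 1;	/* a "nokaiser"-style early param would clear this */
#define PGD_ALLOCATION_ORDER kaiser_enabled

static void *pgd_alloc_sketch(void)
{
	size_t size = PAGE_SIZE << PGD_ALLOCATION_ORDER;

	/* Order-0: one 4kb pgd. Order-1: 8kb, and 8kb-aligned,
	   so the bit-12 flip between the two halves still works. */
	return aligned_alloc(size, size);
}

static void pgd_free_sketch(void *pgd)
{
	/* The same order feeds both paths, so no #ifdef is needed. */
	free(pgd);
}

When kaiser_enabled is cleared at boot, both paths collapse to the
order-0 case at runtime, with no CONFIG_KAISER conditionals left in
_pgd_alloc() or _pgd_free().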