KAISER: Kernel Address Isolation

This patch introduces our implementation of KAISER (Kernel Address Isolation to have Side-channels Efficiently Removed), a kernel isolation technique to close hardware side channels on kernel address information. More information about the patch can be found on: https://github.com/IAIK/KAISER From: Richard Fellner <richard.fellner@student.tugraz.at> From: Daniel Gruss <daniel.gruss@iaik.tugraz.at> X-Subject: [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode Date: Thu, 4 May 2017 14:26:50 +0200 Link: http://marc.info/?l=linux-kernel&m=149390087310405&w=2 Kaiser-4.10-SHA1: c4b1831d44c6144d3762ccc72f0c4e71a0c713e5 To: <linux-kernel@vger.kernel.org> To: <kernel-hardening@lists.openwall.com> Cc: <clementine.maurice@iaik.tugraz.at> Cc: <moritz.lipp@iaik.tugraz.at> Cc: Michael Schwarz <michael.schwarz@iaik.tugraz.at> Cc: Richard Fellner <richard.fellner@student.tugraz.at> Cc: Ingo Molnar <mingo@kernel.org> Cc: <kirill.shutemov@linux.intel.com> Cc: <anders.fogh@gdata-adan.de> After several recent works [1,2,3] KASLR on x86_64 was basically considered dead by many researchers. We have been working on an efficient but effective fix for this problem and found that not mapping the kernel space when running in user mode is the solution to this problem [4] (the corresponding paper [5] will be presented at ESSoS17). With this RFC patch we allow anybody to configure their kernel with the flag CONFIG_KAISER to add our defense mechanism. If there are any questions we would love to answer them. We also appreciate any comments! Cheers, Daniel (+ the KAISER team from Graz University of Technology) [1] http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf [2] https://www.blackhat.com/docs/us-16/materials/us-16-Fogh-Using-Undocumented-CPU-Behaviour-To-See-Into-Kernel-Mode-And-Break-KASLR-In-The-Process.pdf [3] https://www.blackhat.com/docs/us-16/materials/us-16-Jang-Breaking-Kernel-Address-Space-Layout-Randomization-KASLR-With-Intel-TSX.pdf [4] https://github.com/IAIK/KAISER [5] https://gruss.cc/files/kaiser.pdf [patch based also on https://raw.githubusercontent.com/IAIK/KAISER/master/KAISER/0001-KAISER-Kernel-Address-Isolation.patch] Signed-off-by: Richard Fellner <richard.fellner@student.tugraz.at> Signed-off-by: Moritz Lipp <moritz.lipp@iaik.tugraz.at> Signed-off-by: Daniel Gruss <daniel.gruss@iaik.tugraz.at> Signed-off-by: Michael Schwarz <michael.schwarz@iaik.tugraz.at> Acked-by: Jiri Kosina <jkosina@suse.cz> Signed-off-by: Hugh Dickins <hughd@google.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
author: Richard Fellner <richard.fellner@student.tugraz.at> 2017-05-04 14:26:50 +0200
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org> 2018-01-05 15:44:23 +0100
commit: 8a43ddfb93a0c6ae1a6e1f5c25705ec5d1843c40 (patch)
tree: 3eced34a05bd8124e0a03d63d982e7cdf01d013e /arch
parent: 0fa147b407478e73fe7a478677ff2b12bb824014 (diff)
17 files changed, 388 insertions, 15 deletions
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index cc0f2f5da19b..273bc5b8d491 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -35,6 +35,7 @@
 #include <asm/asm.h>
 #include <asm/smap.h>
 #include <asm/pgtable_types.h>
+#include <asm/kaiser.h>
 #include <linux/err.h>
 
 /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
@@ -135,6 +136,7 @@ ENTRY(entry_SYSCALL_64)
 	 * it is too small to ever cause noticeable irq latency.
 	 */
 	SWAPGS_UNSAFE_STACK
+	SWITCH_KERNEL_CR3_NO_STACK
 	/*
 	 * A hypervisor implementation might want to use a label
 	 * after the swapgs, so that it can do the swapgs
@@ -207,9 +209,10 @@ entry_SYSCALL_64_fastpath:
 	testl	$_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
 	jnz	int_ret_from_sys_call_irqs_off	/* Go to the slow path */
 
-	RESTORE_C_REGS_EXCEPT_RCX_R11
 	movq	RIP(%rsp), %rcx
 	movq	EFLAGS(%rsp), %r11
+	RESTORE_C_REGS_EXCEPT_RCX_R11
+	SWITCH_USER_CR3
 	movq	RSP(%rsp), %rsp
 	/*
 	 * 64-bit SYSRET restores rip from rcx,
@@ -347,10 +350,12 @@ GLOBAL(int_ret_from_sys_call)
 syscall_return_via_sysret:
 	/* rcx and r11 are already restored (see code above) */
 	RESTORE_C_REGS_EXCEPT_RCX_R11
+	SWITCH_USER_CR3
 	movq	RSP(%rsp), %rsp
 	USERGS_SYSRET64
 
 opportunistic_sysret_failed:
+	SWITCH_USER_CR3
 	SWAPGS
 	jmp	restore_c_regs_and_iret
 END(entry_SYSCALL_64)
@@ -509,6 +514,7 @@ END(irq_entries_start)
 	 * tracking that we're in kernel mode.
 	 */
 	SWAPGS
+	SWITCH_KERNEL_CR3
 
 	/*
 	 * We need to tell lockdep that IRQs are off.  We can't do this until
@@ -568,6 +574,7 @@ GLOBAL(retint_user)
 	mov	%rsp,%rdi
 	call	prepare_exit_to_usermode
 	TRACE_IRQS_IRETQ
+	SWITCH_USER_CR3
 	SWAPGS
 	jmp	restore_regs_and_iret
 
@@ -625,6 +632,7 @@ native_irq_return_ldt:
 	pushq	%rax
 	pushq	%rdi
 	SWAPGS
+	SWITCH_KERNEL_CR3
 	movq	PER_CPU_VAR(espfix_waddr), %rdi
 	movq	%rax, (0*8)(%rdi)		/* RAX */
 	movq	(2*8)(%rsp), %rax		/* RIP */
@@ -640,6 +648,7 @@ native_irq_return_ldt:
 	andl	$0xffff0000, %eax
 	popq	%rdi
 	orq	PER_CPU_VAR(espfix_stack), %rax
+	SWITCH_USER_CR3
 	SWAPGS
 	movq	%rax, %rsp
 	popq	%rax
@@ -1007,6 +1016,7 @@ ENTRY(paranoid_entry)
 	testl	%edx, %edx
 	js	1f				/* negative -> in kernel */
 	SWAPGS
+	SWITCH_KERNEL_CR3
 	xorl	%ebx, %ebx
 1:	ret
 END(paranoid_entry)
@@ -1029,6 +1039,7 @@ ENTRY(paranoid_exit)
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	paranoid_exit_no_swapgs
 	TRACE_IRQS_IRETQ
+	SWITCH_USER_CR3_NO_STACK
 	SWAPGS_UNSAFE_STACK
 	jmp	paranoid_exit_restore
 paranoid_exit_no_swapgs:
@@ -1058,6 +1069,7 @@ ENTRY(error_entry)
 	 * from user mode due to an IRET fault.
 	 */
 	SWAPGS
+	SWITCH_KERNEL_CR3
 
 .Lerror_entry_from_usermode_after_swapgs:
 	/*
@@ -1110,7 +1122,7 @@ ENTRY(error_entry)
 	 * Switch to kernel gsbase:
 	 */
 	SWAPGS
-
+	SWITCH_KERNEL_CR3
 	/*
 	 * Pretend that the exception came from user mode: set up pt_regs
 	 * as if we faulted immediately after IRET and clear EBX so that
@@ -1210,6 +1222,7 @@ ENTRY(nmi)
 	 */
 
 	SWAPGS_UNSAFE_STACK
+	SWITCH_KERNEL_CR3_NO_STACK
 	cld
 	movq	%rsp, %rdx
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1250,6 +1263,7 @@ ENTRY(nmi)
 	 * work, because we don't want to enable interrupts.  Fortunately,
 	 * do_nmi doesn't modify pt_regs.
 	 */
+	SWITCH_USER_CR3
 	SWAPGS
 	jmp	restore_c_regs_and_iret
 
@@ -1461,6 +1475,7 @@ end_repeat_nmi:
 	testl	%ebx, %ebx			/* swapgs needed? */
 	jnz	nmi_restore
 nmi_swapgs:
+	SWITCH_USER_CR3_NO_STACK
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
 	RESTORE_EXTRA_REGS
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 15cfebaa7688..fe1911930b52 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -13,6 +13,7 @@
 #include <asm/irqflags.h>
 #include <asm/asm.h>
 #include <asm/smap.h>
+#include <asm/kaiser.h>
 #include <linux/linkage.h>
 #include <linux/err.h>
 
@@ -50,6 +51,7 @@ ENDPROC(native_usergs_sysret32)
 ENTRY(entry_SYSENTER_compat)
 	/* Interrupts are off on entry. */
 	SWAPGS_UNSAFE_STACK
+	SWITCH_KERNEL_CR3_NO_STACK
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
 
 	/*
@@ -161,6 +163,7 @@ ENDPROC(entry_SYSENTER_compat)
 ENTRY(entry_SYSCALL_compat)
 	/* Interrupts are off on entry. */
 	SWAPGS_UNSAFE_STACK
+	SWITCH_KERNEL_CR3_NO_STACK
 
 	/* Stash user ESP and switch to the kernel stack. */
 	movl	%esp, %r8d
@@ -208,6 +211,7 @@ ENTRY(entry_SYSCALL_compat)
 	/* Opportunistic SYSRET */
 sysret32_from_system_call:
 	TRACE_IRQS_ON			/* User mode traces as IRQs on. */
+	SWITCH_USER_CR3
 	movq	RBX(%rsp), %rbx		/* pt_regs->rbx */
 	movq	RBP(%rsp), %rbp		/* pt_regs->rbp */
 	movq	EFLAGS(%rsp), %r11	/* pt_regs->flags (in r11) */
@@ -269,6 +273,7 @@ ENTRY(entry_INT80_compat)
 	PARAVIRT_ADJUST_EXCEPTION_FRAME
 	ASM_CLAC			/* Do this early to minimize exposure */
 	SWAPGS
+	SWITCH_KERNEL_CR3_NO_STACK
 
 	/*
 	 * User tracing code (ptrace or signal handlers) might assume that
@@ -311,6 +316,7 @@ ENTRY(entry_INT80_compat)
 
 	/* Go back to user mode. */
 	TRACE_IRQS_ON
+	SWITCH_USER_CR3
 	SWAPGS
 	jmp	restore_regs_and_iret
 END(entry_INT80_compat)
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 59caa55fb9b5..ee52ff858699 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -187,7 +187,7 @@ extern char irq_entries_start[];
 #define VECTOR_RETRIGGERED	((void *)~0UL)
 
 typedef struct irq_desc* vector_irq_t[NR_VECTORS];
-DECLARE_PER_CPU(vector_irq_t, vector_irq);
+DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq);
 
 #endif /* !ASSEMBLY_ */
 
diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h
new file mode 100644
index 000000000000..63ee8309b35b
--- /dev/null
+++ b/arch/x86/include/asm/kaiser.h
@@ -0,0 +1,113 @@
+#ifndef _ASM_X86_KAISER_H
+#define _ASM_X86_KAISER_H
+
+/* This file includes the definitions for the KAISER feature.
+ * KAISER is a counter measure against x86_64 side channel attacks on the kernel virtual memory.
+ * It has a shodow-pgd for every process. the shadow-pgd has a minimalistic kernel-set mapped,
+ * but includes the whole user memory. Within a kernel context switch, or when an interrupt is handled,
+ * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled.
+ * By this, the virtual memory chaches are freed, and the user may not attack the whole kernel memory.
+ *
+ * A minimalistic kernel mapping holds the parts needed to be mapped in user mode, as the entry/exit functions
+ * of the user space, or the stacks.
+ */
+#ifdef __ASSEMBLY__
+#ifdef CONFIG_KAISER
+
+.macro _SWITCH_TO_KERNEL_CR3 reg
+movq %cr3, \reg
+andq $(~0x1000), \reg
+movq \reg, %cr3
+.endm
+
+.macro _SWITCH_TO_USER_CR3 reg
+movq %cr3, \reg
+orq $(0x1000), \reg
+movq \reg, %cr3
+.endm
+
+.macro SWITCH_KERNEL_CR3
+pushq %rax
+_SWITCH_TO_KERNEL_CR3 %rax
+popq %rax
+.endm
+
+.macro SWITCH_USER_CR3
+pushq %rax
+_SWITCH_TO_USER_CR3 %rax
+popq %rax
+.endm
+
+.macro SWITCH_KERNEL_CR3_NO_STACK
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_KERNEL_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+.endm
+
+
+.macro SWITCH_USER_CR3_NO_STACK
+
+movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)
+_SWITCH_TO_USER_CR3 %rax
+movq PER_CPU_VAR(unsafe_stack_register_backup), %rax
+
+.endm
+
+#else /* CONFIG_KAISER */
+
+.macro SWITCH_KERNEL_CR3 reg
+.endm
+.macro SWITCH_USER_CR3 reg
+.endm
+.macro SWITCH_USER_CR3_NO_STACK
+.endm
+.macro SWITCH_KERNEL_CR3_NO_STACK
+.endm
+
+#endif /* CONFIG_KAISER */
+#else /* __ASSEMBLY__ */
+
+
+#ifdef CONFIG_KAISER
+// Upon kernel/user mode switch, it may happen that
+// the address space has to be switched before the registers have been stored.
+// To change the address space, another register is needed.
+// A register therefore has to be stored/restored.
+//
+DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+#endif /* CONFIG_KAISER */
+
+/**
+ *  shadowmem_add_mapping - map a virtual memory part to the shadow mapping
+ *  @addr: the start address of the range
+ *  @size: the size of the range
+ *  @flags: The mapping flags of the pages
+ *
+ *  the mapping is done on a global scope, so no bigger synchronization has to be done.
+ *  the pages have to be manually unmapped again when they are not needed any longer.
+ */
+extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags);
+
+
+/**
+ *  shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping
+ *  @addr: the start address of the range
+ *  @size: the size of the range
+ */
+extern void kaiser_remove_mapping(unsigned long start, unsigned long size);
+
+/**
+ *  shadowmem_initialize_mapping - Initalize the shadow mapping
+ *
+ *  most parts of the shadow mapping can be mapped upon boot time.
+ *  only the thread stacks have to be mapped on runtime.
+ *  the mapped regions are not unmapped at all.
+ */
+extern void kaiser_init(void);
+
+#endif
+
+
+
+#endif /* _ASM_X86_KAISER_H */
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index 6ec0c8b2e9df..6a843d254114 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -856,6 +856,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm,
 static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count)
 {
        memcpy(dst, src, count * sizeof(pgd_t));
+#ifdef CONFIG_KAISER
+	// clone the shadow pgd part as well
+	memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t));
+#endif
 }
 
 #define PTE_SHIFT ilog2(PTRS_PER_PTE)
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 2ee781114d34..2131edda6620 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -106,9 +106,30 @@ static inline void native_pud_clear(pud_t *pud)
 	native_set_pud(pud, native_make_pud(0));
 }
 
+#ifdef CONFIG_KAISER
+static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) {
+	return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE);
+}
+
+static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) {
+	return (pgd_t *)(void*)((unsigned long)(void*)pgdp &  ~(unsigned long)PAGE_SIZE);
+}
+#endif /* CONFIG_KAISER */
+
 static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd)
 {
+#ifdef CONFIG_KAISER
+	// We know that a pgd is page aligned.
+	// Therefore the lower indices have to be mapped to user space.
+	// These pages are mapped to the shadow mapping.
+	if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) {
+		native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
+	}
+
+	pgdp->pgd = pgd.pgd & ~_PAGE_USER;
+#else /* CONFIG_KAISER */
 	*pgdp = pgd;
+#endif
 }
 
 static inline void native_pgd_clear(pgd_t *pgd)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 79c91853e50e..984670491901 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -39,7 +39,11 @@
 #define _PAGE_ACCESSED	(_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED)
 #define _PAGE_DIRTY	(_AT(pteval_t, 1) << _PAGE_BIT_DIRTY)
 #define _PAGE_PSE	(_AT(pteval_t, 1) << _PAGE_BIT_PSE)
-#define _PAGE_GLOBAL	(_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#ifdef CONFIG_KAISER
+#define _PAGE_GLOBAL	(_AT(pteval_t, 0))
+#else
+#define _PAGE_GLOBAL  (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
+#endif
 #define _PAGE_SOFTW1	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
 #define _PAGE_SOFTW2	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
 #define _PAGE_PAT	(_AT(pteval_t, 1) << _PAGE_BIT_PAT)
@@ -89,7 +93,11 @@
 #define _PAGE_NX	(_AT(pteval_t, 0))
 #endif
 
-#define _PAGE_PROTNONE	(_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#ifdef CONFIG_KAISER
+#define _PAGE_PROTNONE	(_AT(pteval_t, 0))
+#else
+#define _PAGE_PROTNONE  (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
+#endif
 
 #define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |	\
 			 _PAGE_ACCESSED | _PAGE_DIRTY)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 2d5a50cb61a2..6a2e0a0b4a96 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -305,7 +305,7 @@ struct tss_struct {
 
 } ____cacheline_aligned;
 
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss);
 
 #ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
@@ -332,6 +332,11 @@ union irq_stack_union {
 		char gs_base[40];
 		unsigned long stack_canary;
 	};
+
+	struct {
+		char irq_stack_pointer[64];
+		char unused[IRQ_STACK_SIZE - 64];
+	};
 };
 
 DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible;
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index aa1e7246b06b..e5ba9701ee6c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -92,7 +92,7 @@ static const struct cpu_dev default_cpu = {
 
 static const struct cpu_dev *this_cpu = &default_cpu;
 
-DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = {
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = {
 #ifdef CONFIG_X86_64
 	/*
 	 * We need valid kernel segments for data and code in long mode too
@@ -1229,7 +1229,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = {
 	  [DEBUG_STACK - 1]			= DEBUG_STKSZ
 };
 
-static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks
+DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks
 	[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]);
 
 /* May not be marked __init: used by software suspend */
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 4d38416e2a7f..bd1358d55146 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -41,6 +41,7 @@
 #include <asm/pgalloc.h>
 #include <asm/setup.h>
 #include <asm/espfix.h>
+#include <asm/kaiser.h>
 
 /*
  * Note: we only need 6*8 = 48 bytes for the espfix stack, but round
@@ -126,6 +127,11 @@ void __init init_espfix_bsp(void)
 	/* Install the espfix pud into the kernel page directory */
 	pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)];
 	pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page);
+#ifdef CONFIG_KAISER
+	// add the esp stack pud to the shadow mapping here.
+	// This can be done directly, because the fixup stack has its own pud
+	set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page)));
+#endif
 
 	/* Randomize the locations */
 	init_espfix_random();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index ffdc0e860390..0a8d18ac63eb 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -441,6 +441,14 @@ early_idt_ripmsg:
 	.balign	PAGE_SIZE; \
 GLOBAL(name)
 
+#ifdef CONFIG_KAISER
+#define NEXT_PGD_PAGE(name) \
+	.balign 2 * PAGE_SIZE; \
+GLOBAL(name)
+#else
+#define NEXT_PGD_PAGE(name) NEXT_PAGE(name)
+#endif
+
 /* Automate the creation of 1 to 1 mapping pmd entries */
 #define PMDS(START, PERM, COUNT)			\
 	i = 0 ;						\
@@ -450,7 +458,7 @@ GLOBAL(name)
 	.endr
 
 	__INITDATA
-NEXT_PAGE(early_level4_pgt)
+NEXT_PGD_PAGE(early_level4_pgt)
 	.fill	511,8,0
 	.quad	level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE
 
@@ -460,10 +468,10 @@ NEXT_PAGE(early_dynamic_pgts)
 	.data
 
 #ifndef CONFIG_XEN
-NEXT_PAGE(init_level4_pgt)
-	.fill	512,8,0
+NEXT_PGD_PAGE(init_level4_pgt)
+	.fill	2*512,8,0
 #else
-NEXT_PAGE(init_level4_pgt)
+NEXT_PGD_PAGE(init_level4_pgt)
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
 	.org    init_level4_pgt + L4_PAGE_OFFSET*8, 0
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 1423ab1b0312..f480b38a03c3 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -51,7 +51,7 @@ static struct irqaction irq2 = {
 	.flags = IRQF_NO_THREAD,
 };
 
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = {
 	[0 ... NR_VECTORS - 1] = VECTOR_UNUSED,
 };
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 9f7c21c22477..7c5c5dc90ffa 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -39,7 +39,7 @@
  * section. Since TSS's are completely CPU-local, we want them
  * on exact cacheline boundaries, to eliminate cacheline ping-pong.
  */
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = {
 	.x86_tss = {
 		.sp0 = TOP_OF_INIT_STACK,
 #ifdef CONFIG_X86_32
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 1ae7c141f778..978156081bed 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -32,3 +32,4 @@ obj-$(CONFIG_ACPI_NUMA)		+= srat.o
 obj-$(CONFIG_NUMA_EMU)		+= numa_emulation.o
 
 obj-$(CONFIG_X86_INTEL_MPX)	+= mpx.o
+obj-$(CONFIG_KAISER)		+= kaiser.o
diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c
new file mode 100644
index 000000000000..cf1bb922d467
--- /dev/null
+++ b/arch/x86/mm/kaiser.c
@@ -0,0 +1,160 @@
+
+
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/bug.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+
+#include <linux/uaccess.h>
+#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
+#include <asm/desc.h>
+#ifdef CONFIG_KAISER
+
+__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup);
+
+/**
+ * Get the real ppn from a address in kernel mapping.
+ * @param address The virtual adrress
+ * @return the physical address
+ */
+static inline unsigned long get_pa_from_mapping (unsigned long address)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+
+	pgd = pgd_offset_k(address);
+	BUG_ON(pgd_none(*pgd) || pgd_large(*pgd));
+
+	pud = pud_offset(pgd, address);
+	BUG_ON(pud_none(*pud));
+
+	if (pud_large(*pud)) {
+		return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK);
+	}
+
+	pmd = pmd_offset(pud, address);
+	BUG_ON(pmd_none(*pmd));
+
+	if (pmd_large(*pmd)) {
+		return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK);
+	}
+
+	pte = pte_offset_kernel(pmd, address);
+	BUG_ON(pte_none(*pte));
+
+	return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK);
+}
+
+void _kaiser_copy (unsigned long start_addr, unsigned long size,
+					unsigned long flags)
+{
+	pgd_t *pgd;
+	pud_t *pud;
+	pmd_t *pmd;
+	pte_t *pte;
+	unsigned long address;
+	unsigned long end_addr = start_addr + size;
+	unsigned long target_address;
+
+	for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1));
+			address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) {
+		target_address = get_pa_from_mapping(address);
+
+		pgd = native_get_shadow_pgd(pgd_offset_k(address));
+
+		BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n");
+		BUG_ON(pgd_large(*pgd));
+
+		pud = pud_offset(pgd, address);
+		if (pud_none(*pud)) {
+			set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address))));
+		}
+		BUG_ON(pud_large(*pud));
+
+		pmd = pmd_offset(pud, address);
+		if (pmd_none(*pmd)) {
+			set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address))));
+		}
+		BUG_ON(pmd_large(*pmd));
+
+		pte = pte_offset_kernel(pmd, address);
+		if (pte_none(*pte)) {
+			set_pte(pte, __pte(flags | target_address));
+		} else {
+			BUG_ON(__pa(pte_page(*pte)) != target_address);
+		}
+	}
+}
+
+// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping
+static inline void __init _kaiser_init(void)
+{
+	pgd_t *pgd;
+	int i = 0;
+
+	pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0));
+	for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) {
+		set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0))));
+	}
+}
+
+extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[];
+spinlock_t shadow_table_lock;
+void __init kaiser_init(void)
+{
+	int cpu;
+	spin_lock_init(&shadow_table_lock);
+
+	spin_lock(&shadow_table_lock);
+
+	_kaiser_init();
+
+	for_each_possible_cpu(cpu) {
+		// map the per cpu user variables
+		_kaiser_copy(
+				(unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)),
+				(unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start,
+				__PAGE_KERNEL);
+	}
+
+	// map the entry/exit text section, which is responsible to switch between user- and kernel mode
+	_kaiser_copy(
+			(unsigned long) __entry_text_start,
+			(unsigned long) __entry_text_end - (unsigned long) __entry_text_start,
+			__PAGE_KERNEL_RX);
+
+	// the fixed map address of the idt_table
+	_kaiser_copy(
+			(unsigned long) idt_descr.address,
+			sizeof(gate_desc) * NR_VECTORS,
+			__PAGE_KERNEL_RO);
+
+	spin_unlock(&shadow_table_lock);
+}
+
+// add a mapping to the shadow-mapping, and synchronize the mappings
+void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags)
+{
+	spin_lock(&shadow_table_lock);
+	_kaiser_copy(addr, size, flags);
+	spin_unlock(&shadow_table_lock);
+}
+
+extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end);
+void kaiser_remove_mapping(unsigned long start, unsigned long size)
+{
+	pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start));
+	spin_lock(&shadow_table_lock);
+	do {
+		unmap_pud_range(pgd, start, start + size);
+	} while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size)));
+	spin_unlock(&shadow_table_lock);
+}
+#endif /* CONFIG_KAISER */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index b599a780a5a9..4fe616bd2586 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -829,7 +829,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end)
 			pud_clear(pud);
 }
 
-static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
+void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end)
 {
 	pud_t *pud = pud_offset(pgd, start);
 
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index fb0a9dd1d6e4..087d3e1e7125 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -342,12 +342,38 @@ static inline void _pgd_free(pgd_t *pgd)
 #else
 static inline pgd_t *_pgd_alloc(void)
 {
+#ifdef CONFIG_KAISER
+	// Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory
+	// block. Therefore, we have to allocate at least 3 pages. However, the
+	// __get_free_pages returns us 4 pages. Hence, we store the base pointer at
+	// the beginning of the page of our 8kb-aligned memory block in order to
+	// correctly free it afterwars.
+
+	unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE));
+
+	if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages)
+	{
+		*((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages;
+		return (pgd_t *) pages;
+	}
+	else
+	{
+		*((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages;
+		return (pgd_t *) (pages + PAGE_SIZE);
+	}
+#else
 	return (pgd_t *)__get_free_page(PGALLOC_GFP);
+#endif
 }
 
 static inline void _pgd_free(pgd_t *pgd)
 {
+#ifdef CONFIG_KAISER
+  unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE));
+	free_pages(pages, get_order(4*PAGE_SIZE));
+#else
 	free_page((unsigned long)pgd);
+#endif
 }
 #endif /* CONFIG_X86_PAE */
author	Richard Fellner <richard.fellner@student.tugraz.at>	2017-05-04 14:26:50 +0200
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>	2018-01-05 15:44:23 +0100
commit	8a43ddfb93a0c6ae1a6e1f5c25705ec5d1843c40 (patch)
tree	3eced34a05bd8124e0a03d63d982e7cdf01d013e /arch
parent	0fa147b407478e73fe7a478677ff2b12bb824014 (diff)