From 9e1485b1b570dd2827ecc51a847f7b27b2f69e70 Mon Sep 17 00:00:00 2001 From: Steve Capper Date: Mon, 4 Dec 2017 14:13:05 +0000 Subject: arm64: Initialise high_memory global variable earlier commit f24e5834a2c3f6c5f814a417f858226f0a010ade upstream. The high_memory global variable is used by cma_declare_contiguous(.) before it is defined. We don't notice this as we compute __pa(high_memory - 1), and it looks like we're processing a VA from the direct linear map. This problem becomes apparent when we flip the kernel virtual address space and the linear map is moved to the bottom of the kernel VA space. This patch moves the initialisation of high_memory before it used. Fixes: f7426b983a6a ("mm: cma: adjust address limit to avoid hitting low/high memory boundary") Signed-off-by: Steve Capper Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 4cb98aa8c27b..efd89ce4533d 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -178,6 +178,7 @@ void __init arm64_memblock_init(void) arm64_dma_phys_limit = max_zone_dma_phys(); else arm64_dma_phys_limit = PHYS_MASK + 1; + high_memory = __va(memblock_end_of_DRAM() - 1) + 1; dma_contiguous_reserve(arm64_dma_phys_limit); memblock_allow_resize(); @@ -202,7 +203,6 @@ void __init bootmem_init(void) sparse_init(); zone_sizes_init(min, max); - high_memory = __va((max << PAGE_SHIFT) - 1) + 1; max_pfn = max_low_pfn = max; } -- cgit v1.2.3 From becf292446e9f2dc8842c448836bbe8005e24db0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 29 Jan 2016 11:42:57 -0800 Subject: x86/mm: Add INVPCID helpers commit 060a402a1ddb551455ee410de2eadd3349f2801b upstream. This adds helpers for each of the four currently-specified INVPCID modes. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/8a62b23ad686888cee01da134c91409e22064db9.1454096309.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 48 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6433e28dc9c8..5f494d8bd158 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -7,6 +7,54 @@ #include #include +static inline void __invpcid(unsigned long pcid, unsigned long addr, + unsigned long type) +{ + u64 desc[2] = { pcid, addr }; + + /* + * The memory clobber is because the whole point is to invalidate + * stale TLB entries and, especially if we're flushing global + * mappings, we don't want the compiler to reorder any subsequent + * memory accesses before the TLB flush. + * + * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and + * invpcid (%rcx), %rax in long mode. 
+ */ + asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" + : : "m" (desc), "a" (type), "c" (desc) : "memory"); +} + +#define INVPCID_TYPE_INDIV_ADDR 0 +#define INVPCID_TYPE_SINGLE_CTXT 1 +#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 +#define INVPCID_TYPE_ALL_NON_GLOBAL 3 + +/* Flush all mappings for a given pcid and addr, not including globals. */ +static inline void invpcid_flush_one(unsigned long pcid, + unsigned long addr) +{ + __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); +} + +/* Flush all mappings for a given PCID, not including globals. */ +static inline void invpcid_flush_single_context(unsigned long pcid) +{ + __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); +} + +/* Flush all mappings, including globals, for all PCIDs. */ +static inline void invpcid_flush_all(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); +} + +/* Flush all mappings for all PCIDs except globals. */ +static inline void invpcid_flush_all_nonglobals(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); +} + #ifdef CONFIG_PARAVIRT #include #else -- cgit v1.2.3 From 04ec428b15f161ce8449756fb64b6f380c8d95fd Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Feb 2016 15:51:16 +0100 Subject: x86/mm: Fix INVPCID asm constraint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e2c7698cd61f11d4077fdb28148b2d31b82ac848 upstream. So we want to specify the dependency on both @pcid and @addr so that the compiler doesn't reorder accesses to them *before* the TLB flush. But for that to work, we need to express this properly in the inline asm and deref the whole desc array, not the pointer to it. See clwb() for an example. This fixes the build error on 32-bit: arch/x86/include/asm/tlbflush.h: In function ‘__invpcid’: arch/x86/include/asm/tlbflush.h:26:18: error: memory input 0 is not directly addressable which gcc4.7 caught but 5.x didn't. Which is strange. :-\ Signed-off-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Michael Matz Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 5f494d8bd158..bd0ee0639bf0 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -10,7 +10,7 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, unsigned long type) { - u64 desc[2] = { pcid, addr }; + struct { u64 d[2]; } desc = { { pcid, addr } }; /* * The memory clobber is because the whole point is to invalidate @@ -22,7 +22,7 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, * invpcid (%rcx), %rax in long mode. */ asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" - : : "m" (desc), "a" (type), "c" (desc) : "memory"); + : : "m" (desc), "a" (type), "c" (&desc) : "memory"); } #define INVPCID_TYPE_INDIV_ADDR 0 -- cgit v1.2.3 From 791a0f3fecdabe18cc291e5f9b7ebbdc81895975 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 29 Jan 2016 11:42:58 -0800 Subject: x86/mm: Add a 'noinvpcid' boot option to turn off INVPCID commit d12a72b844a49d4162f24cefdab30bed3f86730e upstream. This adds a chicken bit to turn off INVPCID in case something goes wrong. 
It's an early_param() because we do TLB flushes before we parse __setup() parameters. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/f586317ed1bc2b87aee652267e515b90051af385.1454096309.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/common.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 637ca414d431..c84b62956e8d 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -162,6 +162,22 @@ static int __init x86_mpx_setup(char *s) } __setup("nompx", x86_mpx_setup); +static int __init x86_noinvpcid_setup(char *s) +{ + /* noinvpcid doesn't accept parameters */ + if (s) + return -EINVAL; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_INVPCID)) + return 0; + + setup_clear_cpu_cap(X86_FEATURE_INVPCID); + pr_info("noinvpcid: INVPCID feature disabled\n"); + return 0; +} +early_param("noinvpcid", x86_noinvpcid_setup); + #ifdef CONFIG_X86_32 static int cachesize_override = -1; static int disable_x86_serial_nr = 1; -- cgit v1.2.3 From 85d3700c744a11ee2989252acf50ccbbd814167a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 29 Jan 2016 11:42:59 -0800 Subject: x86/mm: If INVPCID is available, use it to flush global mappings commit d8bced79af1db6734f66b42064cc773cada2ce99 upstream. On my Skylake laptop, INVPCID function 2 (flush absolutely everything) takes about 376ns, whereas saving flags, twiddling CR4.PGE to flush global mappings, and restoring flags takes about 539ns. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/ed0ef62581c0ea9c99b9bf6df726015e96d44743.1454096309.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index bd0ee0639bf0..6cf0893e9b70 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -159,6 +159,15 @@ static inline void __native_flush_tlb_global(void) { unsigned long flags; + if (static_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Using INVPCID is considerably faster than a pair of writes + * to CR4 sandwiched inside an IRQ flag save/restore. + */ + invpcid_flush_all(); + return; + } + /* * Read-modify-write to CR4 - protect it from preemption and * from interrupts. (Use the raw variant because this code can -- cgit v1.2.3 From 8d5ee51a6bce71d8905d8f01d0931f69be4489d5 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Fri, 1 Apr 2016 14:31:26 -0700 Subject: mm/rmap: batched invalidations should use existing api commit 858eaaa711700ce4595e039441e239e56d7b9514 upstream. The recently introduced batched invalidations mechanism uses its own mechanism for shootdown. 
However, it does wrong accounting of interrupts (e.g., inc_irq_stat is called for local invalidations), trace-points (e.g., TLB_REMOTE_SHOOTDOWN for local invalidations) and may break some platforms as it bypasses the invalidation mechanisms of Xen and SGI UV. This patch reuses the existing TLB flushing mechnaisms instead. We use NULL as mm to indicate a global invalidation is required. Fixes 72b252aed506b8 ("mm: send one IPI per CPU to TLB flush all entries after unmapping pages") Signed-off-by: Nadav Amit Cc: Mel Gorman Cc: Rik van Riel Cc: Dave Hansen Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 6 ------ arch/x86/mm/tlb.c | 2 +- 2 files changed, 1 insertion(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6cf0893e9b70..4dc534175b5e 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -325,12 +325,6 @@ static inline void reset_lazy_tlbstate(void) #endif /* SMP */ -/* Not inlined due to inc_irq_stat not being defined yet */ -#define flush_tlb_local() { \ - inc_irq_stat(irq_tlb_count); \ - local_flush_tlb(); \ -} - #ifndef CONFIG_PARAVIRT #define flush_tlb_others(mask, mm, start, end) \ native_flush_tlb_others(mask, mm, start, end) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5a760fd66bec..9ef66949a57b 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -104,7 +104,7 @@ static void flush_tlb_func(void *info) inc_irq_stat(irq_tlb_count); - if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) + if (f->flush_mm && f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); -- cgit v1.2.3 From 83cc4b50e3a977915666ade0b951ba446e7181bd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 26 Apr 2016 09:39:07 -0700 Subject: x86/mm: Build arch/x86/mm/tlb.c even on !SMP commit e1074888c326038340a1ada9129d679e661f2ea6 upstream. Currently all of the functions that live in tlb.c are inlined on !SMP builds. One can debate whether this is a good idea (in many respects the code in tlb.c is better than the inlined UP code). Regardless, I want to add code that needs to be built on UP and SMP kernels and relates to tlb flushing, so arrange for tlb.c to be compiled unconditionally. 
Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f0d778f0d828fc46e5d1946bca80f0aaf9abf032.1461688545.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/Makefile | 3 +-- arch/x86/mm/tlb.c | 4 ++++ 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 65c47fda26fc..1ae7c141f778 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -1,5 +1,5 @@ obj-y := init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \ - pat.o pgtable.o physaddr.o gup.o setup_nx.o + pat.o pgtable.o physaddr.o gup.o setup_nx.o tlb.o # Make sure __phys_addr has no stackprotector nostackp := $(call cc-option, -fno-stack-protector) @@ -9,7 +9,6 @@ CFLAGS_setup_nx.o := $(nostackp) CFLAGS_fault.o := -I$(src)/../include/asm/trace obj-$(CONFIG_X86_PAT) += pat_rbtree.o -obj-$(CONFIG_SMP) += tlb.o obj-$(CONFIG_X86_32) += pgtable_32.o iomap_32.o diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 9ef66949a57b..5ef19a413820 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -28,6 +28,8 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ +#ifdef CONFIG_SMP + struct flush_tlb_info { struct mm_struct *flush_mm; unsigned long flush_start; @@ -351,3 +353,5 @@ static int __init create_tlb_single_page_flush_ceiling(void) return 0; } late_initcall(create_tlb_single_page_flush_ceiling); + +#endif /* CONFIG_SMP */ -- cgit v1.2.3 From 70a39c7fd167399fde76aeac314dce026a255b49 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 26 Apr 2016 09:39:08 -0700 Subject: x86/mm, sched/core: Uninline switch_mm() commit 69c0319aabba45bcf33178916a2f06967b4adede upstream. It's fairly large and it has quite a few callers. This may also help untangle some headers down the road. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/54f3367803e7f80b2be62c8a21879aa74b1a5f57.1461688545.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mmu_context.h | 98 +---------------------------------- arch/x86/mm/tlb.c | 102 +++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 96 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index bfd9b2a35a0b..96e233b3597a 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -104,103 +104,9 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) #endif } -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) -{ - unsigned cpu = smp_processor_id(); - - if (likely(prev != next)) { -#ifdef CONFIG_SMP - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - this_cpu_write(cpu_tlbstate.active_mm, next); -#endif - cpumask_set_cpu(cpu, mm_cpumask(next)); - - /* - * Re-load page tables. - * - * This logic has an ordering constraint: - * - * CPU 0: Write to a PTE for 'next' - * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. - * CPU 1: set bit 1 in next's mm_cpumask - * CPU 1: load from the PTE that CPU 0 writes (implicit) - * - * We need to prevent an outcome in which CPU 1 observes - * the new PTE value and CPU 0 observes bit 1 clear in - * mm_cpumask. 
(If that occurs, then the IPI will never - * be sent, and CPU 0's TLB will contain a stale entry.) - * - * The bad outcome can occur if either CPU's load is - * reordered before that CPU's store, so both CPUs must - * execute full barriers to prevent this from happening. - * - * Thus, switch_mm needs a full barrier between the - * store to mm_cpumask and any operation that could load - * from next->pgd. TLB fills are special and can happen - * due to instruction fetches or for no reason at all, - * and neither LOCK nor MFENCE orders them. - * Fortunately, load_cr3() is serializing and gives the - * ordering guarantee we need. - * - */ - load_cr3(next->pgd); - - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - - /* Stop flush ipis for the previous mm */ - cpumask_clear_cpu(cpu, mm_cpumask(prev)); - - /* Load per-mm CR4 state */ - load_mm_cr4(next); +extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk); -#ifdef CONFIG_MODIFY_LDT_SYSCALL - /* - * Load the LDT, if the LDT is different. - * - * It's possible that prev->context.ldt doesn't match - * the LDT register. This can happen if leave_mm(prev) - * was called and then modify_ldt changed - * prev->context.ldt but suppressed an IPI to this CPU. - * In this case, prev->context.ldt != NULL, because we - * never set context.ldt to NULL while the mm still - * exists. That means that next->context.ldt != - * prev->context.ldt, because mms never share an LDT. - */ - if (unlikely(prev->context.ldt != next->context.ldt)) - load_mm_ldt(next); -#endif - } -#ifdef CONFIG_SMP - else { - this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); - BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); - - if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { - /* - * On established mms, the mm_cpumask is only changed - * from irq context, from ptep_clear_flush() while in - * lazy tlb mode, and here. Irqs are blocked during - * schedule, protecting us from simultaneous changes. - */ - cpumask_set_cpu(cpu, mm_cpumask(next)); - - /* - * We were in lazy tlb mode and leave_mm disabled - * tlb flush IPI delivery. We must reload CR3 - * to make sure to use no freed page tables. - * - * As above, load_cr3() is serializing and orders TLB - * fills with respect to the mm_cpumask write. - */ - load_cr3(next->pgd); - trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); - load_mm_cr4(next); - load_mm_ldt(next); - } - } -#endif -} #define activate_mm(prev, next) \ do { \ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5ef19a413820..f9a30d79396c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -59,6 +59,108 @@ void leave_mm(int cpu) } EXPORT_SYMBOL_GPL(leave_mm); +#endif /* CONFIG_SMP */ + +void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned cpu = smp_processor_id(); + + if (likely(prev != next)) { +#ifdef CONFIG_SMP + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + this_cpu_write(cpu_tlbstate.active_mm, next); +#endif + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* + * Re-load page tables. + * + * This logic has an ordering constraint: + * + * CPU 0: Write to a PTE for 'next' + * CPU 0: load bit 1 in mm_cpumask. if nonzero, send IPI. + * CPU 1: set bit 1 in next's mm_cpumask + * CPU 1: load from the PTE that CPU 0 writes (implicit) + * + * We need to prevent an outcome in which CPU 1 observes + * the new PTE value and CPU 0 observes bit 1 clear in + * mm_cpumask. 
(If that occurs, then the IPI will never + * be sent, and CPU 0's TLB will contain a stale entry.) + * + * The bad outcome can occur if either CPU's load is + * reordered before that CPU's store, so both CPUs must + * execute full barriers to prevent this from happening. + * + * Thus, switch_mm needs a full barrier between the + * store to mm_cpumask and any operation that could load + * from next->pgd. TLB fills are special and can happen + * due to instruction fetches or for no reason at all, + * and neither LOCK nor MFENCE orders them. + * Fortunately, load_cr3() is serializing and gives the + * ordering guarantee we need. + * + */ + load_cr3(next->pgd); + + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + + /* Stop flush ipis for the previous mm */ + cpumask_clear_cpu(cpu, mm_cpumask(prev)); + + /* Load per-mm CR4 state */ + load_mm_cr4(next); + +#ifdef CONFIG_MODIFY_LDT_SYSCALL + /* + * Load the LDT, if the LDT is different. + * + * It's possible that prev->context.ldt doesn't match + * the LDT register. This can happen if leave_mm(prev) + * was called and then modify_ldt changed + * prev->context.ldt but suppressed an IPI to this CPU. + * In this case, prev->context.ldt != NULL, because we + * never set context.ldt to NULL while the mm still + * exists. That means that next->context.ldt != + * prev->context.ldt, because mms never share an LDT. + */ + if (unlikely(prev->context.ldt != next->context.ldt)) + load_mm_ldt(next); +#endif + } +#ifdef CONFIG_SMP + else { + this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); + BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); + + if (!cpumask_test_cpu(cpu, mm_cpumask(next))) { + /* + * On established mms, the mm_cpumask is only changed + * from irq context, from ptep_clear_flush() while in + * lazy tlb mode, and here. Irqs are blocked during + * schedule, protecting us from simultaneous changes. + */ + cpumask_set_cpu(cpu, mm_cpumask(next)); + + /* + * We were in lazy tlb mode and leave_mm disabled + * tlb flush IPI delivery. We must reload CR3 + * to make sure to use no freed page tables. + * + * As above, load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask write. + */ + load_cr3(next->pgd); + trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); + load_mm_cr4(next); + load_mm_ldt(next); + } + } +#endif +} + +#ifdef CONFIG_SMP + /* * The flush IPI assumes that a thread switch happens in this order: * [cpu0: the cpu that switches] -- cgit v1.2.3 From 4ead44fd2525ed97e5362a806d312a0e3b0ea445 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 26 Apr 2016 09:39:09 -0700 Subject: x86/mm, sched/core: Turn off IRQs in switch_mm() commit 078194f8e9fe3cf54c8fd8bded48a1db5bd8eb8a upstream. Potential races between switch_mm() and TLB-flush or LDT-flush IPIs could be very messy. AFAICT the code is currently okay, whether by accident or by careful design, but enabling PCID will make it considerably more complicated and will no longer be obviously safe. Fix it with a big hammer: run switch_mm() with IRQs off. To avoid a performance hit in the scheduler, we take advantage of our knowledge that the scheduler already has IRQs disabled when it calls switch_mm(). 
Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f19baf759693c9dcae64bbff76189db77cb13398.1461688545.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/mmu_context.h | 3 +++ arch/x86/mm/tlb.c | 10 ++++++++++ 2 files changed, 13 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 96e233b3597a..44fc93987869 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -107,6 +107,9 @@ static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk); +extern void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk); +#define switch_mm_irqs_off switch_mm_irqs_off #define activate_mm(prev, next) \ do { \ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index f9a30d79396c..45ba87466e6a 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -63,6 +63,16 @@ EXPORT_SYMBOL_GPL(leave_mm); void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + switch_mm_irqs_off(prev, next, tsk); + local_irq_restore(flags); +} + +void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) { unsigned cpu = smp_processor_id(); -- cgit v1.2.3 From c22d4b4d1c7fcc0d9eb4d8618d86c554c48ed9c0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 13 May 2016 15:30:13 +0200 Subject: ARM: Hide finish_arch_post_lock_switch() from modules commit ef0491ea17f8019821c7e9c8e801184ecf17f85a upstream. The introduction of switch_mm_irqs_off() brought back an old bug regarding the use of preempt_enable_no_resched: As part of: 62b94a08da1b ("sched/preempt: Take away preempt_enable_no_resched() from modules") the definition of preempt_enable_no_resched() is only available in built-in code, not in loadable modules, so we can't generally use it from header files. However, the ARM version of finish_arch_post_lock_switch() calls preempt_enable_no_resched() and is defined as a static inline function in asm/mmu_context.h. This in turn means we cannot include asm/mmu_context.h from modules. With today's tip tree, asm/mmu_context.h gets included from linux/mmu_context.h, which is normally the exact pattern one would expect, but unfortunately, linux/mmu_context.h can be included from the vhost driver that is a loadable module, now causing this compile time error with modular configs: In file included from ../include/linux/mmu_context.h:4:0, from ../drivers/vhost/vhost.c:18: ../arch/arm/include/asm/mmu_context.h: In function 'finish_arch_post_lock_switch': ../arch/arm/include/asm/mmu_context.h:88:3: error: implicit declaration of function 'preempt_enable_no_resched' [-Werror=implicit-function-declaration] preempt_enable_no_resched(); Andy already tried to fix the bug by including linux/preempt.h from asm/mmu_context.h, but that didn't help. Arnd suggested reordering the header files, which wasn't popular, so let's use this workaround instead: The finish_arch_post_lock_switch() definition is now also hidden inside of #ifdef MODULE, so we don't see anything referencing preempt_enable_no_resched() from a header file. 
I've built a few hundred randconfig kernels with this, and did not see any new problems. Tested-by: Guenter Roeck Signed-off-by: Steven Rostedt Signed-off-by: Arnd Bergmann Acked-by: Russell King Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: Jiri Olsa Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Russell King - ARM Linux Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: linux-arm-kernel@lists.infradead.org Fixes: f98db6013c55 ("sched/core: Add switch_mm_irqs_off() and use it in the scheduler") Link: http://lkml.kernel.org/r/1463146234-161304-1-git-send-email-arnd@arndb.de Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/arm/include/asm/mmu_context.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h index 9b32f76bb0dd..10f662498eb7 100644 --- a/arch/arm/include/asm/mmu_context.h +++ b/arch/arm/include/asm/mmu_context.h @@ -61,6 +61,7 @@ static inline void check_and_switch_context(struct mm_struct *mm, cpu_switch_mm(mm->pgd, mm); } +#ifndef MODULE #define finish_arch_post_lock_switch \ finish_arch_post_lock_switch static inline void finish_arch_post_lock_switch(void) @@ -82,6 +83,7 @@ static inline void finish_arch_post_lock_switch(void) preempt_enable_no_resched(); } } +#endif /* !MODULE */ #endif /* CONFIG_MMU */ -- cgit v1.2.3 From 3b9d9ec0d8261bb9b12f858e66f0c84cd2a6a3bb Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Thu, 11 Aug 2016 15:44:30 +0800 Subject: x86/irq: Do not substract irq_tlb_count from irq_call_count commit 82ba4faca1bffad429f15c90c980ffd010366c25 upstream. Since commit: 52aec3308db8 ("x86/tlb: replace INVALIDATE_TLB_VECTOR by CALL_FUNCTION_VECTOR") the TLB remote shootdown is done through call function vector. That commit didn't take care of irq_tlb_count, which a later commit: fd0f5869724f ("x86: Distinguish TLB shootdown interrupts from other functions call interrupts") ... tried to fix. The fix assumes every increase of irq_tlb_count has a corresponding increase of irq_call_count. So the irq_call_count is always bigger than irq_tlb_count and we could substract irq_tlb_count from irq_call_count. Unfortunately this is not true for the smp_call_function_single() case. The IPI is only sent if the target CPU's call_single_queue is empty when adding a csd into it in generic_exec_single. That means if two threads are both adding flush tlb csds to the same CPU's call_single_queue, only one IPI is sent. In other words, the irq_call_count is incremented by 1 but irq_tlb_count is incremented by 2. Over time, irq_tlb_count will be bigger than irq_call_count and the substract will produce a very large irq_call_count value due to overflow. Considering that: 1) it's not worth to send more IPIs for the sake of accurate counting of irq_call_count in generic_exec_single(); 2) it's not easy to tell if the call function interrupt is for TLB shootdown in __smp_call_function_single_interrupt(). Not to exclude TLB shootdown from call function count seems to be the simplest fix and this patch just does that. This bug was found by LKP's cyclic performance regression tracking recently with the vm-scalability test suite. I have bisected to commit: 3dec0ba0be6a ("mm/rmap: share the i_mmap_rwsem") This commit didn't do anything wrong but revealed the irq_call_count problem. 
IIUC, the commit makes rwc->remap_one in rmap_walk_file concurrent with multiple threads. When remap_one is try_to_unmap_one(), then multiple threads could queue flush TLB to the same CPU but only one IPI will be sent. Since the commit was added in Linux v3.19, the counting problem only shows up from v3.19 onwards. Signed-off-by: Aaron Lu Cc: Alex Shi Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Davidlohr Bueso Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Ying Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tomoki Sekiyama Link: http://lkml.kernel.org/r/20160811074430.GA18163@aaronlu.sh.intel.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/hardirq.h | 4 ---- arch/x86/kernel/irq.c | 3 +-- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 7178043b0e1d..59405a248fc2 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -22,10 +22,6 @@ typedef struct { #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; - /* - * irq_tlb_count is double-counted in irq_call_count, so it must be - * subtracted from irq_call_count when displaying irq_call_count - */ unsigned int irq_tlb_count; #endif #ifdef CONFIG_X86_THERMAL_VECTOR diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 61521dc19c10..9f669fdd2010 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -102,8 +102,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_puts(p, " Rescheduling interrupts\n"); seq_printf(p, "%*s: ", prec, "CAL"); for_each_online_cpu(j) - seq_printf(p, "%10u ", irq_stats(j)->irq_call_count - - irq_stats(j)->irq_tlb_count); + seq_printf(p, "%10u ", irq_stats(j)->irq_call_count); seq_puts(p, " Function call interrupts\n"); seq_printf(p, "%*s: ", prec, "TLB"); for_each_online_cpu(j) -- cgit v1.2.3 From 3f7855a522228787e2f73c279d7c0a3a6d3a2323 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 14 Feb 2017 00:05:59 +0900 Subject: arm: kprobes: Fix the return address of multiple kretprobes [ Upstream commit 06553175f585b52509c7df37d6f4a50aacb7b211 ] This is arm port of commit 737480a0d525 ("kprobes/x86: Fix the return address of multiple kretprobes"). Fix the return address of subsequent kretprobes when multiple kretprobes are set on the same function. 
For example: # cd /sys/kernel/debug/tracing # echo "r:event1 sys_symlink" > kprobe_events # echo "r:event2 sys_symlink" >> kprobe_events # echo 1 > events/kprobes/enable # ln -s /tmp/foo /tmp/bar (without this patch) # cat trace | grep -v ^# ln-82 [000] dn.2 68.446525: event1: (kretprobe_trampoline+0x0/0x18 <- SyS_symlink) ln-82 [000] dn.2 68.447831: event2: (ret_fast_syscall+0x0/0x1c <- SyS_symlink) (with this patch) # cat trace | grep -v ^# ln-81 [000] dn.1 39.463469: event1: (ret_fast_syscall+0x0/0x1c <- SyS_symlink) ln-81 [000] dn.1 39.464701: event2: (ret_fast_syscall+0x0/0x1c <- SyS_symlink) Signed-off-by: Masami Hiramatsu Cc: KUMANO Syuhei Signed-off-by: Jon Medhurst Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/probes/kprobes/core.c | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c index a4ec240ee7ba..3eb018fa1a1f 100644 --- a/arch/arm/probes/kprobes/core.c +++ b/arch/arm/probes/kprobes/core.c @@ -433,6 +433,7 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) struct hlist_node *tmp; unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; + kprobe_opcode_t *correct_ret_addr = NULL; INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); @@ -455,14 +456,34 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) /* another task is sharing our hash bucket */ continue; + orig_ret_address = (unsigned long)ri->ret_addr; + + if (orig_ret_address != trampoline_address) + /* + * This is the real return address. Any other + * instances associated with this task are for + * other calls deeper on the call stack + */ + break; + } + + kretprobe_assert(ri, orig_ret_address, trampoline_address); + + correct_ret_addr = ri->ret_addr; + hlist_for_each_entry_safe(ri, tmp, head, hlist) { + if (ri->task != current) + /* another task is sharing our hash bucket */ + continue; + + orig_ret_address = (unsigned long)ri->ret_addr; if (ri->rp && ri->rp->handler) { __this_cpu_write(current_kprobe, &ri->rp->kp); get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; + ri->ret_addr = correct_ret_addr; ri->rp->handler(ri, regs); __this_cpu_write(current_kprobe, NULL); } - orig_ret_address = (unsigned long)ri->ret_addr; recycle_rp_inst(ri, &empty_rp); if (orig_ret_address != trampoline_address) @@ -474,7 +495,6 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs) break; } - kretprobe_assert(ri, orig_ret_address, trampoline_address); kretprobe_hash_unlock(current, &flags); hlist_for_each_entry_safe(ri, tmp, &empty_rp, hlist) { -- cgit v1.2.3 From a23a447e47ccb333a392e9e132c231be0d5953db Mon Sep 17 00:00:00 2001 From: Jon Medhurst Date: Thu, 2 Mar 2017 13:04:09 +0000 Subject: arm: kprobes: Align stack to 8-bytes in test code [ Upstream commit 974310d047f3c7788a51d10c8d255eebdb1fa857 ] kprobes test cases need to have a stack that is aligned to an 8-byte boundary because they call other functions (and the ARM ABI mandates that alignment) and because test cases include 64-bit accesses to the stack. Unfortunately, GCC doesn't ensure this alignment for inline assembler and for the code in question seems to always misalign it by pushing just the LR register onto the stack. We therefore need to explicitly perform stack alignment at the start of each test case. 
Without this fix, some test cases will generate alignment faults on systems where alignment is enforced. Even if the kernel is configured to handle these faults in software, triggering them is ugly. It also exposes limitations in the fault handling code which doesn't cope with writes to the stack. E.g. when handling this instruction strd r6, [sp, #-64]! the fault handling code will write to a stack location below the SP value at the point the fault occurred, which coincides with where the exception handler has pushed the saved register context. This results in corruption of those registers. Signed-off-by: Jon Medhurst Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/probes/kprobes/test-core.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/arm/probes/kprobes/test-core.c b/arch/arm/probes/kprobes/test-core.c index 9775de22e2ff..a48354de1aa1 100644 --- a/arch/arm/probes/kprobes/test-core.c +++ b/arch/arm/probes/kprobes/test-core.c @@ -976,7 +976,10 @@ static void coverage_end(void) void __naked __kprobes_test_case_start(void) { __asm__ __volatile__ ( - "stmdb sp!, {r4-r11} \n\t" + "mov r2, sp \n\t" + "bic r3, r2, #7 \n\t" + "mov sp, r3 \n\t" + "stmdb sp!, {r2-r11} \n\t" "sub sp, sp, #"__stringify(TEST_MEMORY_SIZE)"\n\t" "bic r0, lr, #1 @ r0 = inline data \n\t" "mov r1, sp \n\t" @@ -996,7 +999,8 @@ void __naked __kprobes_test_case_end_32(void) "movne pc, r0 \n\t" "mov r0, r4 \n\t" "add sp, sp, #"__stringify(TEST_MEMORY_SIZE)"\n\t" - "ldmia sp!, {r4-r11} \n\t" + "ldmia sp!, {r2-r11} \n\t" + "mov sp, r2 \n\t" "mov pc, r0 \n\t" ); } @@ -1012,7 +1016,8 @@ void __naked __kprobes_test_case_end_16(void) "bxne r0 \n\t" "mov r0, r4 \n\t" "add sp, sp, #"__stringify(TEST_MEMORY_SIZE)"\n\t" - "ldmia sp!, {r4-r11} \n\t" + "ldmia sp!, {r2-r11} \n\t" + "mov sp, r2 \n\t" "bx r0 \n\t" ); } -- cgit v1.2.3 From caa4cfd173b76b0b4a9ae894d438980daa235016 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 20 Mar 2017 21:18:55 -0700 Subject: KVM: x86: correct async page present tracepoint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 24dccf83a121b8a4ad5c2ad383a8184ef6c266ee ] After async pf setup successfully, there is a broadcast wakeup w/ special token 0xffffffff which tells vCPU that it should wake up all processes waiting for APFs though there is no real process waiting at the moment. The async page present tracepoint print prematurely and fails to catch the special token setup. This patch fixes it by moving the async page present tracepoint after the special token setup. 
Before patch: qemu-system-x86-8499 [006] ...1 5973.473292: kvm_async_pf_ready: token 0x0 gva 0x0 After patch: qemu-system-x86-8499 [006] ...1 5973.473292: kvm_async_pf_ready: token 0xffffffff gva 0x0 Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index df81717a92f3..e5f44f33de89 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8230,11 +8230,11 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, { struct x86_exception fault; - trace_kvm_async_pf_ready(work->arch.token, work->gva); if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ else kvm_del_async_pf_gfn(vcpu, work->arch.gfn); + trace_kvm_async_pf_ready(work->arch.token, work->gva); if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { -- cgit v1.2.3 From f1fdf68b4f947be85d96ac835d86bbf063b454b4 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 23 Mar 2017 05:30:08 -0700 Subject: KVM: VMX: Fix enable VPID conditions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 08d839c4b134b8328ec42f2157a9ca4b93227c03 ] This can be reproduced by running L2 on L1, and disable VPID on L0 if w/o commit "KVM: nVMX: Fix nested VPID vmx exec control", the L2 crash as below: KVM: entry failed, hardware error 0x7 EAX=00000000 EBX=00000000 ECX=00000000 EDX=000306c3 ESI=00000000 EDI=00000000 EBP=00000000 ESP=00000000 EIP=0000fff0 EFL=00000002 [-------] CPL=0 II=0 A20=1 SMM=0 HLT=0 ES =0000 00000000 0000ffff 00009300 CS =f000 ffff0000 0000ffff 00009b00 SS =0000 00000000 0000ffff 00009300 DS =0000 00000000 0000ffff 00009300 FS =0000 00000000 0000ffff 00009300 GS =0000 00000000 0000ffff 00009300 LDT=0000 00000000 0000ffff 00008200 TR =0000 00000000 0000ffff 00008b00 GDT= 00000000 0000ffff IDT= 00000000 0000ffff CR0=60000010 CR2=00000000 CR3=00000000 CR4=00000000 DR0=0000000000000000 DR1=0000000000000000 DR2=0000000000000000 DR3=0000000000000000 DR6=00000000ffff0ff0 DR7=0000000000000400 EFER=0000000000000000 Reference SDM 30.3 INVVPID: Protected Mode Exceptions - #UD - If not in VMX operation. - If the logical processor does not support VPIDs (IA32_VMX_PROCBASED_CTLS2[37]=0). - If the logical processor supports VPIDs (IA32_VMX_PROCBASED_CTLS2[37]=1) but does not support the INVVPID instruction (IA32_VMX_EPT_VPID_CAP[32]=0). So we should check both VPID enable bit in vmx exec control and INVVPID support bit in vmx capability MSRs to enable VPID. This patch adds the guarantee to not enable VPID if either INVVPID or single-context/all-context invalidation is not exposed in vmx capability MSRs. 
Reviewed-by: David Hildenbrand Reviewed-by: Jim Mattson Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index dcbafe53e2d4..d915185ada05 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -1107,6 +1107,11 @@ static inline bool cpu_has_vmx_invvpid_global(void) return vmx_capability.vpid & VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; } +static inline bool cpu_has_vmx_invvpid(void) +{ + return vmx_capability.vpid & VMX_VPID_INVVPID_BIT; +} + static inline bool cpu_has_vmx_ept(void) { return vmcs_config.cpu_based_2nd_exec_ctrl & @@ -6199,8 +6204,10 @@ static __init int hardware_setup(void) if (boot_cpu_has(X86_FEATURE_NX)) kvm_enable_efer_bits(EFER_NX); - if (!cpu_has_vmx_vpid()) + if (!cpu_has_vmx_vpid() || !cpu_has_vmx_invvpid() || + !(cpu_has_vmx_invvpid_single() || cpu_has_vmx_invvpid_global())) enable_vpid = 0; + if (!cpu_has_vmx_shadow_vmcs()) enable_shadow_vmcs = 0; if (enable_shadow_vmcs) -- cgit v1.2.3 From 2e114c7b45625f6db6a30529f5d89b1b8dd0a367 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Tue, 21 Mar 2017 21:03:01 -0500 Subject: ARM: dts: ti: fix PCI bus dtc warnings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 7d79f6098d82f8c09914d7799bc96891ad9c3baf ] dtc recently added PCI bus checks. Fix these warnings. Signed-off-by: Rob Herring Cc: "Benoît Cousson" Cc: Tony Lindgren Cc: linux-omap@vger.kernel.org Signed-off-by: Tony Lindgren Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/dra7.dtsi | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/dra7.dtsi b/arch/arm/boot/dts/dra7.dtsi index c2a03c740e79..02bd6312d1d9 100644 --- a/arch/arm/boot/dts/dra7.dtsi +++ b/arch/arm/boot/dts/dra7.dtsi @@ -227,6 +227,7 @@ device_type = "pci"; ranges = <0x81000000 0 0 0x03000 0 0x00010000 0x82000000 0 0x20013000 0x13000 0 0xffed000>; + bus-range = <0x00 0xff>; #interrupt-cells = <1>; num-lanes = <1>; ti,hwmods = "pcie1"; @@ -262,6 +263,7 @@ device_type = "pci"; ranges = <0x81000000 0 0 0x03000 0 0x00010000 0x82000000 0 0x30013000 0x13000 0 0xffed000>; + bus-range = <0x00 0xff>; #interrupt-cells = <1>; num-lanes = <1>; ti,hwmods = "pcie2"; -- cgit v1.2.3 From 27c2fa1ae0ea735b32376777f012ab92c4ee8650 Mon Sep 17 00:00:00 2001 From: "Reizer, Eyal" Date: Sun, 26 Mar 2017 08:53:10 +0000 Subject: ARM: dts: am335x-evmsk: adjust mmc2 param to allow suspend [ Upstream commit 9bcf53f34a2c1cebc45cc12e273dcd5f51fbc099 ] mmc2 used for wl12xx was missing the keep-power-in suspend parameter. As a result the board couldn't reach suspend state. 
Signed-off-by: Eyal Reizer Signed-off-by: Tony Lindgren Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/am335x-evmsk.dts | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/am335x-evmsk.dts b/arch/arm/boot/dts/am335x-evmsk.dts index 89442e98a837..3af570517903 100644 --- a/arch/arm/boot/dts/am335x-evmsk.dts +++ b/arch/arm/boot/dts/am335x-evmsk.dts @@ -668,6 +668,7 @@ ti,non-removable; bus-width = <4>; cap-power-off-card; + keep-power-in-suspend; pinctrl-names = "default"; pinctrl-0 = <&mmc2_pins>; -- cgit v1.2.3 From 1779b33294dab3e3a08afa5196d01848acfbe40c Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 29 Mar 2017 17:12:47 +0100 Subject: ARM: dma-mapping: disallow dma_get_sgtable() for non-kernel managed memory [ Upstream commit 916a008b4b8ecc02fbd035cfb133773dba1ff3d7 ] dma_get_sgtable() tries to create a scatterlist table containing valid struct page pointers for the coherent memory allocation passed in to it. However, memory can be declared via dma_declare_coherent_memory(), or via other reservation schemes which means that coherent memory is not guaranteed to be backed by struct pages. In such cases, the resulting scatterlist table contains pointers to invalid pages, which causes kernel oops later. This patch adds detection of such memory, and refuses to create a scatterlist table for such memory. Reported-by: Shuah Khan Signed-off-by: Russell King Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/mm/dma-mapping.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 534a60ae282e..613c1d06316a 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -774,13 +774,31 @@ static void arm_coherent_dma_free(struct device *dev, size_t size, void *cpu_add __arm_dma_free(dev, size, cpu_addr, handle, attrs, true); } +/* + * The whole dma_get_sgtable() idea is fundamentally unsafe - it seems + * that the intention is to allow exporting memory allocated via the + * coherent DMA APIs through the dma_buf API, which only accepts a + * scattertable. This presents a couple of problems: + * 1. Not all memory allocated via the coherent DMA APIs is backed by + * a struct page + * 2. Passing coherent DMA memory into the streaming APIs is not allowed + * as we will try to flush the memory through a different alias to that + * actually being used (and the flushes are redundant.) + */ int arm_dma_get_sgtable(struct device *dev, struct sg_table *sgt, void *cpu_addr, dma_addr_t handle, size_t size, struct dma_attrs *attrs) { - struct page *page = pfn_to_page(dma_to_pfn(dev, handle)); + unsigned long pfn = dma_to_pfn(dev, handle); + struct page *page; int ret; + /* If the PFN is not valid, we do not have a struct page */ + if (!pfn_valid(pfn)) + return -ENXIO; + + page = pfn_to_page(pfn); + ret = sg_alloc_table(sgt, 1, GFP_KERNEL); if (unlikely(ret)) return ret; -- cgit v1.2.3 From 9c631278a9c6eadebadcd4ee6efbb4edfe2f1c61 Mon Sep 17 00:00:00 2001 From: Aleksandar Markovic Date: Thu, 2 Nov 2017 12:13:58 +0100 Subject: MIPS: math-emu: Fix final emulation phase for certain instructions commit 409fcace9963c1e8d2cb0f7ac62e8b34d47ef979 upstream. Fix final phase of . emulation. Provide proper generation of SIGFPE signal and updating debugfs FP exception stats in cases of any exception flags set in preceding phases of emulation. CLASS. 
instruction may generate "Unimplemented Operation" FP exception. . instructions may generate "Inexact", "Unimplemented Operation", "Invalid Operation", "Overflow", and "Underflow" FP exceptions. . instructions can generate "Unimplemented Operation" and "Invalid Operation" FP exceptions. The proper final processing of the cases when any FP exception flag is set is achieved by replacing "break" statement with "goto copcsr" statement. With such solution, this patch brings the final phase of emulation of the above instructions consistent with the one corresponding to the previously implemented emulation of other related FPU instructions (ADD, SUB, etc.). Fixes: 38db37ba069f ("MIPS: math-emu: Add support for the MIPS R6 CLASS FPU instruction") Fixes: e24c3bec3e8e ("MIPS: math-emu: Add support for the MIPS R6 MADDF FPU instruction") Fixes: 83d43305a1df ("MIPS: math-emu: Add support for the MIPS R6 MSUBF FPU instruction") Fixes: a79f5f9ba508 ("MIPS: math-emu: Add support for the MIPS R6 MAX{, A} FPU instruction") Fixes: 4e9561b20e2f ("MIPS: math-emu: Add support for the MIPS R6 MIN{, A} FPU instruction") Signed-off-by: Aleksandar Markovic Cc: Ralf Baechle Cc: Douglas Leung Cc: Goran Ferenc Cc: "Maciej W. Rozycki" Cc: Miodrag Dinic Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17581/ Signed-off-by: James Hogan Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/cp1emu.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c index dd058aa8a3b5..89d05de8040a 100644 --- a/arch/mips/math-emu/cp1emu.c +++ b/arch/mips/math-emu/cp1emu.c @@ -1777,7 +1777,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx, SPFROMREG(fs, MIPSInst_FS(ir)); SPFROMREG(fd, MIPSInst_FD(ir)); rv.s = ieee754sp_maddf(fd, fs, ft); - break; + goto copcsr; } case fmsubf_op: { @@ -1790,7 +1790,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx, SPFROMREG(fs, MIPSInst_FS(ir)); SPFROMREG(fd, MIPSInst_FD(ir)); rv.s = ieee754sp_msubf(fd, fs, ft); - break; + goto copcsr; } case frint_op: { @@ -1814,7 +1814,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx, SPFROMREG(fs, MIPSInst_FS(ir)); rv.w = ieee754sp_2008class(fs); rfmt = w_fmt; - break; + goto copcsr; } case fmin_op: { @@ -1826,7 +1826,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx, SPFROMREG(ft, MIPSInst_FT(ir)); SPFROMREG(fs, MIPSInst_FS(ir)); rv.s = ieee754sp_fmin(fs, ft); - break; + goto copcsr; } case fmina_op: { @@ -1838,7 +1838,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx, SPFROMREG(ft, MIPSInst_FT(ir)); SPFROMREG(fs, MIPSInst_FS(ir)); rv.s = ieee754sp_fmina(fs, ft); - break; + goto copcsr; } case fmax_op: { @@ -1850,7 +1850,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx, SPFROMREG(ft, MIPSInst_FT(ir)); SPFROMREG(fs, MIPSInst_FS(ir)); rv.s = ieee754sp_fmax(fs, ft); - break; + goto copcsr; } case fmaxa_op: { @@ -1862,7 +1862,7 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx, SPFROMREG(ft, MIPSInst_FT(ir)); SPFROMREG(fs, MIPSInst_FS(ir)); rv.s = ieee754sp_fmaxa(fs, ft); - break; + goto copcsr; } case fabs_op: @@ -2095,7 +2095,7 @@ copcsr: DPFROMREG(fs, MIPSInst_FS(ir)); DPFROMREG(fd, MIPSInst_FD(ir)); rv.d = ieee754dp_maddf(fd, fs, ft); - break; + goto copcsr; } case fmsubf_op: { @@ -2108,7 +2108,7 @@ 
copcsr: DPFROMREG(fs, MIPSInst_FS(ir)); DPFROMREG(fd, MIPSInst_FD(ir)); rv.d = ieee754dp_msubf(fd, fs, ft); - break; + goto copcsr; } case frint_op: { @@ -2132,7 +2132,7 @@ copcsr: DPFROMREG(fs, MIPSInst_FS(ir)); rv.w = ieee754dp_2008class(fs); rfmt = w_fmt; - break; + goto copcsr; } case fmin_op: { @@ -2144,7 +2144,7 @@ copcsr: DPFROMREG(ft, MIPSInst_FT(ir)); DPFROMREG(fs, MIPSInst_FS(ir)); rv.d = ieee754dp_fmin(fs, ft); - break; + goto copcsr; } case fmina_op: { @@ -2156,7 +2156,7 @@ copcsr: DPFROMREG(ft, MIPSInst_FT(ir)); DPFROMREG(fs, MIPSInst_FS(ir)); rv.d = ieee754dp_fmina(fs, ft); - break; + goto copcsr; } case fmax_op: { @@ -2168,7 +2168,7 @@ copcsr: DPFROMREG(ft, MIPSInst_FT(ir)); DPFROMREG(fs, MIPSInst_FS(ir)); rv.d = ieee754dp_fmax(fs, ft); - break; + goto copcsr; } case fmaxa_op: { @@ -2180,7 +2180,7 @@ copcsr: DPFROMREG(ft, MIPSInst_FT(ir)); DPFROMREG(fs, MIPSInst_FS(ir)); rv.d = ieee754dp_fmaxa(fs, ft); - break; + goto copcsr; } case fabs_op: -- cgit v1.2.3 From 1cfeaadfd1b45388f8b957cb58d8df3cb1e5509c Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Tue, 3 Oct 2017 16:14:15 -0700 Subject: alpha: fix build failures commit 8ee912dab95f1483156b6e994004bfcc3158d798 upstream. The build of alpha allmodconfig is giving error: arch/alpha/include/asm/mmu_context.h: In function 'ev5_switch_mm': arch/alpha/include/asm/mmu_context.h:160:2: error: implicit declaration of function 'task_thread_info'; did you mean 'init_thread_info'? [-Werror=implicit-function-declaration] The file 'mmu_context.h' needed an extra header file. Link: http://lkml.kernel.org/r/1505668810-7497-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/alpha/include/asm/mmu_context.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/alpha/include/asm/mmu_context.h b/arch/alpha/include/asm/mmu_context.h index 4c51c05333c6..4cafffa80e2c 100644 --- a/arch/alpha/include/asm/mmu_context.h +++ b/arch/alpha/include/asm/mmu_context.h @@ -7,6 +7,7 @@ * Copyright (C) 1996, Linus Torvalds */ +#include #include #include #include -- cgit v1.2.3 From c9b5338394f235ef313e2160ed3c245965597db8 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 7 Dec 2017 00:30:08 -0800 Subject: KVM: X86: Fix load RFLAGS w/o the fixed bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d73235d17ba63b53dc0e1051dbc10a1f1be91b71 upstream. *** Guest State *** CR0: actual=0x0000000000000030, shadow=0x0000000060000010, gh_mask=fffffffffffffff7 CR4: actual=0x0000000000002050, shadow=0x0000000000000000, gh_mask=ffffffffffffe871 CR3 = 0x00000000fffbc000 RSP = 0x0000000000000000 RIP = 0x0000000000000000 RFLAGS=0x00000000 DR7 = 0x0000000000000400 ^^^^^^^^^^ The failed vmentry is triggered by the following testcase when ept=Y: #include #include #include #include #include #include #include long r[5]; int main() { r[2] = open("/dev/kvm", O_RDONLY); r[3] = ioctl(r[2], KVM_CREATE_VM, 0); r[4] = ioctl(r[3], KVM_CREATE_VCPU, 7); struct kvm_regs regs = { .rflags = 0, }; ioctl(r[4], KVM_SET_REGS, ®s); ioctl(r[4], KVM_RUN, 0); } X86 RFLAGS bit 1 is fixed set, userspace can simply clearing bit 1 of RFLAGS with KVM_SET_REGS ioctl which results in vmentry fails. This patch fixes it by oring X86_EFLAGS_FIXED during ioctl. 
Suggested-by: Jim Mattson Reviewed-by: David Hildenbrand Reviewed-by: Quan Xu Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Jim Mattson Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e5f44f33de89..796f1ec67469 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -6941,7 +6941,7 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) #endif kvm_rip_write(vcpu, regs->rip); - kvm_set_rflags(vcpu, regs->rflags); + kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED); vcpu->arch.exception.pending = false; -- cgit v1.2.3 From 5a62acc900e9066f86ec7051ee92491d36fe5f81 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 21 Dec 2017 00:49:14 +0100 Subject: kvm: x86: fix RSM when PCID is non-zero commit fae1a3e775cca8c3a9e0eb34443b310871a15a92 upstream. rsm_load_state_64() and rsm_enter_protected_mode() load CR3, then CR4 & ~PCIDE, then CR0, then CR4. However, setting CR4.PCIDE fails if CR3[11:0] != 0. It's probably easier in the long run to replace rsm_enter_protected_mode() with an emulator callback that sets all the special registers (like KVM_SET_SREGS would do). For now, set the PCID field of CR3 only after CR4.PCIDE is 1. Reported-by: Laszlo Ersek Tested-by: Laszlo Ersek Fixes: 660a5d517aaab9187f93854425c4c63f4a09195c Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/emulate.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 684edebb4a0c..00045499f6c2 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2383,9 +2383,21 @@ static int rsm_load_seg_64(struct x86_emulate_ctxt *ctxt, u64 smbase, int n) } static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, - u64 cr0, u64 cr4) + u64 cr0, u64 cr3, u64 cr4) { int bad; + u64 pcid; + + /* In order to later set CR4.PCIDE, CR3[11:0] must be zero. */ + pcid = 0; + if (cr4 & X86_CR4_PCIDE) { + pcid = cr3 & 0xfff; + cr3 &= ~0xfff; + } + + bad = ctxt->ops->set_cr(ctxt, 3, cr3); + if (bad) + return X86EMUL_UNHANDLEABLE; /* * First enable PAE, long mode needs it before CR0.PG = 1 is set. 
@@ -2404,6 +2416,12 @@ static int rsm_enter_protected_mode(struct x86_emulate_ctxt *ctxt, bad = ctxt->ops->set_cr(ctxt, 4, cr4); if (bad) return X86EMUL_UNHANDLEABLE; + if (pcid) { + bad = ctxt->ops->set_cr(ctxt, 3, cr3 | pcid); + if (bad) + return X86EMUL_UNHANDLEABLE; + } + } return X86EMUL_CONTINUE; @@ -2414,11 +2432,11 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) struct desc_struct desc; struct desc_ptr dt; u16 selector; - u32 val, cr0, cr4; + u32 val, cr0, cr3, cr4; int i; cr0 = GET_SMSTATE(u32, smbase, 0x7ffc); - ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u32, smbase, 0x7ff8)); + cr3 = GET_SMSTATE(u32, smbase, 0x7ff8); ctxt->eflags = GET_SMSTATE(u32, smbase, 0x7ff4) | X86_EFLAGS_FIXED; ctxt->_eip = GET_SMSTATE(u32, smbase, 0x7ff0); @@ -2460,14 +2478,14 @@ static int rsm_load_state_32(struct x86_emulate_ctxt *ctxt, u64 smbase) ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7ef8)); - return rsm_enter_protected_mode(ctxt, cr0, cr4); + return rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); } static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) { struct desc_struct desc; struct desc_ptr dt; - u64 val, cr0, cr4; + u64 val, cr0, cr3, cr4; u32 base3; u16 selector; int i, r; @@ -2484,7 +2502,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) ctxt->ops->set_dr(ctxt, 7, (val & DR7_VOLATILE) | DR7_FIXED_1); cr0 = GET_SMSTATE(u64, smbase, 0x7f58); - ctxt->ops->set_cr(ctxt, 3, GET_SMSTATE(u64, smbase, 0x7f50)); + cr3 = GET_SMSTATE(u64, smbase, 0x7f50); cr4 = GET_SMSTATE(u64, smbase, 0x7f48); ctxt->ops->set_smbase(ctxt, GET_SMSTATE(u32, smbase, 0x7f00)); val = GET_SMSTATE(u64, smbase, 0x7ed0); @@ -2512,7 +2530,7 @@ static int rsm_load_state_64(struct x86_emulate_ctxt *ctxt, u64 smbase) dt.address = GET_SMSTATE(u64, smbase, 0x7e68); ctxt->ops->set_gdt(ctxt, &dt); - r = rsm_enter_protected_mode(ctxt, cr0, cr4); + r = rsm_enter_protected_mode(ctxt, cr0, cr3, cr4); if (r != X86EMUL_CONTINUE) return r; -- cgit v1.2.3 From b7aac649af1039889ea6653d2c7f855dd530ef40 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Tue, 12 Dec 2017 17:59:15 +0530 Subject: powerpc/perf: Dereference BHRB entries safely commit f41d84dddc66b164ac16acf3f584c276146f1c48 upstream. It's theoretically possible that branch instructions recorded in BHRB (Branch History Rolling Buffer) entries have already been unmapped before they are processed by the kernel. Hence, trying to dereference such memory location will result in a crash. eg: Unable to handle kernel paging request for data at address 0xd000000019c41764 Faulting instruction address: 0xc000000000084a14 NIP [c000000000084a14] branch_target+0x4/0x70 LR [c0000000000eb828] record_and_restart+0x568/0x5c0 Call Trace: [c0000000000eb3b4] record_and_restart+0xf4/0x5c0 (unreliable) [c0000000000ec378] perf_event_interrupt+0x298/0x460 [c000000000027964] performance_monitor_exception+0x54/0x70 [c000000000009ba4] performance_monitor_common+0x114/0x120 Fix it by deferefencing the addresses safely. Fixes: 691231846ceb ("powerpc/perf: Fix setting of "to" addresses for BHRB") Suggested-by: Naveen N. Rao Signed-off-by: Ravi Bangoria Reviewed-by: Naveen N. 
Rao [mpe: Use probe_kernel_read() which is clearer, tweak change log] Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/perf/core-book3s.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index d1e65ce545b3..b2ab164a8094 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -401,8 +401,12 @@ static __u64 power_pmu_bhrb_to(u64 addr) int ret; __u64 target; - if (is_kernel_addr(addr)) - return branch_target((unsigned int *)addr); + if (is_kernel_addr(addr)) { + if (probe_kernel_read(&instr, (void *)addr, sizeof(instr))) + return 0; + + return branch_target(&instr); + } /* Userspace: need copy instruction here then translate it */ pagefault_disable(); -- cgit v1.2.3 From 6ce9d1e6819e53c4de0bf980555c4e07bbedb4ce Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 22 Apr 2017 00:01:19 -0700 Subject: x86/vm86/32: Switch to flush_tlb_mm_range() in mark_screen_rdonly() commit 9ccee2373f0658f234727700e619df097ba57023 upstream. mark_screen_rdonly() is the last remaining caller of flush_tlb(). flush_tlb_mm_range() is potentially faster and isn't obsolete. Compile-tested only because I don't know whether software that uses this mechanism even exists. Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Sasha Levin Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/791a644076fc3577ba7f7b7cafd643cc089baa7d.1492844372.git.luto@kernel.org Signed-off-by: Ingo Molnar Cc: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/vm86_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 524619351961..510e80da7de4 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -187,7 +187,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) pte_unmap_unlock(pte, ptl); out: up_write(&mm->mmap_sem); - flush_tlb(); + flush_tlb_mm_range(mm, 0xA0000, 0xA0000 + 32*PAGE_SIZE, 0UL); } -- cgit v1.2.3 From 227d6f0e79f809e448d3157fbfd00eb54dcbb54e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 22 Apr 2017 00:01:20 -0700 Subject: x86/mm: Remove flush_tlb() and flush_tlb_current_task() commit 29961b59a51f8c6838a26a45e871a7ed6771809b upstream. I was trying to figure out what how flush_tlb_current_task() would possibly work correctly if current->mm != current->active_mm, but I realized I could spare myself the effort: it has no callers except the unused flush_tlb() macro. Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e52d64c11690f85e9f1d69d7b48cc2269cd2e94b.1492844372.git.luto@kernel.org Signed-off-by: Ingo Molnar Cc: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 9 --------- arch/x86/mm/tlb.c | 17 ----------------- 2 files changed, 26 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 4dc534175b5e..d9b3a9937166 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -204,7 +204,6 @@ static inline void __flush_tlb_one(unsigned long addr) /* * TLB flushing: * - * - flush_tlb() flushes the current mm struct TLBs * - flush_tlb_all() flushes all processes TLBs * - flush_tlb_mm(mm) flushes the specified mm context TLB's * - flush_tlb_page(vma, vmaddr) flushes one page @@ -236,11 +235,6 @@ static inline void flush_tlb_all(void) __flush_tlb_all(); } -static inline void flush_tlb(void) -{ - __flush_tlb_up(); -} - static inline void local_flush_tlb(void) { __flush_tlb_up(); @@ -302,14 +296,11 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) extern void flush_tlb_all(void); -extern void flush_tlb_current_task(void); extern void flush_tlb_page(struct vm_area_struct *, unsigned long); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); -#define flush_tlb() flush_tlb_current_task() - void native_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, unsigned long end); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 45ba87466e6a..1df4c82d3d7a 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -272,23 +272,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask, smp_call_function_many(cpumask, flush_tlb_func, &info, 1); } -void flush_tlb_current_task(void) -{ - struct mm_struct *mm = current->mm; - - preempt_disable(); - - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - - /* This is an implicit full barrier that synchronizes with switch_mm. */ - local_flush_tlb(); - - trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL); - if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); - preempt_enable(); -} - /* * See Documentation/x86/tlb.txt for details. We choose 33 * because it is large enough to cover the vast majority (at -- cgit v1.2.3 From 9f4d1ba1d407e56dac833aa0b11c60f952939e1c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 22 Apr 2017 00:01:21 -0700 Subject: x86/mm: Make flush_tlb_mm_range() more predictable commit ce27374fabf553153c3f53efcaa9bfab9216bd8c upstream. I'm about to rewrite the function almost completely, but first I want to get a functional change out of the way. Currently, if flush_tlb_mm_range() does not flush the local TLB at all, it will never do individual page flushes on remote CPUs. This seems to be an accident, and preserving it will be awkward. Let's change it first so that any regressions in the rewrite will be easier to bisect and so that the rewrite can attempt to change no visible behavior at all. The fix is simple: we can simply avoid short-circuiting the calculation of base_pages_to_flush. 
As a side effect, this also eliminates a potential corner case: if tlb_single_page_flush_ceiling == TLB_FLUSH_ALL, flush_tlb_mm_range() could have ended up flushing the entire address space one page at a time. Signed-off-by: Andy Lutomirski Acked-by: Dave Hansen Cc: Andrew Morton Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Michal Hocko Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/4b29b771d9975aad7154c314534fec235618175a.1492844372.git.luto@kernel.org Signed-off-by: Ingo Molnar Cc: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/tlb.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 1df4c82d3d7a..9e138da243ff 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -292,6 +292,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long base_pages_to_flush = TLB_FLUSH_ALL; preempt_disable(); + + if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) + base_pages_to_flush = (end - start) >> PAGE_SHIFT; + if (base_pages_to_flush > tlb_single_page_flush_ceiling) + base_pages_to_flush = TLB_FLUSH_ALL; + if (current->active_mm != mm) { /* Synchronize with switch_mm. */ smp_mb(); @@ -308,15 +314,11 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, goto out; } - if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB)) - base_pages_to_flush = (end - start) >> PAGE_SHIFT; - /* * Both branches below are implicit full barriers (MOV to CR or * INVLPG) that synchronize with switch_mm. */ - if (base_pages_to_flush > tlb_single_page_flush_ceiling) { - base_pages_to_flush = TLB_FLUSH_ALL; + if (base_pages_to_flush == TLB_FLUSH_ALL) { count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); } else { -- cgit v1.2.3 From 3efba6062a410a2a65fc9d6f53dca63db2602e65 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 22 May 2017 15:30:01 -0700 Subject: x86/mm: Reimplement flush_tlb_page() using flush_tlb_mm_range() commit ca6c99c0794875c6d1db6e22f246699691ab7e6b upstream. flush_tlb_page() was very similar to flush_tlb_mm_range() except that it had a couple of issues: - It was missing an smp_mb() in the case where current->active_mm != mm. (This is a longstanding bug reported by Nadav Amit) - It was missing tracepoints and vm counter updates. The only reason that I can see for keeping it at as a separate function is that it could avoid a few branches that flush_tlb_mm_range() needs to decide to flush just one page. This hardly seems worthwhile. If we decide we want to get rid of those branches again, a better way would be to introduce an __flush_tlb_mm_range() helper and make both flush_tlb_page() and flush_tlb_mm_range() use it. 
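Should those branches ever become worth saving again, the helper split suggested above would look roughly like this (purely a sketch of the proposed refactor, not part of this patch):

void __flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
			  unsigned long end, unsigned long vmflag);

static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long addr)
{
	/* Single-page case: one PAGE_SIZE range, no VM flags of interest. */
	__flush_tlb_mm_range(vma->vm_mm, addr, addr + PAGE_SIZE, VM_NONE);
}

static inline void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
				      unsigned long end, unsigned long vmflag)
{
	__flush_tlb_mm_range(mm, start, end, vmflag);
}
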
Signed-off-by: Andy Lutomirski Acked-by: Kees Cook Cc: Andrew Morton Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Mel Gorman Cc: Michal Hocko Cc: Nadav Amit Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/3cc3847cf888d8907577569b8bac3f01992ef8f9.1495492063.git.luto@kernel.org Signed-off-by: Ingo Molnar Cc: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 6 +++++- arch/x86/mm/tlb.c | 27 --------------------------- 2 files changed, 5 insertions(+), 28 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index d9b3a9937166..47f628464935 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -296,11 +296,15 @@ static inline void flush_tlb_kernel_range(unsigned long start, flush_tlb_mm_range(vma->vm_mm, start, end, vma->vm_flags) extern void flush_tlb_all(void); -extern void flush_tlb_page(struct vm_area_struct *, unsigned long); extern void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); +static inline void flush_tlb_page(struct vm_area_struct *vma, unsigned long a) +{ + flush_tlb_mm_range(vma->vm_mm, a, a + PAGE_SIZE, VM_NONE); +} + void native_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long start, unsigned long end); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 9e138da243ff..f3ed742c6203 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -339,33 +339,6 @@ out: preempt_enable(); } -void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) -{ - struct mm_struct *mm = vma->vm_mm; - - preempt_disable(); - - if (current->active_mm == mm) { - if (current->mm) { - /* - * Implicit full barrier (INVLPG) that synchronizes - * with switch_mm. - */ - __flush_tlb_one(start); - } else { - leave_mm(smp_processor_id()); - - /* Synchronize with switch_mm. */ - smp_mb(); - } - } - - if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) - flush_tlb_others(mm_cpumask(mm), mm, start, start + PAGE_SIZE); - - preempt_enable(); -} - static void do_flush_tlb_all(void *info) { count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); -- cgit v1.2.3 From b2e24274d50e0ecdf560ebe06dbed0cc648ad3f9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 28 May 2017 10:00:14 -0700 Subject: x86/mm: Remove the UP asm/tlbflush.h code, always use the (formerly) SMP code commit ce4a4e565f5264909a18c733b864c3f74467f69e upstream. The UP asm/tlbflush.h generates somewhat nicer code than the SMP version. Aside from that, it's fallen quite a bit behind the SMP code: - flush_tlb_mm_range() didn't flush individual pages if the range was small. - The lazy TLB code was much weaker. This usually wouldn't matter, but, if a kernel thread flushed its lazy "active_mm" more than once (due to reclaim or similar), it wouldn't be unlazied and would instead pointlessly flush repeatedly. - Tracepoints were missing. Aside from that, simply having the UP code around was a maintanence burden, since it means that any change to the TLB flush code had to make sure not to break it. Simplify everything by deleting the UP code. 
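For reference, the "unlazied" behaviour mentioned above is what the (formerly SMP-only) flush IPI path provides: a CPU that is only lazily borrowing an mm drops it on the first flush instead of servicing flush after flush. A simplified sketch of that logic, with the range/bookkeeping details omitted:

static void flush_tlb_func(void *info)
{
	if (this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) {
		/*
		 * Lazy mode: leave the mm so this CPU drops out of
		 * mm_cpumask() and stops receiving further flush IPIs,
		 * instead of pointlessly flushing again and again.
		 */
		leave_mm(smp_processor_id());
		return;
	}
	local_flush_tlb();
}
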
Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Mel Gorman Cc: Michal Hocko Cc: Nadav Amit Cc: Nadav Amit Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: linux-mm@kvack.org Signed-off-by: Ingo Molnar Cc: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 2 +- arch/x86/include/asm/hardirq.h | 2 +- arch/x86/include/asm/mmu.h | 6 --- arch/x86/include/asm/mmu_context.h | 2 - arch/x86/include/asm/tlbflush.h | 78 +------------------------------------- arch/x86/mm/init.c | 2 - arch/x86/mm/tlb.c | 17 +-------- 7 files changed, 5 insertions(+), 104 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7e40905f6d4c..39d2dc66faa5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -42,7 +42,7 @@ config X86 select ARCH_USE_CMPXCHG_LOCKREF if X86_64 select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS - select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH if SMP + select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANT_FRAME_POINTERS select ARCH_WANT_IPC_PARSE_VERSION if X86_32 diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 59405a248fc2..9b76cd331990 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -22,8 +22,8 @@ typedef struct { #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; - unsigned int irq_tlb_count; #endif + unsigned int irq_tlb_count; #ifdef CONFIG_X86_THERMAL_VECTOR unsigned int irq_thermal_count; #endif diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 55234d5e7160..7680b76adafc 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -24,12 +24,6 @@ typedef struct { atomic_t perf_rdpmc_allowed; /* nonzero if rdpmc is allowed */ } mm_context_t; -#ifdef CONFIG_SMP void leave_mm(int cpu); -#else -static inline void leave_mm(int cpu) -{ -} -#endif #endif /* _ASM_X86_MMU_H */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 44fc93987869..9bfc5fd77015 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -98,10 +98,8 @@ static inline void load_mm_ldt(struct mm_struct *mm) static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { -#ifdef CONFIG_SMP if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); -#endif } extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 47f628464935..822ad36be17b 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -6,6 +6,7 @@ #include #include +#include static inline void __invpcid(unsigned long pcid, unsigned long addr, unsigned long type) @@ -64,10 +65,8 @@ static inline void invpcid_flush_all_nonglobals(void) #endif struct tlb_state { -#ifdef CONFIG_SMP struct mm_struct *active_mm; int state; -#endif /* * Access to this CR4 shadow and to H/W CR4 is protected by @@ -215,79 +214,6 @@ static inline void __flush_tlb_one(unsigned long addr) * and page-granular flushes are available only on i486 and up. */ -#ifndef CONFIG_SMP - -/* "_up" is for UniProcessor. - * - * This is a helper for other header functions. *Not* intended to be called - * directly. All global TLB flushes need to either call this, or to bump the - * vm statistics themselves. 
- */ -static inline void __flush_tlb_up(void) -{ - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - __flush_tlb(); -} - -static inline void flush_tlb_all(void) -{ - count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); - __flush_tlb_all(); -} - -static inline void local_flush_tlb(void) -{ - __flush_tlb_up(); -} - -static inline void flush_tlb_mm(struct mm_struct *mm) -{ - if (mm == current->active_mm) - __flush_tlb_up(); -} - -static inline void flush_tlb_page(struct vm_area_struct *vma, - unsigned long addr) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb_one(addr); -} - -static inline void flush_tlb_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ - if (vma->vm_mm == current->active_mm) - __flush_tlb_up(); -} - -static inline void flush_tlb_mm_range(struct mm_struct *mm, - unsigned long start, unsigned long end, unsigned long vmflag) -{ - if (mm == current->active_mm) - __flush_tlb_up(); -} - -static inline void native_flush_tlb_others(const struct cpumask *cpumask, - struct mm_struct *mm, - unsigned long start, - unsigned long end) -{ -} - -static inline void reset_lazy_tlbstate(void) -{ -} - -static inline void flush_tlb_kernel_range(unsigned long start, - unsigned long end) -{ - flush_tlb_all(); -} - -#else /* SMP */ - -#include - #define local_flush_tlb() __flush_tlb() #define flush_tlb_mm(mm) flush_tlb_mm_range(mm, 0UL, TLB_FLUSH_ALL, 0UL) @@ -318,8 +244,6 @@ static inline void reset_lazy_tlbstate(void) this_cpu_write(cpu_tlbstate.active_mm, &init_mm); } -#endif /* SMP */ - #ifndef CONFIG_PARAVIRT #define flush_tlb_others(mask, mm, start, end) \ native_flush_tlb_others(mask, mm, start, end) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 3aebbd6c6f5f..ed4b372860e4 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -753,10 +753,8 @@ void __init zone_sizes_init(void) } DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { -#ifdef CONFIG_SMP .active_mm = &init_mm, .state = 0, -#endif .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; EXPORT_SYMBOL_GPL(cpu_tlbstate); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index f3ed742c6203..7a4cdb632508 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -15,7 +15,7 @@ #include /* - * Smarter SMP flushing macros. + * TLB flushing, formerly SMP-only * c/o Linus Torvalds. 
* * These mean you can really definitely utterly forget about @@ -28,8 +28,6 @@ * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi */ -#ifdef CONFIG_SMP - struct flush_tlb_info { struct mm_struct *flush_mm; unsigned long flush_start; @@ -59,8 +57,6 @@ void leave_mm(int cpu) } EXPORT_SYMBOL_GPL(leave_mm); -#endif /* CONFIG_SMP */ - void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { @@ -77,10 +73,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, unsigned cpu = smp_processor_id(); if (likely(prev != next)) { -#ifdef CONFIG_SMP this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); this_cpu_write(cpu_tlbstate.active_mm, next); -#endif cpumask_set_cpu(cpu, mm_cpumask(next)); /* @@ -137,9 +131,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, if (unlikely(prev->context.ldt != next->context.ldt)) load_mm_ldt(next); #endif - } -#ifdef CONFIG_SMP - else { + } else { this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); BUG_ON(this_cpu_read(cpu_tlbstate.active_mm) != next); @@ -166,11 +158,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, load_mm_ldt(next); } } -#endif } -#ifdef CONFIG_SMP - /* * The flush IPI assumes that a thread switch happens in this order: * [cpu0: the cpu that switches] @@ -423,5 +412,3 @@ static int __init create_tlb_single_page_flush_ceiling(void) return 0; } late_initcall(create_tlb_single_page_flush_ceiling); - -#endif /* CONFIG_SMP */ -- cgit v1.2.3 From 78043e5b6fb2921d836b31f23e89e52925191153 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:19 -0700 Subject: x86/mm: Disable PCID on 32-bit kernels commit cba4671af7550e008f7a7835f06df0763825bf3e upstream. 32-bit kernels on new hardware will see PCID in CPUID, but PCID can only be used in 64-bit mode. Rather than making all PCID code conditional, just disable the feature on 32-bit builds. 
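Because the feature lands in the build-time disabled mask, PCID-conditional code elsewhere evaporates on 32-bit without any #ifdefs; an illustrative caller (not from this patch):

	/*
	 * With X86_FEATURE_PCID in DISABLED_MASK4 on 32-bit builds,
	 * cpu_feature_enabled() evaluates to a compile-time 0 here and
	 * the whole branch is discarded by the compiler.
	 */
	if (cpu_feature_enabled(X86_FEATURE_PCID))
		cr4_set_bits(X86_CR4_PCIDE);
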
Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/2e391769192a4d31b808410c383c6bf0734bc6ea.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar Cc: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/disabled-features.h | 4 +++- arch/x86/kernel/cpu/bugs.c | 8 ++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index f226df064660..8b17c2ad1048 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -21,11 +21,13 @@ # define DISABLE_K6_MTRR (1<<(X86_FEATURE_K6_MTRR & 31)) # define DISABLE_CYRIX_ARR (1<<(X86_FEATURE_CYRIX_ARR & 31)) # define DISABLE_CENTAUR_MCR (1<<(X86_FEATURE_CENTAUR_MCR & 31)) +# define DISABLE_PCID 0 #else # define DISABLE_VME 0 # define DISABLE_K6_MTRR 0 # define DISABLE_CYRIX_ARR 0 # define DISABLE_CENTAUR_MCR 0 +# define DISABLE_PCID (1<<(X86_FEATURE_PCID & 31)) #endif /* CONFIG_X86_64 */ /* @@ -35,7 +37,7 @@ #define DISABLED_MASK1 0 #define DISABLED_MASK2 0 #define DISABLED_MASK3 (DISABLE_CYRIX_ARR|DISABLE_CENTAUR_MCR|DISABLE_K6_MTRR) -#define DISABLED_MASK4 0 +#define DISABLED_MASK4 (DISABLE_PCID) #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 #define DISABLED_MASK7 0 diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index bd17db15a2c1..0b6124315441 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -19,6 +19,14 @@ void __init check_bugs(void) { +#ifdef CONFIG_X86_32 + /* + * Regardless of whether PCID is enumerated, the SDM says + * that it can't be enabled in 32-bit mode. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); +#endif + identify_boot_cpu(); #ifndef CONFIG_SMP pr_info("CPU: "); -- cgit v1.2.3 From dcccd3c266e24ce80ecf592765315c54c222ac33 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:20 -0700 Subject: x86/mm: Add the 'nopcid' boot option to turn off PCID commit 0790c9aad84901ca1bdc14746175549c8b5da215 upstream. The parameter is only present on x86_64 systems to save a few bytes, as PCID is always disabled on x86_32. 
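Usage is the ordinary kernel-parameter mechanism; for example (the bootloader configuration shown is illustrative only):

	# append the option to the kernel command line, then reboot
	GRUB_CMDLINE_LINUX="... nopcid"
	# afterwards, "pcid" should be absent from the flags line in /proc/cpuinfo

Note that, per the handler below, the option requires an exact match, so variants with trailing characters (e.g. "nopcid=1") are not treated as this option.
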
Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Borislav Petkov Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/8bbb2e65bcd249a5f18bfb8128b4689f08ac2b60.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar Cc: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/common.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index c84b62956e8d..f4776a3afc8f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -162,6 +162,24 @@ static int __init x86_mpx_setup(char *s) } __setup("nompx", x86_mpx_setup); +#ifdef CONFIG_X86_64 +static int __init x86_pcid_setup(char *s) +{ + /* require an exact match without trailing characters */ + if (strlen(s)) + return 0; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_PCID)) + return 1; + + setup_clear_cpu_cap(X86_FEATURE_PCID); + pr_info("nopcid: PCID feature disabled\n"); + return 1; +} +__setup("nopcid", x86_pcid_setup); +#endif + static int __init x86_noinvpcid_setup(char *s) { /* noinvpcid doesn't accept parameters */ -- cgit v1.2.3 From fd0504525efd2ce2063cd4229baabd3e3a56ecbc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Jun 2017 08:53:21 -0700 Subject: x86/mm: Enable CR4.PCIDE on supported systems commit 660da7c9228f685b2ebe664f9fd69aaddcc420b5 upstream. We can use PCID if the CPU has PCID and PGE and we're not on Xen. By itself, this has no effect. A followup patch will start using PCID. Signed-off-by: Andy Lutomirski Reviewed-by: Nadav Amit Reviewed-by: Boris Ostrovsky Reviewed-by: Thomas Gleixner Cc: Andrew Morton Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dave Hansen Cc: Juergen Gross Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Zijlstra Cc: Rik van Riel Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/6327ecd907b32f79d5aa0d466f04503bbec5df88.1498751203.git.luto@kernel.org Signed-off-by: Ingo Molnar Cc: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 8 ++++++++ arch/x86/kernel/cpu/common.c | 22 ++++++++++++++++++++++ arch/x86/xen/enlighten.c | 6 ++++++ 3 files changed, 36 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 822ad36be17b..9fc5968da820 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -190,6 +190,14 @@ static inline void __flush_tlb_all(void) __flush_tlb_global(); else __flush_tlb(); + + /* + * Note: if we somehow had PCID but not PGE, then this wouldn't work -- + * we'd end up flushing kernel translations for the current ASID but + * we might fail to flush kernel translations for other cached ASIDs. + * + * To avoid this issue, we force PCID off if PGE is off. 
+ */ } static inline void __flush_tlb_one(unsigned long addr) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f4776a3afc8f..aa1e7246b06b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -321,6 +321,25 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) } } +static void setup_pcid(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_PCID)) { + if (cpu_has(c, X86_FEATURE_PGE)) { + cr4_set_bits(X86_CR4_PCIDE); + } else { + /* + * flush_tlb_all(), as currently implemented, won't + * work if PCID is on but PGE is not. Since that + * combination doesn't exist on real hardware, there's + * no reason to try to fully support it, but it's + * polite to avoid corrupting data if we're on + * an improperly configured VM. + */ + clear_cpu_cap(c, X86_FEATURE_PCID); + } + } +} + /* * Some CPU features depend on higher CPUID levels, which may not always * be available due to CPUID level capping or broken virtualization @@ -952,6 +971,9 @@ static void identify_cpu(struct cpuinfo_x86 *c) setup_smep(c); setup_smap(c); + /* Set up PCID */ + setup_pcid(c); + /* * The vendor-specific functions might have changed features. * Now we do "generic changes." diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index ffa41591bff9..cbef64b508e1 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -433,6 +433,12 @@ static void __init xen_init_cpuid_mask(void) ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */ (1 << X86_FEATURE_ACC)); /* thermal monitoring */ + /* + * Xen PV would need some work to support PCID: CR3 handling as well + * as xen_flush_tlb_others() would need updating. + */ + cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_PCID % 32)); /* disable PCID */ + if (!xen_initial_domain()) cpuid_leaf1_edx_mask &= ~((1 << X86_FEATURE_ACPI)); /* disable ACPI */ -- cgit v1.2.3 From 6c4db09c291a19da66512f99c4bcb378a862f9e6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 8 Oct 2017 21:53:05 -0700 Subject: x86/mm/64: Fix reboot interaction with CR4.PCIDE commit 924c6b900cfdf376b07bccfd80e62b21914f8a5a upstream. Trying to reboot via real mode fails with PCID on: long mode cannot be exited while CR4.PCIDE is set. (No, I have no idea why, but the SDM and actual CPUs are in agreement here.) The result is a GPF and a hang instead of a reboot. I didn't catch this in testing because neither my computer nor my VM reboots this way. I can trigger it with reboot=bios, though. Fixes: 660da7c9228f ("x86/mm: Enable CR4.PCIDE on supported systems") Reported-and-tested-by: Steven Rostedt (VMware) Signed-off-by: Andy Lutomirski Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Link: https://lkml.kernel.org/r/f1e7d965998018450a7a70c2823873686a8b21c0.1507524746.git.luto@kernel.org Cc: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/reboot.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index f660d63f40fe..9a16932c7258 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -93,6 +93,10 @@ void __noreturn machine_real_restart(unsigned int type) load_cr3(initial_page_table); #else write_cr3(real_mode_header->trampoline_pgd); + + /* Exiting long mode will fail if CR4.PCIDE is set. 
*/ + if (static_cpu_has(X86_FEATURE_PCID)) + cr4_clear_bits(X86_CR4_PCIDE); #endif /* Jump to the identity-mapped low memory code */ -- cgit v1.2.3 From 977614061c3db07abd9b3d8c94088fd866b858a8 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 30 Dec 2017 22:13:53 +0100 Subject: x86/smpboot: Remove stale TLB flush invocations commit 322f8b8b340c824aef891342b0f5795d15e11562 upstream. smpboot_setup_warm_reset_vector() and smpboot_restore_warm_reset_vector() invoke local_flush_tlb() for no obvious reason. Digging in history revealed that the original code in the 2.1 era added those because the code manipulated a swapper_pg_dir pagetable entry. The pagetable manipulation was removed long ago in the 2.3 timeframe, but the TLB flush invocations stayed around forever. Remove them along with the pointless pr_debug()s which come from the same 2.1 change. Reported-by: Dominik Brodowski Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Linus Torvalds Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20171230211829.586548655@linutronix.de Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/smpboot.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fbabe4fcc7fb..fe89f938e0f0 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -104,25 +104,16 @@ static inline void smpboot_setup_warm_reset_vector(unsigned long start_eip) spin_lock_irqsave(&rtc_lock, flags); CMOS_WRITE(0xa, 0xf); spin_unlock_irqrestore(&rtc_lock, flags); - local_flush_tlb(); - pr_debug("1.\n"); *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_HIGH)) = start_eip >> 4; - pr_debug("2.\n"); *((volatile unsigned short *)phys_to_virt(TRAMPOLINE_PHYS_LOW)) = start_eip & 0xf; - pr_debug("3.\n"); } static inline void smpboot_restore_warm_reset_vector(void) { unsigned long flags; - /* - * Install writable page 0 entry to set BIOS data area. - */ - local_flush_tlb(); - /* * Paranoid: Set warm reset code and vector here back * to default values. -- cgit v1.2.3 From 0fa147b407478e73fe7a478677ff2b12bb824014 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 17 Jul 2017 16:10:33 -0500 Subject: x86/boot: Add early cmdline parsing for options with arguments MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e505371dd83963caae1a37ead9524e8d997341be upstream. Add a cmdline_find_option() function to look for cmdline options that take arguments. The argument is returned in a supplied buffer and the argument length (regardless of whether it fits in the supplied buffer) is returned, with -1 indicating not found. Signed-off-by: Tom Lendacky Reviewed-by: Thomas Gleixner Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brijesh Singh Cc: Dave Young Cc: Dmitry Vyukov Cc: Jonathan Corbet Cc: Konrad Rzeszutek Wilk Cc: Larry Woodman Cc: Linus Torvalds Cc: Matt Fleming Cc: Michael S. 
Tsirkin Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krčmář Cc: Rik van Riel Cc: Toshimitsu Kani Cc: kasan-dev@googlegroups.com Cc: kvm@vger.kernel.org Cc: linux-arch@vger.kernel.org Cc: linux-doc@vger.kernel.org Cc: linux-efi@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/36b5f97492a9745dce27682305f990fc20e5cf8a.1500319216.git.thomas.lendacky@amd.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cmdline.h | 2 + arch/x86/lib/cmdline.c | 105 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/cmdline.h b/arch/x86/include/asm/cmdline.h index e01f7f7ccb0c..84ae170bc3d0 100644 --- a/arch/x86/include/asm/cmdline.h +++ b/arch/x86/include/asm/cmdline.h @@ -2,5 +2,7 @@ #define _ASM_X86_CMDLINE_H int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); +int cmdline_find_option(const char *cmdline_ptr, const char *option, + char *buffer, int bufsize); #endif /* _ASM_X86_CMDLINE_H */ diff --git a/arch/x86/lib/cmdline.c b/arch/x86/lib/cmdline.c index 422db000d727..a744506856b1 100644 --- a/arch/x86/lib/cmdline.c +++ b/arch/x86/lib/cmdline.c @@ -82,3 +82,108 @@ int cmdline_find_option_bool(const char *cmdline, const char *option) return 0; /* Buffer overrun */ } + +/* + * Find a non-boolean option (i.e. option=argument). In accordance with + * standard Linux practice, if this option is repeated, this returns the + * last instance on the command line. + * + * @cmdline: the cmdline string + * @max_cmdline_size: the maximum size of cmdline + * @option: option string to look for + * @buffer: memory buffer to return the option argument + * @bufsize: size of the supplied memory buffer + * + * Returns the length of the argument (regardless of if it was + * truncated to fit in the buffer), or -1 on not found. + */ +static int +__cmdline_find_option(const char *cmdline, int max_cmdline_size, + const char *option, char *buffer, int bufsize) +{ + char c; + int pos = 0, len = -1; + const char *opptr = NULL; + char *bufptr = buffer; + enum { + st_wordstart = 0, /* Start of word/after whitespace */ + st_wordcmp, /* Comparing this word */ + st_wordskip, /* Miscompare, skip */ + st_bufcpy, /* Copying this to buffer */ + } state = st_wordstart; + + if (!cmdline) + return -1; /* No command line */ + + /* + * This 'pos' check ensures we do not overrun + * a non-NULL-terminated 'cmdline' + */ + while (pos++ < max_cmdline_size) { + c = *(char *)cmdline++; + if (!c) + break; + + switch (state) { + case st_wordstart: + if (myisspace(c)) + break; + + state = st_wordcmp; + opptr = option; + /* fall through */ + + case st_wordcmp: + if ((c == '=') && !*opptr) { + /* + * We matched all the way to the end of the + * option we were looking for, prepare to + * copy the argument. + */ + len = 0; + bufptr = buffer; + state = st_bufcpy; + break; + } else if (c == *opptr++) { + /* + * We are currently matching, so continue + * to the next character on the cmdline. + */ + break; + } + state = st_wordskip; + /* fall through */ + + case st_wordskip: + if (myisspace(c)) + state = st_wordstart; + break; + + case st_bufcpy: + if (myisspace(c)) { + state = st_wordstart; + } else { + /* + * Increment len, but don't overrun the + * supplied buffer and leave room for the + * NULL terminator. 
+ */ + if (++len < bufsize) + *bufptr++ = c; + } + break; + } + } + + if (bufsize) + *bufptr = '\0'; + + return len; +} + +int cmdline_find_option(const char *cmdline, const char *option, char *buffer, + int bufsize) +{ + return __cmdline_find_option(cmdline, COMMAND_LINE_SIZE, option, + buffer, bufsize); +} -- cgit v1.2.3 From 8a43ddfb93a0c6ae1a6e1f5c25705ec5d1843c40 Mon Sep 17 00:00:00 2001 From: Richard Fellner Date: Thu, 4 May 2017 14:26:50 +0200 Subject: KAISER: Kernel Address Isolation This patch introduces our implementation of KAISER (Kernel Address Isolation to have Side-channels Efficiently Removed), a kernel isolation technique to close hardware side channels on kernel address information. More information about the patch can be found on: https://github.com/IAIK/KAISER From: Richard Fellner From: Daniel Gruss X-Subject: [RFC, PATCH] x86_64: KAISER - do not map kernel in user mode Date: Thu, 4 May 2017 14:26:50 +0200 Link: http://marc.info/?l=linux-kernel&m=149390087310405&w=2 Kaiser-4.10-SHA1: c4b1831d44c6144d3762ccc72f0c4e71a0c713e5 To: To: Cc: Cc: Cc: Michael Schwarz Cc: Richard Fellner Cc: Ingo Molnar Cc: Cc: After several recent works [1,2,3] KASLR on x86_64 was basically considered dead by many researchers. We have been working on an efficient but effective fix for this problem and found that not mapping the kernel space when running in user mode is the solution to this problem [4] (the corresponding paper [5] will be presented at ESSoS17). With this RFC patch we allow anybody to configure their kernel with the flag CONFIG_KAISER to add our defense mechanism. If there are any questions we would love to answer them. We also appreciate any comments! Cheers, Daniel (+ the KAISER team from Graz University of Technology) [1] http://www.ieee-security.org/TC/SP2013/papers/4977a191.pdf [2] https://www.blackhat.com/docs/us-16/materials/us-16-Fogh-Using-Undocumented-CPU-Behaviour-To-See-Into-Kernel-Mode-And-Break-KASLR-In-The-Process.pdf [3] https://www.blackhat.com/docs/us-16/materials/us-16-Jang-Breaking-Kernel-Address-Space-Layout-Randomization-KASLR-With-Intel-TSX.pdf [4] https://github.com/IAIK/KAISER [5] https://gruss.cc/files/kaiser.pdf [patch based also on https://raw.githubusercontent.com/IAIK/KAISER/master/KAISER/0001-KAISER-Kernel-Address-Isolation.patch] Signed-off-by: Richard Fellner Signed-off-by: Moritz Lipp Signed-off-by: Daniel Gruss Signed-off-by: Michael Schwarz Acked-by: Jiri Kosina Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 19 ++++- arch/x86/entry/entry_64_compat.S | 6 ++ arch/x86/include/asm/hw_irq.h | 2 +- arch/x86/include/asm/kaiser.h | 113 +++++++++++++++++++++++++ arch/x86/include/asm/pgtable.h | 4 + arch/x86/include/asm/pgtable_64.h | 21 +++++ arch/x86/include/asm/pgtable_types.h | 12 ++- arch/x86/include/asm/processor.h | 7 +- arch/x86/kernel/cpu/common.c | 4 +- arch/x86/kernel/espfix_64.c | 6 ++ arch/x86/kernel/head_64.S | 16 +++- arch/x86/kernel/irqinit.c | 2 +- arch/x86/kernel/process.c | 2 +- arch/x86/mm/Makefile | 1 + arch/x86/mm/kaiser.c | 160 +++++++++++++++++++++++++++++++++++ arch/x86/mm/pageattr.c | 2 +- arch/x86/mm/pgtable.c | 26 ++++++ 17 files changed, 388 insertions(+), 15 deletions(-) create mode 100644 arch/x86/include/asm/kaiser.h create mode 100644 arch/x86/mm/kaiser.c (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index cc0f2f5da19b..273bc5b8d491 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -35,6 +35,7 @@ #include 
#include #include +#include #include /* Avoid __ASSEMBLER__'ifying just for this. */ @@ -135,6 +136,7 @@ ENTRY(entry_SYSCALL_64) * it is too small to ever cause noticeable irq latency. */ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK /* * A hypervisor implementation might want to use a label * after the swapgs, so that it can do the swapgs @@ -207,9 +209,10 @@ entry_SYSCALL_64_fastpath: testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ - RESTORE_C_REGS_EXCEPT_RCX_R11 movq RIP(%rsp), %rcx movq EFLAGS(%rsp), %r11 + RESTORE_C_REGS_EXCEPT_RCX_R11 + SWITCH_USER_CR3 movq RSP(%rsp), %rsp /* * 64-bit SYSRET restores rip from rcx, @@ -347,10 +350,12 @@ GLOBAL(int_ret_from_sys_call) syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ RESTORE_C_REGS_EXCEPT_RCX_R11 + SWITCH_USER_CR3 movq RSP(%rsp), %rsp USERGS_SYSRET64 opportunistic_sysret_failed: + SWITCH_USER_CR3 SWAPGS jmp restore_c_regs_and_iret END(entry_SYSCALL_64) @@ -509,6 +514,7 @@ END(irq_entries_start) * tracking that we're in kernel mode. */ SWAPGS + SWITCH_KERNEL_CR3 /* * We need to tell lockdep that IRQs are off. We can't do this until @@ -568,6 +574,7 @@ GLOBAL(retint_user) mov %rsp,%rdi call prepare_exit_to_usermode TRACE_IRQS_IRETQ + SWITCH_USER_CR3 SWAPGS jmp restore_regs_and_iret @@ -625,6 +632,7 @@ native_irq_return_ldt: pushq %rax pushq %rdi SWAPGS + SWITCH_KERNEL_CR3 movq PER_CPU_VAR(espfix_waddr), %rdi movq %rax, (0*8)(%rdi) /* RAX */ movq (2*8)(%rsp), %rax /* RIP */ @@ -640,6 +648,7 @@ native_irq_return_ldt: andl $0xffff0000, %eax popq %rdi orq PER_CPU_VAR(espfix_stack), %rax + SWITCH_USER_CR3 SWAPGS movq %rax, %rsp popq %rax @@ -1007,6 +1016,7 @@ ENTRY(paranoid_entry) testl %edx, %edx js 1f /* negative -> in kernel */ SWAPGS + SWITCH_KERNEL_CR3 xorl %ebx, %ebx 1: ret END(paranoid_entry) @@ -1029,6 +1039,7 @@ ENTRY(paranoid_exit) testl %ebx, %ebx /* swapgs needed? */ jnz paranoid_exit_no_swapgs TRACE_IRQS_IRETQ + SWITCH_USER_CR3_NO_STACK SWAPGS_UNSAFE_STACK jmp paranoid_exit_restore paranoid_exit_no_swapgs: @@ -1058,6 +1069,7 @@ ENTRY(error_entry) * from user mode due to an IRET fault. */ SWAPGS + SWITCH_KERNEL_CR3 .Lerror_entry_from_usermode_after_swapgs: /* @@ -1110,7 +1122,7 @@ ENTRY(error_entry) * Switch to kernel gsbase: */ SWAPGS - + SWITCH_KERNEL_CR3 /* * Pretend that the exception came from user mode: set up pt_regs * as if we faulted immediately after IRET and clear EBX so that @@ -1210,6 +1222,7 @@ ENTRY(nmi) */ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK cld movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -1250,6 +1263,7 @@ ENTRY(nmi) * work, because we don't want to enable interrupts. Fortunately, * do_nmi doesn't modify pt_regs. */ + SWITCH_USER_CR3 SWAPGS jmp restore_c_regs_and_iret @@ -1461,6 +1475,7 @@ end_repeat_nmi: testl %ebx, %ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: + SWITCH_USER_CR3_NO_STACK SWAPGS_UNSAFE_STACK nmi_restore: RESTORE_EXTRA_REGS diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 15cfebaa7688..fe1911930b52 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -50,6 +51,7 @@ ENDPROC(native_usergs_sysret32) ENTRY(entry_SYSENTER_compat) /* Interrupts are off on entry. 
*/ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* @@ -161,6 +163,7 @@ ENDPROC(entry_SYSENTER_compat) ENTRY(entry_SYSCALL_compat) /* Interrupts are off on entry. */ SWAPGS_UNSAFE_STACK + SWITCH_KERNEL_CR3_NO_STACK /* Stash user ESP and switch to the kernel stack. */ movl %esp, %r8d @@ -208,6 +211,7 @@ ENTRY(entry_SYSCALL_compat) /* Opportunistic SYSRET */ sysret32_from_system_call: TRACE_IRQS_ON /* User mode traces as IRQs on. */ + SWITCH_USER_CR3 movq RBX(%rsp), %rbx /* pt_regs->rbx */ movq RBP(%rsp), %rbp /* pt_regs->rbp */ movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ @@ -269,6 +273,7 @@ ENTRY(entry_INT80_compat) PARAVIRT_ADJUST_EXCEPTION_FRAME ASM_CLAC /* Do this early to minimize exposure */ SWAPGS + SWITCH_KERNEL_CR3_NO_STACK /* * User tracing code (ptrace or signal handlers) might assume that @@ -311,6 +316,7 @@ ENTRY(entry_INT80_compat) /* Go back to user mode. */ TRACE_IRQS_ON + SWITCH_USER_CR3 SWAPGS jmp restore_regs_and_iret END(entry_INT80_compat) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 59caa55fb9b5..ee52ff858699 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -187,7 +187,7 @@ extern char irq_entries_start[]; #define VECTOR_RETRIGGERED ((void *)~0UL) typedef struct irq_desc* vector_irq_t[NR_VECTORS]; -DECLARE_PER_CPU(vector_irq_t, vector_irq); +DECLARE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq); #endif /* !ASSEMBLY_ */ diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h new file mode 100644 index 000000000000..63ee8309b35b --- /dev/null +++ b/arch/x86/include/asm/kaiser.h @@ -0,0 +1,113 @@ +#ifndef _ASM_X86_KAISER_H +#define _ASM_X86_KAISER_H + +/* This file includes the definitions for the KAISER feature. + * KAISER is a counter measure against x86_64 side channel attacks on the kernel virtual memory. + * It has a shodow-pgd for every process. the shadow-pgd has a minimalistic kernel-set mapped, + * but includes the whole user memory. Within a kernel context switch, or when an interrupt is handled, + * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled. + * By this, the virtual memory chaches are freed, and the user may not attack the whole kernel memory. + * + * A minimalistic kernel mapping holds the parts needed to be mapped in user mode, as the entry/exit functions + * of the user space, or the stacks. 
+ */ +#ifdef __ASSEMBLY__ +#ifdef CONFIG_KAISER + +.macro _SWITCH_TO_KERNEL_CR3 reg +movq %cr3, \reg +andq $(~0x1000), \reg +movq \reg, %cr3 +.endm + +.macro _SWITCH_TO_USER_CR3 reg +movq %cr3, \reg +orq $(0x1000), \reg +movq \reg, %cr3 +.endm + +.macro SWITCH_KERNEL_CR3 +pushq %rax +_SWITCH_TO_KERNEL_CR3 %rax +popq %rax +.endm + +.macro SWITCH_USER_CR3 +pushq %rax +_SWITCH_TO_USER_CR3 %rax +popq %rax +.endm + +.macro SWITCH_KERNEL_CR3_NO_STACK +movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) +_SWITCH_TO_KERNEL_CR3 %rax +movq PER_CPU_VAR(unsafe_stack_register_backup), %rax +.endm + + +.macro SWITCH_USER_CR3_NO_STACK + +movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) +_SWITCH_TO_USER_CR3 %rax +movq PER_CPU_VAR(unsafe_stack_register_backup), %rax + +.endm + +#else /* CONFIG_KAISER */ + +.macro SWITCH_KERNEL_CR3 reg +.endm +.macro SWITCH_USER_CR3 reg +.endm +.macro SWITCH_USER_CR3_NO_STACK +.endm +.macro SWITCH_KERNEL_CR3_NO_STACK +.endm + +#endif /* CONFIG_KAISER */ +#else /* __ASSEMBLY__ */ + + +#ifdef CONFIG_KAISER +// Upon kernel/user mode switch, it may happen that +// the address space has to be switched before the registers have been stored. +// To change the address space, another register is needed. +// A register therefore has to be stored/restored. +// +DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +#endif /* CONFIG_KAISER */ + +/** + * shadowmem_add_mapping - map a virtual memory part to the shadow mapping + * @addr: the start address of the range + * @size: the size of the range + * @flags: The mapping flags of the pages + * + * the mapping is done on a global scope, so no bigger synchronization has to be done. + * the pages have to be manually unmapped again when they are not needed any longer. + */ +extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); + + +/** + * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping + * @addr: the start address of the range + * @size: the size of the range + */ +extern void kaiser_remove_mapping(unsigned long start, unsigned long size); + +/** + * shadowmem_initialize_mapping - Initalize the shadow mapping + * + * most parts of the shadow mapping can be mapped upon boot time. + * only the thread stacks have to be mapped on runtime. + * the mapped regions are not unmapped at all. 
+ */ +extern void kaiser_init(void); + +#endif + + + +#endif /* _ASM_X86_KAISER_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6ec0c8b2e9df..6a843d254114 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -856,6 +856,10 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); +#ifdef CONFIG_KAISER + // clone the shadow pgd part as well + memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t)); +#endif } #define PTE_SHIFT ilog2(PTRS_PER_PTE) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 2ee781114d34..2131edda6620 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -106,9 +106,30 @@ static inline void native_pud_clear(pud_t *pud) native_set_pud(pud, native_make_pud(0)); } +#ifdef CONFIG_KAISER +static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) { + return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); +} + +static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) { + return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); +} +#endif /* CONFIG_KAISER */ + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { +#ifdef CONFIG_KAISER + // We know that a pgd is page aligned. + // Therefore the lower indices have to be mapped to user space. + // These pages are mapped to the shadow mapping. + if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) { + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + } + + pgdp->pgd = pgd.pgd & ~_PAGE_USER; +#else /* CONFIG_KAISER */ *pgdp = pgd; +#endif } static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 79c91853e50e..984670491901 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -39,7 +39,11 @@ #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) -#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#ifdef CONFIG_KAISER +#define _PAGE_GLOBAL (_AT(pteval_t, 0)) +#else +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#endif #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) @@ -89,7 +93,11 @@ #define _PAGE_NX (_AT(pteval_t, 0)) #endif -#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +#ifdef CONFIG_KAISER +#define _PAGE_PROTNONE (_AT(pteval_t, 0)) +#else +#define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) +#endif #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2d5a50cb61a2..6a2e0a0b4a96 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -305,7 +305,7 @@ struct tss_struct { } ____cacheline_aligned; -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); +DECLARE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss); #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); @@ -332,6 +332,11 @@ union irq_stack_union { char gs_base[40]; unsigned long stack_canary; }; + + struct { + 
char irq_stack_pointer[64]; + char unused[IRQ_STACK_SIZE - 64]; + }; }; DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index aa1e7246b06b..e5ba9701ee6c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -92,7 +92,7 @@ static const struct cpu_dev default_cpu = { static const struct cpu_dev *this_cpu = &default_cpu; -DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page) = { .gdt = { #ifdef CONFIG_X86_64 /* * We need valid kernel segments for data and code in long mode too @@ -1229,7 +1229,7 @@ static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { [DEBUG_STACK - 1] = DEBUG_STKSZ }; -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks +DEFINE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(char, exception_stacks [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); /* May not be marked __init: used by software suspend */ diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 4d38416e2a7f..bd1358d55146 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -41,6 +41,7 @@ #include #include #include +#include /* * Note: we only need 6*8 = 48 bytes for the espfix stack, but round @@ -126,6 +127,11 @@ void __init init_espfix_bsp(void) /* Install the espfix pud into the kernel page directory */ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); +#ifdef CONFIG_KAISER + // add the esp stack pud to the shadow mapping here. + // This can be done directly, because the fixup stack has its own pud + set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page))); +#endif /* Randomize the locations */ init_espfix_random(); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index ffdc0e860390..0a8d18ac63eb 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -441,6 +441,14 @@ early_idt_ripmsg: .balign PAGE_SIZE; \ GLOBAL(name) +#ifdef CONFIG_KAISER +#define NEXT_PGD_PAGE(name) \ + .balign 2 * PAGE_SIZE; \ +GLOBAL(name) +#else +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#endif + /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ i = 0 ; \ @@ -450,7 +458,7 @@ GLOBAL(name) .endr __INITDATA -NEXT_PAGE(early_level4_pgt) +NEXT_PGD_PAGE(early_level4_pgt) .fill 511,8,0 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE @@ -460,10 +468,10 @@ NEXT_PAGE(early_dynamic_pgts) .data #ifndef CONFIG_XEN -NEXT_PAGE(init_level4_pgt) - .fill 512,8,0 +NEXT_PGD_PAGE(init_level4_pgt) + .fill 2*512,8,0 #else -NEXT_PAGE(init_level4_pgt) +NEXT_PGD_PAGE(init_level4_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .org init_level4_pgt + L4_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 1423ab1b0312..f480b38a03c3 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -51,7 +51,7 @@ static struct irqaction irq2 = { .flags = IRQF_NO_THREAD, }; -DEFINE_PER_CPU(vector_irq_t, vector_irq) = { +DEFINE_PER_CPU_USER_MAPPED(vector_irq_t, vector_irq) = { [0 ... 
NR_VECTORS - 1] = VECTOR_UNUSED, }; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 9f7c21c22477..7c5c5dc90ffa 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -39,7 +39,7 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { +__visible DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct tss_struct, cpu_tss) = { .x86_tss = { .sp0 = TOP_OF_INIT_STACK, #ifdef CONFIG_X86_32 diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 1ae7c141f778..978156081bed 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -32,3 +32,4 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_X86_INTEL_MPX) += mpx.o +obj-$(CONFIG_KAISER) += kaiser.o diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c new file mode 100644 index 000000000000..cf1bb922d467 --- /dev/null +++ b/arch/x86/mm/kaiser.c @@ -0,0 +1,160 @@ + + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#ifdef CONFIG_KAISER + +__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +/** + * Get the real ppn from a address in kernel mapping. + * @param address The virtual adrress + * @return the physical address + */ +static inline unsigned long get_pa_from_mapping (unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + pgd = pgd_offset_k(address); + BUG_ON(pgd_none(*pgd) || pgd_large(*pgd)); + + pud = pud_offset(pgd, address); + BUG_ON(pud_none(*pud)); + + if (pud_large(*pud)) { + return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK); + } + + pmd = pmd_offset(pud, address); + BUG_ON(pmd_none(*pmd)); + + if (pmd_large(*pmd)) { + return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK); + } + + pte = pte_offset_kernel(pmd, address); + BUG_ON(pte_none(*pte)); + + return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK); +} + +void _kaiser_copy (unsigned long start_addr, unsigned long size, + unsigned long flags) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + unsigned long address; + unsigned long end_addr = start_addr + size; + unsigned long target_address; + + for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1)); + address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) { + target_address = get_pa_from_mapping(address); + + pgd = native_get_shadow_pgd(pgd_offset_k(address)); + + BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n"); + BUG_ON(pgd_large(*pgd)); + + pud = pud_offset(pgd, address); + if (pud_none(*pud)) { + set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address)))); + } + BUG_ON(pud_large(*pud)); + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) { + set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address)))); + } + BUG_ON(pmd_large(*pmd)); + + pte = pte_offset_kernel(pmd, address); + if (pte_none(*pte)) { + set_pte(pte, __pte(flags | target_address)); + } else { + BUG_ON(__pa(pte_page(*pte)) != target_address); + } + } +} + +// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping +static inline void __init _kaiser_init(void) +{ + pgd_t *pgd; + int i = 0; + + pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); + for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { + set_pgd(pgd + i, __pgd(_PAGE_TABLE 
|__pa(pud_alloc_one(0, 0)))); + } +} + +extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; +spinlock_t shadow_table_lock; +void __init kaiser_init(void) +{ + int cpu; + spin_lock_init(&shadow_table_lock); + + spin_lock(&shadow_table_lock); + + _kaiser_init(); + + for_each_possible_cpu(cpu) { + // map the per cpu user variables + _kaiser_copy( + (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)), + (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start, + __PAGE_KERNEL); + } + + // map the entry/exit text section, which is responsible to switch between user- and kernel mode + _kaiser_copy( + (unsigned long) __entry_text_start, + (unsigned long) __entry_text_end - (unsigned long) __entry_text_start, + __PAGE_KERNEL_RX); + + // the fixed map address of the idt_table + _kaiser_copy( + (unsigned long) idt_descr.address, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL_RO); + + spin_unlock(&shadow_table_lock); +} + +// add a mapping to the shadow-mapping, and synchronize the mappings +void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) +{ + spin_lock(&shadow_table_lock); + _kaiser_copy(addr, size, flags); + spin_unlock(&shadow_table_lock); +} + +extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end); +void kaiser_remove_mapping(unsigned long start, unsigned long size) +{ + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start)); + spin_lock(&shadow_table_lock); + do { + unmap_pud_range(pgd, start, start + size); + } while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size))); + spin_unlock(&shadow_table_lock); +} +#endif /* CONFIG_KAISER */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index b599a780a5a9..4fe616bd2586 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -829,7 +829,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) pud_clear(pud); } -static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) { pud_t *pud = pud_offset(pgd, start); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index fb0a9dd1d6e4..087d3e1e7125 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -342,12 +342,38 @@ static inline void _pgd_free(pgd_t *pgd) #else static inline pgd_t *_pgd_alloc(void) { +#ifdef CONFIG_KAISER + // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory + // block. Therefore, we have to allocate at least 3 pages. However, the + // __get_free_pages returns us 4 pages. Hence, we store the base pointer at + // the beginning of the page of our 8kb-aligned memory block in order to + // correctly free it afterwars. 
+ + unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE)); + + if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages) + { + *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages; + return (pgd_t *) pages; + } + else + { + *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages; + return (pgd_t *) (pages + PAGE_SIZE); + } +#else return (pgd_t *)__get_free_page(PGALLOC_GFP); +#endif } static inline void _pgd_free(pgd_t *pgd) { +#ifdef CONFIG_KAISER + unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE)); + free_pages(pages, get_order(4*PAGE_SIZE)); +#else free_page((unsigned long)pgd); +#endif } #endif /* CONFIG_X86_PAE */ -- cgit v1.2.3 From bed9bb7f3e6d4045013d2bb9e4004896de57f02b Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 30 Aug 2017 16:23:00 -0700 Subject: kaiser: merged update Merged fixes and cleanups, rebased to 4.4.89 tree (no 5-level paging). Signed-off-by: Dave Hansen Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 106 ++++++++++-- arch/x86/include/asm/kaiser.h | 43 +++-- arch/x86/include/asm/pgtable.h | 18 +- arch/x86/include/asm/pgtable_64.h | 48 +++++- arch/x86/include/asm/pgtable_types.h | 6 +- arch/x86/kernel/espfix_64.c | 13 +- arch/x86/kernel/head_64.S | 19 ++- arch/x86/kernel/ldt.c | 27 ++- arch/x86/kernel/tracepoint.c | 2 + arch/x86/mm/kaiser.c | 314 +++++++++++++++++++++++++---------- arch/x86/mm/pageattr.c | 63 +++++-- arch/x86/mm/pgtable.c | 40 ++--- 12 files changed, 515 insertions(+), 184 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 273bc5b8d491..99bce319df0c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -212,6 +212,13 @@ entry_SYSCALL_64_fastpath: movq RIP(%rsp), %rcx movq EFLAGS(%rsp), %r11 RESTORE_C_REGS_EXCEPT_RCX_R11 + /* + * This opens a window where we have a user CR3, but are + * running in the kernel. This makes using the CS + * register useless for telling whether or not we need to + * switch CR3 in NMIs. Normal interrupts are OK because + * they are off here. + */ SWITCH_USER_CR3 movq RSP(%rsp), %rsp /* @@ -350,11 +357,25 @@ GLOBAL(int_ret_from_sys_call) syscall_return_via_sysret: /* rcx and r11 are already restored (see code above) */ RESTORE_C_REGS_EXCEPT_RCX_R11 + /* + * This opens a window where we have a user CR3, but are + * running in the kernel. This makes using the CS + * register useless for telling whether or not we need to + * switch CR3 in NMIs. Normal interrupts are OK because + * they are off here. + */ SWITCH_USER_CR3 movq RSP(%rsp), %rsp USERGS_SYSRET64 opportunistic_sysret_failed: + /* + * This opens a window where we have a user CR3, but are + * running in the kernel. This makes using the CS + * register useless for telling whether or not we need to + * switch CR3 in NMIs. Normal interrupts are OK because + * they are off here. + */ SWITCH_USER_CR3 SWAPGS jmp restore_c_regs_and_iret @@ -1059,6 +1080,13 @@ ENTRY(error_entry) cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 + /* + * error_entry() always returns with a kernel gsbase and + * CR3. We must also have a kernel CR3/gsbase before + * calling TRACE_IRQS_*. Just unconditionally switch to + * the kernel CR3 here. + */ + SWITCH_KERNEL_CR3 xorl %ebx, %ebx testb $3, CS+8(%rsp) jz .Lerror_kernelspace @@ -1069,7 +1097,6 @@ ENTRY(error_entry) * from user mode due to an IRET fault. 
*/ SWAPGS - SWITCH_KERNEL_CR3 .Lerror_entry_from_usermode_after_swapgs: /* @@ -1122,7 +1149,7 @@ ENTRY(error_entry) * Switch to kernel gsbase: */ SWAPGS - SWITCH_KERNEL_CR3 + /* * Pretend that the exception came from user mode: set up pt_regs * as if we faulted immediately after IRET and clear EBX so that @@ -1222,7 +1249,10 @@ ENTRY(nmi) */ SWAPGS_UNSAFE_STACK - SWITCH_KERNEL_CR3_NO_STACK + /* + * percpu variables are mapped with user CR3, so no need + * to switch CR3 here. + */ cld movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp @@ -1256,14 +1286,33 @@ ENTRY(nmi) movq %rsp, %rdi movq $-1, %rsi +#ifdef CONFIG_KAISER + /* Unconditionally use kernel CR3 for do_nmi() */ + /* %rax is saved above, so OK to clobber here */ + movq %cr3, %rax + pushq %rax +#ifdef CONFIG_KAISER_REAL_SWITCH + andq $(~0x1000), %rax +#endif + movq %rax, %cr3 +#endif call do_nmi + /* + * Unconditionally restore CR3. I know we return to + * kernel code that needs user CR3, but do we ever return + * to "user mode" where we need the kernel CR3? + */ +#ifdef CONFIG_KAISER + popq %rax + mov %rax, %cr3 +#endif /* * Return back to user mode. We must *not* do the normal exit - * work, because we don't want to enable interrupts. Fortunately, - * do_nmi doesn't modify pt_regs. + * work, because we don't want to enable interrupts. Do not + * switch to user CR3: we might be going back to kernel code + * that had a user CR3 set. */ - SWITCH_USER_CR3 SWAPGS jmp restore_c_regs_and_iret @@ -1459,23 +1508,54 @@ end_repeat_nmi: ALLOC_PT_GPREGS_ON_STACK /* - * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit - * as we should not be calling schedule in NMI context. - * Even with normal interrupts enabled. An NMI should not be - * setting NEED_RESCHED or anything that normal interrupts and - * exceptions might do. + * Use the same approach as paranoid_entry to handle SWAPGS, but + * without CR3 handling since we do that differently in NMIs. No + * need to use paranoid_exit as we should not be calling schedule + * in NMI context. Even with normal interrupts enabled. An NMI + * should not be setting NEED_RESCHED or anything that normal + * interrupts and exceptions might do. */ - call paranoid_entry + cld + SAVE_C_REGS + SAVE_EXTRA_REGS + movl $1, %ebx + movl $MSR_GS_BASE, %ecx + rdmsr + testl %edx, %edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx, %ebx +1: +#ifdef CONFIG_KAISER + /* Unconditionally use kernel CR3 for do_nmi() */ + /* %rax is saved above, so OK to clobber here */ + movq %cr3, %rax + pushq %rax +#ifdef CONFIG_KAISER_REAL_SWITCH + andq $(~0x1000), %rax +#endif + movq %rax, %cr3 +#endif /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp, %rdi + addq $8, %rdi /* point %rdi at ptregs, fixed up for CR3 */ movq $-1, %rsi call do_nmi + /* + * Unconditionally restore CR3. We might be returning to + * kernel code that needs user CR3, like just just before + * a sysret. + */ +#ifdef CONFIG_KAISER + popq %rax + mov %rax, %cr3 +#endif testl %ebx, %ebx /* swapgs needed? 
*/ jnz nmi_restore nmi_swapgs: - SWITCH_USER_CR3_NO_STACK + /* We fixed up CR3 above, so no need to switch it here */ SWAPGS_UNSAFE_STACK nmi_restore: RESTORE_EXTRA_REGS diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 63ee8309b35b..0703f48777f3 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -16,13 +16,17 @@ .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg +#ifdef CONFIG_KAISER_REAL_SWITCH andq $(~0x1000), \reg +#endif movq \reg, %cr3 .endm .macro _SWITCH_TO_USER_CR3 reg movq %cr3, \reg +#ifdef CONFIG_KAISER_REAL_SWITCH orq $(0x1000), \reg +#endif movq \reg, %cr3 .endm @@ -65,48 +69,53 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .endm #endif /* CONFIG_KAISER */ + #else /* __ASSEMBLY__ */ #ifdef CONFIG_KAISER -// Upon kernel/user mode switch, it may happen that -// the address space has to be switched before the registers have been stored. -// To change the address space, another register is needed. -// A register therefore has to be stored/restored. -// -DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); +/* + * Upon kernel/user mode switch, it may happen that the address + * space has to be switched before the registers have been + * stored. To change the address space, another register is + * needed. A register therefore has to be stored/restored. +*/ -#endif /* CONFIG_KAISER */ +DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); /** - * shadowmem_add_mapping - map a virtual memory part to the shadow mapping + * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping * @addr: the start address of the range * @size: the size of the range * @flags: The mapping flags of the pages * - * the mapping is done on a global scope, so no bigger synchronization has to be done. - * the pages have to be manually unmapped again when they are not needed any longer. + * The mapping is done on a global scope, so no bigger + * synchronization has to be done. the pages have to be + * manually unmapped again when they are not needed any longer. */ -extern void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); +extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); /** - * shadowmem_remove_mapping - unmap a virtual memory part of the shadow mapping + * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping * @addr: the start address of the range * @size: the size of the range */ extern void kaiser_remove_mapping(unsigned long start, unsigned long size); /** - * shadowmem_initialize_mapping - Initalize the shadow mapping + * kaiser_initialize_mapping - Initalize the shadow mapping * - * most parts of the shadow mapping can be mapped upon boot time. - * only the thread stacks have to be mapped on runtime. - * the mapped regions are not unmapped at all. + * Most parts of the shadow mapping can be mapped upon boot + * time. Only per-process things like the thread stacks + * or a new LDT have to be mapped at runtime. These boot- + * time mappings are permanent and nevertunmapped. 
*/ extern void kaiser_init(void); -#endif +#endif /* CONFIG_KAISER */ + +#endif /* __ASSEMBLY */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 6a843d254114..176035fa057e 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -653,7 +653,17 @@ static inline pud_t *pud_offset(pgd_t *pgd, unsigned long address) static inline int pgd_bad(pgd_t pgd) { - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; + pgdval_t ignore_flags = _PAGE_USER; + /* + * We set NX on KAISER pgds that map userspace memory so + * that userspace can not meaningfully use the kernel + * page table by accident; it will fault on the first + * instruction it tries to run. See native_set_pgd(). + */ + if (IS_ENABLED(CONFIG_KAISER)) + ignore_flags |= _PAGE_NX; + + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; } static inline int pgd_none(pgd_t pgd) @@ -857,8 +867,10 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); #ifdef CONFIG_KAISER - // clone the shadow pgd part as well - memcpy(native_get_shadow_pgd(dst), native_get_shadow_pgd(src), count * sizeof(pgd_t)); + /* Clone the shadow pgd part as well */ + memcpy(native_get_shadow_pgd(dst), + native_get_shadow_pgd(src), + count * sizeof(pgd_t)); #endif } diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 2131edda6620..b0adf0d11c9a 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -107,26 +107,58 @@ static inline void native_pud_clear(pud_t *pud) } #ifdef CONFIG_KAISER -static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) { +static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) +{ return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); } -static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) { +static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) +{ return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); } +#else +static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) +{ + BUILD_BUG_ON(1); + return NULL; +} +static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) +{ + return pgdp; +} #endif /* CONFIG_KAISER */ +/* + * Page table pages are page-aligned. The lower half of the top + * level is used for userspace and the top half for the kernel. + * This returns true for user pages that need to get copied into + * both the user and kernel copies of the page tables, and false + * for kernel pages that should only be in the kernel copy. + */ +static inline bool is_userspace_pgd(void *__ptr) +{ + unsigned long ptr = (unsigned long)__ptr; + + return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2)); +} + static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { #ifdef CONFIG_KAISER - // We know that a pgd is page aligned. - // Therefore the lower indices have to be mapped to user space. - // These pages are mapped to the shadow mapping. - if ((((unsigned long)pgdp) % PAGE_SIZE) < (PAGE_SIZE / 2)) { + pteval_t extra_kern_pgd_flags = 0; + /* Do we need to also populate the shadow pgd? */ + if (is_userspace_pgd(pgdp)) { native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + /* + * Even if the entry is *mapping* userspace, ensure + * that userspace can not use it. This way, if we + * get out to userspace running on the kernel CR3, + * userspace will crash instead of running. 
+ */ + extra_kern_pgd_flags = _PAGE_NX; } - - pgdp->pgd = pgd.pgd & ~_PAGE_USER; + pgdp->pgd = pgd.pgd; + pgdp->pgd |= extra_kern_pgd_flags; #else /* CONFIG_KAISER */ *pgdp = pgd; #endif diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 984670491901..a70d6100b3df 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -42,7 +42,7 @@ #ifdef CONFIG_KAISER #define _PAGE_GLOBAL (_AT(pteval_t, 0)) #else -#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) +#define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) #endif #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) @@ -93,11 +93,7 @@ #define _PAGE_NX (_AT(pteval_t, 0)) #endif -#ifdef CONFIG_KAISER -#define _PAGE_PROTNONE (_AT(pteval_t, 0)) -#else #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) -#endif #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index bd1358d55146..54de4c70cbd6 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -127,11 +127,14 @@ void __init init_espfix_bsp(void) /* Install the espfix pud into the kernel page directory */ pgd_p = &init_level4_pgt[pgd_index(ESPFIX_BASE_ADDR)]; pgd_populate(&init_mm, pgd_p, (pud_t *)espfix_pud_page); -#ifdef CONFIG_KAISER - // add the esp stack pud to the shadow mapping here. - // This can be done directly, because the fixup stack has its own pud - set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_PAGE_TABLE | __pa((pud_t *)espfix_pud_page))); -#endif + /* + * Just copy the top-level PGD that is mapping the espfix + * area to ensure it is mapped into the shadow user page + * tables. + */ + if (IS_ENABLED(CONFIG_KAISER)) + set_pgd(native_get_shadow_pgd(pgd_p), + __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); /* Randomize the locations */ init_espfix_random(); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 0a8d18ac63eb..abbbcfbb5e26 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -442,11 +442,24 @@ early_idt_ripmsg: GLOBAL(name) #ifdef CONFIG_KAISER +/* + * Each PGD needs to be 8k long and 8k aligned. We do not + * ever go out to userspace with these, so we do not + * strictly *need* the second page, but this allows us to + * have a single set_pgd() implementation that does not + * need to worry about whether it has 4k or 8k to work + * with. 
+ * + * This ensures PGDs are 8k long: + */ +#define KAISER_USER_PGD_FILL 512 +/* This ensures they are 8k-aligned: */ #define NEXT_PGD_PAGE(name) \ .balign 2 * PAGE_SIZE; \ GLOBAL(name) #else #define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#define KAISER_USER_PGD_FILL 0 #endif /* Automate the creation of 1 to 1 mapping pmd entries */ @@ -461,6 +474,7 @@ GLOBAL(name) NEXT_PGD_PAGE(early_level4_pgt) .fill 511,8,0 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .fill KAISER_USER_PGD_FILL,8,0 NEXT_PAGE(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 @@ -469,7 +483,8 @@ NEXT_PAGE(early_dynamic_pgts) #ifndef CONFIG_XEN NEXT_PGD_PAGE(init_level4_pgt) - .fill 2*512,8,0 + .fill 512,8,0 + .fill KAISER_USER_PGD_FILL,8,0 #else NEXT_PGD_PAGE(init_level4_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE @@ -478,6 +493,7 @@ NEXT_PGD_PAGE(init_level4_pgt) .org init_level4_pgt + L4_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .fill KAISER_USER_PGD_FILL,8,0 NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE @@ -488,6 +504,7 @@ NEXT_PAGE(level2_ident_pgt) */ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) #endif + .fill KAISER_USER_PGD_FILL,8,0 NEXT_PAGE(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index d6279593bcdd..1062e88ff040 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -34,11 +35,21 @@ static void flush_ldt(void *current_mm) set_ldt(pc->ldt->entries, pc->ldt->size); } +static void __free_ldt_struct(struct ldt_struct *ldt) +{ + if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) + vfree(ldt->entries); + else + free_page((unsigned long)ldt->entries); + kfree(ldt); +} + /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ static struct ldt_struct *alloc_ldt_struct(int size) { struct ldt_struct *new_ldt; int alloc_size; + int ret = 0; if (size > LDT_ENTRIES) return NULL; @@ -66,6 +77,14 @@ static struct ldt_struct *alloc_ldt_struct(int size) return NULL; } + // FIXME: make kaiser_add_mapping() return an error code + // when it fails + kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, + __PAGE_KERNEL); + if (ret) { + __free_ldt_struct(new_ldt); + return NULL; + } new_ldt->size = size; return new_ldt; } @@ -92,12 +111,10 @@ static void free_ldt_struct(struct ldt_struct *ldt) if (likely(!ldt)) return; + kaiser_remove_mapping((unsigned long)ldt->entries, + ldt->size * LDT_ENTRY_SIZE); paravirt_free_ldt(ldt->entries, ldt->size); - if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) - vfree(ldt->entries); - else - free_page((unsigned long)ldt->entries); - kfree(ldt); + __free_ldt_struct(ldt); } /* diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c index 1c113db9ed57..2bb5ee464df3 100644 --- a/arch/x86/kernel/tracepoint.c +++ b/arch/x86/kernel/tracepoint.c @@ -9,10 +9,12 @@ #include atomic_t trace_idt_ctr = ATOMIC_INIT(0); +__aligned(PAGE_SIZE) struct desc_ptr trace_idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) trace_idt_table }; /* No need to be aligned, but done to keep all IDTs defined the same way. 
*/ +__aligned(PAGE_SIZE) gate_desc trace_idt_table[NR_VECTORS] __page_aligned_bss; static int trace_irq_vector_refcount; diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index cf1bb922d467..960071d258c3 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -1,160 +1,306 @@ - - +#include #include #include #include #include #include #include +#include #include #include - #include +#include + +#include #include #include #include #ifdef CONFIG_KAISER __visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); +/* + * At runtime, the only things we map are some things for CPU + * hotplug, and stacks for new processes. No two CPUs will ever + * be populating the same addresses, so we only need to ensure + * that we protect between two CPUs trying to allocate and + * populate the same page table page. + * + * Only take this lock when doing a set_p[4um]d(), but it is not + * needed for doing a set_pte(). We assume that only the *owner* + * of a given allocation will be doing this for _their_ + * allocation. + * + * This ensures that once a system has been running for a while + * and there have been stacks all over and these page tables + * are fully populated, there will be no further acquisitions of + * this lock. + */ +static DEFINE_SPINLOCK(shadow_table_allocation_lock); -/** - * Get the real ppn from a address in kernel mapping. - * @param address The virtual adrress - * @return the physical address +/* + * Returns -1 on error. */ -static inline unsigned long get_pa_from_mapping (unsigned long address) +static inline unsigned long get_pa_from_mapping(unsigned long vaddr) { pgd_t *pgd; pud_t *pud; pmd_t *pmd; pte_t *pte; - pgd = pgd_offset_k(address); - BUG_ON(pgd_none(*pgd) || pgd_large(*pgd)); - - pud = pud_offset(pgd, address); - BUG_ON(pud_none(*pud)); + pgd = pgd_offset_k(vaddr); + /* + * We made all the kernel PGDs present in kaiser_init(). + * We expect them to stay that way. + */ + BUG_ON(pgd_none(*pgd)); + /* + * PGDs are either 512GB or 128TB on all x86_64 + * configurations. We don't handle these. + */ + BUG_ON(pgd_large(*pgd)); - if (pud_large(*pud)) { - return (pud_pfn(*pud) << PAGE_SHIFT) | (address & ~PUD_PAGE_MASK); + pud = pud_offset(pgd, vaddr); + if (pud_none(*pud)) { + WARN_ON_ONCE(1); + return -1; } - pmd = pmd_offset(pud, address); - BUG_ON(pmd_none(*pmd)); + if (pud_large(*pud)) + return (pud_pfn(*pud) << PAGE_SHIFT) | (vaddr & ~PUD_PAGE_MASK); - if (pmd_large(*pmd)) { - return (pmd_pfn(*pmd) << PAGE_SHIFT) | (address & ~PMD_PAGE_MASK); + pmd = pmd_offset(pud, vaddr); + if (pmd_none(*pmd)) { + WARN_ON_ONCE(1); + return -1; } - pte = pte_offset_kernel(pmd, address); - BUG_ON(pte_none(*pte)); + if (pmd_large(*pmd)) + return (pmd_pfn(*pmd) << PAGE_SHIFT) | (vaddr & ~PMD_PAGE_MASK); - return (pte_pfn(*pte) << PAGE_SHIFT) | (address & ~PAGE_MASK); + pte = pte_offset_kernel(pmd, vaddr); + if (pte_none(*pte)) { + WARN_ON_ONCE(1); + return -1; + } + + return (pte_pfn(*pte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK); } -void _kaiser_copy (unsigned long start_addr, unsigned long size, - unsigned long flags) +/* + * This is a relatively normal page table walk, except that it + * also tries to allocate page tables pages along the way. + * + * Returns a pointer to a PTE on success, or NULL on failure. 
+ */ +static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) { - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; - pte_t *pte; - unsigned long address; - unsigned long end_addr = start_addr + size; - unsigned long target_address; + pud_t *pud; + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); + gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); - for (address = PAGE_ALIGN(start_addr - (PAGE_SIZE - 1)); - address < PAGE_ALIGN(end_addr); address += PAGE_SIZE) { - target_address = get_pa_from_mapping(address); + might_sleep(); + if (is_atomic) { + gfp &= ~GFP_KERNEL; + gfp |= __GFP_HIGH | __GFP_ATOMIC; + } - pgd = native_get_shadow_pgd(pgd_offset_k(address)); + if (pgd_none(*pgd)) { + WARN_ONCE(1, "All shadow pgds should have been populated"); + return NULL; + } + BUILD_BUG_ON(pgd_large(*pgd) != 0); - BUG_ON(pgd_none(*pgd) && "All shadow pgds should be mapped at this time\n"); - BUG_ON(pgd_large(*pgd)); + pud = pud_offset(pgd, address); + /* The shadow page tables do not use large mappings: */ + if (pud_large(*pud)) { + WARN_ON(1); + return NULL; + } + if (pud_none(*pud)) { + unsigned long new_pmd_page = __get_free_page(gfp); + if (!new_pmd_page) + return NULL; + spin_lock(&shadow_table_allocation_lock); + if (pud_none(*pud)) + set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); + else + free_page(new_pmd_page); + spin_unlock(&shadow_table_allocation_lock); + } - pud = pud_offset(pgd, address); - if (pud_none(*pud)) { - set_pud(pud, __pud(_PAGE_TABLE | __pa(pmd_alloc_one(0, address)))); - } - BUG_ON(pud_large(*pud)); + pmd = pmd_offset(pud, address); + /* The shadow page tables do not use large mappings: */ + if (pmd_large(*pmd)) { + WARN_ON(1); + return NULL; + } + if (pmd_none(*pmd)) { + unsigned long new_pte_page = __get_free_page(gfp); + if (!new_pte_page) + return NULL; + spin_lock(&shadow_table_allocation_lock); + if (pmd_none(*pmd)) + set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); + else + free_page(new_pte_page); + spin_unlock(&shadow_table_allocation_lock); + } - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) { - set_pmd(pmd, __pmd(_PAGE_TABLE | __pa(pte_alloc_one_kernel(0, address)))); - } - BUG_ON(pmd_large(*pmd)); + return pte_offset_kernel(pmd, address); +} - pte = pte_offset_kernel(pmd, address); +int kaiser_add_user_map(const void *__start_addr, unsigned long size, + unsigned long flags) +{ + int ret = 0; + pte_t *pte; + unsigned long start_addr = (unsigned long )__start_addr; + unsigned long address = start_addr & PAGE_MASK; + unsigned long end_addr = PAGE_ALIGN(start_addr + size); + unsigned long target_address; + + for (;address < end_addr; address += PAGE_SIZE) { + target_address = get_pa_from_mapping(address); + if (target_address == -1) { + ret = -EIO; + break; + } + pte = kaiser_pagetable_walk(address, false); if (pte_none(*pte)) { set_pte(pte, __pte(flags | target_address)); } else { - BUG_ON(__pa(pte_page(*pte)) != target_address); + pte_t tmp; + set_pte(&tmp, __pte(flags | target_address)); + WARN_ON_ONCE(!pte_same(*pte, tmp)); } } + return ret; +} + +static int kaiser_add_user_map_ptrs(const void *start, const void *end, unsigned long flags) +{ + unsigned long size = end - start; + + return kaiser_add_user_map(start, size, flags); } -// at first, add a pmd for every pgd entry in the shadowmem-kernel-part of the kernel mapping -static inline void __init _kaiser_init(void) +/* + * Ensure that the top level of the (shadow) page tables are + * entirely populated. 
This ensures that all processes that get + * forked have the same entries. This way, we do not have to + * ever go set up new entries in older processes. + * + * Note: we never free these, so there are no updates to them + * after this. + */ +static void __init kaiser_init_all_pgds(void) { pgd_t *pgd; int i = 0; pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { - set_pgd(pgd + i, __pgd(_PAGE_TABLE |__pa(pud_alloc_one(0, 0)))); + pgd_t new_pgd; + pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE); + if (!pud) { + WARN_ON(1); + break; + } + new_pgd = __pgd(_KERNPG_TABLE |__pa(pud)); + /* + * Make sure not to stomp on some other pgd entry. + */ + if (!pgd_none(pgd[i])) { + WARN_ON(1); + continue; + } + set_pgd(pgd + i, new_pgd); } } +#define kaiser_add_user_map_early(start, size, flags) do { \ + int __ret = kaiser_add_user_map(start, size, flags); \ + WARN_ON(__ret); \ +} while (0) + +#define kaiser_add_user_map_ptrs_early(start, end, flags) do { \ + int __ret = kaiser_add_user_map_ptrs(start, end, flags); \ + WARN_ON(__ret); \ +} while (0) + extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; -spinlock_t shadow_table_lock; +/* + * If anything in here fails, we will likely die on one of the + * first kernel->user transitions and init will die. But, we + * will have most of the kernel up by then and should be able to + * get a clean warning out of it. If we BUG_ON() here, we run + * the risk of being before we have good console output. + */ void __init kaiser_init(void) { int cpu; - spin_lock_init(&shadow_table_lock); - - spin_lock(&shadow_table_lock); - _kaiser_init(); + kaiser_init_all_pgds(); for_each_possible_cpu(cpu) { - // map the per cpu user variables - _kaiser_copy( - (unsigned long) (__per_cpu_user_mapped_start + per_cpu_offset(cpu)), - (unsigned long) __per_cpu_user_mapped_end - (unsigned long) __per_cpu_user_mapped_start, - __PAGE_KERNEL); + void *percpu_vaddr = __per_cpu_user_mapped_start + + per_cpu_offset(cpu); + unsigned long percpu_sz = __per_cpu_user_mapped_end - + __per_cpu_user_mapped_start; + kaiser_add_user_map_early(percpu_vaddr, percpu_sz, + __PAGE_KERNEL); } - // map the entry/exit text section, which is responsible to switch between user- and kernel mode - _kaiser_copy( - (unsigned long) __entry_text_start, - (unsigned long) __entry_text_end - (unsigned long) __entry_text_start, - __PAGE_KERNEL_RX); + /* + * Map the entry/exit text section, which is needed at + * switches from user to and from kernel. 
+ */ + kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, + __PAGE_KERNEL_RX); - // the fixed map address of the idt_table - _kaiser_copy( - (unsigned long) idt_descr.address, - sizeof(gate_desc) * NR_VECTORS, - __PAGE_KERNEL_RO); - - spin_unlock(&shadow_table_lock); +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) + kaiser_add_user_map_ptrs_early(__irqentry_text_start, + __irqentry_text_end, + __PAGE_KERNEL_RX); +#endif + kaiser_add_user_map_early((void *)idt_descr.address, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL_RO); +#ifdef CONFIG_TRACING + kaiser_add_user_map_early(&trace_idt_descr, + sizeof(trace_idt_descr), + __PAGE_KERNEL); + kaiser_add_user_map_early(&trace_idt_table, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL); +#endif + kaiser_add_user_map_early(&debug_idt_descr, sizeof(debug_idt_descr), + __PAGE_KERNEL); + kaiser_add_user_map_early(&debug_idt_table, + sizeof(gate_desc) * NR_VECTORS, + __PAGE_KERNEL); } +extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end); // add a mapping to the shadow-mapping, and synchronize the mappings -void kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) +int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) { - spin_lock(&shadow_table_lock); - _kaiser_copy(addr, size, flags); - spin_unlock(&shadow_table_lock); + return kaiser_add_user_map((const void *)addr, size, flags); } -extern void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end); void kaiser_remove_mapping(unsigned long start, unsigned long size) { - pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(start)); - spin_lock(&shadow_table_lock); - do { - unmap_pud_range(pgd, start, start + size); - } while (pgd++ != native_get_shadow_pgd(pgd_offset_k(start + size))); - spin_unlock(&shadow_table_lock); + unsigned long end = start + size; + unsigned long addr; + + for (addr = start; addr < end; addr += PGDIR_SIZE) { + pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr)); + /* + * unmap_p4d_range() handles > P4D_SIZE unmaps, + * so no need to trim 'end'. 
+ */ + unmap_pud_range_nofree(pgd, addr, end); + } } #endif /* CONFIG_KAISER */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 4fe616bd2586..79377e2a7bcd 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -52,6 +52,7 @@ static DEFINE_SPINLOCK(cpa_lock); #define CPA_FLUSHTLB 1 #define CPA_ARRAY 2 #define CPA_PAGES_ARRAY 4 +#define CPA_FREE_PAGETABLES 8 #ifdef CONFIG_PROC_FS static unsigned long direct_pages_count[PG_LEVEL_NUM]; @@ -723,10 +724,13 @@ static int split_large_page(struct cpa_data *cpa, pte_t *kpte, return 0; } -static bool try_to_free_pte_page(pte_t *pte) +static bool try_to_free_pte_page(struct cpa_data *cpa, pte_t *pte) { int i; + if (!(cpa->flags & CPA_FREE_PAGETABLES)) + return false; + for (i = 0; i < PTRS_PER_PTE; i++) if (!pte_none(pte[i])) return false; @@ -735,10 +739,13 @@ static bool try_to_free_pte_page(pte_t *pte) return true; } -static bool try_to_free_pmd_page(pmd_t *pmd) +static bool try_to_free_pmd_page(struct cpa_data *cpa, pmd_t *pmd) { int i; + if (!(cpa->flags & CPA_FREE_PAGETABLES)) + return false; + for (i = 0; i < PTRS_PER_PMD; i++) if (!pmd_none(pmd[i])) return false; @@ -759,7 +766,9 @@ static bool try_to_free_pud_page(pud_t *pud) return true; } -static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) +static bool unmap_pte_range(struct cpa_data *cpa, pmd_t *pmd, + unsigned long start, + unsigned long end) { pte_t *pte = pte_offset_kernel(pmd, start); @@ -770,22 +779,23 @@ static bool unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) pte++; } - if (try_to_free_pte_page((pte_t *)pmd_page_vaddr(*pmd))) { + if (try_to_free_pte_page(cpa, (pte_t *)pmd_page_vaddr(*pmd))) { pmd_clear(pmd); return true; } return false; } -static void __unmap_pmd_range(pud_t *pud, pmd_t *pmd, +static void __unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, pmd_t *pmd, unsigned long start, unsigned long end) { - if (unmap_pte_range(pmd, start, end)) - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) + if (unmap_pte_range(cpa, pmd, start, end)) + if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) pud_clear(pud); } -static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) +static void unmap_pmd_range(struct cpa_data *cpa, pud_t *pud, + unsigned long start, unsigned long end) { pmd_t *pmd = pmd_offset(pud, start); @@ -796,7 +806,7 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) unsigned long next_page = (start + PMD_SIZE) & PMD_MASK; unsigned long pre_end = min_t(unsigned long, end, next_page); - __unmap_pmd_range(pud, pmd, start, pre_end); + __unmap_pmd_range(cpa, pud, pmd, start, pre_end); start = pre_end; pmd++; @@ -809,7 +819,8 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) if (pmd_large(*pmd)) pmd_clear(pmd); else - __unmap_pmd_range(pud, pmd, start, start + PMD_SIZE); + __unmap_pmd_range(cpa, pud, pmd, + start, start + PMD_SIZE); start += PMD_SIZE; pmd++; @@ -819,17 +830,19 @@ static void unmap_pmd_range(pud_t *pud, unsigned long start, unsigned long end) * 4K leftovers? */ if (start < end) - return __unmap_pmd_range(pud, pmd, start, end); + return __unmap_pmd_range(cpa, pud, pmd, start, end); /* * Try again to free the PMD page if haven't succeeded above. 
*/ if (!pud_none(*pud)) - if (try_to_free_pmd_page((pmd_t *)pud_page_vaddr(*pud))) + if (try_to_free_pmd_page(cpa, (pmd_t *)pud_page_vaddr(*pud))) pud_clear(pud); } -void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +static void __unmap_pud_range(struct cpa_data *cpa, pgd_t *pgd, + unsigned long start, + unsigned long end) { pud_t *pud = pud_offset(pgd, start); @@ -840,7 +853,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) unsigned long next_page = (start + PUD_SIZE) & PUD_MASK; unsigned long pre_end = min_t(unsigned long, end, next_page); - unmap_pmd_range(pud, start, pre_end); + unmap_pmd_range(cpa, pud, start, pre_end); start = pre_end; pud++; @@ -854,7 +867,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) if (pud_large(*pud)) pud_clear(pud); else - unmap_pmd_range(pud, start, start + PUD_SIZE); + unmap_pmd_range(cpa, pud, start, start + PUD_SIZE); start += PUD_SIZE; pud++; @@ -864,7 +877,7 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) * 2M leftovers? */ if (start < end) - unmap_pmd_range(pud, start, end); + unmap_pmd_range(cpa, pud, start, end); /* * No need to try to free the PUD page because we'll free it in @@ -872,6 +885,24 @@ void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) */ } +static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) +{ + struct cpa_data cpa = { + .flags = CPA_FREE_PAGETABLES, + }; + + __unmap_pud_range(&cpa, pgd, start, end); +} + +void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end) +{ + struct cpa_data cpa = { + .flags = 0, + }; + + __unmap_pud_range(&cpa, pgd, start, end); +} + static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end) { pgd_t *pgd_entry = root + pgd_index(addr); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 087d3e1e7125..e2bd5c81279e 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -340,40 +340,26 @@ static inline void _pgd_free(pgd_t *pgd) kmem_cache_free(pgd_cache, pgd); } #else -static inline pgd_t *_pgd_alloc(void) -{ + #ifdef CONFIG_KAISER - // Instead of one PML4, we aquire two PML4s and, thus, an 8kb-aligned memory - // block. Therefore, we have to allocate at least 3 pages. However, the - // __get_free_pages returns us 4 pages. Hence, we store the base pointer at - // the beginning of the page of our 8kb-aligned memory block in order to - // correctly free it afterwars. - - unsigned long pages = __get_free_pages(PGALLOC_GFP, get_order(4*PAGE_SIZE)); - - if(native_get_normal_pgd((pgd_t*) pages) == (pgd_t*) pages) - { - *((unsigned long*)(pages + 2 * PAGE_SIZE)) = pages; - return (pgd_t *) pages; - } - else - { - *((unsigned long*)(pages + 3 * PAGE_SIZE)) = pages; - return (pgd_t *) (pages + PAGE_SIZE); - } +/* + * Instead of one pmd, we aquire two pmds. Being order-1, it is + * both 8k in size and 8k-aligned. That lets us just flip bit 12 + * in a pointer to swap between the two 4k halves. 
+ */ +#define PGD_ALLOCATION_ORDER 1 #else - return (pgd_t *)__get_free_page(PGALLOC_GFP); +#define PGD_ALLOCATION_ORDER 0 #endif + +static inline pgd_t *_pgd_alloc(void) +{ + return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); } static inline void _pgd_free(pgd_t *pgd) { -#ifdef CONFIG_KAISER - unsigned long pages = *((unsigned long*) ((char*) pgd + 2 * PAGE_SIZE)); - free_pages(pages, get_order(4*PAGE_SIZE)); -#else - free_page((unsigned long)pgd); -#endif + free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER); } #endif /* CONFIG_X86_PAE */ -- cgit v1.2.3 From edde73205b3fdde8c8a3adfce78cc6d0de72386b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 5 Sep 2017 12:05:01 -0700 Subject: kaiser: do not set _PAGE_NX on pgd_none native_pgd_clear() uses native_set_pgd(), so native_set_pgd() must avoid setting the _PAGE_NX bit on an otherwise pgd_none() entry: usually that just generated a warning on exit, but sometimes more mysterious and damaging failures (our production machines could not complete booting). The original fix to this just avoided adding _PAGE_NX to an empty entry; but eventually more problems surfaced with kexec, and EFI mapping expected to be a problem too. So now instead change native_set_pgd() to update shadow only if _PAGE_USER: A few places (kernel/machine_kexec_64.c, platform/efi/efi_64.c for sure) use set_pgd() to set up a temporary internal virtual address space, with physical pages remapped at what Kaiser regards as userspace addresses: Kaiser then assumes a shadow pgd follows, which it will try to corrupt. This appears to be responsible for the recent kexec and kdump failures; though it's unclear how those did not manifest as a problem before. Ah, the shadow pgd will only be assumed to "follow" if the requested pgd is on an even-numbered page: so I suppose it was going wrong 50% of the time all along. What we need is a flag to set_pgd(), to tell it we're dealing with userspace. Er, isn't that what the pgd's _PAGE_USER bit is saying? Add a test for that. But we cannot do the same for pgd_clear() (which may be called to clear corrupted entries - set aside the question of "corrupt in which pgd?" until later), so there just rely on pgd_clear() not being called in the problematic cases - with a WARN_ON_ONCE() which should fire half the time if it is. But this is getting too big for an inline function: move it into arch/x86/mm/kaiser.c (which then demands a boot/compressed mod); and de-void and de-space native_get_shadow/normal_pgd() while here. 
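In outline, the rule described above reduces to the following (a condensed sketch of the kaiser_set_shadow_pgd() helper added in the diff below; see the hunk itself for the full comments):

    pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd)
    {
        if (pgd.pgd & _PAGE_USER) {
            /* A userspace mapping: mirror it into the shadow pgd. */
            if (is_userspace_pgd(pgdp)) {
                native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
                /* The kernel copy maps userspace but must never run it. */
                pgd.pgd |= _PAGE_NX;
            }
        } else if (!pgd.pgd) {
            /*
             * pgd_clear(): there is no _PAGE_USER to test, so rely on it
             * not being used for the kexec/EFI-style temporary mappings.
             */
            if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) &&
                is_userspace_pgd(pgdp))
                native_get_shadow_pgd(pgdp)->pgd = pgd.pgd;
        }
        return pgd;
    }
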
Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/misc.h | 1 + arch/x86/include/asm/pgtable_64.h | 51 ++++++++++----------------------------- arch/x86/mm/kaiser.c | 42 ++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 38 deletions(-) (limited to 'arch') diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 3783dc3e10b3..4bf52d351022 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -9,6 +9,7 @@ */ #undef CONFIG_PARAVIRT #undef CONFIG_PARAVIRT_SPINLOCKS +#undef CONFIG_KAISER #undef CONFIG_KASAN #include diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index b0adf0d11c9a..8629be9e6649 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -107,61 +107,36 @@ static inline void native_pud_clear(pud_t *pud) } #ifdef CONFIG_KAISER -static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) +extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); + +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) { - return (pgd_t *)(void*)((unsigned long)(void*)pgdp | (unsigned long)PAGE_SIZE); + return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); } -static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) +static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) { - return (pgd_t *)(void*)((unsigned long)(void*)pgdp & ~(unsigned long)PAGE_SIZE); + return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE); } #else -static inline pgd_t * native_get_shadow_pgd(pgd_t *pgdp) +static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) +{ + return pgd; +} +static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) { BUILD_BUG_ON(1); return NULL; } -static inline pgd_t * native_get_normal_pgd(pgd_t *pgdp) +static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) { return pgdp; } #endif /* CONFIG_KAISER */ -/* - * Page table pages are page-aligned. The lower half of the top - * level is used for userspace and the top half for the kernel. - * This returns true for user pages that need to get copied into - * both the user and kernel copies of the page tables, and false - * for kernel pages that should only be in the kernel copy. - */ -static inline bool is_userspace_pgd(void *__ptr) -{ - unsigned long ptr = (unsigned long)__ptr; - - return ((ptr % PAGE_SIZE) < (PAGE_SIZE / 2)); -} - static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { -#ifdef CONFIG_KAISER - pteval_t extra_kern_pgd_flags = 0; - /* Do we need to also populate the shadow pgd? */ - if (is_userspace_pgd(pgdp)) { - native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; - /* - * Even if the entry is *mapping* userspace, ensure - * that userspace can not use it. This way, if we - * get out to userspace running on the kernel CR3, - * userspace will crash instead of running. - */ - extra_kern_pgd_flags = _PAGE_NX; - } - pgdp->pgd = pgd.pgd; - pgdp->pgd |= extra_kern_pgd_flags; -#else /* CONFIG_KAISER */ - *pgdp = pgd; -#endif + *pgdp = kaiser_set_shadow_pgd(pgdp, pgd); } static inline void native_pgd_clear(pgd_t *pgd) diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 960071d258c3..da2fdd3c7010 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -303,4 +303,46 @@ void kaiser_remove_mapping(unsigned long start, unsigned long size) unmap_pud_range_nofree(pgd, addr, end); } } + +/* + * Page table pages are page-aligned. 
The lower half of the top + * level is used for userspace and the top half for the kernel. + * This returns true for user pages that need to get copied into + * both the user and kernel copies of the page tables, and false + * for kernel pages that should only be in the kernel copy. + */ +static inline bool is_userspace_pgd(pgd_t *pgdp) +{ + return ((unsigned long)pgdp % PAGE_SIZE) < (PAGE_SIZE / 2); +} + +pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) +{ + /* + * Do we need to also populate the shadow pgd? Check _PAGE_USER to + * skip cases like kexec and EFI which make temporary low mappings. + */ + if (pgd.pgd & _PAGE_USER) { + if (is_userspace_pgd(pgdp)) { + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + /* + * Even if the entry is *mapping* userspace, ensure + * that userspace can not use it. This way, if we + * get out to userspace running on the kernel CR3, + * userspace will crash instead of running. + */ + pgd.pgd |= _PAGE_NX; + } + } else if (!pgd.pgd) { + /* + * pgd_clear() cannot check _PAGE_USER, and is even used to + * clear corrupted pgd entries: so just rely on cases like + * kexec and EFI never to be using pgd_clear(). + */ + if (!WARN_ON_ONCE((unsigned long)pgdp & PAGE_SIZE) && + is_userspace_pgd(pgdp)) + native_get_shadow_pgd(pgdp)->pgd = pgd.pgd; + } + return pgd; +} #endif /* CONFIG_KAISER */ -- cgit v1.2.3 From 9b94cf97f42ca30fe9b5010900fa6e1d6855a9f6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 3 Sep 2017 17:09:44 -0700 Subject: kaiser: fix build and FIXME in alloc_ldt_struct() Include linux/kaiser.h instead of asm/kaiser.h to build ldt.c without CONFIG_KAISER. kaiser_add_mapping() does already return an error code, so fix the FIXME. Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/ldt.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 1062e88ff040..89ee37469d41 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -16,9 +16,9 @@ #include #include #include +#include #include -#include #include #include #include @@ -49,7 +49,7 @@ static struct ldt_struct *alloc_ldt_struct(int size) { struct ldt_struct *new_ldt; int alloc_size; - int ret = 0; + int ret; if (size > LDT_ENTRIES) return NULL; @@ -77,10 +77,8 @@ static struct ldt_struct *alloc_ldt_struct(int size) return NULL; } - // FIXME: make kaiser_add_mapping() return an error code - // when it fails - kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, - __PAGE_KERNEL); + ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, + __PAGE_KERNEL); if (ret) { __free_ldt_struct(new_ldt); return NULL; -- cgit v1.2.3 From 487f0b73d82611a2dc48d7d78409e2e9d994006a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 21 Sep 2017 20:39:56 -0700 Subject: kaiser: fix regs to do_nmi() ifndef CONFIG_KAISER pjt has observed that nmi's second (nmi_from_kernel) call to do_nmi() adjusted the %rdi regs arg, rightly when CONFIG_KAISER, but wrongly when not CONFIG_KAISER. Although the minimal change is to add an #ifdef CONFIG_KAISER around the addq line, that looks cluttered, and I prefer how the first call to do_nmi() handled it: prepare args in %rdi and %rsi before getting into the CONFIG_KAISER block, since it does not touch them at all. 
And while we're here, place the "#ifdef CONFIG_KAISER" that follows each, to enclose the "Unconditionally restore CR3" comment: matching how the "Unconditionally use kernel CR3" comment above is enclosed. Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 99bce319df0c..0942aa6d16a8 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1297,12 +1297,13 @@ ENTRY(nmi) movq %rax, %cr3 #endif call do_nmi + +#ifdef CONFIG_KAISER /* * Unconditionally restore CR3. I know we return to * kernel code that needs user CR3, but do we ever return * to "user mode" where we need the kernel CR3? */ -#ifdef CONFIG_KAISER popq %rax mov %rax, %cr3 #endif @@ -1526,6 +1527,8 @@ end_repeat_nmi: SWAPGS xorl %ebx, %ebx 1: + movq %rsp, %rdi + movq $-1, %rsi #ifdef CONFIG_KAISER /* Unconditionally use kernel CR3 for do_nmi() */ /* %rax is saved above, so OK to clobber here */ @@ -1538,16 +1541,14 @@ end_repeat_nmi: #endif /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ - movq %rsp, %rdi - addq $8, %rdi /* point %rdi at ptregs, fixed up for CR3 */ - movq $-1, %rsi call do_nmi + +#ifdef CONFIG_KAISER /* * Unconditionally restore CR3. We might be returning to * kernel code that needs user CR3, like just just before * a sysret. */ -#ifdef CONFIG_KAISER popq %rax mov %rax, %cr3 #endif -- cgit v1.2.3 From 20cbe9a3aa2e341824da57ce0ac6d52cbffaa570 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 23 Aug 2017 14:21:14 -0700 Subject: kaiser: fix perf crashes Avoid perf crashes: place debug_store in the user-mapped per-cpu area instead of allocating, and use page allocator plus kaiser_add_mapping() to keep the BTS and PEBS buffers user-mapped (that is, present in the user mapping, though visible only to kernel and hardware). The PEBS fixup buffer does not need this treatment. The need for a user-mapped struct debug_store showed up before doing any conscious perf testing: in a couple of kernel paging oopses on Westmere, implicating the debug_store offset of the per-cpu area. 
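The allocation path described above is, in outline (dsalloc() as added in the diff below, shown with the !CONFIG_KAISER fallback and the matching dsfree() omitted):

    static void *dsalloc(size_t size, gfp_t flags, int node)
    {
        unsigned int order = get_order(size);
        struct page *page;
        unsigned long addr;

        /* Use the page allocator so whole pages can be user-mapped. */
        page = __alloc_pages_node(node, flags | __GFP_ZERO, order);
        if (!page)
            return NULL;
        addr = (unsigned long)page_address(page);
        /* Make the BTS/PEBS buffer visible in the user (shadow) tables. */
        if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) {
            __free_pages(page, order);
            addr = 0;
        }
        return (void *)addr;
    }
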
Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 57 ++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 1e7de3cefc9c..4e334e3ac16f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -2,11 +2,15 @@ #include #include +#include #include #include #include "perf_event.h" +static +DEFINE_PER_CPU_SHARED_ALIGNED_USER_MAPPED(struct debug_store, cpu_debug_store); + /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 @@ -268,6 +272,39 @@ void fini_debug_store_on_cpu(int cpu) static DEFINE_PER_CPU(void *, insn_buffer); +static void *dsalloc(size_t size, gfp_t flags, int node) +{ +#ifdef CONFIG_KAISER + unsigned int order = get_order(size); + struct page *page; + unsigned long addr; + + page = __alloc_pages_node(node, flags | __GFP_ZERO, order); + if (!page) + return NULL; + addr = (unsigned long)page_address(page); + if (kaiser_add_mapping(addr, size, __PAGE_KERNEL) < 0) { + __free_pages(page, order); + addr = 0; + } + return (void *)addr; +#else + return kmalloc_node(size, flags | __GFP_ZERO, node); +#endif +} + +static void dsfree(const void *buffer, size_t size) +{ +#ifdef CONFIG_KAISER + if (!buffer) + return; + kaiser_remove_mapping((unsigned long)buffer, size); + free_pages((unsigned long)buffer, get_order(size)); +#else + kfree(buffer); +#endif +} + static int alloc_pebs_buffer(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; @@ -278,7 +315,7 @@ static int alloc_pebs_buffer(int cpu) if (!x86_pmu.pebs) return 0; - buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); + buffer = dsalloc(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); if (unlikely(!buffer)) return -ENOMEM; @@ -289,7 +326,7 @@ static int alloc_pebs_buffer(int cpu) if (x86_pmu.intel_cap.pebs_format < 2) { ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); if (!ibuffer) { - kfree(buffer); + dsfree(buffer, x86_pmu.pebs_buffer_size); return -ENOMEM; } per_cpu(insn_buffer, cpu) = ibuffer; @@ -315,7 +352,8 @@ static void release_pebs_buffer(int cpu) kfree(per_cpu(insn_buffer, cpu)); per_cpu(insn_buffer, cpu) = NULL; - kfree((void *)(unsigned long)ds->pebs_buffer_base); + dsfree((void *)(unsigned long)ds->pebs_buffer_base, + x86_pmu.pebs_buffer_size); ds->pebs_buffer_base = 0; } @@ -329,7 +367,7 @@ static int alloc_bts_buffer(int cpu) if (!x86_pmu.bts) return 0; - buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); + buffer = dsalloc(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); if (unlikely(!buffer)) { WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); return -ENOMEM; @@ -355,19 +393,15 @@ static void release_bts_buffer(int cpu) if (!ds || !x86_pmu.bts) return; - kfree((void *)(unsigned long)ds->bts_buffer_base); + dsfree((void *)(unsigned long)ds->bts_buffer_base, BTS_BUFFER_SIZE); ds->bts_buffer_base = 0; } static int alloc_ds_buffer(int cpu) { - int node = cpu_to_node(cpu); - struct debug_store *ds; - - ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); - if (unlikely(!ds)) - return -ENOMEM; + struct debug_store *ds = per_cpu_ptr(&cpu_debug_store, cpu); + memset(ds, 0, sizeof(*ds)); per_cpu(cpu_hw_events, cpu).ds = ds; return 0; @@ -381,7 +415,6 @@ static void release_ds_buffer(int cpu) return; per_cpu(cpu_hw_events, cpu).ds = NULL; - kfree(ds); } void 
release_ds_buffers(void) -- cgit v1.2.3 From 407c3ff6a24c7cb418b77a124d17e282f9622037 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 3 Sep 2017 18:48:02 -0700 Subject: kaiser: ENOMEM if kaiser_pagetable_walk() NULL kaiser_add_user_map() took no notice when kaiser_pagetable_walk() failed. And avoid its might_sleep() when atomic (though atomic at present unused). Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index da2fdd3c7010..57e3a7dd4708 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -99,11 +99,11 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); - might_sleep(); if (is_atomic) { gfp &= ~GFP_KERNEL; gfp |= __GFP_HIGH | __GFP_ATOMIC; - } + } else + might_sleep(); if (pgd_none(*pgd)) { WARN_ONCE(1, "All shadow pgds should have been populated"); @@ -160,13 +160,17 @@ int kaiser_add_user_map(const void *__start_addr, unsigned long size, unsigned long end_addr = PAGE_ALIGN(start_addr + size); unsigned long target_address; - for (;address < end_addr; address += PAGE_SIZE) { + for (; address < end_addr; address += PAGE_SIZE) { target_address = get_pa_from_mapping(address); if (target_address == -1) { ret = -EIO; break; } pte = kaiser_pagetable_walk(address, false); + if (!pte) { + ret = -ENOMEM; + break; + } if (pte_none(*pte)) { set_pte(pte, __pte(flags | target_address)); } else { -- cgit v1.2.3 From 5fbd46c4be78174656b52e1b04d3057a5dd7af66 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 3 Sep 2017 19:18:07 -0700 Subject: kaiser: tidied up asm/kaiser.h somewhat Mainly deleting a surfeit of blank lines, and reflowing header comment. Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 0703f48777f3..7394ba9f9951 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -1,15 +1,17 @@ #ifndef _ASM_X86_KAISER_H #define _ASM_X86_KAISER_H - -/* This file includes the definitions for the KAISER feature. - * KAISER is a counter measure against x86_64 side channel attacks on the kernel virtual memory. - * It has a shodow-pgd for every process. the shadow-pgd has a minimalistic kernel-set mapped, - * but includes the whole user memory. Within a kernel context switch, or when an interrupt is handled, - * the pgd is switched to the normal one. When the system switches to user mode, the shadow pgd is enabled. - * By this, the virtual memory chaches are freed, and the user may not attack the whole kernel memory. +/* + * This file includes the definitions for the KAISER feature. + * KAISER is a counter measure against x86_64 side channel attacks on + * the kernel virtual memory. It has a shadow pgd for every process: the + * shadow pgd has a minimalistic kernel-set mapped, but includes the whole + * user memory. Within a kernel context switch, or when an interrupt is handled, + * the pgd is switched to the normal one. When the system switches to user mode, + * the shadow pgd is enabled. 
By this, the virtual memory caches are freed, + * and the user may not attack the whole kernel memory. * - * A minimalistic kernel mapping holds the parts needed to be mapped in user mode, as the entry/exit functions - * of the user space, or the stacks. + * A minimalistic kernel mapping holds the parts needed to be mapped in user + * mode, such as the entry/exit functions of the user space, or the stacks. */ #ifdef __ASSEMBLY__ #ifdef CONFIG_KAISER @@ -48,13 +50,10 @@ _SWITCH_TO_KERNEL_CR3 %rax movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .endm - .macro SWITCH_USER_CR3_NO_STACK - movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) _SWITCH_TO_USER_CR3 %rax movq PER_CPU_VAR(unsafe_stack_register_backup), %rax - .endm #else /* CONFIG_KAISER */ @@ -72,7 +71,6 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax #else /* __ASSEMBLY__ */ - #ifdef CONFIG_KAISER /* * Upon kernel/user mode switch, it may happen that the address @@ -80,7 +78,6 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax * stored. To change the address space, another register is * needed. A register therefore has to be stored/restored. */ - DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); /** @@ -95,7 +92,6 @@ DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); */ extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags); - /** * kaiser_remove_mapping - unmap a virtual memory part of the shadow mapping * @addr: the start address of the range @@ -104,12 +100,12 @@ extern int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned l extern void kaiser_remove_mapping(unsigned long start, unsigned long size); /** - * kaiser_initialize_mapping - Initalize the shadow mapping + * kaiser_init - Initialize the shadow mapping * * Most parts of the shadow mapping can be mapped upon boot * time. Only per-process things like the thread stacks * or a new LDT have to be mapped at runtime. These boot- - * time mappings are permanent and nevertunmapped. + * time mappings are permanent and never unmapped. */ extern void kaiser_init(void); @@ -117,6 +113,4 @@ extern void kaiser_init(void); #endif /* __ASSEMBLY */ - - #endif /* _ASM_X86_KAISER_H */ -- cgit v1.2.3 From 0c68228f7b39c96cabd89bee3e1d6bd55926df80 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 3 Sep 2017 19:23:08 -0700 Subject: kaiser: tidied up kaiser_add/remove_mapping slightly Yes, unmap_pud_range_nofree()'s declaration ought to be in a header file really, but I'm not sure we want to use it anyway: so for now just declare it inside kaiser_remove_mapping(). And there doesn't seem to be such a thing as unmap_p4d_range(), even in a 5-level paging tree. 
Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 57e3a7dd4708..34e1d48e1339 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -286,8 +286,7 @@ void __init kaiser_init(void) __PAGE_KERNEL); } -extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end); -// add a mapping to the shadow-mapping, and synchronize the mappings +/* Add a mapping to the shadow mapping, and synchronize the mappings */ int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) { return kaiser_add_user_map((const void *)addr, size, flags); @@ -295,15 +294,13 @@ int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long fla void kaiser_remove_mapping(unsigned long start, unsigned long size) { + extern void unmap_pud_range_nofree(pgd_t *pgd, + unsigned long start, unsigned long end); unsigned long end = start + size; unsigned long addr; for (addr = start; addr < end; addr += PGDIR_SIZE) { pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr)); - /* - * unmap_p4d_range() handles > P4D_SIZE unmaps, - * so no need to trim 'end'. - */ unmap_pud_range_nofree(pgd, addr, end); } } -- cgit v1.2.3 From f127705d26b34c053e59b47aef84b3ea564dd743 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 2 Oct 2017 10:57:24 -0700 Subject: kaiser: kaiser_remove_mapping() move along the pgd When removing the bogus comment from kaiser_remove_mapping(), I really ought to have checked the extent of its bogosity: as Neel points out, there is nothing to stop unmap_pud_range_nofree() from continuing beyond the end of a pud (and starting in the wrong position on the next). Fix kaiser_remove_mapping() to constrain the extent and advance pgd pointer correctly: use pgd_addr_end() macro as used throughout base mm (but don't assume page-rounded start and size in this case). But this bug was very unlikely to trigger in this backport: since any buddy allocation is contained within a single pud extent, and we are not using vmapped stacks (and are only mapping one page of stack anyway): the only way to hit this bug here would be when freeing a large modified ldt. 
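For reference, pgd_addr_end() is the generic helper (roughly as in include/asm-generic/pgtable.h) that clamps each step of such a walk to the end of the current pgd entry, so a range crossing a PGDIR boundary is handled one pgd at a time; the "- 1" comparisons keep it correct when the computed boundary wraps to zero at the very top of the address space:

  #define pgd_addr_end(addr, end)                                         \
  ({      unsigned long __boundary = ((addr) + PGDIR_SIZE) & PGDIR_MASK;  \
          (__boundary - 1 < (end) - 1) ? __boundary : (end);              \
  })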
Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 34e1d48e1339..1d50c44ff6a7 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -297,11 +297,13 @@ void kaiser_remove_mapping(unsigned long start, unsigned long size) extern void unmap_pud_range_nofree(pgd_t *pgd, unsigned long start, unsigned long end); unsigned long end = start + size; - unsigned long addr; + unsigned long addr, next; + pgd_t *pgd; - for (addr = start; addr < end; addr += PGDIR_SIZE) { - pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(addr)); - unmap_pud_range_nofree(pgd, addr, end); + pgd = native_get_shadow_pgd(pgd_offset_k(start)); + for (addr = start; addr < end; pgd++, addr = next) { + next = pgd_addr_end(addr, end); + unmap_pud_range_nofree(pgd, addr, next); } } -- cgit v1.2.3 From c52e55a2a82d3a44189810d35717d81cb4cf61d4 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 21 Aug 2017 20:11:43 -0700 Subject: kaiser: cleanups while trying for gold link While trying to get our gold link to work, four cleanups: matched the gdt_page declaration to its definition; in fiddling unsuccessfully with PERCPU_INPUT(), lined up backslashes; lined up the backslashes according to convention in percpu-defs.h; deleted the unused irq_stack_pointer addition to irq_stack_union. Sad to report that aligning backslashes does not appear to help gold align to 8192: but while these did not help, they are worth keeping. Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/desc.h | 2 +- arch/x86/include/asm/processor.h | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 4e10d73cf018..880db91d9457 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -43,7 +43,7 @@ struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; } __attribute__((aligned(PAGE_SIZE))); -DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); +DECLARE_PER_CPU_PAGE_ALIGNED_USER_MAPPED(struct gdt_page, gdt_page); static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) { diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6a2e0a0b4a96..f3bdaed0188f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -332,11 +332,6 @@ union irq_stack_union { char gs_base[40]; unsigned long stack_canary; }; - - struct { - char irq_stack_pointer[64]; - char unused[IRQ_STACK_SIZE - 64]; - }; }; DECLARE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __visible; -- cgit v1.2.3 From aeda21d77e22fb382c51fd3f6bbb18df69bc032f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 9 Sep 2017 17:31:18 -0700 Subject: kaiser: name that 0x1000 KAISER_SHADOW_PGD_OFFSET There's a 0x1000 in various places, which looks better with a name. 
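For context (my reading of the series, not something this patch changes): the pgd is an order-1, 8k allocation with the kernel pgd in the first 4k page and the shadow (user) pgd in the second, so the two CR3 values differ only in bit 12 and each switch is a single and/or:

  kernel_cr3 = user_cr3   & ~KAISER_SHADOW_PGD_OFFSET;  /* clear bit 12 (0x1000) */
  user_cr3   = kernel_cr3 |  KAISER_SHADOW_PGD_OFFSET;  /* set bit 12 (0x1000) */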
Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 4 ++-- arch/x86/include/asm/kaiser.h | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 0942aa6d16a8..82661cd1cd13 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1292,7 +1292,7 @@ ENTRY(nmi) movq %cr3, %rax pushq %rax #ifdef CONFIG_KAISER_REAL_SWITCH - andq $(~0x1000), %rax + andq $(~KAISER_SHADOW_PGD_OFFSET), %rax #endif movq %rax, %cr3 #endif @@ -1535,7 +1535,7 @@ end_repeat_nmi: movq %cr3, %rax pushq %rax #ifdef CONFIG_KAISER_REAL_SWITCH - andq $(~0x1000), %rax + andq $(~KAISER_SHADOW_PGD_OFFSET), %rax #endif movq %rax, %cr3 #endif diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 7394ba9f9951..051acf678cda 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -13,13 +13,16 @@ * A minimalistic kernel mapping holds the parts needed to be mapped in user * mode, such as the entry/exit functions of the user space, or the stacks. */ + +#define KAISER_SHADOW_PGD_OFFSET 0x1000 + #ifdef __ASSEMBLY__ #ifdef CONFIG_KAISER .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg #ifdef CONFIG_KAISER_REAL_SWITCH -andq $(~0x1000), \reg +andq $(~KAISER_SHADOW_PGD_OFFSET), \reg #endif movq \reg, %cr3 .endm @@ -27,7 +30,7 @@ movq \reg, %cr3 .macro _SWITCH_TO_USER_CR3 reg movq %cr3, \reg #ifdef CONFIG_KAISER_REAL_SWITCH -orq $(0x1000), \reg +orq $(KAISER_SHADOW_PGD_OFFSET), \reg #endif movq \reg, %cr3 .endm -- cgit v1.2.3 From b9d2ccc54e17b5aa50dd0c036d3f4fb4e5248d54 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 3 Sep 2017 18:30:43 -0700 Subject: kaiser: delete KAISER_REAL_SWITCH option We fail to see what CONFIG_KAISER_REAL_SWITCH is for: it seems to be left over from early development, and now just obscures tricky parts of the code. Delete it before adding PCIDs, or nokaiser boot option. (Or if there is some good reason to keep the option, then it needs a help text - and a "depends on KAISER", so that all those without KAISER are not asked the question.) 
Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 4 ---- arch/x86/include/asm/kaiser.h | 4 ---- 2 files changed, 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 82661cd1cd13..a058e0fd99e3 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1291,9 +1291,7 @@ ENTRY(nmi) /* %rax is saved above, so OK to clobber here */ movq %cr3, %rax pushq %rax -#ifdef CONFIG_KAISER_REAL_SWITCH andq $(~KAISER_SHADOW_PGD_OFFSET), %rax -#endif movq %rax, %cr3 #endif call do_nmi @@ -1534,9 +1532,7 @@ end_repeat_nmi: /* %rax is saved above, so OK to clobber here */ movq %cr3, %rax pushq %rax -#ifdef CONFIG_KAISER_REAL_SWITCH andq $(~KAISER_SHADOW_PGD_OFFSET), %rax -#endif movq %rax, %cr3 #endif diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 051acf678cda..e0fc45e77aee 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -21,17 +21,13 @@ .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg -#ifdef CONFIG_KAISER_REAL_SWITCH andq $(~KAISER_SHADOW_PGD_OFFSET), \reg -#endif movq \reg, %cr3 .endm .macro _SWITCH_TO_USER_CR3 reg movq %cr3, \reg -#ifdef CONFIG_KAISER_REAL_SWITCH orq $(KAISER_SHADOW_PGD_OFFSET), \reg -#endif movq \reg, %cr3 .endm -- cgit v1.2.3 From 3e3d38fd9832e82a8cb1a5b1154acfa43ac08d15 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 9 Sep 2017 21:27:32 -0700 Subject: kaiser: vmstat show NR_KAISERTABLE as nr_overhead The kaiser update made an interesting choice, never to free any shadow page tables. Contention on global spinlock was worrying, particularly with it held across page table scans when freeing. Something had to be done: I was going to add refcounting; but simply never to free them is an appealing choice, minimizing contention without complicating the code (the more a page table is found already, the less the spinlock is used). But leaking pages in this way is also a worry: can we get away with it? At the very least, we need a count to show how bad it actually gets: in principle, one might end up wasting about 1/256 of memory that way (1/512 for when direct-mapped pages have to be user-mapped, plus 1/512 for when they are user-mapped from the vmalloc area on another occasion (but we don't have vmalloc'ed stacks, so only large ldts are vmalloc'ed). Add per-cpu stat NR_KAISERTABLE: including 256 at startup for the shared pgd entries, and 1 for each intermediate page table added thereafter for user-mapping - but leave out the 1 per mm, for its shadow pgd, because that distracts from the monotonic increase. Shown in /proc/vmstat as nr_overhead (0 if kaiser not enabled). In practice, it doesn't look so bad so far: more like 1/12000 after nine hours of gtests below; and movable pageblock segregation should tend to cluster the kaiser tables into a subset of the address space (if not, they will be bad for compaction too). But production may tell a different story: keep an eye on this number, and bring back lighter freeing if it gets out of control (maybe a shrinker). 
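(Spelling out the 1/512 figures above: a shadow pte page is 4k and covers 512 entries of 4k each, i.e. 2MB of user-mapped address space, so fully populating shadow pte pages costs about 4k per 2MB, or 1/512; paying that once for direct-map addresses and potentially once more for the same pages mapped via vmalloc gives the ~1/256 worst case.)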
Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 1d50c44ff6a7..bf48bf0df8c5 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -122,9 +122,11 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) if (!new_pmd_page) return NULL; spin_lock(&shadow_table_allocation_lock); - if (pud_none(*pud)) + if (pud_none(*pud)) { set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); - else + __inc_zone_page_state(virt_to_page((void *) + new_pmd_page), NR_KAISERTABLE); + } else free_page(new_pmd_page); spin_unlock(&shadow_table_allocation_lock); } @@ -140,9 +142,11 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) if (!new_pte_page) return NULL; spin_lock(&shadow_table_allocation_lock); - if (pmd_none(*pmd)) + if (pmd_none(*pmd)) { set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); - else + __inc_zone_page_state(virt_to_page((void *) + new_pte_page), NR_KAISERTABLE); + } else free_page(new_pte_page); spin_unlock(&shadow_table_allocation_lock); } @@ -206,11 +210,13 @@ static void __init kaiser_init_all_pgds(void) pgd = native_get_shadow_pgd(pgd_offset_k((unsigned long )0)); for (i = PTRS_PER_PGD / 2; i < PTRS_PER_PGD; i++) { pgd_t new_pgd; - pud_t *pud = pud_alloc_one(&init_mm, PAGE_OFFSET + i * PGDIR_SIZE); + pud_t *pud = pud_alloc_one(&init_mm, + PAGE_OFFSET + i * PGDIR_SIZE); if (!pud) { WARN_ON(1); break; } + inc_zone_page_state(virt_to_page(pud), NR_KAISERTABLE); new_pgd = __pgd(_KERNPG_TABLE |__pa(pud)); /* * Make sure not to stomp on some other pgd entry. -- cgit v1.2.3 From eb82151d0b1df53d1ad8d060ecd554ca12eb552a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 30 Aug 2017 16:23:00 -0700 Subject: kaiser: enhanced by kernel and user PCIDs Merged performance improvements to Kaiser, using distinct kernel and user Process Context Identifiers to minimize the TLB flushing. 
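In outline (my summary of the hunks below, not an authoritative description): CR3 gains an ASID in its low 12 bits and a "no flush" bit 63, the kernel and user halves of each process get distinct ASIDs, and the entry/exit CR3 switches set the no-flush bit so that changing pgd no longer throws away the other context's TLB entries. With the constants introduced below:

  /* illustrative composition, using the constants from the pgtable_types.h hunk below */
  kernel_cr3 = pgd_pa            | X86_CR3_PCID_ASID_KERN /* 0x4 */ | X86_CR3_PCID_NOFLUSH; /* bit 63 */
  user_cr3   = (pgd_pa + 0x1000) | X86_CR3_PCID_ASID_USER /* 0x6 */ | X86_CR3_PCID_NOFLUSH; /* bit 63 */

A full flush of a single context is then done either with INVPCID where available, or by writing CR3 with the no-flush bit clear.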
Signed-off-by: Dave Hansen Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 10 ++++-- arch/x86/entry/entry_64_compat.S | 1 + arch/x86/include/asm/cpufeature.h | 1 + arch/x86/include/asm/kaiser.h | 15 +++++++-- arch/x86/include/asm/pgtable_types.h | 26 +++++++++++++++ arch/x86/include/asm/tlbflush.h | 52 ++++++++++++++++++++++++----- arch/x86/include/uapi/asm/processor-flags.h | 3 +- arch/x86/kernel/cpu/common.c | 34 +++++++++++++++++++ arch/x86/kvm/x86.c | 3 +- arch/x86/mm/kaiser.c | 7 ++++ arch/x86/mm/tlb.c | 46 +++++++++++++++++++++++-- 11 files changed, 181 insertions(+), 17 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a058e0fd99e3..6a18d787aed4 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1291,7 +1291,10 @@ ENTRY(nmi) /* %rax is saved above, so OK to clobber here */ movq %cr3, %rax pushq %rax - andq $(~KAISER_SHADOW_PGD_OFFSET), %rax + /* mask off "user" bit of pgd address and 12 PCID bits: */ + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax + /* Add back kernel PCID and "no flush" bit */ + orq X86_CR3_PCID_KERN_VAR, %rax movq %rax, %cr3 #endif call do_nmi @@ -1532,7 +1535,10 @@ end_repeat_nmi: /* %rax is saved above, so OK to clobber here */ movq %cr3, %rax pushq %rax - andq $(~KAISER_SHADOW_PGD_OFFSET), %rax + /* mask off "user" bit of pgd address and 12 PCID bits: */ + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax + /* Add back kernel PCID and "no flush" bit */ + orq X86_CR3_PCID_KERN_VAR, %rax movq %rax, %cr3 #endif diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index fe1911930b52..d03bf0e28b8b 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index f7ba9fbf12ee..72a4343b774f 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -187,6 +187,7 @@ #define X86_FEATURE_ARAT ( 7*32+ 1) /* Always Running APIC Timer */ #define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ #define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 4) /* Effectively INVPCID && CR4.PCIDE=1 */ #define X86_FEATURE_PLN ( 7*32+ 5) /* Intel Power Limit Notification */ #define X86_FEATURE_PTS ( 7*32+ 6) /* Intel Package Thermal Status */ #define X86_FEATURE_DTHERM ( 7*32+ 7) /* Digital Thermal Sensor */ diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index e0fc45e77aee..360ff3bc44a9 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -1,5 +1,8 @@ #ifndef _ASM_X86_KAISER_H #define _ASM_X86_KAISER_H + +#include /* For PCID constants */ + /* * This file includes the definitions for the KAISER feature. 
* KAISER is a counter measure against x86_64 side channel attacks on @@ -21,13 +24,21 @@ .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg -andq $(~KAISER_SHADOW_PGD_OFFSET), \reg +andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg +orq X86_CR3_PCID_KERN_VAR, \reg movq \reg, %cr3 .endm .macro _SWITCH_TO_USER_CR3 reg movq %cr3, \reg -orq $(KAISER_SHADOW_PGD_OFFSET), \reg +andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg +/* + * This can obviously be one instruction by putting the + * KAISER_SHADOW_PGD_OFFSET bit in the X86_CR3_PCID_USER_VAR. + * But, just leave it now for simplicity. + */ +orq X86_CR3_PCID_USER_VAR, \reg +orq $(KAISER_SHADOW_PGD_OFFSET), \reg movq \reg, %cr3 .endm diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index a70d6100b3df..79319174e27b 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -106,6 +106,32 @@ _PAGE_SOFT_DIRTY) #define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) +/* The ASID is the lower 12 bits of CR3 */ +#define X86_CR3_PCID_ASID_MASK (_AC((1<<12)-1,UL)) + +/* Mask for all the PCID-related bits in CR3: */ +#define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) +#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64) +#define X86_CR3_PCID_ASID_KERN (_AC(0x4,UL)) +#define X86_CR3_PCID_ASID_USER (_AC(0x6,UL)) + +#define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN) +#define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER) +#define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN) +#define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER) +#else +#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) +#define X86_CR3_PCID_ASID_USER (_AC(0x0,UL)) +/* + * PCIDs are unsupported on 32-bit and none of these bits can be + * set in CR3: + */ +#define X86_CR3_PCID_KERN_FLUSH (0) +#define X86_CR3_PCID_USER_FLUSH (0) +#define X86_CR3_PCID_KERN_NOFLUSH (0) +#define X86_CR3_PCID_USER_NOFLUSH (0) +#endif + /* * The cache modes defined here are used to translate between pure SW usage * and the HW defined cache mode bits and/or PAT entries. diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 9fc5968da820..48ef37079bc2 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -12,7 +12,6 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, unsigned long type) { struct { u64 d[2]; } desc = { { pcid, addr } }; - /* * The memory clobber is because the whole point is to invalidate * stale TLB entries and, especially if we're flushing global @@ -133,14 +132,25 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) static inline void __native_flush_tlb(void) { + if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) { + /* + * If current->mm == NULL then we borrow a mm which may change during a + * task switch and therefore we must not be preempted while we write CR3 + * back: + */ + preempt_disable(); + native_write_cr3(native_read_cr3()); + preempt_enable(); + return; + } /* - * If current->mm == NULL then we borrow a mm which may change during a - * task switch and therefore we must not be preempted while we write CR3 - * back: + * We are no longer using globals with KAISER, so a + * "nonglobals" flush would work too. But, this is more + * conservative. + * + * Note, this works with CR4.PCIDE=0 or 1. 
*/ - preempt_disable(); - native_write_cr3(native_read_cr3()); - preempt_enable(); + invpcid_flush_all(); } static inline void __native_flush_tlb_global_irq_disabled(void) @@ -162,6 +172,8 @@ static inline void __native_flush_tlb_global(void) /* * Using INVPCID is considerably faster than a pair of writes * to CR4 sandwiched inside an IRQ flag save/restore. + * + * Note, this works with CR4.PCIDE=0 or 1. */ invpcid_flush_all(); return; @@ -181,7 +193,31 @@ static inline void __native_flush_tlb_global(void) static inline void __native_flush_tlb_single(unsigned long addr) { - asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + /* + * SIMICS #GP's if you run INVPCID with type 2/3 + * and X86_CR4_PCIDE clear. Shame! + * + * The ASIDs used below are hard-coded. But, we must not + * call invpcid(type=1/2) before CR4.PCIDE=1. Just call + * invpcid in the case we are called early. + */ + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + return; + } + /* Flush the address out of both PCIDs. */ + /* + * An optimization here might be to determine addresses + * that are only kernel-mapped and only flush the kernel + * ASID. But, userspace flushes are probably much more + * important performance-wise. + * + * Make sure to do only a single invpcid when KAISER is + * disabled and we have only a single ASID. + */ + if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER) + invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); + invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); } static inline void __flush_tlb_all(void) diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 79887abcb5e1..1361779f44fe 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -77,7 +77,8 @@ #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) -#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ +#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) /* * Intel CPU features in CR4 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e5ba9701ee6c..07b7f2816567 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -321,11 +321,45 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) } } +/* + * These can have bit 63 set, so we can not just use a plain "or" + * instruction to get their value or'd into CR3. It would take + * another register. So, we use a memory reference to these + * instead. + * + * This is also handy because systems that do not support + * PCIDs just end up or'ing a 0 into their CR3, which does + * no harm. + */ +__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR = 0; +__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_USER_VAR = 0; + static void setup_pcid(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_PCID)) { if (cpu_has(c, X86_FEATURE_PGE)) { cr4_set_bits(X86_CR4_PCIDE); + /* + * These variables are used by the entry/exit + * code to change PCIDs. + */ +#ifdef CONFIG_KAISER + X86_CR3_PCID_KERN_VAR = X86_CR3_PCID_KERN_NOFLUSH; + X86_CR3_PCID_USER_VAR = X86_CR3_PCID_USER_NOFLUSH; +#endif + /* + * INVPCID has two "groups" of types: + * 1/2: Invalidate an individual address + * 3/4: Invalidate all contexts + * + * 1/2 take a PCID, but 3/4 do not. So, 3/4 + * ignore the PCID argument in the descriptor. 
+ * But, we have to be careful not to call 1/2 + * with an actual non-zero PCID in them before + * we do the above cr4_set_bits(). + */ + if (cpu_has(c, X86_FEATURE_INVPCID)) + set_cpu_cap(c, X86_FEATURE_INVPCID_SINGLE); } else { /* * flush_tlb_all(), as currently implemented, won't diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 796f1ec67469..ccf17dbfea09 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -759,7 +759,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) return 1; /* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */ - if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu)) + if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_ASID_MASK) || + !is_long_mode(vcpu)) return 1; } diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index bf48bf0df8c5..290a52e6017d 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -240,6 +240,8 @@ static void __init kaiser_init_all_pgds(void) } while (0) extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; +extern unsigned long X86_CR3_PCID_KERN_VAR; +extern unsigned long X86_CR3_PCID_USER_VAR; /* * If anything in here fails, we will likely die on one of the * first kernel->user transitions and init will die. But, we @@ -290,6 +292,11 @@ void __init kaiser_init(void) kaiser_add_user_map_early(&debug_idt_table, sizeof(gate_desc) * NR_VECTORS, __PAGE_KERNEL); + + kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE, + __PAGE_KERNEL); + kaiser_add_user_map_early(&X86_CR3_PCID_USER_VAR, PAGE_SIZE, + __PAGE_KERNEL); } /* Add a mapping to the shadow mapping, and synchronize the mappings */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 7a4cdb632508..aed0b704de3d 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -34,6 +34,46 @@ struct flush_tlb_info { unsigned long flush_end; }; +static void load_new_mm_cr3(pgd_t *pgdir) +{ + unsigned long new_mm_cr3 = __pa(pgdir); + + /* + * KAISER, plus PCIDs needs some extra work here. But, + * if either of features is not present, we need no + * PCIDs here and just do a normal, full TLB flush with + * the write_cr3() + */ + if (!IS_ENABLED(CONFIG_KAISER) || + !cpu_feature_enabled(X86_FEATURE_PCID)) + goto out_set_cr3; + /* + * We reuse the same PCID for different tasks, so we must + * flush all the entires for the PCID out when we change + * tasks. + */ + new_mm_cr3 = X86_CR3_PCID_KERN_FLUSH | __pa(pgdir); + + /* + * The flush from load_cr3() may leave old TLB entries + * for userspace in place. We must flush that context + * separately. We can theoretically delay doing this + * until we actually load up the userspace CR3, but + * that's a bit tricky. We have to have the "need to + * flush userspace PCID" bit per-cpu and check it in the + * exit-to-userspace paths. + */ + invpcid_flush_single_context(X86_CR3_PCID_ASID_USER); + +out_set_cr3: + /* + * Caution: many callers of this function expect + * that load_cr3() is serializing and orders TLB + * fills with respect to the mm_cpumask writes. + */ + write_cr3(new_mm_cr3); +} + /* * We cannot call mmdrop() because we are in interrupt context, * instead update mm->cpu_vm_mask. @@ -45,7 +85,7 @@ void leave_mm(int cpu) BUG(); if (cpumask_test_cpu(cpu, mm_cpumask(active_mm))) { cpumask_clear_cpu(cpu, mm_cpumask(active_mm)); - load_cr3(swapper_pg_dir); + load_new_mm_cr3(swapper_pg_dir); /* * This gets called in the idle path where RCU * functions differently. 
Tracing normally @@ -105,7 +145,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * ordering guarantee we need. * */ - load_cr3(next->pgd); + load_new_mm_cr3(next->pgd); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); @@ -152,7 +192,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, * As above, load_cr3() is serializing and orders TLB * fills with respect to the mm_cpumask write. */ - load_cr3(next->pgd); + load_new_mm_cr3(next->pgd); trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL); load_mm_cr4(next); load_mm_ldt(next); -- cgit v1.2.3 From 0731188fc74cc2237975a2b5bedd36e2463ef10b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 17 Aug 2017 15:00:37 -0700 Subject: kaiser: load_new_mm_cr3() let SWITCH_USER_CR3 flush user We have many machines (Westmere, Sandybridge, Ivybridge) supporting PCID but not INVPCID: on these load_new_mm_cr3() simply crashed. Flushing user context inside load_new_mm_cr3() without the use of invpcid is difficult: momentarily switch from kernel to user context and back to do so? I'm not sure whether that can be safely done at all, and would risk polluting user context with kernel internals, and kernel context with stale user externals. Instead, follow the hint in the comment that was there: change X86_CR3_PCID_USER_VAR to be a per-cpu variable, then load_new_mm_cr3() can leave a note in it, for SWITCH_USER_CR3 on return to userspace to flush user context TLB, instead of default X86_CR3_PCID_USER_NOFLUSH. Which works well enough that there's no need to do it this way only when invpcid is unsupported: it's a good alternative to invpcid here. But there's a couple of inlines in asm/tlbflush.h that need to do the same trick, so it's best to localize all this per-cpu business in mm/kaiser.c: moving that part of the initialization from setup_pcid() to kaiser_setup_pcid(); with kaiser_flush_tlb_on_return_to_user() the function for noting an X86_CR3_PCID_USER_FLUSH. And let's keep a KAISER_SHADOW_PGD_OFFSET in there, to avoid the extra OR on exit. I did try to make the feature tests in asm/tlbflush.h more consistent with each other: there seem to be far too many ways of performing such tests, and I don't have a good grasp of their differences. At first I converted them all to be static_cpu_has(): but that proved to be a mistake, as the comment in __native_flush_tlb_single() hints; so then I reversed and made them all this_cpu_has(). Probably all gratuitous change, but that's the way it's working at present. I am slightly bothered by the way non-per-cpu X86_CR3_PCID_KERN_VAR gets re-initialized by each cpu (before and after these changes): no problem when (as usual) all cpus on a machine have the same features, but in principle incorrect. However, my experiment to per-cpu-ify that one did not end well... 
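The resulting protocol, roughly (my paraphrase of the hunks below): the context-switch path loads kernel CR3 with the kernel ASID in its FLUSH form and leaves a note in the per-cpu X86_CR3_PCID_USER_VAR; the exit-to-user macro or's that word into CR3, and if its no-flush bit (bit 63) is clear the CR3 write flushes the user ASID, after which the word is reset to its NOFLUSH form for next time. Sketched on the C side:

  /* on context switch (tlb.c hunk below) */
  new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH;
  kaiser_flush_tlb_on_return_to_user();  /* note: user ASID needs a flush */

  /* kaiser_flush_tlb_on_return_to_user(), per the kaiser.c hunk below */
  this_cpu_write(X86_CR3_PCID_USER_VAR,
                 X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET);

The _SWITCH_TO_USER_CR3 asm then uses "js" on the or'ed value to detect whether bit 63 was already set, and when it was not (a flush just happened) writes 0x80 back into the high byte of the per-cpu word so the next return is NOFLUSH again.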
Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 18 ++++++++----- arch/x86/include/asm/tlbflush.h | 58 ++++++++++++++++++++++++++++------------- arch/x86/kernel/cpu/common.c | 22 +--------------- arch/x86/mm/kaiser.c | 50 ++++++++++++++++++++++++++++++----- arch/x86/mm/tlb.c | 46 +++++++++++++------------------- 5 files changed, 114 insertions(+), 80 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 360ff3bc44a9..009bca514c20 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -32,13 +32,12 @@ movq \reg, %cr3 .macro _SWITCH_TO_USER_CR3 reg movq %cr3, \reg andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg -/* - * This can obviously be one instruction by putting the - * KAISER_SHADOW_PGD_OFFSET bit in the X86_CR3_PCID_USER_VAR. - * But, just leave it now for simplicity. - */ -orq X86_CR3_PCID_USER_VAR, \reg -orq $(KAISER_SHADOW_PGD_OFFSET), \reg +orq PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg +js 9f +// FLUSH this time, reset to NOFLUSH for next time +// But if nopcid? Consider using 0x80 for user pcid? +movb $(0x80), PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7) +9: movq \reg, %cr3 .endm @@ -90,6 +89,11 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax */ DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); +extern unsigned long X86_CR3_PCID_KERN_VAR; +DECLARE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR); + +extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; + /** * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping * @addr: the start address of the range diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 48ef37079bc2..ff8c5eb95caf 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -12,6 +12,7 @@ static inline void __invpcid(unsigned long pcid, unsigned long addr, unsigned long type) { struct { u64 d[2]; } desc = { { pcid, addr } }; + /* * The memory clobber is because the whole point is to invalidate * stale TLB entries and, especially if we're flushing global @@ -130,27 +131,42 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) cr4_set_bits(mask); } +/* + * Declare a couple of kaiser interfaces here for convenience, + * to avoid the need for asm/kaiser.h in unexpected places. + */ +#ifdef CONFIG_KAISER +extern void kaiser_setup_pcid(void); +extern void kaiser_flush_tlb_on_return_to_user(void); +#else +static inline void kaiser_setup_pcid(void) +{ +} +static inline void kaiser_flush_tlb_on_return_to_user(void) +{ +} +#endif + static inline void __native_flush_tlb(void) { - if (!cpu_feature_enabled(X86_FEATURE_INVPCID)) { - /* - * If current->mm == NULL then we borrow a mm which may change during a - * task switch and therefore we must not be preempted while we write CR3 - * back: + if (this_cpu_has(X86_FEATURE_INVPCID)) { + /* + * Note, this works with CR4.PCIDE=0 or 1. */ - preempt_disable(); - native_write_cr3(native_read_cr3()); - preempt_enable(); + invpcid_flush_all_nonglobals(); return; } + /* - * We are no longer using globals with KAISER, so a - * "nonglobals" flush would work too. But, this is more - * conservative. - * - * Note, this works with CR4.PCIDE=0 or 1. 
+ * If current->mm == NULL then we borrow a mm which may change during a + * task switch and therefore we must not be preempted while we write CR3 + * back: */ - invpcid_flush_all(); + preempt_disable(); + if (this_cpu_has(X86_FEATURE_PCID)) + kaiser_flush_tlb_on_return_to_user(); + native_write_cr3(native_read_cr3()); + preempt_enable(); } static inline void __native_flush_tlb_global_irq_disabled(void) @@ -166,9 +182,13 @@ static inline void __native_flush_tlb_global_irq_disabled(void) static inline void __native_flush_tlb_global(void) { +#ifdef CONFIG_KAISER + /* Globals are not used at all */ + __native_flush_tlb(); +#else unsigned long flags; - if (static_cpu_has(X86_FEATURE_INVPCID)) { + if (this_cpu_has(X86_FEATURE_INVPCID)) { /* * Using INVPCID is considerably faster than a pair of writes * to CR4 sandwiched inside an IRQ flag save/restore. @@ -185,10 +205,9 @@ static inline void __native_flush_tlb_global(void) * be called from deep inside debugging code.) */ raw_local_irq_save(flags); - __native_flush_tlb_global_irq_disabled(); - raw_local_irq_restore(flags); +#endif } static inline void __native_flush_tlb_single(unsigned long addr) @@ -199,9 +218,12 @@ static inline void __native_flush_tlb_single(unsigned long addr) * * The ASIDs used below are hard-coded. But, we must not * call invpcid(type=1/2) before CR4.PCIDE=1. Just call - * invpcid in the case we are called early. + * invlpg in the case we are called early. */ + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { + if (this_cpu_has(X86_FEATURE_PCID)) + kaiser_flush_tlb_on_return_to_user(); asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); return; } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 07b7f2816567..46ad2faca9ab 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -321,32 +321,11 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) } } -/* - * These can have bit 63 set, so we can not just use a plain "or" - * instruction to get their value or'd into CR3. It would take - * another register. So, we use a memory reference to these - * instead. - * - * This is also handy because systems that do not support - * PCIDs just end up or'ing a 0 into their CR3, which does - * no harm. - */ -__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR = 0; -__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_USER_VAR = 0; - static void setup_pcid(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_PCID)) { if (cpu_has(c, X86_FEATURE_PGE)) { cr4_set_bits(X86_CR4_PCIDE); - /* - * These variables are used by the entry/exit - * code to change PCIDs. - */ -#ifdef CONFIG_KAISER - X86_CR3_PCID_KERN_VAR = X86_CR3_PCID_KERN_NOFLUSH; - X86_CR3_PCID_USER_VAR = X86_CR3_PCID_USER_NOFLUSH; -#endif /* * INVPCID has two "groups" of types: * 1/2: Invalidate an individual address @@ -372,6 +351,7 @@ static void setup_pcid(struct cpuinfo_x86 *c) clear_cpu_cap(c, X86_FEATURE_PCID); } } + kaiser_setup_pcid(); } /* diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 290a52e6017d..bf3ec221a90b 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -12,12 +12,26 @@ #include #include +#include /* to verify its kaiser declarations */ #include #include #include + #ifdef CONFIG_KAISER +__visible +DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); + +/* + * These can have bit 63 set, so we can not just use a plain "or" + * instruction to get their value or'd into CR3. It would take + * another register. So, we use a memory reference to these instead. 
+ * + * This is also handy because systems that do not support PCIDs + * just end up or'ing a 0 into their CR3, which does no harm. + */ +__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR; +DEFINE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR); -__visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); /* * At runtime, the only things we map are some things for CPU * hotplug, and stacks for new processes. No two CPUs will ever @@ -239,9 +253,6 @@ static void __init kaiser_init_all_pgds(void) WARN_ON(__ret); \ } while (0) -extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; -extern unsigned long X86_CR3_PCID_KERN_VAR; -extern unsigned long X86_CR3_PCID_USER_VAR; /* * If anything in here fails, we will likely die on one of the * first kernel->user transitions and init will die. But, we @@ -295,8 +306,6 @@ void __init kaiser_init(void) kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE, __PAGE_KERNEL); - kaiser_add_user_map_early(&X86_CR3_PCID_USER_VAR, PAGE_SIZE, - __PAGE_KERNEL); } /* Add a mapping to the shadow mapping, and synchronize the mappings */ @@ -361,4 +370,33 @@ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) } return pgd; } + +void kaiser_setup_pcid(void) +{ + unsigned long kern_cr3 = 0; + unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET; + + if (this_cpu_has(X86_FEATURE_PCID)) { + kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH; + user_cr3 |= X86_CR3_PCID_USER_NOFLUSH; + } + /* + * These variables are used by the entry/exit + * code to change PCID and pgd and TLB flushing. + */ + X86_CR3_PCID_KERN_VAR = kern_cr3; + this_cpu_write(X86_CR3_PCID_USER_VAR, user_cr3); +} + +/* + * Make a note that this cpu will need to flush USER tlb on return to user. + * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling: + * if cpu does not, then the NOFLUSH bit will never have been set. + */ +void kaiser_flush_tlb_on_return_to_user(void) +{ + this_cpu_write(X86_CR3_PCID_USER_VAR, + X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); +} +EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); #endif /* CONFIG_KAISER */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index aed0b704de3d..99a5d1b95527 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -6,13 +6,14 @@ #include #include #include +#include #include #include #include #include #include -#include +#include /* * TLB flushing, formerly SMP-only @@ -38,34 +39,23 @@ static void load_new_mm_cr3(pgd_t *pgdir) { unsigned long new_mm_cr3 = __pa(pgdir); - /* - * KAISER, plus PCIDs needs some extra work here. But, - * if either of features is not present, we need no - * PCIDs here and just do a normal, full TLB flush with - * the write_cr3() - */ - if (!IS_ENABLED(CONFIG_KAISER) || - !cpu_feature_enabled(X86_FEATURE_PCID)) - goto out_set_cr3; - /* - * We reuse the same PCID for different tasks, so we must - * flush all the entires for the PCID out when we change - * tasks. - */ - new_mm_cr3 = X86_CR3_PCID_KERN_FLUSH | __pa(pgdir); - - /* - * The flush from load_cr3() may leave old TLB entries - * for userspace in place. We must flush that context - * separately. We can theoretically delay doing this - * until we actually load up the userspace CR3, but - * that's a bit tricky. We have to have the "need to - * flush userspace PCID" bit per-cpu and check it in the - * exit-to-userspace paths. 
- */ - invpcid_flush_single_context(X86_CR3_PCID_ASID_USER); +#ifdef CONFIG_KAISER + if (this_cpu_has(X86_FEATURE_PCID)) { + /* + * We reuse the same PCID for different tasks, so we must + * flush all the entries for the PCID out when we change tasks. + * Flush KERN below, flush USER when returning to userspace in + * kaiser's SWITCH_USER_CR3 (_SWITCH_TO_USER_CR3) macro. + * + * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could + * do it here, but can only be used if X86_FEATURE_INVPCID is + * available - and many machines support pcid without invpcid. + */ + new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; + kaiser_flush_tlb_on_return_to_user(); + } +#endif /* CONFIG_KAISER */ -out_set_cr3: /* * Caution: many callers of this function expect * that load_cr3() is serializing and orders TLB -- cgit v1.2.3 From 3b4ce0e1a17228eec71815d7997e49e403ebf2a7 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 8 Sep 2017 19:26:30 -0700 Subject: kaiser: PCID 0 for kernel and 128 for user Why was 4 chosen for kernel PCID and 6 for user PCID? No good reason in a backport where PCIDs are only used for Kaiser. If we continue with those, then we shall need to add Andy Lutomirski's 4.13 commit 6c690ee1039b ("x86/mm: Split read_cr3() into read_cr3_pa() and __read_cr3()"), which deals with the problem of read_cr3() callers finding stray bits in the cr3 that they expected to be page-aligned; and for hibernation, his 4.14 commit f34902c5c6c0 ("x86/hibernate/64: Mask off CR3's PCID bits in the saved CR3"). But if 0 is used for kernel PCID, then there's no need to add in those commits - whenever the kernel looks, it sees 0 in the lower bits; and 0 for kernel seems an obvious choice. And I naughtily propose 128 for user PCID. Because there's a place in _SWITCH_TO_USER_CR3 where it takes note of the need for TLB FLUSH, but needs to reset that to NOFLUSH for the next occasion. Currently it does so with a "movb $(0x80)" into the high byte of the per-cpu quadword, but that will cause a machine without PCID support to crash. Now, if %al just happened to have 0x80 in it at that point, on a machine with PCID support, but 0 on a machine without PCID support... (That will go badly wrong once the pgd can be at a physical address above 2^56, but even with 5-level paging, physical goes up to 2^52.) Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 19 ++++++++++++------- arch/x86/include/asm/pgtable_types.h | 7 ++++--- arch/x86/mm/tlb.c | 3 +++ 3 files changed, 19 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 009bca514c20..110a73e0572d 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -29,14 +29,19 @@ orq X86_CR3_PCID_KERN_VAR, \reg movq \reg, %cr3 .endm -.macro _SWITCH_TO_USER_CR3 reg +.macro _SWITCH_TO_USER_CR3 reg regb +/* + * regb must be the low byte portion of reg: because we have arranged + * for the low byte of the user PCID to serve as the high byte of NOFLUSH + * (0x80 for each when PCID is enabled, or 0x00 when PCID and NOFLUSH are + * not enabled): so that the one register can update both memory and cr3. + */ movq %cr3, \reg andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg orq PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg js 9f -// FLUSH this time, reset to NOFLUSH for next time -// But if nopcid? Consider using 0x80 for user pcid? 
-movb $(0x80), PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7) +/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */ +movb \regb, PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7) 9: movq \reg, %cr3 .endm @@ -49,7 +54,7 @@ popq %rax .macro SWITCH_USER_CR3 pushq %rax -_SWITCH_TO_USER_CR3 %rax +_SWITCH_TO_USER_CR3 %rax %al popq %rax .endm @@ -61,7 +66,7 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .macro SWITCH_USER_CR3_NO_STACK movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) -_SWITCH_TO_USER_CR3 %rax +_SWITCH_TO_USER_CR3 %rax %al movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .endm @@ -69,7 +74,7 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .macro SWITCH_KERNEL_CR3 reg .endm -.macro SWITCH_USER_CR3 reg +.macro SWITCH_USER_CR3 reg regb .endm .macro SWITCH_USER_CR3_NO_STACK .endm diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 79319174e27b..22704eca1900 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -111,16 +111,17 @@ /* Mask for all the PCID-related bits in CR3: */ #define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) +#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) + #if defined(CONFIG_KAISER) && defined(CONFIG_X86_64) -#define X86_CR3_PCID_ASID_KERN (_AC(0x4,UL)) -#define X86_CR3_PCID_ASID_USER (_AC(0x6,UL)) +/* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */ +#define X86_CR3_PCID_ASID_USER (_AC(0x80,UL)) #define X86_CR3_PCID_KERN_FLUSH (X86_CR3_PCID_ASID_KERN) #define X86_CR3_PCID_USER_FLUSH (X86_CR3_PCID_ASID_USER) #define X86_CR3_PCID_KERN_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_KERN) #define X86_CR3_PCID_USER_NOFLUSH (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_USER) #else -#define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) #define X86_CR3_PCID_ASID_USER (_AC(0x0,UL)) /* * PCIDs are unsupported on 32-bit and none of these bits can be diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 99a5d1b95527..90ef67a9e34b 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -50,6 +50,9 @@ static void load_new_mm_cr3(pgd_t *pgdir) * invpcid_flush_single_context(X86_CR3_PCID_ASID_USER) could * do it here, but can only be used if X86_FEATURE_INVPCID is * available - and many machines support pcid without invpcid. + * + * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0; + * but keep that line in there in case something changes. */ new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; kaiser_flush_tlb_on_return_to_user(); -- cgit v1.2.3 From 20268a10ffecd9fcc04880b21fc99a9192394599 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 27 Aug 2017 16:24:27 -0700 Subject: kaiser: x86_cr3_pcid_noflush and x86_cr3_pcid_user Mostly this commit is just unshouting X86_CR3_PCID_KERN_VAR and X86_CR3_PCID_USER_VAR: we usually name variables in lower-case. But why does x86_cr3_pcid_noflush need to be __aligned(PAGE_SIZE)? Ah, it's a leftover from when kaiser_add_user_map() once complained about mapping the same page twice. Make it __read_mostly instead. (I'm a little uneasy about all the unrelated data which shares its page getting user-mapped too, but that was so before, and not a big deal: though we call it user-mapped, it's not mapped with _PAGE_USER.) And there is a little change around the two calls to do_nmi(). 
Previously they set the NOFLUSH bit (if PCID supported) when forcing to kernel context before do_nmi(); now they also have the NOFLUSH bit set (if PCID supported) when restoring context after: nothing done in do_nmi() should require a TLB to be flushed here. Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 8 ++++---- arch/x86/include/asm/kaiser.h | 11 +++++------ arch/x86/mm/kaiser.c | 13 +++++++------ 3 files changed, 16 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 6a18d787aed4..efd96f02ac9e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1290,11 +1290,11 @@ ENTRY(nmi) /* Unconditionally use kernel CR3 for do_nmi() */ /* %rax is saved above, so OK to clobber here */ movq %cr3, %rax + /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ + orq x86_cr3_pcid_noflush, %rax pushq %rax /* mask off "user" bit of pgd address and 12 PCID bits: */ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax - /* Add back kernel PCID and "no flush" bit */ - orq X86_CR3_PCID_KERN_VAR, %rax movq %rax, %cr3 #endif call do_nmi @@ -1534,11 +1534,11 @@ end_repeat_nmi: /* Unconditionally use kernel CR3 for do_nmi() */ /* %rax is saved above, so OK to clobber here */ movq %cr3, %rax + /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ + orq x86_cr3_pcid_noflush, %rax pushq %rax /* mask off "user" bit of pgd address and 12 PCID bits: */ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax - /* Add back kernel PCID and "no flush" bit */ - orq X86_CR3_PCID_KERN_VAR, %rax movq %rax, %cr3 #endif diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 110a73e0572d..48d8d70dd8c7 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -25,7 +25,7 @@ .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg -orq X86_CR3_PCID_KERN_VAR, \reg +orq x86_cr3_pcid_noflush, \reg movq \reg, %cr3 .endm @@ -37,11 +37,10 @@ movq \reg, %cr3 * not enabled): so that the one register can update both memory and cr3. */ movq %cr3, \reg -andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg -orq PER_CPU_VAR(X86_CR3_PCID_USER_VAR), \reg +orq PER_CPU_VAR(x86_cr3_pcid_user), \reg js 9f /* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */ -movb \regb, PER_CPU_VAR(X86_CR3_PCID_USER_VAR+7) +movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7) 9: movq \reg, %cr3 .endm @@ -94,8 +93,8 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax */ DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); -extern unsigned long X86_CR3_PCID_KERN_VAR; -DECLARE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR); +extern unsigned long x86_cr3_pcid_noflush; +DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index bf3ec221a90b..8a5eb5993d3c 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -29,8 +29,8 @@ DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); * This is also handy because systems that do not support PCIDs * just end up or'ing a 0 into their CR3, which does no harm. 
*/ -__aligned(PAGE_SIZE) unsigned long X86_CR3_PCID_KERN_VAR; -DEFINE_PER_CPU(unsigned long, X86_CR3_PCID_USER_VAR); +unsigned long x86_cr3_pcid_noflush __read_mostly; +DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user); /* * At runtime, the only things we map are some things for CPU @@ -304,7 +304,8 @@ void __init kaiser_init(void) sizeof(gate_desc) * NR_VECTORS, __PAGE_KERNEL); - kaiser_add_user_map_early(&X86_CR3_PCID_KERN_VAR, PAGE_SIZE, + kaiser_add_user_map_early(&x86_cr3_pcid_noflush, + sizeof(x86_cr3_pcid_noflush), __PAGE_KERNEL); } @@ -384,8 +385,8 @@ void kaiser_setup_pcid(void) * These variables are used by the entry/exit * code to change PCID and pgd and TLB flushing. */ - X86_CR3_PCID_KERN_VAR = kern_cr3; - this_cpu_write(X86_CR3_PCID_USER_VAR, user_cr3); + x86_cr3_pcid_noflush = kern_cr3; + this_cpu_write(x86_cr3_pcid_user, user_cr3); } /* @@ -395,7 +396,7 @@ void kaiser_setup_pcid(void) */ void kaiser_flush_tlb_on_return_to_user(void) { - this_cpu_write(X86_CR3_PCID_USER_VAR, + this_cpu_write(x86_cr3_pcid_user, X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); } EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); -- cgit v1.2.3 From fc8334e6b3e5d28afd4eec8a74493933f73b2784 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 26 Sep 2017 18:43:07 -0700 Subject: kaiser: paranoid_entry pass cr3 need to paranoid_exit Neel Natu points out that paranoid_entry() was wrong to assume that an entry that did not need swapgs would not need SWITCH_KERNEL_CR3: paranoid_entry (used for debug breakpoint, int3, double fault or MCE; though I think it's only the MCE case that is cause for concern here) can break in at an awkward time, between cr3 switch and swapgs, but its handling always needs kernel gs and kernel cr3. Easy to fix in itself, but paranoid_entry() also needs to convey to paranoid_exit() (and my reading of macro idtentry says paranoid_entry and paranoid_exit are always paired) how to restore the prior state. The swapgs state is already conveyed by %ebx (0 or 1), so extend that also to convey when SWITCH_USER_CR3 will be needed (2 or 3). (Yes, I'd much prefer that 0 meant no swapgs, whereas it's the other way round: and a convention shared with error_entry() and error_exit(), which I don't want to touch. Perhaps I should have inverted the bit for switch cr3 too, but did not.) paranoid_exit() would be straightforward, except for TRACE_IRQS: it did TRACE_IRQS_IRETQ when doing swapgs, but TRACE_IRQS_IRETQ_DEBUG when not: which is it supposed to use when SWITCH_USER_CR3 is split apart from that? As best as I can determine, commit 5963e317b1e9 ("ftrace/x86: Do not change stacks in DEBUG when calling lockdep") missed the swapgs case, and should have used TRACE_IRQS_IRETQ_DEBUG there too (the discrepancy has nothing to do with the liberal use of _NO_STACK and _UNSAFE_STACK hereabouts: TRACE_IRQS_OFF_DEBUG has just been used in all cases); discrepancy lovingly preserved across several paranoid_exit() cleanups, but I'm now removing it. Neel further indicates that to use SWITCH_USER_CR3_NO_STACK there in paranoid_exit() is now not only unnecessary but unsafe: might corrupt syscall entry's unsafe_stack_register_backup of %rax. Just use SWITCH_USER_CR3: and delete SWITCH_USER_CR3_NO_STACK altogether, before we make the mistake of using it again. hughd adds: this commit fixes an issue in the Kaiser-without-PCIDs part of the series, and ought to be moved earlier, if you decided to make a release of Kaiser-without-PCIDs. 
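Put another way (my reading of the comments added below): %ebx becomes a two-bit flag word for paranoid_exit. Bit 0 set means swapgs is NOT needed on exit (the pre-existing convention); bit 1 set means SWITCH_USER_CR3 IS needed on exit. That is why paranoid_exit below tests "testl $2, %ebx" for the cr3 restore and "testl $1, %ebx" for swapgs, and why paranoid_entry does "orl $2, %ebx" only when it finds the user cr3 loaded on entry.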
Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 46 +++++++++++++++++++++++++++++++++---------- arch/x86/include/asm/kaiser.h | 8 -------- 2 files changed, 36 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index efd96f02ac9e..59d9e5d8c05b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1025,7 +1025,11 @@ idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vec /* * Save all registers in pt_regs, and switch gs if needed. * Use slow, but surefire "are we in kernel?" check. - * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + * + * Return: ebx=0: needs swapgs but not SWITCH_USER_CR3 in paranoid_exit + * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 in paranoid_exit + * ebx=2: needs both swapgs and SWITCH_USER_CR3 in paranoid_exit + * ebx=3: needs SWITCH_USER_CR3 but not swapgs in paranoid_exit */ ENTRY(paranoid_entry) cld @@ -1037,9 +1041,26 @@ ENTRY(paranoid_entry) testl %edx, %edx js 1f /* negative -> in kernel */ SWAPGS - SWITCH_KERNEL_CR3 xorl %ebx, %ebx -1: ret +1: +#ifdef CONFIG_KAISER + /* + * We might have come in between a swapgs and a SWITCH_KERNEL_CR3 + * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit. + * Do a conditional SWITCH_KERNEL_CR3: this could safely be done + * unconditionally, but we need to find out whether the reverse + * should be done on return (conveyed to paranoid_exit in %ebx). + */ + movq %cr3, %rax + testl $KAISER_SHADOW_PGD_OFFSET, %eax + jz 2f + orl $2, %ebx + andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax + orq x86_cr3_pcid_noflush, %rax + movq %rax, %cr3 +2: +#endif + ret END(paranoid_entry) /* @@ -1052,20 +1073,25 @@ END(paranoid_entry) * be complicated. Fortunately, we there's no good reason * to try to handle preemption here. * - * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) + * On entry: ebx=0: needs swapgs but not SWITCH_USER_CR3 + * ebx=1: needs neither swapgs nor SWITCH_USER_CR3 + * ebx=2: needs both swapgs and SWITCH_USER_CR3 + * ebx=3: needs SWITCH_USER_CR3 but not swapgs */ ENTRY(paranoid_exit) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF_DEBUG - testl %ebx, %ebx /* swapgs needed? */ + TRACE_IRQS_IRETQ_DEBUG +#ifdef CONFIG_KAISER + testl $2, %ebx /* SWITCH_USER_CR3 needed? */ + jz paranoid_exit_no_switch + SWITCH_USER_CR3 +paranoid_exit_no_switch: +#endif + testl $1, %ebx /* swapgs needed? 
*/ jnz paranoid_exit_no_swapgs - TRACE_IRQS_IRETQ - SWITCH_USER_CR3_NO_STACK SWAPGS_UNSAFE_STACK - jmp paranoid_exit_restore paranoid_exit_no_swapgs: - TRACE_IRQS_IRETQ_DEBUG -paranoid_exit_restore: RESTORE_EXTRA_REGS RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 48d8d70dd8c7..3dc5f4c39b3e 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -63,20 +63,12 @@ _SWITCH_TO_KERNEL_CR3 %rax movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .endm -.macro SWITCH_USER_CR3_NO_STACK -movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) -_SWITCH_TO_USER_CR3 %rax %al -movq PER_CPU_VAR(unsafe_stack_register_backup), %rax -.endm - #else /* CONFIG_KAISER */ .macro SWITCH_KERNEL_CR3 reg .endm .macro SWITCH_USER_CR3 reg regb .endm -.macro SWITCH_USER_CR3_NO_STACK -.endm .macro SWITCH_KERNEL_CR3_NO_STACK .endm -- cgit v1.2.3 From d41f46f778951b0ea851ca52b88b2549c6336b47 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 13 Oct 2017 12:10:00 -0700 Subject: kaiser: _pgd_alloc() without __GFP_REPEAT to avoid stalls Synthetic filesystem mempressure testing has shown softlockups, with hour-long page allocation stalls, and pgd_alloc() trying for order:1 with __GFP_REPEAT in one of the backtraces each time. That's _pgd_alloc() going for a Kaiser double-pgd, using the __GFP_REPEAT common to all page table allocations, but actually having no effect on order:0 (see should_alloc_oom() and should_continue_reclaim() in this tree, but beware that ports to another tree might behave differently). Order:1 stack allocation has been working satisfactorily without __GFP_REPEAT forever, and page table allocation only asks __GFP_REPEAT for awkward occasions in a long-running process: it's not appropriate at fork or exec time, and seems to be doing much more harm than good: getting those contiguous pages under very heavy mempressure can be hard (though even without it, Kaiser does generate more mempressure). Mask out that __GFP_REPEAT inside _pgd_alloc(). Why not take it out of the PGALLOG_GFP altogether, as v4.7 commit a3a9a59d2067 ("x86: get rid of superfluous __GFP_REPEAT") did? Because I think that might make a difference to our page table memcg charging, which I'd prefer not to interfere with at this time. hughd adds: __alloc_pages_slowpath() in the 4.4.89-stable tree handles __GFP_REPEAT a little differently than in prod kernel or 3.18.72-stable, so it may not always be exactly a no-op on order:0 pages, as said above; but I think still appropriate to omit it from Kaiser or non-Kaiser pgd. 
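For scale, a minimal userspace sketch of what the order-1 "double pgd" amounts to; aligned_alloc() merely stands in for __get_free_pages(), and nothing here is kernel code.

/* The Kaiser pgd pair is two contiguous, 8 KiB-aligned 4 KiB pages;
 * bit 12 of the pointer selects the kernel half or the shadow half.
 */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define PAGE_SIZE 4096UL

int main(void)
{
	unsigned char *pgd = aligned_alloc(2 * PAGE_SIZE, 2 * PAGE_SIZE);

	if (!pgd)
		return 1;
	printf("kernel pgd half: %p\n", (void *)pgd);
	printf("shadow pgd half: %p (bit 12 set)\n",
	       (void *)((uintptr_t)pgd | PAGE_SIZE));
	free(pgd);
	return 0;
}

Getting two physically contiguous pages is exactly the order-1 request that can stall under heavy mempressure, which is why dropping __GFP_REPEAT for it is attractive.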
Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/pgtable.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index e2bd5c81279e..d0a424988f82 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -6,7 +6,7 @@ #include #include -#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO +#define PGALLOC_GFP (GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO) #ifdef CONFIG_HIGHPTE #define PGALLOC_USER_GFP __GFP_HIGHMEM @@ -354,7 +354,9 @@ static inline void _pgd_free(pgd_t *pgd) static inline pgd_t *_pgd_alloc(void) { - return (pgd_t *)__get_free_pages(PGALLOC_GFP, PGD_ALLOCATION_ORDER); + /* No __GFP_REPEAT: to avoid page allocation stalls in order-1 case */ + return (pgd_t *)__get_free_pages(PGALLOC_GFP & ~__GFP_REPEAT, + PGD_ALLOCATION_ORDER); } static inline void _pgd_free(pgd_t *pgd) -- cgit v1.2.3 From 500943e57db8d3e298e98f595f835c5b613e843b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 4 Dec 2017 20:13:35 -0800 Subject: kaiser: fix unlikely error in alloc_ldt_struct() An error from kaiser_add_mapping() here is not at all likely, but Eric Biggers rightly points out that __free_ldt_struct() relies on new_ldt->size being initialized: move that up. Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/ldt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 89ee37469d41..bc429365b72a 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -79,11 +79,11 @@ static struct ldt_struct *alloc_ldt_struct(int size) ret = kaiser_add_mapping((unsigned long)new_ldt->entries, alloc_size, __PAGE_KERNEL); + new_ldt->size = size; if (ret) { __free_ldt_struct(new_ldt); return NULL; } - new_ldt->size = size; return new_ldt; } -- cgit v1.2.3 From e345dcc9481543edf4a0a5df4c4c2f9597b0a997 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 24 Sep 2017 16:59:49 -0700 Subject: kaiser: add "nokaiser" boot option, using ALTERNATIVE Added "nokaiser" boot option: an early param like "noinvpcid". Most places now check int kaiser_enabled (#defined 0 when not CONFIG_KAISER) instead of #ifdef CONFIG_KAISER; but entry_64.S and entry_64_compat.S are using the ALTERNATIVE technique, which patches in the preferred instructions at runtime. That technique is tied to x86 cpu features, so X86_FEATURE_KAISER is fabricated. Prior to "nokaiser", Kaiser #defined _PAGE_GLOBAL 0: revert that, but be careful with both _PAGE_GLOBAL and CR4.PGE: setting them when nokaiser like when !CONFIG_KAISER, but not setting either when kaiser - neither matters on its own, but it's hard to be sure that _PAGE_GLOBAL won't get set in some obscure corner, or something add PGE into CR4. By omitting _PAGE_GLOBAL from __supported_pte_mask when kaiser_enabled, all page table setup which uses pte_pfn() masks it out of the ptes. It's slightly shameful that the same declaration versus definition of kaiser_enabled appears in not one, not two, but in three header files (asm/kaiser.h, asm/pgtable.h, asm/tlbflush.h). I felt safer that way, than with #including any of those in any of the others; and did not feel it worth an asm/kaiser_enabled.h - kernel/cpu/common.c includes them all, so we shall hear about it if they get out of synch. 
Cleanups while in the area: removed the silly #ifdef CONFIG_KAISER from kaiser.c; removed the unused native_get_normal_pgd(); removed the spurious reg clutter from SWITCH_*_CR3 macro stubs; corrected some comments. But more interestingly, set CR4.PSE in secondary_startup_64: the manual is clear that it does not matter whether it's 0 or 1 when 4-level-pts are enabled, but I was distracted to find cr4 different on BSP and auxiliaries - BSP alone was adding PSE, in probe_page_size_mask(). Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 15 +++++++------- arch/x86/include/asm/cpufeature.h | 3 +++ arch/x86/include/asm/kaiser.h | 27 ++++++++++++++++++------- arch/x86/include/asm/pgtable.h | 20 ++++++++++++------ arch/x86/include/asm/pgtable_64.h | 13 ++++-------- arch/x86/include/asm/pgtable_types.h | 4 ---- arch/x86/include/asm/tlbflush.h | 39 +++++++++++++++++++++++------------- arch/x86/kernel/cpu/common.c | 28 +++++++++++++++++++++++++- arch/x86/kernel/espfix_64.c | 3 ++- arch/x86/kernel/head_64.S | 4 ++-- arch/x86/mm/init.c | 2 +- arch/x86/mm/init_64.c | 10 +++++++++ arch/x86/mm/kaiser.c | 26 ++++++++++++++++++++---- arch/x86/mm/pgtable.c | 8 ++------ arch/x86/mm/tlb.c | 4 +--- 15 files changed, 141 insertions(+), 65 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 59d9e5d8c05b..85e30957e494 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1051,7 +1051,7 @@ ENTRY(paranoid_entry) * unconditionally, but we need to find out whether the reverse * should be done on return (conveyed to paranoid_exit in %ebx). */ - movq %cr3, %rax + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER testl $KAISER_SHADOW_PGD_OFFSET, %eax jz 2f orl $2, %ebx @@ -1083,6 +1083,7 @@ ENTRY(paranoid_exit) TRACE_IRQS_OFF_DEBUG TRACE_IRQS_IRETQ_DEBUG #ifdef CONFIG_KAISER + /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */ testl $2, %ebx /* SWITCH_USER_CR3 needed? */ jz paranoid_exit_no_switch SWITCH_USER_CR3 @@ -1315,13 +1316,14 @@ ENTRY(nmi) #ifdef CONFIG_KAISER /* Unconditionally use kernel CR3 for do_nmi() */ /* %rax is saved above, so OK to clobber here */ - movq %cr3, %rax + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ orq x86_cr3_pcid_noflush, %rax pushq %rax /* mask off "user" bit of pgd address and 12 PCID bits: */ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax movq %rax, %cr3 +2: #endif call do_nmi @@ -1331,8 +1333,7 @@ ENTRY(nmi) * kernel code that needs user CR3, but do we ever return * to "user mode" where we need the kernel CR3? */ - popq %rax - mov %rax, %cr3 + ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER #endif /* @@ -1559,13 +1560,14 @@ end_repeat_nmi: #ifdef CONFIG_KAISER /* Unconditionally use kernel CR3 for do_nmi() */ /* %rax is saved above, so OK to clobber here */ - movq %cr3, %rax + ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ orq x86_cr3_pcid_noflush, %rax pushq %rax /* mask off "user" bit of pgd address and 12 PCID bits: */ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax movq %rax, %cr3 +2: #endif /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ @@ -1577,8 +1579,7 @@ end_repeat_nmi: * kernel code that needs user CR3, like just just before * a sysret. 
*/ - popq %rax - mov %rax, %cr3 + ALTERNATIVE "", "popq %rax; movq %rax, %cr3", X86_FEATURE_KAISER #endif testl %ebx, %ebx /* swapgs needed? */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 72a4343b774f..3fc40a43578f 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -200,6 +200,9 @@ #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ +/* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */ + /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ #define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 3dc5f4c39b3e..96643a9c194c 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -46,28 +46,33 @@ movq \reg, %cr3 .endm .macro SWITCH_KERNEL_CR3 -pushq %rax +ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER _SWITCH_TO_KERNEL_CR3 %rax popq %rax +8: .endm .macro SWITCH_USER_CR3 -pushq %rax +ALTERNATIVE "jmp 8f", "pushq %rax", X86_FEATURE_KAISER _SWITCH_TO_USER_CR3 %rax %al popq %rax +8: .endm .macro SWITCH_KERNEL_CR3_NO_STACK -movq %rax, PER_CPU_VAR(unsafe_stack_register_backup) +ALTERNATIVE "jmp 8f", \ + __stringify(movq %rax, PER_CPU_VAR(unsafe_stack_register_backup)), \ + X86_FEATURE_KAISER _SWITCH_TO_KERNEL_CR3 %rax movq PER_CPU_VAR(unsafe_stack_register_backup), %rax +8: .endm #else /* CONFIG_KAISER */ -.macro SWITCH_KERNEL_CR3 reg +.macro SWITCH_KERNEL_CR3 .endm -.macro SWITCH_USER_CR3 reg regb +.macro SWITCH_USER_CR3 .endm .macro SWITCH_KERNEL_CR3_NO_STACK .endm @@ -90,6 +95,16 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; +extern int kaiser_enabled; +#else +#define kaiser_enabled 0 +#endif /* CONFIG_KAISER */ + +/* + * Kaiser function prototypes are needed even when CONFIG_KAISER is not set, + * so as to build with tests on kaiser_enabled instead of #ifdefs. + */ + /** * kaiser_add_mapping - map a virtual memory part to the shadow (user) mapping * @addr: the start address of the range @@ -119,8 +134,6 @@ extern void kaiser_remove_mapping(unsigned long start, unsigned long size); */ extern void kaiser_init(void); -#endif /* CONFIG_KAISER */ - #endif /* __ASSEMBLY */ #endif /* _ASM_X86_KAISER_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 176035fa057e..051beec179f4 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -18,6 +18,12 @@ #ifndef __ASSEMBLY__ #include +#ifdef CONFIG_KAISER +extern int kaiser_enabled; +#else +#define kaiser_enabled 0 +#endif + void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); void ptdump_walk_pgd_level_checkwx(void); @@ -660,7 +666,7 @@ static inline int pgd_bad(pgd_t pgd) * page table by accident; it will fault on the first * instruction it tries to run. See native_set_pgd(). 
*/ - if (IS_ENABLED(CONFIG_KAISER)) + if (kaiser_enabled) ignore_flags |= _PAGE_NX; return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; @@ -865,12 +871,14 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, */ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { - memcpy(dst, src, count * sizeof(pgd_t)); + memcpy(dst, src, count * sizeof(pgd_t)); #ifdef CONFIG_KAISER - /* Clone the shadow pgd part as well */ - memcpy(native_get_shadow_pgd(dst), - native_get_shadow_pgd(src), - count * sizeof(pgd_t)); + if (kaiser_enabled) { + /* Clone the shadow pgd part as well */ + memcpy(native_get_shadow_pgd(dst), + native_get_shadow_pgd(src), + count * sizeof(pgd_t)); + } #endif } diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 8629be9e6649..233d19c9f22e 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -111,13 +111,12 @@ extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) { +#ifdef CONFIG_DEBUG_VM + /* linux/mmdebug.h may not have been included at this point */ + BUG_ON(!kaiser_enabled); +#endif return (pgd_t *)((unsigned long)pgdp | (unsigned long)PAGE_SIZE); } - -static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) -{ - return (pgd_t *)((unsigned long)pgdp & ~(unsigned long)PAGE_SIZE); -} #else static inline pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) { @@ -128,10 +127,6 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) BUILD_BUG_ON(1); return NULL; } -static inline pgd_t *native_get_normal_pgd(pgd_t *pgdp) -{ - return pgdp; -} #endif /* CONFIG_KAISER */ static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 22704eca1900..22547baa458a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -39,11 +39,7 @@ #define _PAGE_ACCESSED (_AT(pteval_t, 1) << _PAGE_BIT_ACCESSED) #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) -#ifdef CONFIG_KAISER -#define _PAGE_GLOBAL (_AT(pteval_t, 0)) -#else #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) -#endif #define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_SOFTW2 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index ff8c5eb95caf..b376095a1fd9 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -136,9 +136,11 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) * to avoid the need for asm/kaiser.h in unexpected places. 
*/ #ifdef CONFIG_KAISER +extern int kaiser_enabled; extern void kaiser_setup_pcid(void); extern void kaiser_flush_tlb_on_return_to_user(void); #else +#define kaiser_enabled 0 static inline void kaiser_setup_pcid(void) { } @@ -163,7 +165,7 @@ static inline void __native_flush_tlb(void) * back: */ preempt_disable(); - if (this_cpu_has(X86_FEATURE_PCID)) + if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) kaiser_flush_tlb_on_return_to_user(); native_write_cr3(native_read_cr3()); preempt_enable(); @@ -174,20 +176,30 @@ static inline void __native_flush_tlb_global_irq_disabled(void) unsigned long cr4; cr4 = this_cpu_read(cpu_tlbstate.cr4); - /* clear PGE */ - native_write_cr4(cr4 & ~X86_CR4_PGE); - /* write old PGE again and flush TLBs */ - native_write_cr4(cr4); + if (cr4 & X86_CR4_PGE) { + /* clear PGE and flush TLB of all entries */ + native_write_cr4(cr4 & ~X86_CR4_PGE); + /* restore PGE as it was before */ + native_write_cr4(cr4); + } else { + /* + * x86_64 microcode update comes this way when CR4.PGE is not + * enabled, and it's safer for all callers to allow this case. + */ + native_write_cr3(native_read_cr3()); + } } static inline void __native_flush_tlb_global(void) { -#ifdef CONFIG_KAISER - /* Globals are not used at all */ - __native_flush_tlb(); -#else unsigned long flags; + if (kaiser_enabled) { + /* Globals are not used at all */ + __native_flush_tlb(); + return; + } + if (this_cpu_has(X86_FEATURE_INVPCID)) { /* * Using INVPCID is considerably faster than a pair of writes @@ -207,7 +219,6 @@ static inline void __native_flush_tlb_global(void) raw_local_irq_save(flags); __native_flush_tlb_global_irq_disabled(); raw_local_irq_restore(flags); -#endif } static inline void __native_flush_tlb_single(unsigned long addr) @@ -222,7 +233,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) */ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { - if (this_cpu_has(X86_FEATURE_PCID)) + if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) kaiser_flush_tlb_on_return_to_user(); asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); return; @@ -237,9 +248,9 @@ static inline void __native_flush_tlb_single(unsigned long addr) * Make sure to do only a single invpcid when KAISER is * disabled and we have only a single ASID. 
*/ - if (X86_CR3_PCID_ASID_KERN != X86_CR3_PCID_ASID_USER) - invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); - invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); + if (kaiser_enabled) + invpcid_flush_one(X86_CR3_PCID_ASID_USER, addr); + invpcid_flush_one(X86_CR3_PCID_ASID_KERN, addr); } static inline void __flush_tlb_all(void) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 46ad2faca9ab..86db04f0ec78 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -178,6 +178,20 @@ static int __init x86_pcid_setup(char *s) return 1; } __setup("nopcid", x86_pcid_setup); + +static int __init x86_nokaiser_setup(char *s) +{ + /* nokaiser doesn't accept parameters */ + if (s) + return -EINVAL; +#ifdef CONFIG_KAISER + kaiser_enabled = 0; + setup_clear_cpu_cap(X86_FEATURE_KAISER); + pr_info("nokaiser: KAISER feature disabled\n"); +#endif + return 0; +} +early_param("nokaiser", x86_nokaiser_setup); #endif static int __init x86_noinvpcid_setup(char *s) @@ -324,7 +338,7 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c) static void setup_pcid(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_PCID)) { - if (cpu_has(c, X86_FEATURE_PGE)) { + if (cpu_has(c, X86_FEATURE_PGE) || kaiser_enabled) { cr4_set_bits(X86_CR4_PCIDE); /* * INVPCID has two "groups" of types: @@ -747,6 +761,10 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_power = cpuid_edx(0x80000007); init_scattered_cpuid_features(c); +#ifdef CONFIG_KAISER + if (kaiser_enabled) + set_cpu_cap(c, X86_FEATURE_KAISER); +#endif } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -1406,6 +1424,14 @@ void cpu_init(void) * try to read it. */ cr4_init_shadow(); + if (!kaiser_enabled) { + /* + * secondary_startup_64() deferred setting PGE in cr4: + * probe_page_size_mask() sets it on the boot cpu, + * but it needs to be set on each secondary cpu. + */ + cr4_set_bits(X86_CR4_PGE); + } /* * Load microcode on this cpu if a valid microcode is available. diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 54de4c70cbd6..b02cb2ec6726 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -132,9 +132,10 @@ void __init init_espfix_bsp(void) * area to ensure it is mapped into the shadow user page * tables. */ - if (IS_ENABLED(CONFIG_KAISER)) + if (kaiser_enabled) { set_pgd(native_get_shadow_pgd(pgd_p), __pgd(_KERNPG_TABLE | __pa((pud_t *)espfix_pud_page))); + } /* Randomize the locations */ init_espfix_random(); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index abbbcfbb5e26..9af949ce8a69 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -183,8 +183,8 @@ ENTRY(secondary_startup_64) movq $(init_level4_pgt - __START_KERNEL_map), %rax 1: - /* Enable PAE mode and PGE */ - movl $(X86_CR4_PAE | X86_CR4_PGE), %ecx + /* Enable PAE and PSE, but defer PGE until kaiser_enabled is decided */ + movl $(X86_CR4_PAE | X86_CR4_PSE), %ecx movq %rcx, %cr4 /* Setup early boot stage 4 level pagetables. 
*/ diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index ed4b372860e4..2bd45ae91eb3 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -165,7 +165,7 @@ static void __init probe_page_size_mask(void) cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ - if (cpu_has_pge) { + if (cpu_has_pge && !kaiser_enabled) { cr4_set_bits_and_update_boot(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } else diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ec081fe0ce2c..d76ec9348cff 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -395,6 +395,16 @@ void __init cleanup_highmap(void) continue; if (vaddr < (unsigned long) _text || vaddr > end) set_pmd(pmd, __pmd(0)); + else if (kaiser_enabled) { + /* + * level2_kernel_pgt is initialized with _PAGE_GLOBAL: + * clear that now. This is not important, so long as + * CR4.PGE remains clear, but it removes an anomaly. + * Physical mapping setup below avoids _PAGE_GLOBAL + * by use of massage_pgprot() inside pfn_pte() etc. + */ + set_pmd(pmd, pmd_clear_flags(*pmd, _PAGE_GLOBAL)); + } } } diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 8a5eb5993d3c..45fd51c08a0e 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -17,7 +17,9 @@ #include #include -#ifdef CONFIG_KAISER +int kaiser_enabled __read_mostly = 1; +EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */ + __visible DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); @@ -168,8 +170,8 @@ static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) return pte_offset_kernel(pmd, address); } -int kaiser_add_user_map(const void *__start_addr, unsigned long size, - unsigned long flags) +static int kaiser_add_user_map(const void *__start_addr, unsigned long size, + unsigned long flags) { int ret = 0; pte_t *pte; @@ -178,6 +180,15 @@ int kaiser_add_user_map(const void *__start_addr, unsigned long size, unsigned long end_addr = PAGE_ALIGN(start_addr + size); unsigned long target_address; + /* + * It is convenient for callers to pass in __PAGE_KERNEL etc, + * and there is no actual harm from setting _PAGE_GLOBAL, so + * long as CR4.PGE is not set. But it is nonetheless troubling + * to see Kaiser itself setting _PAGE_GLOBAL (now that "nokaiser" + * requires that not to be #defined to 0): so mask it off here. + */ + flags &= ~_PAGE_GLOBAL; + for (; address < end_addr; address += PAGE_SIZE) { target_address = get_pa_from_mapping(address); if (target_address == -1) { @@ -264,6 +275,8 @@ void __init kaiser_init(void) { int cpu; + if (!kaiser_enabled) + return; kaiser_init_all_pgds(); for_each_possible_cpu(cpu) { @@ -312,6 +325,8 @@ void __init kaiser_init(void) /* Add a mapping to the shadow mapping, and synchronize the mappings */ int kaiser_add_mapping(unsigned long addr, unsigned long size, unsigned long flags) { + if (!kaiser_enabled) + return 0; return kaiser_add_user_map((const void *)addr, size, flags); } @@ -323,6 +338,8 @@ void kaiser_remove_mapping(unsigned long start, unsigned long size) unsigned long addr, next; pgd_t *pgd; + if (!kaiser_enabled) + return; pgd = native_get_shadow_pgd(pgd_offset_k(start)); for (addr = start; addr < end; pgd++, addr = next) { next = pgd_addr_end(addr, end); @@ -344,6 +361,8 @@ static inline bool is_userspace_pgd(pgd_t *pgdp) pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) { + if (!kaiser_enabled) + return pgd; /* * Do we need to also populate the shadow pgd? 
Check _PAGE_USER to * skip cases like kexec and EFI which make temporary low mappings. @@ -400,4 +419,3 @@ void kaiser_flush_tlb_on_return_to_user(void) X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); } EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); -#endif /* CONFIG_KAISER */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index d0a424988f82..dbc27a2b4ad5 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -341,16 +341,12 @@ static inline void _pgd_free(pgd_t *pgd) } #else -#ifdef CONFIG_KAISER /* - * Instead of one pmd, we aquire two pmds. Being order-1, it is + * Instead of one pgd, Kaiser acquires two pgds. Being order-1, it is * both 8k in size and 8k-aligned. That lets us just flip bit 12 * in a pointer to swap between the two 4k halves. */ -#define PGD_ALLOCATION_ORDER 1 -#else -#define PGD_ALLOCATION_ORDER 0 -#endif +#define PGD_ALLOCATION_ORDER kaiser_enabled static inline pgd_t *_pgd_alloc(void) { diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 90ef67a9e34b..6ac065d4230c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -39,8 +39,7 @@ static void load_new_mm_cr3(pgd_t *pgdir) { unsigned long new_mm_cr3 = __pa(pgdir); -#ifdef CONFIG_KAISER - if (this_cpu_has(X86_FEATURE_PCID)) { + if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) { /* * We reuse the same PCID for different tasks, so we must * flush all the entries for the PCID out when we change tasks. @@ -57,7 +56,6 @@ static void load_new_mm_cr3(pgd_t *pgdir) new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; kaiser_flush_tlb_on_return_to_user(); } -#endif /* CONFIG_KAISER */ /* * Caution: many callers of this function expect -- cgit v1.2.3 From dea9aa9ffae11c91285335cc3215b4f0e48e8139 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 2 Jan 2018 14:19:48 +0100 Subject: x86/kaiser: Rename and simplify X86_FEATURE_KAISER handling Concentrate it in arch/x86/mm/kaiser.c and use the upstream string "nopti". 
Signed-off-by: Borislav Petkov Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/common.c | 18 ------------------ arch/x86/mm/kaiser.c | 20 +++++++++++++++++++- 2 files changed, 19 insertions(+), 19 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 86db04f0ec78..cc154ac64f00 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -178,20 +178,6 @@ static int __init x86_pcid_setup(char *s) return 1; } __setup("nopcid", x86_pcid_setup); - -static int __init x86_nokaiser_setup(char *s) -{ - /* nokaiser doesn't accept parameters */ - if (s) - return -EINVAL; -#ifdef CONFIG_KAISER - kaiser_enabled = 0; - setup_clear_cpu_cap(X86_FEATURE_KAISER); - pr_info("nokaiser: KAISER feature disabled\n"); -#endif - return 0; -} -early_param("nokaiser", x86_nokaiser_setup); #endif static int __init x86_noinvpcid_setup(char *s) @@ -761,10 +747,6 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_power = cpuid_edx(0x80000007); init_scattered_cpuid_features(c); -#ifdef CONFIG_KAISER - if (kaiser_enabled) - set_cpu_cap(c, X86_FEATURE_KAISER); -#endif } static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 45fd51c08a0e..fb812fd865d8 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -275,8 +275,13 @@ void __init kaiser_init(void) { int cpu; - if (!kaiser_enabled) + if (!kaiser_enabled) { + setup_clear_cpu_cap(X86_FEATURE_KAISER); return; + } + + setup_force_cpu_cap(X86_FEATURE_KAISER); + kaiser_init_all_pgds(); for_each_possible_cpu(cpu) { @@ -419,3 +424,16 @@ void kaiser_flush_tlb_on_return_to_user(void) X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); } EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); + +static int __init x86_nokaiser_setup(char *s) +{ + /* nopti doesn't accept parameters */ + if (s) + return -EINVAL; + + kaiser_enabled = 0; + pr_info("Kernel/User page tables isolation: disabled\n"); + + return 0; +} +early_param("nopti", x86_nokaiser_setup); -- cgit v1.2.3 From e405a064bd7d6eca88935342ddb71057a9d6ceab Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 2 Jan 2018 14:19:48 +0100 Subject: x86/kaiser: Check boottime cmdline params AMD (and possibly other vendors) are not affected by the leak KAISER is protecting against. Keep the "nopti" for traditional reasons and add pti= like upstream. 
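Roughly, the precedence established by the patch below can be rendered as the following userspace sketch; it is deliberately simplified (the real code uses cmdline_find_option(), and the function name here is made up).

/* "pti=on" wins, "pti=off" or "nopti" disable, "pti=auto" (or nothing)
 * defers to the vendor check; AMD parts default to off.
 */
#include <stdio.h>
#include <string.h>

static int pti_wanted(const char *cmdline, int cpu_is_amd)
{
	const char *p = strstr(cmdline, "pti=");

	if (p) {
		if (!strncmp(p + 4, "on", 2))
			return 1;
		if (!strncmp(p + 4, "off", 3))
			return 0;
		/* "auto" (or anything else) falls through */
	} else if (strstr(cmdline, "nopti")) {
		return 0;
	}
	return !cpu_is_amd;
}

int main(void)
{
	printf("%d\n", pti_wanted("ro quiet pti=off", 0));	/* 0 */
	printf("%d\n", pti_wanted("ro quiet pti=on", 1));	/* 1 */
	printf("%d\n", pti_wanted("ro quiet nopti", 0));	/* 0 */
	printf("%d\n", pti_wanted("ro quiet", 1));		/* 0: AMD */
	return 0;
}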
Signed-off-by: Borislav Petkov Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 59 ++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 41 insertions(+), 18 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index fb812fd865d8..604aa48c8946 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -16,6 +16,7 @@ #include #include #include +#include int kaiser_enabled __read_mostly = 1; EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */ @@ -264,6 +265,43 @@ static void __init kaiser_init_all_pgds(void) WARN_ON(__ret); \ } while (0) +void __init kaiser_check_boottime_disable(void) +{ + bool enable = true; + char arg[5]; + int ret; + + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); + if (ret > 0) { + if (!strncmp(arg, "on", 2)) + goto enable; + + if (!strncmp(arg, "off", 3)) + goto disable; + + if (!strncmp(arg, "auto", 4)) + goto skip; + } + + if (cmdline_find_option_bool(boot_command_line, "nopti")) + goto disable; + +skip: + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + goto disable; + +enable: + if (enable) + setup_force_cpu_cap(X86_FEATURE_KAISER); + + return; + +disable: + pr_info("Kernel/User page tables isolation: disabled\n"); + kaiser_enabled = 0; + setup_clear_cpu_cap(X86_FEATURE_KAISER); +} + /* * If anything in here fails, we will likely die on one of the * first kernel->user transitions and init will die. But, we @@ -275,12 +313,10 @@ void __init kaiser_init(void) { int cpu; - if (!kaiser_enabled) { - setup_clear_cpu_cap(X86_FEATURE_KAISER); - return; - } + kaiser_check_boottime_disable(); - setup_force_cpu_cap(X86_FEATURE_KAISER); + if (!kaiser_enabled) + return; kaiser_init_all_pgds(); @@ -424,16 +460,3 @@ void kaiser_flush_tlb_on_return_to_user(void) X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); } EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); - -static int __init x86_nokaiser_setup(char *s) -{ - /* nopti doesn't accept parameters */ - if (s) - return -EINVAL; - - kaiser_enabled = 0; - pr_info("Kernel/User page tables isolation: disabled\n"); - - return 0; -} -early_param("nopti", x86_nokaiser_setup); -- cgit v1.2.3 From 2dff99eb0335f9e0817410696a180dba25ca7371 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 3 Oct 2017 20:49:04 -0700 Subject: kaiser: use ALTERNATIVE instead of x86_cr3_pcid_noflush Now that we're playing the ALTERNATIVE game, use that more efficient method: instead of user-mapping an extra page, and reading an extra cacheline each time for x86_cr3_pcid_noflush. Neel has found that __stringify(bts $X86_CR3_PCID_NOFLUSH_BIT, %rax) is a working substitute for the "bts $63, %rax" in these ALTERNATIVEs; but the one line with $63 in looks clearer, so let's stick with that. Worried about what happens with an ALTERNATIVE between the jump and jump label in another ALTERNATIVE? I was, but have checked the combinations in SWITCH_KERNEL_CR3_NO_STACK at entry_SYSCALL_64, and it does a good job. 
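For reference, the single bit set by that "bts $63" is the architectural CR3 no-flush bit: with CR4.PCIDE=1, writing CR3 with bit 63 set switches page tables without flushing the new PCID's TLB entries. A compilable toy model of the kernel-CR3 switch follows; the pgd address is made up and this is plain userspace arithmetic, not the entry code.

#include <stdio.h>
#include <stdint.h>

#define X86_CR3_PCID_NOFLUSH		(1ULL << 63)
#define X86_CR3_PCID_ASID_MASK		0xFFFULL
#define KAISER_SHADOW_PGD_OFFSET	0x1000ULL

int main(void)
{
	uint64_t pgd_pa = 0x1234000ULL;		/* made-up pgd physical address */
	uint64_t cr3 = pgd_pa | KAISER_SHADOW_PGD_OFFSET | 0x80ULL; /* user pgd + user ASID */

	/* _SWITCH_TO_KERNEL_CR3: clear the user bit and the 12 ASID bits ... */
	cr3 &= ~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET);
	/* ... then, only if PCID is in use, "bts $63" to skip the TLB flush */
	cr3 |= X86_CR3_PCID_NOFLUSH;

	printf("cr3 to load: %#llx (noflush bit: %d)\n",
	       (unsigned long long)cr3, !!(cr3 & X86_CR3_PCID_NOFLUSH));
	return 0;
}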
Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 7 ++++--- arch/x86/include/asm/kaiser.h | 6 +++--- arch/x86/mm/kaiser.c | 11 +---------- 3 files changed, 8 insertions(+), 16 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 85e30957e494..ad33073ce9d9 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1056,7 +1056,8 @@ ENTRY(paranoid_entry) jz 2f orl $2, %ebx andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax - orq x86_cr3_pcid_noflush, %rax + /* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */ + ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID movq %rax, %cr3 2: #endif @@ -1318,7 +1319,7 @@ ENTRY(nmi) /* %rax is saved above, so OK to clobber here */ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ - orq x86_cr3_pcid_noflush, %rax + ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID pushq %rax /* mask off "user" bit of pgd address and 12 PCID bits: */ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax @@ -1562,7 +1563,7 @@ end_repeat_nmi: /* %rax is saved above, so OK to clobber here */ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ - orq x86_cr3_pcid_noflush, %rax + ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID pushq %rax /* mask off "user" bit of pgd address and 12 PCID bits: */ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 96643a9c194c..906150d6094e 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -25,7 +25,8 @@ .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), \reg -orq x86_cr3_pcid_noflush, \reg +/* If PCID enabled, set X86_CR3_PCID_NOFLUSH_BIT */ +ALTERNATIVE "", "bts $63, \reg", X86_FEATURE_PCID movq \reg, %cr3 .endm @@ -39,7 +40,7 @@ movq \reg, %cr3 movq %cr3, \reg orq PER_CPU_VAR(x86_cr3_pcid_user), \reg js 9f -/* FLUSH this time, reset to NOFLUSH for next time (if PCID enabled) */ +/* If PCID enabled, FLUSH this time, reset to NOFLUSH for next time */ movb \regb, PER_CPU_VAR(x86_cr3_pcid_user+7) 9: movq \reg, %cr3 @@ -90,7 +91,6 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax */ DECLARE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); -extern unsigned long x86_cr3_pcid_noflush; DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 604aa48c8946..c7d4a1258d1d 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -32,7 +32,6 @@ DEFINE_PER_CPU_USER_MAPPED(unsigned long, unsafe_stack_register_backup); * This is also handy because systems that do not support PCIDs * just end up or'ing a 0 into their CR3, which does no harm. 
*/ -unsigned long x86_cr3_pcid_noflush __read_mostly; DEFINE_PER_CPU(unsigned long, x86_cr3_pcid_user); /* @@ -357,10 +356,6 @@ void __init kaiser_init(void) kaiser_add_user_map_early(&debug_idt_table, sizeof(gate_desc) * NR_VECTORS, __PAGE_KERNEL); - - kaiser_add_user_map_early(&x86_cr3_pcid_noflush, - sizeof(x86_cr3_pcid_noflush), - __PAGE_KERNEL); } /* Add a mapping to the shadow mapping, and synchronize the mappings */ @@ -434,18 +429,14 @@ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) void kaiser_setup_pcid(void) { - unsigned long kern_cr3 = 0; unsigned long user_cr3 = KAISER_SHADOW_PGD_OFFSET; - if (this_cpu_has(X86_FEATURE_PCID)) { - kern_cr3 |= X86_CR3_PCID_KERN_NOFLUSH; + if (this_cpu_has(X86_FEATURE_PCID)) user_cr3 |= X86_CR3_PCID_USER_NOFLUSH; - } /* * These variables are used by the entry/exit * code to change PCID and pgd and TLB flushing. */ - x86_cr3_pcid_noflush = kern_cr3; this_cpu_write(x86_cr3_pcid_user, user_cr3); } -- cgit v1.2.3 From 28c6de5441740f868a5b371804a0e8dde03757fb Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 29 Oct 2017 11:36:19 -0700 Subject: kaiser: drop is_atomic arg to kaiser_pagetable_walk() I have not observed a might_sleep() warning from setup_fixmap_gdt()'s use of kaiser_add_mapping() in our tree (why not?), but like upstream we have not provided a way for that to pass is_atomic true down to kaiser_pagetable_walk(), and at startup it's far from a likely source of trouble: so just delete the walk's is_atomic arg and might_sleep(). Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index c7d4a1258d1d..c64bfef99ee8 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -108,19 +108,13 @@ static inline unsigned long get_pa_from_mapping(unsigned long vaddr) * * Returns a pointer to a PTE on success, or NULL on failure. */ -static pte_t *kaiser_pagetable_walk(unsigned long address, bool is_atomic) +static pte_t *kaiser_pagetable_walk(unsigned long address) { pmd_t *pmd; pud_t *pud; pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); - if (is_atomic) { - gfp &= ~GFP_KERNEL; - gfp |= __GFP_HIGH | __GFP_ATOMIC; - } else - might_sleep(); - if (pgd_none(*pgd)) { WARN_ONCE(1, "All shadow pgds should have been populated"); return NULL; @@ -195,7 +189,7 @@ static int kaiser_add_user_map(const void *__start_addr, unsigned long size, ret = -EIO; break; } - pte = kaiser_pagetable_walk(address, false); + pte = kaiser_pagetable_walk(address); if (!pte) { ret = -ENOMEM; break; -- cgit v1.2.3 From 0651b3ad99dd59269e2ec883338ab8fba617e203 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 4 Nov 2017 18:23:24 -0700 Subject: kaiser: asm/tlbflush.h handle noPGE at lower level I found asm/tlbflush.h too twisty, and think it safer not to avoid __native_flush_tlb_global_irq_disabled() in the kaiser_enabled case, but instead let it handle kaiser_enabled along with cr3: it can just use __native_flush_tlb() for that, no harm in re-disabling preemption. (This is not the same change as Kirill and Dave have suggested for upstream, flipping PGE in cr4: that's neat, but needs a cpu_has_pge check; cr3 is enough for kaiser, and thought to be cheaper than cr4.) 
Also delete the X86_FEATURE_INVPCID invpcid_flush_all_nonglobals() preference from __native_flush_tlb(): unlike the invpcid_flush_all() preference in __native_flush_tlb_global(), it's not seen in upstream 4.14, and was recently reported to be surprisingly slow. Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 27 +++------------------------ 1 file changed, 3 insertions(+), 24 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index b376095a1fd9..6fdc8c399601 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -151,14 +151,6 @@ static inline void kaiser_flush_tlb_on_return_to_user(void) static inline void __native_flush_tlb(void) { - if (this_cpu_has(X86_FEATURE_INVPCID)) { - /* - * Note, this works with CR4.PCIDE=0 or 1. - */ - invpcid_flush_all_nonglobals(); - return; - } - /* * If current->mm == NULL then we borrow a mm which may change during a * task switch and therefore we must not be preempted while we write CR3 @@ -182,11 +174,8 @@ static inline void __native_flush_tlb_global_irq_disabled(void) /* restore PGE as it was before */ native_write_cr4(cr4); } else { - /* - * x86_64 microcode update comes this way when CR4.PGE is not - * enabled, and it's safer for all callers to allow this case. - */ - native_write_cr3(native_read_cr3()); + /* do it with cr3, letting kaiser flush user PCID */ + __native_flush_tlb(); } } @@ -194,12 +183,6 @@ static inline void __native_flush_tlb_global(void) { unsigned long flags; - if (kaiser_enabled) { - /* Globals are not used at all */ - __native_flush_tlb(); - return; - } - if (this_cpu_has(X86_FEATURE_INVPCID)) { /* * Using INVPCID is considerably faster than a pair of writes @@ -255,11 +238,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) static inline void __flush_tlb_all(void) { - if (cpu_has_pge) - __flush_tlb_global(); - else - __flush_tlb(); - + __flush_tlb_global(); /* * Note: if we somehow had PCID but not PGE, then this wouldn't work -- * we'd end up flushing kernel translations for the current ASID but -- cgit v1.2.3 From 8eaca4c7d9f167209a9cc568ff028c0a3b0deb2d Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sat, 4 Nov 2017 18:43:06 -0700 Subject: kaiser: kaiser_flush_tlb_on_return_to_user() check PCID Let kaiser_flush_tlb_on_return_to_user() do the X86_FEATURE_PCID check, instead of each caller doing it inline first: nobody needs to optimize for the noPCID case, it's clearer this way, and better suits later changes. Replace those no-op X86_CR3_PCID_KERN_FLUSH lines by a BUILD_BUG_ON() in load_new_mm_cr3(), in case something changes. 
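The BUILD_BUG_ON() mentioned here is the usual compile-time guard; a stripped-down standalone illustration (the kernel's real macro is more elaborate):

/* Fails to compile as soon as the tested constant becomes non-zero,
 * via the negative-array-size trick.
 */
#define BUILD_BUG_ON(cond)	((void)sizeof(char[1 - 2 * !!(cond)]))

#define X86_CR3_PCID_KERN_FLUSH	0ULL	/* currently zero, so this builds */

int main(void)
{
	BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH);	/* breaks the build if that changes */
	return 0;
}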
Signed-off-by: Hugh Dickins Acked-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/tlbflush.h | 4 ++-- arch/x86/mm/kaiser.c | 6 +++--- arch/x86/mm/tlb.c | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6fdc8c399601..73865b174090 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -157,7 +157,7 @@ static inline void __native_flush_tlb(void) * back: */ preempt_disable(); - if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) + if (kaiser_enabled) kaiser_flush_tlb_on_return_to_user(); native_write_cr3(native_read_cr3()); preempt_enable(); @@ -216,7 +216,7 @@ static inline void __native_flush_tlb_single(unsigned long addr) */ if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) { - if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) + if (kaiser_enabled) kaiser_flush_tlb_on_return_to_user(); asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); return; diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index c64bfef99ee8..9d6b7517fca5 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -436,12 +436,12 @@ void kaiser_setup_pcid(void) /* * Make a note that this cpu will need to flush USER tlb on return to user. - * Caller checks whether this_cpu_has(X86_FEATURE_PCID) before calling: - * if cpu does not, then the NOFLUSH bit will never have been set. + * If cpu does not have PCID, then the NOFLUSH bit will never have been set. */ void kaiser_flush_tlb_on_return_to_user(void) { - this_cpu_write(x86_cr3_pcid_user, + if (this_cpu_has(X86_FEATURE_PCID)) + this_cpu_write(x86_cr3_pcid_user, X86_CR3_PCID_USER_FLUSH | KAISER_SHADOW_PGD_OFFSET); } EXPORT_SYMBOL(kaiser_flush_tlb_on_return_to_user); diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 6ac065d4230c..7cad01af6dcd 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -39,7 +39,7 @@ static void load_new_mm_cr3(pgd_t *pgdir) { unsigned long new_mm_cr3 = __pa(pgdir); - if (kaiser_enabled && this_cpu_has(X86_FEATURE_PCID)) { + if (kaiser_enabled) { /* * We reuse the same PCID for different tasks, so we must * flush all the entries for the PCID out when we change tasks. @@ -50,10 +50,10 @@ static void load_new_mm_cr3(pgd_t *pgdir) * do it here, but can only be used if X86_FEATURE_INVPCID is * available - and many machines support pcid without invpcid. * - * The line below is a no-op: X86_CR3_PCID_KERN_FLUSH is now 0; - * but keep that line in there in case something changes. + * If X86_CR3_PCID_KERN_FLUSH actually added something, then it + * would be needed in the write_cr3() below - if PCIDs enabled. */ - new_mm_cr3 |= X86_CR3_PCID_KERN_FLUSH; + BUILD_BUG_ON(X86_CR3_PCID_KERN_FLUSH); kaiser_flush_tlb_on_return_to_user(); } -- cgit v1.2.3 From 3e809caffdd7beeac731feb16788873c3bdb811e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:30 +0100 Subject: x86/paravirt: Dont patch flush_tlb_single commit a035795499ca1c2bd1928808d1a156eda1420383 upstream native_flush_tlb_single() will be changed with the upcoming PAGE_TABLE_ISOLATION feature. This requires to have more code in there than INVLPG. Remove the paravirt patching for it. 
Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Reviewed-by: Juergen Gross Acked-by: Peter Zijlstra Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Cc: linux-mm@kvack.org Cc: michael.schwarz@iaik.tugraz.at Cc: moritz.lipp@iaik.tugraz.at Cc: richard.fellner@student.tugraz.at Link: https://lkml.kernel.org/r/20171204150606.828111617@linutronix.de Signed-off-by: Ingo Molnar Acked-by: Borislav Petkov Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/paravirt_patch_64.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 8aa05583bc42..0677bf8d3a42 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -9,7 +9,6 @@ DEF_NATIVE(pv_irq_ops, save_fl, "pushfq; popq %rax"); DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); -DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); @@ -62,7 +61,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); PATCH_SITE(pv_cpu_ops, clts); - PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): -- cgit v1.2.3 From e4ba212ec64109b17fb8653ccfa2ed2c6e3e8217 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Tue, 2 Jan 2018 14:19:49 +0100 Subject: kaiser: disabled on Xen PV Kaiser cannot be used on paravirtualized MMUs (namely reading and writing CR3). This does not work with KAISER as the CR3 switch from and to user space PGD would require to map the whole XEN_PV machinery into both. More importantly, enabling KAISER on Xen PV doesn't make too much sense, as PV guests use distinct %cr3 values for kernel and user already. Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 9d6b7517fca5..6a2e00a80105 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -264,6 +264,9 @@ void __init kaiser_check_boottime_disable(void) char arg[5]; int ret; + if (boot_cpu_has(X86_FEATURE_XENPV)) + goto silent_disable; + ret = cmdline_find_option(boot_command_line, "pti", arg, sizeof(arg)); if (ret > 0) { if (!strncmp(arg, "on", 2)) @@ -291,6 +294,8 @@ enable: disable: pr_info("Kernel/User page tables isolation: disabled\n"); + +silent_disable: kaiser_enabled = 0; setup_clear_cpu_cap(X86_FEATURE_KAISER); } -- cgit v1.2.3 From 7f79599df9c4a36130f7a4f6778b334a97632477 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 25 Dec 2017 13:57:16 +0100 Subject: x86/kaiser: Move feature detection up ... before the first use of kaiser_enabled as otherwise funky things happen: about to get started... 
(XEN) d0v0 Unhandled page fault fault/trap [#14, ec=0000] (XEN) Pagetable walk from ffff88022a449090: (XEN) L4[0x110] = 0000000229e0e067 0000000000001e0e (XEN) L3[0x008] = 0000000000000000 ffffffffffffffff (XEN) domain_crash_sync called from entry.S: fault at ffff82d08033fd08 entry.o#create_bounce_frame+0x135/0x14d (XEN) Domain 0 (vcpu#0) crashed on cpu#0: (XEN) ----[ Xen-4.9.1_02-3.21 x86_64 debug=n Not tainted ]---- (XEN) CPU: 0 (XEN) RIP: e033:[] (XEN) RFLAGS: 0000000000000286 EM: 1 CONTEXT: pv guest (d0v0) Signed-off-by: Borislav Petkov Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 2 ++ arch/x86/kernel/setup.c | 7 +++++++ arch/x86/mm/kaiser.c | 2 -- 3 files changed, 9 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 906150d6094e..b5e46aa683f4 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -96,8 +96,10 @@ DECLARE_PER_CPU(unsigned long, x86_cr3_pcid_user); extern char __per_cpu_user_mapped_start[], __per_cpu_user_mapped_end[]; extern int kaiser_enabled; +extern void __init kaiser_check_boottime_disable(void); #else #define kaiser_enabled 0 +static inline void __init kaiser_check_boottime_disable(void) {} #endif /* CONFIG_KAISER */ /* diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index e67b834279b2..bbaae4cf9e8e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -112,6 +112,7 @@ #include #include #include +#include /* * max_low_pfn_mapped: highest direct mapped pfn under 4GB @@ -1016,6 +1017,12 @@ void __init setup_arch(char **cmdline_p) */ init_hypervisor_platform(); + /* + * This needs to happen right after XENPV is set on xen and + * kaiser_enabled is checked below in cleanup_highmap(). + */ + kaiser_check_boottime_disable(); + x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 6a2e00a80105..d86d56f5e82e 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -311,8 +311,6 @@ void __init kaiser_init(void) { int cpu; - kaiser_check_boottime_disable(); - if (!kaiser_enabled) return; -- cgit v1.2.3 From 3e1457d6bf26d9ec300781f84cd0057e44deb45d Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Jan 2018 10:43:15 -0800 Subject: KPTI: Rename to PAGE_TABLE_ISOLATION This renames CONFIG_KAISER to CONFIG_PAGE_TABLE_ISOLATION. 
Signed-off-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/compressed/misc.h | 2 +- arch/x86/entry/entry_64.S | 12 ++++++------ arch/x86/include/asm/cpufeature.h | 2 +- arch/x86/include/asm/kaiser.h | 12 ++++++------ arch/x86/include/asm/pgtable.h | 4 ++-- arch/x86/include/asm/pgtable_64.h | 4 ++-- arch/x86/include/asm/pgtable_types.h | 2 +- arch/x86/include/asm/tlbflush.h | 2 +- arch/x86/kernel/cpu/perf_event_intel_ds.c | 4 ++-- arch/x86/kernel/head_64.S | 2 +- arch/x86/mm/Makefile | 2 +- 11 files changed, 24 insertions(+), 24 deletions(-) (limited to 'arch') diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h index 4bf52d351022..4abb284a5b9c 100644 --- a/arch/x86/boot/compressed/misc.h +++ b/arch/x86/boot/compressed/misc.h @@ -9,7 +9,7 @@ */ #undef CONFIG_PARAVIRT #undef CONFIG_PARAVIRT_SPINLOCKS -#undef CONFIG_KAISER +#undef CONFIG_PAGE_TABLE_ISOLATION #undef CONFIG_KASAN #include diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ad33073ce9d9..952b23b5d4e9 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1043,7 +1043,7 @@ ENTRY(paranoid_entry) SWAPGS xorl %ebx, %ebx 1: -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* * We might have come in between a swapgs and a SWITCH_KERNEL_CR3 * on entry, or between a SWITCH_USER_CR3 and a swapgs on exit. @@ -1083,7 +1083,7 @@ ENTRY(paranoid_exit) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF_DEBUG TRACE_IRQS_IRETQ_DEBUG -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* No ALTERNATIVE for X86_FEATURE_KAISER: paranoid_entry sets %ebx */ testl $2, %ebx /* SWITCH_USER_CR3 needed? */ jz paranoid_exit_no_switch @@ -1314,7 +1314,7 @@ ENTRY(nmi) movq %rsp, %rdi movq $-1, %rsi -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* Unconditionally use kernel CR3 for do_nmi() */ /* %rax is saved above, so OK to clobber here */ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER @@ -1328,7 +1328,7 @@ ENTRY(nmi) #endif call do_nmi -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* * Unconditionally restore CR3. I know we return to * kernel code that needs user CR3, but do we ever return @@ -1558,7 +1558,7 @@ end_repeat_nmi: 1: movq %rsp, %rdi movq $-1, %rsi -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* Unconditionally use kernel CR3 for do_nmi() */ /* %rax is saved above, so OK to clobber here */ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER @@ -1574,7 +1574,7 @@ end_repeat_nmi: /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ call do_nmi -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* * Unconditionally restore CR3. We might be returning to * kernel code that needs user CR3, like just just before diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3fc40a43578f..f6605712ca90 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -201,7 +201,7 @@ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... 
*/ -#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_KAISER w/o nokaiser */ +#define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ /* Virtualization flags: Linux defined, word 8 */ #define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index b5e46aa683f4..802bbbdfe143 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -20,7 +20,7 @@ #define KAISER_SHADOW_PGD_OFFSET 0x1000 #ifdef __ASSEMBLY__ -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION .macro _SWITCH_TO_KERNEL_CR3 reg movq %cr3, \reg @@ -69,7 +69,7 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax 8: .endm -#else /* CONFIG_KAISER */ +#else /* CONFIG_PAGE_TABLE_ISOLATION */ .macro SWITCH_KERNEL_CR3 .endm @@ -78,11 +78,11 @@ movq PER_CPU_VAR(unsafe_stack_register_backup), %rax .macro SWITCH_KERNEL_CR3_NO_STACK .endm -#endif /* CONFIG_KAISER */ +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ #else /* __ASSEMBLY__ */ -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* * Upon kernel/user mode switch, it may happen that the address * space has to be switched before the registers have been @@ -100,10 +100,10 @@ extern void __init kaiser_check_boottime_disable(void); #else #define kaiser_enabled 0 static inline void __init kaiser_check_boottime_disable(void) {} -#endif /* CONFIG_KAISER */ +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ /* - * Kaiser function prototypes are needed even when CONFIG_KAISER is not set, + * Kaiser function prototypes are needed even when CONFIG_PAGE_TABLE_ISOLATION is not set, * so as to build with tests on kaiser_enabled instead of #ifdefs. */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 051beec179f4..84c62d950023 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -18,7 +18,7 @@ #ifndef __ASSEMBLY__ #include -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION extern int kaiser_enabled; #else #define kaiser_enabled 0 @@ -872,7 +872,7 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { memcpy(dst, src, count * sizeof(pgd_t)); -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION if (kaiser_enabled) { /* Clone the shadow pgd part as well */ memcpy(native_get_shadow_pgd(dst), diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 233d19c9f22e..c810226e741a 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -106,7 +106,7 @@ static inline void native_pud_clear(pud_t *pud) native_set_pud(pud, native_make_pud(0)); } -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION extern pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd); static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) @@ -127,7 +127,7 @@ static inline pgd_t *native_get_shadow_pgd(pgd_t *pgdp) BUILD_BUG_ON(1); return NULL; } -#endif /* CONFIG_KAISER */ +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 22547baa458a..8dba273da25a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -109,7 +109,7 @@ #define X86_CR3_PCID_MASK (X86_CR3_PCID_NOFLUSH | X86_CR3_PCID_ASID_MASK) #define X86_CR3_PCID_ASID_KERN (_AC(0x0,UL)) -#if defined(CONFIG_KAISER) && defined(CONFIG_X86_64) +#if 
defined(CONFIG_PAGE_TABLE_ISOLATION) && defined(CONFIG_X86_64) /* Let X86_CR3_PCID_ASID_USER be usable for the X86_CR3_PCID_NOFLUSH bit */ #define X86_CR3_PCID_ASID_USER (_AC(0x80,UL)) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 73865b174090..a691b66cc40a 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -135,7 +135,7 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask) * Declare a couple of kaiser interfaces here for convenience, * to avoid the need for asm/kaiser.h in unexpected places. */ -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION extern int kaiser_enabled; extern void kaiser_setup_pcid(void); extern void kaiser_flush_tlb_on_return_to_user(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 4e334e3ac16f..f01b3a12dce0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -274,7 +274,7 @@ static DEFINE_PER_CPU(void *, insn_buffer); static void *dsalloc(size_t size, gfp_t flags, int node) { -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION unsigned int order = get_order(size); struct page *page; unsigned long addr; @@ -295,7 +295,7 @@ static void *dsalloc(size_t size, gfp_t flags, int node) static void dsfree(const void *buffer, size_t size) { -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION if (!buffer) return; kaiser_remove_mapping((unsigned long)buffer, size); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 9af949ce8a69..4034e905741a 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -441,7 +441,7 @@ early_idt_ripmsg: .balign PAGE_SIZE; \ GLOBAL(name) -#ifdef CONFIG_KAISER +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* * Each PGD needs to be 8k long and 8k aligned. We do not * ever go out to userspace with these, so we do not diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 978156081bed..61e6cead9c4a 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -32,4 +32,4 @@ obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o obj-$(CONFIG_X86_INTEL_MPX) += mpx.o -obj-$(CONFIG_KAISER) += kaiser.o +obj-$(CONFIG_PAGE_TABLE_ISOLATION) += kaiser.o -- cgit v1.2.3 From bfd51a4d715b6ef44bd01b9fbfc13da936f93d76 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 3 Jan 2018 10:43:32 -0800 Subject: KPTI: Report when enabled Make sure dmesg reports when KPTI is enabled. 
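The mechanism is the kernel's usual pr_fmt() convention: define the prefix once at the top of the file and every pr_info() in it picks the prefix up automatically. A minimal sketch of that pattern, assuming nothing beyond what the diff below shows; the report() helper is invented for illustration, the real calls live in arch/x86/mm/kaiser.c:

#undef pr_fmt
#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt

#include <linux/printk.h>	/* pr_info() prepends pr_fmt(fmt) to its format string */

/* illustrative only, not part of the patch */
static void report(int enabled)
{
	if (enabled)
		pr_info("enabled\n");	/* dmesg: "Kernel/User page tables isolation: enabled" */
	else
		pr_info("disabled\n");
}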
Signed-off-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index d86d56f5e82e..bf6a915f60e7 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -11,6 +11,9 @@ #include #include +#undef pr_fmt +#define pr_fmt(fmt) "Kernel/User page tables isolation: " fmt + #include #include /* to verify its kaiser declarations */ #include @@ -293,7 +296,7 @@ enable: return; disable: - pr_info("Kernel/User page tables isolation: disabled\n"); + pr_info("disabled\n"); silent_disable: kaiser_enabled = 0; @@ -353,6 +356,8 @@ void __init kaiser_init(void) kaiser_add_user_map_early(&debug_idt_table, sizeof(gate_desc) * NR_VECTORS, __PAGE_KERNEL); + + pr_info("enabled\n"); } /* Add a mapping to the shadow mapping, and synchronize the mappings */ -- cgit v1.2.3 From 64e239804e21901f1a171681269460878bb5f198 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 10 Dec 2015 19:20:19 -0800 Subject: x86, vdso, pvclock: Simplify and speed up the vdso pvclock reader commit 6b078f5de7fc0851af4102493c7b5bb07e49c4cb upstream. The pvclock vdso code was too abstracted to understand easily and excessively paranoid. Simplify it for a huge speedup. This opens the door for additional simplifications, as the vdso no longer accesses the pvti for any vcpu other than vcpu 0. Before, vclock_gettime using kvm-clock took about 45ns on my machine. With this change, it takes 29ns, which is almost as fast as the pure TSC implementation. Signed-off-by: Andy Lutomirski Reviewed-by: Paolo Bonzini Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/6b51dcc41f1b101f963945c5ec7093d72bdac429.1449702533.git.luto@kernel.org Signed-off-by: Ingo Molnar Cc: Jamie Iles Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/vdso/vclock_gettime.c | 81 ++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 35 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index ca94fa649251..c325ba1bdddf 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -78,47 +78,58 @@ static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) static notrace cycle_t vread_pvclock(int *mode) { - const struct pvclock_vsyscall_time_info *pvti; + const struct pvclock_vcpu_time_info *pvti = &get_pvti(0)->pvti; cycle_t ret; - u64 last; - u32 version; - u8 flags; - unsigned cpu, cpu1; - + u64 tsc, pvti_tsc; + u64 last, delta, pvti_system_time; + u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift; /* - * Note: hypervisor must guarantee that: - * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. - * 2. that per-CPU pvclock time info is updated if the - * underlying CPU changes. - * 3. that version is increased whenever underlying CPU - * changes. + * Note: The kernel and hypervisor must guarantee that cpu ID + * number maps 1:1 to per-CPU pvclock time info. + * + * Because the hypervisor is entirely unaware of guest userspace + * preemption, it cannot guarantee that per-CPU pvclock time + * info is updated if the underlying CPU changes or that that + * version is increased whenever underlying CPU changes. * + * On KVM, we are guaranteed that pvti updates for any vCPU are + * atomic as seen by *all* vCPUs. 
This is an even stronger + * guarantee than we get with a normal seqlock. + * + * On Xen, we don't appear to have that guarantee, but Xen still + * supplies a valid seqlock using the version field. + + * We only do pvclock vdso timing at all if + * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to + * mean that all vCPUs have matching pvti and that the TSC is + * synced, so we can just look at vCPU 0's pvti. */ - do { - cpu = __getcpu() & VGETCPU_CPU_MASK; - /* TODO: We can put vcpu id into higher bits of pvti.version. - * This will save a couple of cycles by getting rid of - * __getcpu() calls (Gleb). - */ - - pvti = get_pvti(cpu); - - version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); - - /* - * Test we're still on the cpu as well as the version. - * We could have been migrated just after the first - * vgetcpu but before fetching the version, so we - * wouldn't notice a version change. - */ - cpu1 = __getcpu() & VGETCPU_CPU_MASK; - } while (unlikely(cpu != cpu1 || - (pvti->pvti.version & 1) || - pvti->pvti.version != version)); - - if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) + + if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) { *mode = VCLOCK_NONE; + return 0; + } + + do { + version = pvti->version; + + /* This is also a read barrier, so we'll read version first. */ + tsc = rdtsc_ordered(); + + pvti_tsc_to_system_mul = pvti->tsc_to_system_mul; + pvti_tsc_shift = pvti->tsc_shift; + pvti_system_time = pvti->system_time; + pvti_tsc = pvti->tsc_timestamp; + + /* Make sure that the version double-check is last. */ + smp_rmb(); + } while (unlikely((version & 1) || version != pvti->version)); + + delta = tsc - pvti_tsc; + ret = pvti_system_time + + pvclock_scale_delta(delta, pvti_tsc_to_system_mul, + pvti_tsc_shift); /* refer to tsc.c read_tsc() comment for rationale */ last = gtod->cycle_last; -- cgit v1.2.3 From 755bd549d9328d6d1e949a0a213f9a78e84d11fc Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 10 Dec 2015 19:20:20 -0800 Subject: x86/vdso: Get pvclock data from the vvar VMA instead of the fixmap commit dac16fba6fc590fa7239676b35ed75dae4c4cd2b upstream. Signed-off-by: Andy Lutomirski Reviewed-by: Paolo Bonzini Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/9d37826fdc7e2d2809efe31d5345f97186859284.1449702533.git.luto@kernel.org Signed-off-by: Ingo Molnar Cc: Jamie Iles Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/vdso/vclock_gettime.c | 20 ++++++++------------ arch/x86/entry/vdso/vdso-layout.lds.S | 3 ++- arch/x86/entry/vdso/vdso2c.c | 3 +++ arch/x86/entry/vdso/vma.c | 13 +++++++++++++ arch/x86/include/asm/pvclock.h | 9 +++++++++ arch/x86/include/asm/vdso.h | 1 + arch/x86/kernel/kvmclock.c | 5 +++++ 7 files changed, 41 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index c325ba1bdddf..5dd363d54348 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -36,6 +36,11 @@ static notrace cycle_t vread_hpet(void) } #endif +#ifdef CONFIG_PARAVIRT_CLOCK +extern u8 pvclock_page + __attribute__((visibility("hidden"))); +#endif + #ifndef BUILD_VDSO32 #include @@ -62,23 +67,14 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) #ifdef CONFIG_PARAVIRT_CLOCK -static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) +static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void) { - const struct pvclock_vsyscall_time_info *pvti_base; - int idx = cpu / (PAGE_SIZE/PVTI_SIZE); - int offset = cpu % (PAGE_SIZE/PVTI_SIZE); - - BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END); - - pvti_base = (struct pvclock_vsyscall_time_info *) - __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx); - - return &pvti_base[offset]; + return (const struct pvclock_vsyscall_time_info *)&pvclock_page; } static notrace cycle_t vread_pvclock(int *mode) { - const struct pvclock_vcpu_time_info *pvti = &get_pvti(0)->pvti; + const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti; cycle_t ret; u64 tsc, pvti_tsc; u64 last, delta, pvti_system_time; diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index de2c921025f5..4158acc17df0 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -25,7 +25,7 @@ SECTIONS * segment. */ - vvar_start = . - 2 * PAGE_SIZE; + vvar_start = . - 3 * PAGE_SIZE; vvar_page = vvar_start; /* Place all vvars at the offsets in asm/vvar.h. */ @@ -36,6 +36,7 @@ SECTIONS #undef EMIT_VVAR hpet_page = vvar_start + PAGE_SIZE; + pvclock_page = vvar_start + 2 * PAGE_SIZE; . 
= SIZEOF_HEADERS; diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 785d9922b106..491020b2826d 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -73,6 +73,7 @@ enum { sym_vvar_start, sym_vvar_page, sym_hpet_page, + sym_pvclock_page, sym_VDSO_FAKE_SECTION_TABLE_START, sym_VDSO_FAKE_SECTION_TABLE_END, }; @@ -80,6 +81,7 @@ enum { const int special_pages[] = { sym_vvar_page, sym_hpet_page, + sym_pvclock_page, }; struct vdso_sym { @@ -91,6 +93,7 @@ struct vdso_sym required_syms[] = { [sym_vvar_start] = {"vvar_start", true}, [sym_vvar_page] = {"vvar_page", true}, [sym_hpet_page] = {"hpet_page", true}, + [sym_pvclock_page] = {"pvclock_page", true}, [sym_VDSO_FAKE_SECTION_TABLE_START] = { "VDSO_FAKE_SECTION_TABLE_START", false }, diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 64df47148160..aa828191c654 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -100,6 +100,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) .name = "[vvar]", .pages = no_pages, }; + struct pvclock_vsyscall_time_info *pvti; if (calculate_addr) { addr = vdso_addr(current->mm->start_stack, @@ -169,6 +170,18 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) } #endif + pvti = pvclock_pvti_cpu0_va(); + if (pvti && image->sym_pvclock_page) { + ret = remap_pfn_range(vma, + text_start + image->sym_pvclock_page, + __pa(pvti) >> PAGE_SHIFT, + PAGE_SIZE, + PAGE_READONLY); + + if (ret) + goto up_fail; + } + up_fail: if (ret) current->mm->context.vdso = NULL; diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index baad72e4c100..6045cef376c2 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -4,6 +4,15 @@ #include #include +#ifdef CONFIG_PARAVIRT_CLOCK +extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void); +#else +static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) +{ + return NULL; +} +#endif + /* some helper functions for xen and kvm pv clock sources */ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 756de9190aec..deabaf9759b6 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -22,6 +22,7 @@ struct vdso_image { long sym_vvar_page; long sym_hpet_page; + long sym_pvclock_page; long sym_VDSO32_NOTE_MASK; long sym___kernel_sigreturn; long sym___kernel_rt_sigreturn; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2bd81e302427..ec1b06dc82d2 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -45,6 +45,11 @@ early_param("no-kvmclock", parse_no_kvmclock); static struct pvclock_vsyscall_time_info *hv_clock; static struct pvclock_wall_clock wall_clock; +struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) +{ + return hv_clock; +} + /* * The wallclock is the time of day when we booted. Since then, some time may * have elapsed since the hypervisor wrote the data. So we try to account for -- cgit v1.2.3 From 2b24fe5c57af4d20650e1c3157305459fc5f4afc Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 11 Jan 2016 15:51:18 +0300 Subject: x86/kasan: Clear kasan_zero_page after TLB flush commit 69e0210fd01ff157d332102219aaf5c26ca8069b upstream. Currently we clear kasan_zero_page before __flush_tlb_all(). 
This works with current implementation of native_flush_tlb[_global]() because it doesn't cause do any writes to kasan shadow memory. But any subtle change made in native_flush_tlb*() could break this. Also current code seems doesn't work for paravirt guests (lguest). Only after the TLB flush we can be sure that kasan_zero_page is not used as early shadow anymore (instrumented code will not write to it). So it should cleared it only after the TLB flush. Signed-off-by: Andrey Ryabinin Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1452516679-32040-2-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar Cc: Jamie Iles Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kasan_init_64.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 4e5ac46adc9d..81ec7c02f968 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -121,11 +121,16 @@ void __init kasan_init(void) kasan_populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); - memset(kasan_zero_page, 0, PAGE_SIZE); - load_cr3(init_level4_pgt); __flush_tlb_all(); - init_task.kasan_depth = 0; + /* + * kasan_zero_page has been used as early shadow memory, thus it may + * contain some garbage. Now we can clear it, since after the TLB flush + * no one should write to it. + */ + memset(kasan_zero_page, 0, PAGE_SIZE); + + init_task.kasan_depth = 0; pr_info("KernelAddressSanitizer initialized\n"); } -- cgit v1.2.3 From b33c3c64c4786cd724ccde6fa97c87ada49f6a73 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Thu, 4 Jan 2018 13:41:55 -0800 Subject: kaiser: Set _PAGE_NX only if supported This resolves a crash if loaded under qemu + haxm under windows. See https://www.spinics.net/lists/kernel/msg2689835.html for details. Here is a boot log (the log is from chromeos-4.4, but Tao Wu says that the same log is also seen with vanilla v4.4.110-rc1). 
[ 0.712750] Freeing unused kernel memory: 552K [ 0.721821] init: Corrupted page table at address 57b029b332e0 [ 0.722761] PGD 80000000bb238067 PUD bc36a067 PMD bc369067 PTE 45d2067 [ 0.722761] Bad pagetable: 000b [#1] PREEMPT SMP [ 0.722761] Modules linked in: [ 0.722761] CPU: 1 PID: 1 Comm: init Not tainted 4.4.96 #31 [ 0.722761] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5.1-0-g8936dbb-20141113_115728-nilsson.home.kraxel.org 04/01/2014 [ 0.722761] task: ffff8800bc290000 ti: ffff8800bc28c000 task.ti: ffff8800bc28c000 [ 0.722761] RIP: 0010:[] [] __clear_user+0x42/0x67 [ 0.722761] RSP: 0000:ffff8800bc28fcf8 EFLAGS: 00010202 [ 0.722761] RAX: 0000000000000000 RBX: 00000000000001a4 RCX: 00000000000001a4 [ 0.722761] RDX: 0000000000000000 RSI: 0000000000000008 RDI: 000057b029b332e0 [ 0.722761] RBP: ffff8800bc28fd08 R08: ffff8800bc290000 R09: ffff8800bb2f4000 [ 0.722761] R10: ffff8800bc290000 R11: ffff8800bb2f4000 R12: 000057b029b332e0 [ 0.722761] R13: 0000000000000000 R14: 000057b029b33340 R15: ffff8800bb1e2a00 [ 0.722761] FS: 0000000000000000(0000) GS:ffff8800bfb00000(0000) knlGS:0000000000000000 [ 0.722761] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 0.722761] CR2: 000057b029b332e0 CR3: 00000000bb2f8000 CR4: 00000000000006e0 [ 0.722761] Stack: [ 0.722761] 000057b029b332e0 ffff8800bb95fa80 ffff8800bc28fd18 ffffffff83f4120c [ 0.722761] ffff8800bc28fe18 ffffffff83e9e7a1 ffff8800bc28fd68 0000000000000000 [ 0.722761] ffff8800bc290000 ffff8800bc290000 ffff8800bc290000 ffff8800bc290000 [ 0.722761] Call Trace: [ 0.722761] [] clear_user+0x2e/0x30 [ 0.722761] [] load_elf_binary+0xa7f/0x18f7 [ 0.722761] [] search_binary_handler+0x86/0x19c [ 0.722761] [] do_execveat_common.isra.26+0x909/0xf98 [ 0.722761] [] ? rest_init+0x87/0x87 [ 0.722761] [] do_execve+0x23/0x25 [ 0.722761] [] run_init_process+0x2b/0x2d [ 0.722761] [] kernel_init+0x6d/0xda [ 0.722761] [] ret_from_fork+0x3f/0x70 [ 0.722761] [] ? rest_init+0x87/0x87 [ 0.722761] Code: 86 84 be 12 00 00 00 e8 87 0d e8 ff 66 66 90 48 89 d8 48 c1 eb 03 4c 89 e7 83 e0 07 48 89 d9 be 08 00 00 00 31 d2 48 85 c9 74 0a <48> 89 17 48 01 f7 ff c9 75 f6 48 89 c1 85 c9 74 09 88 17 48 ff [ 0.722761] RIP [] __clear_user+0x42/0x67 [ 0.722761] RSP [ 0.722761] ---[ end trace def703879b4ff090 ]--- [ 0.722761] BUG: sleeping function called from invalid context at /mnt/host/source/src/third_party/kernel/v4.4/kernel/locking/rwsem.c:21 [ 0.722761] in_atomic(): 0, irqs_disabled(): 1, pid: 1, name: init [ 0.722761] CPU: 1 PID: 1 Comm: init Tainted: G D 4.4.96 #31 [ 0.722761] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5.1-0-g8936dbb-20141113_115728-nilsson.home.kraxel.org 04/01/2014 [ 0.722761] 0000000000000086 dcb5d76098c89836 ffff8800bc28fa30 ffffffff83f34004 [ 0.722761] ffffffff84839dc2 0000000000000015 ffff8800bc28fa40 ffffffff83d57dc9 [ 0.722761] ffff8800bc28fa68 ffffffff83d57e6a ffffffff84a53640 0000000000000000 [ 0.722761] Call Trace: [ 0.722761] [] dump_stack+0x4d/0x63 [ 0.722761] [] ___might_sleep+0x13a/0x13c [ 0.722761] [] __might_sleep+0x9f/0xa6 [ 0.722761] [] down_read+0x20/0x31 [ 0.722761] [] __blocking_notifier_call_chain+0x35/0x63 [ 0.722761] [] blocking_notifier_call_chain+0x14/0x16 [ 0.800374] usb 1-1: new full-speed USB device number 2 using uhci_hcd [ 0.722761] [] profile_task_exit+0x1a/0x1c [ 0.802309] [] do_exit+0x39/0xe7f [ 0.802309] [] ? vprintk_default+0x1d/0x1f [ 0.802309] [] ? 
printk+0x57/0x73 [ 0.802309] [] oops_end+0x80/0x85 [ 0.802309] [] pgtable_bad+0x8a/0x95 [ 0.802309] [] __do_page_fault+0x8c/0x352 [ 0.802309] [] ? file_has_perm+0xc4/0xe5 [ 0.802309] [] do_page_fault+0xc/0xe [ 0.802309] [] page_fault+0x22/0x30 [ 0.802309] [] ? __clear_user+0x42/0x67 [ 0.802309] [] ? __clear_user+0x23/0x67 [ 0.802309] [] clear_user+0x2e/0x30 [ 0.802309] [] load_elf_binary+0xa7f/0x18f7 [ 0.802309] [] search_binary_handler+0x86/0x19c [ 0.802309] [] do_execveat_common.isra.26+0x909/0xf98 [ 0.802309] [] ? rest_init+0x87/0x87 [ 0.802309] [] do_execve+0x23/0x25 [ 0.802309] [] run_init_process+0x2b/0x2d [ 0.802309] [] kernel_init+0x6d/0xda [ 0.802309] [] ret_from_fork+0x3f/0x70 [ 0.802309] [] ? rest_init+0x87/0x87 [ 0.830559] Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009 [ 0.830559] [ 0.831305] Kernel Offset: 0x2c00000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [ 0.831305] ---[ end Kernel panic - not syncing: Attempted to kill init! exitcode=0x00000009 The crash part of this problem may be solved with the following patch (thanks to Hugh for the hint). There is still another problem, though - with this patch applied, the qemu session aborts with "VCPU Shutdown request", whatever that means. Cc: lepton Signed-off-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index bf6a915f60e7..b0b3a69f1c7f 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -414,7 +414,8 @@ pgd_t kaiser_set_shadow_pgd(pgd_t *pgdp, pgd_t pgd) * get out to userspace running on the kernel CR3, * userspace will crash instead of running. */ - pgd.pgd |= _PAGE_NX; + if (__supported_pte_mask & _PAGE_NX) + pgd.pgd |= _PAGE_NX; } } else if (!pgd.pgd) { /* -- cgit v1.2.3 From b17b901f0fea8e4b39711841d095f50d41bb081e Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Mon, 11 Jan 2016 15:51:19 +0300 Subject: x86/kasan: Write protect kasan zero shadow commit 063fb3e56f6dd29b2633b678b837e1d904200e6f upstream. After kasan_init() executed, no one is allowed to write to kasan_zero_page, so write protect it. Signed-off-by: Andrey Ryabinin Reviewed-by: Borislav Petkov Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/1452516679-32040-3-git-send-email-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kasan_init_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 81ec7c02f968..fdfa25c83119 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -126,10 +126,16 @@ void __init kasan_init(void) /* * kasan_zero_page has been used as early shadow memory, thus it may - * contain some garbage. Now we can clear it, since after the TLB flush - * no one should write to it. + * contain some garbage. Now we can clear and write protect it, since + * after the TLB flush no one should write to it. 
*/ memset(kasan_zero_page, 0, PAGE_SIZE); + for (i = 0; i < PTRS_PER_PTE; i++) { + pte_t pte = __pte(__pa(kasan_zero_page) | __PAGE_KERNEL_RO); + set_pte(&kasan_zero_pte[i], pte); + } + /* Flush TLBs again to be sure that write protection applied. */ + __flush_tlb_all(); init_task.kasan_depth = 0; pr_info("KernelAddressSanitizer initialized\n"); -- cgit v1.2.3 From 58330ec2fecd1c3a6b8759b292f32f82dfd058ba Mon Sep 17 00:00:00 2001 From: Thiago Rafael Becker Date: Thu, 14 Dec 2017 15:33:12 -0800 Subject: kernel: make groups_sort calling a responsibility group_info allocators commit bdcf0a423ea1c40bbb40e7ee483b50fc8aa3d758 upstream. In testing, we found that nfsd threads may call set_groups in parallel for the same entry cached in auth.unix.gid, racing in the call of groups_sort, corrupting the groups for that entry and leading to permission denials for the client. This patch: - Make groups_sort globally visible. - Move the call to groups_sort to the modifiers of group_info - Remove the call to groups_sort from set_groups Link: http://lkml.kernel.org/r/20171211151420.18655-1-thiago.becker@gmail.com Signed-off-by: Thiago Rafael Becker Reviewed-by: Matthew Wilcox Reviewed-by: NeilBrown Acked-by: "J. Bruce Fields" Cc: Al Viro Cc: Martin Schwidefsky Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/s390/kernel/compat_linux.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 437e61159279..0176ebc97bfd 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -263,6 +263,7 @@ COMPAT_SYSCALL_DEFINE2(s390_setgroups16, int, gidsetsize, u16 __user *, grouplis return retval; } + groups_sort(group_info); retval = set_current_groups(group_info); put_group_info(group_info); -- cgit v1.2.3 From 30ce9c8dbc42c220f0562f687dbbbcec2f2a9c11 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 8 Dec 2017 08:26:58 -0800 Subject: ARC: uaccess: dont use "l" gcc inline asm constraint modifier commit 79435ac78d160e4c245544d457850a56f805ac0d upstream. This used to setup the LP_COUNT register automatically, but now has been removed. There was an earlier fix 3c7c7a2fc8811 which fixed instance in delay.h but somehow missed this one as gcc change had not made its way into production toolchains and was not pedantic as it is now ! 
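Condensed, the change is a standard inline-asm pattern: since the "l" constraint no longer sets up LP_COUNT automatically, pass the count in an ordinary register, move it into lp_count inside the asm body, and list every loop register the body now touches in the clobber list. A before/after sketch of just those lines, lifted from the hunk that follows (not a standalone routine):

/*
 * before - relied on the removed "l" constraint modifier to load LP_COUNT:
 *
 *	__asm__ __volatile__(
 *	"	lp	3f			\n"
 *	...
 *	: "+r"(res), "+r"(dst), "+r"(src), "=r"(val)
 *	: "g"(-EFAULT), "l"(count)
 *	: "memory");
 *
 * after - load lp_count explicitly and declare the loop registers clobbered:
 *
 *	__asm__ __volatile__(
 *	"	mov	lp_count, %5		\n"
 *	"	lp	3f			\n"
 *	...
 *	: "+r"(res), "+r"(dst), "+r"(src), "=r"(val)
 *	: "g"(-EFAULT), "r"(count)
 *	: "lp_count", "lp_start", "lp_end", "memory");
 */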
Signed-off-by: Vineet Gupta Signed-off-by: Greg Kroah-Hartman --- arch/arc/include/asm/uaccess.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arc/include/asm/uaccess.h b/arch/arc/include/asm/uaccess.h index d4d8df706efa..57387b567f34 100644 --- a/arch/arc/include/asm/uaccess.h +++ b/arch/arc/include/asm/uaccess.h @@ -673,6 +673,7 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count) return 0; __asm__ __volatile__( + " mov lp_count, %5 \n" " lp 3f \n" "1: ldb.ab %3, [%2, 1] \n" " breq.d %3, 0, 3f \n" @@ -689,8 +690,8 @@ __arc_strncpy_from_user(char *dst, const char __user *src, long count) " .word 1b, 4b \n" " .previous \n" : "+r"(res), "+r"(dst), "+r"(src), "=r"(val) - : "g"(-EFAULT), "l"(count) - : "memory"); + : "g"(-EFAULT), "r"(count) + : "lp_count", "lp_start", "lp_end", "memory"); return res; } -- cgit v1.2.3 From 3db597feef0804f4b76f9425ded53d628f58bd32 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Thu, 30 Nov 2017 16:46:40 -0600 Subject: x86/microcode/AMD: Add support for fam17h microcode loading commit f4e9b7af0cd58dd039a0fb2cd67d57cea4889abf upstream. The size for the Microcode Patch Block (MPB) for an AMD family 17h processor is 3200 bytes. Add a #define for fam17h so that it does not default to 2048 bytes and fail a microcode load/update. Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Link: https://lkml.kernel.org/r/20171130224640.15391.40247.stgit@tlendack-t1.amdoffice.net Signed-off-by: Ingo Molnar Cc: Alice Ferrazzi Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/microcode/amd.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 2233f8a76615..2a0f44d225fe 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -580,6 +580,7 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, #define F14H_MPB_MAX_SIZE 1824 #define F15H_MPB_MAX_SIZE 4096 #define F16H_MPB_MAX_SIZE 3458 +#define F17H_MPB_MAX_SIZE 3200 switch (family) { case 0x14: @@ -591,6 +592,9 @@ static unsigned int verify_patch_size(u8 family, u32 patch_size, case 0x16: max_size = F16H_MPB_MAX_SIZE; break; + case 0x17: + max_size = F17H_MPB_MAX_SIZE; + break; default: max_size = F1XH_MPB_MAX_SIZE; break; -- cgit v1.2.3 From d5bbffc0501de51c1df62bb907bbeb3dfb378588 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Tue, 2 Jan 2018 20:36:44 +0100 Subject: parisc: Fix alignment of pa_tlb_lock in assembly on 32-bit SMP kernel commit 88776c0e70be0290f8357019d844aae15edaa967 upstream. Qemu for PARISC reported on a 32bit SMP parisc kernel strange failures about "Not-handled unaligned insn 0x0e8011d6 and 0x0c2011c9." Those opcodes evaluate to the ldcw() assembly instruction which requires (on 32bit) an alignment of 16 bytes to ensure atomicity. As it turns out, qemu is correct and in our assembly code in entry.S and pacache.S we don't pay attention to the required alignment. This patch fixes the problem by aligning the lock offset in assembly code in the same manner as we do in our C-code. 
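For reference, the C side already gets this right via the __ldcw_align() macro, which rounds the lock-word address up to the next 16-byte boundary; the new load_pa_tlb_lock / tlb_lock assembly below does the same rounding with an add of __PA_LDCW_ALIGNMENT-1 followed by a depi that clears the low __PA_LDCW_ALIGN_ORDER bits. A minimal C sketch of that computation, assuming the 32-bit SMP case (alignment of 16); the helper name is invented for illustration:

#define __PA_LDCW_ALIGNMENT	16

static inline unsigned long pa_ldcw_align(unsigned long lock_addr)
{
	/* add 15, then clear the low four bits - i.e. round up to 16 bytes */
	return (lock_addr + __PA_LDCW_ALIGNMENT - 1) &
	       ~((unsigned long)__PA_LDCW_ALIGNMENT - 1);
}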
Signed-off-by: Helge Deller Signed-off-by: Greg Kroah-Hartman --- arch/parisc/include/asm/ldcw.h | 2 ++ arch/parisc/kernel/entry.S | 13 +++++++++++-- arch/parisc/kernel/pacache.S | 9 +++++++-- 3 files changed, 20 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/parisc/include/asm/ldcw.h b/arch/parisc/include/asm/ldcw.h index 8121aa6db2ff..51bb6b8eade6 100644 --- a/arch/parisc/include/asm/ldcw.h +++ b/arch/parisc/include/asm/ldcw.h @@ -11,6 +11,7 @@ for the semaphore. */ #define __PA_LDCW_ALIGNMENT 16 +#define __PA_LDCW_ALIGN_ORDER 4 #define __ldcw_align(a) ({ \ unsigned long __ret = (unsigned long) &(a)->lock[0]; \ __ret = (__ret + __PA_LDCW_ALIGNMENT - 1) \ @@ -28,6 +29,7 @@ ldcd). */ #define __PA_LDCW_ALIGNMENT 4 +#define __PA_LDCW_ALIGN_ORDER 2 #define __ldcw_align(a) (&(a)->slock) #define __LDCW "ldcw,co" diff --git a/arch/parisc/kernel/entry.S b/arch/parisc/kernel/entry.S index 623496c11756..5dc831955de5 100644 --- a/arch/parisc/kernel/entry.S +++ b/arch/parisc/kernel/entry.S @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -46,6 +47,14 @@ #endif .import pa_tlb_lock,data + .macro load_pa_tlb_lock reg +#if __PA_LDCW_ALIGNMENT > 4 + load32 PA(pa_tlb_lock) + __PA_LDCW_ALIGNMENT-1, \reg + depi 0,31,__PA_LDCW_ALIGN_ORDER, \reg +#else + load32 PA(pa_tlb_lock), \reg +#endif + .endm /* space_to_prot macro creates a prot id from a space id */ @@ -457,7 +466,7 @@ .macro tlb_lock spc,ptp,pte,tmp,tmp1,fault #ifdef CONFIG_SMP cmpib,COND(=),n 0,\spc,2f - load32 PA(pa_tlb_lock),\tmp + load_pa_tlb_lock \tmp 1: LDCW 0(\tmp),\tmp1 cmpib,COND(=) 0,\tmp1,1b nop @@ -480,7 +489,7 @@ /* Release pa_tlb_lock lock. */ .macro tlb_unlock1 spc,tmp #ifdef CONFIG_SMP - load32 PA(pa_tlb_lock),\tmp + load_pa_tlb_lock \tmp tlb_unlock0 \spc,\tmp #endif .endm diff --git a/arch/parisc/kernel/pacache.S b/arch/parisc/kernel/pacache.S index a4761b772406..16073f472118 100644 --- a/arch/parisc/kernel/pacache.S +++ b/arch/parisc/kernel/pacache.S @@ -36,6 +36,7 @@ #include #include #include +#include #include .text @@ -333,8 +334,12 @@ ENDPROC(flush_data_cache_local) .macro tlb_lock la,flags,tmp #ifdef CONFIG_SMP - ldil L%pa_tlb_lock,%r1 - ldo R%pa_tlb_lock(%r1),\la +#if __PA_LDCW_ALIGNMENT > 4 + load32 pa_tlb_lock + __PA_LDCW_ALIGNMENT-1, \la + depi 0,31,__PA_LDCW_ALIGN_ORDER, \la +#else + load32 pa_tlb_lock, \la +#endif rsm PSW_SM_I,\flags 1: LDCW 0(\la),\tmp cmpib,<>,n 0,\tmp,3f -- cgit v1.2.3 From a4c1c75373bf17f185edf3d8b2a64c50c500c785 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Jan 2018 22:19:04 +0100 Subject: x86/tlb: Drop the _GPL from the cpu_tlbstate export commit 1e5476815fd7f98b888e01a0f9522b63085f96c9 upstream. The recent changes for PTI touch cpu_tlbstate from various tlb_flush inlines. cpu_tlbstate is exported as GPL symbol, so this causes a regression when building out of tree drivers for certain graphics cards. Aside of that the export was wrong since it was introduced as it should have been EXPORT_PER_CPU_SYMBOL_GPL(). Use the correct PER_CPU export and drop the _GPL to restore the previous state which allows users to utilize the cards they payed for. As always I'm really thrilled to make this kind of change to support the #friends (or however the hot hashtag of today is spelled) from that closet sauce graphics corp. 
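In macro terms this is a one-line swap, but the distinction matters twice over: per-CPU variables have their own EXPORT_PER_CPU_SYMBOL*() family, and dropping the _GPL suffix is what re-opens the symbol to non-GPL modules. A hedged sketch with a made-up per-CPU variable (example_state is not a real kernel symbol):

#include <linux/percpu.h>
#include <linux/export.h>

DEFINE_PER_CPU(unsigned long, example_state);

/* EXPORT_SYMBOL_GPL(example_state);	wrong macro family for per-CPU data,
 *					and GPL-only on top of that */
EXPORT_PER_CPU_SYMBOL(example_state);	/* per-CPU aware, visible to all modules */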
Fixes: 1e02ce4cccdc ("x86: Store a per-cpu shadow copy of CR4") Fixes: 6fd166aae78c ("x86/mm: Use/Fix PCID to optimize user/kernel switches") Reported-by: Kees Cook Signed-off-by: Thomas Gleixner Cc: Greg Kroah-Hartman Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Thomas Backlund Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 2bd45ae91eb3..151fd33e9043 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -757,7 +757,7 @@ DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { .state = 0, .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ }; -EXPORT_SYMBOL_GPL(cpu_tlbstate); +EXPORT_PER_CPU_SYMBOL(cpu_tlbstate); void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) { -- cgit v1.2.3 From 6dcf5491e01c3d1135497d0661bb5b35a126b9d8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 4 Jan 2018 17:42:45 +0100 Subject: Map the vsyscall page with _PAGE_USER This needs to happen early in kaiser_pagetable_walk(), before the hierarchy is established so that _PAGE_USER permission can be really set. A proper fix would be to teach kaiser_pagetable_walk() to update those permissions but the vsyscall page is the only exception here so ... Signed-off-by: Borislav Petkov Acked-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/vsyscall/vsyscall_64.c | 5 +++++ arch/x86/include/asm/vsyscall.h | 2 ++ arch/x86/mm/kaiser.c | 34 ++++++++++++++++++++++++++++++---- 3 files changed, 37 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 174c2549939d..112178b401a1 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -66,6 +66,11 @@ static int __init vsyscall_setup(char *str) } early_param("vsyscall", vsyscall_setup); +bool vsyscall_enabled(void) +{ + return vsyscall_mode != NONE; +} + static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, const char *message) { diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index 6ba66ee79710..4865e10dbb55 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -12,12 +12,14 @@ extern void map_vsyscall(void); * Returns true if handled. */ extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); +extern bool vsyscall_enabled(void); #else static inline void map_vsyscall(void) {} static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) { return false; } +static inline bool vsyscall_enabled(void) { return false; } #endif #endif /* _ASM_X86_VSYSCALL_H */ diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index b0b3a69f1c7f..6a7a77929a8c 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -20,6 +20,7 @@ #include #include #include +#include int kaiser_enabled __read_mostly = 1; EXPORT_SYMBOL(kaiser_enabled); /* for inlined TLB flush functions */ @@ -111,12 +112,13 @@ static inline unsigned long get_pa_from_mapping(unsigned long vaddr) * * Returns a pointer to a PTE on success, or NULL on failure. 
*/ -static pte_t *kaiser_pagetable_walk(unsigned long address) +static pte_t *kaiser_pagetable_walk(unsigned long address, bool user) { pmd_t *pmd; pud_t *pud; pgd_t *pgd = native_get_shadow_pgd(pgd_offset_k(address)); gfp_t gfp = (GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO); + unsigned long prot = _KERNPG_TABLE; if (pgd_none(*pgd)) { WARN_ONCE(1, "All shadow pgds should have been populated"); @@ -124,6 +126,17 @@ static pte_t *kaiser_pagetable_walk(unsigned long address) } BUILD_BUG_ON(pgd_large(*pgd) != 0); + if (user) { + /* + * The vsyscall page is the only page that will have + * _PAGE_USER set. Catch everything else. + */ + BUG_ON(address != VSYSCALL_ADDR); + + set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); + prot = _PAGE_TABLE; + } + pud = pud_offset(pgd, address); /* The shadow page tables do not use large mappings: */ if (pud_large(*pud)) { @@ -136,7 +149,7 @@ static pte_t *kaiser_pagetable_walk(unsigned long address) return NULL; spin_lock(&shadow_table_allocation_lock); if (pud_none(*pud)) { - set_pud(pud, __pud(_KERNPG_TABLE | __pa(new_pmd_page))); + set_pud(pud, __pud(prot | __pa(new_pmd_page))); __inc_zone_page_state(virt_to_page((void *) new_pmd_page), NR_KAISERTABLE); } else @@ -156,7 +169,7 @@ static pte_t *kaiser_pagetable_walk(unsigned long address) return NULL; spin_lock(&shadow_table_allocation_lock); if (pmd_none(*pmd)) { - set_pmd(pmd, __pmd(_KERNPG_TABLE | __pa(new_pte_page))); + set_pmd(pmd, __pmd(prot | __pa(new_pte_page))); __inc_zone_page_state(virt_to_page((void *) new_pte_page), NR_KAISERTABLE); } else @@ -192,7 +205,7 @@ static int kaiser_add_user_map(const void *__start_addr, unsigned long size, ret = -EIO; break; } - pte = kaiser_pagetable_walk(address); + pte = kaiser_pagetable_walk(address, flags & _PAGE_USER); if (!pte) { ret = -ENOMEM; break; @@ -319,6 +332,19 @@ void __init kaiser_init(void) kaiser_init_all_pgds(); + /* + * Note that this sets _PAGE_USER and it needs to happen when the + * pagetable hierarchy gets created, i.e., early. Otherwise + * kaiser_pagetable_walk() will encounter initialized PTEs in the + * hierarchy and not set the proper permissions, leading to the + * pagefaults with page-protection violations when trying to read the + * vsyscall page. For example. + */ + if (vsyscall_enabled()) + kaiser_add_user_map_early((void *)VSYSCALL_ADDR, + PAGE_SIZE, + __PAGE_KERNEL_VSYSCALL); + for_each_possible_cpu(cpu) { void *percpu_vaddr = __per_cpu_user_mapped_start + per_cpu_offset(cpu); -- cgit v1.2.3 From 516fa79e77f7c4490ded10e7e1c36758482bde5a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 9 Jan 2018 10:24:02 +0100 Subject: Fix build error in vma.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes the following much-reported build issue: arch/x86/entry/vdso/vma.c: In function ‘map_vdso’: arch/x86/entry/vdso/vma.c:175:9: error: implicit declaration of function ‘pvclock_pvti_cpu0_va’ on some arches and configurations. 
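The error is the classic missing-declaration pattern: vma.c started calling pvclock_pvti_cpu0_va(), added earlier in this series, without pulling in the header that declares it, so on configurations where nothing else includes that header gcc only sees an implicit declaration. A minimal sketch of the fixed shape, assuming the declaration lives in asm/pvclock.h; the wrapper function is invented for illustration:

#include <asm/pvclock.h>	/* declares pvclock_pvti_cpu0_va() */

static struct pvclock_vsyscall_time_info *example_pvti0(void)
{
	return pvclock_pvti_cpu0_va();	/* now properly declared, no implicit-declaration error */
}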
Thanks to Guenter for being persistent enough to get it fixed :) Reported-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/vdso/vma.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index aa828191c654..b8f69e264ac4 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3 From eb91461daa77eb0ddb4c24aa427051f3669ba1f3 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 14 Dec 2017 17:40:50 -0800 Subject: KVM: Fix stack-out-of-bounds read in write_mmio MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e39d200fa5bf5b94a0948db0dae44c1b73b84a56 upstream. Reported by syzkaller: BUG: KASAN: stack-out-of-bounds in write_mmio+0x11e/0x270 [kvm] Read of size 8 at addr ffff8803259df7f8 by task syz-executor/32298 CPU: 6 PID: 32298 Comm: syz-executor Tainted: G OE 4.15.0-rc2+ #18 Hardware name: LENOVO ThinkCentre M8500t-N000/SHARKBAY, BIOS FBKTC1AUS 02/16/2016 Call Trace: dump_stack+0xab/0xe1 print_address_description+0x6b/0x290 kasan_report+0x28a/0x370 write_mmio+0x11e/0x270 [kvm] emulator_read_write_onepage+0x311/0x600 [kvm] emulator_read_write+0xef/0x240 [kvm] emulator_fix_hypercall+0x105/0x150 [kvm] em_hypercall+0x2b/0x80 [kvm] x86_emulate_insn+0x2b1/0x1640 [kvm] x86_emulate_instruction+0x39a/0xb90 [kvm] handle_exception+0x1b4/0x4d0 [kvm_intel] vcpu_enter_guest+0x15a0/0x2640 [kvm] kvm_arch_vcpu_ioctl_run+0x549/0x7d0 [kvm] kvm_vcpu_ioctl+0x479/0x880 [kvm] do_vfs_ioctl+0x142/0x9a0 SyS_ioctl+0x74/0x80 entry_SYSCALL_64_fastpath+0x23/0x9a The path of patched vmmcall will patch 3 bytes opcode 0F 01 C1(vmcall) to the guest memory, however, write_mmio tracepoint always prints 8 bytes through *(u64 *)val since kvm splits the mmio access into 8 bytes. This leaks 5 bytes from the kernel stack (CVE-2017-17741). This patch fixes it by just accessing the bytes which we operate on. Before patch: syz-executor-5567 [007] .... 51370.561696: kvm_mmio: mmio write len 3 gpa 0x10 val 0x1ffff10077c1010f After patch: syz-executor-13416 [002] .... 
51302.299573: kvm_mmio: mmio write len 3 gpa 0x10 val 0xc1010f Reported-by: Dmitry Vyukov Reviewed-by: Darren Kenny Reviewed-by: Marc Zyngier Tested-by: Marc Zyngier Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Marc Zyngier Cc: Christoffer Dall Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Cc: Mathieu Desnoyers Signed-off-by: Greg Kroah-Hartman --- arch/arm/kvm/mmio.c | 6 +++--- arch/x86/kvm/x86.c | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/arm/kvm/mmio.c b/arch/arm/kvm/mmio.c index 3a10c9f1d0a4..387ee2a11e36 100644 --- a/arch/arm/kvm/mmio.c +++ b/arch/arm/kvm/mmio.c @@ -113,7 +113,7 @@ int kvm_handle_mmio_return(struct kvm_vcpu *vcpu, struct kvm_run *run) } trace_kvm_mmio(KVM_TRACE_MMIO_READ, len, run->mmio.phys_addr, - data); + &data); data = vcpu_data_host_to_guest(vcpu, data, len); vcpu_set_reg(vcpu, vcpu->arch.mmio_decode.rt, data); } @@ -189,14 +189,14 @@ int io_mem_abort(struct kvm_vcpu *vcpu, struct kvm_run *run, data = vcpu_data_guest_to_host(vcpu, vcpu_get_reg(vcpu, rt), len); - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, data); + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, len, fault_ipa, &data); mmio_write_buf(data_buf, len, data); ret = kvm_io_bus_write(vcpu, KVM_MMIO_BUS, fault_ipa, len, data_buf); } else { trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, len, - fault_ipa, 0); + fault_ipa, NULL); ret = kvm_io_bus_read(vcpu, KVM_MMIO_BUS, fault_ipa, len, data_buf); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ccf17dbfea09..f973cfa8ff4f 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4114,7 +4114,7 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) addr, n, v)) && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v)) break; - trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); + trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v); handled += n; addr += n; len -= n; @@ -4362,7 +4362,7 @@ static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) { if (vcpu->mmio_read_completed) { trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, - vcpu->mmio_fragments[0].gpa, *(u64 *)val); + vcpu->mmio_fragments[0].gpa, val); vcpu->mmio_read_completed = 0; return 1; } @@ -4384,14 +4384,14 @@ static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val) { - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val); return vcpu_mmio_write(vcpu, gpa, bytes, val); } static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, void *val, int bytes) { - trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); + trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL); return X86EMUL_IO_NEEDED; } -- cgit v1.2.3 From 1e918a43cbf059da23717120e6cddd24b3e6aeb4 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 27 Nov 2017 09:33:03 +0000 Subject: MIPS: Validate PR_SET_FP_MODE prctl(2) requests against the ABI of the task commit b67336eee3fcb8ecedc6c13e2bf88aacfa3151e2 upstream. 
Fix an API loophole introduced with commit 9791554b45a2 ("MIPS,prctl: add PR_[GS]ET_FP_MODE prctl options for MIPS"), where the caller of prctl(2) is incorrectly allowed to make a change to CP0.Status.FR or CP0.Config5.FRE register bits even if CONFIG_MIPS_O32_FP64_SUPPORT has not been enabled, despite that an executable requesting the mode requested via ELF file annotation would not be allowed to run in the first place, or for n64 and n64 ABI tasks which do not have non-default modes defined at all. Add suitable checks to `mips_set_process_fp_mode' and bail out if an invalid mode change has been requested for the ABI in effect, even if the FPU hardware or emulation would otherwise allow it. Always succeed however without taking any further action if the mode requested is the same as one already in effect, regardless of whether any mode change, should it be requested, would actually be allowed for the task concerned. Signed-off-by: Maciej W. Rozycki Fixes: 9791554b45a2 ("MIPS,prctl: add PR_[GS]ET_FP_MODE prctl options for MIPS") Reviewed-by: Paul Burton Cc: James Hogan Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/17800/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/process.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch') diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 163b3449a8de..fcbc4e57d765 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -664,6 +664,18 @@ int mips_set_process_fp_mode(struct task_struct *task, unsigned int value) unsigned long switch_count; struct task_struct *t; + /* If nothing to change, return right away, successfully. */ + if (value == mips_get_process_fp_mode(task)) + return 0; + + /* Only accept a mode change if 64-bit FP enabled for o32. */ + if (!IS_ENABLED(CONFIG_MIPS_O32_FP64_SUPPORT)) + return -EOPNOTSUPP; + + /* And only for o32 tasks. */ + if (IS_ENABLED(CONFIG_64BIT) && !test_thread_flag(TIF_32BIT_REGS)) + return -EOPNOTSUPP; + /* Check the value is valid */ if (value & ~known_bits) return -EOPNOTSUPP; -- cgit v1.2.3 From b1e808b9de5c6c328c3ed660eed8382d04102116 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:51:35 +0000 Subject: MIPS: Factor out NT_PRFPREG regset access helpers commit a03fe72572c12e98f4173f8a535f32468e48b6ec upstream. In preparation to fix a commit 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") FCSR access regression factor out NT_PRFPREG regset access helpers for the non-MSA and the MSA variants respectively, to avoid having to deal with excessive indentation in the actual fix. No functional change, however use `target->thread.fpu.fpr[0]' rather than `target->thread.fpu.fpr[i]' for FGR holding type size determination as there's no `i' variable to refer to anymore, and for the factored out `i' variable declaration use `unsigned int' rather than `unsigned' as its type, following the common style. Signed-off-by: Maciej W. 
Rozycki Fixes: 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/17925/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/ptrace.c | 108 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 83 insertions(+), 25 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index a3f38e6b7ea1..436c1f4ea00a 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -439,25 +439,36 @@ static int gpr64_set(struct task_struct *target, #endif /* CONFIG_64BIT */ -static int fpr_get(struct task_struct *target, - const struct user_regset *regset, - unsigned int pos, unsigned int count, - void *kbuf, void __user *ubuf) +/* + * Copy the floating-point context to the supplied NT_PRFPREG buffer, + * !CONFIG_CPU_HAS_MSA variant. FP context's general register slots + * correspond 1:1 to buffer slots. + */ +static int fpr_get_fpa(struct task_struct *target, + unsigned int *pos, unsigned int *count, + void **kbuf, void __user **ubuf) { - unsigned i; - int err; - u64 fpr_val; - - /* XXX fcr31 */ + return user_regset_copyout(pos, count, kbuf, ubuf, + &target->thread.fpu, + 0, sizeof(elf_fpregset_t)); +} - if (sizeof(target->thread.fpu.fpr[i]) == sizeof(elf_fpreg_t)) - return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu, - 0, sizeof(elf_fpregset_t)); +/* + * Copy the floating-point context to the supplied NT_PRFPREG buffer, + * CONFIG_CPU_HAS_MSA variant. Only lower 64 bits of FP context's + * general register slots are copied to buffer slots. + */ +static int fpr_get_msa(struct task_struct *target, + unsigned int *pos, unsigned int *count, + void **kbuf, void __user **ubuf) +{ + unsigned int i; + u64 fpr_val; + int err; for (i = 0; i < NUM_FPU_REGS; i++) { fpr_val = get_fpr64(&target->thread.fpu.fpr[i], 0); - err = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + err = user_regset_copyout(pos, count, kbuf, ubuf, &fpr_val, i * sizeof(elf_fpreg_t), (i + 1) * sizeof(elf_fpreg_t)); if (err) @@ -467,27 +478,54 @@ static int fpr_get(struct task_struct *target, return 0; } -static int fpr_set(struct task_struct *target, +/* Copy the floating-point context to the supplied NT_PRFPREG buffer. */ +static int fpr_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, - const void *kbuf, const void __user *ubuf) + void *kbuf, void __user *ubuf) { - unsigned i; int err; - u64 fpr_val; /* XXX fcr31 */ - init_fp_ctx(target); + if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) + err = fpr_get_fpa(target, &pos, &count, &kbuf, &ubuf); + else + err = fpr_get_msa(target, &pos, &count, &kbuf, &ubuf); + + return err; +} - if (sizeof(target->thread.fpu.fpr[i]) == sizeof(elf_fpreg_t)) - return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu, - 0, sizeof(elf_fpregset_t)); +/* + * Copy the supplied NT_PRFPREG buffer to the floating-point context, + * !CONFIG_CPU_HAS_MSA variant. Buffer slots correspond 1:1 to FP + * context's general register slots. 
+ */ +static int fpr_set_fpa(struct task_struct *target, + unsigned int *pos, unsigned int *count, + const void **kbuf, const void __user **ubuf) +{ + return user_regset_copyin(pos, count, kbuf, ubuf, + &target->thread.fpu, + 0, sizeof(elf_fpregset_t)); +} + +/* + * Copy the supplied NT_PRFPREG buffer to the floating-point context, + * CONFIG_CPU_HAS_MSA variant. Buffer slots are copied to lower 64 + * bits only of FP context's general register slots. + */ +static int fpr_set_msa(struct task_struct *target, + unsigned int *pos, unsigned int *count, + const void **kbuf, const void __user **ubuf) +{ + unsigned int i; + u64 fpr_val; + int err; BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t)); - for (i = 0; i < NUM_FPU_REGS && count >= sizeof(elf_fpreg_t); i++) { - err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + for (i = 0; i < NUM_FPU_REGS && *count >= sizeof(elf_fpreg_t); i++) { + err = user_regset_copyin(pos, count, kbuf, ubuf, &fpr_val, i * sizeof(elf_fpreg_t), (i + 1) * sizeof(elf_fpreg_t)); if (err) @@ -498,6 +536,26 @@ static int fpr_set(struct task_struct *target, return 0; } +/* Copy the supplied NT_PRFPREG buffer to the floating-point context. */ +static int fpr_set(struct task_struct *target, + const struct user_regset *regset, + unsigned int pos, unsigned int count, + const void *kbuf, const void __user *ubuf) +{ + int err; + + /* XXX fcr31 */ + + init_fp_ctx(target); + + if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) + err = fpr_set_fpa(target, &pos, &count, &kbuf, &ubuf); + else + err = fpr_set_msa(target, &pos, &count, &kbuf, &ubuf); + + return err; +} + enum mips_regset { REGSET_GPR, REGSET_FPR, -- cgit v1.2.3 From e68049f6a9e854eace5f4d45986a198989c55d54 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:52:15 +0000 Subject: MIPS: Guard against any partial write attempt with PTRACE_SETREGSET commit dc24d0edf33c3e15099688b6bbdf7bdc24bf6e91 upstream. Complement commit d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") and ensure that no partial register write attempt is made with PTRACE_SETREGSET, as we do not preinitialize any temporaries used to hold incoming register data and consequently random data could be written. It is the responsibility of the caller, such as `ptrace_regset', to arrange for writes to span whole registers only, so here we only assert that it has indeed happened. Signed-off-by: Maciej W. Rozycki Fixes: 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/17926/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/ptrace.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index 436c1f4ea00a..cd05e34995d4 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -536,7 +536,15 @@ static int fpr_set_msa(struct task_struct *target, return 0; } -/* Copy the supplied NT_PRFPREG buffer to the floating-point context. */ +/* + * Copy the supplied NT_PRFPREG buffer to the floating-point context. + * + * We optimize for the case where `count % sizeof(elf_fpreg_t) == 0', + * which is supposed to have been guaranteed by the kernel before + * calling us, e.g. in `ptrace_regset'. 
We enforce that requirement, + * so that we can safely avoid preinitializing temporaries for + * partial register writes. + */ static int fpr_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, @@ -544,6 +552,8 @@ static int fpr_set(struct task_struct *target, { int err; + BUG_ON(count % sizeof(elf_fpreg_t)); + /* XXX fcr31 */ init_fp_ctx(target); -- cgit v1.2.3 From a6972f8bd2b6af5af070c76b8178f8fa87d44cb9 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:53:14 +0000 Subject: MIPS: Consistently handle buffer counter with PTRACE_SETREGSET commit 80b3ffce0196ea50068885d085ff981e4b8396f4 upstream. Update commit d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") bug and consistently consume all data supplied to `fpr_set_msa' with the ptrace(2) PTRACE_SETREGSET request, such that a zero data buffer counter is returned where insufficient data has been given to fill a whole number of FP general registers. In reality this is not going to happen, as the caller is supposed to only supply data covering a whole number of registers and it is verified in `ptrace_regset' and again asserted in `fpr_set', however structuring code such that the presence of trailing partial FP general register data causes `fpr_set_msa' to return with a non-zero data buffer counter makes it appear that this trailing data will be used if there are subsequent writes made to FP registers, which is going to be the case with the FCSR once the missing write to that register has been fixed. Fixes: d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") Signed-off-by: Maciej W. Rozycki Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/17927/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/ptrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index cd05e34995d4..a24a2e7ccb10 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -524,7 +524,7 @@ static int fpr_set_msa(struct task_struct *target, int err; BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t)); - for (i = 0; i < NUM_FPU_REGS && *count >= sizeof(elf_fpreg_t); i++) { + for (i = 0; i < NUM_FPU_REGS && *count > 0; i++) { err = user_regset_copyin(pos, count, kbuf, ubuf, &fpr_val, i * sizeof(elf_fpreg_t), (i + 1) * sizeof(elf_fpreg_t)); -- cgit v1.2.3 From 9584ae52bd7e39e112c56b47e8341c043ca87fd7 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:54:33 +0000 Subject: MIPS: Fix an FCSR access API regression with NT_PRFPREG and MSA commit be07a6a1188372b6d19a3307ec33211fc9c9439d upstream. Fix a commit 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") public API regression, then activated by commit 1db1af84d6df ("MIPS: Basic MSA context switching support"), that caused the FCSR register not to be read or written for CONFIG_CPU_HAS_MSA kernel configurations (regardless of actual presence or absence of the MSA feature in a given processor) with ptrace(2) PTRACE_GETREGSET and PTRACE_SETREGSET requests nor recorded in core dumps. 
This is because with !CONFIG_CPU_HAS_MSA configurations the whole of `elf_fpregset_t' array is bulk-copied as it is, which includes the FCSR in one half of the last, 33rd slot, whereas with CONFIG_CPU_HAS_MSA configurations array elements are copied individually, and then only the leading 32 FGR slots while the remaining slot is ignored. Correct the code then such that only FGR slots are copied in the respective !MSA and MSA helpers an then the FCSR slot is handled separately in common code. Use `ptrace_setfcr31' to update the FCSR too, so that the read-only mask is respected. Retrieving a correct value of FCSR is important in debugging not only for the human to be able to get the right interpretation of the situation, but for correct operation of GDB as well. This is because the condition code bits in FSCR are used by GDB to determine the location to place a breakpoint at when single-stepping through an FPU branch instruction. If such a breakpoint is placed incorrectly (i.e. with the condition reversed), then it will be missed, likely causing the debuggee to run away from the control of GDB and consequently breaking the process of investigation. Fortunately GDB continues using the older PTRACE_GETFPREGS ptrace(2) request which is unaffected, so the regression only really hits with post-mortem debug sessions using a core dump file, in which case execution, and consequently single-stepping through branches is not possible. Of course core files created by buggy kernels out there will have the value of FCSR recorded clobbered, but such core files cannot be corrected and the person using them simply will have to be aware that the value of FCSR retrieved is not reliable. Which also means we can likely get away without defining a replacement API which would ensure a correct value of FSCR to be retrieved, or none at all. This is based on previous work by Alex Smith, extensively rewritten. Signed-off-by: Alex Smith Signed-off-by: James Hogan Signed-off-by: Maciej W. Rozycki Fixes: 72b22bbad1e7 ("MIPS: Don't assume 64-bit FP registers for FP regset") Cc: Paul Burton Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/17928/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/ptrace.c | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index a24a2e7ccb10..cc480c324900 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -442,7 +442,7 @@ static int gpr64_set(struct task_struct *target, /* * Copy the floating-point context to the supplied NT_PRFPREG buffer, * !CONFIG_CPU_HAS_MSA variant. FP context's general register slots - * correspond 1:1 to buffer slots. + * correspond 1:1 to buffer slots. Only general registers are copied. */ static int fpr_get_fpa(struct task_struct *target, unsigned int *pos, unsigned int *count, @@ -450,13 +450,14 @@ static int fpr_get_fpa(struct task_struct *target, { return user_regset_copyout(pos, count, kbuf, ubuf, &target->thread.fpu, - 0, sizeof(elf_fpregset_t)); + 0, NUM_FPU_REGS * sizeof(elf_fpreg_t)); } /* * Copy the floating-point context to the supplied NT_PRFPREG buffer, * CONFIG_CPU_HAS_MSA variant. Only lower 64 bits of FP context's - * general register slots are copied to buffer slots. + * general register slots are copied to buffer slots. Only general + * registers are copied. 
*/ static int fpr_get_msa(struct task_struct *target, unsigned int *pos, unsigned int *count, @@ -478,20 +479,29 @@ static int fpr_get_msa(struct task_struct *target, return 0; } -/* Copy the floating-point context to the supplied NT_PRFPREG buffer. */ +/* + * Copy the floating-point context to the supplied NT_PRFPREG buffer. + * Choose the appropriate helper for general registers, and then copy + * the FCSR register separately. + */ static int fpr_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { + const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t); int err; - /* XXX fcr31 */ - if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) err = fpr_get_fpa(target, &pos, &count, &kbuf, &ubuf); else err = fpr_get_msa(target, &pos, &count, &kbuf, &ubuf); + if (err) + return err; + + err = user_regset_copyout(&pos, &count, &kbuf, &ubuf, + &target->thread.fpu.fcr31, + fcr31_pos, fcr31_pos + sizeof(u32)); return err; } @@ -499,7 +509,7 @@ static int fpr_get(struct task_struct *target, /* * Copy the supplied NT_PRFPREG buffer to the floating-point context, * !CONFIG_CPU_HAS_MSA variant. Buffer slots correspond 1:1 to FP - * context's general register slots. + * context's general register slots. Only general registers are copied. */ static int fpr_set_fpa(struct task_struct *target, unsigned int *pos, unsigned int *count, @@ -507,13 +517,14 @@ static int fpr_set_fpa(struct task_struct *target, { return user_regset_copyin(pos, count, kbuf, ubuf, &target->thread.fpu, - 0, sizeof(elf_fpregset_t)); + 0, NUM_FPU_REGS * sizeof(elf_fpreg_t)); } /* * Copy the supplied NT_PRFPREG buffer to the floating-point context, * CONFIG_CPU_HAS_MSA variant. Buffer slots are copied to lower 64 - * bits only of FP context's general register slots. + * bits only of FP context's general register slots. Only general + * registers are copied. */ static int fpr_set_msa(struct task_struct *target, unsigned int *pos, unsigned int *count, @@ -538,6 +549,8 @@ static int fpr_set_msa(struct task_struct *target, /* * Copy the supplied NT_PRFPREG buffer to the floating-point context. + * Choose the appropriate helper for general registers, and then copy + * the FCSR register separately. * * We optimize for the case where `count % sizeof(elf_fpreg_t) == 0', * which is supposed to have been guaranteed by the kernel before @@ -550,18 +563,30 @@ static int fpr_set(struct task_struct *target, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { + const int fcr31_pos = NUM_FPU_REGS * sizeof(elf_fpreg_t); + u32 fcr31; int err; BUG_ON(count % sizeof(elf_fpreg_t)); - /* XXX fcr31 */ - init_fp_ctx(target); if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) err = fpr_set_fpa(target, &pos, &count, &kbuf, &ubuf); else err = fpr_set_msa(target, &pos, &count, &kbuf, &ubuf); + if (err) + return err; + + if (count > 0) { + err = user_regset_copyin(&pos, &count, &kbuf, &ubuf, + &fcr31, + fcr31_pos, fcr31_pos + sizeof(u32)); + if (err) + return err; + + ptrace_setfcr31(target, fcr31); + } return err; } -- cgit v1.2.3 From 725679dc78a9a6d1441dedda8816a5b1c02f7174 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:55:40 +0000 Subject: MIPS: Also verify sizeof `elf_fpreg_t' with PTRACE_SETREGSET commit 006501e039eec411842bb3150c41358867d320c2 upstream. 
Complement commit d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") and like with the PTRACE_GETREGSET ptrace(2) request also apply a BUILD_BUG_ON check for the size of the `elf_fpreg_t' type in the PTRACE_SETREGSET request handler. Signed-off-by: Maciej W. Rozycki Fixes: d614fd58a283 ("mips/ptrace: Preserve previous registers for short regset write") Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/17929/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/ptrace.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index cc480c324900..aa92e131de57 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -467,6 +467,7 @@ static int fpr_get_msa(struct task_struct *target, u64 fpr_val; int err; + BUILD_BUG_ON(sizeof(fpr_val) != sizeof(elf_fpreg_t)); for (i = 0; i < NUM_FPU_REGS; i++) { fpr_val = get_fpr64(&target->thread.fpu.fpr[i], 0); err = user_regset_copyout(pos, count, kbuf, ubuf, -- cgit v1.2.3 From f2c131d05d86ea7fefd0f866ccd6b3565ef28f78 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Mon, 11 Dec 2017 22:56:54 +0000 Subject: MIPS: Disallow outsized PTRACE_SETREGSET NT_PRFPREG regset accesses commit c8c5a3a24d395b14447a9a89d61586a913840a3b upstream. Complement commit c23b3d1a5311 ("MIPS: ptrace: Change GP regset to use correct core dump register layout") and also reject outsized PTRACE_SETREGSET requests to the NT_PRFPREG regset, like with the NT_PRSTATUS regset. Signed-off-by: Maciej W. Rozycki Fixes: c23b3d1a5311 ("MIPS: ptrace: Change GP regset to use correct core dump register layout") Cc: James Hogan Cc: Paul Burton Cc: Alex Smith Cc: Dave Martin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/17930/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/ptrace.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/mips/kernel/ptrace.c b/arch/mips/kernel/ptrace.c index aa92e131de57..c3d2d2c05fdb 100644 --- a/arch/mips/kernel/ptrace.c +++ b/arch/mips/kernel/ptrace.c @@ -570,6 +570,9 @@ static int fpr_set(struct task_struct *target, BUG_ON(count % sizeof(elf_fpreg_t)); + if (pos + count > sizeof(elf_fpregset_t)) + return -EIO; + init_fp_ctx(target); if (sizeof(target->thread.fpu.fpr[0]) == sizeof(elf_fpreg_t)) -- cgit v1.2.3 From 8d383ff7deaa3c3b5e2c2b604b20f5844c1f74e0 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Wed, 3 Jan 2018 14:31:38 -0800 Subject: kvm: vmx: Scrub hardware GPRs at VM-exit commit 0cb5b30698fdc8f6b4646012e3acb4ddce430788 upstream. Guest GPR values are live in the hardware GPRs at VM-exit. Do not leave any guest values in hardware GPRs after the guest GPR values are saved to the vcpu_vmx structure. This is a partial mitigation for CVE 2017-5715 and CVE 2017-5753. Specifically, it defeats the Project Zero PoC for CVE 2017-5715. 
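[ ed. note: illustration only; a minimal sketch of the save-then-clear
  idea this patch applies on the VM-exit path. The helper name and the
  memory slot are hypothetical, not the real kvm code, which open-codes
  the same pattern for every guest GPR inside the existing vcpu_run asm
  blocks shown below:

	/*
	 * Save the guest's live value of a hardware GPR to memory,
	 * then zero the register so no guest-controlled data is left
	 * behind for speculatively executed host code to use.
	 */
	static inline void save_and_scrub_rbx(unsigned long *slot)
	{
		asm volatile("mov %%rbx, %0\n\t"
			     "xor %%rbx, %%rbx\n\t"
			     : "=m" (*slot) : : "rbx");
	}

  The hunks below do exactly this, register by register, right after
  the guest values have been stored (e.g. into the vcpu_vmx fields on
  the Intel side). ]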
Suggested-by: Eric Northup Signed-off-by: Jim Mattson Reviewed-by: Eric Northup Reviewed-by: Benjamin Serebrin Reviewed-by: Andrew Honig [Paolo: Add AMD bits, Signed-off-by: Tom Lendacky ] Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/svm.c | 19 +++++++++++++++++++ arch/x86/kvm/vmx.c | 14 +++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 4b1152e57340..900ffb6c28b5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3855,6 +3855,25 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) "mov %%r13, %c[r13](%[svm]) \n\t" "mov %%r14, %c[r14](%[svm]) \n\t" "mov %%r15, %c[r15](%[svm]) \n\t" +#endif + /* + * Clear host registers marked as clobbered to prevent + * speculative use. + */ + "xor %%" _ASM_BX ", %%" _ASM_BX " \n\t" + "xor %%" _ASM_CX ", %%" _ASM_CX " \n\t" + "xor %%" _ASM_DX ", %%" _ASM_DX " \n\t" + "xor %%" _ASM_SI ", %%" _ASM_SI " \n\t" + "xor %%" _ASM_DI ", %%" _ASM_DI " \n\t" +#ifdef CONFIG_X86_64 + "xor %%r8, %%r8 \n\t" + "xor %%r9, %%r9 \n\t" + "xor %%r10, %%r10 \n\t" + "xor %%r11, %%r11 \n\t" + "xor %%r12, %%r12 \n\t" + "xor %%r13, %%r13 \n\t" + "xor %%r14, %%r14 \n\t" + "xor %%r15, %%r15 \n\t" #endif "pop %%" _ASM_BP : diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index d915185ada05..60637937df9f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8623,6 +8623,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) /* Save guest registers, load host registers, keep flags */ "mov %0, %c[wordsize](%%" _ASM_SP ") \n\t" "pop %0 \n\t" + "setbe %c[fail](%0)\n\t" "mov %%" _ASM_AX ", %c[rax](%0) \n\t" "mov %%" _ASM_BX ", %c[rbx](%0) \n\t" __ASM_SIZE(pop) " %c[rcx](%0) \n\t" @@ -8639,12 +8640,23 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) "mov %%r13, %c[r13](%0) \n\t" "mov %%r14, %c[r14](%0) \n\t" "mov %%r15, %c[r15](%0) \n\t" + "xor %%r8d, %%r8d \n\t" + "xor %%r9d, %%r9d \n\t" + "xor %%r10d, %%r10d \n\t" + "xor %%r11d, %%r11d \n\t" + "xor %%r12d, %%r12d \n\t" + "xor %%r13d, %%r13d \n\t" + "xor %%r14d, %%r14d \n\t" + "xor %%r15d, %%r15d \n\t" #endif "mov %%cr2, %%" _ASM_AX " \n\t" "mov %%" _ASM_AX ", %c[cr2](%0) \n\t" + "xor %%eax, %%eax \n\t" + "xor %%ebx, %%ebx \n\t" + "xor %%esi, %%esi \n\t" + "xor %%edi, %%edi \n\t" "pop %%" _ASM_BP "; pop %%" _ASM_DX " \n\t" - "setbe %c[fail](%0) \n\t" ".pushsection .rodata \n\t" ".global vmx_return \n\t" "vmx_return: " _ASM_PTR " 2b \n\t" -- cgit v1.2.3 From 1a699374533b23ec4deff885db121a5e4c42aa27 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:18 -0800 Subject: x86/vsdo: Fix build on PARAVIRT_CLOCK=y, KVM_GUEST=n commit 8705d603edd49f1cff165cd3b7998f4c7f098d27 upstream. arch/x86/built-in.o: In function `arch_setup_additional_pages': (.text+0x587): undefined reference to `pvclock_pvti_cpu0_va' KVM_GUEST selects PARAVIRT_CLOCK, so we can make pvclock_pvti_cpu0_va depend on KVM_GUEST. 
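[ ed. note: for context, the guard in pvclock.h pairs the extern
  declaration with an inline stub so callers compile either way;
  roughly (simplified sketch, the actual change below only touches the
  #ifdef condition):

	#ifdef CONFIG_KVM_GUEST
	extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void);
	#else
	static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
	{
		return NULL;
	}
	#endif

  With the condition keyed on CONFIG_KVM_GUEST, a PARAVIRT_CLOCK=y,
  KVM_GUEST=n build sees the stub instead of an extern it can never
  link against. ]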
Signed-off-by: Andy Lutomirski Tested-by: Borislav Petkov Cc: Oleg Nesterov Cc: Kees Cook Link: http://lkml.kernel.org/r/444d38a9bcba832685740ea1401b569861d09a72.1451446564.git.luto@kernel.org Signed-off-by: Thomas Gleixner Cc: James Dingwall Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/pvclock.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 6045cef376c2..c926255745e1 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -4,7 +4,7 @@ #include #include -#ifdef CONFIG_PARAVIRT_CLOCK +#ifdef CONFIG_KVM_GUEST extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void); #else static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) -- cgit v1.2.3 From 745a0d9c1a8e6533f9707ffc2809e1c6198c29a7 Mon Sep 17 00:00:00 2001 From: Vikas C Sajjan Date: Thu, 16 Nov 2017 21:43:44 +0530 Subject: x86/acpi: Handle SCI interrupts above legacy space gracefully commit 252714155f04c5d16989cb3aadb85fd1b5772f99 upstream. Platforms which support only IOAPIC mode, pass the SCI information above the legacy space (0-15) via the FADT mechanism and not via MADT. In such cases mp_override_legacy_irq() which is invoked from acpi_sci_ioapic_setup() to register SCI interrupts fails for interrupts greater equal 16, since it is meant to handle only the legacy space and emits error "Invalid bus_irq %u for legacy override". Add a new function to handle SCI interrupts >= 16 and invoke it conditionally in acpi_sci_ioapic_setup(). The code duplication due to this new function will be cleaned up in a separate patch. Co-developed-by: Sunil V L Signed-off-by: Vikas C Sajjan Signed-off-by: Sunil V L Signed-off-by: Thomas Gleixner Tested-by: Abdul Lateef Attar Acked-by: Rafael J. 
Wysocki Cc: linux-pm@vger.kernel.org Cc: kkamagui@gmail.com Cc: linux-acpi@vger.kernel.org Link: https://lkml.kernel.org/r/1510848825-21965-2-git-send-email-vikas.cha.sajjan@hpe.com Cc: Jean Delvare Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/acpi/boot.c | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 1e5eb9f2ff5f..23f746c4d7f1 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -408,6 +408,34 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, return 0; } +static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity, + u8 trigger, u32 gsi) +{ + struct mpc_intsrc mp_irq; + int ioapic, pin; + + /* Convert 'gsi' to 'ioapic.pin'(INTIN#) */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) { + pr_warn("Failed to find ioapic for gsi : %u\n", gsi); + return ioapic; + } + + pin = mp_find_ioapic_pin(ioapic, gsi); + + mp_irq.type = MP_INTSRC; + mp_irq.irqtype = mp_INT; + mp_irq.irqflag = (trigger << 2) | polarity; + mp_irq.srcbus = MP_ISA_BUS; + mp_irq.srcbusirq = bus_irq; + mp_irq.dstapic = mpc_ioapic_id(ioapic); + mp_irq.dstirq = pin; + + mp_save_irq(&mp_irq); + + return 0; +} + static int __init acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) { @@ -452,7 +480,11 @@ static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, if (acpi_sci_flags & ACPI_MADT_POLARITY_MASK) polarity = acpi_sci_flags & ACPI_MADT_POLARITY_MASK; - mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); + if (bus_irq < NR_IRQS_LEGACY) + mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); + else + mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi); + acpi_penalize_sci_irq(bus_irq, trigger, polarity); /* -- cgit v1.2.3 From 0e82bbca5896839fa87c4ef2e6cd4d342ef762a7 Mon Sep 17 00:00:00 2001 From: Vikas C Sajjan Date: Thu, 16 Nov 2017 21:43:45 +0530 Subject: x86/acpi: Reduce code duplication in mp_override_legacy_irq() commit 4ee2ec1b122599f7b10c849fa7915cebb37b7edb upstream. The new function mp_register_ioapic_irq() is a subset of the code in mp_override_legacy_irq(). Replace the code duplication by invoking mp_register_ioapic_irq() from mp_override_legacy_irq(). Signed-off-by: Vikas C Sajjan Signed-off-by: Thomas Gleixner Reviewed-by: Thomas Gleixner Acked-by: Rafael J. Wysocki Cc: linux-pm@vger.kernel.org Cc: kkamagui@gmail.com Cc: linux-acpi@vger.kernel.org Link: https://lkml.kernel.org/r/1510848825-21965-3-git-send-email-vikas.cha.sajjan@hpe.com Cc: Jean Delvare Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/acpi/boot.c | 27 +++++---------------------- 1 file changed, 5 insertions(+), 22 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 23f746c4d7f1..a1e4a6c3f394 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -321,13 +321,12 @@ acpi_parse_lapic_nmi(struct acpi_subtable_header * header, const unsigned long e #ifdef CONFIG_X86_IO_APIC #define MP_ISA_BUS 0 +static int __init mp_register_ioapic_irq(u8 bus_irq, u8 polarity, + u8 trigger, u32 gsi); + static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) { - int ioapic; - int pin; - struct mpc_intsrc mp_irq; - /* * Check bus_irq boundary. */ @@ -336,14 +335,6 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, return; } - /* - * Convert 'gsi' to 'ioapic.pin'. 
- */ - ioapic = mp_find_ioapic(gsi); - if (ioapic < 0) - return; - pin = mp_find_ioapic_pin(ioapic, gsi); - /* * TBD: This check is for faulty timer entries, where the override * erroneously sets the trigger to level, resulting in a HUGE @@ -352,16 +343,8 @@ static void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, if ((bus_irq == 0) && (trigger == 3)) trigger = 1; - mp_irq.type = MP_INTSRC; - mp_irq.irqtype = mp_INT; - mp_irq.irqflag = (trigger << 2) | polarity; - mp_irq.srcbus = MP_ISA_BUS; - mp_irq.srcbusirq = bus_irq; /* IRQ */ - mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */ - mp_irq.dstirq = pin; /* INTIN# */ - - mp_save_irq(&mp_irq); - + if (mp_register_ioapic_irq(bus_irq, polarity, trigger, gsi) < 0) + return; /* * Reset default identity mapping if gsi is also an legacy IRQ, * otherwise there will be more than one entry with the same GSI -- cgit v1.2.3 From f065b5f78d17fe835ad6f51f1a4f8e3e2e0efec3 Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 8 Jul 2016 11:38:28 +0200 Subject: x86/mm/pat, /dev/mem: Remove superfluous error message commit 39380b80d72723282f0ea1d1bbf2294eae45013e upstream. Currently it's possible for broken (or malicious) userspace to flood a kernel log indefinitely with messages a-la Program dmidecode tried to access /dev/mem between f0000->100000 because range_is_allowed() is case of CONFIG_STRICT_DEVMEM being turned on dumps this information each and every time devmem_is_allowed() fails. Reportedly userspace that is able to trigger contignuous flow of these messages exists. It would be possible to rate limit this message, but that'd have a questionable value; the administrator wouldn't get information about all the failing accessess, so then the information would be both superfluous and incomplete at the same time :) Returning EPERM (which is what is actually happening) is enough indication for userspace what has happened; no need to log this particular error as some sort of special condition. Signed-off-by: Jiri Kosina Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1607081137020.24757@cbobk.fhfr.pm Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/pat.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 3f1bb4f93a5a..3146b1da6d72 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -750,11 +750,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) return 1; while (cursor < to) { - if (!devmem_is_allowed(pfn)) { - pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n", - current->comm, from, to - 1); + if (!devmem_is_allowed(pfn)) return 0; - } cursor += PAGE_SIZE; pfn++; } -- cgit v1.2.3 From 7ec5d87df34a90758cf2aaf6824bb748454a8f35 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 12 Jan 2018 15:00:02 -0500 Subject: x86/pti/efi: broken conversion from efi to kernel page table In entry_64.S we have code like this: /* Unconditionally use kernel CR3 for do_nmi() */ /* %rax is saved above, so OK to clobber here */ ALTERNATIVE "jmp 2f", "movq %cr3, %rax", X86_FEATURE_KAISER /* If PCID enabled, NOFLUSH now and NOFLUSH on return */ ALTERNATIVE "", "bts $63, %rax", X86_FEATURE_PCID pushq %rax /* mask off "user" bit of pgd address and 12 PCID bits: */ andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax movq %rax, %cr3 2: /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ call do_nmi With this instruction: andq $(~(X86_CR3_PCID_ASID_MASK | KAISER_SHADOW_PGD_OFFSET)), %rax We unconditionally switch from whatever our CR3 was to kernel page table. But, in arch/x86/platform/efi/efi_64.c We temporarily set a different page table, that does not have the kernel page table with 0x1000 offset from it. Look in efi_thunk() and efi_thunk_set_virtual_address_map(). So, while CR3 points to the other page table, we get an NMI interrupt, and clear 0x1000 from CR3, resulting in a bogus CR3 if the 0x1000 bit was set. The efi page table comes from realmode/rm/trampoline_64.S: arch/x86/realmode/rm/trampoline_64.S 141 .bss 142 .balign PAGE_SIZE 143 GLOBAL(trampoline_pgd) .space PAGE_SIZE Notice: alignment is PAGE_SIZE, so after applying KAISER_SHADOW_PGD_OFFSET which equal to PAGE_SIZE, we can get a different page table. But, even if we fix alignment, here the trampoline binary is later copied into dynamically allocated memory in reserve_real_mode(), so we need to fix that place as well. 
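[ ed. note: a worked illustration with a hypothetical address (PCID
  bits left out for brevity) to make the failure mode concrete:

	/* hypothetical: a page table that is only PAGE_SIZE aligned */
	unsigned long pgd = 0x7a20b000UL;	/* bit 12 happens to be set */
	unsigned long cr3 = pgd & ~(unsigned long)KAISER_SHADOW_PGD_OFFSET;
						/* = 0x7a20a000, a different,
						   bogus page table */

  With KAISER_KERNEL_PGD_ALIGNMENT (two pages) enforced for both the
  static trampoline_pgd and the copy allocated in reserve_real_mode(),
  bit 12 of the kernel PGD is guaranteed clear, so the unconditional
  mask in the NMI entry path is a no-op instead of a corruption. ]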
Fixes: 8a43ddfb93a0 ("KAISER: Kernel Address Isolation") Signed-off-by: Pavel Tatashin Reviewed-by: Steven Sistare Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kaiser.h | 10 ++++++++++ arch/x86/realmode/init.c | 4 +++- arch/x86/realmode/rm/trampoline_64.S | 3 ++- 3 files changed, 15 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kaiser.h b/arch/x86/include/asm/kaiser.h index 802bbbdfe143..48c791a411ab 100644 --- a/arch/x86/include/asm/kaiser.h +++ b/arch/x86/include/asm/kaiser.h @@ -19,6 +19,16 @@ #define KAISER_SHADOW_PGD_OFFSET 0x1000 +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * A page table address must have this alignment to stay the same when + * KAISER_SHADOW_PGD_OFFSET mask is applied + */ +#define KAISER_KERNEL_PGD_ALIGNMENT (KAISER_SHADOW_PGD_OFFSET << 1) +#else +#define KAISER_KERNEL_PGD_ALIGNMENT PAGE_SIZE +#endif + #ifdef __ASSEMBLY__ #ifdef CONFIG_PAGE_TABLE_ISOLATION diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index 0b7a63d98440..805a3271a137 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -4,6 +4,7 @@ #include #include #include +#include struct real_mode_header *real_mode_header; u32 *trampoline_cr4_features; @@ -15,7 +16,8 @@ void __init reserve_real_mode(void) size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob); /* Has to be under 1M so we can execute real-mode AP code. */ - mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); + mem = memblock_find_in_range(0, 1 << 20, size, + KAISER_KERNEL_PGD_ALIGNMENT); if (!mem) panic("Cannot allocate trampoline\n"); diff --git a/arch/x86/realmode/rm/trampoline_64.S b/arch/x86/realmode/rm/trampoline_64.S index dac7b20d2f9d..781cca63f795 100644 --- a/arch/x86/realmode/rm/trampoline_64.S +++ b/arch/x86/realmode/rm/trampoline_64.S @@ -30,6 +30,7 @@ #include #include #include +#include #include "realmode.h" .text @@ -139,7 +140,7 @@ tr_gdt: tr_gdt_end: .bss - .balign PAGE_SIZE + .balign KAISER_KERNEL_PGD_ALIGNMENT GLOBAL(trampoline_pgd) .space PAGE_SIZE .balign 8 -- cgit v1.2.3 From 7d1bef0f6037f4cb3380cc41596ec1b6ecfb19a7 Mon Sep 17 00:00:00 2001 From: Jia Zhang Date: Mon, 1 Jan 2018 10:04:47 +0800 Subject: x86/microcode/intel: Extend BDW late-loading with a revision check commit b94b7373317164402ff7728d10f7023127a02b60 upstream. Instead of blacklisting all model 79 CPUs when attempting a late microcode loading, limit that only to CPUs with microcode revisions < 0x0b000021 because only on those late loading may cause a system hang. For such processors either: a) a BIOS update which might contain a newer microcode revision or b) the early microcode loading method should be considered. Processors with revisions 0x0b000021 or higher will not experience such hangs. For more details, see erratum BDF90 in document #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family Specification Update) from September 2017. [ bp: Heavily massage commit message and pr_* statements. 
] Fixes: 723f2828a98c ("x86/microcode/intel: Disable late loading on model 79") Signed-off-by: Jia Zhang Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Acked-by: Tony Luck Cc: x86-ml Link: http://lkml.kernel.org/r/1514772287-92959-1-git-send-email-qianyue.zj@alibaba-inc.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/microcode/intel.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index abf581ade8d2..b428a8174be1 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -994,9 +994,17 @@ static bool is_blacklisted(unsigned int cpu) { struct cpuinfo_x86 *c = &cpu_data(cpu); - if (c->x86 == 6 && c->x86_model == 79) { - pr_err_once("late loading on model 79 is disabled.\n"); - return true; + /* + * Late loading on model 79 with microcode revision less than 0x0b000021 + * may result in a system hang. This behavior is documented in item + * BDF90, #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family). + */ + if (c->x86 == 6 && + c->x86_model == 79 && + c->x86_mask == 0x01 && + c->microcode < 0x0b000021) { + pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); + pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); } return false; -- cgit v1.2.3 From 6785f955bcb615de068079943a8382c600f74c45 Mon Sep 17 00:00:00 2001 From: Andrew Honig Date: Wed, 10 Jan 2018 10:12:03 -0800 Subject: KVM: x86: Add memory barrier on vmcs field lookup commit 75f139aaf896d6fdeec2e468ddfa4b2fe469bf40 upstream. This adds a memory barrier when performing a lookup into the vmcs_field_to_offset_table. This is related to CVE-2017-5753. Signed-off-by: Andrew Honig Reviewed-by: Jim Mattson Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 60637937df9f..c26255f19603 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -828,8 +828,16 @@ static inline short vmcs_field_to_offset(unsigned long field) { BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); - if (field >= ARRAY_SIZE(vmcs_field_to_offset_table) || - vmcs_field_to_offset_table[field] == 0) + if (field >= ARRAY_SIZE(vmcs_field_to_offset_table)) + return -ENOENT; + + /* + * FIXME: Mitigation for CVE-2017-5753. To be replaced with a + * generic mechanism. + */ + asm("lfence"); + + if (vmcs_field_to_offset_table[field] == 0) return -ENOENT; return vmcs_field_to_offset_table[field]; -- cgit v1.2.3 From c18b1bda49334cbef67d5b9fedbbe20e28566088 Mon Sep 17 00:00:00 2001 From: Lepton Wu Date: Fri, 12 Jan 2018 13:42:56 -0800 Subject: kaiser: Set _PAGE_NX only if supported This finally resolve crash if loaded under qemu + haxm. Haitao Shan pointed out that the reason of that crash is that NX bit get set for page tables. 
It seems we missed checking if _PAGE_NX is supported in kaiser_add_user_map Link: https://www.spinics.net/lists/kernel/msg2689835.html Reviewed-by: Guenter Roeck Signed-off-by: Lepton Wu Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 6a7a77929a8c..8af98513d36c 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -198,6 +198,8 @@ static int kaiser_add_user_map(const void *__start_addr, unsigned long size, * requires that not to be #defined to 0): so mask it off here. */ flags &= ~_PAGE_GLOBAL; + if (!(__supported_pte_mask & _PAGE_NX)) + flags &= ~_PAGE_NX; for (; address < end_addr; address += PAGE_SIZE) { target_address = get_pa_from_mapping(address); -- cgit v1.2.3 From 18b849b18d1c6dd385e88a0fadfeabc08faeba9c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Jan 2017 11:15:38 -0800 Subject: x86/cpu: Factor out application of forced CPU caps commit 8bf1ebca215c262e48c15a4a15f175991776f57f upstream. There are multiple call sites that apply forced CPU caps. Factor them into a helper. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Fenghua Yu Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Matthew Whitehead Cc: Oleg Nesterov Cc: One Thousand Gnomes Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Yu-cheng Yu Link: http://lkml.kernel.org/r/623ff7555488122143e4417de09b18be2085ad06.1484705016.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/common.c | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cc154ac64f00..df3ac33c0855 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -664,6 +664,16 @@ void cpu_detect(struct cpuinfo_x86 *c) } } +static void apply_forced_caps(struct cpuinfo_x86 *c) +{ + int i; + + for (i = 0; i < NCAPINTS; i++) { + c->x86_capability[i] &= ~cpu_caps_cleared[i]; + c->x86_capability[i] |= cpu_caps_set[i]; + } +} + void get_cpu_cap(struct cpuinfo_x86 *c) { u32 tfms, xlvl; @@ -955,11 +965,8 @@ static void identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_identify) this_cpu->c_identify(c); - /* Clear/Set all flags overriden by options, after probe */ - for (i = 0; i < NCAPINTS; i++) { - c->x86_capability[i] &= ~cpu_caps_cleared[i]; - c->x86_capability[i] |= cpu_caps_set[i]; - } + /* Clear/Set all flags overridden by options, after probe */ + apply_forced_caps(c); #ifdef CONFIG_X86_64 c->apicid = apic->phys_pkg_id(c->initial_apicid, 0); @@ -1020,10 +1027,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) * Clear/Set all flags overriden by options, need do it * before following smp all cpus cap AND. */ - for (i = 0; i < NCAPINTS; i++) { - c->x86_capability[i] &= ~cpu_caps_cleared[i]; - c->x86_capability[i] |= cpu_caps_set[i]; - } + apply_forced_caps(c); /* * On SMP, boot_cpu_data holds the common feature set between -- cgit v1.2.3 From 65b28590de24afafcdc737b9fe86af0a9d2fbbdb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:32 +0100 Subject: x86/cpufeatures: Make CPU bugs sticky commit 6cbd2171e89b13377261d15e64384df60ecb530e upstream. There is currently no way to force CPU bug bits like CPU feature bits. 
That makes it impossible to set a bug bit once at boot and have it stick for all upcoming CPUs. Extend the force set/clear arrays to handle bug bits as well. Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Link: https://lkml.kernel.org/r/20171204150606.992156574@linutronix.de Signed-off-by: Ingo Molnar Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 2 ++ arch/x86/include/asm/processor.h | 4 ++-- arch/x86/kernel/cpu/common.c | 6 +++--- 3 files changed, 7 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index f6605712ca90..34c4106230f1 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -359,6 +359,8 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) +#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) + #define cpu_has_fpu boot_cpu_has(X86_FEATURE_FPU) #define cpu_has_de boot_cpu_has(X86_FEATURE_DE) #define cpu_has_pse boot_cpu_has(X86_FEATURE_PSE) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f3bdaed0188f..c124d6ab4bf9 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -156,8 +156,8 @@ extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 new_cpu_data; extern struct tss_struct doublefault_tss; -extern __u32 cpu_caps_cleared[NCAPINTS]; -extern __u32 cpu_caps_set[NCAPINTS]; +extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; +extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; #ifdef CONFIG_SMP DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index df3ac33c0855..e4dfb756d692 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -432,8 +432,8 @@ static const char *table_lookup_model(struct cpuinfo_x86 *c) return NULL; /* Not found */ } -__u32 cpu_caps_cleared[NCAPINTS]; -__u32 cpu_caps_set[NCAPINTS]; +__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; +__u32 cpu_caps_set[NCAPINTS + NBUGINTS]; void load_percpu_segment(int cpu) { @@ -668,7 +668,7 @@ static void apply_forced_caps(struct cpuinfo_x86 *c) { int i; - for (i = 0; i < NCAPINTS; i++) { + for (i = 0; i < NCAPINTS + NBUGINTS; i++) { c->x86_capability[i] &= ~cpu_caps_cleared[i]; c->x86_capability[i] |= cpu_caps_set[i]; } -- cgit v1.2.3 From 07c7aa5e7e8ac83768246822b61ebffbdea61ff7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 4 Dec 2017 15:07:33 +0100 Subject: x86/cpufeatures: Add X86_BUG_CPU_INSECURE commit a89f040fa34ec9cd682aed98b8f04e3c47d998bd upstream. Many x86 CPUs leak information to user space due to missing isolation of user space and kernel space page tables. There are many well documented ways to exploit that. The upcoming software migitation of isolating the user and kernel space page tables needs a misfeature flag so code can be made runtime conditional. 
Add the BUG bits which indicates that the CPU is affected and add a feature bit which indicates that the software migitation is enabled. Assume for now that _ALL_ x86 CPUs are affected by this. Exceptions can be made later. Signed-off-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: David Laight Cc: Denys Vlasenko Cc: Eduardo Valentin Cc: Greg KH Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Juergen Gross Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Will Deacon Cc: aliguori@amazon.com Cc: daniel.gruss@iaik.tugraz.at Cc: hughd@google.com Cc: keescook@google.com Signed-off-by: Ingo Molnar Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/kernel/cpu/common.c | 4 ++++ 2 files changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 34c4106230f1..ae023d393c5c 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -277,6 +277,7 @@ #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ +#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index e4dfb756d692..97bb497e450e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -830,6 +830,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) } setup_force_cpu_cap(X86_FEATURE_ALWAYS); + + /* Assume for now that ALL x86 CPUs are insecure */ + setup_force_cpu_bug(X86_BUG_CPU_INSECURE); + fpu__init_system(c); } -- cgit v1.2.3 From 6349cab425ce91ba71676fba5aa6089cae0e6474 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 5 Jan 2018 15:27:34 +0100 Subject: x86/pti: Rename BUG_CPU_INSECURE to BUG_CPU_MELTDOWN commit de791821c295cc61419a06fe5562288417d1bc58 upstream. Use the name associated with the particular attack which needs page table isolation for mitigation. 
Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Alan Cox Cc: Jiri Koshina Cc: Linus Torvalds Cc: Tim Chen Cc: Andi Lutomirski Cc: Andi Kleen Cc: Peter Zijlstra Cc: Paul Turner Cc: Tom Lendacky Cc: Greg KH Cc: Dave Hansen Cc: Kees Cook Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801051525300.1724@nanos Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 2 +- arch/x86/kernel/cpu/common.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index ae023d393c5c..25055e066b4c 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -277,7 +277,7 @@ #define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ -#define X86_BUG_CPU_INSECURE X86_BUG(14) /* CPU is insecure and needs kernel page table isolation */ +#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 97bb497e450e..fc030ce8ff81 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -832,7 +832,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_cap(X86_FEATURE_ALWAYS); /* Assume for now that ALL x86 CPUs are insecure */ - setup_force_cpu_bug(X86_BUG_CPU_INSECURE); + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); fpu__init_system(c); } -- cgit v1.2.3 From caae411b6ee026c7f43d67932e9b5008cf623293 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 6 Jan 2018 11:49:23 +0000 Subject: x86/cpufeatures: Add X86_BUG_SPECTRE_V[12] commit 99c6fa2511d8a683e61468be91b83f85452115fa upstream. Add the bug bits for spectre v1/2 and force them unconditionally for all cpus. 
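[ ed. note: modelling these as forced *bug* bits means later code can
  test them like any other cpufeature; a hypothetical consumer looks
  like:

	if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2))
		pr_info("CPU affected by Spectre v2, mitigation needed\n");

  which is the shape the sysfs vulnerability show functions added
  later in this series rely on. ]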
Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/1515239374-23361-2-git-send-email-dwmw@amazon.co.uk Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 2 ++ arch/x86/kernel/cpu/common.c | 3 +++ 2 files changed, 5 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 25055e066b4c..142028afd049 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -278,6 +278,8 @@ #define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */ #define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */ #define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by meltdown attack and needs kernel page table isolation */ +#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */ +#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */ #if defined(__KERNEL__) && !defined(__ASSEMBLY__) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index fc030ce8ff81..dc4dfad66a70 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -834,6 +834,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) /* Assume for now that ALL x86 CPUs are insecure */ setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + fpu__init_system(c); } -- cgit v1.2.3 From 9718bf5f4ebbf7f0abab4c22a6d45a9dda8da098 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 24 Oct 2016 19:38:43 +0200 Subject: x86/cpu: Merge bugs.c and bugs_64.c commit 62a67e123e058a67db58bc6a14354dd037bafd0a upstream. Should be easier when following boot paths. It probably is a left over from the x86 unification eons ago. No functionality change. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161024173844.23038-3-bp@alien8.de Signed-off-by: Ingo Molnar Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/Makefile | 4 +--- arch/x86/kernel/cpu/bugs.c | 26 ++++++++++++++++++++++---- arch/x86/kernel/cpu/bugs_64.c | 33 --------------------------------- 3 files changed, 23 insertions(+), 40 deletions(-) delete mode 100644 arch/x86/kernel/cpu/bugs_64.c (limited to 'arch') diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 58031303e304..8f184615053b 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -16,13 +16,11 @@ obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += common.o obj-y += rdrand.o obj-y += match.o +obj-y += bugs.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o -obj-$(CONFIG_X86_32) += bugs.o -obj-$(CONFIG_X86_64) += bugs_64.o - obj-$(CONFIG_CPU_SUP_INTEL) += intel.o obj-$(CONFIG_CPU_SUP_AMD) += amd.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0b6124315441..5d82f2ca4acf 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -16,6 +16,8 @@ #include #include #include +#include +#include void __init check_bugs(void) { @@ -28,11 +30,13 @@ void __init check_bugs(void) #endif identify_boot_cpu(); -#ifndef CONFIG_SMP - pr_info("CPU: "); - print_cpu_info(&boot_cpu_data); -#endif + if (!IS_ENABLED(CONFIG_SMP)) { + pr_info("CPU: "); + print_cpu_info(&boot_cpu_data); + } + +#ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. * @@ -48,4 +52,18 @@ void __init check_bugs(void) alternative_instructions(); fpu__init_check_bugs(); +#else /* CONFIG_X86_64 */ + alternative_instructions(); + + /* + * Make sure the first 2MB area is not mapped by huge pages + * There are typically fixed size MTRRs in there and overlapping + * MTRRs into large pages causes slow downs. + * + * Right now we don't do that with gbpages because there seems + * very little benefit for that case. + */ + if (!direct_gbpages) + set_memory_4k((unsigned long)__va(0), 1); +#endif } diff --git a/arch/x86/kernel/cpu/bugs_64.c b/arch/x86/kernel/cpu/bugs_64.c deleted file mode 100644 index 04f0fe5af83e..000000000000 --- a/arch/x86/kernel/cpu/bugs_64.c +++ /dev/null @@ -1,33 +0,0 @@ -/* - * Copyright (C) 1994 Linus Torvalds - * Copyright (C) 2000 SuSE - */ - -#include -#include -#include -#include -#include -#include -#include - -void __init check_bugs(void) -{ - identify_boot_cpu(); -#if !defined(CONFIG_SMP) - printk(KERN_INFO "CPU: "); - print_cpu_info(&boot_cpu_data); -#endif - alternative_instructions(); - - /* - * Make sure the first 2MB area is not mapped by huge pages - * There are typically fixed size MTRRs in there and overlapping - * MTRRs into large pages causes slow downs. - * - * Right now we don't do that with gbpages because there seems - * very little benefit for that case. - */ - if (!direct_gbpages) - set_memory_4k((unsigned long)__va(0), 1); -} -- cgit v1.2.3 From 72cf81e43ba4d2c43877ad85afd0417577d610e7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 7 Jan 2018 22:48:01 +0100 Subject: x86/cpu: Implement CPU vulnerabilites sysfs functions commit 61dc0f555b5c761cdafb0ba5bd41ecf22d68a4c4 upstream. Implement the CPU vulnerabilty show functions for meltdown, spectre_v1 and spectre_v2. 
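[ ed. note: with CONFIG_GENERIC_CPU_VULNERABILITIES selected, these
  show functions back the read-only files under
  /sys/devices/system/cpu/vulnerabilities/; reading "meltdown" there
  returns "Mitigation: PTI" on a kernel running with KAISER, and
  "Vulnerable" or "Not affected" otherwise, matching the strings in
  the hunk below. ]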
Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Reviewed-by: Konrad Rzeszutek Wilk Cc: Peter Zijlstra Cc: Will Deacon Cc: Dave Hansen Cc: Linus Torvalds Cc: Borislav Petkov Cc: David Woodhouse Link: https://lkml.kernel.org/r/20180107214913.177414879@linutronix.de Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 1 + arch/x86/kernel/cpu/bugs.c | 29 +++++++++++++++++++++++++++++ 2 files changed, 30 insertions(+) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 39d2dc66faa5..0ef2cdd11616 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -62,6 +62,7 @@ config X86 select GENERIC_CLOCKEVENTS_MIN_ADJUST select GENERIC_CMOS_UPDATE select GENERIC_CPU_AUTOPROBE + select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP select GENERIC_FIND_FIRST_BIT select GENERIC_IOMAP diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 5d82f2ca4acf..cd46f9039119 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -9,6 +9,7 @@ */ #include #include +#include #include #include #include @@ -67,3 +68,31 @@ void __init check_bugs(void) set_memory_4k((unsigned long)__va(0), 1); #endif } + +#ifdef CONFIG_SYSFS +ssize_t cpu_show_meltdown(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + return sprintf(buf, "Not affected\n"); + if (boot_cpu_has(X86_FEATURE_KAISER)) + return sprintf(buf, "Mitigation: PTI\n"); + return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spectre_v1(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) + return sprintf(buf, "Not affected\n"); + return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spectre_v2(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + return sprintf(buf, "Not affected\n"); + return sprintf(buf, "Vulnerable\n"); +} +#endif -- cgit v1.2.3 From e997d991ab2b1dc9f9cdad999a891626c2aecf21 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 10 Jan 2018 12:28:16 +0100 Subject: x86/alternatives: Fix optimize_nops() checking commit 612e8e9350fd19cae6900cf36ea0c6892d1a0dca upstream. The alternatives code checks only the first byte whether it is a NOP, but with NOPs in front of the payload and having actual instructions after it breaks the "optimized' test. Make sure to scan all bytes before deciding to optimize the NOPs in there. 
Reported-by: David Woodhouse Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Tom Lendacky Cc: Andi Kleen Cc: Tim Chen Cc: Peter Zijlstra Cc: Jiri Kosina Cc: Dave Hansen Cc: Andi Kleen Cc: Andrew Lutomirski Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/20180110112815.mgciyf5acwacphkq@pd.tnic Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/alternative.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 25f909362b7a..d6f375f1b928 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -339,9 +339,12 @@ done: static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) { unsigned long flags; + int i; - if (instr[0] != 0x90) - return; + for (i = 0; i < a->padlen; i++) { + if (instr[i] != 0x90) + return; + } local_irq_save(flags); add_nops(instr + (a->instrlen - a->padlen), a->padlen); -- cgit v1.2.3 From 999d4f1961fa002bda138ddfe9119965421f85da Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 4 Jan 2018 14:37:05 +0000 Subject: x86/alternatives: Add missing '\n' at end of ALTERNATIVE inline asm commit b9e705ef7cfaf22db0daab91ad3cd33b0fa32eb9 upstream. Where an ALTERNATIVE is used in the middle of an inline asm block, this would otherwise lead to the following instruction being appended directly to the trailing ".popsection", and a failed compile. Fixes: 9cebed423c84 ("x86, alternative: Use .pushsection/.popsection") Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: ak@linux.intel.com Cc: Tim Chen Cc: Peter Zijlstra Cc: Paul Turner Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180104143710.8961-8-dwmw@amazon.co.uk Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/alternative.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 09936e9c8154..d1cf17173b1b 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -138,7 +138,7 @@ static inline int alternatives_text_reserved(void *start, void *end) ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ - ".popsection" + ".popsection\n" #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ OLDINSTR_2(oldinstr, 1, 2) \ @@ -149,7 +149,7 @@ static inline int alternatives_text_reserved(void *start, void *end) ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ - ".popsection" + ".popsection\n" /* * This must be included *after* the definition of ALTERNATIVE due to -- cgit v1.2.3 From 20c28c04a6bc2ebd60fa20e5c3a6bf3bfa736d81 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 8 Jan 2018 16:09:21 -0600 Subject: x86/cpu/AMD: Make LFENCE a serializing instruction commit e4d0e84e490790798691aaa0f2e598637f1867ec upstream. To aid in speculation control, make LFENCE a serializing instruction since it has less overhead than MFENCE. This is done by setting bit 1 of MSR 0xc0011029 (DE_CFG). Some families that support LFENCE do not have this MSR. 
For these families, the LFENCE instruction is already serializing. Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Reviewed-by: Borislav Petkov Cc: Peter Zijlstra Cc: Tim Chen Cc: Dave Hansen Cc: Borislav Petkov Cc: Dan Williams Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: David Woodhouse Cc: Paul Turner Link: https://lkml.kernel.org/r/20180108220921.12580.71694.stgit@tlendack-t1.amdoffice.net Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/msr-index.h | 2 ++ arch/x86/kernel/cpu/amd.c | 10 ++++++++++ 2 files changed, 12 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 37db36fddc88..f00dfff81370 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -330,6 +330,8 @@ #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL #define FAM10H_MMIO_CONF_BASE_SHIFT 20 #define MSR_FAM10H_NODE_ID 0xc001100c +#define MSR_F10H_DECFG 0xc0011029 +#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e2defc7593a4..7f2e9c6ff239 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -746,6 +746,16 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_K8); if (cpu_has_xmm2) { + /* + * A serializing LFENCE has less overhead than MFENCE, so + * use it for execution serialization. On families which + * don't have that MSR, LFENCE is already serializing. + * msr_set_bit() uses the safe accessors, too, even if the MSR + * is not present. + */ + msr_set_bit(MSR_F10H_DECFG, + MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + /* MFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); } -- cgit v1.2.3 From 642ce1bb5ea64b1d036dff04a005d9bbe28ab5b4 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Mon, 8 Jan 2018 16:09:32 -0600 Subject: x86/cpu/AMD: Use LFENCE_RDTSC in preference to MFENCE_RDTSC commit 9c6a73c75864ad9fa49e5fa6513e4c4071c0e29f upstream. With LFENCE now a serializing instruction, use LFENCE_RDTSC in preference to MFENCE_RDTSC. However, since the kernel could be running under a hypervisor that does not support writing that MSR, read the MSR back and verify that the bit has been set successfully. If the MSR can be read and the bit is set, then set the LFENCE_RDTSC feature, otherwise set the MFENCE_RDTSC feature. 
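[ ed. note: the read-back is the key design choice here. msr_set_bit()
  already uses the exception-safe accessors (see the previous patch),
  so a hypervisor that silently drops the WRMSR does not crash the
  guest; it is the rdmsrl_safe() read of DE_CFG that then decides
  whether LFENCE can really be trusted as serializing, or whether the
  kernel must keep using MFENCE to stop RDTSC speculation. ]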
Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Reviewed-by: Borislav Petkov Cc: Peter Zijlstra Cc: Tim Chen Cc: Dave Hansen Cc: Borislav Petkov Cc: Dan Williams Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: David Woodhouse Cc: Paul Turner Link: https://lkml.kernel.org/r/20180108220932.12580.52458.stgit@tlendack-t1.amdoffice.net Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/msr-index.h | 1 + arch/x86/kernel/cpu/amd.c | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index f00dfff81370..b8911aecf035 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -332,6 +332,7 @@ #define MSR_FAM10H_NODE_ID 0xc001100c #define MSR_F10H_DECFG 0xc0011029 #define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 +#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT) /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 7f2e9c6ff239..4bf9e77f3e05 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -746,6 +746,9 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_K8); if (cpu_has_xmm2) { + unsigned long long val; + int ret; + /* * A serializing LFENCE has less overhead than MFENCE, so * use it for execution serialization. On families which @@ -756,8 +759,19 @@ static void init_amd(struct cpuinfo_x86 *c) msr_set_bit(MSR_F10H_DECFG, MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + /* + * Verify that the MSR write was successful (could be running + * under a hypervisor) and only then assume that LFENCE is + * serializing. + */ + ret = rdmsrl_safe(MSR_F10H_DECFG, &val); + if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { + /* A serializing LFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + } else { + /* MFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + } } /* -- cgit v1.2.3 From 416f66509fce36e128ea2d9e62739a21f1a1d04d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sun, 17 Sep 2017 09:03:50 -0700 Subject: x86/mm/32: Move setup_clear_cpu_cap(X86_FEATURE_PCID) earlier commit b8b7abaed7a49b350f8ba659ddc264b04931d581 upstream. Otherwise we might have the PCID feature bit set during cpu_init(). This is just for robustness. I haven't seen any actual bugs here. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: cba4671af755 ("x86/mm: Disable PCID on 32-bit kernels") Link: http://lkml.kernel.org/r/b16dae9d6b0db5d9801ddbebbfd83384097c61f3.1505663533.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 8 -------- arch/x86/kernel/cpu/common.c | 8 ++++++++ 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index cd46f9039119..cb6b4f9d0b7a 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -22,14 +22,6 @@ void __init check_bugs(void) { -#ifdef CONFIG_X86_32 - /* - * Regardless of whether PCID is enumerated, the SDM says - * that it can't be enabled in 32-bit mode. 
- */ - setup_clear_cpu_cap(X86_FEATURE_PCID); -#endif - identify_boot_cpu(); if (!IS_ENABLED(CONFIG_SMP)) { diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index dc4dfad66a70..0531c1707b40 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -838,6 +838,14 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V2); fpu__init_system(c); + +#ifdef CONFIG_X86_32 + /* + * Regardless of whether PCID is enumerated, the SDM says + * that it can't be enabled in 32-bit mode. + */ + setup_clear_cpu_cap(X86_FEATURE_PCID); +#endif } void __init early_cpu_init(void) -- cgit v1.2.3 From cfc8c1d61e46fd3c60a34a5b1962eeeb03222a3d Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 29 Sep 2017 17:15:36 +0300 Subject: x86/asm: Use register variable to get stack pointer value commit 196bd485ee4f03ce4c690bfcf38138abfcd0a4bc upstream. Currently we use current_stack_pointer() function to get the value of the stack pointer register. Since commit: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") ... we have a stack register variable declared. It can be used instead of current_stack_pointer() function which allows to optimize away some excessive "mov %rsp, %" instructions: -mov %rsp,%rdx -sub %rdx,%rax -cmp $0x3fff,%rax -ja ffffffff810722fd +sub %rsp,%rax +cmp $0x3fff,%rax +ja ffffffff810722fa Remove current_stack_pointer(), rename __asm_call_sp to current_stack_pointer and use it instead of the removed function. Signed-off-by: Andrey Ryabinin Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170929141537.29167-1-aryabinin@virtuozzo.com Signed-off-by: Ingo Molnar [dwmw2: We want ASM_CALL_CONSTRAINT for retpoline] Signed-off-by: David Woodhouse Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/asm.h | 11 +++++++++++ arch/x86/include/asm/thread_info.h | 11 ----------- arch/x86/kernel/irq_32.c | 6 +++--- arch/x86/kernel/traps.c | 2 +- 4 files changed, 15 insertions(+), 15 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index 189679aba703..b9c6c7a6f5a6 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -105,4 +105,15 @@ /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif +#ifndef __ASSEMBLY__ +/* + * This output constraint should be used for any inline asm which has a "call" + * instruction. Otherwise the asm may be inserted before the frame pointer + * gets set up by the containing function. If you forget to do this, objtool + * may print a "call without frame pointer save/setup" warning. 
+ */ +register unsigned long current_stack_pointer asm(_ASM_SP); +#define ASM_CALL_CONSTRAINT "+r" (current_stack_pointer) +#endif + #endif /* _ASM_X86_ASM_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index c7b551028740..9b028204685d 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -166,17 +166,6 @@ static inline struct thread_info *current_thread_info(void) return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); } -static inline unsigned long current_stack_pointer(void) -{ - unsigned long sp; -#ifdef CONFIG_X86_64 - asm("mov %%rsp,%0" : "=g" (sp)); -#else - asm("mov %%esp,%0" : "=g" (sp)); -#endif - return sp; -} - #else /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 38da8f29a9c8..647089c64d13 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -65,7 +65,7 @@ static void call_on_stack(void *func, void *stack) static inline void *current_stack(void) { - return (void *)(current_stack_pointer() & ~(THREAD_SIZE - 1)); + return (void *)(current_stack_pointer & ~(THREAD_SIZE - 1)); } static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) @@ -89,7 +89,7 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) /* Save the next esp at the bottom of the stack */ prev_esp = (u32 *)irqstk; - *prev_esp = current_stack_pointer(); + *prev_esp = current_stack_pointer; if (unlikely(overflow)) call_on_stack(print_stack_overflow, isp); @@ -142,7 +142,7 @@ void do_softirq_own_stack(void) /* Push the previous esp onto the stack */ prev_esp = (u32 *)irqstk; - *prev_esp = current_stack_pointer(); + *prev_esp = current_stack_pointer; call_on_stack(__do_softirq, isp); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 679302c312f8..22b81f35c500 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -166,7 +166,7 @@ void ist_begin_non_atomic(struct pt_regs *regs) * from double_fault. */ BUG_ON((unsigned long)(current_top_of_stack() - - current_stack_pointer()) >= THREAD_SIZE); + current_stack_pointer) >= THREAD_SIZE); preempt_enable_no_resched(); } -- cgit v1.2.3 From b76ac90af34dc08f057da15aa34584a0a3d381b5 Mon Sep 17 00:00:00 2001 From: Adam Borowski Date: Sun, 11 Dec 2016 02:09:18 +0100 Subject: x86/kbuild: enable modversions for symbols exported from asm commit 334bb773876403eae3457d81be0b8ea70f8e4ccc upstream. Commit 4efca4ed ("kbuild: modversions for EXPORT_SYMBOL() for asm") adds modversion support for symbols exported from asm files. Architectures must include C-style declarations for those symbols in asm/asm-prototypes.h in order for them to be versioned. Add these declarations for x86, and an architecture-independent file that can be used for common symbols. With f27c2f6 reverting 8ab2ae6 ("default exported asm symbols to zero") we produce a scary warning on x86, this commit fixes that. 
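As a rough illustration (the include list below is an example, not the exact upstream header), such a file simply collects C-style prototypes for symbols that are defined and exported from .S files, so genksyms can compute CRCs for them:

	/* arch/x86/include/asm/asm-prototypes.h -- illustrative sketch */
	#include <asm/string.h>		/* memcpy(), memset(), ... */
	#include <asm/checksum.h>	/* csum_partial_copy_generic(), ... */

	#ifndef CONFIG_X86_CMPXCHG64
	extern void cmpxchg8b_emu(void);	/* arch/x86/lib/cmpxchg8b_emu.S */
	#endif
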
Signed-off-by: Adam Borowski Tested-by: Kalle Valo Acked-by: Nicholas Piggin Tested-by: Peter Wu Tested-by: Oliver Hartkopp Signed-off-by: Michal Marek Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/asm-prototypes.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 arch/x86/include/asm/asm-prototypes.h (limited to 'arch') diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h new file mode 100644 index 000000000000..44b8762fa0c7 --- /dev/null +++ b/arch/x86/include/asm/asm-prototypes.h @@ -0,0 +1,16 @@ +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#ifndef CONFIG_X86_CMPXCHG64 +extern void cmpxchg8b_emu(void); +#endif -- cgit v1.2.3 From b8e7a489b51810992ba3266bfd9e043709e07267 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 26 Apr 2016 12:23:25 -0700 Subject: x86/asm: Make asm/alternative.h safe from assembly commit f005f5d860e0231fe212cfda8c1a3148b99609f4 upstream. asm/alternative.h isn't directly useful from assembly, but it shouldn't break the build. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e5b693fcef99fe6e80341c9e97a002fb23871e91.1461698311.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/alternative.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index d1cf17173b1b..215ea9214215 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_ALTERNATIVE_H #define _ASM_X86_ALTERNATIVE_H +#ifndef __ASSEMBLY__ + #include #include #include @@ -271,4 +273,6 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +#endif /* __ASSEMBLY__ */ + #endif /* _ASM_X86_ALTERNATIVE_H */ -- cgit v1.2.3 From 3c5e10905263dbe9fbc621d1889b85e9c867da25 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:25 +0000 Subject: x86/retpoline: Add initial retpoline support commit 76b043848fd22dbf7f8bf3a1452f8c70d557b860 upstream. Enable the use of -mindirect-branch=thunk-extern in newer GCC, and provide the corresponding thunks. Provide assembler macros for invoking the thunks in the same way that GCC does, from native and inline assembler. This adds X86_FEATURE_RETPOLINE and sets it by default on all CPUs. In some circumstances, IBRS microcode features may be used instead, and the retpoline can be disabled. On AMD CPUs if lfence is serialising, the retpoline can be dramatically simplified to a simple "lfence; jmp *\reg". A future patch, after it has been verified that lfence really is serialising in all circumstances, can enable this by setting the X86_FEATURE_RETPOLINE_AMD feature bit in addition to X86_FEATURE_RETPOLINE. Do not align the retpoline in the altinstr section, because there is no guarantee that it stays aligned when it's copied over the oldinstr during alternative patching. 
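Conceptually (a sketch, not literal compiler output, and the function names are arbitrary), an indirect call in C is diverted through an out-of-line thunk instead of being emitted as a bare indirect branch:

	void (*fn)(void);

	void dispatch(void)
	{
		/*
		 * Without retpoline this compiles to an indirect branch such
		 * as "call *%rax". With -mindirect-branch=thunk-extern the
		 * compiler instead emits "call __x86_indirect_thunk_rax",
		 * and the kernel-provided thunk performs the branch via a
		 * 'ret' whose mispredicted speculation is trapped in a tight
		 * pause loop, so the indirect branch predictor is bypassed.
		 */
		fn();
	}
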
[ Andi Kleen: Rename the macros, add CONFIG_RETPOLINE option, export thunks] [ tglx: Put actual function CALL/JMP in front of the macros, convert to symbolic labels ] [ dwmw2: Convert back to numeric labels, merge objtool fixes ] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-4-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse [ 4.4 backport: removed objtool annotation since there is no objtool ] Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 13 +++++ arch/x86/Makefile | 10 ++++ arch/x86/include/asm/asm-prototypes.h | 25 ++++++++ arch/x86/include/asm/cpufeature.h | 2 + arch/x86/include/asm/nospec-branch.h | 106 ++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/common.c | 4 ++ arch/x86/lib/Makefile | 1 + arch/x86/lib/retpoline.S | 48 +++++++++++++++ 8 files changed, 209 insertions(+) create mode 100644 arch/x86/include/asm/nospec-branch.h create mode 100644 arch/x86/lib/retpoline.S (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0ef2cdd11616..75d0053b495a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -379,6 +379,19 @@ config GOLDFISH def_bool y depends on X86_GOLDFISH +config RETPOLINE + bool "Avoid speculative indirect branches in kernel" + default y + ---help--- + Compile kernel with the retpoline compiler options to guard against + kernel-to-user data leaks by avoiding speculative indirect + branches. Requires a compiler with -mindirect-branch=thunk-extern + support for full protection. The kernel may run slower. + + Without compiler support, at least indirect branches in assembler + code are eliminated. Since this includes the syscall entry path, + it is not entirely pointless. + if X86_32 config X86_EXTENDED_PLATFORM bool "Support for extended (non-PC) x86 platforms" diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 4086abca0b32..34fdac520edb 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -189,6 +189,16 @@ KBUILD_CFLAGS += -fno-asynchronous-unwind-tables KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) +# Avoid indirect branches in kernel to deal with Spectre +ifdef CONFIG_RETPOLINE + RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) + ifneq ($(RETPOLINE_CFLAGS),) + KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE + else + $(warning CONFIG_RETPOLINE=y, but not supported by the compiler. Toolchain update recommended.) 
+ endif +endif + archscripts: scripts_basic $(Q)$(MAKE) $(build)=arch/x86/tools relocs diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index 44b8762fa0c7..b15aa4083dfd 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -10,7 +10,32 @@ #include #include #include +#include #ifndef CONFIG_X86_CMPXCHG64 extern void cmpxchg8b_emu(void); #endif + +#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_X86_32 +#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_e ## reg(void); +#else +#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_r ## reg(void); +INDIRECT_THUNK(8) +INDIRECT_THUNK(9) +INDIRECT_THUNK(10) +INDIRECT_THUNK(11) +INDIRECT_THUNK(12) +INDIRECT_THUNK(13) +INDIRECT_THUNK(14) +INDIRECT_THUNK(15) +#endif +INDIRECT_THUNK(ax) +INDIRECT_THUNK(bx) +INDIRECT_THUNK(cx) +INDIRECT_THUNK(dx) +INDIRECT_THUNK(si) +INDIRECT_THUNK(di) +INDIRECT_THUNK(bp) +INDIRECT_THUNK(sp) +#endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 142028afd049..0fbc98568018 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -200,6 +200,8 @@ #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ +#define X86_FEATURE_RETPOLINE ( 7*32+29) /* Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_AMD ( 7*32+30) /* AMD Retpoline mitigation for Spectre variant 2 */ /* Because the ALTERNATIVE scheme is for members of the X86_FEATURE club... */ #define X86_FEATURE_KAISER ( 7*32+31) /* CONFIG_PAGE_TABLE_ISOLATION w/o nokaiser */ diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h new file mode 100644 index 000000000000..5763548fb30b --- /dev/null +++ b/arch/x86/include/asm/nospec-branch.h @@ -0,0 +1,106 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __NOSPEC_BRANCH_H__ +#define __NOSPEC_BRANCH_H__ + +#include +#include +#include + +#ifdef __ASSEMBLY__ + +/* + * These are the bare retpoline primitives for indirect jmp and call. + * Do not use these directly; they only exist to make the ALTERNATIVE + * invocation below less ugly. + */ +.macro RETPOLINE_JMP reg:req + call .Ldo_rop_\@ +.Lspec_trap_\@: + pause + jmp .Lspec_trap_\@ +.Ldo_rop_\@: + mov \reg, (%_ASM_SP) + ret +.endm + +/* + * This is a wrapper around RETPOLINE_JMP so the called function in reg + * returns to the instruction after the macro. + */ +.macro RETPOLINE_CALL reg:req + jmp .Ldo_call_\@ +.Ldo_retpoline_jmp_\@: + RETPOLINE_JMP \reg +.Ldo_call_\@: + call .Ldo_retpoline_jmp_\@ +.endm + +/* + * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple + * indirect jmp/call which may be susceptible to the Spectre variant 2 + * attack. 
+ */ +.macro JMP_NOSPEC reg:req +#ifdef CONFIG_RETPOLINE + ALTERNATIVE_2 __stringify(jmp *\reg), \ + __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \ + __stringify(lfence; jmp *\reg), X86_FEATURE_RETPOLINE_AMD +#else + jmp *\reg +#endif +.endm + +.macro CALL_NOSPEC reg:req +#ifdef CONFIG_RETPOLINE + ALTERNATIVE_2 __stringify(call *\reg), \ + __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\ + __stringify(lfence; call *\reg), X86_FEATURE_RETPOLINE_AMD +#else + call *\reg +#endif +.endm + +#else /* __ASSEMBLY__ */ + +#if defined(CONFIG_X86_64) && defined(RETPOLINE) + +/* + * Since the inline asm uses the %V modifier which is only in newer GCC, + * the 64-bit one is dependent on RETPOLINE not CONFIG_RETPOLINE. + */ +# define CALL_NOSPEC \ + ALTERNATIVE( \ + "call *%[thunk_target]\n", \ + "call __x86_indirect_thunk_%V[thunk_target]\n", \ + X86_FEATURE_RETPOLINE) +# define THUNK_TARGET(addr) [thunk_target] "r" (addr) + +#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE) +/* + * For i386 we use the original ret-equivalent retpoline, because + * otherwise we'll run out of registers. We don't care about CET + * here, anyway. + */ +# define CALL_NOSPEC ALTERNATIVE("call *%[thunk_target]\n", \ + " jmp 904f;\n" \ + " .align 16\n" \ + "901: call 903f;\n" \ + "902: pause;\n" \ + " jmp 902b;\n" \ + " .align 16\n" \ + "903: addl $4, %%esp;\n" \ + " pushl %[thunk_target];\n" \ + " ret;\n" \ + " .align 16\n" \ + "904: call 901b;\n", \ + X86_FEATURE_RETPOLINE) + +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +#else /* No retpoline */ +# define CALL_NOSPEC "call *%[thunk_target]\n" +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* __NOSPEC_BRANCH_H__ */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0531c1707b40..5b3a6e888bc5 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -837,6 +837,10 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); +#ifdef CONFIG_RETPOLINE + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); +#endif + fpu__init_system(c); #ifdef CONFIG_X86_32 diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f2587888d987..12a34d15b648 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -21,6 +21,7 @@ lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o +lib-$(CONFIG_RETPOLINE) += retpoline.o obj-y += msr.o msr-reg.o msr-reg-export.o diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S new file mode 100644 index 000000000000..019a03599bb0 --- /dev/null +++ b/arch/x86/lib/retpoline.S @@ -0,0 +1,48 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#include +#include +#include +#include +#include +#include +#include + +.macro THUNK reg + .section .text.__x86.indirect_thunk.\reg + +ENTRY(__x86_indirect_thunk_\reg) + CFI_STARTPROC + JMP_NOSPEC %\reg + CFI_ENDPROC +ENDPROC(__x86_indirect_thunk_\reg) +.endm + +/* + * Despite being an assembler file we can't just use .irp here + * because __KSYM_DEPS__ only uses the C preprocessor and would + * only see one instance of "__x86_indirect_thunk_\reg" rather + * than one per register with the correct names. So we do it + * the simple and nasty way... 
+ */ +#define EXPORT_THUNK(reg) EXPORT_SYMBOL(__x86_indirect_thunk_ ## reg) +#define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg) + +GENERATE_THUNK(_ASM_AX) +GENERATE_THUNK(_ASM_BX) +GENERATE_THUNK(_ASM_CX) +GENERATE_THUNK(_ASM_DX) +GENERATE_THUNK(_ASM_SI) +GENERATE_THUNK(_ASM_DI) +GENERATE_THUNK(_ASM_BP) +GENERATE_THUNK(_ASM_SP) +#ifdef CONFIG_64BIT +GENERATE_THUNK(r8) +GENERATE_THUNK(r9) +GENERATE_THUNK(r10) +GENERATE_THUNK(r11) +GENERATE_THUNK(r12) +GENERATE_THUNK(r13) +GENERATE_THUNK(r14) +GENERATE_THUNK(r15) +#endif -- cgit v1.2.3 From 9f789bc5711bcacb5df003594b992f0c1cc19df4 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:26 +0000 Subject: x86/spectre: Add boot time option to select Spectre v2 mitigation commit da285121560e769cc31797bba6422eea71d473e0 upstream. Add a spectre_v2= option to select the mitigation used for the indirect branch speculation vulnerability. Currently, the only option available is retpoline, in its various forms. This will be expanded to cover the new IBRS/IBPB microcode features. The RETPOLINE_AMD feature relies on a serializing LFENCE for speculation control. For AMD hardware, only set RETPOLINE_AMD if LFENCE is a serializing instruction, which is indicated by the LFENCE_RDTSC feature. [ tglx: Folded back the LFENCE/AMD fixes and reworked it so IBRS integration becomes simple ] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-5-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 10 +++ arch/x86/kernel/cpu/bugs.c | 158 ++++++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/common.c | 4 - 3 files changed, 167 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 5763548fb30b..fe48aeee79d1 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -102,5 +102,15 @@ # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif +/* The Spectre V2 mitigation variants */ +enum spectre_v2_mitigation { + SPECTRE_V2_NONE, + SPECTRE_V2_RETPOLINE_MINIMAL, + SPECTRE_V2_RETPOLINE_MINIMAL_AMD, + SPECTRE_V2_RETPOLINE_GENERIC, + SPECTRE_V2_RETPOLINE_AMD, + SPECTRE_V2_IBRS, +}; + #endif /* __ASSEMBLY__ */ #endif /* __NOSPEC_BRANCH_H__ */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index cb6b4f9d0b7a..49d25ddf0e9f 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -10,6 +10,9 @@ #include #include #include + +#include +#include #include #include #include @@ -20,6 +23,8 @@ #include #include +static void __init spectre_v2_select_mitigation(void); + void __init check_bugs(void) { identify_boot_cpu(); @@ -29,6 +34,9 @@ void __init check_bugs(void) print_cpu_info(&boot_cpu_data); } + /* Select the proper spectre mitigation before patching alternatives */ + spectre_v2_select_mitigation(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. 
@@ -61,6 +69,153 @@ void __init check_bugs(void) #endif } +/* The kernel command line selection */ +enum spectre_v2_mitigation_cmd { + SPECTRE_V2_CMD_NONE, + SPECTRE_V2_CMD_AUTO, + SPECTRE_V2_CMD_FORCE, + SPECTRE_V2_CMD_RETPOLINE, + SPECTRE_V2_CMD_RETPOLINE_GENERIC, + SPECTRE_V2_CMD_RETPOLINE_AMD, +}; + +static const char *spectre_v2_strings[] = { + [SPECTRE_V2_NONE] = "Vulnerable", + [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline", + [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", + [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", + [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", +}; + +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt + +static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; + +static void __init spec2_print_if_insecure(const char *reason) +{ + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + pr_info("%s\n", reason); +} + +static void __init spec2_print_if_secure(const char *reason) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + pr_info("%s\n", reason); +} + +static inline bool retp_compiler(void) +{ + return __is_defined(RETPOLINE); +} + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt); + + return len == arglen && !strncmp(arg, opt, len); +} + +static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) +{ + char arg[20]; + int ret; + + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, + sizeof(arg)); + if (ret > 0) { + if (match_option(arg, ret, "off")) { + goto disable; + } else if (match_option(arg, ret, "on")) { + spec2_print_if_secure("force enabled on command line."); + return SPECTRE_V2_CMD_FORCE; + } else if (match_option(arg, ret, "retpoline")) { + spec2_print_if_insecure("retpoline selected on command line."); + return SPECTRE_V2_CMD_RETPOLINE; + } else if (match_option(arg, ret, "retpoline,amd")) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { + pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); + return SPECTRE_V2_CMD_AUTO; + } + spec2_print_if_insecure("AMD retpoline selected on command line."); + return SPECTRE_V2_CMD_RETPOLINE_AMD; + } else if (match_option(arg, ret, "retpoline,generic")) { + spec2_print_if_insecure("generic retpoline selected on command line."); + return SPECTRE_V2_CMD_RETPOLINE_GENERIC; + } else if (match_option(arg, ret, "auto")) { + return SPECTRE_V2_CMD_AUTO; + } + } + + if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + return SPECTRE_V2_CMD_AUTO; +disable: + spec2_print_if_insecure("disabled on command line."); + return SPECTRE_V2_CMD_NONE; +} + +static void __init spectre_v2_select_mitigation(void) +{ + enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); + enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; + + /* + * If the CPU is not affected and the command line mode is NONE or AUTO + * then nothing to do. 
+ */ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && + (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) + return; + + switch (cmd) { + case SPECTRE_V2_CMD_NONE: + return; + + case SPECTRE_V2_CMD_FORCE: + /* FALLTRHU */ + case SPECTRE_V2_CMD_AUTO: + goto retpoline_auto; + + case SPECTRE_V2_CMD_RETPOLINE_AMD: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_amd; + break; + case SPECTRE_V2_CMD_RETPOLINE_GENERIC: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_generic; + break; + case SPECTRE_V2_CMD_RETPOLINE: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_auto; + break; + } + pr_err("kernel not compiled with retpoline; no mitigation available!"); + return; + +retpoline_auto: + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + retpoline_amd: + if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("LFENCE not serializing. Switching to generic retpoline\n"); + goto retpoline_generic; + } + mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD : + SPECTRE_V2_RETPOLINE_MINIMAL_AMD; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + } else { + retpoline_generic: + mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_GENERIC : + SPECTRE_V2_RETPOLINE_MINIMAL; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + } + + spectre_v2_enabled = mode; + pr_info("%s\n", spectre_v2_strings[mode]); +} + +#undef pr_fmt + #ifdef CONFIG_SYSFS ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) @@ -85,6 +240,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, { if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) return sprintf(buf, "Not affected\n"); - return sprintf(buf, "Vulnerable\n"); + + return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]); } #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 5b3a6e888bc5..0531c1707b40 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -837,10 +837,6 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); -#ifdef CONFIG_RETPOLINE - setup_force_cpu_cap(X86_FEATURE_RETPOLINE); -#endif - fpu__init_system(c); #ifdef CONFIG_X86_32 -- cgit v1.2.3 From 9fe55976f0c8acfd7408bf693b6d171587b62129 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:27 +0000 Subject: x86/retpoline/crypto: Convert crypto assembler indirect jumps commit 9697fa39efd3fc3692f2949d4045f393ec58450b upstream. Convert all indirect jumps in crypto assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. 
Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-6-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/crypto/aesni-intel_asm.S | 5 +++-- arch/x86/crypto/camellia-aesni-avx-asm_64.S | 3 ++- arch/x86/crypto/camellia-aesni-avx2-asm_64.S | 3 ++- arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S index 6bd2c6c95373..3f93dedb5a4d 100644 --- a/arch/x86/crypto/aesni-intel_asm.S +++ b/arch/x86/crypto/aesni-intel_asm.S @@ -31,6 +31,7 @@ #include #include +#include /* * The following macros are used to move an (un)aligned 16 byte value to/from @@ -2714,7 +2715,7 @@ ENTRY(aesni_xts_crypt8) pxor INC, STATE4 movdqu IV, 0x30(OUTP) - call *%r11 + CALL_NOSPEC %r11 movdqu 0x00(OUTP), INC pxor INC, STATE1 @@ -2759,7 +2760,7 @@ ENTRY(aesni_xts_crypt8) _aesni_gf128mul_x_ble() movups IV, (IVP) - call *%r11 + CALL_NOSPEC %r11 movdqu 0x40(OUTP), INC pxor INC, STATE1 diff --git a/arch/x86/crypto/camellia-aesni-avx-asm_64.S b/arch/x86/crypto/camellia-aesni-avx-asm_64.S index ce71f9212409..5881756f78a2 100644 --- a/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -16,6 +16,7 @@ */ #include +#include #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -1210,7 +1211,7 @@ camellia_xts_crypt_16way: vpxor 14 * 16(%rax), %xmm15, %xmm14; vpxor 15 * 16(%rax), %xmm15, %xmm15; - call *%r9; + CALL_NOSPEC %r9; addq $(16 * 16), %rsp; diff --git a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S index 0e0b8863a34b..0d45b04b490a 100644 --- a/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ b/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -11,6 +11,7 @@ */ #include +#include #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -1323,7 +1324,7 @@ camellia_xts_crypt_32way: vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; - call *%r9; + CALL_NOSPEC %r9; addq $(16 * 32), %rsp; diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S index 4fe27e074194..48767520cbe0 100644 --- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -45,6 +45,7 @@ #include #include +#include ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction @@ -172,7 +173,7 @@ continue_block: movzxw (bufp, %rax, 2), len offset=crc_array-jump_table lea offset(bufp, len, 1), bufp - jmp *bufp + JMP_NOSPEC bufp ################################################################ ## 2a) PROCESS FULL BLOCKS: -- cgit v1.2.3 From 028083cb02db69237e73950576bc81ac579693dc Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:28 +0000 Subject: x86/retpoline/entry: Convert entry assembler indirect jumps commit 2641f08bb7fc63a636a2b18173221d7040a3512e upstream. Convert indirect jumps in core 32/64bit entry assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. 
Don't use CALL_NOSPEC in entry_SYSCALL_64_fastpath because the return address after the 'call' instruction must be *precisely* at the .Lentry_SYSCALL_64_after_fastpath label for stub_ptregs_64 to work, and the use of alternatives will mess that up unless we play horrid games to prepend with NOPs and make the variants the same length. It's not worth it; in the case where we ALTERNATIVE out the retpoline, the first instruction at __x86.indirect_thunk.rax is going to be a bare jmp *%rax anyway. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Acked-by: Arjan van de Ven Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-7-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_32.S | 6 ++++-- arch/x86/entry/entry_64.S | 14 +++++++++++++- 2 files changed, 17 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index ae678ad128a9..adbbd4f538e9 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -44,6 +44,7 @@ #include #include #include +#include .section .entry.text, "ax" @@ -226,7 +227,8 @@ ENTRY(ret_from_kernel_thread) pushl $0x0202 # Reset kernel eflags popfl movl PT_EBP(%esp), %eax - call *PT_EBX(%esp) + movl PT_EBX(%esp), %edx + CALL_NOSPEC %edx movl $0, PT_EAX(%esp) /* @@ -938,7 +940,7 @@ error_code: movl %ecx, %es TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer - call *%edi + CALL_NOSPEC %edi jmp ret_from_exception END(page_fault) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 952b23b5d4e9..81b1cd533965 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -36,6 +36,7 @@ #include #include #include +#include #include /* Avoid __ASSEMBLER__'ifying just for this. */ @@ -184,7 +185,13 @@ entry_SYSCALL_64_fastpath: #endif ja 1f /* return -ENOSYS (already in pt_regs->ax) */ movq %r10, %rcx +#ifdef CONFIG_RETPOLINE + movq sys_call_table(, %rax, 8), %rax + call __x86_indirect_thunk_rax +#else call *sys_call_table(, %rax, 8) +#endif + movq %rax, RAX(%rsp) 1: /* @@ -276,7 +283,12 @@ tracesys_phase2: #endif ja 1f /* return -ENOSYS (already in pt_regs->ax) */ movq %r10, %rcx /* fixup for C */ +#ifdef CONFIG_RETPOLINE + movq sys_call_table(, %rax, 8), %rax + call __x86_indirect_thunk_rax +#else call *sys_call_table(, %rax, 8) +#endif movq %rax, RAX(%rsp) 1: /* Use IRET because user could have changed pt_regs->foo */ @@ -491,7 +503,7 @@ ENTRY(ret_from_fork) * nb: we depend on RESTORE_EXTRA_REGS above */ movq %rbp, %rdi - call *%rbx + CALL_NOSPEC %rbx movl $0, RAX(%rsp) RESTORE_EXTRA_REGS jmp int_ret_from_sys_call -- cgit v1.2.3 From 7153a6d5ff050050555066f58ac3458c5efc699b Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:29 +0000 Subject: x86/retpoline/ftrace: Convert ftrace assembler indirect jumps commit 9351803bd803cdbeb9b5a7850b7b6f464806e3db upstream. Convert all indirect jumps in ftrace assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. 
Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-8-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_32.S | 5 +++-- arch/x86/kernel/mcount_64.S | 7 ++++--- 2 files changed, 7 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index adbbd4f538e9..d437f3871e53 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -863,7 +863,8 @@ trace: movl 0x4(%ebp), %edx subl $MCOUNT_INSN_SIZE, %eax - call *ftrace_trace_function + movl ftrace_trace_function, %ecx + CALL_NOSPEC %ecx popl %edx popl %ecx @@ -898,7 +899,7 @@ return_to_handler: movl %eax, %ecx popl %edx popl %eax - jmp *%ecx + JMP_NOSPEC %ecx #endif #ifdef CONFIG_TRACING diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index 5d9afbcb6074..09284cfab86f 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -7,7 +7,7 @@ #include #include #include - +#include .code64 .section .entry.text, "ax" @@ -285,8 +285,9 @@ trace: * ip and parent ip are used and the list function is called when * function tracing is enabled. */ - call *ftrace_trace_function + movq ftrace_trace_function, %r8 + CALL_NOSPEC %r8 restore_mcount_regs jmp fgraph_trace @@ -329,5 +330,5 @@ GLOBAL(return_to_handler) movq 8(%rsp), %rdx movq (%rsp), %rax addq $24, %rsp - jmp *%rdi + JMP_NOSPEC %rdi #endif -- cgit v1.2.3 From 6b222e7483af4fd8f632efbf3b91025c2359b10a Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:31 +0000 Subject: x86/retpoline/xen: Convert Xen hypercall indirect jumps commit ea08816d5b185ab3d09e95e393f265af54560350 upstream. Convert indirect call in Xen hypercall to use non-speculative sequence, when CONFIG_RETPOLINE is enabled. 
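The C-side pattern is the same one visible in the hypercall diff below: the bare indirect "call" is replaced by CALL_NOSPEC and the target is passed via THUNK_TARGET. A minimal 64-bit sketch, assuming a hypothetical dispatcher and callback (real call sites choose their own operands and clobber lists):

	#include <asm/asm.h>
	#include <asm/nospec-branch.h>

	static void (*callback)(void);

	static void run_callback(void)
	{
		asm volatile(CALL_NOSPEC
			     : ASM_CALL_CONSTRAINT
			     : THUNK_TARGET(callback)
			     : "memory", "cc", "rax", "rcx", "rdx", "rsi",
			       "rdi", "r8", "r9", "r10", "r11");
	}
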
Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Reviewed-by: Juergen Gross Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-10-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/xen/hypercall.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 85133b2b8e99..0977e7607046 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -215,9 +216,9 @@ privcmd_call(unsigned call, __HYPERCALL_5ARG(a1, a2, a3, a4, a5); stac(); - asm volatile("call *%[call]" + asm volatile(CALL_NOSPEC : __HYPERCALL_5PARAM - : [call] "a" (&hypercall_page[call]) + : [thunk_target] "a" (&hypercall_page[call]) : __HYPERCALL_CLOBBER5); clac(); -- cgit v1.2.3 From 7e5bb301bd2fdd62cbee7b26a8234cccb6731849 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Thu, 11 Jan 2018 21:46:32 +0000 Subject: x86/retpoline/checksum32: Convert assembler indirect jumps commit 5096732f6f695001fa2d6f1335a2680b37912c69 upstream. Convert all indirect jumps in 32bit checksum assembler code to use non-speculative sequences when CONFIG_RETPOLINE is enabled. Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-11-git-send-email-dwmw@amazon.co.uk Signed-off-by: Greg Kroah-Hartman --- arch/x86/lib/checksum_32.S | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index c1e623209853..90353a26ed95 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -28,7 +28,8 @@ #include #include #include - +#include + /* * computes a partial checksum, e.g. for TCP/UDP fragments */ @@ -155,7 +156,7 @@ ENTRY(csum_partial) negl %ebx lea 45f(%ebx,%ebx,2), %ebx testl %esi, %esi - jmp *%ebx + JMP_NOSPEC %ebx # Handle 2-byte-aligned regions 20: addw (%esi), %ax @@ -437,7 +438,7 @@ ENTRY(csum_partial_copy_generic) andl $-32,%edx lea 3f(%ebx,%ebx), %ebx testl %esi, %esi - jmp *%ebx + JMP_NOSPEC %ebx 1: addl $64,%esi addl $64,%edi SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) -- cgit v1.2.3 From f72655b837eb4320a1ffebbd0e0ebe92ce1e5314 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 11 Jan 2018 21:46:33 +0000 Subject: x86/retpoline/irq32: Convert assembler indirect jumps commit 7614e913db1f40fff819b36216484dc3808995d4 upstream. Convert all indirect jumps in 32bit irq inline asm code to use non speculative sequences. 
Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Acked-by: Ingo Molnar Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515707194-20531-12-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/irq_32.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 647089c64d13..528b7aa1780d 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -20,6 +20,7 @@ #include #include +#include #ifdef CONFIG_DEBUG_STACKOVERFLOW @@ -55,11 +56,11 @@ DEFINE_PER_CPU(struct irq_stack *, softirq_stack); static void call_on_stack(void *func, void *stack) { asm volatile("xchgl %%ebx,%%esp \n" - "call *%%edi \n" + CALL_NOSPEC "movl %%ebx,%%esp \n" : "=b" (stack) : "0" (stack), - "D"(func) + [thunk_target] "D"(func) : "memory", "cc", "edx", "ecx", "eax"); } @@ -95,11 +96,11 @@ static inline int execute_on_irq_stack(int overflow, struct irq_desc *desc) call_on_stack(print_stack_overflow, isp); asm volatile("xchgl %%ebx,%%esp \n" - "call *%%edi \n" + CALL_NOSPEC "movl %%ebx,%%esp \n" : "=a" (arg1), "=b" (isp) : "0" (desc), "1" (isp), - "D" (desc->handle_irq) + [thunk_target] "D" (desc->handle_irq) : "memory", "cc", "ecx"); return 1; } -- cgit v1.2.3 From eebc3f8adee0a6f43a4789ef0bf5c5b35de8cfe4 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 12 Jan 2018 11:11:27 +0000 Subject: x86/retpoline: Fill return stack buffer on vmexit commit 117cc7a908c83697b0b737d15ae1eb5943afe35b upstream. In accordance with the Intel and AMD documentation, we need to overwrite all entries in the RSB on exiting a guest, to prevent malicious branch target predictions from affecting the host kernel. This is needed both for retpoline and for IBRS. [ak: numbers again for the RSB stuffing labels] Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Tested-by: Peter Zijlstra (Intel) Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515755487-8524-1-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse Signed-off-by: Razvan Ghitulete Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 76 +++++++++++++++++++++++++++++++++++- arch/x86/kvm/svm.c | 4 ++ arch/x86/kvm/vmx.c | 4 ++ 3 files changed, 83 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index fe48aeee79d1..1afd04eb5fb7 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -7,6 +7,48 @@ #include #include +/* + * Fill the CPU return stack buffer. + * + * Each entry in the RSB, if used for a speculative 'ret', contains an + * infinite 'pause; jmp' loop to capture speculative execution. + * + * This is required in various cases for retpoline and IBRS-based + * mitigations for the Spectre variant 2 vulnerability. 
Sometimes to + * eliminate potentially bogus entries from the RSB, and sometimes + * purely to ensure that it doesn't get empty, which on some CPUs would + * allow predictions from other (unwanted!) sources to be used. + * + * We define a CPP macro such that it can be used from both .S files and + * inline assembly. It's possible to do a .macro and then include that + * from C via asm(".include ") but let's not go there. + */ + +#define RSB_CLEAR_LOOPS 32 /* To forcibly overwrite all entries */ +#define RSB_FILL_LOOPS 16 /* To avoid underflow */ + +/* + * Google experimented with loop-unrolling and this turned out to be + * the optimal version — two calls, each with their own speculation + * trap should their return address end up getting used, in a loop. + */ +#define __FILL_RETURN_BUFFER(reg, nr, sp) \ + mov $(nr/2), reg; \ +771: \ + call 772f; \ +773: /* speculation trap */ \ + pause; \ + jmp 773b; \ +772: \ + call 774f; \ +775: /* speculation trap */ \ + pause; \ + jmp 775b; \ +774: \ + dec reg; \ + jnz 771b; \ + add $(BITS_PER_LONG/8) * nr, sp; + #ifdef __ASSEMBLY__ /* @@ -59,6 +101,19 @@ #else call *\reg #endif +.endm + + /* + * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP + * monstrosity above, manually. + */ +.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req +#ifdef CONFIG_RETPOLINE + ALTERNATIVE "jmp .Lskip_rsb_\@", \ + __stringify(__FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP)) \ + \ftr +.Lskip_rsb_\@: +#endif .endm #else /* __ASSEMBLY__ */ @@ -97,7 +152,7 @@ X86_FEATURE_RETPOLINE) # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) -#else /* No retpoline */ +#else /* No retpoline for C / inline asm */ # define CALL_NOSPEC "call *%[thunk_target]\n" # define THUNK_TARGET(addr) [thunk_target] "rm" (addr) #endif @@ -112,5 +167,24 @@ enum spectre_v2_mitigation { SPECTRE_V2_IBRS, }; +/* + * On VMEXIT we must ensure that no RSB predictions learned in the guest + * can be followed in the host, by overwriting the RSB completely. Both + * retpoline and IBRS mitigations for Spectre v2 need this; only on future + * CPUs with IBRS_ATT *might* it be avoided. + */ +static inline void vmexit_fill_RSB(void) +{ +#ifdef CONFIG_RETPOLINE + unsigned long loops = RSB_CLEAR_LOOPS / 2; + + asm volatile (ALTERNATIVE("jmp 910f", + __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), + X86_FEATURE_RETPOLINE) + "910:" + : "=&r" (loops), ASM_CALL_CONSTRAINT + : "r" (loops) : "memory" ); +#endif +} #endif /* __ASSEMBLY__ */ #endif /* __NOSPEC_BRANCH_H__ */ diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 900ffb6c28b5..2038e5bacce6 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include "trace.h" @@ -3904,6 +3905,9 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + #ifdef CONFIG_X86_64 wrmsrl(MSR_GS_BASE, svm->host.gs_base); #else diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index c26255f19603..75d60e40c389 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -47,6 +47,7 @@ #include #include #include +#include #include "trace.h" #include "pmu.h" @@ -8701,6 +8702,9 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) #endif ); + /* Eliminate branch target predictions from guest mode */ + vmexit_fill_RSB(); + /* MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. 
Restore it if needed */ if (debugctlmsr) update_debugctlmsr(debugctlmsr); -- cgit v1.2.3 From 451725c3e785dfc3ede6c65184b96c213181995a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Jan 2018 22:13:29 +0100 Subject: x86/retpoline: Remove compile time warning commit b8b9ce4b5aec8de9e23cabb0a26b78641f9ab1d6 upstream. Remove the compile time warning when CONFIG_RETPOLINE=y and the compiler does not have retpoline support. Linus rationale for this is: It's wrong because it will just make people turn off RETPOLINE, and the asm updates - and return stack clearing - that are independent of the compiler are likely the most important parts because they are likely the ones easiest to target. And it's annoying because most people won't be able to do anything about it. The number of people building their own compiler? Very small. So if their distro hasn't got a compiler yet (and pretty much nobody does), the warning is just annoying crap. It is already properly reported as part of the sysfs interface. The compile-time warning only encourages bad things. Fixes: 76b043848fd2 ("x86/retpoline: Add initial retpoline support") Requested-by: Linus Torvalds Signed-off-by: Thomas Gleixner Cc: David Woodhouse Cc: Peter Zijlstra (Intel) Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Link: https://lkml.kernel.org/r/CA+55aFzWgquv4i6Mab6bASqYXg3ErV3XDFEYf=GEcCDQg5uAtw@mail.gmail.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/Makefile | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 34fdac520edb..1f9caa041bf7 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -194,8 +194,6 @@ ifdef CONFIG_RETPOLINE RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) ifneq ($(RETPOLINE_CFLAGS),) KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE - else - $(warning CONFIG_RETPOLINE=y, but not supported by the compiler. Toolchain update recommended.) endif endif -- cgit v1.2.3 From fba063e6dfb413e06b9daa5d45b164761172f5ed Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Sat, 13 Jan 2018 17:27:30 -0600 Subject: x86/retpoline: Add LFENCE to the retpoline/RSB filling RSB macros commit 28d437d550e1e39f805d99f9f8ac399c778827b7 upstream. The PAUSE instruction is currently used in the retpoline and RSB filling macros as a speculation trap. The use of PAUSE was originally suggested because it showed a very, very small difference in the amount of cycles/time used to execute the retpoline as compared to LFENCE. On AMD, the PAUSE instruction is not a serializing instruction, so the pause/jmp loop will use excess power as it is speculated over waiting for return to mispredict to the correct target. The RSB filling macro is applicable to AMD, and, if software is unable to verify that LFENCE is serializing on AMD (possible when running under a hypervisor), the generic retpoline support will be used and, so, is also applicable to AMD. Keep the current usage of PAUSE for Intel, but add an LFENCE instruction to the speculation trap for AMD. The same sequence has been adopted by GCC for the GCC generated retpolines. 
Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Acked-by: David Woodhouse Acked-by: Arjan van de Ven Cc: Rik van Riel Cc: Andi Kleen Cc: Paul Turner Cc: Peter Zijlstra Cc: Tim Chen Cc: Jiri Kosina Cc: Dave Hansen Cc: Andy Lutomirski Cc: Josh Poimboeuf Cc: Dan Williams Cc: Linus Torvalds Cc: Greg Kroah-Hartman Cc: Kees Cook Link: https://lkml.kernel.org/r/20180113232730.31060.36287.stgit@tlendack-t1.amdoffice.net Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 1afd04eb5fb7..e28a9ff1246c 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -11,7 +11,7 @@ * Fill the CPU return stack buffer. * * Each entry in the RSB, if used for a speculative 'ret', contains an - * infinite 'pause; jmp' loop to capture speculative execution. + * infinite 'pause; lfence; jmp' loop to capture speculative execution. * * This is required in various cases for retpoline and IBRS-based * mitigations for the Spectre variant 2 vulnerability. Sometimes to @@ -38,11 +38,13 @@ call 772f; \ 773: /* speculation trap */ \ pause; \ + lfence; \ jmp 773b; \ 772: \ call 774f; \ 775: /* speculation trap */ \ pause; \ + lfence; \ jmp 775b; \ 774: \ dec reg; \ @@ -60,6 +62,7 @@ call .Ldo_rop_\@ .Lspec_trap_\@: pause + lfence jmp .Lspec_trap_\@ .Ldo_rop_\@: mov \reg, (%_ASM_SP) @@ -142,6 +145,7 @@ " .align 16\n" \ "901: call 903f;\n" \ "902: pause;\n" \ + " lfence;\n" \ " jmp 902b;\n" \ " .align 16\n" \ "903: addl $4, %%esp;\n" \ -- cgit v1.2.3 From 7fd1335392891188b022f5fa518201710d5be3ac Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 Jan 2018 12:20:18 +0100 Subject: x86/apic/vector: Fix off by one in error path commit 45d55e7bac4028af93f5fa324e69958a0b868e96 upstream. Keith reported the following warning: WARNING: CPU: 28 PID: 1420 at kernel/irq/matrix.c:222 irq_matrix_remove_managed+0x10f/0x120 x86_vector_free_irqs+0xa1/0x180 x86_vector_alloc_irqs+0x1e4/0x3a0 msi_domain_alloc+0x62/0x130 The reason for this is that if the vector allocation fails the error handling code tries to free the failed vector as well, which causes the above imbalance warning to trigger. Adjust the error path to handle this correctly. 
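The underlying pattern is the classic partial-failure unwind: when step i fails, only entries 0..i-1 were fully set up, so only those may be torn down. A generic sketch of the corrected shape (the allocation helpers here are ordinary kernel API used purely for illustration):

	#include <linux/errno.h>
	#include <linux/slab.h>

	static int alloc_buffers(void **slots, unsigned int nr)
	{
		unsigned int i;

		for (i = 0; i < nr; i++) {
			slots[i] = kzalloc(64, GFP_KERNEL);
			if (!slots[i])
				goto error;	/* entry i was not set up */
		}
		return 0;

	error:
		while (i--)			/* undo entries 0..i-1 only */
			kfree(slots[i]);
		return -ENOMEM;
	}
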
Fixes: b5dc8e6c21e7 ("x86/irq: Use hierarchical irqdomain to manage CPU interrupt vectors") Reported-by: Keith Busch Signed-off-by: Thomas Gleixner Tested-by: Keith Busch Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801161217300.1823@nanos Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/apic/vector.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 0988e204f1e3..a41e523536a2 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -359,14 +359,17 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, irq_data->chip_data = data; irq_data->hwirq = virq + i; err = assign_irq_vector_policy(virq + i, node, data, info); - if (err) + if (err) { + irq_data->chip_data = NULL; + free_apic_chip_data(data); goto error; + } } return 0; error: - x86_vector_free_irqs(domain, virq, i + 1); + x86_vector_free_irqs(domain, virq, i); return err; } -- cgit v1.2.3 From 2d5523bf47b4326ff1279613bfe69982bc4675c1 Mon Sep 17 00:00:00 2001 From: Thomas Petazzoni Date: Thu, 4 Jan 2018 17:53:12 +0100 Subject: ARM: dts: kirkwood: fix pin-muxing of MPP7 on OpenBlocks A7 commit 56aeb07c914a616ab84357d34f8414a69b140cdf upstream. MPP7 is currently muxed as "gpio", but this function doesn't exist for MPP7, only "gpo" is available. This causes the following error: kirkwood-pinctrl f1010000.pin-controller: unsupported function gpio on pin mpp7 pinctrl core: failed to register map default (6): invalid type given kirkwood-pinctrl f1010000.pin-controller: error claiming hogs: -22 kirkwood-pinctrl f1010000.pin-controller: could not claim hogs: -22 kirkwood-pinctrl f1010000.pin-controller: unable to register pinctrl driver kirkwood-pinctrl: probe of f1010000.pin-controller failed with error -22 So the pinctrl driver is not probed, all device drivers (including the UART driver) do a -EPROBE_DEFER, and therefore the system doesn't really boot (well, it boots, but with no UART, and no devices that require pin-muxing). Back when the Device Tree file for this board was introduced, the definition was already wrong. The pinctrl driver also always described as "gpo" this function for MPP7. However, between Linux 4.10 and 4.11, a hog pin failing to be muxed was turned from a simple warning to a hard error that caused the entire pinctrl driver probe to bail out. This is probably the result of commit 6118714275f0a ("pinctrl: core: Fix pinctrl_register_and_init() with pinctrl_enable()"). This commit fixes the Device Tree to use the proper "gpo" function for MPP7, which fixes the boot of OpenBlocks A7, which was broken since Linux 4.11. 
Fixes: f24b56cbcd9d ("ARM: kirkwood: add support for OpenBlocks A7 platform") Signed-off-by: Thomas Petazzoni Reviewed-by: Andrew Lunn Signed-off-by: Gregory CLEMENT Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/kirkwood-openblocks_a7.dts | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/kirkwood-openblocks_a7.dts b/arch/arm/boot/dts/kirkwood-openblocks_a7.dts index d5e3bc518968..d57f48543f76 100644 --- a/arch/arm/boot/dts/kirkwood-openblocks_a7.dts +++ b/arch/arm/boot/dts/kirkwood-openblocks_a7.dts @@ -53,7 +53,8 @@ }; pinctrl: pin-controller@10000 { - pinctrl-0 = <&pmx_dip_switches &pmx_gpio_header>; + pinctrl-0 = <&pmx_dip_switches &pmx_gpio_header + &pmx_gpio_header_gpo>; pinctrl-names = "default"; pmx_uart0: pmx-uart0 { @@ -85,11 +86,16 @@ * ground. */ pmx_gpio_header: pmx-gpio-header { - marvell,pins = "mpp17", "mpp7", "mpp29", "mpp28", + marvell,pins = "mpp17", "mpp29", "mpp28", "mpp35", "mpp34", "mpp40"; marvell,function = "gpio"; }; + pmx_gpio_header_gpo: pxm-gpio-header-gpo { + marvell,pins = "mpp7"; + marvell,function = "gpo"; + }; + pmx_gpio_init: pmx-init { marvell,pins = "mpp38"; marvell,function = "gpio"; -- cgit v1.2.3 From 5ecd5c8388f060a02a6b97d9b99d9ea885903568 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 16 Jan 2018 10:23:47 +0000 Subject: arm64: KVM: Fix SMCCC handling of unimplemented SMC/HVC calls commit acfb3b883f6d6a4b5d27ad7fdded11f6a09ae6dd upstream. KVM doesn't follow the SMCCC when it comes to unimplemented calls, and inject an UNDEF instead of returning an error. Since firmware calls are now used for security mitigation, they are becoming more common, and the undef is counter productive. Instead, let's follow the SMCCC which states that -1 must be returned to the caller when getting an unknown function number. Signed-off-by: Marc Zyngier Signed-off-by: Christoffer Dall Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kvm/handle_exit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index ba93a09eb536..5295aef7c8f0 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -42,7 +42,7 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) ret = kvm_psci_call(vcpu); if (ret < 0) { - kvm_inject_undefined(vcpu); + vcpu_set_reg(vcpu, 0, ~0UL); return 1; } @@ -51,7 +51,7 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) { - kvm_inject_undefined(vcpu); + vcpu_set_reg(vcpu, 0, ~0UL); return 1; } -- cgit v1.2.3 From 6b1c99e275c034e4650044a7bb1a0bc274e1eb45 Mon Sep 17 00:00:00 2001 From: Tom Lendacky Date: Tue, 26 Dec 2017 23:43:54 -0600 Subject: x86/cpu, x86/pti: Do not enable PTI on AMD processors commit 694d99d40972f12e59a3696effee8a376b79d7c8 upstream. AMD processors are not subject to the types of attacks that the kernel page table isolation feature protects against. The AMD microarchitecture does not allow memory references, including speculative references, that access higher privileged data when running in a lesser privileged mode when that access would result in a page fault. Disable page table isolation by default on AMD processors by not setting the X86_BUG_CPU_INSECURE feature, which controls whether X86_FEATURE_PTI is set. 
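One practical consequence, together with the sysfs attributes added earlier in this series, is that the result is user-visible: on AMD parts the meltdown file reports "Not affected" while the spectre files report their own status. A small userspace sketch (assuming the standard sysfs location for these files):

	#include <stdio.h>

	int main(void)
	{
		const char *files[] = { "meltdown", "spectre_v1", "spectre_v2" };
		char path[128], line[128];

		for (int i = 0; i < 3; i++) {
			snprintf(path, sizeof(path),
				 "/sys/devices/system/cpu/vulnerabilities/%s",
				 files[i]);
			FILE *f = fopen(path, "r");
			if (f && fgets(line, sizeof(line), f))
				printf("%s: %s", files[i], line);
			if (f)
				fclose(f);
		}
		return 0;
	}
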
Signed-off-by: Tom Lendacky Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: Dave Hansen Cc: Andy Lutomirski Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20171227054354.20369.94587.stgit@tlendack-t1.amdoffice.net Cc: Nick Lowe Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0531c1707b40..f7f2ad3687ee 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -831,8 +831,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) setup_force_cpu_cap(X86_FEATURE_ALWAYS); - /* Assume for now that ALL x86 CPUs are insecure */ - setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + if (c->x86_vendor != X86_VENDOR_AMD) + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); setup_force_cpu_bug(X86_BUG_SPECTRE_V1); setup_force_cpu_bug(X86_BUG_SPECTRE_V2); -- cgit v1.2.3 From f59e7ce17ba327245c8feb312d447b09d3b98eba Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 18 Jan 2018 16:28:26 +0100 Subject: x86/mce: Make machine check speculation protected commit 6f41c34d69eb005e7848716bbcafc979b35037d5 upstream. The machine check idtentry uses an indirect branch directly from the low level code. This evades the speculation protection. Replace it by a direct call into C code and issue the indirect call there so the compiler can apply the proper speculation protection. Signed-off-by: Thomas Gleixner Reviewed-by:Borislav Petkov Reviewed-by: David Woodhouse Niced-by: Peter Zijlstra Link: https://lkml.kernel.org/r/alpine.DEB.2.20.1801181626290.1847@nanos Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 2 +- arch/x86/include/asm/traps.h | 1 + arch/x86/kernel/cpu/mcheck/mce.c | 5 +++++ 3 files changed, 7 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 81b1cd533965..a03b22c615d9 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1031,7 +1031,7 @@ idtentry async_page_fault do_async_page_fault has_error_code=1 #endif #ifdef CONFIG_X86_MCE -idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) +idtentry machine_check do_mce has_error_code=0 paranoid=1 #endif /* diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index c3496619740a..156959ca49ce 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -92,6 +92,7 @@ dotraplinkage void do_simd_coprocessor_error(struct pt_regs *, long); #ifdef CONFIG_X86_32 dotraplinkage void do_iret_error(struct pt_regs *, long); #endif +dotraplinkage void do_mce(struct pt_regs *, long); static inline int get_si_code(unsigned long condition) { diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 7e8a736d09db..364fbad72e60 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1672,6 +1672,11 @@ static void unexpected_machine_check(struct pt_regs *regs, long error_code) void (*machine_check_vector)(struct pt_regs *, long error_code) = unexpected_machine_check; +dotraplinkage void do_mce(struct pt_regs *regs, long error_code) +{ + machine_check_vector(regs, error_code); +} + /* * Called for each booted CPU to set up machine checks. 
* Must be called with preempt off: -- cgit v1.2.3 From 799dc737680a8074a0c7c2d3426b85f4c439377f Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 19 Jan 2018 01:14:21 +0900 Subject: retpoline: Introduce start/end markers of indirect thunk commit 736e80a4213e9bbce40a7c050337047128b472ac upstream. Introduce start/end markers of __x86_indirect_thunk_* functions. To make it easy, consolidate .text.__x86.indirect_thunk.* sections to one .text.__x86.indirect_thunk section and put it in the end of kernel text section and adds __indirect_thunk_start/end so that other subsystem (e.g. kprobes) can identify it. Signed-off-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Andi Kleen Cc: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: Arjan van de Ven Cc: Greg Kroah-Hartman Link: https://lkml.kernel.org/r/151629206178.10241.6828804696410044771.stgit@devbox Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 3 +++ arch/x86/kernel/vmlinux.lds.S | 7 +++++++ arch/x86/lib/retpoline.S | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index e28a9ff1246c..4f7a5d3fed91 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -171,6 +171,9 @@ enum spectre_v2_mitigation { SPECTRE_V2_IBRS, }; +extern char __indirect_thunk_start[]; +extern char __indirect_thunk_end[]; + /* * On VMEXIT we must ensure that no RSB predictions learned in the guest * can be followed in the host, by overwriting the RSB completely. Both diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 74e4bf11f562..e065065a4dfb 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -104,6 +104,13 @@ SECTIONS IRQENTRY_TEXT *(.fixup) *(.gnu.warning) + +#ifdef CONFIG_RETPOLINE + __indirect_thunk_start = .; + *(.text.__x86.indirect_thunk) + __indirect_thunk_end = .; +#endif + /* End of text section */ _etext = .; } :text = 0x9090 diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 019a03599bb0..8a8093e9cf01 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -9,7 +9,7 @@ #include .macro THUNK reg - .section .text.__x86.indirect_thunk.\reg + .section .text.__x86.indirect_thunk ENTRY(__x86_indirect_thunk_\reg) CFI_STARTPROC -- cgit v1.2.3 From 9b8bd0d3586842716f5673e7a0922a9c9e1fcbfd Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 19 Jan 2018 01:14:51 +0900 Subject: kprobes/x86: Blacklist indirect thunk functions for kprobes commit c1804a236894ecc942da7dc6c5abe209e56cba93 upstream. Mark __x86_indirect_thunk_* functions as blacklist for kprobes because those functions can be called from anywhere in the kernel including blacklist functions of kprobes. Signed-off-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Andi Kleen Cc: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: Arjan van de Ven Cc: Greg Kroah-Hartman Link: https://lkml.kernel.org/r/151629209111.10241.5444852823378068683.stgit@devbox Signed-off-by: Greg Kroah-Hartman --- arch/x86/lib/retpoline.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 8a8093e9cf01..e611a124c442 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -25,7 +25,8 @@ ENDPROC(__x86_indirect_thunk_\reg) * than one per register with the correct names. 
So we do it * the simple and nasty way... */ -#define EXPORT_THUNK(reg) EXPORT_SYMBOL(__x86_indirect_thunk_ ## reg) +#define __EXPORT_THUNK(sym) _ASM_NOKPROBE(sym); EXPORT_SYMBOL(sym) +#define EXPORT_THUNK(reg) __EXPORT_THUNK(__x86_indirect_thunk_ ## reg) #define GENERATE_THUNK(reg) THUNK reg ; EXPORT_THUNK(reg) GENERATE_THUNK(_ASM_AX) -- cgit v1.2.3 From 6cb73eb8045157ea280f0a047777ea1f56547375 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Fri, 19 Jan 2018 01:15:20 +0900 Subject: kprobes/x86: Disable optimizing on the function jumps to indirect thunk commit c86a32c09f8ced67971a2310e3b0dda4d1749007 upstream. Since indirect jump instructions will be replaced by jump to __x86_indirect_thunk_*, those jmp instruction must be treated as an indirect jump. Since optprobe prohibits to optimize probes in the function which uses an indirect jump, it also needs to find out the function which jump to __x86_indirect_thunk_* and disable optimization. Add a check that the jump target address is between the __indirect_thunk_start/end when optimizing kprobe. Signed-off-by: Masami Hiramatsu Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Andi Kleen Cc: Peter Zijlstra Cc: Ananth N Mavinakayanahalli Cc: Arjan van de Ven Cc: Greg Kroah-Hartman Link: https://lkml.kernel.org/r/151629212062.10241.6991266100233002273.stgit@devbox Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/kprobes/opt.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index c9d488f3e4cd..ea8e2b846101 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -36,6 +36,7 @@ #include #include #include +#include #include "common.h" @@ -191,7 +192,7 @@ static int copy_optimized_instructions(u8 *dest, u8 *src) } /* Check whether insn is indirect jump */ -static int insn_is_indirect_jump(struct insn *insn) +static int __insn_is_indirect_jump(struct insn *insn) { return ((insn->opcode.bytes[0] == 0xff && (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */ @@ -225,6 +226,26 @@ static int insn_jump_into_range(struct insn *insn, unsigned long start, int len) return (start <= target && target <= start + len); } +static int insn_is_indirect_jump(struct insn *insn) +{ + int ret = __insn_is_indirect_jump(insn); + +#ifdef CONFIG_RETPOLINE + /* + * Jump to x86_indirect_thunk_* is treated as an indirect jump. + * Note that even with CONFIG_RETPOLINE=y, the kernel compiled with + * older gcc may use indirect jump. So we add this check instead of + * replace indirect-jump check. + */ + if (!ret) + ret = insn_jump_into_range(insn, + (unsigned long)__indirect_thunk_start, + (unsigned long)__indirect_thunk_end - + (unsigned long)__indirect_thunk_start); +#endif + return ret; +} + /* Decode whole function to ensure any instructions don't jump into target */ static int can_optimize(unsigned long paddr) { -- cgit v1.2.3 From 11e619414b69b7f1e47baac72c5be589d86e5393 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 17 Jan 2018 14:53:28 -0800 Subject: x86/retpoline: Optimize inline assembler for vmexit_fill_RSB commit 3f7d875566d8e79c5e0b2c9a413e91b2c29e0854 upstream. The generated assembler for the C fill RSB inline asm operations has several issues: - The C code sets up the loop register, which is then immediately overwritten in __FILL_RETURN_BUFFER with the same value again. - The C code also passes in the iteration count in another register, which is not used at all. 
Remove these two unnecessary operations. Just rely on the single constant passed to the macro for the iterations. Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: dave.hansen@intel.com Cc: gregkh@linuxfoundation.org Cc: torvalds@linux-foundation.org Cc: arjan@linux.intel.com Link: https://lkml.kernel.org/r/20180117225328.15414-1-andi@firstfloor.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 4f7a5d3fed91..492370b9b35b 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -183,15 +183,16 @@ extern char __indirect_thunk_end[]; static inline void vmexit_fill_RSB(void) { #ifdef CONFIG_RETPOLINE - unsigned long loops = RSB_CLEAR_LOOPS / 2; + unsigned long loops; asm volatile (ALTERNATIVE("jmp 910f", __stringify(__FILL_RETURN_BUFFER(%0, RSB_CLEAR_LOOPS, %1)), X86_FEATURE_RETPOLINE) "910:" - : "=&r" (loops), ASM_CALL_CONSTRAINT - : "r" (loops) : "memory" ); + : "=r" (loops), ASM_CALL_CONSTRAINT + : : "memory" ); #endif } + #endif /* __ASSEMBLY__ */ #endif /* __NOSPEC_BRANCH_H__ */ -- cgit v1.2.3 From 38bc402237f8cb68c5f3ab0292c6d5537af38847 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Sun, 29 Oct 2017 16:27:21 +0100 Subject: MIPS: AR7: ensure the port type's FCR value is used commit 0a5191efe06b5103909206e4fbcff81d30283f8e upstream. Since commit aef9a7bd9b67 ("serial/uart/8250: Add tunable RX interrupt trigger I/F of FIFO buffers"), the port's default FCR value isn't used in serial8250_do_set_termios anymore, but copied over once in serial8250_config_port and then modified as needed. Unfortunately, serial8250_config_port will never be called if the port is shared between kernel and userspace, and the port's flag doesn't have UPF_BOOT_AUTOCONF, which would trigger a serial8250_config_port as well. 
This causes garbled output from userspace: [ 5.220000] random: procd urandom read with 49 bits of entropy available ers [kee Fix this by forcing it to be configured on boot, resulting in the expected output: [ 5.250000] random: procd urandom read with 50 bits of entropy available Press the [f] key and hit [enter] to enter failsafe mode Press the [1], [2], [3] or [4] key and hit [enter] to select the debug level Fixes: aef9a7bd9b67 ("serial/uart/8250: Add tunable RX interrupt trigger I/F of FIFO buffers") Signed-off-by: Jonas Gorski Cc: Greg Kroah-Hartman Cc: Yoshihiro YUNOMAE Cc: Florian Fainelli Cc: Nicolas Schichan Cc: linux-mips@linux-mips.org Cc: linux-serial@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/17544/ Signed-off-by: Ralf Baechle Cc: James Hogan Signed-off-by: Greg Kroah-Hartman --- arch/mips/ar7/platform.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mips/ar7/platform.c b/arch/mips/ar7/platform.c index 3446b6fb3acb..9da4e2292fc7 100644 --- a/arch/mips/ar7/platform.c +++ b/arch/mips/ar7/platform.c @@ -576,7 +576,7 @@ static int __init ar7_register_uarts(void) uart_port.type = PORT_AR7; uart_port.uartclk = clk_get_rate(bus_clk) / 2; uart_port.iotype = UPIO_MEM32; - uart_port.flags = UPF_FIXED_TYPE; + uart_port.flags = UPF_FIXED_TYPE | UPF_BOOT_AUTOCONF; uart_port.regshift = 2; uart_port.line = 0; -- cgit v1.2.3 From 8bb0c6a1b3b2fe92707ee6f6901bfdfc392f8cd1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 9 Dec 2016 10:24:05 -0800 Subject: x86/asm/32: Make sync_core() handle missing CPUID on all 32-bit kernels commit 1c52d859cb2d417e7216d3e56bb7fea88444cec9 upstream. We support various non-Intel CPUs that don't have the CPUID instruction, so the M486 test was wrong. For now, fix it with a big hammer: handle missing CPUID on all 32-bit CPUs. Reported-by: One Thousand Gnomes Signed-off-by: Andy Lutomirski Cc: Juergen Gross Cc: Peter Zijlstra Cc: Brian Gerst Cc: Matthew Whitehead Cc: Borislav Petkov Cc: Henrique de Moraes Holschuh Cc: Andrew Cooper Cc: Boris Ostrovsky Cc: xen-devel Link: http://lkml.kernel.org/r/685bd083a7c036f7769510b6846315b17d6ba71f.1481307769.git.luto@kernel.org Signed-off-by: Thomas Gleixner Cc: "Zhang, Ning A" Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/processor.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c124d6ab4bf9..86bccb4bd4dc 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -574,7 +574,7 @@ static inline void sync_core(void) { int tmp; -#ifdef CONFIG_M486 +#ifdef CONFIG_X86_32 /* * Do a CPUID if available, otherwise do a jump. The jump * can conveniently enough be the jump around CPUID. -- cgit v1.2.3 From 20e3fa5dd5818b425b18528571e133034833df27 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Wed, 24 Jan 2018 02:31:19 +0000 Subject: x86/microcode/intel: Fix BDW late-loading revision check The backport of commit b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a revision check") to 4.4-stable deleted a "return true" statement. This bug is not present upstream or other stable branches. 
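The effect of the dropped statement is easiest to see in a reduced sketch (illustrative only; the helper below is a simplified stand-in for the real is_blacklisted(), with the CPU fields passed as plain arguments):

/* Sketch: without the restored return, the check only warns and the
 * late load still proceeds on the affected Broadwell parts. */
static int bdw_late_load_blocked(unsigned int family, unsigned int model,
                                 unsigned int stepping, unsigned int rev)
{
        if (family == 6 && model == 79 && stepping == 0x01 &&
            rev < 0x0b000021) {
                /* the pr_err_once() warnings were printed either way */
                return 1;       /* the statement the 4.4 backport lost */
        }
        return 0;
}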
Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/microcode/intel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index b428a8174be1..ee011bd7934d 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -1005,6 +1005,7 @@ static bool is_blacklisted(unsigned int cpu) c->microcode < 0x0b000021) { pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); + return true; } return false; -- cgit v1.2.3 From 0dfd5fbcae5d40e5e58bc7c34305498e965bf1ef Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 2 Jun 2016 17:19:27 -0700 Subject: x86/cpu/intel: Introduce macros for Intel family numbers commit 970442c599b22ccd644ebfe94d1d303bf6f87c05 upstream. Problem: We have a boatload of open-coded family-6 model numbers. Half of them have these model numbers in hex and the other half in decimal. This makes grepping for them tons of fun, if you were to try. Solution: Consolidate all the magic numbers. Put all the definitions in one header. The names here are closely derived from the comments describing the models from arch/x86/events/intel/core.c. We could easily make them shorter by doing things like s/SANDYBRIDGE/SNB/, but they seemed fine even with the longer versions to me. Do not take any of these names too literally, like "DESKTOP" or "MOBILE". These are all colloquial names and not precise descriptions of everywhere a given model will show up. Signed-off-by: Dave Hansen Cc: Adrian Hunter Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Darren Hart Cc: Dave Hansen Cc: Denys Vlasenko Cc: Doug Thompson Cc: Eduardo Valentin Cc: H. Peter Anvin Cc: Jacob Pan Cc: Kan Liang Cc: Len Brown Cc: Linus Torvalds Cc: Mauro Carvalho Chehab Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Rajneesh Bhardwaj Cc: Souvik Kumar Chakravarty Cc: Srinivas Pandruvada Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Tony Luck Cc: Ulf Hansson Cc: Viresh Kumar Cc: Vishwanath Somayaji Cc: Zhang Rui Cc: jacob.jun.pan@intel.com Cc: linux-acpi@vger.kernel.org Cc: linux-edac@vger.kernel.org Cc: linux-mmc@vger.kernel.org Cc: linux-pm@vger.kernel.org Cc: platform-driver-x86@vger.kernel.org Link: http://lkml.kernel.org/r/20160603001927.F2A7D828@viggo.jf.intel.com Signed-off-by: Ingo Molnar Cc: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/intel-family.h | 68 +++++++++++++++++++++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 arch/x86/include/asm/intel-family.h (limited to 'arch') diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h new file mode 100644 index 000000000000..6999f7d01a0d --- /dev/null +++ b/arch/x86/include/asm/intel-family.h @@ -0,0 +1,68 @@ +#ifndef _ASM_X86_INTEL_FAMILY_H +#define _ASM_X86_INTEL_FAMILY_H + +/* + * "Big Core" Processors (Branded as Core, Xeon, etc...) + * + * The "_X" parts are generally the EP and EX Xeons, or the + * "Extreme" ones, like Broadwell-E. + * + * Things ending in "2" are usually because we have no better + * name for them. There's no processor called "WESTMERE2". 
+ */ + +#define INTEL_FAM6_CORE_YONAH 0x0E +#define INTEL_FAM6_CORE2_MEROM 0x0F +#define INTEL_FAM6_CORE2_MEROM_L 0x16 +#define INTEL_FAM6_CORE2_PENRYN 0x17 +#define INTEL_FAM6_CORE2_DUNNINGTON 0x1D + +#define INTEL_FAM6_NEHALEM 0x1E +#define INTEL_FAM6_NEHALEM_EP 0x1A +#define INTEL_FAM6_NEHALEM_EX 0x2E +#define INTEL_FAM6_WESTMERE 0x25 +#define INTEL_FAM6_WESTMERE2 0x1F +#define INTEL_FAM6_WESTMERE_EP 0x2C +#define INTEL_FAM6_WESTMERE_EX 0x2F + +#define INTEL_FAM6_SANDYBRIDGE 0x2A +#define INTEL_FAM6_SANDYBRIDGE_X 0x2D +#define INTEL_FAM6_IVYBRIDGE 0x3A +#define INTEL_FAM6_IVYBRIDGE_X 0x3E + +#define INTEL_FAM6_HASWELL_CORE 0x3C +#define INTEL_FAM6_HASWELL_X 0x3F +#define INTEL_FAM6_HASWELL_ULT 0x45 +#define INTEL_FAM6_HASWELL_GT3E 0x46 + +#define INTEL_FAM6_BROADWELL_CORE 0x3D +#define INTEL_FAM6_BROADWELL_XEON_D 0x56 +#define INTEL_FAM6_BROADWELL_GT3E 0x47 +#define INTEL_FAM6_BROADWELL_X 0x4F + +#define INTEL_FAM6_SKYLAKE_MOBILE 0x4E +#define INTEL_FAM6_SKYLAKE_DESKTOP 0x5E +#define INTEL_FAM6_SKYLAKE_X 0x55 +#define INTEL_FAM6_KABYLAKE_MOBILE 0x8E +#define INTEL_FAM6_KABYLAKE_DESKTOP 0x9E + +/* "Small Core" Processors (Atom) */ + +#define INTEL_FAM6_ATOM_PINEVIEW 0x1C +#define INTEL_FAM6_ATOM_LINCROFT 0x26 +#define INTEL_FAM6_ATOM_PENWELL 0x27 +#define INTEL_FAM6_ATOM_CLOVERVIEW 0x35 +#define INTEL_FAM6_ATOM_CEDARVIEW 0x36 +#define INTEL_FAM6_ATOM_SILVERMONT1 0x37 /* BayTrail/BYT / Valleyview */ +#define INTEL_FAM6_ATOM_SILVERMONT2 0x4D /* Avaton/Rangely */ +#define INTEL_FAM6_ATOM_AIRMONT 0x4C /* CherryTrail / Braswell */ +#define INTEL_FAM6_ATOM_MERRIFIELD1 0x4A /* Tangier */ +#define INTEL_FAM6_ATOM_MERRIFIELD2 0x5A /* Annidale */ +#define INTEL_FAM6_ATOM_GOLDMONT 0x5C +#define INTEL_FAM6_ATOM_DENVERTON 0x5F /* Goldmont Microserver */ + +/* Xeon Phi */ + +#define INTEL_FAM6_XEON_PHI_KNL 0x57 /* Knights Landing */ + +#endif /* _ASM_X86_INTEL_FAMILY_H */ -- cgit v1.2.3 From 18bb117d1b7690181346e6365c6237b6ceaac4c4 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Fri, 12 Jan 2018 17:49:25 +0000 Subject: x86/retpoline: Fill RSB on context switch for affected CPUs commit c995efd5a740d9cbafbf58bde4973e8b50b4d761 upstream. On context switch from a shallow call stack to a deeper one, as the CPU does 'ret' up the deeper side it may encounter RSB entries (predictions for where the 'ret' goes to) which were populated in userspace. This is problematic if neither SMEP nor KPTI (the latter of which marks userspace pages as NX for the kernel) are active, as malicious code in userspace may then be executed speculatively. Overwrite the CPU's return prediction stack with calls which are predicted to return to an infinite loop, to "capture" speculation if this happens. This is required both for retpoline, and also in conjunction with IBRS for !SMEP && !KPTI. On Skylake+ the problem is slightly different, and an *underflow* of the RSB may cause errant branch predictions to occur. So there it's not so much overwrite, as *filling* the RSB to attempt to prevent it getting empty. This is only a partial solution for Skylake+ since there are many other conditions which may result in the RSB becoming empty. The full solution on Skylake+ is to use IBRS, which will prevent the problem even when the RSB becomes empty. With IBRS, the RSB-stuffing will not be required on context switch. [ tglx: Added missing vendor check and slighty massaged comments and changelog ] [js] backport to 4.4 -- __switch_to_asm does not exist there, we have to patch the switch_to macros for both x86_32 and x86_64. 
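For readers without the header in front of them, the stuffing technique can be sketched roughly as follows (a simplified illustration, not the kernel's __FILL_RETURN_BUFFER macro; the loop count and label layout are assumptions):

/*
 * Hedged sketch: push 32 return-prediction entries whose architectural
 * return sites are never reached, then rebalance the stack pointer.
 * Any speculative 'ret' that consumes one of these entries lands in a
 * harmless pause/lfence spin instead of an attacker-chosen address.
 */
static inline void rsb_stuff_sketch(void)
{
        int i;

        for (i = 0; i < 16; i++)
                asm volatile("call 1f\n\t"
                             "2: pause; lfence; jmp 2b\n\t" /* speculation trap */
                             "1: call 3f\n\t"
                             "4: pause; lfence; jmp 4b\n\t" /* speculation trap */
                             "3: add $16, %%rsp\n\t"        /* drop both return addresses */
                             ::: "memory");
}

Architecturally this is a no-op, since the two pushed return addresses are dropped again, but the RSB is left full of entries pointing at the trap loops.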
Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Acked-by: Arjan van de Ven Cc: gnomes@lxorguk.ukuu.org.uk Cc: Rik van Riel Cc: Andi Kleen Cc: Josh Poimboeuf Cc: thomas.lendacky@amd.com Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Kees Cook Cc: Tim Chen Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1515779365-9032-1-git-send-email-dwmw@amazon.co.uk Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeature.h | 1 + arch/x86/include/asm/switch_to.h | 38 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/bugs.c | 36 ++++++++++++++++++++++++++++++++++++ 3 files changed, 75 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 0fbc98568018..641f0f2c2982 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -199,6 +199,7 @@ #define X86_FEATURE_HWP_EPP ( 7*32+13) /* Intel HWP_EPP */ #define X86_FEATURE_HWP_PKG_REQ ( 7*32+14) /* Intel HWP_PKG_REQ */ #define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ #define X86_FEATURE_RETPOLINE ( 7*32+29) /* Generic Retpoline mitigation for Spectre variant 2 */ #define X86_FEATURE_RETPOLINE_AMD ( 7*32+30) /* AMD Retpoline mitigation for Spectre variant 2 */ diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 751bf4b7bf11..025ecfaba9c9 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_SWITCH_TO_H #define _ASM_X86_SWITCH_TO_H +#include + struct task_struct; /* one of the stranger aspects of C forward declarations */ __visible struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); @@ -24,6 +26,23 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, #define __switch_canary_iparam #endif /* CC_STACKPROTECTOR */ +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ +#define __retpoline_fill_return_buffer \ + ALTERNATIVE("jmp 910f", \ + __stringify(__FILL_RETURN_BUFFER(%%ebx, RSB_CLEAR_LOOPS, %%esp)),\ + X86_FEATURE_RSB_CTXSW) \ + "910:\n\t" +#else +#define __retpoline_fill_return_buffer +#endif + /* * Saving eflags is important. It switches not only IOPL between tasks, * it also protects other tasks from NT leaking through sysenter etc. @@ -46,6 +65,7 @@ do { \ "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ "pushl %[next_ip]\n\t" /* restore EIP */ \ __switch_canary \ + __retpoline_fill_return_buffer \ "jmp __switch_to\n" /* regparm call */ \ "1:\t" \ "popl %%ebp\n\t" /* restore EBP */ \ @@ -100,6 +120,23 @@ do { \ #define __switch_canary_iparam #endif /* CC_STACKPROTECTOR */ +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. 
+ */ +#define __retpoline_fill_return_buffer \ + ALTERNATIVE("jmp 910f", \ + __stringify(__FILL_RETURN_BUFFER(%%r12, RSB_CLEAR_LOOPS, %%rsp)),\ + X86_FEATURE_RSB_CTXSW) \ + "910:\n\t" +#else +#define __retpoline_fill_return_buffer +#endif + /* * There is no need to save or restore flags, because flags are always * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL @@ -112,6 +149,7 @@ do { \ "call __switch_to\n\t" \ "movq "__percpu_arg([current_task])",%%rsi\n\t" \ __switch_canary \ + __retpoline_fill_return_buffer \ "movq %P[thread_info](%%rsi),%%r8\n\t" \ "movq %%rax,%%rdi\n\t" \ "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 49d25ddf0e9f..8cacf62ec458 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -22,6 +22,7 @@ #include #include #include +#include static void __init spectre_v2_select_mitigation(void); @@ -154,6 +155,23 @@ disable: return SPECTRE_V2_CMD_NONE; } +/* Check for Skylake-like CPUs (for RSB handling) */ +static bool __init is_skylake_era(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6) { + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_SKYLAKE_MOBILE: + case INTEL_FAM6_SKYLAKE_DESKTOP: + case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_KABYLAKE_MOBILE: + case INTEL_FAM6_KABYLAKE_DESKTOP: + return true; + } + } + return false; +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -212,6 +230,24 @@ retpoline_auto: spectre_v2_enabled = mode; pr_info("%s\n", spectre_v2_strings[mode]); + + /* + * If neither SMEP or KPTI are available, there is a risk of + * hitting userspace addresses in the RSB after a context switch + * from a shallow call stack to a deeper one. To prevent this fill + * the entire RSB, even when using IBRS. + * + * Skylake era CPUs have a separate issue with *underflow* of the + * RSB, when they will predict 'ret' targets from the generic BTB. + * The proper mitigation for this is IBRS. If IBRS is not supported + * or deactivated in favour of retpolines the RSB fill on context + * switch is required. + */ + if ((!boot_cpu_has(X86_FEATURE_KAISER) && + !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) { + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Filling RSB on context switch\n"); + } } #undef pr_fmt -- cgit v1.2.3 From 29b73cace1b1cc2daf2556b3285d4e79f19bcb3b Mon Sep 17 00:00:00 2001 From: Janakarajan Natarajan Date: Tue, 25 Apr 2017 16:44:03 -0500 Subject: Prevent timer value 0 for MWAITX commit 88d879d29f9cc0de2d930b584285638cdada6625 upstream. Newer hardware has uncovered a bug in the software implementation of using MWAITX for the delay function. A value of 0 for the timer is meant to indicate that a timeout will not be used to exit MWAITX. On newer hardware this can result in MWAITX never returning, resulting in NMI soft lockup messages being printed. On older hardware, some of the other conditions under which MWAITX can exit masked this issue. The AMD APM does not currently document this and will be updated. Please refer to http://marc.info/?l=kvm&m=148950623231140 for information regarding NMI soft lockup messages on an AMD Ryzen 1800X. This has been root-caused as a 0 passed to MWAITX causing it to wait indefinitely. This change has the added benefit of avoiding the unnecessary setup of MONITORX/MWAITX when the delay value is zero. 
Signed-off-by: Janakarajan Natarajan Link: http://lkml.kernel.org/r/1493156643-29366-1-git-send-email-Janakarajan.Natarajan@amd.com Signed-off-by: Thomas Gleixner Signed-off-by: Davidlohr Bueso Signed-off-by: Greg Kroah-Hartman --- arch/x86/lib/delay.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index e912b2f6d36e..45772560aceb 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -93,6 +93,13 @@ static void delay_mwaitx(unsigned long __loops) { u64 start, end, delay, loops = __loops; + /* + * Timer value of 0 causes MWAITX to wait indefinitely, unless there + * is a store on the memory monitored by MONITORX. + */ + if (loops == 0) + return; + start = rdtsc_ordered(); for (;;) { -- cgit v1.2.3 From f31a8450c09327339939fe195707749c1d67016d Mon Sep 17 00:00:00 2001 From: Sudeep Holla Date: Fri, 28 Oct 2016 09:45:28 +0100 Subject: drivers: base: cacheinfo: fix x86 with CONFIG_OF enabled commit fac51482577d5e05bbb0efa8d602a3c2111098bf upstream. With CONFIG_OF enabled on x86, we get the following error on boot: " Failed to find cpu0 device node Unable to detect cache hierarchy from DT for CPU 0 " and the cacheinfo fails to get populated in the corresponding sysfs entries. This is because cache_setup_of_node looks for of_node for setting up the shared cpu_map without checking that it's already populated in the architecture specific callback. In order to indicate that the shared cpu_map is already populated, this patch introduces a boolean `cpu_map_populated` in struct cpu_cacheinfo that can be used by the generic code to skip cache_shared_cpu_map_setup. This patch also sets that boolean for x86. Cc: Greg Kroah-Hartman Signed-off-by: Sudeep Holla Signed-off-by: Mian Yousaf Kaukab Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index e38d338a6447..b4ca91cf55b0 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -934,6 +934,8 @@ static int __populate_cache_leaves(unsigned int cpu) ci_leaf_init(this_leaf++, &id4_regs); __cache_cpumap_setup(cpu, idx, &id4_regs); } + this_cpu_ci->cpu_map_populated = true; + return 0; } -- cgit v1.2.3 From 699a6cc7dd321113c99e06c6c997e8d15557439c Mon Sep 17 00:00:00 2001 From: Rui Wang Date: Wed, 8 Jun 2016 14:59:52 +0800 Subject: x86/ioapic: Fix incorrect pointers in ioapic_setup_resources() commit 9d98bcec731756b8688b59ec998707924d716d7b upstream. On a 4-socket Brickland system, hot-removing one ioapic is fine. Hot-removing the 2nd one causes panic in mp_unregister_ioapic() while calling release_resource(). It is because the iomem_res pointer has already been released when removing the first ioapic. To explain the use of &res[num] here: res is assigned to ioapic_resources, and later in ioapic_insert_resources() we do: struct resource *r = ioapic_resources; for_each_ioapic(i) { insert_resource(&iomem_resource, r); r++; } Here 'r' is treated as an arry of 'struct resource', and the r++ ensures that each element of the array is inserted separately. Thus we should call release_resouce() on each element at &res[num]. Fix it by assigning the correct pointers to ioapics[i].iomem_res in ioapic_setup_resources(). 
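The bug class is easy to demonstrate in isolation with plain user-space C (names invented for the example; nothing here is kernel code):

#include <stdio.h>

struct resource { const char *name; };
struct ioapic   { struct resource *iomem_res; };

int main(void)
{
        static struct resource res[2] = { { "IOAPIC 0" }, { "IOAPIC 1" } };
        struct ioapic ioapics[2];
        int i;

        for (i = 0; i < 2; i++) {
                /* buggy:  ioapics[i].iomem_res = res;      always &res[0] */
                ioapics[i].iomem_res = &res[i];             /* fixed */
                printf("ioapic %d releases \"%s\"\n",
                       i, ioapics[i].iomem_res->name);
        }
        return 0;
}

With the buggy assignment both ioapics record &res[0], so hot-removing the second one releases the first one's resource while the entry it actually owns stays registered.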
Signed-off-by: Rui Wang Signed-off-by: Thomas Gleixner Cc: tony.luck@intel.com Cc: linux-pci@vger.kernel.org Cc: rjw@rjwysocki.net Cc: linux-acpi@vger.kernel.org Cc: bhelgaas@google.com Link: http://lkml.kernel.org/r/1465369193-4816-3-git-send-email-rui.y.wang@intel.com Signed-off-by: Ingo Molnar Acked-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/apic/io_apic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index fc91c98bee01..fd945099fc95 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2592,8 +2592,8 @@ static struct resource * __init ioapic_setup_resources(void) res[num].flags = IORESOURCE_MEM | IORESOURCE_BUSY; snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i); mem += IOAPIC_RESOURCE_NAME_SIZE; + ioapics[i].iomem_res = &res[num]; num++; - ioapics[i].iomem_res = res; } ioapic_resources = res; -- cgit v1.2.3 From e1e457a49544718928282c56b69af082e1ebe405 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Sun, 20 Aug 2017 13:26:04 +0200 Subject: um: link vmlinux with -no-pie commit 883354afbc109c57f925ccc19840055193da0cc0 upstream. Debian's gcc defaults to pie. The global Makefile already defines the -fno-pie option. Link UML dynamic kernel image also with -no-pie to fix the build. Signed-off-by: Thomas Meyer Signed-off-by: Richard Weinberger Cc: Bernie Innocenti Signed-off-by: Greg Kroah-Hartman --- arch/um/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/um/Makefile b/arch/um/Makefile index e3abe6f3156d..9ccf462131c4 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -117,7 +117,7 @@ archheaders: archprepare: include/generated/user_constants.h LINK-$(CONFIG_LD_SCRIPT_STATIC) += -static -LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib +LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib $(call cc-option, -no-pie) CFLAGS_NO_HARDENING := $(call cc-option, -fno-PIC,) $(call cc-option, -fno-pic,) \ $(call cc-option, -fno-stack-protector,) \ -- cgit v1.2.3 From ed73df0b7f23c95b3243a0f4bfc40f962e61d349 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 26 Jan 2018 16:23:02 +0000 Subject: vsyscall: Fix permissions for emulate mode with KAISER/PTI The backport of KAISER to 4.4 turned vsyscall emulate mode into native mode. Add a vsyscall_pgprot variable to hold the correct page protections, like Borislav and Hugh did for 3.2 and 3.18. Cc: Borislav Petkov Cc: Hugh Dickins Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/vsyscall/vsyscall_64.c | 7 ++++--- arch/x86/include/asm/vsyscall.h | 1 + arch/x86/mm/kaiser.c | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 112178b401a1..2d359991a273 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -46,6 +46,7 @@ static enum { EMULATE, NATIVE, NONE } vsyscall_mode = #else EMULATE; #endif +unsigned long vsyscall_pgprot = __PAGE_KERNEL_VSYSCALL; static int __init vsyscall_setup(char *str) { @@ -336,11 +337,11 @@ void __init map_vsyscall(void) extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); + if (vsyscall_mode != NATIVE) + vsyscall_pgprot = __PAGE_KERNEL_VVAR; if (vsyscall_mode != NONE) __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, - vsyscall_mode == NATIVE - ? 
PAGE_KERNEL_VSYSCALL - : PAGE_KERNEL_VVAR); + __pgprot(vsyscall_pgprot)); BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index 4865e10dbb55..9ee85066f407 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -13,6 +13,7 @@ extern void map_vsyscall(void); */ extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); extern bool vsyscall_enabled(void); +extern unsigned long vsyscall_pgprot; #else static inline void map_vsyscall(void) {} static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 8af98513d36c..2298434f7bdb 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -345,7 +345,7 @@ void __init kaiser_init(void) if (vsyscall_enabled()) kaiser_add_user_map_early((void *)VSYSCALL_ADDR, PAGE_SIZE, - __PAGE_KERNEL_VSYSCALL); + vsyscall_pgprot); for_each_possible_cpu(cpu) { void *percpu_vaddr = __per_cpu_user_mapped_start + -- cgit v1.2.3 From 20c0a04284496b262b172707b7dd122cdd8fe9a1 Mon Sep 17 00:00:00 2001 From: Jia Zhang Date: Tue, 23 Jan 2018 11:41:32 +0100 Subject: x86/microcode/intel: Extend BDW late-loading further with LLC size check commit 7e702d17ed138cf4ae7c00e8c00681ed464587c7 upstream. Commit b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a revision check") reduced the impact of erratum BDF90 for Broadwell model 79. The impact can be reduced further by checking the size of the last level cache portion per core. Tony: "The erratum says the problem only occurs on the large-cache SKUs. So we only need to avoid the update if we are on a big cache SKU that is also running old microcode." For more details, see erratum BDF90 in document #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family Specification Update) from September 2017. Fixes: b94b73733171 ("x86/microcode/intel: Extend BDW late-loading with a revision check") Signed-off-by: Jia Zhang Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Acked-by: Tony Luck Link: https://lkml.kernel.org/r/1516321542-31161-1-git-send-email-zhang.jia@linux.alibaba.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/microcode/intel.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index ee011bd7934d..2c76a1801393 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -39,6 +39,9 @@ #include #include +/* last level cache size per core */ +static int llc_size_per_core; + static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; static struct mc_saved_data { unsigned int mc_saved_count; @@ -996,12 +999,14 @@ static bool is_blacklisted(unsigned int cpu) /* * Late loading on model 79 with microcode revision less than 0x0b000021 - * may result in a system hang. This behavior is documented in item - * BDF90, #334165 (Intel Xeon Processor E7-8800/4800 v4 Product Family). + * and LLC size per core bigger than 2.5MB may result in a system hang. + * This behavior is documented in item BDF90, #334165 (Intel Xeon + * Processor E7-8800/4800 v4 Product Family). 
*/ if (c->x86 == 6 && c->x86_model == 79 && c->x86_mask == 0x01 && + llc_size_per_core > 2621440 && c->microcode < 0x0b000021) { pr_err_once("Erratum BDF90: late loading with revision < 0x0b000021 (0x%x) disabled.\n", c->microcode); pr_err_once("Please consider either early loading through initrd/built-in or a potential BIOS update.\n"); @@ -1068,6 +1073,15 @@ static struct microcode_ops microcode_intel_ops = { .microcode_fini_cpu = microcode_fini_cpu, }; +static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c) +{ + u64 llc_size = c->x86_cache_size * 1024; + + do_div(llc_size, c->x86_max_cores); + + return (int)llc_size; +} + struct microcode_ops * __init init_intel_microcode(void) { struct cpuinfo_x86 *c = &boot_cpu_data; @@ -1078,6 +1092,8 @@ struct microcode_ops * __init init_intel_microcode(void) return NULL; } + llc_size_per_core = calc_llc_size_per_core(c); + return µcode_intel_ops; } -- cgit v1.2.3 From 5a802e670c46ee2027ae43ec03501ecdb85d080a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 30 Jan 2018 03:37:39 +0100 Subject: x86: bpf_jit: small optimization in emit_bpf_tail_call() [ upstream commit 84ccac6e7854ebbfb56d2fc6d5bef9be49bb304c ] Saves 4 bytes replacing following instructions : lea rax, [rsi + rdx * 8 + offsetof(...)] mov rax, qword ptr [rax] cmp rax, 0 by : mov rax, [rsi + rdx * 8 + offsetof(...)] test rax, rax Signed-off-by: Eric Dumazet Cc: Alexei Starovoitov Cc: Daniel Borkmann Acked-by: Daniel Borkmann Acked-by: Alexei Starovoitov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/x86/net/bpf_jit_comp.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 75991979f667..33f002e4cd01 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -269,7 +269,7 @@ static void emit_bpf_tail_call(u8 **pprog) EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ offsetof(struct bpf_array, map.max_entries)); EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ -#define OFFSET1 47 /* number of bytes to jump */ +#define OFFSET1 43 /* number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; @@ -278,21 +278,20 @@ static void emit_bpf_tail_call(u8 **pprog) */ EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 36 +#define OFFSET2 32 EMIT2(X86_JA, OFFSET2); /* ja out */ label2 = cnt; EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ EMIT2_off32(0x89, 0x85, -STACKSIZE + 36); /* mov dword ptr [rbp - 516], eax */ /* prog = array->ptrs[index]; */ - EMIT4_off32(0x48, 0x8D, 0x84, 0xD6, /* lea rax, [rsi + rdx * 8 + offsetof(...)] */ + EMIT4_off32(0x48, 0x8B, 0x84, 0xD6, /* mov rax, [rsi + rdx * 8 + offsetof(...)] */ offsetof(struct bpf_array, ptrs)); - EMIT3(0x48, 0x8B, 0x00); /* mov rax, qword ptr [rax] */ /* if (prog == NULL) * goto out; */ - EMIT4(0x48, 0x83, 0xF8, 0x00); /* cmp rax, 0 */ + EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ #define OFFSET3 10 EMIT2(X86_JE, OFFSET3); /* je out */ label3 = cnt; -- cgit v1.2.3 From 361fb0481247bea4da3eb122e685c8b72ef7c8a9 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 Jan 2018 03:37:40 +0100 Subject: bpf: fix bpf_tail_call() x64 JIT [ upstream commit 90caccdd8cc0215705f18b92771b449b01e2474a ] - bpf prog_array just like all other types of bpf array accepts 32-bit index. Clarify that in the comment. 
- fix x64 JIT of bpf_tail_call which was incorrectly loading 8 instead of 4 bytes - tighten corresponding check in the interpreter to stay consistent The JIT bug can be triggered after introduction of BPF_F_NUMA_NODE flag in commit 96eabe7a40aa in 4.14. Before that the map_flags would stay zero and though JIT code is wrong it will check bounds correctly. Hence two fixes tags. All other JITs don't have this problem. Signed-off-by: Alexei Starovoitov Fixes: 96eabe7a40aa ("bpf: Allow selecting numa node during map creation") Fixes: b52f00e6a715 ("x86: bpf_jit: implement bpf_tail_call() helper") Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- arch/x86/net/bpf_jit_comp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 33f002e4cd01..33c42b826791 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -266,9 +266,9 @@ static void emit_bpf_tail_call(u8 **pprog) /* if (index >= array->map.max_entries) * goto out; */ - EMIT4(0x48, 0x8B, 0x46, /* mov rax, qword ptr [rsi + 16] */ + EMIT2(0x89, 0xD2); /* mov edx, edx */ + EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); - EMIT3(0x48, 0x39, 0xD0); /* cmp rax, rdx */ #define OFFSET1 43 /* number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; -- cgit v1.2.3 From 28c486744e6de4d882a1d853aa63d99fcba4b7a6 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 30 Jan 2018 03:37:41 +0100 Subject: bpf: introduce BPF_JIT_ALWAYS_ON config [ upstream commit 290af86629b25ffd1ed6232c4e9107da031705cb ] The BPF interpreter has been used as part of the spectre 2 attack CVE-2017-5715. A quote from goolge project zero blog: "At this point, it would normally be necessary to locate gadgets in the host kernel code that can be used to actually leak data by reading from an attacker-controlled location, shifting and masking the result appropriately and then using the result of that as offset to an attacker-controlled address for a load. But piecing gadgets together and figuring out which ones work in a speculation context seems annoying. So instead, we decided to use the eBPF interpreter, which is built into the host kernel - while there is no legitimate way to invoke it from inside a VM, the presence of the code in the host kernel's text section is sufficient to make it usable for the attack, just like with ordinary ROP gadgets." To make attacker job harder introduce BPF_JIT_ALWAYS_ON config option that removes interpreter from the kernel in favor of JIT-only mode. So far eBPF JIT is supported by: x64, arm64, arm32, sparc64, s390, powerpc64, mips64 The start of JITed program is randomized and code page is marked as read-only. In addition "constant blinding" can be turned on with net.core.bpf_jit_harden v2->v3: - move __bpf_prog_ret0 under ifdef (Daniel) v1->v2: - fix init order, test_bpf and cBPF (Daniel's feedback) - fix offloaded bpf (Jakub's feedback) - add 'return 0' dummy in case something can invoke prog->bpf_func - retarget bpf tree. For bpf-next the patch would need one extra hunk. 
It will be sent when the trees are merged back to net-next Considered doing: int bpf_jit_enable __read_mostly = BPF_EBPF_JIT_DEFAULT; but it seems better to land the patch as-is and in bpf-next remove bpf_jit_enable global variable from all JITs, consolidate in one place and remove this jit_init() function. Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Greg Kroah-Hartman --- arch/arm64/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/x86/Kconfig | 1 + 3 files changed, 3 insertions(+) (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 14cdc6dea493..83af36d9439f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -54,6 +54,7 @@ config ARM64 select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_BPF_JIT + select HAVE_EBPF_JIT select HAVE_C_RECORDMCOUNT select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 5ad7b721b769..2ee95ece0498 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -123,6 +123,7 @@ config S390 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_BPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES + select HAVE_EBPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL select HAVE_DEBUG_KMEMLEAK diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 75d0053b495a..2db93042f2f3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -88,6 +88,7 @@ config X86 select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_BPF_JIT if X86_64 + select HAVE_EBPF_JIT if X86_64 select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL -- cgit v1.2.3 From 5991ee90a270537a8a04751f0097b82274ebc177 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 10 Jan 2018 14:49:39 -0800 Subject: x86/pti: Make unpoison of pgd for trusted boot work for real commit 445b69e3b75e42362a5bdc13c8b8f61599e2228a upstream. The inital fix for trusted boot and PTI potentially misses the pgd clearing if pud_alloc() sets a PGD. It probably works in *practice* because for two adjacent calls to map_tboot_page() that share a PGD entry, the first will clear NX, *then* allocate and set the PGD (without NX clear). The second call will *not* allocate but will clear the NX bit. Defer the NX clearing to a point after it is known that all top-level allocations have occurred. Add a comment to clarify why. [ tglx: Massaged changelog ] [hughd notes: I have not tested tboot, but this looks to me as necessary and as safe in old-Kaiser backports as it is upstream; I'm not submitting the commit-to-be-fixed 262b6b30087, since it was undone by 445b69e3b75e, and makes conflict trouble because of 5-level's p4d versus 4-level's pgd.] 
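The ordering hazard can be sketched compactly (simplified from map_tboot_page(); error handling and the pmd/pte steps are elided, so this is an illustration rather than the patched code):

        pgd = pgd_offset(&tboot_mm, vaddr);
        pud = pud_alloc(&tboot_mm, pgd, vaddr); /* may write *pgd, with NX still set */
        /* ... pmd/pte allocation and set_pte_at() as in the existing code ... */
        pgd->pgd &= ~_PAGE_NX;                  /* clear only now, once *pgd is final */

Clearing NX any earlier can be silently undone by the pud_alloc() that populates the entry, which is exactly the case the initial fix missed.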
Fixes: 262b6b30087 ("x86/tboot: Unbreak tboot with PTI enabled") Signed-off-by: Dave Hansen Signed-off-by: Thomas Gleixner Reviewed-by: Andrea Arcangeli Cc: Jon Masters Cc: "Tim Chen" Cc: gnomes@lxorguk.ukuu.org.uk Cc: peterz@infradead.org Cc: ning.sun@intel.com Cc: tboot-devel@lists.sourceforge.net Cc: andi@firstfloor.org Cc: luto@kernel.org Cc: law@redhat.com Cc: pbonzini@redhat.com Cc: torvalds@linux-foundation.org Cc: gregkh@linux-foundation.org Cc: dwmw@amazon.co.uk Cc: nickc@redhat.com Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20180110224939.2695CD47@viggo.jf.intel.com Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/tboot.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 91a4496db434..c77ab1f51fbe 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -140,6 +140,16 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, return -1; set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); pte_unmap(pte); + + /* + * PTI poisons low addresses in the kernel page tables in the + * name of making them unusable for userspace. To execute + * code at such a low address, the poison must be cleared. + * + * Note: 'pgd' actually gets set in pud_alloc(). + */ + pgd->pgd &= ~_PAGE_NX; + return 0; } -- cgit v1.2.3 From 145ebf95fb346528dd276c3e23324609e5f4d3f6 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 29 Jan 2018 18:15:33 -0800 Subject: kaiser: fix intel_bts perf crashes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Vince reported perf_fuzzer quickly locks up on 4.15-rc7 with PTI; Robert reported Bad RIP with KPTI and Intel BTS also on 4.15-rc7: honggfuzz -f /tmp/somedirectorywithatleastonefile \ --linux_perf_bts_edge -s -- /bin/true (honggfuzz from https://github.com/google/honggfuzz) crashed with BUG: unable to handle kernel paging request at ffff9d3215100000 (then narrowed it down to perf record --per-thread -e intel_bts//u -- /bin/ls). The intel_bts driver does not use the 'normal' BTS buffer which is exposed through kaiser_add_mapping(), but instead uses the memory allocated for the perf AUX buffer. This obviously comes apart when using PTI, because then the kernel mapping, which includes that AUX buffer memory, disappears while switched to user page tables. Easily fixed in old-Kaiser backports, by applying kaiser_add_mapping() to those pages; perhaps not so easy for upstream, where 4.15-rc8 commit 99a9dc98ba52 ("x86,perf: Disable intel_bts when PTI") disables for now. Slightly reorganized surrounding code in bts_buffer_setup_aux(), so it can better match bts_buffer_free_aux(): free_aux with an #ifdef to avoid the loop when PTI is off, but setup_aux needs to loop anyway (and kaiser_add_mapping() is cheap when PTI config is off or "pti=off"). 
Reported-by: Vince Weaver Reported-by: Robert Święcki Analyzed-by: Peter Zijlstra Analyzed-by: Stephane Eranian Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Andy Lutomirski Cc: Alexander Shishkin Cc: Linus Torvalds Cc: Vince Weaver Cc: stable@vger.kernel.org Cc: Jiri Kosina Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/perf_event_intel_bts.c | 44 ++++++++++++++++++++++-------- 1 file changed, 33 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c index 2cad71d1b14c..5af11c46d0b9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_bts.c +++ b/arch/x86/kernel/cpu/perf_event_intel_bts.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -67,6 +68,23 @@ static size_t buf_size(struct page *page) return 1 << (PAGE_SHIFT + page_private(page)); } +static void bts_buffer_free_aux(void *data) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + struct bts_buffer *buf = data; + int nbuf; + + for (nbuf = 0; nbuf < buf->nr_bufs; nbuf++) { + struct page *page = buf->buf[nbuf].page; + void *kaddr = page_address(page); + size_t page_size = buf_size(page); + + kaiser_remove_mapping((unsigned long)kaddr, page_size); + } +#endif + kfree(data); +} + static void * bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite) { @@ -103,29 +121,33 @@ bts_buffer_setup_aux(int cpu, void **pages, int nr_pages, bool overwrite) buf->real_size = size - size % BTS_RECORD_SIZE; for (pg = 0, nbuf = 0, offset = 0, pad = 0; nbuf < buf->nr_bufs; nbuf++) { - unsigned int __nr_pages; + void *kaddr = pages[pg]; + size_t page_size; + + page = virt_to_page(kaddr); + page_size = buf_size(page); + + if (kaiser_add_mapping((unsigned long)kaddr, + page_size, __PAGE_KERNEL) < 0) { + buf->nr_bufs = nbuf; + bts_buffer_free_aux(buf); + return NULL; + } - page = virt_to_page(pages[pg]); - __nr_pages = PagePrivate(page) ? 1 << page_private(page) : 1; buf->buf[nbuf].page = page; buf->buf[nbuf].offset = offset; buf->buf[nbuf].displacement = (pad ? BTS_RECORD_SIZE - pad : 0); - buf->buf[nbuf].size = buf_size(page) - buf->buf[nbuf].displacement; + buf->buf[nbuf].size = page_size - buf->buf[nbuf].displacement; pad = buf->buf[nbuf].size % BTS_RECORD_SIZE; buf->buf[nbuf].size -= pad; - pg += __nr_pages; - offset += __nr_pages << PAGE_SHIFT; + pg += page_size >> PAGE_SHIFT; + offset += page_size; } return buf; } -static void bts_buffer_free_aux(void *data) -{ - kfree(data); -} - static unsigned long bts_buffer_offset(struct bts_buffer *buf, unsigned int idx) { return buf->buf[idx].offset + buf->buf[idx].displacement; -- cgit v1.2.3 From 0b69d7f4b5fc1ac0cf49a21139a1384e6d0c934f Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Thu, 18 Jan 2018 20:41:09 +0100 Subject: crypto: aesni - handle zero length dst buffer commit 9c674e1e2f9e24fa4392167efe343749008338e0 upstream. GCM can be invoked with a zero destination buffer. This is possible if the AAD and the ciphertext have zero lengths and only the tag exists in the source buffer (i.e. a source buffer cannot be zero). In this case, the GCM cipher only performs the authentication and no decryption operation. When the destination buffer has zero length, it is possible that no page is mapped to the SG pointing to the destination. In this case, sg_page(req->dst) is an invalid access. Therefore, page accesses should only be allowed if the req->dst->length is non-zero which is the indicator that a page must exist. 
This fixes a crash that can be triggered by user space via AF_ALG. Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- arch/x86/crypto/aesni-intel_glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 3633ad6145c5..c18806b5db2a 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -965,7 +965,7 @@ static int helper_rfc4106_encrypt(struct aead_request *req) if (sg_is_last(req->src) && req->src->offset + req->src->length <= PAGE_SIZE && - sg_is_last(req->dst) && + sg_is_last(req->dst) && req->dst->length && req->dst->offset + req->dst->length <= PAGE_SIZE) { one_entry_in_sg = 1; scatterwalk_start(&src_sg_walk, req->src); -- cgit v1.2.3 From 3d4df917d67191bca18aec35b9d1065d718fc39c Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sun, 5 Nov 2017 16:56:33 +0200 Subject: KVM: x86: emulator: Return to user-mode on L1 CPL=0 emulation failure MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 1f4dcb3b213235e642088709a1c54964d23365e9 ] On this case, handle_emulation_failure() fills kvm_run with internal-error information which it expects to be delivered to user-mode for further processing. However, the code reports a wrong return-value which makes KVM to never return to user-mode on this scenario. Fixes: 6d77dbfc88e3 ("KVM: inject #UD if instruction emulation fails and exit to userspace") Signed-off-by: Liran Alon Reviewed-by: Nikita Leshenko Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: Wanpeng Li Signed-off-by: Radim Krčmář Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f973cfa8ff4f..3900d34980de 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5153,7 +5153,7 @@ static int handle_emulation_failure(struct kvm_vcpu *vcpu) vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; - r = EMULATE_FAIL; + r = EMULATE_USER_EXIT; } kvm_queue_exception(vcpu, UD_VECTOR); -- cgit v1.2.3 From 80d2b5af21d2a29abfd58d378195a446dbc3ae43 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Sun, 5 Nov 2017 16:56:34 +0200 Subject: KVM: x86: Don't re-execute instruction when not passing CR2 value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 9b8ae63798cb97e785a667ff27e43fa6220cb734 ] In case of instruction-decode failure or emulation failure, x86_emulate_instruction() will call reexecute_instruction() which will attempt to use the cr2 value passed to x86_emulate_instruction(). However, when x86_emulate_instruction() is called from emulate_instruction(), cr2 is not passed (passed as 0) and therefore it doesn't make sense to execute reexecute_instruction() logic at all. 
Fixes: 51d8b66199e9 ("KVM: cleanup emulate_instruction") Signed-off-by: Liran Alon Reviewed-by: Nikita Leshenko Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: Wanpeng Li Signed-off-by: Radim Krčmář Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/kvm_host.h | 3 ++- arch/x86/kvm/vmx.c | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9d2abb2a41d2..74fda1a453bd 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -998,7 +998,8 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2, static inline int emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) { - return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); + return x86_emulate_instruction(vcpu, 0, + emulation_type | EMULTYPE_NO_REEXECUTE, NULL, 0); } void kvm_enable_efer_bits(u64); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 75d60e40c389..e35218490305 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -6023,7 +6023,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu) if (test_bit(KVM_REQ_EVENT, &vcpu->requests)) return 1; - err = emulate_instruction(vcpu, EMULTYPE_NO_REEXECUTE); + err = emulate_instruction(vcpu, 0); if (err == EMULATE_USER_EXIT) { ++vcpu->stat.mmio_exits; -- cgit v1.2.3 From 3bad2ece682151144b014bd18506d44acf6c21e4 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Sun, 5 Nov 2017 16:54:47 -0800 Subject: KVM: X86: Fix operand/address-size during instruction decoding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 3853be2603191829b442b64dac6ae8ba0c027bf9 ] Pedro reported: During tests that we conducted on KVM, we noticed that executing a "PUSH %ES" instruction under KVM produces different results on both memory and the SP register depending on whether EPT support is enabled. With EPT the SP is reduced by 4 bytes (and the written value is 0-padded) but without EPT support it is only reduced by 2 bytes. The difference can be observed when the CS.DB field is 1 (32-bit) but not when it's 0 (16-bit). The internal segment descriptor cache exists even in real/vm8086 mode. CS.D should also be respected during instruction decoding, not just the default operand/address-size and the 66H/67H prefixes. This patch fixes it by also adjusting the operand/address-size according to CS.D.
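In concrete terms (an illustrative sketch only, using the local names visible in the x86_decode_insn() hunk below): a "PUSH %ES" in real or vm86 mode should move SP by 4 and store a zero-extended 32-bit value when the cached CS descriptor has D=1, and by 2 when D=0, so the decoder has to consult that cached descriptor rather than assume 16-bit defaults:

        /* Sketch of the rule the patch adds to the REAL/VM86 decode path. */
        def_op_bytes = def_ad_bytes = 2;        /* 16-bit defaults */
        ctxt->ops->get_segment(ctxt, &dummy, &desc, NULL, VCPU_SREG_CS);
        if (desc.d)                             /* CS.D set: 32-bit defaults */
                def_op_bytes = def_ad_bytes = 4;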
Reported-by: Pedro Fonseca Tested-by: Pedro Fonseca Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Nadav Amit Cc: Pedro Fonseca Signed-off-by: Wanpeng Li Reviewed-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/emulate.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 00045499f6c2..e4eb1d2bf849 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -4978,6 +4978,8 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) bool op_prefix = false; bool has_seg_override = false; struct opcode opcode; + u16 dummy; + struct desc_struct desc; ctxt->memop.type = OP_NONE; ctxt->memopp = NULL; @@ -4996,6 +4998,11 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) switch (mode) { case X86EMUL_MODE_REAL: case X86EMUL_MODE_VM86: + def_op_bytes = def_ad_bytes = 2; + ctxt->ops->get_segment(ctxt, &dummy, &desc, NULL, VCPU_SREG_CS); + if (desc.d) + def_op_bytes = def_ad_bytes = 4; + break; case X86EMUL_MODE_PROT16: def_op_bytes = def_ad_bytes = 2; break; -- cgit v1.2.3 From 19a2c007214931808cbeae3ceb673233e57ed4f9 Mon Sep 17 00:00:00 2001 From: Nikita Leshenko Date: Sun, 5 Nov 2017 15:52:29 +0200 Subject: KVM: x86: ioapic: Fix level-triggered EOI and IOAPIC reconfigure race MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 0fc5a36dd6b345eb0d251a65c236e53bead3eef7 ] KVM uses ioapic_handled_vectors to track vectors that need to notify the IOAPIC on EOI. The problem is that the IOAPIC can be reconfigured while an interrupt with the old configuration is pending or running, and ioapic_handled_vectors only remembers the newest configuration; thus the EOI from the old interrupt is not delivered to the IOAPIC. A previous commit db2bdcbbbd32 ("KVM: x86: fix edge EOI and IOAPIC reconfig race") addressed this issue by adding pending edge-triggered interrupts to ioapic_handled_vectors, fixing this race for edge-triggered interrupts. The commit explicitly ignored level-triggered interrupts, but this race applies to them as well: 1) IOAPIC sends a level-triggered interrupt vector to VCPU0 2) VCPU0's handler deasserts the irq line and reconfigures the IOAPIC to route the vector to VCPU1. The reconfiguration rewrites only the upper 32 bits of the IOREDTBLn register. (Causes KVM to update ioapic_handled_vectors for VCPU0 and it no longer includes the vector.) 3) VCPU0 sends EOI for the vector, but it's not delivered to the IOAPIC because ioapic_handled_vectors doesn't include the vector. 4) New interrupts are not delivered to VCPU1 because the remote_irr bit is set forever. Therefore, the correct behavior is to add all pending and running interrupts to ioapic_handled_vectors. This commit introduces a slight performance hit similar to commit db2bdcbbbd32 ("KVM: x86: fix edge EOI and IOAPIC reconfig race") for the rare case that the vector is reused by a non-IOAPIC source on VCPU0. We prefer to keep the solution simple and not handle this case, just as the original commit does.
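In code terms, the fix below simply drops the edge-trigger restriction in kvm_ioapic_scan_entry(), so the vector is kept in eoi_exit_bitmap whenever the entry targets this vCPU or an EOI for it is still pending, regardless of trigger mode (a sketch of the resulting condition, taken from the diff that follows):

        if (kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id,
                                e->fields.dest_mode) ||
            kvm_apic_pending_eoi(vcpu, e->fields.vector))
                __set_bit(e->fields.vector, (unsigned long *)eoi_exit_bitmap);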
Fixes: db2bdcbbbd32 ("KVM: x86: fix edge EOI and IOAPIC reconfig race") Signed-off-by: Nikita Leshenko Reviewed-by: Liran Alon Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Radim Krčmář Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/ioapic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 3aab53f8cad2..96ee7091becf 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -247,8 +247,7 @@ void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap) index == RTC_GSI) { if (kvm_apic_match_dest(vcpu, NULL, 0, e->fields.dest_id, e->fields.dest_mode) || - (e->fields.trig_mode == IOAPIC_EDGE_TRIG && - kvm_apic_pending_eoi(vcpu, e->fields.vector))) + kvm_apic_pending_eoi(vcpu, e->fields.vector)) __set_bit(e->fields.vector, (unsigned long *)eoi_exit_bitmap); } -- cgit v1.2.3 From 3ee7ae2bb537e54d3f6ce7120c5cc39957bec930 Mon Sep 17 00:00:00 2001 From: Nikita Leshenko Date: Sun, 5 Nov 2017 15:52:32 +0200 Subject: KVM: x86: ioapic: Clear Remote IRR when entry is switched to edge-triggered MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit a8bfec2930525808c01f038825d1df3904638631 ] Some OSes (Linux, Xen) use this behavior to clear the Remote IRR bit for IOAPICs without an EOI register. They simulate the EOI message manually by changing the trigger mode to edge and then back to level, with the entry being masked during this. QEMU implements this feature in commit ed1263c363c9 ("ioapic: clear remote irr bit for edge-triggered interrupts") As a side effect, this commit removes an incorrect behavior where Remote IRR was cleared when the redirection table entry was rewritten. This is not consistent with the manual and also opens an opportunity for a strange behavior when a redirection table entry is modified from an interrupt handler that handles the same entry: The modification will clear the Remote IRR bit even though the interrupt handler is still running. Signed-off-by: Nikita Leshenko Reviewed-by: Liran Alon Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: Wanpeng Li Reviewed-by: Steve Rutherford Signed-off-by: Radim Krčmář Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/ioapic.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 96ee7091becf..403dd464965c 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -296,8 +296,17 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) } else { e->bits &= ~0xffffffffULL; e->bits |= (u32) val; - e->fields.remote_irr = 0; } + + /* + * Some OSes (Linux, Xen) assume that Remote IRR bit will + * be cleared by IOAPIC hardware when the entry is configured + * as edge-triggered. This behavior is used to simulate an + * explicit EOI on IOAPICs that don't have the EOI register. 
+ */ + if (e->fields.trig_mode == IOAPIC_EDGE_TRIG) + e->fields.remote_irr = 0; + mask_after = e->fields.mask; if (mask_before != mask_after) kvm_fire_mask_notifiers(ioapic->kvm, KVM_IRQCHIP_IOAPIC, index, mask_after); -- cgit v1.2.3 From 77e1eb750f4172f0718a6d770798b0ab9eeeb8d6 Mon Sep 17 00:00:00 2001 From: Nikita Leshenko Date: Sun, 5 Nov 2017 15:52:33 +0200 Subject: KVM: x86: ioapic: Preserve read-only values in the redirection table MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit b200dded0a6974a3b69599832b2203483920ab25 ] According to 82093AA (IOAPIC) manual, Remote IRR and Delivery Status are read-only. QEMU implements the bits as RO in commit 479c2a1cb7fb ("ioapic: keep RO bits for IOAPIC entry"). Signed-off-by: Nikita Leshenko Reviewed-by: Liran Alon Signed-off-by: Konrad Rzeszutek Wilk Reviewed-by: Wanpeng Li Reviewed-by: Steve Rutherford Signed-off-by: Radim Krčmář Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/ioapic.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 403dd464965c..d380111351c0 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -268,6 +268,7 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) { unsigned index; bool mask_before, mask_after; + int old_remote_irr, old_delivery_status; union kvm_ioapic_redirect_entry *e; switch (ioapic->ioregsel) { @@ -290,6 +291,9 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) return; e = &ioapic->redirtbl[index]; mask_before = e->fields.mask; + /* Preserve read-only fields */ + old_remote_irr = e->fields.remote_irr; + old_delivery_status = e->fields.delivery_status; if (ioapic->ioregsel & 1) { e->bits &= 0xffffffff; e->bits |= (u64) val << 32; @@ -297,6 +301,8 @@ static void ioapic_write_indirect(struct kvm_ioapic *ioapic, u32 val) e->bits &= ~0xffffffffULL; e->bits |= (u32) val; } + e->fields.remote_irr = old_remote_irr; + e->fields.delivery_status = old_delivery_status; /* * Some OSes (Linux, Xen) assume that Remote IRR bit will -- cgit v1.2.3 From fe7832ffea9ab8c579b4bd94f3529380dbb3f6e6 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Mon, 20 Nov 2017 14:52:21 -0800 Subject: KVM: VMX: Fix rflags cache during vCPU reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit c37c28730bb031cc8a44a130c2555c0f3efbe2d0 ] Reported by syzkaller: *** Guest State *** CR0: actual=0x0000000080010031, shadow=0x0000000060000010, gh_mask=fffffffffffffff7 CR4: actual=0x0000000000002061, shadow=0x0000000000000000, gh_mask=ffffffffffffe8f1 CR3 = 0x000000002081e000 RSP = 0x000000000000fffa RIP = 0x0000000000000000 RFLAGS=0x00023000 DR7 = 0x00000000000000 ^^^^^^^^^^ ------------[ cut here ]------------ WARNING: CPU: 6 PID: 24431 at /home/kernel/linux/arch/x86/kvm//x86.c:7302 kvm_arch_vcpu_ioctl_run+0x651/0x2ea0 [kvm] CPU: 6 PID: 24431 Comm: reprotest Tainted: G W OE 4.14.0+ #26 RIP: 0010:kvm_arch_vcpu_ioctl_run+0x651/0x2ea0 [kvm] RSP: 0018:ffff880291d179e0 EFLAGS: 00010202 Call Trace: kvm_vcpu_ioctl+0x479/0x880 [kvm] do_vfs_ioctl+0x142/0x9a0 SyS_ioctl+0x74/0x80 entry_SYSCALL_64_fastpath+0x23/0x9a The failed vmentry is triggered by the following beautified testcase: #include #include #include #include #include #include #include long r[5]; int main() { struct kvm_debugregs dr = { 0 }; r[2] = open("/dev/kvm", O_RDONLY); r[3] = ioctl(r[2], KVM_CREATE_VM, 0); r[4] = 
ioctl(r[3], KVM_CREATE_VCPU, 7); struct kvm_guest_debug debug = { .control = 0xf0403, .arch = { .debugreg[6] = 0x2, .debugreg[7] = 0x2 } }; ioctl(r[4], KVM_SET_GUEST_DEBUG, &debug); ioctl(r[4], KVM_RUN, 0); } which testcase tries to setup the processor specific debug registers and configure vCPU for handling guest debug events through KVM_SET_GUEST_DEBUG. The KVM_SET_GUEST_DEBUG ioctl will get and set rflags in order to set TF bit if single step is needed. All regs' caches are reset to avail and GUEST_RFLAGS vmcs field is reset to 0x2 during vCPU reset. However, the cache of rflags is not reset during vCPU reset. The function vmx_get_rflags() returns an unreset rflags cache value since the cache is marked avail, it is 0 after boot. Vmentry fails if the rflags reserved bit 1 is 0. This patch fixes it by resetting both the GUEST_RFLAGS vmcs field and its cache to 0x2 during vCPU reset. Reported-by: Dmitry Vyukov Tested-by: Dmitry Vyukov Reviewed-by: David Hildenbrand Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Nadav Amit Cc: Dmitry Vyukov Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e35218490305..f8d785aa2e96 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4954,7 +4954,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write64(GUEST_IA32_DEBUGCTL, 0); } - vmcs_writel(GUEST_RFLAGS, 0x02); + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); kvm_rip_write(vcpu, 0xfff0); vmcs_writel(GUEST_GDTR_BASE, 0); -- cgit v1.2.3 From 98c977e62ecee701efe1ba8135747358a2cfba72 Mon Sep 17 00:00:00 2001 From: "Naveen N. Rao" Date: Wed, 22 Jun 2016 21:55:01 +0530 Subject: powerpc/bpf/jit: Disable classic BPF JIT on ppc64le commit 844e3be47693f92a108cb1fb3b0606bf25e9c7a6 upstream. Classic BPF JIT was never ported completely to work on little endian powerpc. However, it can be enabled and will crash the system when used. As such, disable use of BPF JIT on ppc64le. Fixes: 7c105b63bd98 ("powerpc: Add CONFIG_CPU_LITTLE_ENDIAN kernel config option.") Reported-by: Thadeu Lima de Souza Cascardo Signed-off-by: Naveen N. Rao Acked-by: Thadeu Lima de Souza Cascardo Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index dfb1ee8c3e06..0c26025e18f9 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -129,7 +129,7 @@ config PPC select IRQ_FORCED_THREADING select HAVE_RCU_TABLE_FREE if SMP select HAVE_SYSCALL_TRACEPOINTS - select HAVE_BPF_JIT + select HAVE_BPF_JIT if CPU_BIG_ENDIAN select HAVE_ARCH_JUMP_LABEL select ARCH_HAVE_NMI_SAFE_CMPXCHG select ARCH_HAS_GCOV_PROFILE_ALL -- cgit v1.2.3 From 1f67581356f7797696806c948d4e126f9b2a880f Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Mon, 3 Apr 2017 13:25:12 +1000 Subject: powerpc/64: Fix flush_(d|i)cache_range() called from modules commit 8f5f525d5b83f7d76a6baf9c4e94d4bf312ea7f6 upstream. When the kernel is compiled to use 64bit ABIv2 the _GLOBAL() macro does not include a global entry point. A function's global entry point is used when the function is called from a different TOC context and in the kernel this typically means a call from a module into the vmlinux (or vice-versa). 
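For background, a sketch (not part of the patch) of what an ELFv2 global entry point looks like: a function that needs the TOC begins with two instructions that derive r2 from its own entry address in r12 before falling through to the local entry point, which is exactly what the _GLOBAL_TOC()/_KPROBE_TOC() macros in the diff below emit:

        0:      addis   r2,r12,(.TOC.-0b)@ha    /* recompute the callee's TOC from r12 */
                addi    r2,r2,(.TOC.-0b)@l
                .localentry name,.-name          /* same-TOC callers enter here and skip the setup */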
There are a few exported asm functions declared with _GLOBAL() and calling them from a module will likely crash the kernel since any TOC relative load will yield garbage. flush_icache_range() and flush_dcache_range() are both exported to modules, and use the TOC, so must use _GLOBAL_TOC(). Fixes: 721aeaa9fdf3 ("powerpc: Build little endian ppc64 kernel with ABIv2") Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/ppc_asm.h | 12 ++++++++++++ arch/powerpc/kernel/misc_64.S | 4 ++-- 2 files changed, 14 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/ppc_asm.h b/arch/powerpc/include/asm/ppc_asm.h index dd0fc18d8103..160bb2311bbb 100644 --- a/arch/powerpc/include/asm/ppc_asm.h +++ b/arch/powerpc/include/asm/ppc_asm.h @@ -224,6 +224,16 @@ name: \ .globl name; \ name: +#define _KPROBE_TOC(name) \ + .section ".kprobes.text","a"; \ + .align 2 ; \ + .type name,@function; \ + .globl name; \ +name: \ +0: addis r2,r12,(.TOC.-0b)@ha; \ + addi r2,r2,(.TOC.-0b)@l; \ + .localentry name,.-name + #define DOTSYM(a) a #else @@ -261,6 +271,8 @@ name: \ .type GLUE(.,name),@function; \ GLUE(.,name): +#define _KPROBE_TOC(n) _KPROBE(n) + #define DOTSYM(a) GLUE(.,a) #endif diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index db475d41b57a..415e58565745 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -66,7 +66,7 @@ PPC64_CACHES: * flush all bytes from start through stop-1 inclusive */ -_KPROBE(flush_icache_range) +_KPROBE_TOC(flush_icache_range) BEGIN_FTR_SECTION PURGE_PREFETCHED_INS blr @@ -117,7 +117,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_COHERENT_ICACHE) * * flush all bytes from start to stop-1 inclusive */ -_GLOBAL(flush_dcache_range) +_GLOBAL_TOC(flush_dcache_range) /* * Flush the data cache to memory -- cgit v1.2.3 From e13972478e99666787867628fa3ff1b87ce3cd82 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Wed, 16 Aug 2017 16:01:14 +1000 Subject: powerpc: Fix VSX enabling/flushing to also test MSR_FP and MSR_VEC commit 5a69aec945d27e78abac9fd032533d3aaebf7c1e upstream. VSX uses a combination of the old vector registers, the old FP registers and new "second halves" of the FP registers. Thus when we need to see the VSX state in the thread struct (flush_vsx_to_thread()) or when we'll use the VSX in the kernel (enable_kernel_vsx()) we need to ensure they are all flushed into the thread struct if either of them is individually enabled. Unfortunately we only tested if the whole VSX was enabled, not if they were individually enabled. 
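Because the VSX register file overlays the FP registers (VSR 0-31) and the Vector registers (VSR 32-63), live FP or VEC state is also live VSX state, so the check has to look at all three MSR bits (a sketch of the test the patch below installs in both paths):

        if (tsk->thread.regs->msr & (MSR_VSX | MSR_VEC | MSR_FP))
                /* some VSX-visible state is live and must be flushed or given up */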
Fixes: 72cd7b44bc99 ("powerpc: Uncomment and make enable_kernel_vsx() routine available") Signed-off-by: Benjamin Herrenschmidt [mpe: Backported due to changed context] Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/process.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index cf788d7d7e56..a9b10812cbfd 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -209,7 +209,8 @@ void enable_kernel_vsx(void) WARN_ON(preemptible()); #ifdef CONFIG_SMP - if (current->thread.regs && (current->thread.regs->msr & MSR_VSX)) + if (current->thread.regs && + (current->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP))) giveup_vsx(current); else giveup_vsx(NULL); /* just enable vsx for kernel - force */ @@ -231,7 +232,7 @@ void flush_vsx_to_thread(struct task_struct *tsk) { if (tsk->thread.regs) { preempt_disable(); - if (tsk->thread.regs->msr & MSR_VSX) { + if (tsk->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP)) { #ifdef CONFIG_SMP BUG_ON(tsk != current); #endif -- cgit v1.2.3 From fa34303532dcd4c609729cd0cc8aa2c53b3eb4fd Mon Sep 17 00:00:00 2001 From: Alan Modra Date: Fri, 15 Jan 2016 20:52:22 +1100 Subject: powerpc: Simplify module TOC handling commit c153693d7eb9eeb28478aa2deaaf0b4e7b5ff5e9 upstream. PowerPC64 uses the symbol .TOC. much as other targets use _GLOBAL_OFFSET_TABLE_. It identifies the value of the GOT pointer (or in powerpc parlance, the TOC pointer). Global offset tables are generally local to an executable or shared library, or in the kernel, module. Thus it does not make sense for a module to resolve a relocation against .TOC. to the kernel's .TOC. value. A module has its own .TOC., and indeed the powerpc64 module relocation processing ignores the kernel value of .TOC. and instead calculates a module-local value. This patch removes code involved in exporting the kernel .TOC., tweaks modpost to ignore an undefined .TOC., and the module loader to twiddle the section symbol so that .TOC. isn't seen as undefined. Note that if the kernel was compiled with -msingle-pic-base then ELFv2 would not have function global entry code setting up r2. In that case the module call stubs would need to be modified to set up r2 using the kernel .TOC. value, requiring some of this code to be reinstated. mpe: Furthermore a change in binutils master (not yet released) causes the current way we handle the TOC to no longer work when building with MODVERSIONS=y and RELOCATABLE=n. The symptom is that modules can not be loaded due to there being no version found for TOC. Signed-off-by: Alan Modra Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/misc_64.S | 28 ---------------------------- arch/powerpc/kernel/module_64.c | 12 +++++++++--- 2 files changed, 9 insertions(+), 31 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 415e58565745..107588295b39 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -701,31 +701,3 @@ _GLOBAL(kexec_sequence) li r5,0 blr /* image->start(physid, image->start, 0); */ #endif /* CONFIG_KEXEC */ - -#ifdef CONFIG_MODULES -#if defined(_CALL_ELF) && _CALL_ELF == 2 - -#ifdef CONFIG_MODVERSIONS -.weak __crc_TOC. -.section "___kcrctab+TOC.","a" -.globl __kcrctab_TOC. -__kcrctab_TOC.: - .llong __crc_TOC. -#endif - -/* - * Export a fake .TOC. since both modpost and depmod will complain otherwise. 
- * Both modpost and depmod strip the leading . so we do the same here. - */ -.section "__ksymtab_strings","a" -__kstrtab_TOC.: - .asciz "TOC." - -.section "___ksymtab+TOC.","a" -/* This symbol name is important: it's used by modpost to find exported syms */ -.globl __ksymtab_TOC. -__ksymtab_TOC.: - .llong 0 /* .value */ - .llong __kstrtab_TOC. -#endif /* ELFv2 */ -#endif /* MODULES */ diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index e4f7d4eed20c..08b7a40de5f8 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -326,7 +326,10 @@ static void dedotify_versions(struct modversion_info *vers, } } -/* Undefined symbols which refer to .funcname, hack to funcname (or .TOC.) */ +/* + * Undefined symbols which refer to .funcname, hack to funcname. Make .TOC. + * seem to be defined (value set later). + */ static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab) { unsigned int i; @@ -334,8 +337,11 @@ static void dedotify(Elf64_Sym *syms, unsigned int numsyms, char *strtab) for (i = 1; i < numsyms; i++) { if (syms[i].st_shndx == SHN_UNDEF) { char *name = strtab + syms[i].st_name; - if (name[0] == '.') + if (name[0] == '.') { + if (strcmp(name+1, "TOC.") == 0) + syms[i].st_shndx = SHN_ABS; syms[i].st_name++; + } } } } @@ -351,7 +357,7 @@ static Elf64_Sym *find_dot_toc(Elf64_Shdr *sechdrs, numsyms = sechdrs[symindex].sh_size / sizeof(Elf64_Sym); for (i = 1; i < numsyms; i++) { - if (syms[i].st_shndx == SHN_UNDEF + if (syms[i].st_shndx == SHN_ABS && strcmp(strtab + syms[i].st_name, "TOC.") == 0) return &syms[i]; } -- cgit v1.2.3 From e1c1144053753341f6767555d1e0fa043aa71e6b Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Tue, 9 Jan 2018 03:52:05 +1100 Subject: powerpc/pseries: Add H_GET_CPU_CHARACTERISTICS flags & wrapper commit 191eccb1580939fb0d47deb405b82a85b0379070 upstream. A new hypervisor call has been defined to communicate various characteristics of the CPU to guests. Add definitions for the hcall number, flags and a wrapper function. 
Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman [Balbir fixed conflicts in backport] Signed-off-by: Balbir Singh Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/hvcall.h | 17 +++++++++++++++++ arch/powerpc/include/asm/plpar_wrappers.h | 14 ++++++++++++++ 2 files changed, 31 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 85bc8c0d257b..51adbde09845 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -239,6 +239,7 @@ #define H_GET_HCA_INFO 0x1B8 #define H_GET_PERF_COUNT 0x1BC #define H_MANAGE_TRACE 0x1C0 +#define H_GET_CPU_CHARACTERISTICS 0x1C8 #define H_FREE_LOGICAL_LAN_BUFFER 0x1D4 #define H_QUERY_INT_STATE 0x1E4 #define H_POLL_PENDING 0x1D8 @@ -285,6 +286,17 @@ #define H_SET_MODE_RESOURCE_ADDR_TRANS_MODE 3 #define H_SET_MODE_RESOURCE_LE 4 +/* H_GET_CPU_CHARACTERISTICS return values */ +#define H_CPU_CHAR_SPEC_BAR_ORI31 (1ull << 63) // IBM bit 0 +#define H_CPU_CHAR_BCCTRL_SERIALISED (1ull << 62) // IBM bit 1 +#define H_CPU_CHAR_L1D_FLUSH_ORI30 (1ull << 61) // IBM bit 2 +#define H_CPU_CHAR_L1D_FLUSH_TRIG2 (1ull << 60) // IBM bit 3 +#define H_CPU_CHAR_L1D_THREAD_PRIV (1ull << 59) // IBM bit 4 + +#define H_CPU_BEHAV_FAVOUR_SECURITY (1ull << 63) // IBM bit 0 +#define H_CPU_BEHAV_L1D_FLUSH_PR (1ull << 62) // IBM bit 1 +#define H_CPU_BEHAV_BNDS_CHK_SPEC_BAR (1ull << 61) // IBM bit 2 + #ifndef __ASSEMBLY__ /** @@ -423,6 +435,11 @@ extern long pseries_big_endian_exceptions(void); #endif /* CONFIG_PPC_PSERIES */ +struct h_cpu_char_result { + u64 character; + u64 behaviour; +}; + #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HVCALL_H */ diff --git a/arch/powerpc/include/asm/plpar_wrappers.h b/arch/powerpc/include/asm/plpar_wrappers.h index 67859edbf8fd..6e05cb397a5c 100644 --- a/arch/powerpc/include/asm/plpar_wrappers.h +++ b/arch/powerpc/include/asm/plpar_wrappers.h @@ -323,4 +323,18 @@ static inline long plapr_set_watchpoint0(unsigned long dawr0, unsigned long dawr return plpar_set_mode(0, H_SET_MODE_RESOURCE_SET_DAWR, dawr0, dawrx0); } +static inline long plpar_get_cpu_characteristics(struct h_cpu_char_result *p) +{ + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + long rc; + + rc = plpar_hcall(H_GET_CPU_CHARACTERISTICS, retbuf); + if (rc == H_SUCCESS) { + p->character = retbuf[0]; + p->behaviour = retbuf[1]; + } + + return rc; +} + #endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */ -- cgit v1.2.3 From a8a9925f01e1bda46f059a66c218d536bb1462ef Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64: Add macros for annotating the destination of rfid/hrfid commit 50e51c13b3822d14ff6df4279423e4b7b2269bc3 upstream. The rfid/hrfid ((Hypervisor) Return From Interrupt) instruction is used for switching from the kernel to userspace, and from the hypervisor to the guest kernel. However it can and is also used for other transitions, eg. from real mode kernel code to virtual mode kernel code, and it's not always clear from the code what the destination context is. To make it clearer when reading the code, add macros which encode the expected destination context. 
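As an illustration of how the annotation reads at a use site (the registers here are placeholders; the next patch performs many such mechanical conversions), an exception return that is known to go back to userspace changes from

        mtspr   SPRN_SRR0,r11
        mtspr   SPRN_SRR1,r12
        rfid
        b       .       /* prevent speculative execution */

to

        mtspr   SPRN_SRR0,r11
        mtspr   SPRN_SRR1,r12
        RFI_TO_USER
        b       .       /* prevent speculative execution */

so that a later patch can insert extra instructions into only the user-bound and guest-bound return paths.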
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/exception-64e.h | 6 ++++++ arch/powerpc/include/asm/exception-64s.h | 29 +++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/include/asm/exception-64e.h b/arch/powerpc/include/asm/exception-64e.h index a703452d67b6..555e22d5e07f 100644 --- a/arch/powerpc/include/asm/exception-64e.h +++ b/arch/powerpc/include/asm/exception-64e.h @@ -209,5 +209,11 @@ exc_##label##_book3e: ori r3,r3,vector_offset@l; \ mtspr SPRN_IVOR##vector_number,r3; +#define RFI_TO_KERNEL \ + rfi + +#define RFI_TO_USER \ + rfi + #endif /* _ASM_POWERPC_EXCEPTION_64E_H */ diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 77f52b26dad6..c8c8a81e3976 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -50,6 +50,35 @@ #define EX_PPR 88 /* SMT thread status register (priority) */ #define EX_CTR 96 +/* Macros for annotating the expected destination of (h)rfid */ + +#define RFI_TO_KERNEL \ + rfid + +#define RFI_TO_USER \ + rfid + +#define RFI_TO_USER_OR_KERNEL \ + rfid + +#define RFI_TO_GUEST \ + rfid + +#define HRFI_TO_KERNEL \ + hrfid + +#define HRFI_TO_USER \ + hrfid + +#define HRFI_TO_USER_OR_KERNEL \ + hrfid + +#define HRFI_TO_GUEST \ + hrfid + +#define HRFI_TO_UNKNOWN \ + hrfid + #ifdef CONFIG_RELOCATABLE #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \ ld r12,PACAKBASE(r13); /* get high part of &label */ \ -- cgit v1.2.3 From 7ca8316cb94f394999f0d512f30984b512f64958 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64s: Simple RFI macro conversions commit 222f20f140623ef6033491d0103ee0875fe87d35 upstream. This commit does simple conversions of rfi/rfid to the new macros that include the expected destination context. By simple we mean cases where there is a single well known destination context, and it's simply a matter of substituting the instruction for the appropriate macro. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman [Balbir fixed issues with backporting to stable] Signed-off-by: Balbir Singh Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/exception-64s.h | 2 +- arch/powerpc/kernel/entry_64.S | 14 +++++++++----- arch/powerpc/kernel/exceptions-64s.S | 18 +++++++++--------- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 7 +++---- arch/powerpc/kvm/book3s_rmhandlers.S | 7 +++++-- arch/powerpc/kvm/book3s_segment.S | 4 ++-- 6 files changed, 29 insertions(+), 23 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index c8c8a81e3976..6dfb6e928f81 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -220,7 +220,7 @@ END_FTR_SECTION_NESTED(ftr,ftr,943) mtspr SPRN_##h##SRR0,r12; \ mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \ mtspr SPRN_##h##SRR1,r10; \ - h##rfid; \ + h##RFI_TO_KERNEL; \ b . /* prevent speculative execution */ #define EXCEPTION_PROLOG_PSERIES_1(label, h) \ __EXCEPTION_PROLOG_PSERIES_1(label, h) diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index f6fd0332c3a2..fb2da020e0ee 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -36,6 +36,11 @@ #include #include #include +#ifdef CONFIG_PPC_BOOK3S +#include +#else +#include +#endif /* * System calls. 
@@ -353,8 +358,7 @@ tabort_syscall: mtmsrd r10, 1 mtspr SPRN_SRR0, r11 mtspr SPRN_SRR1, r12 - - rfid + RFI_TO_USER b . /* prevent speculative execution */ #endif @@ -1077,7 +1081,7 @@ _GLOBAL(enter_rtas) mtspr SPRN_SRR0,r5 mtspr SPRN_SRR1,r6 - rfid + RFI_TO_KERNEL b . /* prevent speculative execution */ rtas_return_loc: @@ -1102,7 +1106,7 @@ rtas_return_loc: mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 - rfid + RFI_TO_KERNEL b . /* prevent speculative execution */ .align 3 @@ -1173,7 +1177,7 @@ _GLOBAL(enter_prom) LOAD_REG_IMMEDIATE(r12, MSR_SF | MSR_ISF | MSR_LE) andc r11,r11,r12 mtsrr1 r11 - rfid + RFI_TO_KERNEL #endif /* CONFIG_PPC_BOOK3E */ 1: /* Return from OF */ diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index b81ccc5fb32d..e3a3b81df363 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -46,7 +46,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ mtspr SPRN_SRR0,r10 ; \ ld r10,PACAKMSR(r13) ; \ mtspr SPRN_SRR1,r10 ; \ - rfid ; \ + RFI_TO_KERNEL ; \ b . ; /* prevent speculative execution */ #define SYSCALL_PSERIES_3 \ @@ -54,7 +54,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_REAL_LE) \ 1: mfspr r12,SPRN_SRR1 ; \ xori r12,r12,MSR_LE ; \ mtspr SPRN_SRR1,r12 ; \ - rfid ; /* return to userspace */ \ + RFI_TO_USER ; /* return to userspace */ \ b . ; /* prevent speculative execution */ #if defined(CONFIG_RELOCATABLE) @@ -507,7 +507,7 @@ BEGIN_FTR_SECTION LOAD_HANDLER(r12, machine_check_handle_early) 1: mtspr SPRN_SRR0,r12 mtspr SPRN_SRR1,r11 - rfid + RFI_TO_KERNEL b . /* prevent speculative execution */ 2: /* Stack overflow. Stay on emergency stack and panic. @@ -601,7 +601,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR) ld r11,PACA_EXGEN+EX_R11(r13) ld r12,PACA_EXGEN+EX_R12(r13) ld r13,PACA_EXGEN+EX_R13(r13) - HRFID + HRFI_TO_UNKNOWN b . #endif @@ -666,7 +666,7 @@ masked_##_H##interrupt: \ ld r10,PACA_EXGEN+EX_R10(r13); \ ld r11,PACA_EXGEN+EX_R11(r13); \ GET_SCRATCH0(r13); \ - ##_H##rfid; \ + ##_H##RFI_TO_KERNEL; \ b . MASKED_INTERRUPT() @@ -756,7 +756,7 @@ kvmppc_skip_interrupt: addi r13, r13, 4 mtspr SPRN_SRR0, r13 GET_SCRATCH0(r13) - rfid + RFI_TO_KERNEL b . kvmppc_skip_Hinterrupt: @@ -768,7 +768,7 @@ kvmppc_skip_Hinterrupt: addi r13, r13, 4 mtspr SPRN_HSRR0, r13 GET_SCRATCH0(r13) - hrfid + HRFI_TO_KERNEL b . #endif @@ -1439,7 +1439,7 @@ machine_check_handle_early: li r3,MSR_ME andc r10,r10,r3 /* Turn off MSR_ME */ mtspr SPRN_SRR1,r10 - rfid + RFI_TO_KERNEL b . 2: /* @@ -1457,7 +1457,7 @@ machine_check_handle_early: */ bl machine_check_queue_event MACHINE_CHECK_HANDLER_WINDUP - rfid + RFI_TO_USER_OR_KERNEL 9: /* Deliver the machine check to host kernel in V mode. */ MACHINE_CHECK_HANDLER_WINDUP diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index ffab9269bfe4..4463718ae614 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -64,7 +64,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline) mtmsrd r0,1 /* clear RI in MSR */ mtsrr0 r5 mtsrr1 r6 - RFI + RFI_TO_KERNEL kvmppc_call_hv_entry: ld r4, HSTATE_KVM_VCPU(r13) @@ -170,7 +170,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) mtsrr0 r8 mtsrr1 r7 beq cr1, 13f /* machine check */ - RFI + RFI_TO_KERNEL /* On POWER7, we have external interrupts set to use HSRR0/1 */ 11: mtspr SPRN_HSRR0, r8 @@ -965,8 +965,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r0, VCPU_GPR(R0)(r4) ld r4, VCPU_GPR(R4)(r4) - - hrfid + HRFI_TO_GUEST b . 
secondary_too_late: diff --git a/arch/powerpc/kvm/book3s_rmhandlers.S b/arch/powerpc/kvm/book3s_rmhandlers.S index 16c4d88ba27d..a328f99a887c 100644 --- a/arch/powerpc/kvm/book3s_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_rmhandlers.S @@ -46,6 +46,9 @@ #define FUNC(name) name +#define RFI_TO_KERNEL RFI +#define RFI_TO_GUEST RFI + .macro INTERRUPT_TRAMPOLINE intno .global kvmppc_trampoline_\intno @@ -141,7 +144,7 @@ kvmppc_handler_skip_ins: GET_SCRATCH0(r13) /* And get back into the code */ - RFI + RFI_TO_KERNEL #endif /* @@ -164,6 +167,6 @@ _GLOBAL_TOC(kvmppc_entry_trampoline) ori r5, r5, MSR_EE mtsrr0 r7 mtsrr1 r6 - RFI + RFI_TO_KERNEL #include "book3s_segment.S" diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index ca8f174289bb..7c982956d709 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -156,7 +156,7 @@ no_dcbz32_on: PPC_LL r9, SVCPU_R9(r3) PPC_LL r3, (SVCPU_R3)(r3) - RFI + RFI_TO_GUEST kvmppc_handler_trampoline_enter_end: @@ -389,5 +389,5 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE) cmpwi r12, BOOK3S_INTERRUPT_DOORBELL beqa BOOK3S_INTERRUPT_DOORBELL - RFI + RFI_TO_KERNEL kvmppc_handler_trampoline_exit_end: -- cgit v1.2.3 From 9bfecafe84e628c5dff9cbeaa4b6e73560adb925 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64: Convert fast_exception_return to use RFI_TO_USER/KERNEL commit a08f828cf47e6c605af21d2cdec68f84e799c318 upstream. Similar to the syscall return path, in fast_exception_return we may be returning to user or kernel context. We already have a test for that, because we conditionally restore r13. So use that existing test and branch, and bifurcate the return based on that. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/entry_64.S | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index fb2da020e0ee..8efd8c402cf7 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -891,7 +891,7 @@ BEGIN_FTR_SECTION END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ACCOUNT_CPU_USER_EXIT(r2, r4) REST_GPR(13, r1) -1: + mtspr SPRN_SRR1,r3 ld r2,_CCR(r1) @@ -904,8 +904,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r3,GPR3(r1) ld r4,GPR4(r1) ld r1,GPR1(r1) + RFI_TO_USER + b . /* prevent speculative execution */ + +1: mtspr SPRN_SRR1,r3 + + ld r2,_CCR(r1) + mtcrf 0xFF,r2 + ld r2,_NIP(r1) + mtspr SPRN_SRR0,r2 - rfid + ld r0,GPR0(r1) + ld r2,GPR2(r1) + ld r3,GPR3(r1) + ld r4,GPR4(r1) + ld r1,GPR1(r1) + RFI_TO_KERNEL b . /* prevent speculative execution */ #endif /* CONFIG_PPC_BOOK3E */ -- cgit v1.2.3 From 8dd311f1ec740b05c851d65bab9cfdde26e35a8a Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64: Convert the syscall exit path to use RFI_TO_USER/KERNEL commit b8e90cb7bc04a509e821e82ab6ed7a8ef11ba333 upstream. In the syscall exit path we may be returning to user or kernel context. We already have a test for that, because we conditionally restore r13. So use that existing test and branch, and bifurcate the return based on that. 
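The shape of the split, as a rough sketch only (the exact code is in the diff below): the existing "restore r13 only if returning to usermode" test already knows which context we are returning to, so the tail of the return path is duplicated, with only the user-bound copy needing to carry the eventual L1-D flush.

        beq     1f              /* returning to kernel: take the plain exit */
        /* ... restore user context ... */
        RFI_TO_USER
        b       .
1:      /* ... restore kernel context ... */
        RFI_TO_KERNEL
        b       .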
Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/entry_64.S | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 8efd8c402cf7..2837232bbffb 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -230,13 +230,23 @@ END_FTR_SECTION_IFCLR(CPU_FTR_STCX_CHECKS_ADDRESS) ACCOUNT_CPU_USER_EXIT(r11, r12) HMT_MEDIUM_LOW_HAS_PPR ld r13,GPR13(r1) /* only restore r13 if returning to usermode */ + ld r2,GPR2(r1) + ld r1,GPR1(r1) + mtlr r4 + mtcr r5 + mtspr SPRN_SRR0,r7 + mtspr SPRN_SRR1,r8 + RFI_TO_USER + b . /* prevent speculative execution */ + + /* exit to kernel */ 1: ld r2,GPR2(r1) ld r1,GPR1(r1) mtlr r4 mtcr r5 mtspr SPRN_SRR0,r7 mtspr SPRN_SRR1,r8 - RFI + RFI_TO_KERNEL b . /* prevent speculative execution */ syscall_error: -- cgit v1.2.3 From 973439da1137a066f6b3f478c930edff1879dee2 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64s: Convert slb_miss_common to use RFI_TO_USER/KERNEL commit c7305645eb0c1621351cfc104038831ae87c0053 upstream. In the SLB miss handler we may be returning to user or kernel. We need to add a check early on and save the result in the cr4 register, and then we bifurcate the return path based on that. Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Signed-off-by: Nicholas Piggin [mpe: Backport to 4.4 based on patch from Balbir] Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/exceptions-64s.S | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index e3a3b81df363..2c6494351604 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1503,6 +1503,8 @@ slb_miss_realmode: andi. r10,r12,MSR_RI /* check for unrecoverable exception */ beq- 2f + andi. r10,r12,MSR_PR /* check for user mode (PR != 0) */ + bne 1f .machine push .machine "power4" @@ -1516,7 +1518,23 @@ slb_miss_realmode: ld r11,PACA_EXSLB+EX_R11(r13) ld r12,PACA_EXSLB+EX_R12(r13) ld r13,PACA_EXSLB+EX_R13(r13) - rfid + RFI_TO_KERNEL + b . /* prevent speculative execution */ + +1: +.machine push +.machine "power4" + mtcrf 0x80,r9 + mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ +.machine pop + + RESTORE_PPR_PACA(PACA_EXSLB, r9) + ld r9,PACA_EXSLB+EX_R9(r13) + ld r10,PACA_EXSLB+EX_R10(r13) + ld r11,PACA_EXSLB+EX_R11(r13) + ld r12,PACA_EXSLB+EX_R12(r13) + ld r13,PACA_EXSLB+EX_R13(r13) + RFI_TO_USER b . /* prevent speculative execution */ 2: mfspr r11,SPRN_SRR0 @@ -1525,7 +1543,7 @@ slb_miss_realmode: mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) mtspr SPRN_SRR1,r10 - rfid + RFI_TO_KERNEL b . unrecov_slb: -- cgit v1.2.3 From c3892946315effa323954134c2f8aeda51e9e68b Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64s: Add support for RFI flush of L1-D cache commit aa8a5e0062ac940f7659394f4817c948dc8c0667 upstream. On some CPUs we can prevent the Meltdown vulnerability by flushing the L1-D cache on exit from kernel to user mode, and from hypervisor to guest. This is known to be the case on at least Power7, Power8 and Power9. At this time we do not know the status of the vulnerability on other CPUs such as the 970 (Apple G5), pasemi CPUs (AmigaOne X1000) or Freescale CPUs. 
As more information comes to light we can enable this, or other mechanisms on those CPUs. The vulnerability occurs when the load of an architecturally inaccessible memory region (eg. userspace load of kernel memory) is speculatively executed to the point where its result can influence the address of a subsequent speculatively executed load. In order for that to happen, the first load must hit in the L1, because before the load is sent to the L2 the permission check is performed. Therefore if no kernel addresses hit in the L1 the vulnerability can not occur. We can ensure that is the case by flushing the L1 whenever we return to userspace. Similarly for hypervisor vs guest. In order to flush the L1-D cache on exit, we add a section of nops at each (h)rfi location that returns to a lower privileged context, and patch that with some sequence. Newer firmwares are able to advertise to us that there is a special nop instruction that flushes the L1-D. If we do not see that advertised, we fall back to doing a displacement flush in software. For guest kernels we support migration between some CPU versions, and different CPUs may use different flush instructions. So that we are prepared to migrate to a machine with a different flush instruction activated, we may have to patch more than one flush instruction at boot if the hypervisor tells us to. In the end this patch is mostly the work of Nicholas Piggin and Michael Ellerman. However a cast of thousands contributed to analysis of the issue, earlier versions of the patch, back ports testing etc. Many thanks to all of them. Tested-by: Jon Masters Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman [Balbir - back ported to stable with changes] Signed-off-by: Balbir Singh Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/exception-64s.h | 40 +++++++++++--- arch/powerpc/include/asm/feature-fixups.h | 15 ++++++ arch/powerpc/include/asm/paca.h | 10 ++++ arch/powerpc/include/asm/setup.h | 13 +++++ arch/powerpc/kernel/asm-offsets.c | 4 ++ arch/powerpc/kernel/exceptions-64s.S | 86 +++++++++++++++++++++++++++++++ arch/powerpc/kernel/setup_64.c | 79 ++++++++++++++++++++++++++++ arch/powerpc/kernel/vmlinux.lds.S | 9 ++++ arch/powerpc/lib/feature-fixups.c | 42 +++++++++++++++ 9 files changed, 290 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h index 6dfb6e928f81..9bddbec441b8 100644 --- a/arch/powerpc/include/asm/exception-64s.h +++ b/arch/powerpc/include/asm/exception-64s.h @@ -50,34 +50,58 @@ #define EX_PPR 88 /* SMT thread status register (priority) */ #define EX_CTR 96 -/* Macros for annotating the expected destination of (h)rfid */ +/* + * Macros for annotating the expected destination of (h)rfid + * + * The nop instructions allow us to insert one or more instructions to flush the + * L1-D cache when returning to userspace or a guest. 
+ */ +#define RFI_FLUSH_SLOT \ + RFI_FLUSH_FIXUP_SECTION; \ + nop; \ + nop; \ + nop #define RFI_TO_KERNEL \ rfid #define RFI_TO_USER \ - rfid + RFI_FLUSH_SLOT; \ + rfid; \ + b rfi_flush_fallback #define RFI_TO_USER_OR_KERNEL \ - rfid + RFI_FLUSH_SLOT; \ + rfid; \ + b rfi_flush_fallback #define RFI_TO_GUEST \ - rfid + RFI_FLUSH_SLOT; \ + rfid; \ + b rfi_flush_fallback #define HRFI_TO_KERNEL \ hrfid #define HRFI_TO_USER \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #define HRFI_TO_USER_OR_KERNEL \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #define HRFI_TO_GUEST \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #define HRFI_TO_UNKNOWN \ - hrfid + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback #ifdef CONFIG_RELOCATABLE #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \ diff --git a/arch/powerpc/include/asm/feature-fixups.h b/arch/powerpc/include/asm/feature-fixups.h index 9a67a38bf7b9..7068bafbb2d6 100644 --- a/arch/powerpc/include/asm/feature-fixups.h +++ b/arch/powerpc/include/asm/feature-fixups.h @@ -184,4 +184,19 @@ label##3: \ FTR_ENTRY_OFFSET label##1b-label##3b; \ .popsection; +#define RFI_FLUSH_FIXUP_SECTION \ +951: \ + .pushsection __rfi_flush_fixup,"a"; \ + .align 2; \ +952: \ + FTR_ENTRY_OFFSET 951b-952b; \ + .popsection; + + +#ifndef __ASSEMBLY__ + +extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup; + +#endif + #endif /* __ASM_POWERPC_FEATURE_FIXUPS_H */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 70bd4381f8e6..45e2aefece16 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -192,6 +192,16 @@ struct paca_struct { #endif struct kvmppc_host_state kvm_hstate; #endif +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * rfi fallback flush must be in its own cacheline to prevent + * other paca data leaking into the L1d + */ + u64 exrfi[13] __aligned(0x80); + void *rfi_flush_fallback_area; + u64 l1d_flush_congruence; + u64 l1d_flush_sets; +#endif }; extern struct paca_struct *paca; diff --git a/arch/powerpc/include/asm/setup.h b/arch/powerpc/include/asm/setup.h index e9d384cbd021..7916b56f2e60 100644 --- a/arch/powerpc/include/asm/setup.h +++ b/arch/powerpc/include/asm/setup.h @@ -26,6 +26,19 @@ void initmem_init(void); void setup_panic(void); #define ARCH_PANIC_TIMEOUT 180 +void rfi_flush_enable(bool enable); + +/* These are bit flags */ +enum l1d_flush_type { + L1D_FLUSH_NONE = 0x1, + L1D_FLUSH_FALLBACK = 0x2, + L1D_FLUSH_ORI = 0x4, + L1D_FLUSH_MTTRIG = 0x8, +}; + +void __init setup_rfi_flush(enum l1d_flush_type, bool enable); +void do_rfi_flush_fixups(enum l1d_flush_type types); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_SETUP_H */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 40da69163d51..d92705e3a0c1 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -243,6 +243,10 @@ int main(void) #ifdef CONFIG_PPC_BOOK3S_64 DEFINE(PACAMCEMERGSP, offsetof(struct paca_struct, mc_emergency_sp)); DEFINE(PACA_IN_MCE, offsetof(struct paca_struct, in_mce)); + DEFINE(PACA_RFI_FLUSH_FALLBACK_AREA, offsetof(struct paca_struct, rfi_flush_fallback_area)); + DEFINE(PACA_EXRFI, offsetof(struct paca_struct, exrfi)); + DEFINE(PACA_L1D_FLUSH_CONGRUENCE, offsetof(struct paca_struct, l1d_flush_congruence)); + DEFINE(PACA_L1D_FLUSH_SETS, offsetof(struct paca_struct, l1d_flush_sets)); #endif DEFINE(PACAHWCPUID, offsetof(struct paca_struct, hw_cpu_id)); DEFINE(PACAKEXECSTATE, offsetof(struct 
paca_struct, kexec_state)); diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 2c6494351604..938a30fef031 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1564,6 +1564,92 @@ power4_fixup_nap: blr #endif + .globl rfi_flush_fallback +rfi_flush_fallback: + SET_SCRATCH0(r13); + GET_PACA(r13); + std r9,PACA_EXRFI+EX_R9(r13) + std r10,PACA_EXRFI+EX_R10(r13) + std r11,PACA_EXRFI+EX_R11(r13) + std r12,PACA_EXRFI+EX_R12(r13) + std r8,PACA_EXRFI+EX_R13(r13) + mfctr r9 + ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13) + ld r11,PACA_L1D_FLUSH_SETS(r13) + ld r12,PACA_L1D_FLUSH_CONGRUENCE(r13) + /* + * The load adresses are at staggered offsets within cachelines, + * which suits some pipelines better (on others it should not + * hurt). + */ + addi r12,r12,8 + mtctr r11 + DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */ + + /* order ld/st prior to dcbt stop all streams with flushing */ + sync +1: li r8,0 + .rept 8 /* 8-way set associative */ + ldx r11,r10,r8 + add r8,r8,r12 + xor r11,r11,r11 // Ensure r11 is 0 even if fallback area is not + add r8,r8,r11 // Add 0, this creates a dependency on the ldx + .endr + addi r10,r10,128 /* 128 byte cache line */ + bdnz 1b + + mtctr r9 + ld r9,PACA_EXRFI+EX_R9(r13) + ld r10,PACA_EXRFI+EX_R10(r13) + ld r11,PACA_EXRFI+EX_R11(r13) + ld r12,PACA_EXRFI+EX_R12(r13) + ld r8,PACA_EXRFI+EX_R13(r13) + GET_SCRATCH0(r13); + rfid + + .globl hrfi_flush_fallback +hrfi_flush_fallback: + SET_SCRATCH0(r13); + GET_PACA(r13); + std r9,PACA_EXRFI+EX_R9(r13) + std r10,PACA_EXRFI+EX_R10(r13) + std r11,PACA_EXRFI+EX_R11(r13) + std r12,PACA_EXRFI+EX_R12(r13) + std r8,PACA_EXRFI+EX_R13(r13) + mfctr r9 + ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13) + ld r11,PACA_L1D_FLUSH_SETS(r13) + ld r12,PACA_L1D_FLUSH_CONGRUENCE(r13) + /* + * The load adresses are at staggered offsets within cachelines, + * which suits some pipelines better (on others it should not + * hurt). + */ + addi r12,r12,8 + mtctr r11 + DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */ + + /* order ld/st prior to dcbt stop all streams with flushing */ + sync +1: li r8,0 + .rept 8 /* 8-way set associative */ + ldx r11,r10,r8 + add r8,r8,r12 + xor r11,r11,r11 // Ensure r11 is 0 even if fallback area is not + add r8,r8,r11 // Add 0, this creates a dependency on the ldx + .endr + addi r10,r10,128 /* 128 byte cache line */ + bdnz 1b + + mtctr r9 + ld r9,PACA_EXRFI+EX_R9(r13) + ld r10,PACA_EXRFI+EX_R10(r13) + ld r11,PACA_EXRFI+EX_R11(r13) + ld r12,PACA_EXRFI+EX_R12(r13) + ld r8,PACA_EXRFI+EX_R13(r13) + GET_SCRATCH0(r13); + hrfid + /* * Hash table stuff */ diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index a20823210ac0..1a3bd6a35c44 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -834,4 +834,83 @@ static int __init disable_hardlockup_detector(void) return 0; } early_initcall(disable_hardlockup_detector); + +#ifdef CONFIG_PPC_BOOK3S_64 +static enum l1d_flush_type enabled_flush_types; +static void *l1d_flush_fallback_area; +bool rfi_flush; + +static void do_nothing(void *unused) +{ + /* + * We don't need to do the flush explicitly, just enter+exit kernel is + * sufficient, the RFI exit handlers will do the right thing. 
+ */ +} + +void rfi_flush_enable(bool enable) +{ + if (rfi_flush == enable) + return; + + if (enable) { + do_rfi_flush_fixups(enabled_flush_types); + on_each_cpu(do_nothing, NULL, 1); + } else + do_rfi_flush_fixups(L1D_FLUSH_NONE); + + rfi_flush = enable; +} + +static void init_fallback_flush(void) +{ + u64 l1d_size, limit; + int cpu; + + l1d_size = ppc64_caches.dsize; + limit = min(safe_stack_limit(), ppc64_rma_size); + + /* + * Align to L1d size, and size it at 2x L1d size, to catch possible + * hardware prefetch runoff. We don't have a recipe for load patterns to + * reliably avoid the prefetcher. + */ + l1d_flush_fallback_area = __va(memblock_alloc_base(l1d_size * 2, l1d_size, limit)); + memset(l1d_flush_fallback_area, 0, l1d_size * 2); + + for_each_possible_cpu(cpu) { + /* + * The fallback flush is currently coded for 8-way + * associativity. Different associativity is possible, but it + * will be treated as 8-way and may not evict the lines as + * effectively. + * + * 128 byte lines are mandatory. + */ + u64 c = l1d_size / 8; + + paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area; + paca[cpu].l1d_flush_congruence = c; + paca[cpu].l1d_flush_sets = c / 128; + } +} + +void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) +{ + if (types & L1D_FLUSH_FALLBACK) { + pr_info("rfi-flush: Using fallback displacement flush\n"); + init_fallback_flush(); + } + + if (types & L1D_FLUSH_ORI) + pr_info("rfi-flush: Using ori type flush\n"); + + if (types & L1D_FLUSH_MTTRIG) + pr_info("rfi-flush: Using mttrig type flush\n"); + + enabled_flush_types = types; + + rfi_flush_enable(enable); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ #endif diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index d41fd0af8980..072a23a17350 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -72,6 +72,15 @@ SECTIONS /* Read-only data */ RODATA +#ifdef CONFIG_PPC64 + . 
= ALIGN(8); + __rfi_flush_fixup : AT(ADDR(__rfi_flush_fixup) - LOAD_OFFSET) { + __start___rfi_flush_fixup = .; + *(__rfi_flush_fixup) + __stop___rfi_flush_fixup = .; + } +#endif + EXCEPTION_TABLE(0) NOTES :kernel :notes diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index 7ce3870d7ddd..a18d648d31a6 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -20,6 +20,7 @@ #include #include #include +#include struct fixup_entry { @@ -113,6 +114,47 @@ void do_feature_fixups(unsigned long value, void *fixup_start, void *fixup_end) } } +#ifdef CONFIG_PPC_BOOK3S_64 +void do_rfi_flush_fixups(enum l1d_flush_type types) +{ + unsigned int instrs[3], *dest; + long *start, *end; + int i; + + start = PTRRELOC(&__start___rfi_flush_fixup), + end = PTRRELOC(&__stop___rfi_flush_fixup); + + instrs[0] = 0x60000000; /* nop */ + instrs[1] = 0x60000000; /* nop */ + instrs[2] = 0x60000000; /* nop */ + + if (types & L1D_FLUSH_FALLBACK) + /* b .+16 to fallback flush */ + instrs[0] = 0x48000010; + + i = 0; + if (types & L1D_FLUSH_ORI) { + instrs[i++] = 0x63ff0000; /* ori 31,31,0 speculation barrier */ + instrs[i++] = 0x63de0000; /* ori 30,30,0 L1d flush*/ + } + + if (types & L1D_FLUSH_MTTRIG) + instrs[i++] = 0x7c12dba6; /* mtspr TRIG2,r0 (SPR #882) */ + + for (i = 0; start < end; start++, i++) { + dest = (void *)start + *start; + + pr_devel("patching dest %lx\n", (unsigned long)dest); + + patch_instruction(dest, instrs[0]); + patch_instruction(dest + 1, instrs[1]); + patch_instruction(dest + 2, instrs[2]); + } + + printk(KERN_DEBUG "rfi-flush: patched %d locations\n", i); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ + void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end) { long *start, *end; -- cgit v1.2.3 From 11c76e64332f0f6f10ea8c2e2612fd4601a3e0d7 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/64s: Support disabling RFI flush with no_rfi_flush and nopti commit bc9c9304a45480797e13a8e1df96ffcf44fb62fe upstream. Because there may be some performance overhead of the RFI flush, add kernel command line options to disable it. We add a sensibly named 'no_rfi_flush' option, but we also hijack the x86 option 'nopti'. The RFI flush is not the same as KPTI, but if we see 'nopti' we can guess that the user is trying to avoid any overhead of Meltdown mitigations, and it means we don't have to educate every one about a different command line option. Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/setup_64.c | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 1a3bd6a35c44..7db464c2b129 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -838,8 +838,29 @@ early_initcall(disable_hardlockup_detector); #ifdef CONFIG_PPC_BOOK3S_64 static enum l1d_flush_type enabled_flush_types; static void *l1d_flush_fallback_area; +static bool no_rfi_flush; bool rfi_flush; +static int __init handle_no_rfi_flush(char *p) +{ + pr_info("rfi-flush: disabled on command line."); + no_rfi_flush = true; + return 0; +} +early_param("no_rfi_flush", handle_no_rfi_flush); + +/* + * The RFI flush is not KPTI, but because users will see doco that says to use + * nopti we hijack that option here to also disable the RFI flush. 
+ */ +static int __init handle_no_pti(char *p) +{ + pr_info("rfi-flush: disabling due to 'nopti' on command line.\n"); + handle_no_rfi_flush(NULL); + return 0; +} +early_param("nopti", handle_no_pti); + static void do_nothing(void *unused) { /* @@ -910,7 +931,8 @@ void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) enabled_flush_types = types; - rfi_flush_enable(enable); + if (!no_rfi_flush) + rfi_flush_enable(enable); } #endif /* CONFIG_PPC_BOOK3S_64 */ #endif -- cgit v1.2.3 From a46ca307a405edda96daf54a5d8baa6778753e82 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/pseries: Query hypervisor for RFI flush settings commit 8989d56878a7735dfdb234707a2fee6faf631085 upstream. A new hypervisor call is available which tells the guest settings related to the RFI flush. Use it to query the appropriate flush instruction(s), and whether the flush is required. Signed-off-by: Michael Neuling Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/pseries/setup.c | 37 +++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c index 36df46eaba24..dd2545fc9947 100644 --- a/arch/powerpc/platforms/pseries/setup.c +++ b/arch/powerpc/platforms/pseries/setup.c @@ -499,6 +499,39 @@ static void __init find_and_init_phbs(void) of_pci_check_probe_only(); } +static void pseries_setup_rfi_flush(void) +{ + struct h_cpu_char_result result; + enum l1d_flush_type types; + bool enable; + long rc; + + /* Enable by default */ + enable = true; + + rc = plpar_get_cpu_characteristics(&result); + if (rc == H_SUCCESS) { + types = L1D_FLUSH_NONE; + + if (result.character & H_CPU_CHAR_L1D_FLUSH_TRIG2) + types |= L1D_FLUSH_MTTRIG; + if (result.character & H_CPU_CHAR_L1D_FLUSH_ORI30) + types |= L1D_FLUSH_ORI; + + /* Use fallback if nothing set in hcall */ + if (types == L1D_FLUSH_NONE) + types = L1D_FLUSH_FALLBACK; + + if (!(result.behaviour & H_CPU_BEHAV_L1D_FLUSH_PR)) + enable = false; + } else { + /* Default to fallback if case hcall is not available */ + types = L1D_FLUSH_FALLBACK; + } + + setup_rfi_flush(types, enable); +} + static void __init pSeries_setup_arch(void) { set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); @@ -515,7 +548,9 @@ static void __init pSeries_setup_arch(void) fwnmi_init(); - /* By default, only probe PCI (can be overriden by rtas_pci) */ + pseries_setup_rfi_flush(); + + /* By default, only probe PCI (can be overridden by rtas_pci) */ pci_add_flags(PCI_PROBE_ONLY); /* Find and initialize PCI host bridges */ -- cgit v1.2.3 From 95e4f102222aea0c9ff89a5a04c44612d9e400e8 Mon Sep 17 00:00:00 2001 From: Oliver O'Halloran Date: Wed, 10 Jan 2018 03:07:15 +1100 Subject: powerpc/powernv: Check device-tree for RFI flush settings commit 6e032b350cd1fdb830f18f8320ef0e13b4e24094 upstream. New device-tree properties are available which tell the hypervisor settings related to the RFI flush. Use them to determine the appropriate flush instruction to use, and whether the flush is required. 
Signed-off-by: Oliver O'Halloran Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/powernv/setup.c | 50 ++++++++++++++++++++++++++++++++++ 1 file changed, 50 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index f48afc06ba14..30c6b3b7be90 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -35,13 +35,63 @@ #include #include #include +#include +#include #include "powernv.h" +static void pnv_setup_rfi_flush(void) +{ + struct device_node *np, *fw_features; + enum l1d_flush_type type; + int enable; + + /* Default to fallback in case fw-features are not available */ + type = L1D_FLUSH_FALLBACK; + enable = 1; + + np = of_find_node_by_name(NULL, "ibm,opal"); + fw_features = of_get_child_by_name(np, "fw-features"); + of_node_put(np); + + if (fw_features) { + np = of_get_child_by_name(fw_features, "inst-l1d-flush-trig2"); + if (np && of_property_read_bool(np, "enabled")) + type = L1D_FLUSH_MTTRIG; + + of_node_put(np); + + np = of_get_child_by_name(fw_features, "inst-l1d-flush-ori30,30,0"); + if (np && of_property_read_bool(np, "enabled")) + type = L1D_FLUSH_ORI; + + of_node_put(np); + + /* Enable unless firmware says NOT to */ + enable = 2; + np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-hv-1-to-0"); + if (np && of_property_read_bool(np, "disabled")) + enable--; + + of_node_put(np); + + np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-pr-0-to-1"); + if (np && of_property_read_bool(np, "disabled")) + enable--; + + of_node_put(np); + of_node_put(fw_features); + } + + setup_rfi_flush(type, enable > 0); +} + static void __init pnv_setup_arch(void) { set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); + pnv_setup_rfi_flush(); + /* Initialize SMP */ pnv_smp_init(); -- cgit v1.2.3 From 1e8014e74b141979f0cf65bfabe9a077879b11a1 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 16 Jan 2018 21:20:05 +1100 Subject: powerpc/64s: Wire up cpu_show_meltdown() commit fd6e440f20b1a4304553775fc55938848ff617c9 upstream. The recent commit 87590ce6e373 ("sysfs/cpu: Add vulnerability folder") added a generic folder and set of files for reporting information on CPU vulnerabilities. One of those was for meltdown: /sys/devices/system/cpu/vulnerabilities/meltdown This commit wires up that file for 64-bit Book3S powerpc. For now we default to "Vulnerable" unless the RFI flush is enabled. That may not actually be true on all hardware, further patches will refine the reporting based on the CPU/platform etc. But for now we default to being pessimists. 
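As a quick illustration of how the new file is consumed (not part of the patch; it only assumes standard libc and the sysfs path named above), a user space check could look like:

  #include <stdio.h>

  int main(void)
  {
          char buf[64];
          FILE *f = fopen("/sys/devices/system/cpu/vulnerabilities/meltdown", "r");

          if (!f) {
                  perror("meltdown");
                  return 1;
          }
          if (fgets(buf, sizeof(buf), f))
                  printf("meltdown: %s", buf);   /* e.g. "Mitigation: RFI Flush" */
          fclose(f);
          return 0;
  }
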
Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/Kconfig | 1 + arch/powerpc/kernel/setup_64.c | 8 ++++++++ 2 files changed, 9 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 0c26025e18f9..58a1fa979655 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -136,6 +136,7 @@ config PPC select GENERIC_SMP_IDLE_THREAD select GENERIC_CMOS_UPDATE select GENERIC_TIME_VSYSCALL_OLD + select GENERIC_CPU_VULNERABILITIES if PPC_BOOK3S_64 select GENERIC_CLOCKEVENTS select GENERIC_CLOCKEVENTS_BROADCAST if SMP select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 7db464c2b129..cd7bfec17654 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -934,5 +934,13 @@ void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) if (!no_rfi_flush) rfi_flush_enable(enable); } + +ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) +{ + if (rfi_flush) + return sprintf(buf, "Mitigation: RFI Flush\n"); + + return sprintf(buf, "Vulnerable\n"); +} #endif /* CONFIG_PPC_BOOK3S_64 */ #endif -- cgit v1.2.3 From b074e0bd527686da77d4c7efbe77ecc52c470234 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Tue, 16 Jan 2018 22:17:18 +1100 Subject: powerpc/64s: Allow control of RFI flush via debugfs commit 236003e6b5443c45c18e613d2b0d776a9f87540e upstream. Expose the state of the RFI flush (enabled/disabled) via debugfs, and allow it to be enabled/disabled at runtime. eg: $ cat /sys/kernel/debug/powerpc/rfi_flush 1 $ echo 0 > /sys/kernel/debug/powerpc/rfi_flush $ cat /sys/kernel/debug/powerpc/rfi_flush 0 Signed-off-by: Michael Ellerman Reviewed-by: Nicholas Piggin Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/setup_64.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) (limited to 'arch') diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index cd7bfec17654..df4a87eb8da4 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -935,6 +936,35 @@ void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) rfi_flush_enable(enable); } +#ifdef CONFIG_DEBUG_FS +static int rfi_flush_set(void *data, u64 val) +{ + if (val == 1) + rfi_flush_enable(true); + else if (val == 0) + rfi_flush_enable(false); + else + return -EINVAL; + + return 0; +} + +static int rfi_flush_get(void *data, u64 *val) +{ + *val = rfi_flush ? 1 : 0; + return 0; +} + +DEFINE_SIMPLE_ATTRIBUTE(fops_rfi_flush, rfi_flush_get, rfi_flush_set, "%llu\n"); + +static __init int rfi_flush_debugfs_init(void) +{ + debugfs_create_file("rfi_flush", 0600, powerpc_debugfs_root, NULL, &fops_rfi_flush); + return 0; +} +device_initcall(rfi_flush_debugfs_init); +#endif + ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, char *buf) { if (rfi_flush) -- cgit v1.2.3 From 69f9dc4b71a70a2bbc23eaa57525df3ecdc881a3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 28 Sep 2017 16:58:26 -0500 Subject: x86/asm: Fix inline asm call constraints for GCC 4.4 commit 520a13c530aeb5f63e011d668c42db1af19ed349 upstream. The kernel test bot (run by Xiaolong Ye) reported that the following commit: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") is causing double faults in a kernel compiled with GCC 4.4. 
Linus subsequently diagnosed the crash pattern and the buggy commit and found that the issue is with this code: register unsigned int __asm_call_sp asm("esp"); #define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp) Even on a 64-bit kernel, it's using ESP instead of RSP. That causes GCC to produce the following bogus code: ffffffff8147461d: 89 e0 mov %esp,%eax ffffffff8147461f: 4c 89 f7 mov %r14,%rdi ffffffff81474622: 4c 89 fe mov %r15,%rsi ffffffff81474625: ba 20 00 00 00 mov $0x20,%edx ffffffff8147462a: 89 c4 mov %eax,%esp ffffffff8147462c: e8 bf 52 05 00 callq ffffffff814c98f0 Despite the absurdity of it backing up and restoring the stack pointer for no reason, the bug is actually the fact that it's only backing up and restoring the lower 32 bits of the stack pointer. The upper 32 bits are getting cleared out, corrupting the stack pointer. So change the '__asm_call_sp' register variable to be associated with the actual full-size stack pointer. This also requires changing the __ASM_SEL() macro to be based on the actual compiled arch size, rather than the CONFIG value, because CONFIG_X86_64 compiles some files with '-m32' (e.g., realmode and vdso). Otherwise Clang fails to build the kernel because it complains about the use of a 64-bit register (RSP) in a 32-bit file. Reported-and-Bisected-and-Tested-by: kernel test robot Diagnosed-by: Linus Torvalds Signed-off-by: Josh Poimboeuf Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Dmitriy Vyukov Cc: LKP Cc: Linus Torvalds Cc: Matthias Kaehlcke Cc: Miguel Bernal Marin Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: f5caf621ee35 ("x86/asm: Fix inline asm call constraints for Clang") Link: http://lkml.kernel.org/r/20170928215826.6sdpmwtkiydiytim@treble Signed-off-by: Ingo Molnar Cc: Matthias Kaehlcke Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/asm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index b9c6c7a6f5a6..1c79c8add0eb 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -11,10 +11,12 @@ # define __ASM_FORM_COMMA(x) " " #x "," #endif -#ifdef CONFIG_X86_32 +#ifndef __x86_64__ +/* 32 bit */ # define __ASM_SEL(a,b) __ASM_FORM(a) # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(a) #else +/* 64 bit */ # define __ASM_SEL(a,b) __ASM_FORM(b) # define __ASM_SEL_RAW(a,b) __ASM_FORM_RAW(b) #endif -- cgit v1.2.3 From 3fe9cdee4205a4876154f469247c7a3176ccaac7 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 18 Dec 2016 17:44:13 +0100 Subject: x86/microcode/AMD: Do not load when running on a hypervisor commit a15a753539eca8ba243d576f02e7ca9c4b7d7042 upstream with minor adjustments. Doing so is completely void of sense for multiple reasons so prevent it. Set dis_ucode_ldr to true and thus disable the microcode loader by default to address xen pv guests which execute the AP path but not the BSP path. By having it turned off by default, the APs won't run into the loader either. Also, check CPUID(1).ECX[31] which hypervisors set. Well almost, not the xen pv one. That one gets the aforementioned "fix". Also, improve the detection method by caching the final decision whether to continue loading in dis_ucode_ldr and do it once on the BSP. The APs then simply test that value. 
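For illustration, the CPUID(1).ECX[31] "running under a hypervisor" test described above can be reproduced from user space with the compiler's <cpuid.h> helper. This is only a sketch of the check; the kernel side uses native_cpuid() as in the diff below, and, as noted, Xen PV guests will not show the bit:

  #include <stdio.h>
  #include <cpuid.h>

  int main(void)
  {
          unsigned int eax, ebx, ecx, edx;

          if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
                  return 1;

          /* ECX bit 31 is reserved for hypervisor use; Xen PV guests are
           * the known exception and do not advertise it. */
          printf("hypervisor bit: %s\n", (ecx & (1u << 31)) ? "set" : "clear");
          return 0;
  }
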
Signed-off-by: Borislav Petkov Tested-by: Juergen Gross Tested-by: Boris Ostrovsky Acked-by: Juergen Gross Link: http://lkml.kernel.org/r/20161218164414.9649-4-bp@alien8.de Signed-off-by: Thomas Gleixner Signed-off-by: Rolf Neugebauer Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/microcode/core.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index b3e94ef461fd..1b3e0aa4c511 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -44,7 +44,7 @@ static struct microcode_ops *microcode_ops; -static bool dis_ucode_ldr; +static bool dis_ucode_ldr = true; static int __init disable_loader(char *str) { @@ -81,6 +81,7 @@ struct cpu_info_ctx { static bool __init check_loader_disabled_bsp(void) { + u32 a, b, c, d; #ifdef CONFIG_X86_32 const char *cmdline = (const char *)__pa_nodebug(boot_command_line); const char *opt = "dis_ucode_ldr"; @@ -93,8 +94,23 @@ static bool __init check_loader_disabled_bsp(void) bool *res = &dis_ucode_ldr; #endif - if (cmdline_find_option_bool(cmdline, option)) - *res = true; + if (!have_cpuid_p()) + return *res; + + a = 1; + c = 0; + native_cpuid(&a, &b, &c, &d); + + /* + * CPUID(1).ECX[31]: reserved for hypervisor use. This is still not + * completely accurate as xen pv guests don't see that CPUID bit set but + * that's good enough as they don't land on the BSP path anyway. + */ + if (c & BIT(31)) + return *res; + + if (cmdline_find_option_bool(cmdline, option) <= 0) + *res = false; return *res; } @@ -126,9 +142,6 @@ void __init load_ucode_bsp(void) if (check_loader_disabled_bsp()) return; - if (!have_cpuid_p()) - return; - vendor = x86_vendor(); family = x86_family(); @@ -162,9 +175,6 @@ void load_ucode_ap(void) if (check_loader_disabled_ap()) return; - if (!have_cpuid_p()) - return; - vendor = x86_vendor(); family = x86_family(); -- cgit v1.2.3 From ba929f5f3c263f3b975a3b95328f66203a57b536 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 12 Oct 2017 13:23:16 +0200 Subject: x86/microcode: Do the family check first commit 1f161f67a272cc4f29f27934dd3f74cb657eb5c4 upstream with adjustments. On CPUs like AMD's Geode, for example, we shouldn't even try to load microcode because they do not support the modern microcode loading interface. However, we do the family check *after* the other checks whether the loader has been disabled on the command line or whether we're running in a guest. So move the family checks first in order to exit early if we're being loaded on an unsupported family. Reported-and-tested-by: Sven Glodowski Signed-off-by: Borislav Petkov Cc: # 4.11.. 
Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://bugzilla.suse.com/show_bug.cgi?id=1061396 Link: http://lkml.kernel.org/r/20171012112316.977-1-bp@alien8.de Signed-off-by: Ingo Molnar Signed-off-by: Rolf Neugebauer Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/microcode/core.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 1b3e0aa4c511..ce5f8a2e7ae6 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -94,9 +94,6 @@ static bool __init check_loader_disabled_bsp(void) bool *res = &dis_ucode_ldr; #endif - if (!have_cpuid_p()) - return *res; - a = 1; c = 0; native_cpuid(&a, &b, &c, &d); @@ -138,8 +135,9 @@ void __init load_ucode_bsp(void) { int vendor; unsigned int family; + bool intel = true; - if (check_loader_disabled_bsp()) + if (!have_cpuid_p()) return; vendor = x86_vendor(); @@ -147,16 +145,27 @@ void __init load_ucode_bsp(void) switch (vendor) { case X86_VENDOR_INTEL: - if (family >= 6) - load_ucode_intel_bsp(); + if (family < 6) + return; break; + case X86_VENDOR_AMD: - if (family >= 0x10) - load_ucode_amd_bsp(family); + if (family < 0x10) + return; + intel = false; break; + default: - break; + return; } + + if (check_loader_disabled_bsp()) + return; + + if (intel) + load_ucode_intel_bsp(); + else + load_ucode_amd_bsp(family); } static bool check_loader_disabled_ap(void) -- cgit v1.2.3 From 5c2ea7f7bb2102a6b8caa057af628d1ee7783e24 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Mon, 15 Jan 2018 14:30:03 +0100 Subject: powerpc/pseries: include linux/types.h in asm/hvcall.h commit 1b689a95ce7427075f9ac9fb4aea1af530742b7f upstream. Commit 6e032b350cd1 ("powerpc/powernv: Check device-tree for RFI flush settings") uses u64 in asm/hvcall.h without including linux/types.h This breaks hvcall.h users that do not include the header themselves. Fixes: 6e032b350cd1 ("powerpc/powernv: Check device-tree for RFI flush settings") Signed-off-by: Michal Suchanek Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/include/asm/hvcall.h | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/powerpc/include/asm/hvcall.h b/arch/powerpc/include/asm/hvcall.h index 51adbde09845..449bbb87c257 100644 --- a/arch/powerpc/include/asm/hvcall.h +++ b/arch/powerpc/include/asm/hvcall.h @@ -298,6 +298,7 @@ #define H_CPU_BEHAV_BNDS_CHK_SPEC_BAR (1ull << 61) // IBM bit 2 #ifndef __ASSEMBLY__ +#include /** * plpar_hcall_norets: - Make a pseries hypervisor call with no return arguments -- cgit v1.2.3 From 9b6cab80102ff73bf099c623c778af48a3684d2e Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 9 Feb 2018 15:21:31 -0800 Subject: x86/kaiser: fix build error with KASAN && !FUNCTION_GRAPH_TRACER MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This is a build fix for the 4.4 PTI backport. 4.4 kernels do not have commit be7635e7287e ("arch, ftrace: for KASAN put hard/soft IRQ entries into separate sections") which went into 4.6. Consequently, the irqentry sections are only created when CONFIG_FUNCTION_GRAPH_TRACER is enabled, not also when CONFIG_KASAN is enabled. Therefore, fix the condition for trying to add a user mapping for this section. 
This fixes the following build error: arch/x86/mm/kaiser.c: In function ‘kaiser_init’: arch/x86/mm/kaiser.c:367:33: error: ‘__irqentry_text_start’ undeclared (first use in this function) kaiser_add_user_map_ptrs_early(__irqentry_text_start, [...] Signed-off-by: Eric Biggers Acked-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/kaiser.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/mm/kaiser.c b/arch/x86/mm/kaiser.c index 2298434f7bdb..7a72e32e4806 100644 --- a/arch/x86/mm/kaiser.c +++ b/arch/x86/mm/kaiser.c @@ -363,7 +363,7 @@ void __init kaiser_init(void) kaiser_add_user_map_ptrs_early(__entry_text_start, __entry_text_end, __PAGE_KERNEL_RX); -#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) +#ifdef CONFIG_FUNCTION_GRAPH_TRACER kaiser_add_user_map_ptrs_early(__irqentry_text_start, __irqentry_text_end, __PAGE_KERNEL_RX); -- cgit v1.2.3 From 4f3e6ab44b928ef70b77aa374886cb59fd9e4171 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 13 Feb 2018 16:45:20 +0100 Subject: kaiser: fix compile error without vsyscall MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Tobias noticed a compile error on 4.4.115, and it's the same on 4.9.80: arch/x86/mm/kaiser.c: In function ‘kaiser_init’: arch/x86/mm/kaiser.c:348:8: error: ‘vsyscall_pgprot’ undeclared (first use in this function) It seems like his combination of kernel options doesn't work for KAISER. X86_VSYSCALL_EMULATION is not set on his system, while LEGACY_VSYSCALL is set to NONE (LEGACY_VSYSCALL_NONE=y). He managed to get things compiling again, by moving the 'extern unsigned long vsyscall_pgprot' outside of the preprocessor statement. This works because the optimizer removes that code (vsyscall_enabled() is always false) - and that's how it was done in some older backports. Reported-by: Tobias Jakobi Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/vsyscall.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index 9ee85066f407..62210da19a92 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -13,7 +13,6 @@ extern void map_vsyscall(void); */ extern bool emulate_vsyscall(struct pt_regs *regs, unsigned long address); extern bool vsyscall_enabled(void); -extern unsigned long vsyscall_pgprot; #else static inline void map_vsyscall(void) {} static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) @@ -22,5 +21,6 @@ static inline bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) } static inline bool vsyscall_enabled(void) { return false; } #endif +extern unsigned long vsyscall_pgprot; #endif /* _ASM_X86_VSYSCALL_H */ -- cgit v1.2.3 From de1ca9ef43de77e507752182c1b5050851a1aa58 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 3 Jan 2018 11:16:25 -0800 Subject: crypto: poly1305 - remove ->setkey() method commit a16e772e664b9a261424107784804cffc8894977 upstream. Since Poly1305 requires a nonce per invocation, the Linux kernel implementations of Poly1305 don't use the crypto API's keying mechanism and instead expect the key and nonce as the first 32 bytes of the data. But ->setkey() is still defined as a stub returning an error code. This prevents Poly1305 from being used through AF_ALG and will also break it completely once we start enforcing that all crypto API users (not just AF_ALG) call ->setkey() if present. 
Fix it by removing crypto_poly1305_setkey(), leaving ->setkey as NULL. Signed-off-by: Eric Biggers Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- arch/x86/crypto/poly1305_glue.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/crypto/poly1305_glue.c b/arch/x86/crypto/poly1305_glue.c index 4264a3d59589..7c064887b783 100644 --- a/arch/x86/crypto/poly1305_glue.c +++ b/arch/x86/crypto/poly1305_glue.c @@ -164,7 +164,6 @@ static struct shash_alg alg = { .init = poly1305_simd_init, .update = poly1305_simd_update, .final = crypto_poly1305_final, - .setkey = crypto_poly1305_setkey, .descsize = sizeof(struct poly1305_simd_desc_ctx), .base = { .cra_name = "poly1305", -- cgit v1.2.3 From b1da6f0262b000b9834728f4f767111882451efc Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 6 Feb 2018 17:56:06 +0000 Subject: arm: KVM: Fix SMCCC handling of unimplemented SMC/HVC calls commit 20e8175d246e9f9deb377f2784b3e7dfb2ad3e86 upstream. KVM doesn't follow the SMCCC when it comes to unimplemented calls, and inject an UNDEF instead of returning an error. Since firmware calls are now used for security mitigation, they are becoming more common, and the undef is counter productive. Instead, let's follow the SMCCC which states that -1 must be returned to the caller when getting an unknown function number. Tested-by: Ard Biesheuvel Signed-off-by: Marc Zyngier Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm/kvm/handle_exit.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/kvm/handle_exit.c b/arch/arm/kvm/handle_exit.c index f36b5b1acd1f..05b2f8294968 100644 --- a/arch/arm/kvm/handle_exit.c +++ b/arch/arm/kvm/handle_exit.c @@ -45,7 +45,7 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) ret = kvm_psci_call(vcpu); if (ret < 0) { - kvm_inject_undefined(vcpu); + vcpu_set_reg(vcpu, 0, ~0UL); return 1; } @@ -54,7 +54,16 @@ static int handle_hvc(struct kvm_vcpu *vcpu, struct kvm_run *run) static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) { - kvm_inject_undefined(vcpu); + /* + * "If an SMC instruction executed at Non-secure EL1 is + * trapped to EL2 because HCR_EL2.TSC is 1, the exception is a + * Trap exception, not a Secure Monitor Call exception [...]" + * + * We need to advance the PC after the trap, as it would + * otherwise return to the same address... + */ + vcpu_set_reg(vcpu, 0, ~0UL); + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); return 1; } -- cgit v1.2.3 From 0e524b26cdcc81a2609d076123e3fd139853eee5 Mon Sep 17 00:00:00 2001 From: Liran Alon Date: Thu, 9 Nov 2017 20:27:20 +0200 Subject: KVM: nVMX: Fix races when sending nested PI while dest enters/leaves L2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 6b6977117f50d60455ace86b2d256f6fb4f3de05 upstream. Consider the following scenario: 1. CPU A calls vmx_deliver_nested_posted_interrupt() to send an IPI to CPU B via virtual posted-interrupt mechanism. 2. CPU B is currently executing L2 guest. 3. vmx_deliver_nested_posted_interrupt() calls kvm_vcpu_trigger_posted_interrupt() which will note that vcpu->mode == IN_GUEST_MODE. 4. Assume that before CPU A sends the physical POSTED_INTR_NESTED_VECTOR IPI, CPU B exits from L2 to L0 during event-delivery (valid IDT-vectoring-info). 5. CPU A now sends the physical IPI. The IPI is received in host and it's handler (smp_kvm_posted_intr_nested_ipi()) does nothing. 6. 
Assume that before CPU A sets pi_pending=true and KVM_REQ_EVENT, CPU B continues to run in L0 and reach vcpu_enter_guest(). As KVM_REQ_EVENT is not set yet, vcpu_enter_guest() will continue and resume L2 guest. 7. At this point, CPU A sets pi_pending=true and KVM_REQ_EVENT but it's too late! CPU B already entered L2 and KVM_REQ_EVENT will only be consumed at next L2 entry! Another scenario to consider: 1. CPU A calls vmx_deliver_nested_posted_interrupt() to send an IPI to CPU B via virtual posted-interrupt mechanism. 2. Assume that before CPU A calls kvm_vcpu_trigger_posted_interrupt(), CPU B is at L0 and is about to resume into L2. Further assume that it is in vcpu_enter_guest() after check for KVM_REQ_EVENT. 3. At this point, CPU A calls kvm_vcpu_trigger_posted_interrupt() which will note that vcpu->mode != IN_GUEST_MODE. Therefore, do nothing and return false. Then, will set pi_pending=true and KVM_REQ_EVENT. 4. Now CPU B continue and resumes into L2 guest without processing the posted-interrupt until next L2 entry! To fix both issues, we just need to change vmx_deliver_nested_posted_interrupt() to set pi_pending=true and KVM_REQ_EVENT before calling kvm_vcpu_trigger_posted_interrupt(). It will fix the first scenario by chaging step (6) to note that KVM_REQ_EVENT and pi_pending=true and therefore process nested posted-interrupt. It will fix the second scenario by two possible ways: 1. If kvm_vcpu_trigger_posted_interrupt() is called while CPU B has changed vcpu->mode to IN_GUEST_MODE, physical IPI will be sent and will be received when CPU resumes into L2. 2. If kvm_vcpu_trigger_posted_interrupt() is called while CPU B hasn't yet changed vcpu->mode to IN_GUEST_MODE, then after CPU B will change vcpu->mode it will call kvm_request_pending() which will return true and therefore force another round of vcpu_enter_guest() which will note that KVM_REQ_EVENT and pi_pending=true and therefore process nested posted-interrupt. Fixes: 705699a13994 ("KVM: nVMX: Enable nested posted interrupt processing") Signed-off-by: Liran Alon Reviewed-by: Nikita Leshenko Reviewed-by: Krish Sadhukhan [Add kvm_vcpu_kick to also handle the case where L1 doesn't intercept L2 HLT and L2 executes HLT instruction. - Paolo] Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f8d785aa2e96..2a1a8737015b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4595,14 +4595,15 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu, if (is_guest_mode(vcpu) && vector == vmx->nested.posted_intr_nv) { - /* the PIR and ON have been set by L1. */ - kvm_vcpu_trigger_posted_interrupt(vcpu); /* * If a posted intr is not recognized by hardware, * we will accomplish it in the next vmentry. */ vmx->nested.pi_pending = true; kvm_make_request(KVM_REQ_EVENT, vcpu); + /* the PIR and ON have been set by L1. */ + if (!kvm_vcpu_trigger_posted_interrupt(vcpu)) + kvm_vcpu_kick(vcpu); return 0; } return -1; -- cgit v1.2.3 From 6471c3d1900f4860e871c4f1d4133a338f588313 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 1 Aug 2017 04:16:47 -0500 Subject: signal/openrisc: Fix do_unaligned_access to send the proper signal commit 500d58300571b6602341b041f97c082a461ef994 upstream. While reviewing the signal sending on openrisc the do_unaligned_access function stood out because it is obviously wrong. 
A comment about an si_code set above when actually si_code is never set. Leading to a random si_code being sent to userspace in the event of an unaligned access. Looking further SIGBUS BUS_ADRALN is the proper pair of signal and si_code to send for an unaligned access. That is what other architectures do and what is required by posix. Given that do_unaligned_access is broken in a way that no one can be relying on it on openrisc fix the code to just do the right thing. Fixes: 769a8a96229e ("OpenRISC: Traps") Cc: Jonas Bonn Cc: Stefan Kristiansson Cc: Stafford Horne Cc: Arnd Bergmann Cc: openrisc@lists.librecores.org Acked-by: Stafford Horne Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- arch/openrisc/kernel/traps.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c index 3d3f6062f49c..605a284922fb 100644 --- a/arch/openrisc/kernel/traps.c +++ b/arch/openrisc/kernel/traps.c @@ -302,12 +302,12 @@ asmlinkage void do_unaligned_access(struct pt_regs *regs, unsigned long address) siginfo_t info; if (user_mode(regs)) { - /* Send a SIGSEGV */ - info.si_signo = SIGSEGV; + /* Send a SIGBUS */ + info.si_signo = SIGBUS; info.si_errno = 0; - /* info.si_code has been set above */ - info.si_addr = (void *)address; - force_sig_info(SIGSEGV, &info, current); + info.si_code = BUS_ADRALN; + info.si_addr = (void __user *)address; + force_sig_info(SIGBUS, &info, current); } else { printk("KERNEL: Unaligned Access 0x%.8lx\n", address); show_registers(regs); -- cgit v1.2.3 From fbc1425f4544371b5fe20b9a93f73b85c08dfe29 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 24 Jul 2017 17:30:30 -0500 Subject: signal/sh: Ensure si_signo is initialized in do_divide_error commit 0e88bb002a9b2ee8cc3cc9478ce2dc126f849696 upstream. Set si_signo. Cc: Yoshinori Sato Cc: Rich Felker Cc: Paul Mundt Cc: linux-sh@vger.kernel.org Fixes: 0983b31849bb ("sh: Wire up division and address error exceptions on SH-2A.") Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- arch/sh/kernel/traps_32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c index ff639342a8be..c5b997757988 100644 --- a/arch/sh/kernel/traps_32.c +++ b/arch/sh/kernel/traps_32.c @@ -607,7 +607,8 @@ asmlinkage void do_divide_error(unsigned long r4) break; } - force_sig_info(SIGFPE, &info, current); + info.si_signo = SIGFPE; + force_sig_info(info.si_signo, &info, current); } #endif -- cgit v1.2.3 From f6d7ea7a180177fcf5c0aabb45dc8e2503a5fd9e Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 2 Jan 2018 14:01:34 -0500 Subject: alpha: fix crash if pthread_create races with signal delivery commit 21ffceda1c8b3807615c40d440d7815e0c85d366 upstream. On alpha, a process will crash if it attempts to start a thread and a signal is delivered at the same time. 
The crash can be reproduced with this program: https://cygwin.com/ml/cygwin/2014-11/msg00473.html The reason for the crash is this: * we call the clone syscall * we go to the function copy_process * copy process calls copy_thread_tls, it is a wrapper around copy_thread * copy_thread sets the tls pointer: childti->pcb.unique = regs->r20 * copy_thread sets regs->r20 to zero * we go back to copy_process * copy process checks "if (signal_pending(current))" and returns -ERESTARTNOINTR * the clone syscall is restarted, but this time, regs->r20 is zero, so the new thread is created with zero tls pointer * the new thread crashes in start_thread when attempting to access tls The comment in the code says that setting the register r20 is some compatibility with OSF/1. But OSF/1 doesn't use the CLONE_SETTLS flag, so we don't have to zero r20 if CLONE_SETTLS is set. This patch fixes the bug by zeroing regs->r20 only if CLONE_SETTLS is not set. Signed-off-by: Mikulas Patocka Signed-off-by: Matt Turner Signed-off-by: Greg Kroah-Hartman --- arch/alpha/kernel/process.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 84d13263ce46..8095fb2c5c94 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -273,12 +273,13 @@ copy_thread(unsigned long clone_flags, unsigned long usp, application calling fork. */ if (clone_flags & CLONE_SETTLS) childti->pcb.unique = regs->r20; + else + regs->r20 = 0; /* OSF/1 has some strange fork() semantics. */ childti->pcb.usp = usp ?: rdusp(); *childregs = *regs; childregs->r0 = 0; childregs->r19 = 0; childregs->r20 = 1; /* OSF/1 has some strange fork() semantics. */ - regs->r20 = 0; stack = ((struct switch_stack *) regs) - 1; *childstack = *stack; childstack->r26 = (unsigned long) ret_from_fork; -- cgit v1.2.3 From 4bd9d85ab3db74527d7db46874a93459e6c0cee2 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Tue, 2 Jan 2018 13:59:54 -0500 Subject: alpha: fix reboot on Avanti platform commit 55fc633c41a08ce9244ff5f528f420b16b1e04d6 upstream. We need to define NEED_SRM_SAVE_RESTORE on the Avanti, otherwise we get machine check exception when attempting to reboot the machine. Signed-off-by: Mikulas Patocka Signed-off-by: Matt Turner Signed-off-by: Greg Kroah-Hartman --- arch/alpha/kernel/pci_impl.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/alpha/kernel/pci_impl.h b/arch/alpha/kernel/pci_impl.h index 2b0ac429f5eb..412bb3c24f36 100644 --- a/arch/alpha/kernel/pci_impl.h +++ b/arch/alpha/kernel/pci_impl.h @@ -143,7 +143,8 @@ struct pci_iommu_arena }; #if defined(CONFIG_ALPHA_SRM) && \ - (defined(CONFIG_ALPHA_CIA) || defined(CONFIG_ALPHA_LCA)) + (defined(CONFIG_ALPHA_CIA) || defined(CONFIG_ALPHA_LCA) || \ + defined(CONFIG_ALPHA_AVANTI)) # define NEED_SRM_SAVE_RESTORE #else # undef NEED_SRM_SAVE_RESTORE -- cgit v1.2.3 From 1c11f153eb35260b3baab5996d7aa9e974dbb5dd Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Fri, 5 Jan 2018 14:27:58 -0800 Subject: xtensa: fix futex_atomic_cmpxchg_inatomic commit ca47480921587ae30417dd234a9f79af188e3666 upstream. Return 0 if the operation was successful, not the userspace memory value. Check that userspace value equals passed oldval, not itself. Don't update *uval if the value wasn't read from userspace memory. This fixes process hang due to infinite loop in futex_lock_pi. It also fixes a bunch of glibc tests nptl/tst-mutexpi*. 
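For reference, the contract the corrected assembly implements can be spelled out in deliberately non-atomic C. This sketch only restates the three rules above and is not a drop-in implementation, since the real routine has to perform the compare-and-store atomically with s32c1i:

  #include <linux/types.h>
  #include <linux/errno.h>
  #include <linux/uaccess.h>

  /* Non-atomic illustration of the expected semantics only. */
  static int futex_cmpxchg_contract(u32 *uval, u32 __user *uaddr,
                                    u32 oldval, u32 newval)
  {
          u32 cur;

          if (get_user(cur, uaddr))            /* fault: *uval stays untouched */
                  return -EFAULT;

          if (cur == oldval && put_user(newval, uaddr))
                  return -EFAULT;

          *uval = cur;     /* report the value actually read from user memory */
          return 0;        /* success is 0, never the user-space value itself */
  }
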
Signed-off-by: Max Filippov Signed-off-by: Greg Kroah-Hartman --- arch/xtensa/include/asm/futex.h | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) (limited to 'arch') diff --git a/arch/xtensa/include/asm/futex.h b/arch/xtensa/include/asm/futex.h index b39531babec0..72bfc1cbc2b5 100644 --- a/arch/xtensa/include/asm/futex.h +++ b/arch/xtensa/include/asm/futex.h @@ -109,7 +109,6 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, u32 newval) { int ret = 0; - u32 prev; if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; @@ -120,26 +119,24 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, __asm__ __volatile__ ( " # futex_atomic_cmpxchg_inatomic\n" - "1: l32i %1, %3, 0\n" - " mov %0, %5\n" - " wsr %1, scompare1\n" - "2: s32c1i %0, %3, 0\n" - "3:\n" + " wsr %5, scompare1\n" + "1: s32c1i %1, %4, 0\n" + " s32i %1, %6, 0\n" + "2:\n" " .section .fixup,\"ax\"\n" " .align 4\n" - "4: .long 3b\n" - "5: l32r %1, 4b\n" - " movi %0, %6\n" + "3: .long 2b\n" + "4: l32r %1, 3b\n" + " movi %0, %7\n" " jx %1\n" " .previous\n" " .section __ex_table,\"a\"\n" - " .long 1b,5b,2b,5b\n" + " .long 1b,4b\n" " .previous\n" - : "+r" (ret), "=&r" (prev), "+m" (*uaddr) - : "r" (uaddr), "r" (oldval), "r" (newval), "I" (-EFAULT) + : "+r" (ret), "+r" (newval), "+m" (*uaddr), "+m" (*uval) + : "r" (uaddr), "r" (oldval), "r" (uval), "I" (-EFAULT) : "memory"); - *uval = prev; return ret; } -- cgit v1.2.3 From 2dab22193943230c4ff63fd9496b2ed5c044e519 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 1 Aug 2017 05:02:38 -0500 Subject: mn10300/misalignment: Use SIGSEGV SEGV_MAPERR to report a failed user copy commit 6ac1dc736b323011a55ecd1fc5897c24c4f77cbd upstream. Setting si_code to 0 is the same a setting si_code to SI_USER which is definitely not correct. With si_code set to SI_USER si_pid and si_uid will be copied to userspace instead of si_addr. Which is very wrong. So fix this by using a sensible si_code (SEGV_MAPERR) for this failure. Fixes: b920de1b77b7 ("mn10300: add the MN10300/AM33 architecture to the kernel") Cc: David Howells Cc: Masakazu Urade Cc: Koichi Yasutake Signed-off-by: "Eric W. Biederman" Signed-off-by: Greg Kroah-Hartman --- arch/mn10300/mm/misalignment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/mn10300/mm/misalignment.c b/arch/mn10300/mm/misalignment.c index b9920b1edd5a..70cef54dc40f 100644 --- a/arch/mn10300/mm/misalignment.c +++ b/arch/mn10300/mm/misalignment.c @@ -437,7 +437,7 @@ transfer_failed: info.si_signo = SIGSEGV; info.si_errno = 0; - info.si_code = 0; + info.si_code = SEGV_MAPERR; info.si_addr = (void *) regs->pc; force_sig_info(SIGSEGV, &info, current); return; -- cgit v1.2.3 From a76abe444968922e8c3743c1c575f7791c0cd8f8 Mon Sep 17 00:00:00 2001 From: Eugene Syromiatnikov Date: Mon, 15 Jan 2018 20:38:17 +0100 Subject: s390: fix handling of -1 in set{,fs}[gu]id16 syscalls commit 6dd0d2d22aa363fec075cb2577ba273ac8462e94 upstream. For some reason, the implementation of some 16-bit ID system calls (namely, setuid16/setgid16 and setfsuid16/setfsgid16) used type cast instead of low2highgid/low2highuid macros for converting [GU]IDs, which led to incorrect handling of value of -1 (which ought to be considered invalid). Discovered by strace test suite. 
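To make the difference concrete: a plain cast turns a 16-bit -1 into 65535, which then looks like a perfectly valid id, while the low2high helpers keep it as the "leave unchanged" sentinel. A sketch of the intended mapping (written from memory; <linux/highuid.h> is authoritative for the exact definitions):

  /* Sketch only -- see <linux/highuid.h> for the real macros. */
  #define low2highuid_sketch(uid) \
          ((u16)(uid) == (u16)-1 ? (uid_t)-1 : (uid_t)(u16)(uid))

  /* (uid_t)(u16)0xffff          == 65535  (plain cast: looks like a valid id) */
  /* low2highuid_sketch(0xffff)  == -1     (preserved as the invalid sentinel) */
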
Cc: stable@vger.kernel.org Signed-off-by: Eugene Syromiatnikov Signed-off-by: Heiko Carstens Signed-off-by: Martin Schwidefsky Signed-off-by: Greg Kroah-Hartman --- arch/s390/kernel/compat_linux.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/s390/kernel/compat_linux.c b/arch/s390/kernel/compat_linux.c index 0176ebc97bfd..86f934255eb6 100644 --- a/arch/s390/kernel/compat_linux.c +++ b/arch/s390/kernel/compat_linux.c @@ -110,7 +110,7 @@ COMPAT_SYSCALL_DEFINE2(s390_setregid16, u16, rgid, u16, egid) COMPAT_SYSCALL_DEFINE1(s390_setgid16, u16, gid) { - return sys_setgid((gid_t)gid); + return sys_setgid(low2highgid(gid)); } COMPAT_SYSCALL_DEFINE2(s390_setreuid16, u16, ruid, u16, euid) @@ -120,7 +120,7 @@ COMPAT_SYSCALL_DEFINE2(s390_setreuid16, u16, ruid, u16, euid) COMPAT_SYSCALL_DEFINE1(s390_setuid16, u16, uid) { - return sys_setuid((uid_t)uid); + return sys_setuid(low2highuid(uid)); } COMPAT_SYSCALL_DEFINE3(s390_setresuid16, u16, ruid, u16, euid, u16, suid) @@ -173,12 +173,12 @@ COMPAT_SYSCALL_DEFINE3(s390_getresgid16, u16 __user *, rgidp, COMPAT_SYSCALL_DEFINE1(s390_setfsuid16, u16, uid) { - return sys_setfsuid((uid_t)uid); + return sys_setfsuid(low2highuid(uid)); } COMPAT_SYSCALL_DEFINE1(s390_setfsgid16, u16, gid) { - return sys_setfsgid((gid_t)gid); + return sys_setfsgid(low2highgid(gid)); } static int groups16_to_user(u16 __user *grouplist, struct group_info *group_info) -- cgit v1.2.3 From 660728b71a630c59ef17c72ace669bbdd666244f Mon Sep 17 00:00:00 2001 From: Patrice Chotard Date: Wed, 10 Jan 2018 09:21:02 +0100 Subject: ARM: dts: STi: Add gpio polarity for "hdmi,hpd-gpio" property commit 7ac1f59c09a61e6af6622df6809e003b0af07f3d upstream. The GPIO polarity is missing in the hdmi,hpd-gpio property, this fixes the following DT warnings: arch/arm/boot/dts/stih410-b2120.dtb: Warning (gpios_property): hdmi,hpd-gpio property size (8) too small for cell size 2 in /soc/sti-display-subsystem/sti-hdmi@8d04000 arch/arm/boot/dts/stih407-b2120.dtb: Warning (gpios_property): hdmi,hpd-gpio property size (8) too small for cell size 2 in /soc/sti-display-subsystem/sti-hdmi@8d04000 arch/arm/boot/dts/stih410-b2260.dtb: Warning (gpios_property): hdmi,hpd-gpio property size (8) too small for cell size 2 in /soc/sti-display-subsystem/sti-hdmi@8d04000 [arnd: marked Cc:stable since this warning shows up with the latest dtc by default, and is more likely to actually cause problems than the other patches from this series] Cc: stable@vger.kernel.org Signed-off-by: Patrice Chotard Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/stih407.dtsi | 3 ++- arch/arm/boot/dts/stih410.dtsi | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/stih407.dtsi b/arch/arm/boot/dts/stih407.dtsi index d60f0d8add26..e4b508ce38a2 100644 --- a/arch/arm/boot/dts/stih407.dtsi +++ b/arch/arm/boot/dts/stih407.dtsi @@ -8,6 +8,7 @@ */ #include "stih407-clock.dtsi" #include "stih407-family.dtsi" +#include / { soc { sti-display-subsystem { @@ -112,7 +113,7 @@ <&clk_s_d2_quadfs 0>, <&clk_s_d2_quadfs 1>; - hdmi,hpd-gpio = <&pio5 3>; + hdmi,hpd-gpio = <&pio5 3 GPIO_ACTIVE_LOW>; reset-names = "hdmi"; resets = <&softreset STIH407_HDMI_TX_PHY_SOFTRESET>; ddc = <&hdmiddc>; diff --git a/arch/arm/boot/dts/stih410.dtsi b/arch/arm/boot/dts/stih410.dtsi index 40318869c733..3c32fb8cdcac 100644 --- a/arch/arm/boot/dts/stih410.dtsi +++ b/arch/arm/boot/dts/stih410.dtsi @@ -9,6 +9,7 @@ #include "stih410-clock.dtsi" 
#include "stih407-family.dtsi" #include "stih410-pinctrl.dtsi" +#include / { aliases { bdisp0 = &bdisp0; @@ -203,7 +204,7 @@ <&clk_s_d2_quadfs 0>, <&clk_s_d2_quadfs 1>; - hdmi,hpd-gpio = <&pio5 3>; + hdmi,hpd-gpio = <&pio5 3 GPIO_ACTIVE_LOW>; reset-names = "hdmi"; resets = <&softreset STIH407_HDMI_TX_PHY_SOFTRESET>; ddc = <&hdmiddc>; -- cgit v1.2.3 From f8bc000816ef91783a9fba8507da1e998b198d3c Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 11 Jan 2018 11:28:51 +0530 Subject: arm: spear600: Add missing interrupt-parent of rtc commit 6ffb5b4f248fe53e0361b8cbc2a523b432566442 upstream. The interrupt-parent of rtc was missing, add it. Fixes: 8113ba917dfa ("ARM: SPEAr: DT: Update device nodes") Cc: stable@vger.kernel.org # v3.8+ Reported-by: Arnd Bergmann Signed-off-by: Viresh Kumar Signed-off-by: Olof Johansson Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/spear600.dtsi | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/spear600.dtsi b/arch/arm/boot/dts/spear600.dtsi index 9f60a7b6a42b..bd379034993c 100644 --- a/arch/arm/boot/dts/spear600.dtsi +++ b/arch/arm/boot/dts/spear600.dtsi @@ -194,6 +194,7 @@ rtc@fc900000 { compatible = "st,spear600-rtc"; reg = <0xfc900000 0x1000>; + interrupt-parent = <&vic0>; interrupts = <10>; status = "disabled"; }; -- cgit v1.2.3 From e2756618e9d124152ef2a89bb8966e6992af43b1 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 11 Jan 2018 11:28:52 +0530 Subject: arm: spear13xx: Fix dmas cells commit cdd10409914184c7eee5ae3e11beb890c9c16c61 upstream. The "dmas" cells for the designware DMA controller need to have only 3 properties apart from the phandle: request line, src master and destination master. But the commit 6e8887f60f60 updated it incorrectly while moving from platform code to DT. Fix it. Cc: stable@vger.kernel.org # v3.10+ Fixes: 6e8887f60f60 ("ARM: SPEAr13xx: Pass generic DW DMAC platform data from DT") Reported-by: Arnd Bergmann Signed-off-by: Viresh Kumar Signed-off-by: Olof Johansson Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/spear1340.dtsi | 4 ++-- arch/arm/boot/dts/spear13xx.dtsi | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/spear1340.dtsi b/arch/arm/boot/dts/spear1340.dtsi index df2232d767ed..6361cbfcbe5e 100644 --- a/arch/arm/boot/dts/spear1340.dtsi +++ b/arch/arm/boot/dts/spear1340.dtsi @@ -141,8 +141,8 @@ reg = <0xb4100000 0x1000>; interrupts = <0 105 0x4>; status = "disabled"; - dmas = <&dwdma0 0x600 0 0 1>, /* 0xC << 11 */ - <&dwdma0 0x680 0 1 0>; /* 0xD << 7 */ + dmas = <&dwdma0 12 0 1>, + <&dwdma0 13 1 0>; dma-names = "tx", "rx"; }; diff --git a/arch/arm/boot/dts/spear13xx.dtsi b/arch/arm/boot/dts/spear13xx.dtsi index 14594ce8c18a..8fd8a3328acb 100644 --- a/arch/arm/boot/dts/spear13xx.dtsi +++ b/arch/arm/boot/dts/spear13xx.dtsi @@ -100,7 +100,7 @@ reg = <0xb2800000 0x1000>; interrupts = <0 29 0x4>; status = "disabled"; - dmas = <&dwdma0 0 0 0 0>; + dmas = <&dwdma0 0 0 0>; dma-names = "data"; }; @@ -288,8 +288,8 @@ #size-cells = <0>; interrupts = <0 31 0x4>; status = "disabled"; - dmas = <&dwdma0 0x2000 0 0 0>, /* 0x4 << 11 */ - <&dwdma0 0x0280 0 0 0>; /* 0x5 << 7 */ + dmas = <&dwdma0 4 0 0>, + <&dwdma0 5 0 0>; dma-names = "tx", "rx"; }; -- cgit v1.2.3 From 7f5cb8e97b42ced7d7c99161825010764aed1946 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Thu, 11 Jan 2018 11:28:53 +0530 Subject: arm: spear13xx: Fix spics gpio controller's warning commit f8975cb1b8a36d0839b6365235778dd9df1d04ca upstream. 
This fixes the following warning by also sending the flags argument for gpio controllers: Property 'cs-gpios', cell 6 is not a phandle reference in /ahb/apb/spi@e0100000 Fixes: 8113ba917dfa ("ARM: SPEAr: DT: Update device nodes") Cc: stable@vger.kernel.org # v3.8+ Reported-by: Arnd Bergmann Signed-off-by: Viresh Kumar Signed-off-by: Olof Johansson Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/spear1310-evb.dts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/spear1310-evb.dts b/arch/arm/boot/dts/spear1310-evb.dts index e48857249ce7..3d83992efd90 100644 --- a/arch/arm/boot/dts/spear1310-evb.dts +++ b/arch/arm/boot/dts/spear1310-evb.dts @@ -349,7 +349,7 @@ spi0: spi@e0100000 { status = "okay"; num-cs = <3>; - cs-gpios = <&gpio1 7 0>, <&spics 0>, <&spics 1>; + cs-gpios = <&gpio1 7 0>, <&spics 0 0>, <&spics 1 0>; stmpe610@0 { compatible = "st,stmpe610"; -- cgit v1.2.3 From c63497edc3aa9b7e22dc2aa012d26ec4770b8f2d Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Sat, 10 Feb 2018 23:39:24 +0000 Subject: KVM/x86: Reduce retpoline performance impact in slot_handle_level_range(), by always inlining iterator helper methods MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 928a4c39484281f8ca366f53a1db79330d058401 upstream. With retpoline, tight loops of "call this function for every XXX" are very much pessimised by taking a prediction miss *every* time. This one is by far the biggest contributor to the guest launch time with retpoline. By marking the iterator slot_handle_…() functions always_inline, we can ensure that the indirect function call can be optimised away into a direct call and it actually generates slightly smaller code because some of the other conditionals can get optimised away too. Performance is now pretty close to what we see with nospectre_v2 on the command line. Suggested-by: Linus Torvalds Tested-by: Filippo Sironi Signed-off-by: David Woodhouse Reviewed-by: Filippo Sironi Acked-by: Paolo Bonzini Cc: Andy Lutomirski Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Dan Williams Cc: Dave Hansen Cc: David Woodhouse Cc: Greg Kroah-Hartman Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: arjan.van.de.ven@intel.com Cc: dave.hansen@intel.com Cc: jmattson@google.com Cc: karahmed@amazon.de Cc: kvm@vger.kernel.org Cc: rkrcmar@redhat.com Link: http://lkml.kernel.org/r/1518305967-31356-4-git-send-email-dwmw@amazon.co.uk Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/mmu.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 1049c3c9b877..2b71f2c03b9e 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4503,7 +4503,7 @@ void kvm_mmu_setup(struct kvm_vcpu *vcpu) typedef bool (*slot_level_handler) (struct kvm *kvm, unsigned long *rmap); /* The caller should hold mmu-lock before calling this function. 
*/ -static bool +static __always_inline bool slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn, bool lock_flush_tlb) @@ -4533,7 +4533,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, return flush; } -static bool +static __always_inline bool slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, bool lock_flush_tlb) @@ -4544,7 +4544,7 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, lock_flush_tlb); } -static bool +static __always_inline bool slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { @@ -4552,7 +4552,7 @@ slot_handle_all_level(struct kvm *kvm, struct kvm_memory_slot *memslot, PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); } -static bool +static __always_inline bool slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { @@ -4560,7 +4560,7 @@ slot_handle_large_level(struct kvm *kvm, struct kvm_memory_slot *memslot, PT_MAX_HUGEPAGE_LEVEL, lock_flush_tlb); } -static bool +static __always_inline bool slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, slot_level_handler fn, bool lock_flush_tlb) { -- cgit v1.2.3 From ff891875c1a079c55cb5def525c2448c4cbed98c Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 13 Feb 2018 13:22:08 -0600 Subject: x86/cpu: Change type of x86_cache_size variable to unsigned int commit 24dbc6000f4b9b0ef5a9daecb161f1907733765a upstream. Currently, x86_cache_size is of type int, which makes no sense as we will never have a valid cache size equal or less than 0. So instead of initializing this variable to -1, it can perfectly be initialized to 0 and use it as an unsigned variable instead. Suggested-by: Thomas Gleixner Signed-off-by: Gustavo A. R. Silva Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Addresses-Coverity-ID: 1464429 Link: http://lkml.kernel.org/r/20180213192208.GA26414@embeddedor.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/processor.h | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/microcode/intel.c | 2 +- arch/x86/kernel/cpu/proc.c | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 86bccb4bd4dc..9e77cea2a8ef 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -113,7 +113,7 @@ struct cpuinfo_x86 { char x86_vendor_id[16]; char x86_model_id[64]; /* in KB - valid for CPUS which support this call: */ - int x86_cache_size; + unsigned int x86_cache_size; int x86_cache_alignment; /* In bytes */ /* Cache QoS architectural values: */ int x86_cache_max_rmid; /* max index */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index f7f2ad3687ee..8eabbafff213 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -955,7 +955,7 @@ static void identify_cpu(struct cpuinfo_x86 *c) int i; c->loops_per_jiffy = loops_per_jiffy; - c->x86_cache_size = -1; + c->x86_cache_size = 0; c->x86_vendor = X86_VENDOR_UNKNOWN; c->x86_model = c->x86_mask = 0; /* So far unknown... 
*/ c->x86_vendor_id[0] = '\0'; /* Unset */ diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 2c76a1801393..2f38a99cdb98 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -1075,7 +1075,7 @@ static struct microcode_ops microcode_intel_ops = { static int __init calc_llc_size_per_core(struct cpuinfo_x86 *c) { - u64 llc_size = c->x86_cache_size * 1024; + u64 llc_size = c->x86_cache_size * 1024ULL; do_div(llc_size, c->x86_max_cores); diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 18ca99f2798b..935225c0375f 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -87,8 +87,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) } /* Cache size */ - if (c->x86_cache_size >= 0) - seq_printf(m, "cache size\t: %d KB\n", c->x86_cache_size); + if (c->x86_cache_size) + seq_printf(m, "cache size\t: %u KB\n", c->x86_cache_size); show_cpuinfo_core(m, c, cpu); show_cpuinfo_misc(m, c); -- cgit v1.2.3 From 4bdee1ef5f52afcbd31b8a6331cb63c767031ac6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 10 Jan 2018 15:40:37 +0100 Subject: ARM: pxa/tosa-bt: add MODULE_LICENSE tag commit 3343647813fdf0f2409fbf5816ee3e0622168079 upstream. Without this tag, we get a build warning: WARNING: modpost: missing MODULE_LICENSE() in arch/arm/mach-pxa/tosa-bt.o For completeness, I'm also adding author and description fields. Acked-by: Robert Jarzmik Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-pxa/tosa-bt.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/arm/mach-pxa/tosa-bt.c b/arch/arm/mach-pxa/tosa-bt.c index e0a53208880a..b59a7a2df4e3 100644 --- a/arch/arm/mach-pxa/tosa-bt.c +++ b/arch/arm/mach-pxa/tosa-bt.c @@ -132,3 +132,7 @@ static struct platform_driver tosa_bt_driver = { }, }; module_platform_driver(tosa_bt_driver); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Dmitry Baryshkov"); +MODULE_DESCRIPTION("Bluetooth built-in chip control"); -- cgit v1.2.3 From 193cfa7632a490a86faa3af1969d63c63873d8be Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 10 Jan 2018 17:10:11 +0100 Subject: ARM: dts: s5pv210: add interrupt-parent for ohci commit 5c1037196b9ee75897c211972de370ed1336ec8f upstream. The ohci-hcd node has an interrupt number but no interrupt-parent, leading to a warning with current dtc versions: arch/arm/boot/dts/s5pv210-aquila.dtb: Warning (interrupts_property): Missing interrupt-parent for /soc/ohci@ec300000 arch/arm/boot/dts/s5pv210-goni.dtb: Warning (interrupts_property): Missing interrupt-parent for /soc/ohci@ec300000 arch/arm/boot/dts/s5pv210-smdkc110.dtb: Warning (interrupts_property): Missing interrupt-parent for /soc/ohci@ec300000 arch/arm/boot/dts/s5pv210-smdkv210.dtb: Warning (interrupts_property): Missing interrupt-parent for /soc/ohci@ec300000 arch/arm/boot/dts/s5pv210-torbreck.dtb: Warning (interrupts_property): Missing interrupt-parent for /soc/ohci@ec300000 As seen from the related exynos dts files, the ohci and ehci controllers always share one interrupt number, and the number is the same here as well, so setting the same interrupt-parent is the reasonable solution here. 
Reviewed-by: Krzysztof Kozlowski Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/s5pv210.dtsi | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/s5pv210.dtsi b/arch/arm/boot/dts/s5pv210.dtsi index 8344a0ee2b86..b03fe747b98c 100644 --- a/arch/arm/boot/dts/s5pv210.dtsi +++ b/arch/arm/boot/dts/s5pv210.dtsi @@ -461,6 +461,7 @@ compatible = "samsung,exynos4210-ohci"; reg = <0xec300000 0x100>; interrupts = <23>; + interrupt-parent = <&vic1>; clocks = <&clocks CLK_USB_HOST>; clock-names = "usbhost"; #address-cells = <1>; -- cgit v1.2.3 From 6bfbf2aacb212002e774bb586dd2d0edfd470077 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Dec 2017 16:40:26 -0800 Subject: crypto: x86/twofish-3way - Fix %rbp usage commit d8c7fe9f2a486a6e5f0d5229ca43807af5ab22c6 upstream. Using %rbp as a temporary register breaks frame pointer convention and breaks stack traces when unwinding from an interrupt in the crypto code. In twofish-3way, we can't simply replace %rbp with another register because there are none available. Instead, we use the stack to hold the values that %rbp, %r11, and %r12 were holding previously. Each of these values represents the half of the output from the previous Feistel round that is being passed on unchanged to the following round. They are only used once per round, when they are exchanged with %rax, %rbx, and %rcx. As a result, we free up 3 registers (one per block) and can reassign them so that %rbp is not used, and additionally %r14 and %r15 are not used so they do not need to be saved/restored. There may be a small overhead caused by replacing 'xchg REG, REG' with the needed sequence 'mov MEM, REG; mov REG, MEM; mov REG, REG' once per round. But, counterintuitively, when I tested "ctr-twofish-3way" on a Haswell processor, the new version was actually about 2% faster. (Perhaps 'xchg' is not as well optimized as plain moves.) 
Reported-by: syzbot Signed-off-by: Eric Biggers Reviewed-by: Josh Poimboeuf Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- arch/x86/crypto/twofish-x86_64-asm_64-3way.S | 112 ++++++++++++++------------- 1 file changed, 60 insertions(+), 52 deletions(-) (limited to 'arch') diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S index 1c3b7ceb36d2..e7273a606a07 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S @@ -55,29 +55,31 @@ #define RAB1bl %bl #define RAB2bl %cl +#define CD0 0x0(%rsp) +#define CD1 0x8(%rsp) +#define CD2 0x10(%rsp) + +# used only before/after all rounds #define RCD0 %r8 #define RCD1 %r9 #define RCD2 %r10 -#define RCD0d %r8d -#define RCD1d %r9d -#define RCD2d %r10d - -#define RX0 %rbp -#define RX1 %r11 -#define RX2 %r12 +# used only during rounds +#define RX0 %r8 +#define RX1 %r9 +#define RX2 %r10 -#define RX0d %ebp -#define RX1d %r11d -#define RX2d %r12d +#define RX0d %r8d +#define RX1d %r9d +#define RX2d %r10d -#define RY0 %r13 -#define RY1 %r14 -#define RY2 %r15 +#define RY0 %r11 +#define RY1 %r12 +#define RY2 %r13 -#define RY0d %r13d -#define RY1d %r14d -#define RY2d %r15d +#define RY0d %r11d +#define RY1d %r12d +#define RY2d %r13d #define RT0 %rdx #define RT1 %rsi @@ -85,6 +87,8 @@ #define RT0d %edx #define RT1d %esi +#define RT1bl %sil + #define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \ movzbl ab ## bl, tmp2 ## d; \ movzbl ab ## bh, tmp1 ## d; \ @@ -92,6 +96,11 @@ op1##l T0(CTX, tmp2, 4), dst ## d; \ op2##l T1(CTX, tmp1, 4), dst ## d; +#define swap_ab_with_cd(ab, cd, tmp) \ + movq cd, tmp; \ + movq ab, cd; \ + movq tmp, ab; + /* * Combined G1 & G2 function. Reordered with help of rotates to have moves * at begining. 
@@ -110,15 +119,15 @@ /* G1,2 && G2,2 */ \ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \ - xchgq cd ## 0, ab ## 0; \ + swap_ab_with_cd(ab ## 0, cd ## 0, RT0); \ \ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \ - xchgq cd ## 1, ab ## 1; \ + swap_ab_with_cd(ab ## 1, cd ## 1, RT0); \ \ do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \ do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \ - xchgq cd ## 2, ab ## 2; + swap_ab_with_cd(ab ## 2, cd ## 2, RT0); #define enc_round_end(ab, x, y, n) \ addl y ## d, x ## d; \ @@ -168,6 +177,16 @@ decrypt_round3(ba, dc, (n*2)+1); \ decrypt_round3(ba, dc, (n*2)); +#define push_cd() \ + pushq RCD2; \ + pushq RCD1; \ + pushq RCD0; + +#define pop_cd() \ + popq RCD0; \ + popq RCD1; \ + popq RCD2; + #define inpack3(in, n, xy, m) \ movq 4*(n)(in), xy ## 0; \ xorq w+4*m(CTX), xy ## 0; \ @@ -223,11 +242,8 @@ ENTRY(__twofish_enc_blk_3way) * %rdx: src, RIO * %rcx: bool, if true: xor output */ - pushq %r15; - pushq %r14; pushq %r13; pushq %r12; - pushq %rbp; pushq %rbx; pushq %rcx; /* bool xor */ @@ -235,40 +251,36 @@ ENTRY(__twofish_enc_blk_3way) inpack_enc3(); - encrypt_cycle3(RAB, RCD, 0); - encrypt_cycle3(RAB, RCD, 1); - encrypt_cycle3(RAB, RCD, 2); - encrypt_cycle3(RAB, RCD, 3); - encrypt_cycle3(RAB, RCD, 4); - encrypt_cycle3(RAB, RCD, 5); - encrypt_cycle3(RAB, RCD, 6); - encrypt_cycle3(RAB, RCD, 7); + push_cd(); + encrypt_cycle3(RAB, CD, 0); + encrypt_cycle3(RAB, CD, 1); + encrypt_cycle3(RAB, CD, 2); + encrypt_cycle3(RAB, CD, 3); + encrypt_cycle3(RAB, CD, 4); + encrypt_cycle3(RAB, CD, 5); + encrypt_cycle3(RAB, CD, 6); + encrypt_cycle3(RAB, CD, 7); + pop_cd(); popq RIO; /* dst */ - popq %rbp; /* bool xor */ + popq RT1; /* bool xor */ - testb %bpl, %bpl; + testb RT1bl, RT1bl; jnz .L__enc_xor3; outunpack_enc3(mov); popq %rbx; - popq %rbp; popq %r12; popq %r13; - popq %r14; - popq %r15; ret; .L__enc_xor3: outunpack_enc3(xor); popq %rbx; - popq %rbp; popq %r12; popq %r13; - popq %r14; - popq %r15; ret; ENDPROC(__twofish_enc_blk_3way) @@ -278,35 +290,31 @@ ENTRY(twofish_dec_blk_3way) * %rsi: dst * %rdx: src, RIO */ - pushq %r15; - pushq %r14; pushq %r13; pushq %r12; - pushq %rbp; pushq %rbx; pushq %rsi; /* dst */ inpack_dec3(); - decrypt_cycle3(RAB, RCD, 7); - decrypt_cycle3(RAB, RCD, 6); - decrypt_cycle3(RAB, RCD, 5); - decrypt_cycle3(RAB, RCD, 4); - decrypt_cycle3(RAB, RCD, 3); - decrypt_cycle3(RAB, RCD, 2); - decrypt_cycle3(RAB, RCD, 1); - decrypt_cycle3(RAB, RCD, 0); + push_cd(); + decrypt_cycle3(RAB, CD, 7); + decrypt_cycle3(RAB, CD, 6); + decrypt_cycle3(RAB, CD, 5); + decrypt_cycle3(RAB, CD, 4); + decrypt_cycle3(RAB, CD, 3); + decrypt_cycle3(RAB, CD, 2); + decrypt_cycle3(RAB, CD, 1); + decrypt_cycle3(RAB, CD, 0); + pop_cd(); popq RIO; /* dst */ outunpack_dec3(); popq %rbx; - popq %rbp; popq %r12; popq %r13; - popq %r14; - popq %r15; ret; ENDPROC(twofish_dec_blk_3way) -- cgit v1.2.3 From 9435c32b3763c829abf245b30c28ef3e5087907a Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Wed, 13 Dec 2017 10:46:40 +0100 Subject: KVM: x86: fix escape of guest dr6 to the host MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit efdab992813fb2ed825745625b83c05032e9cda2 upstream. 
syzkaller reported: WARNING: CPU: 0 PID: 12927 at arch/x86/kernel/traps.c:780 do_debug+0x222/0x250 CPU: 0 PID: 12927 Comm: syz-executor Tainted: G OE 4.15.0-rc2+ #16 RIP: 0010:do_debug+0x222/0x250 Call Trace: <#DB> debug+0x3e/0x70 RIP: 0010:copy_user_enhanced_fast_string+0x10/0x20 _copy_from_user+0x5b/0x90 SyS_timer_create+0x33/0x80 entry_SYSCALL_64_fastpath+0x23/0x9a The testcase sets a watchpoint (with perf_event_open) on a buffer that is passed to timer_create() as the struct sigevent argument. In timer_create(), copy_from_user()'s rep movsb triggers the BP. The testcase also sets the debug registers for the guest. However, KVM only restores host debug registers when the host has active watchpoints, which triggers a race condition when running the testcase with multiple threads. The guest's DR6.BS bit can escape to the host before another thread invokes timer_create(), and do_debug() complains. The fix is to respect do_debug()'s dr6 invariant when leaving KVM. Reported-by: Dmitry Vyukov Cc: Paolo Bonzini Cc: Radim Krčmář Cc: David Hildenbrand Cc: Dmitry Vyukov Reviewed-by: David Hildenbrand Signed-off-by: Wanpeng Li Signed-off-by: Paolo Bonzini Signed-off-by: Radim Krčmář Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/x86.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3900d34980de..721c5049b1ad 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2755,6 +2755,12 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); vcpu->arch.last_host_tsc = rdtsc(); + /* + * If userspace has set any breakpoints or watchpoints, dr6 is restored + * on every vmexit, but if not, we might have a stale dr6 from the + * guest. do_debug expects dr6 to be cleared after it runs, do the same. + */ + set_debugreg(0, 6); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, -- cgit v1.2.3 From 5683797eb99d5662ba7c67414ab63f1406fb452b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 10 Jan 2018 22:06:48 +0100 Subject: arm64: dts: add #cooling-cells to CPU nodes commit acbf76ee05067c3942852019993f7beb69a0f45f upstream. dtc complains about the lack of #coolin-cells properties for the CPU nodes that are referred to as "cooling-device": arch/arm64/boot/dts/mediatek/mt8173-evb.dtb: Warning (cooling_device_property): Missing property '#cooling-cells' in node /cpus/cpu@0 or bad phandle (referred from /thermal-zones/cpu_thermal/cooling-maps/map@0:cooling-device[0]) arch/arm64/boot/dts/mediatek/mt8173-evb.dtb: Warning (cooling_device_property): Missing property '#cooling-cells' in node /cpus/cpu@100 or bad phandle (referred from /thermal-zones/cpu_thermal/cooling-maps/map@1:cooling-device[0]) Apparently this property must be '<2>' to match the binding. 
Signed-off-by: Arnd Bergmann Tested-by: Chunfeng Yun Signed-off-by: Olof Johansson [arnd: backported to 4.15] Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- arch/arm64/boot/dts/mediatek/mt8173.dtsi | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/arm64/boot/dts/mediatek/mt8173.dtsi b/arch/arm64/boot/dts/mediatek/mt8173.dtsi index 4dd5f93d0303..7f42b646d528 100644 --- a/arch/arm64/boot/dts/mediatek/mt8173.dtsi +++ b/arch/arm64/boot/dts/mediatek/mt8173.dtsi @@ -54,6 +54,7 @@ reg = <0x000>; enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; + #cooling-cells = <2>; }; cpu1: cpu@1 { @@ -70,6 +71,7 @@ reg = <0x100>; enable-method = "psci"; cpu-idle-states = <&CPU_SLEEP_0>; + #cooling-cells = <2>; }; cpu3: cpu@101 { -- cgit v1.2.3 From 7780d2af34151493afc17101ebce0ada2d6979e2 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Mon, 27 Nov 2017 08:57:26 -0800 Subject: ARM: OMAP2+: Fix SRAM virt to phys translation for save_secure_ram_context [ Upstream commit d09220a887f70368afa79e850c95e74890c0a32d ] With the CMA changes from Joonsoo Kim , it was noticed that n900 stopped booting. After investigating it turned out that n900 save_secure_ram_context does some whacky virtual to physical address translation for the SRAM data address. As we now only have minimal parts of omap3 idle code copied to SRAM, running save_secure_ram_context() in SRAM is not needed. It only gets called on PM init. And it seems there's no need to ever call this from SRAM idle code. So let's just keep save_secure_ram_context() in DDR, and pass it the physical address of the parameters. We can do everything else in omap-secure.c like we already do for other secure code. And since we don't have any documentation, I still have no clue what the values for 0, 1 and 1 for the parameters might be. If somebody has figured it out, please do send a patch to add some comments. 
Debugged-by: Joonsoo Kim Signed-off-by: Tony Lindgren Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-omap2/omap-secure.c | 19 +++++++++++++++++++ arch/arm/mach-omap2/omap-secure.h | 4 ++++ arch/arm/mach-omap2/pm.h | 4 ---- arch/arm/mach-omap2/pm34xx.c | 13 ++++--------- arch/arm/mach-omap2/sleep34xx.S | 26 ++++---------------------- 5 files changed, 31 insertions(+), 35 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-omap2/omap-secure.c b/arch/arm/mach-omap2/omap-secure.c index 5ac122e88f67..9ff92050053c 100644 --- a/arch/arm/mach-omap2/omap-secure.c +++ b/arch/arm/mach-omap2/omap-secure.c @@ -73,6 +73,25 @@ phys_addr_t omap_secure_ram_mempool_base(void) return omap_secure_memblock_base; } +u32 omap3_save_secure_ram(void __iomem *addr, int size) +{ + u32 ret; + u32 param[5]; + + if (size != OMAP3_SAVE_SECURE_RAM_SZ) + return OMAP3_SAVE_SECURE_RAM_SZ; + + param[0] = 4; /* Number of arguments */ + param[1] = __pa(addr); /* Physical address for saving */ + param[2] = 0; + param[3] = 1; + param[4] = 1; + + ret = save_secure_ram_context(__pa(param)); + + return ret; +} + /** * rx51_secure_dispatcher: Routine to dispatch secure PPA API calls * @idx: The PPA API index diff --git a/arch/arm/mach-omap2/omap-secure.h b/arch/arm/mach-omap2/omap-secure.h index af2851fbcdf0..ab6ce2597a88 100644 --- a/arch/arm/mach-omap2/omap-secure.h +++ b/arch/arm/mach-omap2/omap-secure.h @@ -31,6 +31,8 @@ /* Maximum Secure memory storage size */ #define OMAP_SECURE_RAM_STORAGE (88 * SZ_1K) +#define OMAP3_SAVE_SECURE_RAM_SZ 0x803F + /* Secure low power HAL API index */ #define OMAP4_HAL_SAVESECURERAM_INDEX 0x1a #define OMAP4_HAL_SAVEHW_INDEX 0x1b @@ -64,6 +66,8 @@ extern u32 omap_smc2(u32 id, u32 falg, u32 pargs); extern u32 omap_smc3(u32 id, u32 process, u32 flag, u32 pargs); extern phys_addr_t omap_secure_ram_mempool_base(void); extern int omap_secure_ram_reserve_memblock(void); +extern u32 save_secure_ram_context(u32 args_pa); +extern u32 omap3_save_secure_ram(void __iomem *save_regs, int size); extern u32 rx51_secure_dispatcher(u32 idx, u32 process, u32 flag, u32 nargs, u32 arg1, u32 arg2, u32 arg3, u32 arg4); diff --git a/arch/arm/mach-omap2/pm.h b/arch/arm/mach-omap2/pm.h index b668719b9b25..8e30772cfe32 100644 --- a/arch/arm/mach-omap2/pm.h +++ b/arch/arm/mach-omap2/pm.h @@ -81,10 +81,6 @@ extern unsigned int omap3_do_wfi_sz; /* ... and its pointer from SRAM after copy */ extern void (*omap3_do_wfi_sram)(void); -/* save_secure_ram_context function pointer and size, for copy to SRAM */ -extern int save_secure_ram_context(u32 *addr); -extern unsigned int save_secure_ram_context_sz; - extern void omap3_save_scratchpad_contents(void); #define PM_RTA_ERRATUM_i608 (1 << 0) diff --git a/arch/arm/mach-omap2/pm34xx.c b/arch/arm/mach-omap2/pm34xx.c index 2dbd3785ee6f..181da202f981 100644 --- a/arch/arm/mach-omap2/pm34xx.c +++ b/arch/arm/mach-omap2/pm34xx.c @@ -48,6 +48,7 @@ #include "prm3xxx.h" #include "pm.h" #include "sdrc.h" +#include "omap-secure.h" #include "sram.h" #include "control.h" #include "vc.h" @@ -66,7 +67,6 @@ struct power_state { static LIST_HEAD(pwrst_list); -static int (*_omap_save_secure_sram)(u32 *addr); void (*omap3_do_wfi_sram)(void); static struct powerdomain *mpu_pwrdm, *neon_pwrdm; @@ -121,8 +121,8 @@ static void omap3_save_secure_ram_context(void) * will hang the system. 
*/ pwrdm_set_next_pwrst(mpu_pwrdm, PWRDM_POWER_ON); - ret = _omap_save_secure_sram((u32 *)(unsigned long) - __pa(omap3_secure_ram_storage)); + ret = omap3_save_secure_ram(omap3_secure_ram_storage, + OMAP3_SAVE_SECURE_RAM_SZ); pwrdm_set_next_pwrst(mpu_pwrdm, mpu_next_state); /* Following is for error tracking, it should not happen */ if (ret) { @@ -431,15 +431,10 @@ static int __init pwrdms_setup(struct powerdomain *pwrdm, void *unused) * * The minimum set of functions is pushed to SRAM for execution: * - omap3_do_wfi for erratum i581 WA, - * - save_secure_ram_context for security extensions. */ void omap_push_sram_idle(void) { omap3_do_wfi_sram = omap_sram_push(omap3_do_wfi, omap3_do_wfi_sz); - - if (omap_type() != OMAP2_DEVICE_TYPE_GP) - _omap_save_secure_sram = omap_sram_push(save_secure_ram_context, - save_secure_ram_context_sz); } static void __init pm_errata_configure(void) @@ -551,7 +546,7 @@ int __init omap3_pm_init(void) clkdm_add_wkdep(neon_clkdm, mpu_clkdm); if (omap_type() != OMAP2_DEVICE_TYPE_GP) { omap3_secure_ram_storage = - kmalloc(0x803F, GFP_KERNEL); + kmalloc(OMAP3_SAVE_SECURE_RAM_SZ, GFP_KERNEL); if (!omap3_secure_ram_storage) pr_err("Memory allocation failed when allocating for secure sram context\n"); diff --git a/arch/arm/mach-omap2/sleep34xx.S b/arch/arm/mach-omap2/sleep34xx.S index 1b9f0520dea9..3e0d802c59da 100644 --- a/arch/arm/mach-omap2/sleep34xx.S +++ b/arch/arm/mach-omap2/sleep34xx.S @@ -93,20 +93,13 @@ ENTRY(enable_omap3630_toggle_l2_on_restore) ENDPROC(enable_omap3630_toggle_l2_on_restore) /* - * Function to call rom code to save secure ram context. This gets - * relocated to SRAM, so it can be all in .data section. Otherwise - * we need to initialize api_params separately. + * Function to call rom code to save secure ram context. + * + * r0 = physical address of the parameters */ - .data - .align 3 ENTRY(save_secure_ram_context) stmfd sp!, {r4 - r11, lr} @ save registers on stack - adr r3, api_params @ r3 points to parameters - str r0, [r3,#0x4] @ r0 has sdram address - ldr r12, high_mask - and r3, r3, r12 - ldr r12, sram_phy_addr_mask - orr r3, r3, r12 + mov r3, r0 @ physical address of parameters mov r0, #25 @ set service ID for PPA mov r12, r0 @ copy secure service ID in r12 mov r1, #0 @ set task id for ROM code in r1 @@ -120,18 +113,7 @@ ENTRY(save_secure_ram_context) nop nop ldmfd sp!, {r4 - r11, pc} - .align -sram_phy_addr_mask: - .word SRAM_BASE_P -high_mask: - .word 0xffff -api_params: - .word 0x4, 0x0, 0x0, 0x1, 0x1 ENDPROC(save_secure_ram_context) -ENTRY(save_secure_ram_context_sz) - .word . - save_secure_ram_context - - .text /* * ====================== -- cgit v1.2.3 From 8d9fd11f8a806999eb100a08e38fa3dec2beb3c6 Mon Sep 17 00:00:00 2001 From: Keerthy Date: Fri, 10 Nov 2017 16:56:52 +0530 Subject: ARM: AM33xx: PRM: Remove am33xx_pwrdm_read_prev_pwrst function [ Upstream commit b6d6af7226465b6d11eac09d0be2ab78a4a9eb62 ] Referring TRM Am335X series: http://www.ti.com/lit/ug/spruh73p/spruh73p.pdf The LastPowerStateEntered bitfield is present only for PM_CEFUSE domain. This is not present in any of the other power domains. Hence remove the generic am33xx_pwrdm_read_prev_pwrst hook which wrongly reads the reserved bit fields for all the other power domains. Reading the reserved bits leads to wrongly interpreting the low power transitions for various power domains that do not have the LastPowerStateEntered field. The pm debug counters values are wrong currently as we are incrementing them based on the reserved bits. 
Signed-off-by: Keerthy Signed-off-by: Tony Lindgren Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-omap2/prm33xx.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-omap2/prm33xx.c b/arch/arm/mach-omap2/prm33xx.c index dcb5001d77da..973bcd754e1c 100644 --- a/arch/arm/mach-omap2/prm33xx.c +++ b/arch/arm/mach-omap2/prm33xx.c @@ -176,17 +176,6 @@ static int am33xx_pwrdm_read_pwrst(struct powerdomain *pwrdm) return v; } -static int am33xx_pwrdm_read_prev_pwrst(struct powerdomain *pwrdm) -{ - u32 v; - - v = am33xx_prm_read_reg(pwrdm->prcm_offs, pwrdm->pwrstst_offs); - v &= AM33XX_LASTPOWERSTATEENTERED_MASK; - v >>= AM33XX_LASTPOWERSTATEENTERED_SHIFT; - - return v; -} - static int am33xx_pwrdm_set_lowpwrstchange(struct powerdomain *pwrdm) { am33xx_prm_rmw_reg_bits(AM33XX_LOWPOWERSTATECHANGE_MASK, @@ -357,7 +346,6 @@ struct pwrdm_ops am33xx_pwrdm_operations = { .pwrdm_set_next_pwrst = am33xx_pwrdm_set_next_pwrst, .pwrdm_read_next_pwrst = am33xx_pwrdm_read_next_pwrst, .pwrdm_read_pwrst = am33xx_pwrdm_read_pwrst, - .pwrdm_read_prev_pwrst = am33xx_pwrdm_read_prev_pwrst, .pwrdm_set_logic_retst = am33xx_pwrdm_set_logic_retst, .pwrdm_read_logic_pwrst = am33xx_pwrdm_read_logic_pwrst, .pwrdm_read_logic_retst = am33xx_pwrdm_read_logic_retst, -- cgit v1.2.3 From ffb47e3a6101c36638ab467a27dfd509a563e149 Mon Sep 17 00:00:00 2001 From: Tony Lindgren Date: Fri, 17 Nov 2017 08:56:58 -0800 Subject: ARM: dts: Fix omap4 hang with GPS connected to USB by using wakeupgen [ Upstream commit cf87634c8b24e24bf379b8c6807c8b0fb5f23567 ] There's been a reproducable USB OHCI/EHCI cpuidle related hang on omap4 for a while that happens after about 20 - 40 minutes on an idle system with some data feeding device being connected, like a USB GPS device or a cellular modem. This issue happens in cpuidle states C2 and C3 and does not happen if cpuidle is limited to C1 state only. The symptoms are that the whole system hangs and never wakes up from idle, and if a watchdog is configured the system reboots after a while. Turns out that OHCI/EHCI devices on omap4 are trying to use the GIC interrupt controller directly as a parent instead of the WUGEN. We need to pass the interrupts through WUGEN to GIC to provide the wakeup events for the processor. Let's fix the issue by removing the gic interrupt-parent and use the default interrupt-parent wakeupgen instead. Note that omap5.dtsi had this already fixes earlier by commit 7136d457f365 ("ARM: omap: convert wakeupgen to stacked domains") but we somehow missed omap4 at that point. 
Fixes: 7136d457f365 ("ARM: omap: convert wakeupgen to stacked domains") Cc: Dave Gerlach Cc: Nishanth Menon Cc: Marc Zyngier Cc: Sebastian Reichel Reviewed-by: Roger Quadros Signed-off-by: Tony Lindgren Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/omap4.dtsi | 2 -- 1 file changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/omap4.dtsi b/arch/arm/boot/dts/omap4.dtsi index 5a206c100ce2..8a5628c4b135 100644 --- a/arch/arm/boot/dts/omap4.dtsi +++ b/arch/arm/boot/dts/omap4.dtsi @@ -844,14 +844,12 @@ usbhsohci: ohci@4a064800 { compatible = "ti,ohci-omap3"; reg = <0x4a064800 0x400>; - interrupt-parent = <&gic>; interrupts = ; }; usbhsehci: ehci@4a064c00 { compatible = "ti,ehci-omap"; reg = <0x4a064c00 0x400>; - interrupt-parent = <&gic>; interrupts = ; }; }; -- cgit v1.2.3 From 1d2905b09771c3dc6234dcf0b80f8bfdf91c7a73 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 1 Nov 2017 11:03:31 +0200 Subject: ARM: dts: am4372: Correct the interrupts_properties of McASP [ Upstream commit 627395a6f8091c0aa18f49dca7df59ba3ec147ef ] Fixes the following warnings: arch/arm/boot/dts/am437x-cm-t43.dtb: Warning (interrupts_property): interrupts size is (8), expected multiple of 12 in /ocp@44000000/mcasp@48038000 arch/arm/boot/dts/am437x-cm-t43.dtb: Warning (interrupts_property): interrupts size is (8), expected multiple of 12 in /ocp@44000000/mcasp@4803C000 Signed-off-by: Peter Ujfalusi Signed-off-by: Tony Lindgren Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/am4372.dtsi | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/am4372.dtsi b/arch/arm/boot/dts/am4372.dtsi index de8791a4d131..3ef1d5a26389 100644 --- a/arch/arm/boot/dts/am4372.dtsi +++ b/arch/arm/boot/dts/am4372.dtsi @@ -807,7 +807,8 @@ reg = <0x48038000 0x2000>, <0x46000000 0x400000>; reg-names = "mpu", "dat"; - interrupts = <80>, <81>; + interrupts = , + ; interrupt-names = "tx", "rx"; status = "disabled"; dmas = <&edma 8>, @@ -821,7 +822,8 @@ reg = <0x4803C000 0x2000>, <0x46400000 0x400000>; reg-names = "mpu", "dat"; - interrupts = <82>, <83>; + interrupts = , + ; interrupt-names = "tx", "rx"; status = "disabled"; dmas = <&edma 10>, -- cgit v1.2.3 From ca20b875a9e2c2730240cecb40875f89a3a4355d Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Thu, 30 Nov 2017 14:03:22 +0530 Subject: powerpc/perf: Fix oops when grouping different pmu events [ Upstream commit 5aa04b3eb6fca63d2e9827be656dcadc26d54e11 ] When user tries to group imc (In-Memory Collections) event with normal event, (sometime) kernel crashes with following log: Faulting instruction address: 0x00000000 [link register ] c00000000010ce88 power_check_constraints+0x128/0x980 ... c00000000010e238 power_pmu_event_init+0x268/0x6f0 c0000000002dc60c perf_try_init_event+0xdc/0x1a0 c0000000002dce88 perf_event_alloc+0x7b8/0xac0 c0000000002e92e0 SyS_perf_event_open+0x530/0xda0 c00000000000b004 system_call+0x38/0xe0 'event_base' field of 'struct hw_perf_event' is used as flags for normal hw events and used as memory address for imc events. While grouping these two types of events, collect_events() tries to interpret imc 'event_base' as a flag, which causes a corruption resulting in a crash. Consider only those events which belongs to 'perf_hw_context' in collect_events(). 
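For reference, the failing scenario can be driven from userspace by opening an imc event and then grouping a normal hardware event under it with perf_event_open(). A minimal reproducer sketch, not part of this patch, follows; the imc PMU's type and config values are placeholders and would have to be read from sysfs on a real system:

    /* reproducer sketch -- the imc type/config values are assumptions */
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/perf_event.h>

    static int perf_open(struct perf_event_attr *attr, int group_fd)
    {
            return syscall(__NR_perf_event_open, attr, 0, -1, group_fd, 0);
    }

    int main(void)
    {
            struct perf_event_attr imc, hw;
            int leader, member;

            memset(&imc, 0, sizeof(imc));
            imc.size = sizeof(imc);
            imc.type = 10;          /* placeholder: the imc PMU's type from   */
            imc.config = 0;         /* /sys/bus/event_source/devices/.../type */

            memset(&hw, 0, sizeof(hw));
            hw.size = sizeof(hw);
            hw.type = PERF_TYPE_HARDWARE;
            hw.config = PERF_COUNT_HW_CPU_CYCLES;

            leader = perf_open(&imc, -1);       /* imc event leads the group */
            member = perf_open(&hw, leader);    /* hw event joins that group */
            if (leader < 0 || member < 0)
                    perror("perf_event_open");
            return 0;
    }

Before the fix, initialising the hardware group member makes collect_events() walk the imc leader and interpret its event_base (a memory address for imc events) as flags; with the perf_hw_context check, events owned by other PMUs are simply skipped.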
Signed-off-by: Ravi Bangoria Reviewed-By: Madhavan Srinivasan Signed-off-by: Michael Ellerman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/perf/core-book3s.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index b2ab164a8094..4eba7c00ea1f 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -1381,7 +1381,7 @@ static int collect_events(struct perf_event *group, int max_count, int n = 0; struct perf_event *event; - if (!is_software_event(group)) { + if (group->pmu->task_ctx_nr == perf_hw_context) { if (n >= max_count) return -1; ctrs[n] = group; @@ -1389,7 +1389,7 @@ static int collect_events(struct perf_event *group, int max_count, events[n++] = group->hw.config; } list_for_each_entry(event, &group->sibling_list, group_entry) { - if (!is_software_event(event) && + if (event->pmu->task_ctx_nr == perf_hw_context && event->state != PERF_EVENT_STATE_OFF) { if (n >= max_count) return -1; -- cgit v1.2.3 From 7b2f5c1c72066f548beecee94323ba0f849fcde9 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Mon, 27 Nov 2017 08:51:39 +0100 Subject: x86/mm/kmmio: Fix mmiotrace for page unaligned addresses [ Upstream commit 6d60ce384d1d5ca32b595244db4077a419acc687 ] If something calls ioremap() with an address not aligned to PAGE_SIZE, the returned address might be not aligned as well. This led to a probe registered on exactly the returned address, but the entire page was armed for mmiotracing. On calling iounmap() the address passed to unregister_kmmio_probe() was PAGE_SIZE aligned by the caller leading to a complete freeze of the machine. We should always page align addresses while (un)registerung mappings, because the mmiotracer works on top of pages, not mappings. We still keep track of the probes based on their real addresses and lengths though, because the mmiotrace still needs to know what are mapped memory regions. Also move the call to mmiotrace_iounmap() prior page aligning the address, so that all probes are unregistered properly, otherwise the kernel ends up failing memory allocations randomly after disabling the mmiotracer. Tested-by: Lyude Signed-off-by: Karol Herbst Acked-by: Pekka Paalanen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: nouveau@lists.freedesktop.org Link: http://lkml.kernel.org/r/20171127075139.4928-1-kherbst@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/ioremap.c | 4 ++-- arch/x86/mm/kmmio.c | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index b9c78f3bcd67..53ab3f367472 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -348,11 +348,11 @@ void iounmap(volatile void __iomem *addr) (void __force *)addr < phys_to_virt(ISA_END_ADDRESS)) return; + mmiotrace_iounmap(addr); + addr = (volatile void __iomem *) (PAGE_MASK & (unsigned long __force)addr); - mmiotrace_iounmap(addr); - /* Use the vm area unlocked, assuming the caller ensures there isn't another iounmap for the same address in parallel. 
Reuse of the virtual address is prevented by diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index ddb2244b06a1..76604c8a2a48 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -434,17 +434,18 @@ int register_kmmio_probe(struct kmmio_probe *p) unsigned long flags; int ret = 0; unsigned long size = 0; + unsigned long addr = p->addr & PAGE_MASK; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); unsigned int l; pte_t *pte; spin_lock_irqsave(&kmmio_lock, flags); - if (get_kmmio_probe(p->addr)) { + if (get_kmmio_probe(addr)) { ret = -EEXIST; goto out; } - pte = lookup_address(p->addr, &l); + pte = lookup_address(addr, &l); if (!pte) { ret = -EINVAL; goto out; @@ -453,7 +454,7 @@ int register_kmmio_probe(struct kmmio_probe *p) kmmio_count++; list_add_rcu(&p->list, &kmmio_probes); while (size < size_lim) { - if (add_kmmio_fault_page(p->addr + size)) + if (add_kmmio_fault_page(addr + size)) pr_err("Unable to set page fault.\n"); size += page_level_size(l); } @@ -527,19 +528,20 @@ void unregister_kmmio_probe(struct kmmio_probe *p) { unsigned long flags; unsigned long size = 0; + unsigned long addr = p->addr & PAGE_MASK; const unsigned long size_lim = p->len + (p->addr & ~PAGE_MASK); struct kmmio_fault_page *release_list = NULL; struct kmmio_delayed_release *drelease; unsigned int l; pte_t *pte; - pte = lookup_address(p->addr, &l); + pte = lookup_address(addr, &l); if (!pte) return; spin_lock_irqsave(&kmmio_lock, flags); while (size < size_lim) { - release_kmmio_fault_page(p->addr + size, &release_list); + release_kmmio_fault_page(addr + size, &release_list); size += page_level_size(l); } list_del_rcu(&p->list); -- cgit v1.2.3 From e2ac5159c5c126f90490305e8de128a5384aef43 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 23 Jan 2017 19:35:06 +0100 Subject: x86/ras/inject: Make it depend on X86_LOCAL_APIC=y MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d4b2ac63b0eae461fc10c9791084be24724ef57a upstream. ... and get rid of the annoying: arch/x86/kernel/cpu/mcheck/mce-inject.c:97:13: warning: ‘mce_irq_ipi’ defined but not used [-Wunused-function] when doing randconfig builds. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: Yazen Ghannam Cc: linux-edac Link: http://lkml.kernel.org/r/20170123183514.13356-2-bp@alien8.de Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 2 +- arch/x86/kernel/cpu/mcheck/mce-inject.c | 5 +---- 2 files changed, 2 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2db93042f2f3..bb6aab2fa7f5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1009,7 +1009,7 @@ config X86_MCE_THRESHOLD def_bool y config X86_MCE_INJECT - depends on X86_MCE + depends on X86_MCE && X86_LOCAL_APIC tristate "Machine check injector support" ---help--- Provide support for injecting machine checks for testing purposes. 
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 4cfba4371a71..101bfae369e1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -152,7 +152,6 @@ static void raise_mce(struct mce *m) if (context == MCJ_CTX_RANDOM) return; -#ifdef CONFIG_X86_LOCAL_APIC if (m->inject_flags & (MCJ_IRQ_BROADCAST | MCJ_NMI_BROADCAST)) { unsigned long start; int cpu; @@ -193,9 +192,7 @@ static void raise_mce(struct mce *m) raise_local(); put_cpu(); put_online_cpus(); - } else -#endif - { + } else { preempt_disable(); raise_local(); preempt_enable(); -- cgit v1.2.3 From 93e482ea8557e91d5944142091193f79e6dfcee8 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 Mar 2017 22:39:21 +0100 Subject: arm64: define BUG() instruction without CONFIG_BUG commit f13d52cb3fad03c237572be2ee691e1fe2d1d7bb upstream. This mirrors commit e9c38ceba8d9 ("ARM: 8455/1: define __BUG as asm(BUG_INSTR) without CONFIG_BUG") to make the behavior of arm64 consistent with arm and x86, and avoids lots of warnings in randconfig builds, such as: kernel/seccomp.c: In function '__seccomp_filter': kernel/seccomp.c:666:1: error: no return statement in function returning non-void [-Werror=return-type] Acked-by: Will Deacon Signed-off-by: Arnd Bergmann Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/bug.h | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'arch') diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index 4a748ce9ba1a..ac6382b25add 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -20,9 +20,6 @@ #include -#ifdef CONFIG_GENERIC_BUG -#define HAVE_ARCH_BUG - #ifdef CONFIG_DEBUG_BUGVERBOSE #define _BUGVERBOSE_LOCATION(file, line) __BUGVERBOSE_LOCATION(file, line) #define __BUGVERBOSE_LOCATION(file, line) \ @@ -36,28 +33,36 @@ #define _BUGVERBOSE_LOCATION(file, line) #endif -#define _BUG_FLAGS(flags) __BUG_FLAGS(flags) +#ifdef CONFIG_GENERIC_BUG -#define __BUG_FLAGS(flags) asm volatile ( \ +#define __BUG_ENTRY(flags) \ ".pushsection __bug_table,\"a\"\n\t" \ ".align 2\n\t" \ "0: .long 1f - 0b\n\t" \ _BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ ".short " #flags "\n\t" \ ".popsection\n" \ - \ - "1: brk %[imm]" \ - :: [imm] "i" (BUG_BRK_IMM) \ -) + "1: " +#else +#define __BUG_ENTRY(flags) "" +#endif + +#define __BUG_FLAGS(flags) \ + asm volatile ( \ + __BUG_ENTRY(flags) \ + "brk %[imm]" :: [imm] "i" (BUG_BRK_IMM) \ + ); -#define BUG() do { \ - _BUG_FLAGS(0); \ - unreachable(); \ + +#define BUG() do { \ + __BUG_FLAGS(0); \ + unreachable(); \ } while (0) -#define __WARN_TAINT(taint) _BUG_FLAGS(BUGFLAG_TAINT(taint)) +#define __WARN_TAINT(taint) \ + __BUG_FLAGS(BUGFLAG_TAINT(taint)) -#endif /* ! CONFIG_GENERIC_BUG */ +#define HAVE_ARCH_BUG #include -- cgit v1.2.3 From d721427c28d437d30e9e0a6375a17b3c72d53b75 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Jul 2017 14:53:00 +0200 Subject: x86/fpu/math-emu: Fix possible uninitialized variable use MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 75e2f0a6b16141cb347f442033ec907380d4d66e upstream. 
When building the kernel with "make EXTRA_CFLAGS=...", this overrides the "PARANOID" preprocessor macro defined in arch/x86/math-emu/Makefile, and we run into a build warning: arch/x86/math-emu/reg_compare.c: In function ‘compare_i_st_st’: arch/x86/math-emu/reg_compare.c:254:6: error: ‘f’ may be used uninitialized in this function [-Werror=maybe-uninitialized] This fixes the implementation to work correctly even without the PARANOID flag, and also fixes the Makefile to not use the EXTRA_CFLAGS variable but instead use the ccflags-y variable in the Makefile that is meant for this purpose. Signed-off-by: Arnd Bergmann Cc: Bill Metzenthen Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170719125310.2487451-3-arnd@arndb.de Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/math-emu/Makefile | 4 ++-- arch/x86/math-emu/reg_compare.c | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'arch') diff --git a/arch/x86/math-emu/Makefile b/arch/x86/math-emu/Makefile index 9b0c63b60302..1b2dac174321 100644 --- a/arch/x86/math-emu/Makefile +++ b/arch/x86/math-emu/Makefile @@ -5,8 +5,8 @@ #DEBUG = -DDEBUGGING DEBUG = PARANOID = -DPARANOID -EXTRA_CFLAGS := $(PARANOID) $(DEBUG) -fno-builtin $(MATH_EMULATION) -EXTRA_AFLAGS := $(PARANOID) +ccflags-y += $(PARANOID) $(DEBUG) -fno-builtin $(MATH_EMULATION) +asflags-y += $(PARANOID) # From 'C' language sources: C_OBJS =fpu_entry.o errors.o \ diff --git a/arch/x86/math-emu/reg_compare.c b/arch/x86/math-emu/reg_compare.c index b77360fdbf4a..19b33b50adfa 100644 --- a/arch/x86/math-emu/reg_compare.c +++ b/arch/x86/math-emu/reg_compare.c @@ -168,7 +168,7 @@ static int compare(FPU_REG const *b, int tagb) /* This function requires that st(0) is not empty */ int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag) { - int f = 0, c; + int f, c; c = compare(loaded_data, loaded_tag); @@ -189,12 +189,12 @@ int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag) case COMP_No_Comp: f = SW_C3 | SW_C2 | SW_C0; break; -#ifdef PARANOID default: +#ifdef PARANOID EXCEPTION(EX_INTERNAL | 0x121); +#endif /* PARANOID */ f = SW_C3 | SW_C2 | SW_C0; break; -#endif /* PARANOID */ } setcc(f); if (c & COMP_Denormal) { @@ -205,7 +205,7 @@ int FPU_compare_st_data(FPU_REG const *loaded_data, u_char loaded_tag) static int compare_st_st(int nr) { - int f = 0, c; + int f, c; FPU_REG *st_ptr; if (!NOT_EMPTY(0) || !NOT_EMPTY(nr)) { @@ -235,12 +235,12 @@ static int compare_st_st(int nr) case COMP_No_Comp: f = SW_C3 | SW_C2 | SW_C0; break; -#ifdef PARANOID default: +#ifdef PARANOID EXCEPTION(EX_INTERNAL | 0x122); +#endif /* PARANOID */ f = SW_C3 | SW_C2 | SW_C0; break; -#endif /* PARANOID */ } setcc(f); if (c & COMP_Denormal) { @@ -283,12 +283,12 @@ static int compare_i_st_st(int nr) case COMP_No_Comp: f = X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF; break; -#ifdef PARANOID default: +#ifdef PARANOID EXCEPTION(EX_INTERNAL | 0x122); +#endif /* PARANOID */ f = 0; break; -#endif /* PARANOID */ } FPU_EFLAGS = (FPU_EFLAGS & ~(X86_EFLAGS_ZF | X86_EFLAGS_PF | X86_EFLAGS_CF)) | f; if (c & COMP_Denormal) { -- cgit v1.2.3 From 149f6f5f5f788cb7a9477c3f7346ac4fff22e4a2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Jul 2017 14:53:03 +0200 Subject: x86/build: Silence the build with "make -s" commit d460131dd50599e0e9405d5f4ae02c27d529a44a upstream. Every kernel build on x86 will result in some output: Setup is 13084 bytes (padded to 13312 bytes). 
System is 4833 kB CRC 6d35fa35 Kernel: arch/x86/boot/bzImage is ready (#2) This shuts it up, so that 'make -s' is truely silent as long as everything works. Building without '-s' should produce unchanged output. Signed-off-by: Arnd Bergmann Cc: Linus Torvalds Cc: Matt Fleming Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170719125310.2487451-6-arnd@arndb.de Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/boot/Makefile | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index c0cc2a6be0bf..6da2cd0897f3 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -64,12 +64,13 @@ GCOV_PROFILE := n $(obj)/bzImage: asflags-y := $(SVGA_MODE) quiet_cmd_image = BUILD $@ +silent_redirect_image = >/dev/null cmd_image = $(obj)/tools/build $(obj)/setup.bin $(obj)/vmlinux.bin \ - $(obj)/zoffset.h $@ + $(obj)/zoffset.h $@ $($(quiet)redirect_image) $(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin $(obj)/tools/build FORCE $(call if_changed,image) - @echo 'Kernel: $@ is ready' ' (#'`cat .version`')' + @$(kecho) 'Kernel: $@ is ready' ' (#'`cat .version`')' OBJCOPYFLAGS_vmlinux.bin := -O binary -R .note -R .comment -S $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE -- cgit v1.2.3 From 4d3835f64c1f457aa4286a899c960233d683f953 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Jul 2017 14:53:04 +0200 Subject: x86: add MULTIUSER dependency for KVM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit c2ce3f5d89d57301e2756ac325fe2ebc33bfec30 upstream. KVM tries to select 'TASKSTATS', which had additional dependencies: warning: (KVM) selects TASKSTATS which has unmet direct dependencies (NET && MULTIUSER) Signed-off-by: Arnd Bergmann Signed-off-by: Radim Krčmář Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 639a6e34500c..2ec3e55b2c62 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -22,7 +22,7 @@ config KVM depends on HAVE_KVM depends on HIGH_RES_TIMERS # for TASKSTATS/TASK_DELAY_ACCT: - depends on NET + depends on NET && MULTIUSER select PREEMPT_NOTIFIERS select MMU_NOTIFIER select ANON_INODES -- cgit v1.2.3 From e2e297df05361b05da1af484921bd0c1c51ef4c3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Jul 2017 14:53:05 +0200 Subject: x86/platform: Add PCI dependency for PUNIT_ATOM_DEBUG commit d689c64d189e43d782fec5649fb0afe303c5b3f9 upstream. The IOSF_MBI option requires PCI support, without it we get a harmless Kconfig warning when it gets selected by PUNIT_ATOM_DEBUG: warning: (X86_INTEL_LPSS && SND_SST_IPC_ACPI && MMC_SDHCI_ACPI && PUNIT_ATOM_DEBUG) selects IOSF_MBI which has unmet direct dependencies (PCI) This adds another dependency to avoid the warning. 
Signed-off-by: Arnd Bergmann Cc: Andy Lutomirski Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170719125310.2487451-8-arnd@arndb.de Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig.debug | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 137dfa96aa14..da00fe1f48f4 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -391,6 +391,7 @@ config X86_DEBUG_FPU config PUNIT_ATOM_DEBUG tristate "ATOM Punit debug driver" + depends on PCI select DEBUG_FS select IOSF_MBI ---help--- -- cgit v1.2.3 From 622d825999111ddeda9338386f8bc1effd5a2ea1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 26 Jan 2017 11:19:55 +0800 Subject: arm64: Kconfig: select COMPAT_BINFMT_ELF only when BINFMT_ELF is set commit 2e449048a25eb75d48dff12882b93f26d130a1c6 upstream. Fix warning: "(COMPAT) selects COMPAT_BINFMT_ELF which has unmet direct dependencies (COMPAT && BINFMT_ELF)" Signed-off-by: Kefeng Wang Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 83af36d9439f..02c08671553e 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -785,7 +785,7 @@ source "fs/Kconfig.binfmt" config COMPAT bool "Kernel support for 32-bit EL0" depends on ARM64_4K_PAGES || EXPERT - select COMPAT_BINFMT_ELF + select COMPAT_BINFMT_ELF if BINFMT_ELF select HAVE_UID16 select OLD_SIGSUSPEND3 select COMPAT_OLD_SIGACTION -- cgit v1.2.3 From 96b745cacbaf50dbfb25a469d7dc8c9a13440f0f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 16 Nov 2016 15:17:09 +0100 Subject: x86/boot: Avoid warning for zero-filling .bss commit 553bbc11aa6c1f9e0f529a06aeeca15fbe4a3985 upstream. The latest binutils are warning about a .fill directive with an explicit value in a .bss section: arch/x86/kernel/head_32.S: Assembler messages: arch/x86/kernel/head_32.S:677: Warning: ignoring fill value in section `.bss..page_aligned' arch/x86/kernel/head_32.S:679: Warning: ignoring fill value in section `.bss..page_aligned' This comes from the 'ENTRY()' macro padding the space between the symbols with 'nop' via: .align 4,0x90 Open-coding the .globl directive without the padding avoids that warning, as all the symbols are already page aligned. Signed-off-by: Arnd Bergmann Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161116141726.2013389-1-arnd@arndb.de Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/head_32.S | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 8f1a3f443f7d..70284d38fdc2 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -669,14 +669,17 @@ __PAGE_ALIGNED_BSS initial_pg_pmd: .fill 1024*KPMDS,4,0 #else -ENTRY(initial_page_table) +.globl initial_page_table +initial_page_table: .fill 1024,4,0 #endif initial_pg_fixmap: .fill 1024,4,0 -ENTRY(empty_zero_page) +.globl empty_zero_page +empty_zero_page: .fill 4096,1,0 -ENTRY(swapper_pg_dir) +.globl swapper_pg_dir +swapper_pg_dir: .fill 1024,4,0 /* -- cgit v1.2.3 From d4869f7f25a8441dbc004b67d1d55aa6520ce997 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 26 Nov 2016 15:27:06 +0100 Subject: x86/platform/olpc: Fix resume handler build warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 20ab6677716c7bbdcfd1cdb9aef296a0b3101f73 upstream. Fix: arch/x86/platform/olpc/olpc-xo15-sci.c:199:12: warning: ‘xo15_sci_resume’ defined but not used [-Wunused-function] static int xo15_sci_resume(struct device *dev) ^ which I see in randconfig builds here. Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20161126142706.13602-1-bp@alien8.de Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/platform/olpc/olpc-xo15-sci.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/platform/olpc/olpc-xo15-sci.c b/arch/x86/platform/olpc/olpc-xo15-sci.c index 55130846ac87..c0533fbc39e3 100644 --- a/arch/x86/platform/olpc/olpc-xo15-sci.c +++ b/arch/x86/platform/olpc/olpc-xo15-sci.c @@ -196,6 +196,7 @@ static int xo15_sci_remove(struct acpi_device *device) return 0; } +#ifdef CONFIG_PM_SLEEP static int xo15_sci_resume(struct device *dev) { /* Enable all EC events */ @@ -207,6 +208,7 @@ static int xo15_sci_resume(struct device *dev) return 0; } +#endif static SIMPLE_DEV_PM_OPS(xo15_sci_pm, NULL, xo15_sci_resume); -- cgit v1.2.3 From 6f2f10ea6ed781681f66c6554868620158e94ea3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 4 Oct 2017 12:28:18 +0200 Subject: KVM: add X86_LOCAL_APIC dependency MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e42eef4ba38806b18c4a74f0c276fb2e0b548173 upstream. The rework of the posted interrupt handling broke building without support for the local APIC: ERROR: "boot_cpu_physical_apicid" [arch/x86/kvm/kvm-intel.ko] undefined! That configuration is probably not particularly useful anyway, so we can avoid the randconfig failures by adding a Kconfig dependency. 
Fixes: 8b306e2f3c41 ("KVM: VMX: avoid double list add with VT-d posted interrupts") Signed-off-by: Arnd Bergmann Signed-off-by: Radim Krčmář Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch') diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig index 2ec3e55b2c62..53b7f53f6207 100644 --- a/arch/x86/kvm/Kconfig +++ b/arch/x86/kvm/Kconfig @@ -23,6 +23,7 @@ config KVM depends on HIGH_RES_TIMERS # for TASKSTATS/TASK_DELAY_ACCT: depends on NET && MULTIUSER + depends on X86_LOCAL_APIC select PREEMPT_NOTIFIERS select MMU_NOTIFIER select ANON_INODES -- cgit v1.2.3 From e787fce41c0746e39b9edafcadb3c4d05b43d063 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Jul 2017 14:52:59 +0200 Subject: perf/x86: Shut up false-positive -Wmaybe-uninitialized warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 11d8b05855f3749bcb6c57e2c4052921b9605c77 upstream. The intialization function checks for various failure scenarios, but unfortunately the compiler gets a little confused about the possible combinations, leading to a false-positive build warning when -Wmaybe-uninitialized is set: arch/x86/events/core.c: In function ‘init_hw_perf_events’: arch/x86/events/core.c:264:3: warning: ‘reg_fail’ may be used uninitialized in this function [-Wmaybe-uninitialized] arch/x86/events/core.c:264:3: warning: ‘val_fail’ may be used uninitialized in this function [-Wmaybe-uninitialized] pr_err(FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", We can't actually run into this case, so this shuts up the warning by initializing the variables to a known-invalid state. Suggested-by: Peter Zijlstra Signed-off-by: Arnd Bergmann Cc: Alexander Shishkin Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170719125310.2487451-2-arnd@arndb.de Link: https://patchwork.kernel.org/patch/9392595/ Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/perf_event.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5b2f2306fbcc..b52a8d08ab36 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -188,8 +188,8 @@ static void release_pmc_hardware(void) {} static bool check_hw_exists(void) { - u64 val, val_fail, val_new= ~0; - int i, reg, reg_fail, ret = 0; + u64 val, val_fail = -1, val_new= ~0; + int i, reg, reg_fail = -1, ret = 0; int bios_fail = 0; int reg_safe = -1; -- cgit v1.2.3 From 0522f5e8c0a5f3835c2115aa2627d621ebbc7818 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 20 Feb 2018 12:54:56 +0100 Subject: ARM: tegra: select USB_ULPI from EHCI rather than platform commit a262e87ff354f12447bb6268bd63edf7ba1c20e0 upstream. For historic reasons, the tegra platform selects USB_ULPI from architecture code, but that hasn't really made sense for a long time, as the only user of that code is the Tegra EHCI driver that has its own Kconfig symbol. This removes the 'select' statements from mach-tegra and drivers/soc/tegra and adds them with the device driver that actually needs them. 
Signed-off-by: Arnd Bergmann Signed-off-by: Thierry Reding [arnd: rebased to 4.4-stable] Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-tegra/Kconfig | 2 -- arch/arm64/Kconfig.platforms | 2 -- 2 files changed, 4 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-tegra/Kconfig b/arch/arm/mach-tegra/Kconfig index 0fa4c5f8b1be..2d43357d4a0a 100644 --- a/arch/arm/mach-tegra/Kconfig +++ b/arch/arm/mach-tegra/Kconfig @@ -12,8 +12,6 @@ menuconfig ARCH_TEGRA select ARCH_HAS_RESET_CONTROLLER select RESET_CONTROLLER select SOC_BUS - select USB_ULPI if USB_PHY - select USB_ULPI_VIEWPORT if USB_PHY help This enables support for NVIDIA Tegra based systems. diff --git a/arch/arm64/Kconfig.platforms b/arch/arm64/Kconfig.platforms index 4043c35962cc..5edb50772c11 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -90,8 +90,6 @@ config ARCH_TEGRA_132_SOC bool "NVIDIA Tegra132 SoC" depends on ARCH_TEGRA select PINCTRL_TEGRA124 - select USB_ULPI if USB_PHY - select USB_ULPI_VIEWPORT if USB_PHY help Enable support for NVIDIA Tegra132 SoC, based on the Denver ARMv8 CPU. The Tegra132 SoC is similar to the Tegra124 SoC, -- cgit v1.2.3 From 26c3a6a7f5435712eaa70cb413717838abc49258 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 20 Feb 2018 12:55:05 +0100 Subject: x86/microcode/AMD: Change load_microcode_amd()'s param to bool to fix preemptibility bug commit dac6ca243c4c49a9ca7507d3d66140ebfac8b04b upstream. With CONFIG_DEBUG_PREEMPT enabled, I get: BUG: using smp_processor_id() in preemptible [00000000] code: swapper/0/1 caller is debug_smp_processor_id CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.12.0-rc2+ #2 Call Trace: dump_stack check_preemption_disabled debug_smp_processor_id save_microcode_in_initrd_amd ? microcode_init save_microcode_in_initrd ... because, well, it says it above, we're using smp_processor_id() in preemptible code. But passing the CPU number is not really needed. It is only used to determine whether we're on the BSP, and, if so, to save the microcode patch for early loading. [ We don't absolutely need to do it on the BSP but we do that customarily there. ] Instead, convert that function parameter to a boolean which denotes whether the patch should be saved or not, thereby avoiding the use of smp_processor_id() in preemptible code. 
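The rule at work here is generic: with CONFIG_DEBUG_PREEMPT enabled, smp_processor_id() warns whenever it is called with preemption enabled, because the task can migrate to another CPU right after the ID is read. A minimal illustration of the pattern, with made-up helper names rather than code from this patch:

    #include <linux/types.h>
    #include <linux/smp.h>          /* smp_processor_id(), get_cpu()/put_cpu() */
    #include <asm/processor.h>      /* cpu_data(), boot_cpu_data */

    /* Racy: may run preemptible and triggers the DEBUG_PREEMPT splat. */
    static bool cpu_is_bsp_unsafe(void)
    {
            return cpu_data(smp_processor_id()).cpu_index ==
                   boot_cpu_data.cpu_index;
    }

    /* Safe: keep the task pinned while the per-CPU data is examined. */
    static bool cpu_is_bsp_safe(void)
    {
            int cpu = get_cpu();            /* disables preemption */
            bool bsp = cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index;

            put_cpu();                      /* re-enables preemption */
            return bsp;
    }

The patch sidesteps the question entirely: request_microcode_amd() already knows which CPU it was called for, computes the "save for early loading" decision there, and passes it down as a bool, so load_microcode_amd() never needs to ask which CPU it is running on.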
Signed-off-by: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170528200414.31305-1-bp@alien8.de Signed-off-by: Ingo Molnar [arnd: rebased to 4.9, after running into warning: arch/x86/kernel/cpu/microcode/amd.c:881:30: self-comparison always evaluates to true] Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/microcode_amd.h | 1 - arch/x86/kernel/cpu/microcode/amd.c | 17 +++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/microcode_amd.h b/arch/x86/include/asm/microcode_amd.h index adfc847a395e..fb163f02ebb1 100644 --- a/arch/x86/include/asm/microcode_amd.h +++ b/arch/x86/include/asm/microcode_amd.h @@ -59,7 +59,6 @@ static inline u16 find_equiv_id(struct equiv_cpu_entry *equiv_cpu_table, extern int __apply_microcode_amd(struct microcode_amd *mc_amd); extern int apply_microcode_amd(int cpu); -extern enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size); #define PATCH_MAX_SIZE PAGE_SIZE extern u8 amd_ucode_patch[PATCH_MAX_SIZE]; diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 2a0f44d225fe..6da6f9cd6d2d 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -131,6 +131,9 @@ static size_t compute_container_size(u8 *data, u32 total_size) return size; } +static enum ucode_state +load_microcode_amd(bool save, u8 family, const u8 *data, size_t size); + /* * Early load occurs before we can vmalloc(). So we look for the microcode * patch container file in initrd, traverse equivalent cpu table, look for a @@ -438,7 +441,7 @@ int __init save_microcode_in_initrd_amd(void) eax = cpuid_eax(0x00000001); eax = ((eax >> 8) & 0xf) + ((eax >> 20) & 0xff); - ret = load_microcode_amd(smp_processor_id(), eax, container, container_size); + ret = load_microcode_amd(true, eax, container, container_size); if (ret != UCODE_OK) retval = -EINVAL; @@ -854,7 +857,8 @@ static enum ucode_state __load_microcode_amd(u8 family, const u8 *data, return UCODE_OK; } -enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t size) +static enum ucode_state +load_microcode_amd(bool save, u8 family, const u8 *data, size_t size) { enum ucode_state ret; @@ -868,8 +872,8 @@ enum ucode_state load_microcode_amd(int cpu, u8 family, const u8 *data, size_t s #ifdef CONFIG_X86_32 /* save BSP's matching patch for early load */ - if (cpu_data(cpu).cpu_index == boot_cpu_data.cpu_index) { - struct ucode_patch *p = find_patch(cpu); + if (save) { + struct ucode_patch *p = find_patch(0); if (p) { memset(amd_ucode_patch, 0, PATCH_MAX_SIZE); memcpy(amd_ucode_patch, p->data, min_t(u32, ksize(p->data), @@ -901,11 +905,12 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, { char fw_name[36] = "amd-ucode/microcode_amd.bin"; struct cpuinfo_x86 *c = &cpu_data(cpu); + bool bsp = c->cpu_index == boot_cpu_data.cpu_index; enum ucode_state ret = UCODE_NFOUND; const struct firmware *fw; /* reload ucode container only on the boot cpu */ - if (!refresh_fw || c->cpu_index != boot_cpu_data.cpu_index) + if (!refresh_fw || !bsp) return UCODE_OK; if (c->x86 >= 0x15) @@ -922,7 +927,7 @@ static enum ucode_state request_microcode_amd(int cpu, struct device *device, goto fw_release; } - ret = load_microcode_amd(cpu, c->x86, fw->data, fw->size); + ret = load_microcode_amd(bsp, c->x86, fw->data, fw->size); fw_release: release_firmware(fw); -- cgit 
v1.2.3 From 5f7537133212bc12298e99abf21bea6e08489d58 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Thu, 14 Sep 2017 03:54:16 -0700 Subject: KVM: async_pf: Fix #DF due to inject "Page not Present" and "Page Ready" exceptions simultaneously MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 9a6e7c39810e4a8bc7fc95056cefb40583fe07ef upstream. qemu-system-x86-8600 [004] d..1 7205.687530: kvm_entry: vcpu 2 qemu-system-x86-8600 [004] .... 7205.687532: kvm_exit: reason EXCEPTION_NMI rip 0xffffffffa921297d info ffffeb2c0e44e018 80000b0e qemu-system-x86-8600 [004] .... 7205.687532: kvm_page_fault: address ffffeb2c0e44e018 error_code 0 qemu-system-x86-8600 [004] .... 7205.687620: kvm_try_async_get_page: gva = 0xffffeb2c0e44e018, gfn = 0x427e4e qemu-system-x86-8600 [004] .N.. 7205.687628: kvm_async_pf_not_present: token 0x8b002 gva 0xffffeb2c0e44e018 kworker/4:2-7814 [004] .... 7205.687655: kvm_async_pf_completed: gva 0xffffeb2c0e44e018 address 0x7fcc30c4e000 qemu-system-x86-8600 [004] .... 7205.687703: kvm_async_pf_ready: token 0x8b002 gva 0xffffeb2c0e44e018 qemu-system-x86-8600 [004] d..1 7205.687711: kvm_entry: vcpu 2 After running some memory intensive workload in guest, I catch the kworker which completes the GUP too quickly, and queues an "Page Ready" #PF exception after the "Page not Present" exception before the next vmentry as the above trace which will result in #DF injected to guest. This patch fixes it by clearing the queue for "Page not Present" if "Page Ready" occurs before the next vmentry since the GUP has already got the required page and shadow page table has already been fixed by "Page Ready" handler. Cc: Paolo Bonzini Cc: Radim Krčmář Signed-off-by: Wanpeng Li Fixes: 7c90705bf2a3 ("KVM: Inject asynchronous page fault into a PV guest if page is swapped out.") [Changed indentation and added clearing of injected. 
- Radim] Signed-off-by: Radim Krčmář [port from upstream v4.14-rc1, Don't assign to kvm_queued_exception::injected or x86_exception::async_page_fault] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/x86.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 721c5049b1ad..f37f0c72b22a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8210,6 +8210,13 @@ static int apf_put_user(struct kvm_vcpu *vcpu, u32 val) sizeof(val)); } +static int apf_get_user(struct kvm_vcpu *vcpu, u32 *val) +{ + + return kvm_read_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, val, + sizeof(u32)); +} + void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { @@ -8236,6 +8243,7 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, struct kvm_async_pf *work) { struct x86_exception fault; + u32 val; if (work->wakeup_all) work->arch.token = ~0; /* broadcast wakeup */ @@ -8243,14 +8251,24 @@ void kvm_arch_async_page_present(struct kvm_vcpu *vcpu, kvm_del_async_pf_gfn(vcpu, work->arch.gfn); trace_kvm_async_pf_ready(work->arch.token, work->gva); - if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) && - !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { - fault.vector = PF_VECTOR; - fault.error_code_valid = true; - fault.error_code = 0; - fault.nested_page_fault = false; - fault.address = work->arch.token; - kvm_inject_page_fault(vcpu, &fault); + if (vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED && + !apf_get_user(vcpu, &val)) { + if (val == KVM_PV_REASON_PAGE_NOT_PRESENT && + vcpu->arch.exception.pending && + vcpu->arch.exception.nr == PF_VECTOR && + !apf_put_user(vcpu, 0)) { + vcpu->arch.exception.pending = false; + vcpu->arch.exception.nr = 0; + vcpu->arch.exception.has_error_code = false; + vcpu->arch.exception.error_code = 0; + } else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) { + fault.vector = PF_VECTOR; + fault.error_code_valid = true; + fault.error_code = 0; + fault.nested_page_fault = false; + fault.address = work->arch.token; + kvm_inject_page_fault(vcpu, &fault); + } } vcpu->arch.apf.halted = false; vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; -- cgit v1.2.3 From ffe69f2dd11a8ef5f708f116a3bccb80b9d57e7d Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 23 Feb 2018 11:41:51 +0100 Subject: x86/retpoline: Remove the esp/rsp thunk commit 1df37383a8aeabb9b418698f0bcdffea01f4b1b2 upstream. It doesn't make sense to have an indirect call thunk with esp/rsp as retpoline code won't work correctly with the stack pointer register. Removing it will help compiler writers to catch error in case such a thunk call is emitted incorrectly. 
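To see why, recall the shape of a retpoline thunk. A sketch of the generated sequence, reconstructed for illustration (the symbol and label names are made up, shown here as file-scope asm):

    asm(
    "indirect_thunk_rax_sketch:\n"
    "       call    1f\n"            /* push return address, enter the rop   */
    "2:     pause\n"                 /* speculative execution is parked here */
    "       lfence\n"
    "       jmp     2b\n"
    "1:     mov     %rax, (%rsp)\n"  /* overwrite return slot with target    */
    "       ret\n"                   /* architecturally 'return' to target   */
    );

The construct only works because the stack pointer addresses a return slot that can be overwritten with a branch target held in some other register. A thunk for %rsp would need the stack pointer to be the branch target and the base of that return slot at the same time, so such a call can never be correct; removing the thunk turns any attempt to emit one into a link error instead of silent breakage.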
Fixes: 76b043848fd2 ("x86/retpoline: Add initial retpoline support") Suggested-by: Jeff Law Signed-off-by: Waiman Long Signed-off-by: Thomas Gleixner Acked-by: David Woodhouse Cc: Tom Lendacky Cc: Kees Cook Cc: Andi Kleen Cc: Tim Chen Cc: Peter Zijlstra Cc: Linus Torvalds Cc: Jiri Kosina Cc: Andy Lutomirski Cc: Dave Hansen Cc: Josh Poimboeuf Cc: Arjan van de Ven Cc: Greg Kroah-Hartman Cc: Paul Turner Link: https://lkml.kernel.org/r/1516658974-27852-1-git-send-email-longman@redhat.com Signed-off-by: David Woodhouse [jwang: cherry pick to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/asm-prototypes.h | 1 - arch/x86/lib/retpoline.S | 1 - 2 files changed, 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/asm-prototypes.h b/arch/x86/include/asm/asm-prototypes.h index b15aa4083dfd..5a25ada75aeb 100644 --- a/arch/x86/include/asm/asm-prototypes.h +++ b/arch/x86/include/asm/asm-prototypes.h @@ -37,5 +37,4 @@ INDIRECT_THUNK(dx) INDIRECT_THUNK(si) INDIRECT_THUNK(di) INDIRECT_THUNK(bp) -INDIRECT_THUNK(sp) #endif /* CONFIG_RETPOLINE */ diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index e611a124c442..3d06b482ebc7 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -36,7 +36,6 @@ GENERATE_THUNK(_ASM_DX) GENERATE_THUNK(_ASM_SI) GENERATE_THUNK(_ASM_DI) GENERATE_THUNK(_ASM_BP) -GENERATE_THUNK(_ASM_SP) #ifdef CONFIG_64BIT GENERATE_THUNK(r8) GENERATE_THUNK(r9) -- cgit v1.2.3 From 5dac465887db57833830601e290b8a581a95a9aa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 Feb 2018 11:41:52 +0100 Subject: KVM: x86: Make indirect calls in emulator speculation safe (cherry picked from commit 1a29b5b7f347a1a9230c1e0af5b37e3e571588ab) Replace the indirect calls with CALL_NOSPEC. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: David Woodhouse Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Ashok Raj Cc: Greg KH Cc: Jun Nakajima Cc: David Woodhouse Cc: Linus Torvalds Cc: rga@amazon.de Cc: Dave Hansen Cc: Asit Mallick Cc: Andy Lutomirski Cc: Josh Poimboeuf Cc: Jason Baron Cc: Paolo Bonzini Cc: Dan Williams Cc: Arjan Van De Ven Cc: Tim Chen Link: https://lkml.kernel.org/r/20180125095843.595615683@infradead.org [dwmw2: Use ASM_CALL_CONSTRAINT like upstream, now we have it] Signed-off-by: David Woodhouse [backport to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/emulate.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index e4eb1d2bf849..8864fec63a20 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "x86.h" #include "tss.h" @@ -1000,8 +1001,8 @@ static u8 test_cc(unsigned int condition, unsigned long flags) void (*fop)(void) = (void *)em_setcc + 4 * (condition & 0xf); flags = (flags & EFLAGS_MASK) | X86_EFLAGS_IF; - asm("push %[flags]; popf; call *%[fastop]" - : "=a"(rc) : [fastop]"r"(fop), [flags]"r"(flags)); + asm("push %[flags]; popf; " CALL_NOSPEC + : "=a"(rc) : [thunk_target]"r"(fop), [flags]"r"(flags)); return rc; } @@ -5297,9 +5298,9 @@ static int fastop(struct x86_emulate_ctxt *ctxt, void (*fop)(struct fastop *)) ulong flags = (ctxt->eflags & EFLAGS_MASK) | X86_EFLAGS_IF; if (!(ctxt->d & ByteOp)) fop += __ffs(ctxt->dst.bytes) * FASTOP_SIZE; - asm("push %[flags]; popf; call *%[fastop]; pushf; pop %[flags]\n" + asm("push %[flags]; popf; " CALL_NOSPEC "; pushf; pop %[flags]\n" : "+a"(ctxt->dst.val), "+d"(ctxt->src.val), [flags]"+D"(flags), - [fastop]"+S"(fop) + [thunk_target]"+S"(fop) : "c"(ctxt->src2.val)); ctxt->eflags = (ctxt->eflags & ~EFLAGS_MASK) | (flags & EFLAGS_MASK); if (!fop) /* exception is returned in fop variable */ -- cgit v1.2.3 From d5030418b0c82956921545121b4f08df0f9ece70 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 Feb 2018 11:41:53 +0100 Subject: KVM: VMX: Make indirect call speculation safe (cherry picked from commit c940a3fb1e2e9b7d03228ab28f375fb5a47ff699) Replace indirect call with CALL_NOSPEC. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: David Woodhouse Cc: Andrea Arcangeli Cc: Andi Kleen Cc: Ashok Raj Cc: Greg KH Cc: Jun Nakajima Cc: David Woodhouse Cc: Linus Torvalds Cc: rga@amazon.de Cc: Dave Hansen Cc: Asit Mallick Cc: Andy Lutomirski Cc: Josh Poimboeuf Cc: Jason Baron Cc: Paolo Bonzini Cc: Dan Williams Cc: Arjan Van De Ven Cc: Tim Chen Link: https://lkml.kernel.org/r/20180125095843.645776917@infradead.org Signed-off-by: David Woodhouse [backport to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 2a1a8737015b..7e653ec47da8 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -8377,13 +8377,13 @@ static void vmx_handle_external_intr(struct kvm_vcpu *vcpu) "pushf\n\t" "orl $0x200, (%%" _ASM_SP ")\n\t" __ASM_SIZE(push) " $%c[cs]\n\t" - "call *%[entry]\n\t" + CALL_NOSPEC : #ifdef CONFIG_X86_64 [sp]"=&r"(tmp) #endif : - [entry]"r"(entry), + THUNK_TARGET(entry), [ss]"i"(__KERNEL_DS), [cs]"i"(__KERNEL_CS) ); -- cgit v1.2.3 From 6cd5513c813eb57eba081563beb817abd9923a3a Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 23 Feb 2018 11:41:54 +0100 Subject: module/retpoline: Warn about missing retpoline in module (cherry picked from commit caf7501a1b4ec964190f31f9c3f163de252273b8) There's a risk that a kernel which has full retpoline mitigations becomes vulnerable when a module gets loaded that hasn't been compiled with the right compiler or the right option. To enable detection of that mismatch at module load time, add a module info string "retpoline" at build time when the module was compiled with retpoline support. This only covers compiled C source, but assembler source or prebuilt object files are not checked. If a retpoline enabled kernel detects a non retpoline protected module at load time, print a warning and report it in the sysfs vulnerability file. 
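[ Editor's note, illustrative only -- not part of the commit. This cgit view
is limited to arch/, so only the arch side (the retpoline_module_ok() hook
below) is shown; the generic side lives in include/linux/module.h and
kernel/module.c. Roughly, and treating the exact call site as an assumption:

	/* every module built by a retpoline-capable compiler is tagged */
	MODULE_INFO(retpoline, "Y");

	/* at load time the module loader asks the arch code whether an
	 * untagged module is acceptable (sketch only) */
	if (!retpoline_module_ok(!!get_modinfo(info, "retpoline")))
		pr_warn("%s: loading module not compiled with retpoline compiler\n",
			mod->name);

When the hook returns false it also sets spectre_v2_bad_module, which the
sysfs spectre_v2 file below reports as " - vulnerable module loaded". ]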
[ tglx: Massaged changelog ] Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Cc: David Woodhouse Cc: gregkh@linuxfoundation.org Cc: torvalds@linux-foundation.org Cc: jeyu@kernel.org Cc: arjan@linux.intel.com Link: https://lkml.kernel.org/r/20180125235028.31211-1-andi@firstfloor.org Signed-off-by: David Woodhouse [jwang: port to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 8cacf62ec458..b86e0a2f1b48 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -93,6 +94,20 @@ static const char *spectre_v2_strings[] = { static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; +static bool spectre_v2_bad_module; + +#ifdef RETPOLINE +bool retpoline_module_ok(bool has_retpoline) +{ + if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline) + return true; + + pr_err("System may be vunerable to spectre v2\n"); + spectre_v2_bad_module = true; + return false; +} +#endif + static void __init spec2_print_if_insecure(const char *reason) { if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) @@ -277,6 +292,7 @@ ssize_t cpu_show_spectre_v2(struct device *dev, if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) return sprintf(buf, "Not affected\n"); - return sprintf(buf, "%s\n", spectre_v2_strings[spectre_v2_enabled]); + return sprintf(buf, "%s%s\n", spectre_v2_strings[spectre_v2_enabled], + spectre_v2_bad_module ? " - vulnerable module loaded" : ""); } #endif -- cgit v1.2.3 From 3d535a0f55d1ba44b66c88d44e592f12056c188b Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 23 Feb 2018 11:41:55 +0100 Subject: x86/nospec: Fix header guards names (cherry picked from commit 7a32fc51ca938e67974cbb9db31e1a43f98345a9) ... to adhere to the _ASM_X86_ naming scheme. No functional change. 
Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: riel@redhat.com Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: David Woodhouse Cc: jikos@kernel.org Cc: luto@amacapital.net Cc: dave.hansen@intel.com Cc: torvalds@linux-foundation.org Cc: keescook@google.com Cc: Josh Poimboeuf Cc: tim.c.chen@linux.intel.com Cc: gregkh@linux-foundation.org Cc: pjt@google.com Link: https://lkml.kernel.org/r/20180126121139.31959-3-bp@alien8.de Signed-off-by: David Woodhouse [cherry-pick to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 492370b9b35b..82d0d6e5ade8 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __NOSPEC_BRANCH_H__ -#define __NOSPEC_BRANCH_H__ +#ifndef _ASM_X86_NOSPEC_BRANCH_H_ +#define _ASM_X86_NOSPEC_BRANCH_H_ #include #include @@ -195,4 +195,4 @@ static inline void vmexit_fill_RSB(void) } #endif /* __ASSEMBLY__ */ -#endif /* __NOSPEC_BRANCH_H__ */ +#endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ -- cgit v1.2.3 From e9560fbe97d0c5da9e7cac0ede8448f0f2b83769 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 23 Feb 2018 11:41:56 +0100 Subject: x86/bugs: Drop one "mitigation" from dmesg (cherry picked from commit 55fa19d3e51f33d9cd4056d25836d93abf9438db) Make [ 0.031118] Spectre V2 mitigation: Mitigation: Full generic retpoline into [ 0.031118] Spectre V2: Mitigation: Full generic retpoline to reduce the mitigation mitigations strings. Signed-off-by: Borislav Petkov Signed-off-by: Thomas Gleixner Reviewed-by: Greg Kroah-Hartman Cc: riel@redhat.com Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: David Woodhouse Cc: jikos@kernel.org Cc: luto@amacapital.net Cc: dave.hansen@intel.com Cc: torvalds@linux-foundation.org Cc: keescook@google.com Cc: Josh Poimboeuf Cc: tim.c.chen@linux.intel.com Cc: pjt@google.com Link: https://lkml.kernel.org/r/20180126121139.31959-5-bp@alien8.de Signed-off-by: David Woodhouse [jwang: port to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index b86e0a2f1b48..847e35629a22 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -90,7 +90,7 @@ static const char *spectre_v2_strings[] = { }; #undef pr_fmt -#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt +#define pr_fmt(fmt) "Spectre V2 : " fmt static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; -- cgit v1.2.3 From e905005d58ebed85108f9a473d4a33127c013fd3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 23 Feb 2018 11:41:57 +0100 Subject: x86/cpu/bugs: Make retpoline module warning conditional MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry picked from commit e383095c7fe8d218e00ec0f83e4b95ed4e627b02) If sysfs is disabled and RETPOLINE not defined: arch/x86/kernel/cpu/bugs.c:97:13: warning: ‘spectre_v2_bad_module’ defined but not used [-Wunused-variable] static bool spectre_v2_bad_module; Hide it. 
Fixes: caf7501a1b4e ("module/retpoline: Warn about missing retpoline in module") Reported-by: Borislav Petkov Signed-off-by: Thomas Gleixner Cc: Andi Kleen Cc: David Woodhouse Signed-off-by: David Woodhouse [jwang: port to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 847e35629a22..e3099c28008c 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -94,9 +94,10 @@ static const char *spectre_v2_strings[] = { static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; -static bool spectre_v2_bad_module; #ifdef RETPOLINE +static bool spectre_v2_bad_module; + bool retpoline_module_ok(bool has_retpoline) { if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline) @@ -106,6 +107,13 @@ bool retpoline_module_ok(bool has_retpoline) spectre_v2_bad_module = true; return false; } + +static inline const char *spectre_v2_module_string(void) +{ + return spectre_v2_bad_module ? " - vulnerable module loaded" : ""; +} +#else +static inline const char *spectre_v2_module_string(void) { return ""; } #endif static void __init spec2_print_if_insecure(const char *reason) @@ -293,6 +301,6 @@ ssize_t cpu_show_spectre_v2(struct device *dev, return sprintf(buf, "Not affected\n"); return sprintf(buf, "%s%s\n", spectre_v2_strings[spectre_v2_enabled], - spectre_v2_bad_module ? " - vulnerable module loaded" : ""); + spectre_v2_module_string()); } #endif -- cgit v1.2.3 From 131f3e886648a186fddb43be72b4b7b091876a1c Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Fri, 23 Feb 2018 11:41:58 +0100 Subject: x86/spectre: Check CONFIG_RETPOLINE in command line parser (cherry picked from commit 9471eee9186a46893726e22ebb54cade3f9bc043) The spectre_v2 option 'auto' does not check whether CONFIG_RETPOLINE is enabled. As a consequence it fails to emit the appropriate warning and sets feature flags which have no effect at all. Add the missing IS_ENABLED() check. 
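[ Editor's note, illustrative only. IS_ENABLED(CONFIG_FOO) expands to a
compile-time 1 when the option is =y or =m and to 0 otherwise, so the check
added below costs nothing on retpoline kernels and is discarded entirely on
kernels built without it:

	if (IS_ENABLED(CONFIG_RETPOLINE))	/* folds to if (0) when	   */
		goto retpoline_auto;		/* CONFIG_RETPOLINE is off */

Without the check, the default "auto" selection jumped to retpoline_auto
even on non-retpoline builds and set feature flags that have no effect. ]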
Fixes: da285121560e ("x86/spectre: Add boot time option to select Spectre v2 mitigation") Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Cc: ak@linux.intel.com Cc: peterz@infradead.org Cc: Tomohiro Cc: dave.hansen@intel.com Cc: bp@alien8.de Cc: arjan@linux.intel.com Cc: dwmw@amazon.co.uk Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/f5892721-7528-3647-08fb-f8d10e65ad87@cn.fujitsu.com Signed-off-by: David Woodhouse [jwang: cherry-pick to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index e3099c28008c..d3b8337afc98 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -213,10 +213,10 @@ static void __init spectre_v2_select_mitigation(void) return; case SPECTRE_V2_CMD_FORCE: - /* FALLTRHU */ case SPECTRE_V2_CMD_AUTO: - goto retpoline_auto; - + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_auto; + break; case SPECTRE_V2_CMD_RETPOLINE_AMD: if (IS_ENABLED(CONFIG_RETPOLINE)) goto retpoline_amd; -- cgit v1.2.3 From f136b56017ad7848449ac8b8aaebc340346acbbd Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 23 Feb 2018 11:42:01 +0100 Subject: x86: Implement array_index_mask_nospec (cherry picked from commit babdde2698d482b6c0de1eab4f697cf5856c5859) array_index_nospec() uses a mask to sanitize user controllable array indexes, i.e. generate a 0 mask if 'index' >= 'size', and a ~0 mask otherwise. While the default array_index_mask_nospec() handles the carry-bit from the (index - size) result in software. The x86 array_index_mask_nospec() does the same, but the carry-bit is handled in the processor CF flag without conditional instructions in the control flow. Suggested-by: Linus Torvalds Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: kernel-hardening@lists.openwall.com Cc: gregkh@linuxfoundation.org Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727414808.33451.1873237130672785331.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse [jwang:chery pick to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/barrier.h | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'arch') diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 0681d2532527..b5028e3ef909 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -24,6 +24,30 @@ #define wmb() asm volatile("sfence" ::: "memory") #endif +/** + * array_index_mask_nospec() - generate a mask that is ~0UL when the + * bounds check succeeds and 0 otherwise + * @index: array element index + * @size: number of elements in array + * + * Returns: + * 0 - (index < size) + */ +static inline unsigned long array_index_mask_nospec(unsigned long index, + unsigned long size) +{ + unsigned long mask; + + asm ("cmp %1,%2; sbb %0,%0;" + :"=r" (mask) + :"r"(size),"r" (index) + :"cc"); + return mask; +} + +/* Override the default implementation from linux/nospec.h. 
*/ +#define array_index_mask_nospec array_index_mask_nospec + #ifdef CONFIG_X86_PPRO_FENCE #define dma_rmb() rmb() #else -- cgit v1.2.3 From 64d41d13ed81d55e03c80d241cf353b1aa0bf1c3 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 23 Feb 2018 11:42:02 +0100 Subject: x86: Introduce barrier_nospec (cherry picked from commit b3d7ad85b80bbc404635dca80f5b129f6242bc7a) Rename the open coded form of this instruction sequence from rdtsc_ordered() into a generic barrier primitive, barrier_nospec(). One of the mitigations for Spectre variant1 vulnerabilities is to fence speculative execution after successfully validating a bounds check. I.e. force the result of a bounds check to resolve in the instruction pipeline to ensure speculative execution honors that result before potentially operating on out-of-bounds data. No functional changes. Suggested-by: Linus Torvalds Suggested-by: Andi Kleen Suggested-by: Ingo Molnar Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Tom Lendacky Cc: Kees Cook Cc: kernel-hardening@lists.openwall.com Cc: gregkh@linuxfoundation.org Cc: Al Viro Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727415361.33451.9049453007262764675.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse [jwang: cherry pick to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/barrier.h | 4 ++++ arch/x86/include/asm/msr.h | 3 +-- 2 files changed, 5 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index b5028e3ef909..814ef83c6720 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -48,6 +48,10 @@ static inline unsigned long array_index_mask_nospec(unsigned long index, /* Override the default implementation from linux/nospec.h. */ #define array_index_mask_nospec array_index_mask_nospec +/* Prevent speculative execution past this barrier. */ +#define barrier_nospec() alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, \ + "lfence", X86_FEATURE_LFENCE_RDTSC) + #ifdef CONFIG_X86_PPRO_FENCE #define dma_rmb() rmb() #else diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 77d8b284e4a7..5a10ac8c131e 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -147,8 +147,7 @@ static __always_inline unsigned long long rdtsc_ordered(void) * that some other imaginary CPU is updating continuously with a * time stamp. */ - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, - "lfence", X86_FEATURE_LFENCE_RDTSC); + barrier_nospec(); return rdtsc(); } -- cgit v1.2.3 From fd3d9535450c3c9b720bae22419c7419f50decf6 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 23 Feb 2018 11:42:03 +0100 Subject: x86/get_user: Use pointer masking to limit speculation (cherry picked from commit c7f631cb07e7da06ac1d231ca178452339e32a94) Quoting Linus: I do think that it would be a good idea to very expressly document the fact that it's not that the user access itself is unsafe. I do agree that things like "get_user()" want to be protected, but not because of any direct bugs or problems with get_user() and friends, but simply because get_user() is an excellent source of a pointer that is obviously controlled from a potentially attacking user space. So it's a prime candidate for then finding _subsequent_ accesses that can then be used to perturb the cache. Unlike the __get_user() case get_user() includes the address limit check near the pointer de-reference. 
With that locality the speculation can be mitigated with pointer narrowing rather than a barrier, i.e. array_index_nospec(). Where the narrowing is performed by: cmp %limit, %ptr sbb %mask, %mask and %mask, %ptr With respect to speculation the value of %ptr is either less than %limit or NULL. Co-developed-by: Linus Torvalds Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: Kees Cook Cc: kernel-hardening@lists.openwall.com Cc: gregkh@linuxfoundation.org Cc: Al Viro Cc: Andy Lutomirski Cc: torvalds@linux-foundation.org Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727417469.33451.11804043010080838495.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse [jwang: port to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/lib/getuser.S | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'arch') diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 46668cda4ffd..490b2ee4e4bb 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -38,6 +38,8 @@ ENTRY(__get_user_1) GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ + and %_ASM_DX, %_ASM_AX ASM_STAC 1: movzbl (%_ASM_AX),%edx xor %eax,%eax @@ -51,6 +53,8 @@ ENTRY(__get_user_2) GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ + and %_ASM_DX, %_ASM_AX ASM_STAC 2: movzwl -1(%_ASM_AX),%edx xor %eax,%eax @@ -64,6 +68,8 @@ ENTRY(__get_user_4) GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ + and %_ASM_DX, %_ASM_AX ASM_STAC 3: movl -3(%_ASM_AX),%edx xor %eax,%eax @@ -78,6 +84,8 @@ ENTRY(__get_user_8) GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ + and %_ASM_DX, %_ASM_AX ASM_STAC 4: movq -7(%_ASM_AX),%rdx xor %eax,%eax @@ -89,6 +97,8 @@ ENTRY(__get_user_8) GET_THREAD_INFO(%_ASM_DX) cmp TI_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user_8 + sbb %_ASM_DX, %_ASM_DX /* array_index_mask_nospec() */ + and %_ASM_DX, %_ASM_AX ASM_STAC 4: movl -7(%_ASM_AX),%edx 5: movl -3(%_ASM_AX),%ecx -- cgit v1.2.3 From c8961332d6da59b8a39998f46831fe7871cd1519 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 23 Feb 2018 11:42:04 +0100 Subject: x86/syscall: Sanitize syscall table de-references under speculation (cherry picked from commit 2fbd7af5af8665d18bcefae3e9700be07e22b681) The syscall table base is a user controlled function pointer in kernel space. Use array_index_nospec() to prevent any out of bounds speculation. While retpoline prevents speculating into a userspace directed target it does not stop the pointer de-reference, the concern is leaking memory relative to the syscall table base, by observing instruction cache behavior. 
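[ Editor's note, illustrative only -- not part of the commit. The
array_index_nospec() helper pairs the x86 array_index_mask_nospec()
introduced earlier in this series with a masking step; stripped of its
type-size checks, the generic macro from include/linux/nospec.h reduces to:

	/* simplified sketch: evaluates to 'index' when index < size and
	 * to 0 otherwise, without a branch the CPU could mispredict */
	#define array_index_nospec(index, size)				\
	({								\
		typeof(index) _i = (index);				\
		typeof(size)  _s = (size);				\
		unsigned long _mask = array_index_mask_nospec(_i, _s);	\
									\
		(typeof(_i)) (_i & _mask);				\
	})

The hunk below applies it to the 32-bit syscall number right after the
bounds check, so a mispredicted "nr < IA32_NR_syscalls" cannot steer the
sys_call_table load; the 64-bit entry path gets an assembly equivalent at
the end of this series. ]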
Reported-by: Linus Torvalds Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: kernel-hardening@lists.openwall.com Cc: gregkh@linuxfoundation.org Cc: Andy Lutomirski Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727417984.33451.1216731042505722161.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse [jwang: port to 4.4, no syscall_64] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/common.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch') diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 1a4477cedc49..b5eb1cca70a0 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -381,6 +382,7 @@ __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) } if (likely(nr < IA32_NR_syscalls)) { + nr = array_index_nospec(nr, IA32_NR_syscalls); /* * It's possible that a 32-bit syscall implementation * takes a 64-bit parameter but nonetheless assumes that -- cgit v1.2.3 From b9c288b664da79d18b71edce8be7640d9ea8c0bf Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 23 Feb 2018 11:42:07 +0100 Subject: x86/spectre: Report get_user mitigation for spectre_v1 (cherry picked from commit edfbae53dab8348fca778531be9f4855d2ca0360) Reflect the presence of get_user(), __get_user(), and 'syscall' protections in sysfs. The expectation is that new and better tooling will allow the kernel to grow more usages of array_index_nospec(), for now, only claim mitigation for __user pointer de-references. Reported-by: Jiri Slaby Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: kernel-hardening@lists.openwall.com Cc: gregkh@linuxfoundation.org Cc: torvalds@linux-foundation.org Cc: alan@linux.intel.com Link: https://lkml.kernel.org/r/151727420158.33451.11658324346540434635.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse [jwang: cherry pick to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index d3b8337afc98..812813e026a3 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -291,7 +291,7 @@ ssize_t cpu_show_spectre_v1(struct device *dev, { if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) return sprintf(buf, "Not affected\n"); - return sprintf(buf, "Vulnerable\n"); + return sprintf(buf, "Mitigation: __user pointer sanitization\n"); } ssize_t cpu_show_spectre_v2(struct device *dev, -- cgit v1.2.3 From bf17809d19146865c29c985e82b0c419147d5b97 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 23 Feb 2018 11:42:08 +0100 Subject: x86/spectre: Fix spelling mistake: "vunerable"-> "vulnerable" (cherry picked from commit e698dcdfcda41efd0984de539767b4cddd235f1e) Trivial fix to spelling mistake in pr_err error message text. 
Signed-off-by: Colin Ian King Signed-off-by: Thomas Gleixner Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: kernel-janitors@vger.kernel.org Cc: Andy Lutomirski Cc: Borislav Petkov Cc: David Woodhouse Link: https://lkml.kernel.org/r/20180130193218.9271-1-colin.king@canonical.com Signed-off-by: David Woodhouse [jwang: cherry pick to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 812813e026a3..6272c207330d 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -103,7 +103,7 @@ bool retpoline_module_ok(bool has_retpoline) if (spectre_v2_enabled == SPECTRE_V2_NONE || has_retpoline) return true; - pr_err("System may be vunerable to spectre v2\n"); + pr_err("System may be vulnerable to spectre v2\n"); spectre_v2_bad_module = true; return false; } -- cgit v1.2.3 From ac0242fe0d9d698dde4a1fc249915af24a2a4c99 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 23 Feb 2018 11:42:09 +0100 Subject: x86/paravirt: Remove 'noreplace-paravirt' cmdline option (cherry picked from commit 12c69f1e94c89d40696e83804dd2f0965b5250cd) The 'noreplace-paravirt' option disables paravirt patching, leaving the original pv indirect calls in place. That's highly incompatible with retpolines, unless we want to uglify paravirt even further and convert the paravirt calls to retpolines. As far as I can tell, the option doesn't seem to be useful for much other than introducing surprising corner cases and making the kernel vulnerable to Spectre v2. It was probably a debug option from the early paravirt days. So just remove it. Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Reviewed-by: Juergen Gross Cc: Andrea Arcangeli Cc: Peter Zijlstra Cc: Andi Kleen Cc: Ashok Raj Cc: Greg KH Cc: Jun Nakajima Cc: Tim Chen Cc: Rusty Russell Cc: Dave Hansen Cc: Asit Mallick Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jason Baron Cc: Paolo Bonzini Cc: Alok Kataria Cc: Arjan Van De Ven Cc: David Woodhouse Cc: Dan Williams Link: https://lkml.kernel.org/r/20180131041333.2x6blhxirc2kclrq@treble Signed-off-by: David Woodhouse [jwang: chery pick to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/alternative.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index d6f375f1b928..89829c3d5a74 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -45,17 +45,6 @@ static int __init setup_noreplace_smp(char *str) } __setup("noreplace-smp", setup_noreplace_smp); -#ifdef CONFIG_PARAVIRT -static int __initdata_or_module noreplace_paravirt = 0; - -static int __init setup_noreplace_paravirt(char *str) -{ - noreplace_paravirt = 1; - return 1; -} -__setup("noreplace-paravirt", setup_noreplace_paravirt); -#endif - #define DPRINTK(fmt, args...) 
\ do { \ if (debug_alternative) \ @@ -587,9 +576,6 @@ void __init_or_module apply_paravirt(struct paravirt_patch_site *start, struct paravirt_patch_site *p; char insnbuf[MAX_PATCH_LEN]; - if (noreplace_paravirt) - return; - for (p = start; p < end; p++) { unsigned int used; -- cgit v1.2.3 From 355e059499da0eca1cd550ffcb3136f442dc7df8 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Fri, 23 Feb 2018 11:42:10 +0100 Subject: x86/kvm: Update spectre-v1 mitigation (cherry picked from commit 085331dfc6bbe3501fb936e657331ca943827600) Commit 75f139aaf896 "KVM: x86: Add memory barrier on vmcs field lookup" added a raw 'asm("lfence");' to prevent a bounds check bypass of 'vmcs_field_to_offset_table'. The lfence can be avoided in this path by using the array_index_nospec() helper designed for these types of fixes. Signed-off-by: Dan Williams Signed-off-by: Thomas Gleixner Acked-by: Paolo Bonzini Cc: Andrew Honig Cc: kvm@vger.kernel.org Cc: Jim Mattson Link: https://lkml.kernel.org/r/151744959670.6342.3001723920950249067.stgit@dwillia2-desk3.amr.corp.intel.com Signed-off-by: David Woodhouse [jwang: cherry pick to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 7e653ec47da8..b1472c75284b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -32,6 +32,7 @@ #include #include #include +#include #include "kvm_cache_regs.h" #include "x86.h" @@ -827,21 +828,18 @@ static const unsigned short vmcs_field_to_offset_table[] = { static inline short vmcs_field_to_offset(unsigned long field) { - BUILD_BUG_ON(ARRAY_SIZE(vmcs_field_to_offset_table) > SHRT_MAX); + const size_t size = ARRAY_SIZE(vmcs_field_to_offset_table); + unsigned short offset; - if (field >= ARRAY_SIZE(vmcs_field_to_offset_table)) + BUILD_BUG_ON(size > SHRT_MAX); + if (field >= size) return -ENOENT; - /* - * FIXME: Mitigation for CVE-2017-5753. To be replaced with a - * generic mechanism. 
- */ - asm("lfence"); - - if (vmcs_field_to_offset_table[field] == 0) + field = array_index_nospec(field, size); + offset = vmcs_field_to_offset_table[field]; + if (offset == 0) return -ENOENT; - - return vmcs_field_to_offset_table[field]; + return offset; } static inline struct vmcs12 *get_vmcs12(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 3fc9b05df62de1877cb69f11368d1936b4f22160 Mon Sep 17 00:00:00 2001 From: KarimAllah Ahmed Date: Fri, 23 Feb 2018 11:42:12 +0100 Subject: x86/spectre: Simplify spectre_v2 command line parsing (cherry picked from commit 9005c6834c0ffdfe46afa76656bd9276cca864f6) [dwmw2: Use ARRAY_SIZE] Signed-off-by: KarimAllah Ahmed Signed-off-by: David Woodhouse Signed-off-by: Thomas Gleixner Cc: peterz@infradead.org Cc: bp@alien8.de Link: https://lkml.kernel.org/r/1517484441-1420-3-git-send-email-dwmw@amazon.co.uk Signed-off-by: David Woodhouse [jwang: cherry pick to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 86 ++++++++++++++++++++++++++++++---------------- 1 file changed, 56 insertions(+), 30 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 6272c207330d..ecaf7c9baf75 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -119,13 +119,13 @@ static inline const char *spectre_v2_module_string(void) { return ""; } static void __init spec2_print_if_insecure(const char *reason) { if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) - pr_info("%s\n", reason); + pr_info("%s selected on command line.\n", reason); } static void __init spec2_print_if_secure(const char *reason) { if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) - pr_info("%s\n", reason); + pr_info("%s selected on command line.\n", reason); } static inline bool retp_compiler(void) @@ -140,42 +140,68 @@ static inline bool match_option(const char *arg, int arglen, const char *opt) return len == arglen && !strncmp(arg, opt, len); } +static const struct { + const char *option; + enum spectre_v2_mitigation_cmd cmd; + bool secure; +} mitigation_options[] = { + { "off", SPECTRE_V2_CMD_NONE, false }, + { "on", SPECTRE_V2_CMD_FORCE, true }, + { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, + { "retpoline,amd", SPECTRE_V2_CMD_RETPOLINE_AMD, false }, + { "retpoline,generic", SPECTRE_V2_CMD_RETPOLINE_GENERIC, false }, + { "auto", SPECTRE_V2_CMD_AUTO, false }, +}; + static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) { char arg[20]; - int ret; - - ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, - sizeof(arg)); - if (ret > 0) { - if (match_option(arg, ret, "off")) { - goto disable; - } else if (match_option(arg, ret, "on")) { - spec2_print_if_secure("force enabled on command line."); - return SPECTRE_V2_CMD_FORCE; - } else if (match_option(arg, ret, "retpoline")) { - spec2_print_if_insecure("retpoline selected on command line."); - return SPECTRE_V2_CMD_RETPOLINE; - } else if (match_option(arg, ret, "retpoline,amd")) { - if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { - pr_err("retpoline,amd selected but CPU is not AMD. 
Switching to AUTO select\n"); - return SPECTRE_V2_CMD_AUTO; - } - spec2_print_if_insecure("AMD retpoline selected on command line."); - return SPECTRE_V2_CMD_RETPOLINE_AMD; - } else if (match_option(arg, ret, "retpoline,generic")) { - spec2_print_if_insecure("generic retpoline selected on command line."); - return SPECTRE_V2_CMD_RETPOLINE_GENERIC; - } else if (match_option(arg, ret, "auto")) { + int ret, i; + enum spectre_v2_mitigation_cmd cmd = SPECTRE_V2_CMD_AUTO; + + if (cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + return SPECTRE_V2_CMD_NONE; + else { + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, + sizeof(arg)); + if (ret < 0) + return SPECTRE_V2_CMD_AUTO; + + for (i = 0; i < ARRAY_SIZE(mitigation_options); i++) { + if (!match_option(arg, ret, mitigation_options[i].option)) + continue; + cmd = mitigation_options[i].cmd; + break; + } + + if (i >= ARRAY_SIZE(mitigation_options)) { + pr_err("unknown option (%s). Switching to AUTO select\n", + mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; } } - if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + if ((cmd == SPECTRE_V2_CMD_RETPOLINE || + cmd == SPECTRE_V2_CMD_RETPOLINE_AMD || + cmd == SPECTRE_V2_CMD_RETPOLINE_GENERIC) && + !IS_ENABLED(CONFIG_RETPOLINE)) { + pr_err("%s selected but not compiled in. Switching to AUTO select\n", + mitigation_options[i].option); return SPECTRE_V2_CMD_AUTO; -disable: - spec2_print_if_insecure("disabled on command line."); - return SPECTRE_V2_CMD_NONE; + } + + if (cmd == SPECTRE_V2_CMD_RETPOLINE_AMD && + boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { + pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); + return SPECTRE_V2_CMD_AUTO; + } + + if (mitigation_options[i].secure) + spec2_print_if_secure(mitigation_options[i].option); + else + spec2_print_if_insecure(mitigation_options[i].option); + + return cmd; } /* Check for Skylake-like CPUs (for RSB handling) */ -- cgit v1.2.3 From fd94ae98d2dd6883ed8c7948dcbb48867894045d Mon Sep 17 00:00:00 2001 From: Darren Kenny Date: Fri, 23 Feb 2018 11:42:13 +0100 Subject: x86/speculation: Fix typo IBRS_ATT, which should be IBRS_ALL (cherry picked from commit af189c95a371b59f493dbe0f50c0a09724868881) Fixes: 117cc7a908c83 ("x86/retpoline: Fill return stack buffer on vmexit") Signed-off-by: Darren Kenny Signed-off-by: Thomas Gleixner Reviewed-by: Konrad Rzeszutek Wilk Cc: Tom Lendacky Cc: Andi Kleen Cc: Borislav Petkov Cc: Masami Hiramatsu Cc: Arjan van de Ven Cc: David Woodhouse Link: https://lkml.kernel.org/r/20180202191220.blvgkgutojecxr3b@starbug-vm.ie.oracle.com Signed-off-by: David Woodhouse [jwang: cherry pick to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 82d0d6e5ade8..66094a0473a8 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -178,7 +178,7 @@ extern char __indirect_thunk_end[]; * On VMEXIT we must ensure that no RSB predictions learned in the guest * can be followed in the host, by overwriting the RSB completely. Both * retpoline and IBRS mitigations for Spectre v2 need this; only on future - * CPUs with IBRS_ATT *might* it be avoided. + * CPUs with IBRS_ALL *might* it be avoided. 
*/ static inline void vmexit_fill_RSB(void) { -- cgit v1.2.3 From e7a3bc31dc0841ff5c3fc544edd94824a8b13b18 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Feb 2018 11:42:14 +0100 Subject: KVM: nVMX: kmap() can't fail commit 42cf014d38d8822cce63703a467e00f65d000952 upstream. kmap() can't fail, therefore it will always return a valid pointer. Let's just get rid of the unnecessary checks. Signed-off-by: David Hildenbrand Signed-off-by: Paolo Bonzini [jwang: port to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 9 --------- 1 file changed, 9 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b1472c75284b..08e26ab07ed7 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4532,10 +4532,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) return 0; vapic_page = kmap(vmx->nested.virtual_apic_page); - if (!vapic_page) { - WARN_ON(1); - return -ENOMEM; - } __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); kunmap(vmx->nested.virtual_apic_page); @@ -9238,11 +9234,6 @@ static inline bool nested_vmx_merge_msr_bitmap(struct kvm_vcpu *vcpu, return false; } msr_bitmap = (unsigned long *)kmap(page); - if (!msr_bitmap) { - nested_release_page_clean(page); - WARN_ON(1); - return false; - } if (nested_cpu_has_virt_x2apic_mode(vmcs12)) { if (nested_cpu_has_apic_reg_virt(vmcs12)) -- cgit v1.2.3 From 04e8b366d3594bc6aaa728e183a13245a7f70653 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Fri, 23 Feb 2018 11:42:15 +0100 Subject: KVM: nVMX: vmx_complete_nested_posted_interrupt() can't fail (cherry picked from commit 6342c50ad12e8ce0736e722184a7dbdea4a3477f) vmx_complete_nested_posted_interrupt() can't fail, let's turn it into a void function. 
Signed-off-by: David Hildenbrand Signed-off-by: Paolo Bonzini Signed-off-by: David Woodhouse [jwang: port to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 08e26ab07ed7..6c753b966c28 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4512,7 +4512,7 @@ static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu) return enable_apicv && lapic_in_kernel(vcpu); } -static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) +static void vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); int max_irr; @@ -4523,13 +4523,13 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmx->nested.pi_pending) { vmx->nested.pi_pending = false; if (!pi_test_and_clear_on(vmx->nested.pi_desc)) - return 0; + return; max_irr = find_last_bit( (unsigned long *)vmx->nested.pi_desc->pir, 256); if (max_irr == 256) - return 0; + return; vapic_page = kmap(vmx->nested.virtual_apic_page); __kvm_apic_update_irr(vmx->nested.pi_desc->pir, vapic_page); @@ -4542,7 +4542,6 @@ static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu) vmcs_write16(GUEST_INTR_STATUS, status); } } - return 0; } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) @@ -10155,7 +10154,8 @@ static int vmx_check_nested_events(struct kvm_vcpu *vcpu, bool external_intr) return 0; } - return vmx_complete_nested_posted_interrupt(vcpu); + vmx_complete_nested_posted_interrupt(vcpu); + return 0; } static u32 vmx_get_preemption_timer_value(struct kvm_vcpu *vcpu) -- cgit v1.2.3 From 82a945257ea995db797401eae023ec667967db18 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Fri, 23 Feb 2018 11:42:16 +0100 Subject: kvm: nVMX: Fix kernel panics induced by illegal INVEPT/INVVPID types commit 85c856b39b479dde410ddd09df1da745343010c9 upstream Bitwise shifts by amounts greater than or equal to the width of the left operand are undefined. A malicious guest can exploit this to crash a 32-bit host, due to the BUG_ON(1)'s in handle_{invept,invvpid}. Signed-off-by: Jim Mattson Message-Id: <1477496318-17681-1-git-send-email-jmattson@google.com> [Change 1UL to 1, to match the range check on the shift count. 
- Paolo] Signed-off-by: Paolo Bonzini [jwang: port from linux-4.9 to 4.4 ] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 6c753b966c28..43c2996c1f73 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -7361,7 +7361,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) types = (vmx->nested.nested_vmx_ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; - if (!(types & (1UL << type))) { + if (type >= 32 || !(types & (1 << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); skip_emulated_instruction(vcpu); @@ -7420,7 +7420,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; - if (!(types & (1UL << type))) { + if (type >= 32 || !(types & (1 << type))) { nested_vmx_failValid(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); skip_emulated_instruction(vcpu); -- cgit v1.2.3 From 6f0a79ff1b62e78b782c4f80298c586aa8495259 Mon Sep 17 00:00:00 2001 From: Jan Dakinevich Date: Fri, 23 Feb 2018 11:42:17 +0100 Subject: KVM: VMX: clean up declaration of VPID/EPT invalidation types MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 63f3ac48133a19110c8a3666028dbd9b1bf3dcb3 upstream - Remove VMX_EPT_EXTENT_INDIVIDUAL_ADDR, since there is no such type of EPT invalidation - Add missing VPID types names Signed-off-by: Jan Dakinevich Tested-by: Ladi Prosek Signed-off-by: Radim Krčmář [jwang: port to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/vmx.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 14c63c7e8337..6b6e16d813b9 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -400,10 +400,11 @@ enum vmcs_field { #define IDENTITY_PAGETABLE_PRIVATE_MEMSLOT (KVM_USER_MEM_SLOTS + 2) #define VMX_NR_VPIDS (1 << 16) +#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_VPID_EXTENT_SINGLE_CONTEXT 1 #define VMX_VPID_EXTENT_ALL_CONTEXT 2 +#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL 3 -#define VMX_EPT_EXTENT_INDIVIDUAL_ADDR 0 #define VMX_EPT_EXTENT_CONTEXT 1 #define VMX_EPT_EXTENT_GLOBAL 2 #define VMX_EPT_EXTENT_SHIFT 24 @@ -420,8 +421,10 @@ enum vmcs_field { #define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) #define VMX_VPID_INVVPID_BIT (1ull << 0) /* (32 - 32) */ +#define VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT (1ull << 8) /* (40 - 32) */ #define VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT (1ull << 9) /* (41 - 32) */ #define VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT (1ull << 10) /* (42 - 32) */ +#define VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT (1ull << 11) /* (43 - 32) */ #define VMX_EPT_DEFAULT_GAW 3 #define VMX_EPT_MAX_GAW 0x4 -- cgit v1.2.3 From 853106cae8900801ebbdf6029c62800a99045d37 Mon Sep 17 00:00:00 2001 From: Jan Dakinevich Date: Fri, 23 Feb 2018 11:42:18 +0100 Subject: KVM: nVMX: invvpid handling improvements MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit bcdde302b8268ef7dbc4ddbdaffb5b44eafe9a1e upstream - Expose all invalidation types to the L1 - Reject invvpid instruction, if L1 passed zero vpid value to single context invalidations Signed-off-by: Jan Dakinevich Tested-by: Ladi Prosek Signed-off-by: Radim Krčmář [jwang: port to 4.4] Signed-off-by: Jack Wang Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 36 
++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) (limited to 'arch') diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 43c2996c1f73..849517805eef 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -126,6 +126,12 @@ module_param_named(pml, enable_pml, bool, S_IRUGO); #define VMX_MISC_EMULATED_PREEMPTION_TIMER_RATE 5 +#define VMX_VPID_EXTENT_SUPPORTED_MASK \ + (VMX_VPID_EXTENT_INDIVIDUAL_ADDR_BIT | \ + VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | \ + VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT | \ + VMX_VPID_EXTENT_SINGLE_NON_GLOBAL_BIT) + /* * These 2 parameters are used to config the controls for Pause-Loop Exiting: * ple_gap: upper bound on the amount of time between two successive @@ -2657,8 +2663,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx) */ if (enable_vpid) vmx->nested.nested_vmx_vpid_caps = VMX_VPID_INVVPID_BIT | - VMX_VPID_EXTENT_SINGLE_CONTEXT_BIT | - VMX_VPID_EXTENT_GLOBAL_CONTEXT_BIT; + VMX_VPID_EXTENT_SUPPORTED_MASK; else vmx->nested.nested_vmx_vpid_caps = 0; @@ -7418,7 +7423,8 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); type = kvm_register_readl(vcpu, (vmx_instruction_info >> 28) & 0xf); - types = (vmx->nested.nested_vmx_vpid_caps >> 8) & 0x7; + types = (vmx->nested.nested_vmx_vpid_caps & + VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; if (type >= 32 || !(types & (1 << type))) { nested_vmx_failValid(vcpu, @@ -7440,21 +7446,27 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) } switch (type) { + case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: case VMX_VPID_EXTENT_SINGLE_CONTEXT: - /* - * Old versions of KVM use the single-context version so we - * have to support it; just treat it the same as all-context. - */ + case VMX_VPID_EXTENT_SINGLE_NON_GLOBAL: + if (!vpid) { + nested_vmx_failValid(vcpu, + VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + skip_emulated_instruction(vcpu); + return 1; + } + break; case VMX_VPID_EXTENT_ALL_CONTEXT: - __vmx_flush_tlb(vcpu, to_vmx(vcpu)->nested.vpid02); - nested_vmx_succeed(vcpu); break; default: - /* Trap individual address invalidation invvpid calls */ - BUG_ON(1); - break; + WARN_ON_ONCE(1); + skip_emulated_instruction(vcpu); + return 1; } + __vmx_flush_tlb(vcpu, vmx->nested.vpid02); + nested_vmx_succeed(vcpu); + skip_emulated_instruction(vcpu); return 1; } -- cgit v1.2.3 From 920a541397f7b897cb2d0db4be3889df332899f7 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Mon, 26 Feb 2018 13:13:17 +1100 Subject: powerpc/64s: Fix RFI flush dependency on HARDLOCKUP_DETECTOR The backport of commit aa8a5e0062ac ("powerpc/64s: Add support for RFI flush of L1-D cache"), incorrectly placed the new RFI flush code inside an existing #ifdef CONFIG_HARDLOCKUP_DETECTOR block. This has the obvious effect of requiring HARDLOCKUP_DETECTOR to be enabled in order for RFI flush to be enabled, which is a bug. Fix it by moving the #endif up to where it belongs. 
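[ Editor's note, illustrative only -- not part of the commit. The nesting
before and after the fix, reduced to its skeleton:

	/* before (backport bug): the whole RFI flush block vanishes
	 * whenever CONFIG_HARDLOCKUP_DETECTOR=n */
	#ifdef CONFIG_HARDLOCKUP_DETECTOR
	...
	early_initcall(disable_hardlockup_detector);

	#ifdef CONFIG_PPC_BOOK3S_64
	...				/* RFI flush setup, cpu_show_meltdown() */
	#endif /* CONFIG_PPC_BOOK3S_64 */
	#endif /* CONFIG_HARDLOCKUP_DETECTOR */

	/* after: the two blocks are independent again */
	#ifdef CONFIG_HARDLOCKUP_DETECTOR
	...
	early_initcall(disable_hardlockup_detector);
	#endif

	#ifdef CONFIG_PPC_BOOK3S_64
	...
	#endif /* CONFIG_PPC_BOOK3S_64 */
]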
Fixes: c3892946315e ("powerpc/64s: Add support for RFI flush of L1-D cache") Reported-by: Bernhard Kaindl Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/setup_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index df4a87eb8da4..9eb469bed22b 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -835,6 +835,7 @@ static int __init disable_hardlockup_detector(void) return 0; } early_initcall(disable_hardlockup_detector); +#endif #ifdef CONFIG_PPC_BOOK3S_64 static enum l1d_flush_type enabled_flush_types; @@ -973,4 +974,3 @@ ssize_t cpu_show_meltdown(struct device *dev, struct device_attribute *attr, cha return sprintf(buf, "Vulnerable\n"); } #endif /* CONFIG_PPC_BOOK3S_64 */ -#endif -- cgit v1.2.3 From e169f13cbc66e941cf3d8eed2757e988730a889a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 20 Feb 2018 21:58:21 +0100 Subject: x86/oprofile: Fix bogus GCC-8 warning in nmi_setup() commit 85c615eb52222bc5fab6c7190d146bc59fac289e upstream. GCC-8 shows a warning for the x86 oprofile code that copies per-CPU data from CPU 0 to all other CPUs, which when building a non-SMP kernel turns into a memcpy() with identical source and destination pointers: arch/x86/oprofile/nmi_int.c: In function 'mux_clone': arch/x86/oprofile/nmi_int.c:285:2: error: 'memcpy' source argument is the same as destination [-Werror=restrict] memcpy(per_cpu(cpu_msrs, cpu).multiplex, ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ per_cpu(cpu_msrs, 0).multiplex, ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ sizeof(struct op_msr) * model->num_virt_counters); ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ arch/x86/oprofile/nmi_int.c: In function 'nmi_setup': arch/x86/oprofile/nmi_int.c:466:3: error: 'memcpy' source argument is the same as destination [-Werror=restrict] arch/x86/oprofile/nmi_int.c:470:3: error: 'memcpy' source argument is the same as destination [-Werror=restrict] I have analyzed a number of such warnings now: some are valid and the GCC warning is welcome. Others turned out to be false-positives, and GCC was changed to not warn about those any more. This is a corner case that is a false-positive but the GCC developers feel it's better to keep warning about it. In this case, it seems best to work around it by telling GCC a little more clearly that this code path is never hit with an IS_ENABLED() configuration check. Cc:stable as we also want old kernels to build cleanly with GCC-8. 
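[ Editor's note, illustrative only -- not part of the commit. On a UP build
per_cpu(cpu_msrs, cpu) with cpu == 0 names the same object as
per_cpu(cpu_msrs, 0), so GCC 8 effectively sees a self-copy; a minimal
stand-alone reproducer of that warning class:

	#include <string.h>

	char msrs[16];

	void clone_msrs(void)
	{
		/* warning: 'memcpy' source argument is the same as
		 * destination [-Wrestrict] */
		memcpy(msrs, msrs, sizeof(msrs));
	}

The IS_ENABLED(CONFIG_SMP) check below lets the compiler prove the
self-copy path is unreachable on UP kernels, silencing the warning without
changing SMP behaviour. ]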
Signed-off-by: Arnd Bergmann Cc: Jessica Yu Cc: Kees Cook Cc: Linus Torvalds Cc: Martin Sebor Cc: Peter Zijlstra Cc: Robert Richter Cc: Thomas Gleixner Cc: oprofile-list@lists.sf.net Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/20180220205826.2008875-1-arnd@arndb.de Link: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=84095 Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/oprofile/nmi_int.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 1d2e6392f5fa..f24bd7249536 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -471,7 +471,7 @@ static int nmi_setup(void) goto fail; for_each_possible_cpu(cpu) { - if (!cpu) + if (!IS_ENABLED(CONFIG_SMP) || !cpu) continue; memcpy(per_cpu(cpu_msrs, cpu).counters, -- cgit v1.2.3 From 56b57bd20f5bcdd353eacf7b7c41ee18ffe0c963 Mon Sep 17 00:00:00 2001 From: Michael Weiser Date: Thu, 1 Feb 2018 23:13:38 +0100 Subject: arm64: Disable unhandled signal log messages by default commit 5ee39a71fd89ab7240c5339d04161c44a8e03269 upstream. aarch64 unhandled signal kernel messages are very verbose, suggesting them to be more of a debugging aid: sigsegv[33]: unhandled level 2 translation fault (11) at 0x00000000, esr 0x92000046, in sigsegv[400000+71000] CPU: 1 PID: 33 Comm: sigsegv Tainted: G W 4.15.0-rc3+ #3 Hardware name: linux,dummy-virt (DT) pstate: 60000000 (nZCv daif -PAN -UAO) pc : 0x4003f4 lr : 0x4006bc sp : 0000fffffe94a060 x29: 0000fffffe94a070 x28: 0000000000000000 x27: 0000000000000000 x26: 0000000000000000 x25: 0000000000000000 x24: 00000000004001b0 x23: 0000000000486ac8 x22: 00000000004001c8 x21: 0000000000000000 x20: 0000000000400be8 x19: 0000000000400b30 x18: 0000000000484728 x17: 000000000865ffc8 x16: 000000000000270f x15: 00000000000000b0 x14: 0000000000000002 x13: 0000000000000001 x12: 0000000000000000 x11: 0000000000000000 x10: 0008000020008008 x9 : 000000000000000f x8 : ffffffffffffffff x7 : 0004000000000000 x6 : ffffffffffffffff x5 : 0000000000000000 x4 : 0000000000000000 x3 : 00000000004003e4 x2 : 0000fffffe94a1e8 x1 : 000000000000000a x0 : 0000000000000000 Disable them by default, so they can be enabled using /proc/sys/debug/exception-trace. Cc: Signed-off-by: Michael Weiser Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/traps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch') diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 9119722eb347..5d270ca76aec 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -49,7 +49,7 @@ static const char *handler[]= { "Error" }; -int show_unhandled_signals = 1; +int show_unhandled_signals = 0; /* * Dump out the contents of some memory nicely... -- cgit v1.2.3 From 5f63882e762ebe312cc58404f959fc3de98b4491 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Fri, 1 Dec 2017 03:51:04 +0100 Subject: ARM: 8731/1: Fix csum_partial_copy_from_user() stack mismatch [ Upstream commit 36b0cb84ee858f02c256d26f0cb4229c78e3399e ] An additional 'ip' will be pushed to the stack, for restoring the DACR later, if CONFIG_CPU_SW_DOMAIN_PAN defined. However, the fixup still get the err_ptr by add #8*4 to sp, which results in the fact that the code area pointed by the LR will be overwritten, or the kernel will crash if CONFIG_DEBUG_RODATA is enabled. This patch fixes the stack mismatch. 
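[ Editor's note, illustrative only -- not part of the commit. Why the
offset moves by one word: err_ptr is the fifth argument of
csum_partial_copy_from_user(), so it sits on the caller's stack above
whatever the entry macro saved, and the PAN-enabled entry saves one extra
word (ip, holding the DACR value to restore):

	@ illustrative stack picture at the 9001: fixup
	@   without CONFIG_CPU_SW_DOMAIN_PAN    with CONFIG_CPU_SW_DOMAIN_PAN
	@   [sp, #8*4]     err_ptr (5th arg)    [sp, #9*4]     err_ptr (5th arg)
	@   [sp, #0..7*4]  8 saved words        [sp, #8*4]     saved ip (DACR)
	@                                       [sp, #0..7*4]  8 saved words

Without the added #ifdef, the PAN build loaded a saved register instead of
err_ptr and stored -EFAULT through it, overwriting the code area at the
saved LR exactly as described above. ]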
Fixes: a5e090acbf54 ("ARM: software-based priviledged-no-access support") Signed-off-by: Lvqiang Huang Signed-off-by: Chunyan Zhang Signed-off-by: Russell King Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/lib/csumpartialcopyuser.S | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch') diff --git a/arch/arm/lib/csumpartialcopyuser.S b/arch/arm/lib/csumpartialcopyuser.S index 1712f132b80d..b83fdc06286a 100644 --- a/arch/arm/lib/csumpartialcopyuser.S +++ b/arch/arm/lib/csumpartialcopyuser.S @@ -85,7 +85,11 @@ .pushsection .text.fixup,"ax" .align 4 9001: mov r4, #-EFAULT +#ifdef CONFIG_CPU_SW_DOMAIN_PAN + ldr r5, [sp, #9*4] @ *err_ptr +#else ldr r5, [sp, #8*4] @ *err_ptr +#endif str r4, [r5] ldmia sp, {r1, r2} @ retrieve dst, len add r2, r2, r1 -- cgit v1.2.3 From 7cb222a5d7b25985bcf2e06ec189c0f08a8ae823 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 21 Dec 2017 22:35:19 +0100 Subject: ARM: dts: ls1021a: fix incorrect clock references [ Upstream commit 506e8a912661c97b41adc8a286b875d01323ec45 ] dtc warns about two 'clocks' properties that have an extraneous '1' at the end: arch/arm/boot/dts/ls1021a-qds.dtb: Warning (clocks_property): arch/arm/boot/dts/ls1021a-twr.dtb: Warning (clocks_property): Property 'clocks', cell 1 is not a phandle reference in /soc/i2c@2180000/mux@77/i2c@4/sgtl5000@2a arch/arm/boot/dts/ls1021a-qds.dtb: Warning (clocks_property): Missing property '#clock-cells' in node /soc/interrupt-controller@1400000 or bad phandle (referred from /soc/i2c@2180000/mux@77/i2c@4/sgtl5000@2a:clocks[1]) Property 'clocks', cell 1 is not a phandle reference in /soc/i2c@2190000/sgtl5000@a arch/arm/boot/dts/ls1021a-twr.dtb: Warning (clocks_property): Missing property '#clock-cells' in node /soc/interrupt-controller@1400000 or bad phandle (referred from /soc/i2c@2190000/sgtl5000@a:clocks[1]) The clocks that get referenced here are fixed-rate, so they do not take any argument, and dtc interprets the next cell as a phandle, which is invalid. Signed-off-by: Arnd Bergmann Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/ls1021a-qds.dts | 2 +- arch/arm/boot/dts/ls1021a-twr.dts | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/boot/dts/ls1021a-qds.dts b/arch/arm/boot/dts/ls1021a-qds.dts index 0521e6864cb7..76fce89d4f69 100644 --- a/arch/arm/boot/dts/ls1021a-qds.dts +++ b/arch/arm/boot/dts/ls1021a-qds.dts @@ -215,7 +215,7 @@ reg = <0x2a>; VDDA-supply = <®_3p3v>; VDDIO-supply = <®_3p3v>; - clocks = <&sys_mclk 1>; + clocks = <&sys_mclk>; }; }; }; diff --git a/arch/arm/boot/dts/ls1021a-twr.dts b/arch/arm/boot/dts/ls1021a-twr.dts index fbb89d13401e..674df87629bd 100644 --- a/arch/arm/boot/dts/ls1021a-twr.dts +++ b/arch/arm/boot/dts/ls1021a-twr.dts @@ -167,7 +167,7 @@ reg = <0x0a>; VDDA-supply = <®_3p3v>; VDDIO-supply = <®_3p3v>; - clocks = <&sys_mclk 1>; + clocks = <&sys_mclk>; }; }; -- cgit v1.2.3 From 7f102d7af10cdbc9a90abb635c790f1071765421 Mon Sep 17 00:00:00 2001 From: Sergei Shtylyov Date: Sat, 6 Jan 2018 21:53:26 +0300 Subject: SolutionEngine771x: fix Ether platform data [ Upstream commit 195e2addbce09e5afbc766efc1e6567c9ce840d3 ] The 'sh_eth' driver's probe() method would fail on the SolutionEngine7710 board and crash on SolutionEngine7712 board as the platform code is hopelessly behind the driver's platform data -- it passes the PHY address instead of 'struct sh_eth_plat_data *'; pass the latter to the driver in order to fix the bug... 
Fixes: 71557a37adb5 ("[netdrvr] sh_eth: Add SH7619 support") Signed-off-by: Sergei Shtylyov Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/sh/boards/mach-se/770x/setup.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/sh/boards/mach-se/770x/setup.c b/arch/sh/boards/mach-se/770x/setup.c index 658326f44df8..5e0267624d8d 100644 --- a/arch/sh/boards/mach-se/770x/setup.c +++ b/arch/sh/boards/mach-se/770x/setup.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include #include @@ -114,6 +115,11 @@ static struct platform_device heartbeat_device = { #if defined(CONFIG_CPU_SUBTYPE_SH7710) ||\ defined(CONFIG_CPU_SUBTYPE_SH7712) /* SH771X Ethernet driver */ +static struct sh_eth_plat_data sh_eth_plat = { + .phy = PHY_ID, + .phy_interface = PHY_INTERFACE_MODE_MII, +}; + static struct resource sh_eth0_resources[] = { [0] = { .start = SH_ETH0_BASE, @@ -131,7 +137,7 @@ static struct platform_device sh_eth0_device = { .name = "sh771x-ether", .id = 0, .dev = { - .platform_data = PHY_ID, + .platform_data = &sh_eth_plat, }, .num_resources = ARRAY_SIZE(sh_eth0_resources), .resource = sh_eth0_resources, @@ -154,7 +160,7 @@ static struct platform_device sh_eth1_device = { .name = "sh771x-ether", .id = 1, .dev = { - .platform_data = PHY_ID, + .platform_data = &sh_eth_plat, }, .num_resources = ARRAY_SIZE(sh_eth1_resources), .resource = sh_eth1_resources, -- cgit v1.2.3 From cacdb0162489b42f6b46e132b4d08860aa9a07c9 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Thu, 7 Dec 2017 07:20:46 +0000 Subject: MIPS: Implement __multi3 for GCC7 MIPS64r6 builds commit ebabcf17bcd7ce968b1631ebe08236275698f39b upstream. GCC7 is a bit too eager to generate suboptimal __multi3 calls (128bit multiply with 128bit result) for MIPS64r6 builds, even in code which doesn't explicitly use 128bit types, such as the following: unsigned long func(unsigned long a, unsigned long b) { return a > (~0UL) / b; } Which GCC rearanges to: return (unsigned __int128)a * (unsigned __int128)b > 0xffffffffffffffff; Therefore implement __multi3, but only for MIPS64r6 with GCC7 as under normal circumstances we wouldn't expect any calls to __multi3 to be generated from kernel code. Reported-by: Thomas Petazzoni Signed-off-by: James Hogan Tested-by: Waldemar Brodkorb Cc: Ralf Baechle Cc: Maciej W. 
Rozycki Cc: Matthew Fortune Cc: Florian Fainelli Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17890/ Cc: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- arch/mips/lib/Makefile | 3 ++- arch/mips/lib/libgcc.h | 17 ++++++++++++++++ arch/mips/lib/multi3.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 arch/mips/lib/multi3.c (limited to 'arch') diff --git a/arch/mips/lib/Makefile b/arch/mips/lib/Makefile index 0344e575f522..fba4ca56e46a 100644 --- a/arch/mips/lib/Makefile +++ b/arch/mips/lib/Makefile @@ -15,4 +15,5 @@ obj-$(CONFIG_CPU_R3000) += r3k_dump_tlb.o obj-$(CONFIG_CPU_TX39XX) += r3k_dump_tlb.o # libgcc-style stuff needed in the kernel -obj-y += ashldi3.o ashrdi3.o bswapsi.o bswapdi.o cmpdi2.o lshrdi3.o ucmpdi2.o +obj-y += ashldi3.o ashrdi3.o bswapsi.o bswapdi.o cmpdi2.o lshrdi3.o multi3.o \ + ucmpdi2.o diff --git a/arch/mips/lib/libgcc.h b/arch/mips/lib/libgcc.h index 05909d58e2fe..56ea0df60a44 100644 --- a/arch/mips/lib/libgcc.h +++ b/arch/mips/lib/libgcc.h @@ -9,10 +9,18 @@ typedef int word_type __attribute__ ((mode (__word__))); struct DWstruct { int high, low; }; + +struct TWstruct { + long long high, low; +}; #elif defined(__LITTLE_ENDIAN) struct DWstruct { int low, high; }; + +struct TWstruct { + long long low, high; +}; #else #error I feel sick. #endif @@ -22,4 +30,13 @@ typedef union { long long ll; } DWunion; +#if defined(CONFIG_64BIT) && defined(CONFIG_CPU_MIPSR6) +typedef int ti_type __attribute__((mode(TI))); + +typedef union { + struct TWstruct s; + ti_type ti; +} TWunion; +#endif + #endif /* __ASM_LIBGCC_H */ diff --git a/arch/mips/lib/multi3.c b/arch/mips/lib/multi3.c new file mode 100644 index 000000000000..111ad475aa0c --- /dev/null +++ b/arch/mips/lib/multi3.c @@ -0,0 +1,54 @@ +// SPDX-License-Identifier: GPL-2.0 +#include + +#include "libgcc.h" + +/* + * GCC 7 suboptimally generates __multi3 calls for mips64r6, so for that + * specific case only we'll implement it here. + * + * See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82981 + */ +#if defined(CONFIG_64BIT) && defined(CONFIG_CPU_MIPSR6) && (__GNUC__ == 7) + +/* multiply 64-bit values, low 64-bits returned */ +static inline long long notrace dmulu(long long a, long long b) +{ + long long res; + + asm ("dmulu %0,%1,%2" : "=r" (res) : "r" (a), "r" (b)); + return res; +} + +/* multiply 64-bit unsigned values, high 64-bits of 128-bit result returned */ +static inline long long notrace dmuhu(long long a, long long b) +{ + long long res; + + asm ("dmuhu %0,%1,%2" : "=r" (res) : "r" (a), "r" (b)); + return res; +} + +/* multiply 128-bit values, low 128-bits returned */ +ti_type notrace __multi3(ti_type a, ti_type b) +{ + TWunion res, aa, bb; + + aa.ti = a; + bb.ti = b; + + /* + * a * b = (a.lo * b.lo) + * + 2^64 * (a.hi * b.lo + a.lo * b.hi) + * [+ 2^128 * (a.hi * b.hi)] + */ + res.s.low = dmulu(aa.s.low, bb.s.low); + res.s.high = dmuhu(aa.s.low, bb.s.low); + res.s.high += dmulu(aa.s.high, bb.s.low); + res.s.high += dmulu(aa.s.low, bb.s.high); + + return res.ti; +} +EXPORT_SYMBOL(__multi3); + +#endif /* 64BIT && CPU_MIPSR6 && GCC7 */ -- cgit v1.2.3 From d95280bd8c786ac5d2f379d451bcbdbdf2211e44 Mon Sep 17 00:00:00 2001 From: Ulf Magnusson Date: Mon, 5 Feb 2018 02:21:13 +0100 Subject: ARM: mvebu: Fix broken PL310_ERRATA_753970 selects commit 8aa36a8dcde3183d84db7b0d622ffddcebb61077 upstream. 
The MACH_ARMADA_375 and MACH_ARMADA_38X boards select ARM_ERRATA_753970, but it was renamed to PL310_ERRATA_753970 by commit fa0ce4035d48 ("ARM: 7162/1: errata: tidy up Kconfig options for PL310 errata workarounds"). Fix the selects to use the new name. Discovered with the https://github.com/ulfalizer/Kconfiglib/blob/master/examples/list_undefined.py script. Fixes: fa0ce4035d48 ("ARM: 7162/1: errata: tidy up Kconfig options for PL310 errata workarounds") cc: stable@vger.kernel.org Signed-off-by: Ulf Magnusson Signed-off-by: Gregory CLEMENT Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-mvebu/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/arm/mach-mvebu/Kconfig b/arch/arm/mach-mvebu/Kconfig index e20fc4178b15..1c8a6098a2ca 100644 --- a/arch/arm/mach-mvebu/Kconfig +++ b/arch/arm/mach-mvebu/Kconfig @@ -37,7 +37,7 @@ config MACH_ARMADA_370 config MACH_ARMADA_375 bool "Marvell Armada 375 boards" if ARCH_MULTI_V7 select ARM_ERRATA_720789 - select ARM_ERRATA_753970 + select PL310_ERRATA_753970 select ARM_GIC select ARMADA_375_CLK select HAVE_ARM_SCU @@ -52,7 +52,7 @@ config MACH_ARMADA_375 config MACH_ARMADA_38X bool "Marvell Armada 380/385 boards" if ARCH_MULTI_V7 select ARM_ERRATA_720789 - select ARM_ERRATA_753970 + select PL310_ERRATA_753970 select ARM_GIC select ARMADA_38X_CLK select HAVE_ARM_SCU -- cgit v1.2.3 From 5fed0b3532cb69b27d286b27ea4377ea44e686e5 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 7 Mar 2018 08:56:23 +0100 Subject: x86/syscall: Sanitize syscall table de-references under speculation fix In 4.4.118, we have commit c8961332d6da (x86/syscall: Sanitize syscall table de-references under speculation), which is a backport of upstream commit 2fbd7af5af86. But it fixed only the C part of the upstream patch -- the IA32 sysentry. So it completely omitted the assembly part -- the 64-bit sysentry. Fix that in this patch by an explicit array_index_mask_nospec written in assembly. The same approach was used in lib/getuser.S. However, to have "sbb" working properly, we have to switch from "cmp" against (NR_syscalls-1) to (NR_syscalls), otherwise the last syscall number would be "and"ed with 0. This is because the original "ja" relies on "CF" or "ZF", but we rely only on "CF" in "sbb". That means: switch to the "jae" conditional jump too. Final note: use rcx for the mask as this is exactly what is overwritten by the 4th syscall argument (r10) right after.
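As a rough illustrative sketch (not part of the patch; the instructions are copied from the diff below, the annotations are mine), the "jae"/"sbb"/"and" sequence behaves like an assembly array_index_mask_nospec():

	cmpq	$NR_syscalls, %rax	/* CF = 1 only if rax < NR_syscalls */
	jae	1f			/* architectural out-of-range exit; the CPU may
					   still speculate past a mispredicted jae */
	sbb	%rcx, %rcx		/* rcx = 0 - CF: all ones if rax is in range,
					   0 if it is not; depends on CF as data, not
					   on the branch outcome */
	and	%rcx, %rax		/* in range: rax unchanged; out of range: rax
					   becomes 0, so a speculative load from
					   sys_call_table cannot index past the end */

Because the mask is derived from the flags rather than from the branch, it is correct even on the speculative path, which is the same trick lib/getuser.S uses.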
Reported-by: Jan Beulich Cc: Linus Torvalds Cc: Dan Williams Cc: Thomas Gleixner Cc: linux-arch@vger.kernel.org Cc: kernel-hardening@lists.openwall.com Cc: gregkh@linuxfoundation.org Cc: Andy Lutomirski Cc: alan@linux.intel.com Cc: Jinpu Wang Signed-off-by: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch') diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a03b22c615d9..59a4e1604a36 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -178,12 +178,14 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) jnz tracesys entry_SYSCALL_64_fastpath: #if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max, %rax + cmpq $NR_syscalls, %rax #else andl $__SYSCALL_MASK, %eax - cmpl $__NR_syscall_max, %eax + cmpl $NR_syscalls, %eax #endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + jae 1f /* return -ENOSYS (already in pt_regs->ax) */ + sbb %rcx, %rcx /* array_index_mask_nospec() */ + and %rcx, %rax movq %r10, %rcx #ifdef CONFIG_RETPOLINE movq sys_call_table(, %rax, 8), %rax @@ -276,12 +278,14 @@ tracesys_phase2: RESTORE_C_REGS_EXCEPT_RAX RESTORE_EXTRA_REGS #if __SYSCALL_MASK == ~0 - cmpq $__NR_syscall_max, %rax + cmpq $NR_syscalls, %rax #else andl $__SYSCALL_MASK, %eax - cmpl $__NR_syscall_max, %eax + cmpl $NR_syscalls, %eax #endif - ja 1f /* return -ENOSYS (already in pt_regs->ax) */ + jae 1f /* return -ENOSYS (already in pt_regs->ax) */ + sbb %rcx, %rcx /* array_index_mask_nospec() */ + and %rcx, %rax movq %r10, %rcx /* fixup for C */ #ifdef CONFIG_RETPOLINE movq sys_call_table(, %rax, 8), %rax -- cgit v1.2.3 From c86bfc7b7b01c4b98c29a39bd60e61fa8e337ebf Mon Sep 17 00:00:00 2001 From: Adam Ford Date: Thu, 25 Jan 2018 14:10:37 -0600 Subject: ARM: dts: LogicPD Torpedo: Fix I2C1 pinmux commit 74402055a2d3ec998a1ded599e86185a27d9bbf4 upstream. The pinmuxing was missing for I2C1, which was causing intermittent issues with the PMIC connected to I2C1. The bootloader did not fully configure I2C1 either, so when running at 2.6MHz, it was generating errors at times.
This correctly sets the I2C1 pinmuxing so it can operate at 2.6MHz. Fixes: 687c27676151 ("ARM: dts: Add minimal support for LogicPD Torpedo DM3730 devkit") Signed-off-by: Adam Ford Signed-off-by: Tony Lindgren Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/logicpd-torpedo-som.dtsi | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch') diff --git a/arch/arm/boot/dts/logicpd-torpedo-som.dtsi b/arch/arm/boot/dts/logicpd-torpedo-som.dtsi index 80f6c786a37e..5562c5feb0be 100644 --- a/arch/arm/boot/dts/logicpd-torpedo-som.dtsi +++ b/arch/arm/boot/dts/logicpd-torpedo-som.dtsi @@ -90,6 +90,8 @@ }; &i2c1 { + pinctrl-names = "default"; + pinctrl-0 = <&i2c1_pins>; clock-frequency = <2600000>; twl: twl@48 { @@ -146,6 +148,12 @@ OMAP3630_CORE2_IOPAD(0x25da, PIN_INPUT_PULLUP | MUX_MODE2) /* etk_ctl.sdmmc3_cmd */ >; }; + i2c1_pins: pinmux_i2c1_pins { + pinctrl-single,pins = < + OMAP3_CORE1_IOPAD(0x21ba, PIN_INPUT | MUX_MODE0) /* i2c1_scl.i2c1_scl */ + OMAP3_CORE1_IOPAD(0x21bc, PIN_INPUT | MUX_MODE0) /* i2c1_sda.i2c1_sda */ + >; + }; }; #include "twl4030.dtsi" -- cgit v1.2.3 From 6bde094049bf3648882ff8fdfabd7871f40d3935 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 7 Mar 2018 18:36:43 +0000 Subject: x86/apic/vector: Handle legacy irq data correctly The backport of upstream commit 45d55e7bac40 ("x86/apic/vector: Fix off by one in error path") failed to fix up the legacy interrupt data, which is no longer available upstream. Handle legacy irq data correctly by clearing the legacy storage to prevent use-after-free. Fixes: 7fd133539289 ("x86/apic/vector: Fix off by one in error path") - 4.4.y Fixes: c557481a9491 ("x86/apic/vector: Fix off by one in error path") - 4.9.y Reported-by: Ben Hutchings Signed-off-by: Thomas Gleixner Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/apic/vector.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index a41e523536a2..592e260ba05b 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -91,8 +91,12 @@ out_data: return NULL; } -static void free_apic_chip_data(struct apic_chip_data *data) +static void free_apic_chip_data(unsigned int virq, struct apic_chip_data *data) { +#ifdef CONFIG_X86_IO_APIC + if (virq < nr_legacy_irqs()) + legacy_irq_data[virq] = NULL; +#endif if (data) { free_cpumask_var(data->domain); free_cpumask_var(data->old_domain); @@ -316,11 +320,7 @@ static void x86_vector_free_irqs(struct irq_domain *domain, apic_data = irq_data->chip_data; irq_domain_reset_irq_data(irq_data); raw_spin_unlock_irqrestore(&vector_lock, flags); - free_apic_chip_data(apic_data); -#ifdef CONFIG_X86_IO_APIC - if (virq + i < nr_legacy_irqs()) - legacy_irq_data[virq + i] = NULL; -#endif + free_apic_chip_data(virq + i, apic_data); } } } @@ -361,7 +361,7 @@ static int x86_vector_alloc_irqs(struct irq_domain *domain, unsigned int virq, err = assign_irq_vector_policy(virq + i, node, data, info); if (err) { irq_data->chip_data = NULL; - free_apic_chip_data(data); + free_apic_chip_data(virq + i, data); goto error; } } -- cgit v1.2.3 From c2da3bb9cfab37eae4ad92d53f8e7a86d5747dd5 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 14 Feb 2018 10:14:17 +0300 Subject: x86/spectre: Fix an error message commit 9de29eac8d2189424d81c0d840cd0469aa3d41c8 upstream.
If i == ARRAY_SIZE(mitigation_options) then we accidentally print garbage from one space beyond the end of the mitigation_options[] array. Signed-off-by: Dan Carpenter Cc: Andy Lutomirski Cc: Borislav Petkov Cc: David Woodhouse Cc: Greg Kroah-Hartman Cc: KarimAllah Ahmed Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-janitors@vger.kernel.org Fixes: 9005c6834c0f ("x86/spectre: Simplify spectre_v2 command line parsing") Link: http://lkml.kernel.org/r/20180214071416.GA26677@mwanda Signed-off-by: Ingo Molnar Cc: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch') diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ecaf7c9baf75..2bbc74f8a4a8 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -175,8 +175,7 @@ static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) } if (i >= ARRAY_SIZE(mitigation_options)) { - pr_err("unknown option (%s). Switching to AUTO select\n", - mitigation_options[i].option); + pr_err("unknown option (%s). Switching to AUTO select\n", arg); return SPECTRE_V2_CMD_AUTO; } } -- cgit v1.2.3 From ea1c4ebe282d6bb6afca4a42bfbfb933c86b264c Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Thu, 8 Mar 2018 16:17:34 +0100 Subject: bpf, x64: implement retpoline for tail call [ upstream commit a493a87f38cfa48caaa95c9347be2d914c6fdf29 ] Implement a retpoline [0] for the BPF tail call JIT'ing that converts the indirect jump via jmp %rax that is used to make the long jump into another JITed BPF image. Since this is subject to speculative execution, we need to control the transient instruction sequence here as well when CONFIG_RETPOLINE is set, and direct it into a pause + lfence loop. The latter aligns also with what gcc / clang emits (e.g. [1]). JIT dump after patch: # bpftool p d x i 1 0: (18) r2 = map[id:1] 2: (b7) r3 = 0 3: (85) call bpf_tail_call#12 4: (b7) r0 = 2 5: (95) exit With CONFIG_RETPOLINE: # bpftool p d j i 1 [...] 33: cmp %edx,0x24(%rsi) 36: jbe 0x0000000000000072 |* 38: mov 0x24(%rbp),%eax 3e: cmp $0x20,%eax 41: ja 0x0000000000000072 | 43: add $0x1,%eax 46: mov %eax,0x24(%rbp) 4c: mov 0x90(%rsi,%rdx,8),%rax 54: test %rax,%rax 57: je 0x0000000000000072 | 59: mov 0x28(%rax),%rax 5d: add $0x25,%rax 61: callq 0x000000000000006d |+ 66: pause | 68: lfence | 6b: jmp 0x0000000000000066 | 6d: mov %rax,(%rsp) | 71: retq | 72: mov $0x2,%eax [...] * relative fall-through jumps in error case + retpoline for indirect jump Without CONFIG_RETPOLINE: # bpftool p d j i 1 [...] 33: cmp %edx,0x24(%rsi) 36: jbe 0x0000000000000063 |* 38: mov 0x24(%rbp),%eax 3e: cmp $0x20,%eax 41: ja 0x0000000000000063 | 43: add $0x1,%eax 46: mov %eax,0x24(%rbp) 4c: mov 0x90(%rsi,%rdx,8),%rax 54: test %rax,%rax 57: je 0x0000000000000063 | 59: mov 0x28(%rax),%rax 5d: add $0x25,%rax 61: jmpq *%rax |- 63: mov $0x2,%eax [...] 
* relative fall-through jumps in error case - plain indirect jump as before [0] https://support.google.com/faqs/answer/7625886 [1] https://github.com/gcc-mirror/gcc/commit/a31e654fa107be968b802786d747e962c2fcdb2b Signed-off-by: Daniel Borkmann Signed-off-by: Alexei Starovoitov Signed-off-by: Daniel Borkmann Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 37 ++++++++++++++++++++++++++++++++++++ arch/x86/net/bpf_jit_comp.c | 9 +++++---- 2 files changed, 42 insertions(+), 4 deletions(-) (limited to 'arch') diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 66094a0473a8..249f1c769f21 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -195,4 +195,41 @@ static inline void vmexit_fill_RSB(void) } #endif /* __ASSEMBLY__ */ + +/* + * Below is used in the eBPF JIT compiler and emits the byte sequence + * for the following assembly: + * + * With retpolines configured: + * + * callq do_rop + * spec_trap: + * pause + * lfence + * jmp spec_trap + * do_rop: + * mov %rax,(%rsp) + * retq + * + * Without retpolines configured: + * + * jmp *%rax + */ +#ifdef CONFIG_RETPOLINE +# define RETPOLINE_RAX_BPF_JIT_SIZE 17 +# define RETPOLINE_RAX_BPF_JIT() \ + EMIT1_off32(0xE8, 7); /* callq do_rop */ \ + /* spec_trap: */ \ + EMIT2(0xF3, 0x90); /* pause */ \ + EMIT3(0x0F, 0xAE, 0xE8); /* lfence */ \ + EMIT2(0xEB, 0xF9); /* jmp spec_trap */ \ + /* do_rop: */ \ + EMIT4(0x48, 0x89, 0x04, 0x24); /* mov %rax,(%rsp) */ \ + EMIT1(0xC3); /* retq */ +#else +# define RETPOLINE_RAX_BPF_JIT_SIZE 2 +# define RETPOLINE_RAX_BPF_JIT() \ + EMIT2(0xFF, 0xE0); /* jmp *%rax */ +#endif + #endif /* _ASM_X86_NOSPEC_BRANCH_H_ */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 33c42b826791..a889211e21c5 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -12,6 +12,7 @@ #include #include #include +#include #include int bpf_jit_enable __read_mostly; @@ -269,7 +270,7 @@ static void emit_bpf_tail_call(u8 **pprog) EMIT2(0x89, 0xD2); /* mov edx, edx */ EMIT3(0x39, 0x56, /* cmp dword ptr [rsi + 16], edx */ offsetof(struct bpf_array, map.max_entries)); -#define OFFSET1 43 /* number of bytes to jump */ +#define OFFSET1 (41 + RETPOLINE_RAX_BPF_JIT_SIZE) /* number of bytes to jump */ EMIT2(X86_JBE, OFFSET1); /* jbe out */ label1 = cnt; @@ -278,7 +279,7 @@ static void emit_bpf_tail_call(u8 **pprog) */ EMIT2_off32(0x8B, 0x85, -STACKSIZE + 36); /* mov eax, dword ptr [rbp - 516] */ EMIT3(0x83, 0xF8, MAX_TAIL_CALL_CNT); /* cmp eax, MAX_TAIL_CALL_CNT */ -#define OFFSET2 32 +#define OFFSET2 (30 + RETPOLINE_RAX_BPF_JIT_SIZE) EMIT2(X86_JA, OFFSET2); /* ja out */ label2 = cnt; EMIT3(0x83, 0xC0, 0x01); /* add eax, 1 */ @@ -292,7 +293,7 @@ static void emit_bpf_tail_call(u8 **pprog) * goto out; */ EMIT3(0x48, 0x85, 0xC0); /* test rax,rax */ -#define OFFSET3 10 +#define OFFSET3 (8 + RETPOLINE_RAX_BPF_JIT_SIZE) EMIT2(X86_JE, OFFSET3); /* je out */ label3 = cnt; @@ -305,7 +306,7 @@ static void emit_bpf_tail_call(u8 **pprog) * rdi == ctx (1st arg) * rax == prog->bpf_func + prologue_size */ - EMIT2(0xFF, 0xE0); /* jmp rax */ + RETPOLINE_RAX_BPF_JIT(); /* out: */ BUILD_BUG_ON(cnt - label1 != OFFSET1); -- cgit v1.2.3
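As a hand-assembled accounting sketch (derived from the EMIT*() sequences in the hunk above; the byte values are illustrative, taken from those macros), the retpoline thunk replaces a 2-byte indirect jump with a 17-byte sequence, which is why RETPOLINE_RAX_BPF_JIT_SIZE is 17 and the fall-through offsets become 41, 30 and 8 plus that size instead of the old 43, 32 and 10:

	e8 07 00 00 00		/* callq  do_rop      - 5 bytes */
	f3 90			/* pause  (spec_trap) - 2 bytes */
	0f ae e8		/* lfence             - 3 bytes */
	eb f9			/* jmp    spec_trap   - 2 bytes */
	48 89 04 24		/* mov    %rax,(%rsp) - 4 bytes (do_rop) */
	c3			/* retq               - 1 byte  */
				/* total              - 17 bytes */

	ff e0			/* jmp    *%rax       - 2 bytes in the non-retpoline case */

The old offsets already included the 2 bytes of "jmp *%rax" (43 = 41 + 2, and so on), so expressing them as a base plus RETPOLINE_RAX_BPF_JIT_SIZE keeps both configurations correct.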