From fdbd518adfaf2c10903250dffe70c61ed90b7dd7 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 3 Feb 2017 01:58:03 -0700 Subject: x86/entry/32: Relax a pvops stub clobber specification The code at .Lrestore_nocheck does not make any assumptions on register values, so all registers can be clobbered on code paths leading there. Signed-off-by: Jan Beulich Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5894542B02000078001366C5@prv-mh.provo.novell.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_32.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 57f7ec35216e..5553475f6008 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -585,7 +585,7 @@ ENTRY(iret_exc ) * will soon execute iret and the tracer was already set to * the irqstate after the IRET: */ - DISABLE_INTERRUPTS(CLBR_EAX) + DISABLE_INTERRUPTS(CLBR_ANY) lss (%esp), %esp /* switch to espfix segment */ jmp .Lrestore_nocheck #endif -- cgit v1.2.3 From 2140a9942b84dd4bf559dd1215b8f43c36ece5b5 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 3 Feb 2017 02:03:25 -0700 Subject: x86/entry/64: Relax pvops stub clobber specifications Except for the error_exit case, none of the code paths following the {DIS,EN}ABLE_INTERRUPTS() invocations being modified here make any assumptions on register values, so all registers can be clobbered there. In the error_exit case a minor adjustment to register usage (at once eliminating an instruction) also allows for this to be true. Signed-off-by: Jan Beulich Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5894556D02000078001366D3@prv-mh.provo.novell.com Signed-off-by: Ingo Molnar --- arch/x86/entry/entry_64.S | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 044d18ebc43c..d2b2a2948ffe 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -212,7 +212,7 @@ entry_SYSCALL_64_fastpath: * If we see that no exit work is required (which we are required * to check with IRQs off), then we can go straight to SYSRET64. */ - DISABLE_INTERRUPTS(CLBR_NONE) + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movq PER_CPU_VAR(current_task), %r11 testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11) @@ -233,7 +233,7 @@ entry_SYSCALL_64_fastpath: * raise(3) will trigger this, for example. IRQs are off. 
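For reference, the CLBR_* arguments used in these two patches are bitmasks telling the paravirt patching machinery which call-clobbered registers the call site allows a pvops stub to destroy. A minimal standalone sketch of the 32-bit masks (values paraphrased from the arch/x86/include/asm/paravirt_types.h of this era; illustrative, not authoritative):

    #include <assert.h>
    #include <stdio.h>

    #define CLBR_NONE 0x0
    #define CLBR_EAX  0x1
    #define CLBR_ECX  0x2
    #define CLBR_EDX  0x4
    #define CLBR_EDI  0x8
    #define CLBR_ANY  0xf   /* every call-clobbered i386 register */

    int main(void)
    {
            /* CLBR_ANY is exactly the union of the per-register masks. */
            assert(CLBR_ANY == (CLBR_EAX | CLBR_ECX | CLBR_EDX | CLBR_EDI));

            /*
             * DISABLE_INTERRUPTS(CLBR_EAX) promises that the patched-in
             * stub clobbers at most %eax, forcing saves elsewhere; on a
             * path that reloads every register anyway, CLBR_ANY drops
             * that promise and the save/restore overhead with it.
             */
            printf("CLBR_ANY = %#x\n", CLBR_ANY);
            return 0;
    }
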
*/ TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) + ENABLE_INTERRUPTS(CLBR_ANY) SAVE_EXTRA_REGS movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ @@ -343,7 +343,7 @@ ENTRY(stub_ptregs_64) * Called from fast path -- disable IRQs again, pop return address * and jump to slow path */ - DISABLE_INTERRUPTS(CLBR_NONE) + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF popq %rax jmp entry_SYSCALL64_slow_path @@ -518,7 +518,7 @@ common_interrupt: interrupt do_IRQ /* 0(%rsp): old RSP */ ret_from_intr: - DISABLE_INTERRUPTS(CLBR_NONE) + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF decl PER_CPU_VAR(irq_count) @@ -1051,7 +1051,7 @@ END(paranoid_entry) * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) - DISABLE_INTERRUPTS(CLBR_NONE) + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF_DEBUG testl %ebx, %ebx /* swapgs needed? */ jnz paranoid_exit_no_swapgs @@ -1156,10 +1156,9 @@ END(error_entry) * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode */ ENTRY(error_exit) - movl %ebx, %eax - DISABLE_INTERRUPTS(CLBR_NONE) + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - testl %eax, %eax + testl %ebx, %ebx jnz retint_kernel jmp retint_user END(error_exit) -- cgit v1.2.3 From f25d384755191690b1196776d319cb6a4e899f28 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 9 Feb 2017 01:34:49 +0100 Subject: x86/asm: Optimize clear_page() Currently, we CALL clear_page() which then JMPs to the proper function chosen by the alternatives. What we should do instead is CALL the proper function directly. (This was something Ingo suggested a while ago). So let's do that. Measuring our favourite kernel build workload shows that there are no significant changes in performance. AMD === -- /tmp/before 2017-02-09 18:01:46.451961188 +0100 ++ /tmp/after 2017-02-09 18:01:54.883961175 +0100 @@ -1,15 +1,15 @@ Performance counter stats for 'system wide' (5 runs): - 1028960.373643 cpu-clock (msec) # 6.000 CPUs utilized ( +- 1.41% ) + 1023086.018961 cpu-clock (msec) # 6.000 CPUs utilized ( +- 1.20% ) - 518,744 context-switches # 0.504 K/sec ( +- 1.04% ) + 518,254 context-switches # 0.507 K/sec ( +- 1.01% ) - 38,112 cpu-migrations # 0.037 K/sec ( +- 1.95% ) + 37,917 cpu-migrations # 0.037 K/sec ( +- 1.02% ) - 20,874,266 page-faults # 0.020 M/sec ( +- 0.07% ) + 20,918,897 page-faults # 0.020 M/sec ( +- 0.18% ) - 2,043,646,230,667 cycles # 1.986 GHz ( +- 0.14% ) (66.67%) + 2,045,305,584,032 cycles # 1.999 GHz ( +- 0.16% ) (66.67%) - 553,698,855,431 stalled-cycles-frontend # 27.09% frontend cycles idle ( +- 0.07% ) (66.67%) + 555,099,401,413 stalled-cycles-frontend # 27.14% frontend cycles idle ( +- 0.13% ) (66.67%) - 621,544,286,390 stalled-cycles-backend # 30.41% backend cycles idle ( +- 0.39% ) (66.67%) + 621,371,430,254 stalled-cycles-backend # 30.38% backend cycles idle ( +- 0.32% ) (66.67%) - 1,738,364,431,659 instructions # 0.85 insn per cycle + 1,739,895,771,901 instructions # 0.85 insn per cycle - # 0.36 stalled cycles per insn ( +- 0.11% ) (66.67%) + # 0.36 stalled cycles per insn ( +- 0.13% ) (66.67%) - 391,170,943,850 branches # 380.161 M/sec ( +- 0.13% ) (66.67%) + 391,398,551,757 branches # 382.567 M/sec ( +- 0.13% ) (66.67%) - 22,567,810,411 branch-misses # 5.77% of all branches ( +- 0.11% ) (66.67%) + 22,574,726,683 branch-misses # 5.77% of all branches ( +- 0.13% ) (66.67%) - 171.480741921 seconds time elapsed ( +- 1.41% ) + 170.509229451 seconds time elapsed ( +- 1.20% ) Intel ===== -- /tmp/before 2017-02-09 
20:36:19.851947473 +0100 ++ /tmp/after 2017-02-09 20:36:30.151947458 +0100 @@ -1,15 +1,15 @@ Performance counter stats for 'system wide' (5 runs): - 2207248.598126 cpu-clock (msec) # 8.000 CPUs utilized ( +- 0.69% ) + 2213300.106631 cpu-clock (msec) # 8.000 CPUs utilized ( +- 0.73% ) - 899,342 context-switches # 0.407 K/sec ( +- 0.68% ) + 898,381 context-switches # 0.406 K/sec ( +- 0.79% ) - 80,553 cpu-migrations # 0.036 K/sec ( +- 1.13% ) + 80,979 cpu-migrations # 0.037 K/sec ( +- 1.11% ) - 36,171,148 page-faults # 0.016 M/sec ( +- 0.02% ) + 36,179,791 page-faults # 0.016 M/sec ( +- 0.02% ) - 6,665,288,826,484 cycles # 3.020 GHz ( +- 0.07% ) (83.33%) + 6,671,638,410,799 cycles # 3.014 GHz ( +- 0.06% ) (83.33%) - 5,065,975,115,197 stalled-cycles-frontend # 76.01% frontend cycles idle ( +- 0.11% ) (83.33%) + 5,076,835,183,223 stalled-cycles-frontend # 76.10% frontend cycles idle ( +- 0.11% ) (83.33%) - 3,841,556,350,614 stalled-cycles-backend # 57.64% backend cycles idle ( +- 0.13% ) (66.67%) + 3,852,823,974,333 stalled-cycles-backend # 57.75% backend cycles idle ( +- 0.12% ) (66.67%) - 4,148,398,171,079 instructions # 0.62 insn per cycle + 4,148,997,156,059 instructions # 0.62 insn per cycle - # 1.22 stalled cycles per insn ( +- 0.10% ) (83.33%) + # 1.22 stalled cycles per insn ( +- 0.11% ) (83.33%) - 887,187,118,591 branches # 401.943 M/sec ( +- 0.09% ) (83.33%) + 887,271,341,121 branches # 400.882 M/sec ( +- 0.11% ) (83.33%) - 30,139,439,034 branch-misses # 3.40% of all branches ( +- 0.09% ) (83.33%) + 30,134,864,997 branch-misses # 3.40% of all branches ( +- 0.06% ) (83.33%) - 275.904405540 seconds time elapsed ( +- 0.69% ) + 276.660352016 seconds time elapsed ( +- 0.73% ) allmodconfig vmlinux size grows by a ~1Kb but that's fine - we optimize our calling of the clear_page variants. text data bss dec hex filename 9051979 23067670 27009024 59128673 3863b61 vmlinux 9053000 23067670 27009024 59129694 3863f5e vmlinux.clear_page Reported-by: kernel test robot Tested-by: Fengguang Wu Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170215111927.emdgxf2pide3kwro@pd.tnic Signed-off-by: Ingo Molnar --- arch/x86/include/asm/page_64.h | 16 +++++++++++++++- arch/x86/lib/clear_page_64.S | 17 +++++++---------- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index b3bebf9e5746..b4a0d43248cf 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -4,6 +4,7 @@ #include #ifndef __ASSEMBLY__ +#include /* duplicated to the one in bootmem.h */ extern unsigned long max_pfn; @@ -34,7 +35,20 @@ extern unsigned long __phys_addr_symbol(unsigned long); #define pfn_valid(pfn) ((pfn) < max_pfn) #endif -void clear_page(void *page); +void clear_page_orig(void *page); +void clear_page_rep(void *page); +void clear_page_erms(void *page); + +static inline void clear_page(void *page) +{ + alternative_call_2(clear_page_orig, + clear_page_rep, X86_FEATURE_REP_GOOD, + clear_page_erms, X86_FEATURE_ERMS, + "=D" (page), + "0" (page) + : "memory", "rax", "rcx"); +} + void copy_page(void *to, void *from); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index 5e2af3a88cf5..81b1635d67de 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -14,20 +14,15 @@ * Zero a page. * %rdi - page */ -ENTRY(clear_page) - - ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \ - "jmp clear_page_c_e", X86_FEATURE_ERMS - +ENTRY(clear_page_rep) movl $4096/8,%ecx xorl %eax,%eax rep stosq ret -ENDPROC(clear_page) -EXPORT_SYMBOL(clear_page) +ENDPROC(clear_page_rep) +EXPORT_SYMBOL_GPL(clear_page_rep) ENTRY(clear_page_orig) - xorl %eax,%eax movl $4096/64,%ecx .p2align 4 @@ -47,10 +42,12 @@ ENTRY(clear_page_orig) nop ret ENDPROC(clear_page_orig) +EXPORT_SYMBOL_GPL(clear_page_orig) -ENTRY(clear_page_c_e) +ENTRY(clear_page_erms) movl $4096,%ecx xorl %eax,%eax rep stosb ret -ENDPROC(clear_page_c_e) +ENDPROC(clear_page_erms) +EXPORT_SYMBOL_GPL(clear_page_erms) -- cgit v1.2.3 From db65d7b6dcd313ad54c3589813f70b805dbb48ec Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 23 Mar 2017 10:33:48 -0400 Subject: x86/ftrace: Rename mcount_64.S to ftrace_64.S With the advent of -mfentry that uses the new "fentry" hook over mcount, the mcount name is obsolete. Having the code file that ftrace hooks into called "mcount*.S" is rather misleading. Rename it to ftrace_64.S and remove the file name reference. 
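The difference between the two hooks is easy to reproduce in userspace. A hypothetical demo (the empty hook bodies and the notrace macro are stand-ins; the kernel's real hooks, defined in this file, must preserve every register, which is what save_mcount_regs exists for):

    /*
     * Hypothetical demo.  Build one of:
     *   gcc -O2 -pg -mfentry demo.c   # "call __fentry__" is the first insn
     *                                 # of traced(), before any stack frame
     *   gcc -O2 -pg demo.c            # "call mcount" lands after the prologue
     * and compare the entry of traced() in "objdump -d a.out".
     */
    #define notrace __attribute__((no_instrument_function))

    /* Stand-ins for the hooks; real ones must preserve all registers. */
    notrace void __fentry__(void) { }
    notrace void mcount(void)     { }

    int traced(int x)
    {
            return x * 2;   /* the compiler injects the hook call at entry */
    }

    notrace int main(void)
    {
            return traced(21) == 42 ? 0 : 1;
    }
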
Signed-off-by: Steven Rostedt (VMware) Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Masami Hiramatsu Cc: Josh Poimboeuf Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20170323143445.490601451@goodmis.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 4 +- arch/x86/kernel/ftrace_64.S | 336 +++++++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/mcount_64.S | 338 -------------------------------------------- 3 files changed, 338 insertions(+), 340 deletions(-) create mode 100644 arch/x86/kernel/ftrace_64.S delete mode 100644 arch/x86/kernel/mcount_64.S diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 84c00592d359..d3743a37e990 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -27,7 +27,7 @@ KASAN_SANITIZE_stacktrace.o := n OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y -OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o := y +OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y # If instrumentation of this dir is enabled, boot hangs during first second. @@ -46,7 +46,7 @@ obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o -obj-$(CONFIG_X86_64) += sys_x86_64.o mcount_64.o +obj-$(CONFIG_X86_64) += sys_x86_64.o ftrace_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S new file mode 100644 index 000000000000..d1be3cbe2cd4 --- /dev/null +++ b/arch/x86/kernel/ftrace_64.S @@ -0,0 +1,336 @@ +/* + * Copyright (C) 2014 Steven Rostedt, Red Hat Inc + */ + +#include +#include +#include +#include + + + .code64 + .section .entry.text, "ax" + + +#ifdef CONFIG_FUNCTION_TRACER + +#ifdef CC_USING_FENTRY +# define function_hook __fentry__ +EXPORT_SYMBOL(__fentry__) +#else +# define function_hook mcount +EXPORT_SYMBOL(mcount) +#endif + +/* All cases save the original rbp (8 bytes) */ +#ifdef CONFIG_FRAME_POINTER +# ifdef CC_USING_FENTRY +/* Save parent and function stack frames (rip and rbp) */ +# define MCOUNT_FRAME_SIZE (8+16*2) +# else +/* Save just function stack frame (rip and rbp) */ +# define MCOUNT_FRAME_SIZE (8+16) +# endif +#else +/* No need to save a stack frame */ +# define MCOUNT_FRAME_SIZE 8 +#endif /* CONFIG_FRAME_POINTER */ + +/* Size of stack used to save mcount regs in save_mcount_regs */ +#define MCOUNT_REG_SIZE (SS+8 + MCOUNT_FRAME_SIZE) + +/* + * gcc -pg option adds a call to 'mcount' in most functions. + * When -mfentry is used, the call is to 'fentry' and not 'mcount' + * and is done before the function's stack frame is set up. + * They both require a set of regs to be saved before calling + * any C code and restored before returning back to the function. + * + * On boot up, all these calls are converted into nops. When tracing + * is enabled, the call can jump to either ftrace_caller or + * ftrace_regs_caller. Callbacks (tracing functions) that require + * ftrace_regs_caller (like kprobes) need to have pt_regs passed to + * it. For this reason, the size of the pt_regs structure will be + * allocated on the stack and the required mcount registers will + * be saved in the locations that pt_regs has them in. 
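That pt_regs-shaped frame is what lets register-inspecting callbacks work. A minimal sketch of a module against the 4.11-era API (the callback signature, flag and registration calls are the real ones; the choice of "schedule" as the filter is arbitrary):

    #include <linux/ftrace.h>
    #include <linux/module.h>
    #include <linux/string.h>

    static void notrace my_func(unsigned long ip, unsigned long parent_ip,
                                struct ftrace_ops *ops, struct pt_regs *regs)
    {
            /*
             * regs is a full pt_regs only when FTRACE_OPS_FL_SAVE_REGS
             * routed us through ftrace_regs_caller; the plain
             * ftrace_caller path passes NULL in the 4th parameter.
             */
            if (regs)
                    pr_info("hit %pS, sp=%lx\n", (void *)ip, regs->sp);
    }

    static struct ftrace_ops my_ops = {
            .func  = my_func,
            .flags = FTRACE_OPS_FL_SAVE_REGS,
    };

    static int __init demo_init(void)
    {
            ftrace_set_filter(&my_ops, "schedule", strlen("schedule"), 0);
            return register_ftrace_function(&my_ops);
    }

    static void __exit demo_exit(void)
    {
            unregister_ftrace_function(&my_ops);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");
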
+ */ + +/* + * @added: the amount of stack added before calling this + * + * After this is called, the following registers contain: + * + * %rdi - holds the address that called the trampoline + * %rsi - holds the parent function (traced function's return address) + * %rdx - holds the original %rbp + */ +.macro save_mcount_regs added=0 + + /* Always save the original rbp */ + pushq %rbp + +#ifdef CONFIG_FRAME_POINTER + /* + * Stack traces will stop at the ftrace trampoline if the frame pointer + * is not set up properly. If fentry is used, we need to save a frame + * pointer for the parent as well as the function traced, because the + * fentry is called before the stack frame is set up, where as mcount + * is called afterward. + */ +#ifdef CC_USING_FENTRY + /* Save the parent pointer (skip orig rbp and our return address) */ + pushq \added+8*2(%rsp) + pushq %rbp + movq %rsp, %rbp + /* Save the return address (now skip orig rbp, rbp and parent) */ + pushq \added+8*3(%rsp) +#else + /* Can't assume that rip is before this (unless added was zero) */ + pushq \added+8(%rsp) +#endif + pushq %rbp + movq %rsp, %rbp +#endif /* CONFIG_FRAME_POINTER */ + + /* + * We add enough stack to save all regs. + */ + subq $(MCOUNT_REG_SIZE - MCOUNT_FRAME_SIZE), %rsp + movq %rax, RAX(%rsp) + movq %rcx, RCX(%rsp) + movq %rdx, RDX(%rsp) + movq %rsi, RSI(%rsp) + movq %rdi, RDI(%rsp) + movq %r8, R8(%rsp) + movq %r9, R9(%rsp) + /* + * Save the original RBP. Even though the mcount ABI does not + * require this, it helps out callers. + */ + movq MCOUNT_REG_SIZE-8(%rsp), %rdx + movq %rdx, RBP(%rsp) + + /* Copy the parent address into %rsi (second parameter) */ +#ifdef CC_USING_FENTRY + movq MCOUNT_REG_SIZE+8+\added(%rsp), %rsi +#else + /* %rdx contains original %rbp */ + movq 8(%rdx), %rsi +#endif + + /* Move RIP to its proper location */ + movq MCOUNT_REG_SIZE+\added(%rsp), %rdi + movq %rdi, RIP(%rsp) + + /* + * Now %rdi (the first parameter) has the return address of + * where ftrace_call returns. But the callbacks expect the + * address of the call itself. + */ + subq $MCOUNT_INSN_SIZE, %rdi + .endm + +.macro restore_mcount_regs + movq R9(%rsp), %r9 + movq R8(%rsp), %r8 + movq RDI(%rsp), %rdi + movq RSI(%rsp), %rsi + movq RDX(%rsp), %rdx + movq RCX(%rsp), %rcx + movq RAX(%rsp), %rax + + /* ftrace_regs_caller can modify %rbp */ + movq RBP(%rsp), %rbp + + addq $MCOUNT_REG_SIZE, %rsp + + .endm + +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(function_hook) + retq +END(function_hook) + +ENTRY(ftrace_caller) + /* save_mcount_regs fills in first two parameters */ + save_mcount_regs + +GLOBAL(ftrace_caller_op_ptr) + /* Load the ftrace_ops into the 3rd parameter */ + movq function_trace_op(%rip), %rdx + + /* regs go into 4th parameter (but make it NULL) */ + movq $0, %rcx + +GLOBAL(ftrace_call) + call ftrace_stub + + restore_mcount_regs + + /* + * The copied trampoline must call ftrace_epilogue as it + * still may need to call the function graph tracer. + * + * The code up to this label is copied into trampolines so + * think twice before adding any new code or changing the + * layout here. 
+ */ +GLOBAL(ftrace_epilogue) + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +GLOBAL(ftrace_graph_call) + jmp ftrace_stub +#endif + +/* This is weak to keep gas from relaxing the jumps */ +WEAK(ftrace_stub) + retq +END(ftrace_caller) + +ENTRY(ftrace_regs_caller) + /* Save the current flags before any operations that can change them */ + pushfq + + /* added 8 bytes to save flags */ + save_mcount_regs 8 + /* save_mcount_regs fills in first two parameters */ + +GLOBAL(ftrace_regs_caller_op_ptr) + /* Load the ftrace_ops into the 3rd parameter */ + movq function_trace_op(%rip), %rdx + + /* Save the rest of pt_regs */ + movq %r15, R15(%rsp) + movq %r14, R14(%rsp) + movq %r13, R13(%rsp) + movq %r12, R12(%rsp) + movq %r11, R11(%rsp) + movq %r10, R10(%rsp) + movq %rbx, RBX(%rsp) + /* Copy saved flags */ + movq MCOUNT_REG_SIZE(%rsp), %rcx + movq %rcx, EFLAGS(%rsp) + /* Kernel segments */ + movq $__KERNEL_DS, %rcx + movq %rcx, SS(%rsp) + movq $__KERNEL_CS, %rcx + movq %rcx, CS(%rsp) + /* Stack - skipping return address and flags */ + leaq MCOUNT_REG_SIZE+8*2(%rsp), %rcx + movq %rcx, RSP(%rsp) + + /* regs go into 4th parameter */ + leaq (%rsp), %rcx + +GLOBAL(ftrace_regs_call) + call ftrace_stub + + /* Copy flags back to SS, to restore them */ + movq EFLAGS(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE(%rsp) + + /* Handlers can change the RIP */ + movq RIP(%rsp), %rax + movq %rax, MCOUNT_REG_SIZE+8(%rsp) + + /* restore the rest of pt_regs */ + movq R15(%rsp), %r15 + movq R14(%rsp), %r14 + movq R13(%rsp), %r13 + movq R12(%rsp), %r12 + movq R10(%rsp), %r10 + movq RBX(%rsp), %rbx + + restore_mcount_regs + + /* Restore flags */ + popfq + + /* + * As this jmp to ftrace_epilogue can be a short jump + * it must not be copied into the trampoline. + * The trampoline will add the code to jump + * to the return. + */ +GLOBAL(ftrace_regs_caller_end) + + jmp ftrace_epilogue + +END(ftrace_regs_caller) + + +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(function_hook) + cmpq $ftrace_stub, ftrace_trace_function + jnz trace + +fgraph_trace: +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpq $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpq $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif + +GLOBAL(ftrace_stub) + retq + +trace: + /* save_mcount_regs fills in first two parameters */ + save_mcount_regs + + /* + * When DYNAMIC_FTRACE is not defined, ARCH_SUPPORTS_FTRACE_OPS is not + * set (see include/asm/ftrace.h and include/linux/ftrace.h). Only the + * ip and parent ip are used and the list function is called when + * function tracing is enabled. 
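The indirect `call *ftrace_trace_function` this non-dynamic path performs below is plain function-pointer dispatch. A hypothetical userspace model (names invented for the demo; in the kernel the pointer defaults to ftrace_stub and the ftrace core installs a list function when several callbacks attach):

    #include <stdio.h>

    typedef void (*trace_fn)(unsigned long ip, unsigned long parent_ip);

    static void stub(unsigned long ip, unsigned long pip) { }
    static void cb_a(unsigned long ip, unsigned long pip)
    {
            printf("a: %#lx called from %#lx\n", ip, pip);
    }
    static void cb_b(unsigned long ip, unsigned long pip)
    {
            printf("b: %#lx called from %#lx\n", ip, pip);
    }

    /* What `call *ftrace_trace_function` dereferences. */
    static trace_fn trace_function = stub;

    /* The "list function", installed when more than one callback attaches. */
    static trace_fn callbacks[] = { cb_a, cb_b, NULL };

    static void list_func(unsigned long ip, unsigned long parent_ip)
    {
            for (int i = 0; callbacks[i]; i++)
                    callbacks[i](ip, parent_ip);
    }

    int main(void)
    {
            trace_function(0x1000, 0x2000);   /* tracing off: hits the stub */
            trace_function = list_func;       /* tracing on */
            trace_function(0x1000, 0x2000);
            return 0;
    }
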
+ */ + call *ftrace_trace_function + + restore_mcount_regs + + jmp fgraph_trace +END(function_hook) +#endif /* CONFIG_DYNAMIC_FTRACE */ +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + /* Saves rbp into %rdx and fills first parameter */ + save_mcount_regs + +#ifdef CC_USING_FENTRY + leaq MCOUNT_REG_SIZE+8(%rsp), %rsi + movq $0, %rdx /* No framepointers needed */ +#else + /* Save address of the return address of traced function */ + leaq 8(%rdx), %rsi + /* ftrace does sanity checks against frame pointers */ + movq (%rdx), %rdx +#endif + call prepare_ftrace_return + + restore_mcount_regs + + retq +END(ftrace_graph_caller) + +GLOBAL(return_to_handler) + subq $24, %rsp + + /* Save the return values */ + movq %rax, (%rsp) + movq %rdx, 8(%rsp) + movq %rbp, %rdi + + call ftrace_return_to_handler + + movq %rax, %rdi + movq 8(%rsp), %rdx + movq (%rsp), %rax + addq $24, %rsp + jmp *%rdi +#endif diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S deleted file mode 100644 index 7b0d3da52fb4..000000000000 --- a/arch/x86/kernel/mcount_64.S +++ /dev/null @@ -1,338 +0,0 @@ -/* - * linux/arch/x86_64/mcount_64.S - * - * Copyright (C) 2014 Steven Rostedt, Red Hat Inc - */ - -#include -#include -#include -#include - - - .code64 - .section .entry.text, "ax" - - -#ifdef CONFIG_FUNCTION_TRACER - -#ifdef CC_USING_FENTRY -# define function_hook __fentry__ -EXPORT_SYMBOL(__fentry__) -#else -# define function_hook mcount -EXPORT_SYMBOL(mcount) -#endif - -/* All cases save the original rbp (8 bytes) */ -#ifdef CONFIG_FRAME_POINTER -# ifdef CC_USING_FENTRY -/* Save parent and function stack frames (rip and rbp) */ -# define MCOUNT_FRAME_SIZE (8+16*2) -# else -/* Save just function stack frame (rip and rbp) */ -# define MCOUNT_FRAME_SIZE (8+16) -# endif -#else -/* No need to save a stack frame */ -# define MCOUNT_FRAME_SIZE 8 -#endif /* CONFIG_FRAME_POINTER */ - -/* Size of stack used to save mcount regs in save_mcount_regs */ -#define MCOUNT_REG_SIZE (SS+8 + MCOUNT_FRAME_SIZE) - -/* - * gcc -pg option adds a call to 'mcount' in most functions. - * When -mfentry is used, the call is to 'fentry' and not 'mcount' - * and is done before the function's stack frame is set up. - * They both require a set of regs to be saved before calling - * any C code and restored before returning back to the function. - * - * On boot up, all these calls are converted into nops. When tracing - * is enabled, the call can jump to either ftrace_caller or - * ftrace_regs_caller. Callbacks (tracing functions) that require - * ftrace_regs_caller (like kprobes) need to have pt_regs passed to - * it. For this reason, the size of the pt_regs structure will be - * allocated on the stack and the required mcount registers will - * be saved in the locations that pt_regs has them in. - */ - -/* - * @added: the amount of stack added before calling this - * - * After this is called, the following registers contain: - * - * %rdi - holds the address that called the trampoline - * %rsi - holds the parent function (traced function's return address) - * %rdx - holds the original %rbp - */ -.macro save_mcount_regs added=0 - - /* Always save the original rbp */ - pushq %rbp - -#ifdef CONFIG_FRAME_POINTER - /* - * Stack traces will stop at the ftrace trampoline if the frame pointer - * is not set up properly. 
If fentry is used, we need to save a frame - * pointer for the parent as well as the function traced, because the - * fentry is called before the stack frame is set up, where as mcount - * is called afterward. - */ -#ifdef CC_USING_FENTRY - /* Save the parent pointer (skip orig rbp and our return address) */ - pushq \added+8*2(%rsp) - pushq %rbp - movq %rsp, %rbp - /* Save the return address (now skip orig rbp, rbp and parent) */ - pushq \added+8*3(%rsp) -#else - /* Can't assume that rip is before this (unless added was zero) */ - pushq \added+8(%rsp) -#endif - pushq %rbp - movq %rsp, %rbp -#endif /* CONFIG_FRAME_POINTER */ - - /* - * We add enough stack to save all regs. - */ - subq $(MCOUNT_REG_SIZE - MCOUNT_FRAME_SIZE), %rsp - movq %rax, RAX(%rsp) - movq %rcx, RCX(%rsp) - movq %rdx, RDX(%rsp) - movq %rsi, RSI(%rsp) - movq %rdi, RDI(%rsp) - movq %r8, R8(%rsp) - movq %r9, R9(%rsp) - /* - * Save the original RBP. Even though the mcount ABI does not - * require this, it helps out callers. - */ - movq MCOUNT_REG_SIZE-8(%rsp), %rdx - movq %rdx, RBP(%rsp) - - /* Copy the parent address into %rsi (second parameter) */ -#ifdef CC_USING_FENTRY - movq MCOUNT_REG_SIZE+8+\added(%rsp), %rsi -#else - /* %rdx contains original %rbp */ - movq 8(%rdx), %rsi -#endif - - /* Move RIP to its proper location */ - movq MCOUNT_REG_SIZE+\added(%rsp), %rdi - movq %rdi, RIP(%rsp) - - /* - * Now %rdi (the first parameter) has the return address of - * where ftrace_call returns. But the callbacks expect the - * address of the call itself. - */ - subq $MCOUNT_INSN_SIZE, %rdi - .endm - -.macro restore_mcount_regs - movq R9(%rsp), %r9 - movq R8(%rsp), %r8 - movq RDI(%rsp), %rdi - movq RSI(%rsp), %rsi - movq RDX(%rsp), %rdx - movq RCX(%rsp), %rcx - movq RAX(%rsp), %rax - - /* ftrace_regs_caller can modify %rbp */ - movq RBP(%rsp), %rbp - - addq $MCOUNT_REG_SIZE, %rsp - - .endm - -#ifdef CONFIG_DYNAMIC_FTRACE - -ENTRY(function_hook) - retq -END(function_hook) - -ENTRY(ftrace_caller) - /* save_mcount_regs fills in first two parameters */ - save_mcount_regs - -GLOBAL(ftrace_caller_op_ptr) - /* Load the ftrace_ops into the 3rd parameter */ - movq function_trace_op(%rip), %rdx - - /* regs go into 4th parameter (but make it NULL) */ - movq $0, %rcx - -GLOBAL(ftrace_call) - call ftrace_stub - - restore_mcount_regs - - /* - * The copied trampoline must call ftrace_epilogue as it - * still may need to call the function graph tracer. - * - * The code up to this label is copied into trampolines so - * think twice before adding any new code or changing the - * layout here. 
- */ -GLOBAL(ftrace_epilogue) - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -GLOBAL(ftrace_graph_call) - jmp ftrace_stub -#endif - -/* This is weak to keep gas from relaxing the jumps */ -WEAK(ftrace_stub) - retq -END(ftrace_caller) - -ENTRY(ftrace_regs_caller) - /* Save the current flags before any operations that can change them */ - pushfq - - /* added 8 bytes to save flags */ - save_mcount_regs 8 - /* save_mcount_regs fills in first two parameters */ - -GLOBAL(ftrace_regs_caller_op_ptr) - /* Load the ftrace_ops into the 3rd parameter */ - movq function_trace_op(%rip), %rdx - - /* Save the rest of pt_regs */ - movq %r15, R15(%rsp) - movq %r14, R14(%rsp) - movq %r13, R13(%rsp) - movq %r12, R12(%rsp) - movq %r11, R11(%rsp) - movq %r10, R10(%rsp) - movq %rbx, RBX(%rsp) - /* Copy saved flags */ - movq MCOUNT_REG_SIZE(%rsp), %rcx - movq %rcx, EFLAGS(%rsp) - /* Kernel segments */ - movq $__KERNEL_DS, %rcx - movq %rcx, SS(%rsp) - movq $__KERNEL_CS, %rcx - movq %rcx, CS(%rsp) - /* Stack - skipping return address and flags */ - leaq MCOUNT_REG_SIZE+8*2(%rsp), %rcx - movq %rcx, RSP(%rsp) - - /* regs go into 4th parameter */ - leaq (%rsp), %rcx - -GLOBAL(ftrace_regs_call) - call ftrace_stub - - /* Copy flags back to SS, to restore them */ - movq EFLAGS(%rsp), %rax - movq %rax, MCOUNT_REG_SIZE(%rsp) - - /* Handlers can change the RIP */ - movq RIP(%rsp), %rax - movq %rax, MCOUNT_REG_SIZE+8(%rsp) - - /* restore the rest of pt_regs */ - movq R15(%rsp), %r15 - movq R14(%rsp), %r14 - movq R13(%rsp), %r13 - movq R12(%rsp), %r12 - movq R10(%rsp), %r10 - movq RBX(%rsp), %rbx - - restore_mcount_regs - - /* Restore flags */ - popfq - - /* - * As this jmp to ftrace_epilogue can be a short jump - * it must not be copied into the trampoline. - * The trampoline will add the code to jump - * to the return. - */ -GLOBAL(ftrace_regs_caller_end) - - jmp ftrace_epilogue - -END(ftrace_regs_caller) - - -#else /* ! CONFIG_DYNAMIC_FTRACE */ - -ENTRY(function_hook) - cmpq $ftrace_stub, ftrace_trace_function - jnz trace - -fgraph_trace: -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - cmpq $ftrace_stub, ftrace_graph_return - jnz ftrace_graph_caller - - cmpq $ftrace_graph_entry_stub, ftrace_graph_entry - jnz ftrace_graph_caller -#endif - -GLOBAL(ftrace_stub) - retq - -trace: - /* save_mcount_regs fills in first two parameters */ - save_mcount_regs - - /* - * When DYNAMIC_FTRACE is not defined, ARCH_SUPPORTS_FTRACE_OPS is not - * set (see include/asm/ftrace.h and include/linux/ftrace.h). Only the - * ip and parent ip are used and the list function is called when - * function tracing is enabled. 
- */ - call *ftrace_trace_function - - restore_mcount_regs - - jmp fgraph_trace -END(function_hook) -#endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) - /* Saves rbp into %rdx and fills first parameter */ - save_mcount_regs - -#ifdef CC_USING_FENTRY - leaq MCOUNT_REG_SIZE+8(%rsp), %rsi - movq $0, %rdx /* No framepointers needed */ -#else - /* Save address of the return address of traced function */ - leaq 8(%rdx), %rsi - /* ftrace does sanity checks against frame pointers */ - movq (%rdx), %rdx -#endif - call prepare_ftrace_return - - restore_mcount_regs - - retq -END(ftrace_graph_caller) - -GLOBAL(return_to_handler) - subq $24, %rsp - - /* Save the return values */ - movq %rax, (%rsp) - movq %rdx, 8(%rsp) - movq %rbp, %rdi - - call ftrace_return_to_handler - - movq %rax, %rdi - movq 8(%rsp), %rdx - movq (%rsp), %rax - addq $24, %rsp - jmp *%rdi -#endif -- cgit v1.2.3 From 3d82c59c6e3cb168284d9b0a1143415d9c98ae40 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 23 Mar 2017 10:33:49 -0400 Subject: x86/ftrace: Move the ftrace specific code out of entry_32.S The function tracing hook code for ftrace is not an entry point from userspace and does not belong in the entry_*.S files. It has already been moved out of entry_64.S. Move it out of entry_32.S into its own ftrace_32.S file. Signed-off-by: Steven Rostedt (VMware) Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Masami Hiramatsu Cc: Josh Poimboeuf Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20170323143445.645218946@goodmis.org Signed-off-by: Thomas Gleixner --- arch/x86/entry/entry_32.S | 169 ------------------------------------------ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/ftrace_32.S | 175 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 176 insertions(+), 169 deletions(-) create mode 100644 arch/x86/kernel/ftrace_32.S diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 5553475f6008..50bc26949e9e 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -35,16 +35,13 @@ #include #include #include -#include #include #include -#include #include #include #include #include #include -#include #include .section .entry.text, "ax" @@ -886,172 +883,6 @@ BUILD_INTERRUPT3(hyperv_callback_vector, HYPERVISOR_CALLBACK_VECTOR, #endif /* CONFIG_HYPERV */ -#ifdef CONFIG_FUNCTION_TRACER -#ifdef CONFIG_DYNAMIC_FTRACE - -ENTRY(mcount) - ret -END(mcount) - -ENTRY(ftrace_caller) - pushl %eax - pushl %ecx - pushl %edx - pushl $0 /* Pass NULL as regs pointer */ - movl 4*4(%esp), %eax - movl 0x4(%ebp), %edx - movl function_trace_op, %ecx - subl $MCOUNT_INSN_SIZE, %eax - -.globl ftrace_call -ftrace_call: - call ftrace_stub - - addl $4, %esp /* skip NULL pointer */ - popl %edx - popl %ecx - popl %eax -.Lftrace_ret: -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -.globl ftrace_graph_call -ftrace_graph_call: - jmp ftrace_stub -#endif - -/* This is weak to keep gas from relaxing the jumps */ -WEAK(ftrace_stub) - ret -END(ftrace_caller) - -ENTRY(ftrace_regs_caller) - pushf /* push flags before compare (in cs location) */ - - /* - * i386 does not save SS and ESP when coming from kernel. - * Instead, to get sp, ®s->sp is used (see ptrace.h). - * Unfortunately, that means eflags must be at the same location - * as the current return ip is. We move the return ip into the - * ip location, and move flags into the return ip location. 
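Taking &regs->sp as the stack pointer works because, on a same-privilege trap, the CPU pushes no SS:ESP: the interrupted kernel stack begins exactly where the sp/ss slots of pt_regs would sit. A hypothetical model (field order follows the 32-bit struct pt_regs of this era):

    #include <stdio.h>

    struct pt_regs32 {                      /* lowest address first */
            unsigned long bx, cx, dx, si, di, bp, ax;
            unsigned long ds, es, fs, gs;
            unsigned long orig_ax, ip, cs, flags;
            unsigned long sp, ss;           /* pushed only on user->kernel traps */
    };

    /* Roughly what kernel_stack_pointer() does on i386 for kernel traps. */
    static unsigned long kernel_sp(struct pt_regs32 *regs)
    {
            return (unsigned long)&regs->sp;
    }

    int main(void)
    {
            struct pt_regs32 regs = { .flags = 0x246 };

            printf("pt_regs at %p, interrupted stack would start at %#lx\n",
                   (void *)&regs, kernel_sp(&regs));
            return 0;
    }
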
- */ - pushl 4(%esp) /* save return ip into ip slot */ - - pushl $0 /* Load 0 into orig_ax */ - pushl %gs - pushl %fs - pushl %es - pushl %ds - pushl %eax - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - - movl 13*4(%esp), %eax /* Get the saved flags */ - movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ - /* clobbering return ip */ - movl $__KERNEL_CS, 13*4(%esp) - - movl 12*4(%esp), %eax /* Load ip (1st parameter) */ - subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ - movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ - movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ - pushl %esp /* Save pt_regs as 4th parameter */ - -GLOBAL(ftrace_regs_call) - call ftrace_stub - - addl $4, %esp /* Skip pt_regs */ - movl 14*4(%esp), %eax /* Move flags back into cs */ - movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ - movl 12*4(%esp), %eax /* Get return ip from regs->ip */ - movl %eax, 14*4(%esp) /* Put return ip back for ret */ - - popl %ebx - popl %ecx - popl %edx - popl %esi - popl %edi - popl %ebp - popl %eax - popl %ds - popl %es - popl %fs - popl %gs - addl $8, %esp /* Skip orig_ax and ip */ - popf /* Pop flags at end (no addl to corrupt flags) */ - jmp .Lftrace_ret - - popf - jmp ftrace_stub -#else /* ! CONFIG_DYNAMIC_FTRACE */ - -ENTRY(mcount) - cmpl $__PAGE_OFFSET, %esp - jb ftrace_stub /* Paging not enabled yet? */ - - cmpl $ftrace_stub, ftrace_trace_function - jnz .Ltrace -#ifdef CONFIG_FUNCTION_GRAPH_TRACER - cmpl $ftrace_stub, ftrace_graph_return - jnz ftrace_graph_caller - - cmpl $ftrace_graph_entry_stub, ftrace_graph_entry - jnz ftrace_graph_caller -#endif -.globl ftrace_stub -ftrace_stub: - ret - - /* taken from glibc */ -.Ltrace: - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - movl 0x4(%ebp), %edx - subl $MCOUNT_INSN_SIZE, %eax - - call *ftrace_trace_function - - popl %edx - popl %ecx - popl %eax - jmp ftrace_stub -END(mcount) -#endif /* CONFIG_DYNAMIC_FTRACE */ -EXPORT_SYMBOL(mcount) -#endif /* CONFIG_FUNCTION_TRACER */ - -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -ENTRY(ftrace_graph_caller) - pushl %eax - pushl %ecx - pushl %edx - movl 0xc(%esp), %eax - lea 0x4(%ebp), %edx - movl (%ebp), %ecx - subl $MCOUNT_INSN_SIZE, %eax - call prepare_ftrace_return - popl %edx - popl %ecx - popl %eax - ret -END(ftrace_graph_caller) - -.globl return_to_handler -return_to_handler: - pushl %eax - pushl %edx - movl %ebp, %eax - call ftrace_return_to_handler - movl %eax, %ecx - popl %edx - popl %eax - jmp *%ecx -#endif - #ifdef CONFIG_TRACING ENTRY(trace_page_fault) ASM_CLAC diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index d3743a37e990..55e8902c461f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -47,6 +47,7 @@ obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_64) += sys_x86_64.o ftrace_64.o +obj-$(CONFIG_X86_32) += ftrace_32.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S new file mode 100644 index 000000000000..2b160f238a42 --- /dev/null +++ b/arch/x86/kernel/ftrace_32.S @@ -0,0 +1,175 @@ +/* + * Copyright (C) 2017 Steven Rostedt, VMware Inc. 
+ */ + +#include +#include +#include +#include +#include + +#ifdef CONFIG_FUNCTION_TRACER +#ifdef CONFIG_DYNAMIC_FTRACE + +ENTRY(mcount) + ret +END(mcount) + +ENTRY(ftrace_caller) + pushl %eax + pushl %ecx + pushl %edx + pushl $0 /* Pass NULL as regs pointer */ + movl 4*4(%esp), %eax + movl 0x4(%ebp), %edx + movl function_trace_op, %ecx + subl $MCOUNT_INSN_SIZE, %eax + +.globl ftrace_call +ftrace_call: + call ftrace_stub + + addl $4, %esp /* skip NULL pointer */ + popl %edx + popl %ecx + popl %eax +.Lftrace_ret: +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +.globl ftrace_graph_call +ftrace_graph_call: + jmp ftrace_stub +#endif + +/* This is weak to keep gas from relaxing the jumps */ +WEAK(ftrace_stub) + ret +END(ftrace_caller) + +ENTRY(ftrace_regs_caller) + pushf /* push flags before compare (in cs location) */ + + /* + * i386 does not save SS and ESP when coming from kernel. + * Instead, to get sp, ®s->sp is used (see ptrace.h). + * Unfortunately, that means eflags must be at the same location + * as the current return ip is. We move the return ip into the + * ip location, and move flags into the return ip location. + */ + pushl 4(%esp) /* save return ip into ip slot */ + + pushl $0 /* Load 0 into orig_ax */ + pushl %gs + pushl %fs + pushl %es + pushl %ds + pushl %eax + pushl %ebp + pushl %edi + pushl %esi + pushl %edx + pushl %ecx + pushl %ebx + + movl 13*4(%esp), %eax /* Get the saved flags */ + movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ + /* clobbering return ip */ + movl $__KERNEL_CS, 13*4(%esp) + + movl 12*4(%esp), %eax /* Load ip (1st parameter) */ + subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ + movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ + movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ + pushl %esp /* Save pt_regs as 4th parameter */ + +GLOBAL(ftrace_regs_call) + call ftrace_stub + + addl $4, %esp /* Skip pt_regs */ + movl 14*4(%esp), %eax /* Move flags back into cs */ + movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ + movl 12*4(%esp), %eax /* Get return ip from regs->ip */ + movl %eax, 14*4(%esp) /* Put return ip back for ret */ + + popl %ebx + popl %ecx + popl %edx + popl %esi + popl %edi + popl %ebp + popl %eax + popl %ds + popl %es + popl %fs + popl %gs + addl $8, %esp /* Skip orig_ax and ip */ + popf /* Pop flags at end (no addl to corrupt flags) */ + jmp .Lftrace_ret + + popf + jmp ftrace_stub +#else /* ! CONFIG_DYNAMIC_FTRACE */ + +ENTRY(mcount) + cmpl $__PAGE_OFFSET, %esp + jb ftrace_stub /* Paging not enabled yet? 
*/ + + cmpl $ftrace_stub, ftrace_trace_function + jnz .Ltrace +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + cmpl $ftrace_stub, ftrace_graph_return + jnz ftrace_graph_caller + + cmpl $ftrace_graph_entry_stub, ftrace_graph_entry + jnz ftrace_graph_caller +#endif +.globl ftrace_stub +ftrace_stub: + ret + + /* taken from glibc */ +.Ltrace: + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + movl 0x4(%ebp), %edx + subl $MCOUNT_INSN_SIZE, %eax + + call *ftrace_trace_function + + popl %edx + popl %ecx + popl %eax + jmp ftrace_stub +END(mcount) +#endif /* CONFIG_DYNAMIC_FTRACE */ +EXPORT_SYMBOL(mcount) +#endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +ENTRY(ftrace_graph_caller) + pushl %eax + pushl %ecx + pushl %edx + movl 0xc(%esp), %eax + lea 0x4(%ebp), %edx + movl (%ebp), %ecx + subl $MCOUNT_INSN_SIZE, %eax + call prepare_ftrace_return + popl %edx + popl %ecx + popl %eax + ret +END(ftrace_graph_caller) + +.globl return_to_handler +return_to_handler: + pushl %eax + pushl %edx + movl %ebp, %eax + call ftrace_return_to_handler + movl %eax, %ecx + popl %edx + popl %eax + jmp *%ecx +#endif -- cgit v1.2.3 From e6928e58d4d4a02f88838945f792c107623314ac Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 23 Mar 2017 10:33:50 -0400 Subject: x86/ftrace: Add stack frame pointer to ftrace_caller The function hook ftrace_caller does not create its own stack frame, and this causes the ftrace stack trace to miss the first function when doing stack traces. # echo schedule:stacktrace > /sys/kernel/tracing/set_ftrace_filter Before: -0 [002] .N.. 29.865807: => cpu_startup_entry => start_secondary => startup_32_smp <...>-7 [001] .... 29.866509: => kthread => ret_from_fork <...>-1 [000] .... 29.865377: => poll_schedule_timeout => do_select => core_sys_select => SyS_select => do_fast_syscall_32 => entry_SYSENTER_32 After: -0 [002] .N.. 31.234853: => do_idle => cpu_startup_entry => start_secondary => startup_32_smp <...>-7 [003] .... 31.235140: => rcu_gp_kthread => kthread => ret_from_fork <...>-1819 [000] .... 
31.264172: => schedule_hrtimeout_range => poll_schedule_timeout => do_sys_poll => SyS_ppoll => do_fast_syscall_32 => entry_SYSENTER_32 Signed-off-by: Steven Rostedt (VMware) Reviewed-by: Josh Poimboeuf Reviewed-by: Masami Hiramatsu Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20170323143445.771707773@goodmis.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ftrace_32.S | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index 2b160f238a42..a4e2872971c0 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -16,12 +16,19 @@ ENTRY(mcount) END(mcount) ENTRY(ftrace_caller) + + pushl %ebp + movl %esp, %ebp + pushl %eax pushl %ecx pushl %edx pushl $0 /* Pass NULL as regs pointer */ - movl 4*4(%esp), %eax - movl 0x4(%ebp), %edx + movl 5*4(%esp), %eax + /* Copy original ebp into %edx */ + movl 4*4(%esp), %edx + /* Get the parent ip */ + movl 0x4(%edx), %edx movl function_trace_op, %ecx subl $MCOUNT_INSN_SIZE, %eax @@ -33,6 +40,7 @@ ftrace_call: popl %edx popl %ecx popl %eax + popl %ebp .Lftrace_ret: #ifdef CONFIG_FUNCTION_GRAPH_TRACER .globl ftrace_graph_call -- cgit v1.2.3 From ff04b440d2d645fd8a5b3385b1b2e4d19d3fe746 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 23 Mar 2017 10:33:51 -0400 Subject: x86/ftrace: Clean up ftrace_regs_caller When ftrace_regs_caller was created, it was designed to preserve flags as much as possible as it needed to act just like a breakpoint triggered on the same location. But the design is over complicated as it treated all operations as modifying flags. But push, mov and lea do not modify flags. This means the code can become more simplified by allowing flags to be stored further down. Making ftrace_regs_caller simpler will also be useful in implementing fentry logic. Suggested-by: Linus Torvalds Signed-off-by: Steven Rostedt (VMware) Reviewed-by: Masami Hiramatsu Reviewed-by: Josh Poimboeuf Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Andrew Morton Link: http://lkml.kernel.org/r/20170316135328.36123c3e@gandalf.local.home Link: http://lkml.kernel.org/r/20170323143445.917292592@goodmis.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ftrace_32.S | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index a4e2872971c0..93e26647c3f2 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -54,23 +54,27 @@ WEAK(ftrace_stub) END(ftrace_caller) ENTRY(ftrace_regs_caller) - pushf /* push flags before compare (in cs location) */ - /* * i386 does not save SS and ESP when coming from kernel. * Instead, to get sp, ®s->sp is used (see ptrace.h). * Unfortunately, that means eflags must be at the same location * as the current return ip is. We move the return ip into the - * ip location, and move flags into the return ip location. + * regs->ip location, and move flags into the return ip location. 
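The rewrite rests on one architectural fact: push, mov and lea never write EFLAGS, while arithmetic such as add does. A quick standalone check of that property (hypothetical gcc/clang inline asm, x86-64 operands):

    #include <stdio.h>

    static int zf_after_lea(void)
    {
            unsigned char zf;

            asm("cmp  %1, %1\n\t"           /* force ZF = 1 */
                "lea  8(%%rsp), %%rax\n\t"  /* lea leaves EFLAGS alone */
                "setz %0"
                : "=q" (zf) : "r" (1UL) : "rax", "cc");
            return zf;
    }

    static int zf_after_add(void)
    {
            unsigned char zf;

            asm("cmp  %1, %1\n\t"           /* force ZF = 1 */
                "add  $8, %%rax\n\t"        /* arithmetic rewrites EFLAGS */
                "setz %0"
                : "=q" (zf) : "r" (1UL) : "rax", "cc");
            return zf;
    }

    int main(void)
    {
            /* Expect 1 (ZF survived lea) and almost surely 0 after add. */
            printf("ZF after lea: %d, after add: %d\n",
                   zf_after_lea(), zf_after_add());
            return 0;
    }
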
*/ - pushl 4(%esp) /* save return ip into ip slot */ - + pushl $__KERNEL_CS + pushl 4(%esp) /* Save the return ip */ pushl $0 /* Load 0 into orig_ax */ pushl %gs pushl %fs pushl %es pushl %ds pushl %eax + + /* Get flags and place them into the return ip slot */ + pushf + popl %eax + movl %eax, 8*4(%esp) + pushl %ebp pushl %edi pushl %esi @@ -78,11 +82,6 @@ ENTRY(ftrace_regs_caller) pushl %ecx pushl %ebx - movl 13*4(%esp), %eax /* Get the saved flags */ - movl %eax, 14*4(%esp) /* Move saved flags into regs->flags location */ - /* clobbering return ip */ - movl $__KERNEL_CS, 13*4(%esp) - movl 12*4(%esp), %eax /* Load ip (1st parameter) */ subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ @@ -93,10 +92,14 @@ GLOBAL(ftrace_regs_call) call ftrace_stub addl $4, %esp /* Skip pt_regs */ - movl 14*4(%esp), %eax /* Move flags back into cs */ - movl %eax, 13*4(%esp) /* Needed to keep addl from modifying flags */ - movl 12*4(%esp), %eax /* Get return ip from regs->ip */ - movl %eax, 14*4(%esp) /* Put return ip back for ret */ + + /* restore flags */ + push 14*4(%esp) + popf + + /* Move return ip back to its original location */ + movl 12*4(%esp), %eax + movl %eax, 14*4(%esp) popl %ebx popl %ecx @@ -109,12 +112,11 @@ GLOBAL(ftrace_regs_call) popl %es popl %fs popl %gs - addl $8, %esp /* Skip orig_ax and ip */ - popf /* Pop flags at end (no addl to corrupt flags) */ - jmp .Lftrace_ret - popf - jmp ftrace_stub + /* use lea to not affect flags */ + lea 3*4(%esp), %esp /* Skip orig_ax, ip and cs */ + + jmp .Lftrace_ret #else /* ! CONFIG_DYNAMIC_FTRACE */ ENTRY(mcount) -- cgit v1.2.3 From 644e0e8dc76b919976c44d3929164d42cbe656bc Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 23 Mar 2017 10:33:52 -0400 Subject: x86/ftrace: Add -mfentry support to x86_32 with DYNAMIC_FTRACE set x86_64 has had fentry support for some time. I did not add support to x86_32 as I was unsure if it will be used much in the future. It is still very much used, and there's issues with function graph tracing with gcc playing around with the mcount frames, causing function graph to panic. The fentry code does not have this issue, and is able to cope as there is no frame to mess up. Note, this only adds support for fentry when DYNAMIC_FTRACE is set. There's really no reason to not have that set, because the performance of the machine drops significantly when it's not enabled. Keep !DYNAMIC_FTRACE around to test it off, as there's still some archs that have FTRACE but not DYNAMIC_FTRACE. 
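What the 32-bit fentry hook sees at entry can be poked at from userspace too. A hypothetical demo (i386-only on purpose: with stack-passed arguments, a C hook that clobbers caller-saved registers does no harm, whereas the kernel's ftrace_caller must save everything):

    /* Build (hypothetical demo): gcc -m32 -O2 -pg -mfentry demo.c */
    #include <stdio.h>

    #define notrace __attribute__((no_instrument_function))

    /*
     * Runs before traced() has set up any frame: the top of the stack
     * is the "function ip" (our return address into traced()) and the
     * word above it is the "parent ip" -- the layout the new
     * ftrace_caller builds its fake frames from.
     */
    notrace void __fentry__(void)
    {
            fprintf(stderr, "fentry hit, function ip %p\n",
                    __builtin_return_address(0));
    }

    int traced(int x)
    {
            return x + 1;
    }

    notrace int main(void)
    {
            return traced(41) == 42 ? 0 : 1;
    }
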
Signed-off-by: Steven Rostedt (VMware) Reviewed-by: Masami Hiramatsu Reviewed-by: Josh Poimboeuf Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20170323143446.052202377@goodmis.org Signed-off-by: Thomas Gleixner --- arch/x86/Kconfig | 2 +- arch/x86/kernel/ftrace_32.S | 82 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 73 insertions(+), 11 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cc98d5a294ee..8c17146427ca 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -127,7 +127,7 @@ config X86 select HAVE_EBPF_JIT if X86_64 select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_EXIT_THREAD - select HAVE_FENTRY if X86_64 + select HAVE_FENTRY if X86_64 || DYNAMIC_FTRACE select HAVE_FTRACE_MCOUNT_RECORD select HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index 93e26647c3f2..80518ec5b882 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -9,26 +9,68 @@ #include #ifdef CONFIG_FUNCTION_TRACER + +#ifdef CC_USING_FENTRY +# define function_hook __fentry__ +EXPORT_SYMBOL(__fentry__) +#else +# define function_hook mcount +EXPORT_SYMBOL(mcount) +#endif + #ifdef CONFIG_DYNAMIC_FTRACE -ENTRY(mcount) +/* mcount uses a frame pointer even if CONFIG_FRAME_POINTER is not set */ +#if !defined(CC_USING_FENTRY) || defined(CONFIG_FRAME_POINTER) +# define USING_FRAME_POINTER +#endif + +#ifdef USING_FRAME_POINTER +# define MCOUNT_FRAME 1 /* using frame = true */ +#else +# define MCOUNT_FRAME 0 /* using frame = false */ +#endif + +ENTRY(function_hook) ret -END(mcount) +END(function_hook) ENTRY(ftrace_caller) +#ifdef USING_FRAME_POINTER +# ifdef CC_USING_FENTRY + /* + * Frame pointers are of ip followed by bp. + * Since fentry is an immediate jump, we are left with + * parent-ip, function-ip. We need to add a frame with + * parent-ip followed by ebp. + */ + pushl 4(%esp) /* parent ip */ pushl %ebp movl %esp, %ebp - + pushl 2*4(%esp) /* function ip */ +# endif + /* For mcount, the function ip is directly above */ + pushl %ebp + movl %esp, %ebp +#endif pushl %eax pushl %ecx pushl %edx pushl $0 /* Pass NULL as regs pointer */ - movl 5*4(%esp), %eax - /* Copy original ebp into %edx */ + +#ifdef USING_FRAME_POINTER + /* Load parent ebp into edx */ movl 4*4(%esp), %edx +#else + /* There's no frame pointer, load the appropriate stack addr instead */ + lea 4*4(%esp), %edx +#endif + + movl (MCOUNT_FRAME+4)*4(%esp), %eax /* load the rip */ /* Get the parent ip */ - movl 0x4(%edx), %edx + movl 4(%edx), %edx /* edx has ebp */ + movl function_trace_op, %ecx subl $MCOUNT_INSN_SIZE, %eax @@ -40,7 +82,14 @@ ftrace_call: popl %edx popl %ecx popl %eax +#ifdef USING_FRAME_POINTER popl %ebp +# ifdef CC_USING_FENTRY + addl $4,%esp /* skip function ip */ + popl %ebp /* this is the orig bp */ + addl $4, %esp /* skip parent ip */ +# endif +#endif .Lftrace_ret: #ifdef CONFIG_FUNCTION_GRAPH_TRACER .globl ftrace_graph_call @@ -81,6 +130,10 @@ ENTRY(ftrace_regs_caller) pushl %edx pushl %ecx pushl %ebx +#ifdef CC_USING_FENTRY + /* Load 4 off of the parent ip addr into ebp */ + lea 14*4(%esp), %ebp +#endif movl 12*4(%esp), %eax /* Load ip (1st parameter) */ subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ @@ -119,7 +172,7 @@ GLOBAL(ftrace_regs_call) jmp .Lftrace_ret #else /* ! CONFIG_DYNAMIC_FTRACE */ -ENTRY(mcount) +ENTRY(function_hook) cmpl $__PAGE_OFFSET, %esp jb ftrace_stub /* Paging not enabled yet? 
*/ @@ -151,9 +204,8 @@ ftrace_stub: popl %ecx popl %eax jmp ftrace_stub -END(mcount) +END(function_hook) #endif /* CONFIG_DYNAMIC_FTRACE */ -EXPORT_SYMBOL(mcount) #endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -161,9 +213,15 @@ ENTRY(ftrace_graph_caller) pushl %eax pushl %ecx pushl %edx - movl 0xc(%esp), %eax + movl 3*4(%esp), %eax + /* Even with frame pointers, fentry doesn't have one here */ +#ifdef CC_USING_FENTRY + lea 4*4(%esp), %edx + movl $0, %ecx +#else lea 0x4(%ebp), %edx movl (%ebp), %ecx +#endif subl $MCOUNT_INSN_SIZE, %eax call prepare_ftrace_return popl %edx @@ -176,7 +234,11 @@ END(ftrace_graph_caller) return_to_handler: pushl %eax pushl %edx +#ifdef CC_USING_FENTRY + movl $0, %eax +#else movl %ebp, %eax +#endif call ftrace_return_to_handler movl %eax, %ecx popl %edx -- cgit v1.2.3 From 1fa9d67a2f07893fc7e4a880867a6cbb81dda547 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 23 Mar 2017 10:33:53 -0400 Subject: x86/ftrace: Use Makefile logic instead of #ifdef for compiling ftrace_*.o Currently ftrace_32.S and ftrace_64.S are compiled even when CONFIG_FUNCTION_TRACER is not set. This means there's an unnecessary #ifdef to protect the code. Instead of using preprocessor directives, only compile those files when FUNCTION_TRACER is defined. Suggested-by: Josh Poimboeuf Signed-off-by: Steven Rostedt (VMware) Reviewed-by: Josh Poimboeuf Reviewed-by: Ingo Molnar Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Masami Hiramatsu Cc: Andrew Morton Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20170316210043.peycxdxktwwn6cid@treble Link: http://lkml.kernel.org/r/20170323143446.217684991@goodmis.org Signed-off-by: Thomas Gleixner --- arch/x86/kernel/Makefile | 4 ++-- arch/x86/kernel/ftrace_32.S | 3 --- arch/x86/kernel/ftrace_64.S | 4 ---- 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 55e8902c461f..4b994232cb57 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -46,8 +46,7 @@ obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o -obj-$(CONFIG_X86_64) += sys_x86_64.o ftrace_64.o -obj-$(CONFIG_X86_32) += ftrace_32.o +obj-$(CONFIG_X86_64) += sys_x86_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o obj-y += bootflag.o e820.o @@ -83,6 +82,7 @@ obj-y += apic/ obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_LIVEPATCH) += livepatch.o +obj-$(CONFIG_FUNCTION_TRACER) += ftrace_$(BITS).o obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += ftrace.o obj-$(CONFIG_FTRACE_SYSCALLS) += ftrace.o obj-$(CONFIG_X86_TSC) += trace_clock.o diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index 80518ec5b882..07f40359c9ea 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -8,8 +8,6 @@ #include #include -#ifdef CONFIG_FUNCTION_TRACER - #ifdef CC_USING_FENTRY # define function_hook __fentry__ EXPORT_SYMBOL(__fentry__) @@ -206,7 +204,6 @@ ftrace_stub: jmp ftrace_stub END(function_hook) #endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER ENTRY(ftrace_graph_caller) diff --git a/arch/x86/kernel/ftrace_64.S b/arch/x86/kernel/ftrace_64.S index d1be3cbe2cd4..1dfac634bbf7 100644 --- a/arch/x86/kernel/ftrace_64.S +++ b/arch/x86/kernel/ftrace_64.S @@ -11,9 +11,6 @@ .code64 .section .entry.text, "ax" - -#ifdef 
CONFIG_FUNCTION_TRACER - #ifdef CC_USING_FENTRY # define function_hook __fentry__ EXPORT_SYMBOL(__fentry__) @@ -295,7 +292,6 @@ trace: jmp fgraph_trace END(function_hook) #endif /* CONFIG_DYNAMIC_FTRACE */ -#endif /* CONFIG_FUNCTION_TRACER */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER ENTRY(ftrace_graph_caller) -- cgit v1.2.3 From 9a93848fe787a53aec404e4e00d8f7f9bbdaebb4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Feb 2017 14:43:51 +0100 Subject: x86/debug: Implement __WARN() using UD0 By using "UD0" for WARN()s we remove the function call and its possible __FILE__ and __LINE__ immediate arguments from the instruction stream. Total image size will not change much, what we win in the instruction stream we'll lose because of the __bug_table entries. Still, saves on I$ footprint and the total image size does go down a bit. text data filename 10702123 4530992 defconfig-build/vmlinux.orig 10682460 4530992 defconfig-build/vmlinux.patched (UML didn't seem to use GENERIC_BUG at all, so remove it) Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Josh Poimboeuf Cc: Arjan van de Ven Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Richard Weinberger Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/um/Kconfig.common | 5 --- arch/x86/include/asm/bug.h | 78 ++++++++++++++++++++++++++++++++---------- arch/x86/kernel/dumpstack.c | 3 -- arch/x86/kernel/dumpstack_32.c | 12 ------- arch/x86/kernel/dumpstack_64.c | 10 ------ arch/x86/kernel/traps.c | 46 +++++++++++++++++++++---- arch/x86/um/Makefile | 2 +- arch/x86/um/bug.c | 21 ------------ 8 files changed, 101 insertions(+), 76 deletions(-) delete mode 100644 arch/x86/um/bug.c diff --git a/arch/um/Kconfig.common b/arch/um/Kconfig.common index fd443852103c..ed9c5b5ff028 100644 --- a/arch/um/Kconfig.common +++ b/arch/um/Kconfig.common @@ -50,11 +50,6 @@ config GENERIC_CALIBRATE_DELAY bool default y -config GENERIC_BUG - bool - default y - depends on BUG - config HZ int default 100 diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index ba38ebbaced3..4fde330c44b7 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -1,36 +1,78 @@ #ifndef _ASM_X86_BUG_H #define _ASM_X86_BUG_H -#define HAVE_ARCH_BUG +#include -#ifdef CONFIG_DEBUG_BUGVERBOSE +/* + * Since some emulators terminate on UD2, we cannot use it for WARN. + * Since various instruction decoders disagree on the length of UD1, + * we cannot use it either. So use UD0 for WARN. 
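The other half of the scheme is the side table the #UD handler consults. A userspace model of just the table mechanics (entirely hypothetical names; the real __bug_table entries also record the trapping address so the handler can match it and, for warnings, advance the saved IP by LEN_UD0):

    #include <stdio.h>

    struct bug_entry {
            const char     *file;
            unsigned short  line;
            unsigned short  flags;
    };

    /* Emit one entry per call site into a dedicated section. */
    #define MY_WARN(flg)                                            \
    do {                                                            \
            static const struct bug_entry e                         \
            __attribute__((used, section("bugtab"))) = {            \
                    __FILE__, __LINE__, (flg)                       \
            };                                                      \
    } while (0)

    /* ld provides these for any section whose name is a C identifier. */
    extern const struct bug_entry __start_bugtab[], __stop_bugtab[];

    static void f(void) { MY_WARN(0); }
    static void g(void) { MY_WARN(1); }

    int main(void)
    {
            const struct bug_entry *e;

            f();
            g();
            /* Entries exist at link time whether or not a site ever runs. */
            for (e = __start_bugtab; e < __stop_bugtab; e++)
                    printf("warn site %s:%u flags=%u\n",
                           e->file, e->line, e->flags);
            return 0;
    }
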
+ * + * (binutils knows about "ud1" but {en,de}codes it as 2 bytes, whereas + * our kernel decoder thinks it takes a ModRM byte, which seems consistent + * with various things like the Intel SDM instruction encoding rules) + */ + +#define ASM_UD0 ".byte 0x0f, 0xff" +#define ASM_UD1 ".byte 0x0f, 0xb9" /* + ModRM */ +#define ASM_UD2 ".byte 0x0f, 0x0b" + +#define INSN_UD0 0xff0f +#define INSN_UD2 0x0b0f + +#define LEN_UD0 2 + +#ifdef CONFIG_GENERIC_BUG +#define HAVE_ARCH_BUG #ifdef CONFIG_X86_32 -# define __BUG_C0 "2:\t.long 1b, %c0\n" +# define __BUG_REL(val) ".long " __stringify(val) #else -# define __BUG_C0 "2:\t.long 1b - 2b, %c0 - 2b\n" +# define __BUG_REL(val) ".long " __stringify(val) " - 2b" #endif -#define BUG() \ -do { \ - asm volatile("1:\tud2\n" \ - ".pushsection __bug_table,\"a\"\n" \ - __BUG_C0 \ - "\t.word %c1, 0\n" \ - "\t.org 2b+%c2\n" \ - ".popsection" \ - : : "i" (__FILE__), "i" (__LINE__), \ - "i" (sizeof(struct bug_entry))); \ - unreachable(); \ +#ifdef CONFIG_DEBUG_BUGVERBOSE + +#define _BUG_FLAGS(ins, flags) \ +do { \ + asm volatile("1:\t" ins "\n" \ + ".pushsection __bug_table,\"a\"\n" \ + "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ + "\t" __BUG_REL(%c0) "\t# bug_entry::file\n" \ + "\t.word %c1" "\t# bug_entry::line\n" \ + "\t.word %c2" "\t# bug_entry::flags\n" \ + "\t.org 2b+%c3\n" \ + ".popsection" \ + : : "i" (__FILE__), "i" (__LINE__), \ + "i" (flags), \ + "i" (sizeof(struct bug_entry))); \ } while (0) -#else +#else /* !CONFIG_DEBUG_BUGVERBOSE */ + +#define _BUG_FLAGS(ins, flags) \ +do { \ + asm volatile("1:\t" ins "\n" \ + ".pushsection __bug_table,\"a\"\n" \ + "2:\t" __BUG_REL(1b) "\t# bug_entry::bug_addr\n" \ + "\t.word %c0" "\t# bug_entry::flags\n" \ + "\t.org 2b+%c1\n" \ + ".popsection" \ + : : "i" (flags), \ + "i" (sizeof(struct bug_entry))); \ +} while (0) + +#endif /* CONFIG_DEBUG_BUGVERBOSE */ + #define BUG() \ do { \ - asm volatile("ud2"); \ + _BUG_FLAGS(ASM_UD2, 0); \ unreachable(); \ } while (0) -#endif + +#define __WARN_TAINT(taint) _BUG_FLAGS(ASM_UD0, BUGFLAG_TAINT(taint)) + +#endif /* CONFIG_GENERIC_BUG */ #include diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 09d4ac0d2661..924f45ea4382 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -289,9 +289,6 @@ void die(const char *str, struct pt_regs *regs, long err) unsigned long flags = oops_begin(); int sig = SIGSEGV; - if (!user_mode(regs)) - report_bug(regs->ip, regs); - if (__die(str, regs, err)) sig = 0; oops_end(flags, regs, sig); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index b0b3a3df7c20..e5f0b40e66d2 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -162,15 +162,3 @@ void show_regs(struct pt_regs *regs) } pr_cont("\n"); } - -int is_valid_bugaddr(unsigned long ip) -{ - unsigned short ud2; - - if (ip < PAGE_OFFSET) - return 0; - if (probe_kernel_address((unsigned short *)ip, ud2)) - return 0; - - return ud2 == 0x0b0f; -} diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index a8b117e93b46..3e1471d57487 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -178,13 +178,3 @@ void show_regs(struct pt_regs *regs) } pr_cont("\n"); } - -int is_valid_bugaddr(unsigned long ip) -{ - unsigned short ud2; - - if (__copy_from_user(&ud2, (const void __user *) ip, sizeof(ud2))) - return 0; - - return ud2 == 0x0b0f; -} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 948443e115c1..3c0751b120de 100644 --- 
a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -169,6 +169,37 @@ void ist_end_non_atomic(void) preempt_disable(); } +int is_valid_bugaddr(unsigned long addr) +{ + unsigned short ud; + + if (addr < TASK_SIZE_MAX) + return 0; + + if (probe_kernel_address((unsigned short *)addr, ud)) + return 0; + + return ud == INSN_UD0 || ud == INSN_UD2; +} + +static int fixup_bug(struct pt_regs *regs, int trapnr) +{ + if (trapnr != X86_TRAP_UD) + return 0; + + switch (report_bug(regs->ip, regs)) { + case BUG_TRAP_TYPE_NONE: + case BUG_TRAP_TYPE_BUG: + break; + + case BUG_TRAP_TYPE_WARN: + regs->ip += LEN_UD0; + return 1; + } + + return 0; +} + static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, struct pt_regs *regs, long error_code) @@ -187,12 +218,15 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, } if (!user_mode(regs)) { - if (!fixup_exception(regs, trapnr)) { - tsk->thread.error_code = error_code; - tsk->thread.trap_nr = trapnr; - die(str, regs, error_code); - } - return 0; + if (fixup_exception(regs, trapnr)) + return 0; + + if (fixup_bug(regs, trapnr)) + return 0; + + tsk->thread.error_code = error_code; + tsk->thread.trap_nr = trapnr; + die(str, regs, error_code); } return -1; diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile index e7e7055a8658..76f17f05446f 100644 --- a/arch/x86/um/Makefile +++ b/arch/x86/um/Makefile @@ -8,7 +8,7 @@ else BITS := 64 endif -obj-y = bug.o bugs_$(BITS).o delay.o fault.o ldt.o \ +obj-y = bugs_$(BITS).o delay.o fault.o ldt.o \ ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \ stub_$(BITS).o stub_segv.o \ sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \ diff --git a/arch/x86/um/bug.c b/arch/x86/um/bug.c deleted file mode 100644 index e8034e363d83..000000000000 --- a/arch/x86/um/bug.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (C) 2006 Jeff Dike (jdike@addtoit.com) - * Licensed under the GPL V2 - */ - -#include - -/* - * Mostly copied from i386/x86_86 - eliminated the eip < PAGE_OFFSET because - * that's not relevant in skas mode. - */ - -int is_valid_bugaddr(unsigned long eip) -{ - unsigned short ud2; - - if (probe_kernel_address((unsigned short __user *)eip, ud2)) - return 0; - - return ud2 == 0x0b0f; -} -- cgit v1.2.3 From 70579a86e3c8eb2ce57999e594a73b4dfe095959 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 29 Mar 2017 23:16:31 +0200 Subject: x86/debug: Define BUG() again for !CONFIG_BUG The latest change to the BUG() macro inadvertently reverted the earlier commit: b06dd879f5db ("x86: always define BUG() and HAVE_ARCH_BUG, even with !CONFIG_BUG") ... that sanitized the behavior with CONFIG_BUG=n. I noticed this as some warnings have appeared again that were previously fixed as a side effect of that patch: kernel/seccomp.c: In function '__seccomp_filter': kernel/seccomp.c:670:1: error: no return statement in function returning non-void [-Werror=return-type] ... This combines the two patches and uses the ud2 macro to define BUG() in case of CONFIG_BUG=n. Signed-off-by: Arnd Bergmann Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Josh Triplett Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 9a93848fe787 ("x86/debug: Implement __WARN() using UD0") Link: http://lkml.kernel.org/r/20170329211646.2707365-1-arnd@arndb.de Signed-off-by: Ingo Molnar --- arch/x86/include/asm/bug.h | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index 4fde330c44b7..cecf559d0012 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -23,7 +23,6 @@ #define LEN_UD0 2 #ifdef CONFIG_GENERIC_BUG -#define HAVE_ARCH_BUG #ifdef CONFIG_X86_32 # define __BUG_REL(val) ".long " __stringify(val) @@ -64,6 +63,13 @@ do { \ #endif /* CONFIG_DEBUG_BUGVERBOSE */ +#else + +#define _BUG_FLAGS(ins, flags) asm volatile(ins) + +#endif /* CONFIG_GENERIC_BUG */ + +#define HAVE_ARCH_BUG #define BUG() \ do { \ _BUG_FLAGS(ASM_UD2, 0); \ @@ -72,8 +78,6 @@ do { \ #define __WARN_TAINT(taint) _BUG_FLAGS(ASM_UD0, BUGFLAG_TAINT(taint)) -#endif /* CONFIG_GENERIC_BUG */ - #include #endif /* _ASM_X86_BUG_H */ -- cgit v1.2.3 From 19d436268dde95389c616bb3819da73f0a8b28a8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 25 Feb 2017 08:56:53 +0100 Subject: debug: Add _ONCE() logic to report_bug() Josh suggested moving the _ONCE logic inside the trap handler, using a bit in the bug_entry::flags field, avoiding the need for the extra variable. Sadly this only works for WARN_ON_ONCE(), since the others have printk() statements prior to triggering the trap. Still, this saves a fair amount of text and some data: text data filename 10682460 4530992 defconfig-build/vmlinux.orig 10665111 4530096 defconfig-build/vmlinux.patched Suggested-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/arm64/include/asm/bug.h | 2 +- arch/parisc/include/asm/bug.h | 8 ++++---- arch/powerpc/include/asm/bug.h | 4 ++-- arch/s390/include/asm/bug.h | 4 ++-- arch/sh/include/asm/bug.h | 4 ++-- arch/x86/include/asm/bug.h | 2 +- include/asm-generic/bug.h | 18 +++++++++++++++++- include/asm-generic/vmlinux.lds.h | 5 ++--- include/linux/bug.h | 2 +- lib/bug.c | 28 ++++++++++++++++++++-------- 10 files changed, 52 insertions(+), 25 deletions(-) diff --git a/arch/arm64/include/asm/bug.h b/arch/arm64/include/asm/bug.h index 561190d15881..a9be1072933c 100644 --- a/arch/arm64/include/asm/bug.h +++ b/arch/arm64/include/asm/bug.h @@ -55,7 +55,7 @@ _BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ unreachable(); \ } while (0) -#define __WARN_TAINT(taint) _BUG_FLAGS(BUGFLAG_TAINT(taint)) +#define __WARN_FLAGS(flags) _BUG_FLAGS(BUGFLAG_WARNING|(flags)) #endif /* ! 
CONFIG_GENERIC_BUG */ diff --git a/arch/parisc/include/asm/bug.h b/arch/parisc/include/asm/bug.h index 62a33338549c..d2742273a685 100644 --- a/arch/parisc/include/asm/bug.h +++ b/arch/parisc/include/asm/bug.h @@ -46,7 +46,7 @@ #endif #ifdef CONFIG_DEBUG_BUGVERBOSE -#define __WARN_TAINT(taint) \ +#define __WARN_FLAGS(flags) \ do { \ asm volatile("\n" \ "1:\t" PARISC_BUG_BREAK_ASM "\n" \ @@ -56,11 +56,11 @@ "\t.org 2b+%c3\n" \ "\t.popsection" \ : : "i" (__FILE__), "i" (__LINE__), \ - "i" (BUGFLAG_TAINT(taint)), \ + "i" (BUGFLAG_WARNING|(flags)), \ "i" (sizeof(struct bug_entry)) ); \ } while(0) #else -#define __WARN_TAINT(taint) \ +#define __WARN_FLAGS(flags) \ do { \ asm volatile("\n" \ "1:\t" PARISC_BUG_BREAK_ASM "\n" \ @@ -69,7 +69,7 @@ "\t.short %c0\n" \ "\t.org 2b+%c1\n" \ "\t.popsection" \ - : : "i" (BUGFLAG_TAINT(taint)), \ + : : "i" (BUGFLAG_WARNING|(flags)), \ "i" (sizeof(struct bug_entry)) ); \ } while(0) #endif diff --git a/arch/powerpc/include/asm/bug.h b/arch/powerpc/include/asm/bug.h index 3a39283333c3..f2c562a0a427 100644 --- a/arch/powerpc/include/asm/bug.h +++ b/arch/powerpc/include/asm/bug.h @@ -85,12 +85,12 @@ } \ } while (0) -#define __WARN_TAINT(taint) do { \ +#define __WARN_FLAGS(flags) do { \ __asm__ __volatile__( \ "1: twi 31,0,0\n" \ _EMIT_BUG_ENTRY \ : : "i" (__FILE__), "i" (__LINE__), \ - "i" (BUGFLAG_TAINT(taint)), \ + "i" (BUGFLAG_WARNING|(flags)), \ "i" (sizeof(struct bug_entry))); \ } while (0) diff --git a/arch/s390/include/asm/bug.h b/arch/s390/include/asm/bug.h index bf90d1fd97a5..1bbd9dbfe4e0 100644 --- a/arch/s390/include/asm/bug.h +++ b/arch/s390/include/asm/bug.h @@ -46,8 +46,8 @@ unreachable(); \ } while (0) -#define __WARN_TAINT(taint) do { \ - __EMIT_BUG(BUGFLAG_TAINT(taint)); \ +#define __WARN_FLAGS(flags) do { \ + __EMIT_BUG(BUGFLAG_WARNING|(flags)); \ } while (0) #define WARN_ON(x) ({ \ diff --git a/arch/sh/include/asm/bug.h b/arch/sh/include/asm/bug.h index dcf278075429..1b77f068be2b 100644 --- a/arch/sh/include/asm/bug.h +++ b/arch/sh/include/asm/bug.h @@ -50,7 +50,7 @@ do { \ "i" (sizeof(struct bug_entry))); \ } while (0) -#define __WARN_TAINT(taint) \ +#define __WARN_FLAGS(flags) \ do { \ __asm__ __volatile__ ( \ "1:\t.short %O0\n" \ @@ -59,7 +59,7 @@ do { \ : "n" (TRAPA_BUG_OPCODE), \ "i" (__FILE__), \ "i" (__LINE__), \ - "i" (BUGFLAG_TAINT(taint)), \ + "i" (BUGFLAG_WARNING|(flags)), \ "i" (sizeof(struct bug_entry))); \ } while (0) diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index cecf559d0012..39e702d90cdb 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -76,7 +76,7 @@ do { \ unreachable(); \ } while (0) -#define __WARN_TAINT(taint) _BUG_FLAGS(ASM_UD0, BUGFLAG_TAINT(taint)) +#define __WARN_FLAGS(flags) _BUG_FLAGS(ASM_UD0, BUGFLAG_WARNING|(flags)) #include diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 6f96247226a4..5fe9f39cec44 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -5,6 +5,8 @@ #ifdef CONFIG_GENERIC_BUG #define BUGFLAG_WARNING (1 << 0) +#define BUGFLAG_ONCE (1 << 1) +#define BUGFLAG_DONE (1 << 2) #define BUGFLAG_TAINT(taint) (BUGFLAG_WARNING | ((taint) << 8)) #define BUG_GET_TAINT(bug) ((bug)->flags >> 8) #endif @@ -55,6 +57,18 @@ struct bug_entry { #define BUG_ON(condition) do { if (unlikely(condition)) BUG(); } while (0) #endif +#ifdef __WARN_FLAGS +#define __WARN_TAINT(taint) __WARN_FLAGS(BUGFLAG_TAINT(taint)) +#define __WARN_ONCE_TAINT(taint) __WARN_FLAGS(BUGFLAG_ONCE|BUGFLAG_TAINT(taint)) + +#define WARN_ON_ONCE(condition) ({ 
\ + int __ret_warn_on = !!(condition); \ + if (unlikely(__ret_warn_on)) \ + __WARN_ONCE_TAINT(TAINT_WARN); \ + unlikely(__ret_warn_on); \ +}) +#endif + /* * WARN(), WARN_ON(), WARN_ON_ONCE, and so on can be used to report * significant issues that need prompt attention if they should ever @@ -97,7 +111,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, #endif #ifndef WARN -#define WARN(condition, format...) ({ \ +#define WARN(condition, format...) ({ \ int __ret_warn_on = !!(condition); \ if (unlikely(__ret_warn_on)) \ __WARN_printf(format); \ @@ -112,6 +126,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, unlikely(__ret_warn_on); \ }) +#ifndef WARN_ON_ONCE #define WARN_ON_ONCE(condition) ({ \ static bool __section(.data.unlikely) __warned; \ int __ret_warn_once = !!(condition); \ @@ -122,6 +137,7 @@ void __warn(const char *file, int line, void *caller, unsigned taint, } \ unlikely(__ret_warn_once); \ }) +#endif #define WARN_ONCE(condition, format...) ({ \ static bool __section(.data.unlikely) __warned; \ diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0968d13b3885..51c8dbe8e8d9 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -286,8 +286,6 @@ *(.rodata1) \ } \ \ - BUG_TABLE \ - \ /* PCI quirks */ \ .pci_fixup : AT(ADDR(.pci_fixup) - LOAD_OFFSET) { \ VMLINUX_SYMBOL(__start_pci_fixups_early) = .; \ @@ -855,7 +853,8 @@ READ_MOSTLY_DATA(cacheline) \ DATA_DATA \ CONSTRUCTORS \ - } + } \ + BUG_TABLE #define INIT_TEXT_SECTION(inittext_align) \ . = ALIGN(inittext_align); \ diff --git a/include/linux/bug.h b/include/linux/bug.h index 5828489309bb..687b557fc5eb 100644 --- a/include/linux/bug.h +++ b/include/linux/bug.h @@ -105,7 +105,7 @@ static inline int is_warning_bug(const struct bug_entry *bug) return bug->flags & BUGFLAG_WARNING; } -const struct bug_entry *find_bug(unsigned long bugaddr); +struct bug_entry *find_bug(unsigned long bugaddr); enum bug_trap_type report_bug(unsigned long bug_addr, struct pt_regs *regs); diff --git a/lib/bug.c b/lib/bug.c index 06edbbef0623..a6a1137d06db 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -47,7 +47,7 @@ #include #include -extern const struct bug_entry __start___bug_table[], __stop___bug_table[]; +extern struct bug_entry __start___bug_table[], __stop___bug_table[]; static inline unsigned long bug_addr(const struct bug_entry *bug) { @@ -62,10 +62,10 @@ static inline unsigned long bug_addr(const struct bug_entry *bug) /* Updates are protected by module mutex */ static LIST_HEAD(module_bug_list); -static const struct bug_entry *module_find_bug(unsigned long bugaddr) +static struct bug_entry *module_find_bug(unsigned long bugaddr) { struct module *mod; - const struct bug_entry *bug = NULL; + struct bug_entry *bug = NULL; rcu_read_lock_sched(); list_for_each_entry_rcu(mod, &module_bug_list, bug_list) { @@ -122,15 +122,15 @@ void module_bug_cleanup(struct module *mod) #else -static inline const struct bug_entry *module_find_bug(unsigned long bugaddr) +static inline struct bug_entry *module_find_bug(unsigned long bugaddr) { return NULL; } #endif -const struct bug_entry *find_bug(unsigned long bugaddr) +struct bug_entry *find_bug(unsigned long bugaddr) { - const struct bug_entry *bug; + struct bug_entry *bug; for (bug = __start___bug_table; bug < __stop___bug_table; ++bug) if (bugaddr == bug_addr(bug)) @@ -141,9 +141,9 @@ const struct bug_entry *find_bug(unsigned long bugaddr) enum bug_trap_type report_bug(unsigned long bugaddr, struct 
pt_regs *regs) { - const struct bug_entry *bug; + struct bug_entry *bug; const char *file; - unsigned line, warning; + unsigned line, warning, once, done; if (!is_valid_bugaddr(bugaddr)) return BUG_TRAP_TYPE_NONE; @@ -164,6 +164,18 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) line = bug->line; #endif warning = (bug->flags & BUGFLAG_WARNING) != 0; + once = (bug->flags & BUGFLAG_ONCE) != 0; + done = (bug->flags & BUGFLAG_DONE) != 0; + + if (warning && once) { + if (done) + return BUG_TRAP_TYPE_WARN; + + /* + * Since this is the only store, concurrency is not an issue. + */ + bug->flags |= BUGFLAG_DONE; + } } if (warning) { -- cgit v1.2.3 From b5effd3815ccbe3df1a015a6d67d8a24a27813d5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 30 Mar 2017 17:49:27 +0200 Subject: debug: Fix __bug_table[] in arch linker scripts The kbuild test robot reported this build failure on a number of architectures: > make.cross ARCH=arm > lib/lib.a(bug.o): In function `find_bug': > >> lib/bug.c:135: undefined reference to `__start___bug_table' > >> lib/bug.c:135: undefined reference to `__stop___bug_table' Caused by: 19d436268dde ("debug: Add _ONCE() logic to report_bug()") Which moved the BUG_TABLE from RO_DATA_SECTION() to RW_DATA_SECTION(), but a number of architectures don't use RW_DATA_SECTION(), so they ended up with no __bug_table[] ... Ideally all those would use RW_DATA_SECTION() in their linker scripts, but that's for another day. Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Thomas Gleixner Cc: kbuild test robot Cc: kbuild-all@01.org Cc: tipbuild@zytor.com Link: http://lkml.kernel.org/r/20170330154927.o6qmgfp4bdhrajbm@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/arm/kernel/vmlinux-xip.lds.S | 2 ++ arch/arm/kernel/vmlinux.lds.S | 2 ++ arch/avr32/kernel/vmlinux.lds.S | 1 + arch/blackfin/kernel/vmlinux.lds.S | 2 ++ arch/c6x/kernel/vmlinux.lds.S | 2 ++ arch/cris/kernel/vmlinux.lds.S | 2 ++ arch/frv/kernel/vmlinux.lds.S | 2 ++ arch/ia64/kernel/vmlinux.lds.S | 2 ++ arch/mips/kernel/vmlinux.lds.S | 1 + arch/powerpc/kernel/vmlinux.lds.S | 2 ++ arch/x86/kernel/vmlinux.lds.S | 1 + 11 files changed, 19 insertions(+) diff --git a/arch/arm/kernel/vmlinux-xip.lds.S b/arch/arm/kernel/vmlinux-xip.lds.S index 37b2a11af345..8265b116218d 100644 --- a/arch/arm/kernel/vmlinux-xip.lds.S +++ b/arch/arm/kernel/vmlinux-xip.lds.S @@ -242,6 +242,8 @@ SECTIONS } _edata_loc = __data_loc + SIZEOF(.data); + BUG_TABLE + #ifdef CONFIG_HAVE_TCM /* * We align everything to a page boundary so we can diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S index ce18007f9e4e..c83a7ba737d6 100644 --- a/arch/arm/kernel/vmlinux.lds.S +++ b/arch/arm/kernel/vmlinux.lds.S @@ -262,6 +262,8 @@ SECTIONS } _edata_loc = __data_loc + SIZEOF(.data); + BUG_TABLE + #ifdef CONFIG_HAVE_TCM /* * We align everything to a page boundary so we can diff --git a/arch/avr32/kernel/vmlinux.lds.S b/arch/avr32/kernel/vmlinux.lds.S index 17f2730eb497..623d18db0466 100644 --- a/arch/avr32/kernel/vmlinux.lds.S +++ b/arch/avr32/kernel/vmlinux.lds.S @@ -75,6 +75,7 @@ SECTIONS _edata = .; } + BUG_TABLE BSS_SECTION(0, 8, 8) _end = .; diff --git a/arch/blackfin/kernel/vmlinux.lds.S b/arch/blackfin/kernel/vmlinux.lds.S index 68069a120055..334ef8139b35 100644 --- a/arch/blackfin/kernel/vmlinux.lds.S +++ b/arch/blackfin/kernel/vmlinux.lds.S @@ -115,6 +115,8 @@ SECTIONS __data_lma = LOADADDR(.data); __data_len = SIZEOF(.data); + BUG_TABLE + /* The init section should be last, so when we free 
it, it goes into * the general memory pool, and (hopefully) will decrease fragmentation * a tiny bit. The init section has a _requirement_ that it be diff --git a/arch/c6x/kernel/vmlinux.lds.S b/arch/c6x/kernel/vmlinux.lds.S index a1a5c166bc9b..29ebea49ddd5 100644 --- a/arch/c6x/kernel/vmlinux.lds.S +++ b/arch/c6x/kernel/vmlinux.lds.S @@ -128,6 +128,8 @@ SECTIONS . = ALIGN(8); } + BUG_TABLE + _edata = .; __bss_start = .; diff --git a/arch/cris/kernel/vmlinux.lds.S b/arch/cris/kernel/vmlinux.lds.S index 979586261520..867f237d7c5c 100644 --- a/arch/cris/kernel/vmlinux.lds.S +++ b/arch/cris/kernel/vmlinux.lds.S @@ -68,6 +68,8 @@ SECTIONS __edata = . ; /* End of data section. */ _edata = . ; + BUG_TABLE + INIT_TASK_DATA_SECTION(PAGE_SIZE) . = ALIGN(PAGE_SIZE); /* Init code and data. */ diff --git a/arch/frv/kernel/vmlinux.lds.S b/arch/frv/kernel/vmlinux.lds.S index aa6e573d57da..3f44dcbbad4d 100644 --- a/arch/frv/kernel/vmlinux.lds.S +++ b/arch/frv/kernel/vmlinux.lds.S @@ -102,6 +102,8 @@ SECTIONS _edata = .; /* End of data section */ + BUG_TABLE + /* GP section */ . = ALIGN(L1_CACHE_BYTES); _gp = . + 2048; diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S index f89d20c97412..798026dde52e 100644 --- a/arch/ia64/kernel/vmlinux.lds.S +++ b/arch/ia64/kernel/vmlinux.lds.S @@ -192,6 +192,8 @@ SECTIONS { CONSTRUCTORS } + BUG_TABLE + . = ALIGN(16); /* gp must be 16-byte aligned for exc. table */ .got : AT(ADDR(.got) - LOAD_OFFSET) { *(.got.plt) diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index f0a0e6d62be3..8ca2371aa684 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -97,6 +97,7 @@ SECTIONS DATA_DATA CONSTRUCTORS } + BUG_TABLE _gp = . + 0x8000; .lit8 : { *(.lit8) diff --git a/arch/powerpc/kernel/vmlinux.lds.S b/arch/powerpc/kernel/vmlinux.lds.S index 7394b770ae1f..1c24c894c908 100644 --- a/arch/powerpc/kernel/vmlinux.lds.S +++ b/arch/powerpc/kernel/vmlinux.lds.S @@ -312,6 +312,8 @@ SECTIONS NOSAVE_DATA } + BUG_TABLE + . = ALIGN(PAGE_SIZE); _edata = .; PROVIDE32 (edata = .); diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index c74ae9ce8dc4..c8a3b61be0aa 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -146,6 +146,7 @@ SECTIONS _edata = .; } :data + BUG_TABLE . = ALIGN(PAGE_SIZE); __vvar_page = .; -- cgit v1.2.3 From 5ed8d8bb38c5dcd78de540182cedb0fb19399aab Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 12 Apr 2017 13:47:10 -0500 Subject: x86/unwind: Move common code into update_stack_state() The __unwind_start() and unwind_next_frame() functions have some duplicated functionality. They both call decode_frame_pointer() and set state->regs and state->bp accordingly. Move that functionality to a common place in update_stack_state(). Signed-off-by: Josh Poimboeuf Acked-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Daniel Borkmann Cc: Dave Jones Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/a2ee4801113f6d2300d58f08f6b69f85edf4eb43.1492020577.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_frame.c | 119 +++++++++++++++++++---------------------- 1 file changed, 55 insertions(+), 64 deletions(-) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 08339262b666..9098ef1da8f8 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -135,26 +135,59 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp) return (struct pt_regs *)(regs & ~0x1); } -static bool update_stack_state(struct unwind_state *state, void *addr, - size_t len) +static bool update_stack_state(struct unwind_state *state, + unsigned long *next_bp) { struct stack_info *info = &state->stack_info; - enum stack_type orig_type = info->type; + enum stack_type prev_type = info->type; + struct pt_regs *regs; + unsigned long *frame, *prev_frame_end; + size_t len; + + if (state->regs) + prev_frame_end = (void *)state->regs + regs_size(state->regs); + else + prev_frame_end = (void *)state->bp + FRAME_HEADER_SIZE; + + /* Is the next frame pointer an encoded pointer to pt_regs? */ + regs = decode_frame_pointer(next_bp); + if (regs) { + frame = (unsigned long *)regs; + len = regs_size(regs); + } else { + frame = next_bp; + len = FRAME_HEADER_SIZE; + } /* - * If addr isn't on the current stack, switch to the next one. + * If the next bp isn't on the current stack, switch to the next one. * * We may have to traverse multiple stacks to deal with the possibility - * that 'info->next_sp' could point to an empty stack and 'addr' could - * be on a subsequent stack. + * that info->next_sp could point to an empty stack and the next bp + * could be on a subsequent stack. */ - while (!on_stack(info, addr, len)) + while (!on_stack(info, frame, len)) if (get_stack_info(info->next_sp, state->task, info, &state->stack_mask)) return false; - if (!state->orig_sp || info->type != orig_type) - state->orig_sp = addr; + /* Make sure it only unwinds up and doesn't overlap the prev frame: */ + if (state->orig_sp && state->stack_info.type == prev_type && + frame < prev_frame_end) + return false; + + /* Move state to the next frame: */ + if (regs) { + state->regs = regs; + state->bp = NULL; + } else { + state->bp = next_bp; + state->regs = NULL; + } + + /* Save the original stack pointer for unwind_dump(): */ + if (!state->orig_sp || info->type != prev_type) + state->orig_sp = frame; return true; } @@ -162,14 +195,12 @@ static bool update_stack_state(struct unwind_state *state, void *addr, bool unwind_next_frame(struct unwind_state *state) { struct pt_regs *regs; - unsigned long *next_bp, *next_frame; - size_t next_len; - enum stack_type prev_type = state->stack_info.type; + unsigned long *next_bp; if (unwind_done(state)) return false; - /* have we reached the end? */ + /* Have we reached the end? */ if (state->regs && user_mode(state->regs)) goto the_end; @@ -200,24 +231,14 @@ bool unwind_next_frame(struct unwind_state *state) return true; } - /* get the next frame pointer */ + /* Get the next frame pointer: */ if (state->regs) next_bp = (unsigned long *)state->regs->bp; else - next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task,*state->bp); + next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp); - /* is the next frame pointer an encoded pointer to pt_regs? 
*/ - regs = decode_frame_pointer(next_bp); - if (regs) { - next_frame = (unsigned long *)regs; - next_len = sizeof(*regs); - } else { - next_frame = next_bp; - next_len = FRAME_HEADER_SIZE; - } - - /* make sure the next frame's data is accessible */ - if (!update_stack_state(state, next_frame, next_len)) { + /* Move to the next frame if it's safe: */ + if (!update_stack_state(state, next_bp)) { /* * Don't warn on bad regs->bp. An interrupt in entry code * might cause a false positive warning. @@ -228,24 +249,6 @@ bool unwind_next_frame(struct unwind_state *state) goto bad_address; } - /* Make sure it only unwinds up and doesn't overlap the last frame: */ - if (state->stack_info.type == prev_type) { - if (state->regs && (void *)next_frame < (void *)state->regs + regs_size(state->regs)) - goto bad_address; - - if (state->bp && (void *)next_frame < (void *)state->bp + FRAME_HEADER_SIZE) - goto bad_address; - } - - /* move to the next frame */ - if (regs) { - state->regs = regs; - state->bp = NULL; - } else { - state->bp = next_bp; - state->regs = NULL; - } - return true; bad_address: @@ -263,13 +266,13 @@ bad_address: printk_deferred_once(KERN_WARNING "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", state->regs, state->task->comm, - state->task->pid, next_frame); + state->task->pid, next_bp); unwind_dump(state, (unsigned long *)state->regs); } else { printk_deferred_once(KERN_WARNING "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n", state->bp, state->task->comm, - state->task->pid, next_frame); + state->task->pid, next_bp); unwind_dump(state, state->bp); } the_end: @@ -281,35 +284,23 @@ EXPORT_SYMBOL_GPL(unwind_next_frame); void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame) { - unsigned long *bp, *frame; - size_t len; + unsigned long *bp; memset(state, 0, sizeof(*state)); state->task = task; - /* don't even attempt to start from user mode regs */ + /* Don't even attempt to start from user mode regs: */ if (regs && user_mode(regs)) { state->stack_info.type = STACK_TYPE_UNKNOWN; return; } - /* set up the starting stack frame */ bp = get_frame_pointer(task, regs); - regs = decode_frame_pointer(bp); - if (regs) { - state->regs = regs; - frame = (unsigned long *)regs; - len = sizeof(*regs); - } else { - state->bp = bp; - frame = bp; - len = FRAME_HEADER_SIZE; - } - /* initialize stack info and make sure the frame data is accessible */ - get_stack_info(frame, state->task, &state->stack_info, + /* Initialize stack info and make sure the frame data is accessible: */ + get_stack_info(bp, state->task, &state->stack_info, &state->stack_mask); - update_stack_state(state, frame, len); + update_stack_state(state, bp); /* * The caller can provide the address of the first frame directly -- cgit v1.2.3 From 6bcdf9d51b9892dd5c6892d50cf5e9628efb8062 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 12 Apr 2017 13:47:11 -0500 Subject: x86/unwind: Read stack return address in update_stack_state() Instead of reading the return address when unwind_get_return_address() is called, read it from update_stack_state() and store it in the unwind state. This enables the next patch to check the return address from unwind_next_frame() so it can detect an entry code frame. Signed-off-by: Josh Poimboeuf Acked-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Daniel Borkmann Cc: Dave Jones Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/af0c5e4560c49c0343dca486ea26c4fa92bc4e35.1492020577.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/unwind.h | 1 + arch/x86/kernel/unwind_frame.c | 25 +++++++++++++------------ 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 6fa75b17aec3..5663425f812f 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -14,6 +14,7 @@ struct unwind_state { #ifdef CONFIG_FRAME_POINTER unsigned long *bp, *orig_sp; struct pt_regs *regs; + unsigned long ip; #else unsigned long *sp; #endif diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 9098ef1da8f8..c67a059c6655 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -56,20 +56,10 @@ static void unwind_dump(struct unwind_state *state, unsigned long *sp) unsigned long unwind_get_return_address(struct unwind_state *state) { - unsigned long addr; - unsigned long *addr_p = unwind_get_return_address_ptr(state); - if (unwind_done(state)) return 0; - if (state->regs && user_mode(state->regs)) - return 0; - - addr = READ_ONCE_TASK_STACK(state->task, *addr_p); - addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, addr, - addr_p); - - return __kernel_text_address(addr) ? addr : 0; + return __kernel_text_address(state->ip) ? state->ip : 0; } EXPORT_SYMBOL_GPL(unwind_get_return_address); @@ -141,7 +131,7 @@ static bool update_stack_state(struct unwind_state *state, struct stack_info *info = &state->stack_info; enum stack_type prev_type = info->type; struct pt_regs *regs; - unsigned long *frame, *prev_frame_end; + unsigned long *frame, *prev_frame_end, *addr_p, addr; size_t len; if (state->regs) @@ -185,6 +175,16 @@ static bool update_stack_state(struct unwind_state *state, state->regs = NULL; } + /* Save the return address: */ + if (state->regs && user_mode(state->regs)) + state->ip = 0; + else { + addr_p = unwind_get_return_address_ptr(state); + addr = READ_ONCE_TASK_STACK(state->task, *addr_p); + state->ip = ftrace_graph_ret_addr(state->task, &state->graph_idx, + addr, addr_p); + } + /* Save the original stack pointer for unwind_dump(): */ if (!state->orig_sp || info->type != prev_type) state->orig_sp = frame; @@ -228,6 +228,7 @@ bool unwind_next_frame(struct unwind_state *state) */ state->regs = regs; state->bp = NULL; + state->ip = 0; return true; } -- cgit v1.2.3 From a8b7a92318b6d7779f6d8e9aa6ba0e3de01a8943 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 12 Apr 2017 13:47:12 -0500 Subject: x86/unwind: Silence entry-related warnings A few people have reported unwinder warnings like the following: WARNING: kernel stack frame pointer at ffffc90000fe7ff0 in rsync:1157 has bad value (null) unwind stack type:0 next_sp: (null) mask:2 graph_idx:0 ffffc90000fe7f98: ffffc90000fe7ff0 (0xffffc90000fe7ff0) ffffc90000fe7fa0: ffffffffb7000f56 (trace_hardirqs_off_thunk+0x1a/0x1c) ffffc90000fe7fa8: 0000000000000246 (0x246) ffffc90000fe7fb0: 0000000000000000 ... ffffc90000fe7fc0: 00007ffe3af639bc (0x7ffe3af639bc) ffffc90000fe7fc8: 0000000000000006 (0x6) ffffc90000fe7fd0: 00007f80af433fc5 (0x7f80af433fc5) ffffc90000fe7fd8: 00007ffe3af638e0 (0x7ffe3af638e0) ffffc90000fe7fe0: 00007ffe3af638e0 (0x7ffe3af638e0) ffffc90000fe7fe8: 00007ffe3af63970 (0x7ffe3af63970) ffffc90000fe7ff0: 0000000000000000 ... 
ffffc90000fe7ff8: ffffffffb7b74b9a (entry_SYSCALL_64_after_swapgs+0x17/0x4f) This warning can happen when unwinding a code path where an interrupt occurred in x86 entry code before it set up the first stack frame. Silently ignore any warnings for this case. Reported-by: Daniel Borkmann Reported-by: Dave Jones Signed-off-by: Josh Poimboeuf Acked-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Fixes: c32c47c68a0a ("x86/unwind: Warn on bad frame pointer") Link: http://lkml.kernel.org/r/dbd6838826466a60dc23a52098185bc973ce2f1e.1492020577.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/unwind.h | 1 + arch/x86/kernel/unwind_frame.c | 36 +++++++++++++++++++++++++++--------- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h index 5663425f812f..9b10dcd51716 100644 --- a/arch/x86/include/asm/unwind.h +++ b/arch/x86/include/asm/unwind.h @@ -12,6 +12,7 @@ struct unwind_state { struct task_struct *task; int graph_idx; #ifdef CONFIG_FRAME_POINTER + bool got_irq; unsigned long *bp, *orig_sp; struct pt_regs *regs; unsigned long ip; diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index c67a059c6655..c4611359b617 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -1,6 +1,8 @@ #include #include #include +#include +#include #include #include #include @@ -72,6 +74,21 @@ static size_t regs_size(struct pt_regs *regs) return sizeof(*regs); } +static bool in_entry_code(unsigned long ip) +{ + char *addr = (char *)ip; + + if (addr >= __entry_text_start && addr < __entry_text_end) + return true; + +#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) + if (addr >= __irqentry_text_start && addr < __irqentry_text_end) + return true; +#endif + + return false; +} + #ifdef CONFIG_X86_32 #define GCC_REALIGN_WORDS 3 #else @@ -144,6 +161,7 @@ static bool update_stack_state(struct unwind_state *state, if (regs) { frame = (unsigned long *)regs; len = regs_size(regs); + state->got_irq = true; } else { frame = next_bp; len = FRAME_HEADER_SIZE; @@ -239,16 +257,8 @@ bool unwind_next_frame(struct unwind_state *state) next_bp = (unsigned long *)READ_ONCE_TASK_STACK(state->task, *state->bp); /* Move to the next frame if it's safe: */ - if (!update_stack_state(state, next_bp)) { - /* - * Don't warn on bad regs->bp. An interrupt in entry code - * might cause a false positive warning. 
- */ - if (state->regs) - goto the_end; - + if (!update_stack_state(state, next_bp)) goto bad_address; - } return true; @@ -263,6 +273,13 @@ bad_address: if (state->task != current) goto the_end; + /* + * Don't warn if the unwinder got lost due to an interrupt in entry + * code before the stack was set up: + */ + if (state->got_irq && in_entry_code(state->ip)) + goto the_end; + if (state->regs) { printk_deferred_once(KERN_WARNING "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", @@ -289,6 +306,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, memset(state, 0, sizeof(*state)); state->task = task; + state->got_irq = (regs); /* Don't even attempt to start from user mode regs: */ if (regs && user_mode(regs)) { -- cgit v1.2.3 From f26dee15103f4040bfe21c910131db0cfe6624fc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Apr 2017 10:49:39 +0200 Subject: debug: Avoid setting BUGFLAG_WARNING twice Dan reported that his static checking complains about BUGFLAG_WARNING being set on both sides of the bitwise-or, it figures that that might've been an unintentional mistake. Since there are no architectures that implement __WARN_TAINT() (I converted them all to implement __WARN_FLAGS()), and all __WARN_FLAGS() implementations already set BUGFLAG_WARNING, we can remove the bit from BUGFLAG_TAINT() and make Dan's checker happy. Reported-by: Dan Carpenter Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20170410084939.4bwhrvpmauwfzauq@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/asm-generic/bug.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 5fe9f39cec44..d6f4aed479a1 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -7,7 +7,7 @@ #define BUGFLAG_WARNING (1 << 0) #define BUGFLAG_ONCE (1 << 1) #define BUGFLAG_DONE (1 << 2) -#define BUGFLAG_TAINT(taint) (BUGFLAG_WARNING | ((taint) << 8)) +#define BUGFLAG_TAINT(taint) ((taint) << 8) #define BUG_GET_TAINT(bug) ((bug)->flags >> 8) #endif -- cgit v1.2.3 From e335bb51cc15e80ac180701a0d335ef1c050828e Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 17 Apr 2017 08:44:00 -0500 Subject: x86/unwind: Ensure stack pointer is aligned With frame pointers disabled, on some older versions of GCC (like 4.8.3), it's possible for the stack pointer to get aligned at a half-word boundary: 00000000000004d0 : 4d0: 41 57 push %r15 4d2: 41 56 push %r14 4d4: 41 55 push %r13 4d6: 41 54 push %r12 4d8: 55 push %rbp 4d9: 53 push %rbx 4da: 48 83 ec 24 sub $0x24,%rsp In such a case, the unwinder ends up reading the entire stack at the wrong alignment. Then the last read goes past the end of the stack, hitting the stack guard page: BUG: stack guard page was hit at ffffc900217c4000 (stack is ffffc900217c0000..ffffc900217c3fff) kernel stack overflow (page fault): 0000 [#1] SMP ... Fix it by ensuring the stack pointer is properly aligned before unwinding. Reported-by: Jirka Hladky Signed-off-by: Josh Poimboeuf Acked-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Fixes: 7c7900f89770 ("x86/unwind: Add new unwind interface and implementations") Link: http://lkml.kernel.org/r/cff33847cc9b02fa548625aa23268ac574460d8d.1492436590.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/dumpstack.c | 2 +- arch/x86/kernel/unwind_guess.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 924f45ea4382..dbce3cca94cb 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -77,7 +77,7 @@ void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, * - softirq stack * - hardirq stack */ - for (regs = NULL; stack; stack = stack_info.next_sp) { + for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { const char *stack_name; /* diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c index 22881ddcbb9f..039f36738e49 100644 --- a/arch/x86/kernel/unwind_guess.c +++ b/arch/x86/kernel/unwind_guess.c @@ -34,7 +34,7 @@ bool unwind_next_frame(struct unwind_state *state) return true; } - state->sp = info->next_sp; + state->sp = PTR_ALIGN(info->next_sp, sizeof(long)); } while (!get_stack_info(state->sp, state->task, info, &state->stack_mask)); @@ -49,7 +49,7 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, memset(state, 0, sizeof(*state)); state->task = task; - state->sp = first_frame; + state->sp = PTR_ALIGN(first_frame, sizeof(long)); get_stack_info(first_frame, state->task, &state->stack_info, &state->stack_mask); -- cgit v1.2.3 From 9b135b234644c9881eee4f5d5683da0f4524722f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 18 Apr 2017 08:12:56 -0500 Subject: x86/unwind: Properly zero-pad 32-bit values in unwind_dump() On x86-32, 32-bit stack values printed by unwind_dump() are confusingly zero-padded to 16 characters (64 bits): unwind stack type:0 next_sp: (null) mask:a graph_idx:0 f50cdebc: 00000000f50cdec4 (0xf50cdec4) f50cdec0: 00000000c40489b7 (irq_exit+0x87/0xa0) ... Instead, base the field width on the size of a long integer so that it looks right on both x86-32 and x86-64. x86-32: unwind stack type:1 next_sp: (null) mask:0x2 graph_idx:0 c0ee9d98: c0ee9de0 (init_thread_union+0x1de0/0x2000) c0ee9d9c: c043fd90 (__save_stack_trace+0x50/0xe0) ... x86-64: unwind stack type:1 next_sp: (null) mask:0x2 graph_idx:0 ffffffff81e03b88: ffffffff81e03c10 (init_thread_union+0x3c10/0x4000) ffffffff81e03b90: ffffffff81048f8e (__save_stack_trace+0x5e/0x100) ... Reported-by: H. 
Peter Anvin Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/36b743812e7eb291d74af4e5067736736622daad.1492520933.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_frame.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index c4611359b617..3ebe68770e60 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -48,11 +48,13 @@ static void unwind_dump(struct unwind_state *state, unsigned long *sp) if (zero) { if (!prev_zero) - printk_deferred("%p: %016x ...\n", sp, 0); + printk_deferred("%p: %0*x ...\n", + sp, BITS_PER_LONG/4, 0); continue; } - printk_deferred("%p: %016lx (%pB)\n", sp, word, (void *)word); + printk_deferred("%p: %0*lx (%pB)\n", + sp, BITS_PER_LONG/4, word, (void *)word); } } -- cgit v1.2.3 From 4ea3d7410c3597910071182a6bc258c015942887 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 18 Apr 2017 08:12:57 -0500 Subject: x86/unwind: Prepend hex mask value with '0x' in unwind_dump() In unwind_dump(), the stack mask value is printed in hex, but is confusingly not prepended with '0x'. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e7fe41be19d73c9f99f53082486473febfe08ffa.1492520933.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_frame.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 3ebe68770e60..5d26374e9d64 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -36,7 +36,7 @@ static void unwind_dump(struct unwind_state *state, unsigned long *sp) dumped_before = true; - printk_deferred("unwind stack type:%d next_sp:%p mask:%lx graph_idx:%d\n", + printk_deferred("unwind stack type:%d next_sp:%p mask:0x%lx graph_idx:%d\n", state->stack_info.type, state->stack_info.next_sp, state->stack_mask, state->graph_idx); -- cgit v1.2.3 From aa4f853461aab5f526a312bf418091a95626632b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 18 Apr 2017 08:12:58 -0500 Subject: x86/unwind: Remove unused 'sp' parameter in unwind_dump() The 'sp' parameter to unwind_dump() is unused. Remove it. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/08cb36b004629f6bbcf44c267ae4a609242ebd0b.1492520933.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_frame.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 5d26374e9d64..bda82df34c7b 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -25,11 +25,11 @@ val; \ }) -static void unwind_dump(struct unwind_state *state, unsigned long *sp) +static void unwind_dump(struct unwind_state *state) { static bool dumped_before = false; bool prev_zero, zero = false; - unsigned long word; + unsigned long word, *sp; if (dumped_before) return; @@ -287,13 +287,13 @@ bad_address: "WARNING: kernel stack regs at %p in %s:%d has bad 'bp' value %p\n", state->regs, state->task->comm, state->task->pid, next_bp); - unwind_dump(state, (unsigned long *)state->regs); + unwind_dump(state); } else { printk_deferred_once(KERN_WARNING "WARNING: kernel stack frame pointer at %p in %s:%d has bad value %p\n", state->bp, state->task->comm, state->task->pid, next_bp); - unwind_dump(state, state->bp); + unwind_dump(state); } the_end: state->stack_info.type = STACK_TYPE_UNKNOWN; -- cgit v1.2.3 From dc912c303517b01960dcee6875a78b2999f7c098 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Thu, 20 Apr 2017 17:22:36 -0400 Subject: x86/ftrace: Fix ebp in ftrace_regs_caller that screws up unwinder Fengguang Wu's zero day bot triggered a stack unwinder dump. This can be easily triggered when CONFIG_FRAME_POINTER is enabled and -mfentry is in use on x86_32. ># cd /sys/kernel/debug/tracing ># echo 'p:schedule schedule' > kprobe_events ># echo stacktrace > events/kprobes/schedule/trigger This is because the code that implemented fentry in ftrace_regs_caller tried to use the least amount of #ifdefs and, when CC_USING_FENTRY was defined, modified ebp to point to the parent ip, as it does when CC_USING_FENTRY is not defined. But when CONFIG_FRAME_POINTER is set, this corrupts the ebp register for this frame while doing the tracing. NOTE: it does not corrupt ebp in any other way. It is just a bad frame pointer when calling into the tracing infrastructure. The original ebp is restored before returning from the fentry call. But if a stack trace is performed inside the tracing, the unwinder will notice the bad ebp. Instead of toying with ebp with CC_USING_FENTRY, just slap the parent ip into the second parameter (%edx), and have an #else that does it the original way. The unwinder will unfortunately miss the function being traced, as the stack frame is not set up yet for it, as it is for x86_64. But fixing that is a bit more complex and did not work before anyway. This has been tested with and without CONFIG_FRAME_POINTER being set while using -mfentry, as well as with an older compiler that uses mcount. 
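For context, a minimal sketch that is not part of the patch: x86-32 kernels are built with -mregparm=3, so the first three arguments of a C function arrive in %eax, %edx and %ecx, which is why loading the parent ip straight into %edx hands it to the tracer as the second parameter. An ftrace_ops callback of this era looked roughly like the following (my_trace_func and my_ops are illustrative names, not from the patch):

	#include <linux/ftrace.h>

	/*
	 * Sketch of an ftrace_ops callback. With -mregparm=3 on x86-32:
	 * ip arrives in %eax, parent_ip in %edx, op in %ecx, and the
	 * pt_regs pointer is passed on the stack as the fourth argument
	 * (the "pushl %esp" in ftrace_regs_caller below).
	 */
	static void notrace my_trace_func(unsigned long ip, unsigned long parent_ip,
					  struct ftrace_ops *op, struct pt_regs *regs)
	{
		/* A stack trace taken here is what tripped over the bogus ebp. */
	}

	static struct ftrace_ops my_ops = {
		.func	= my_trace_func,
		.flags	= FTRACE_OPS_FL_SAVE_REGS,	/* ask for pt_regs */
	};

Such a callback would be installed with register_ftrace_function(&my_ops); without FTRACE_OPS_FL_SAVE_REGS the regs argument may be NULL and the lighter non-regs trampoline is used instead.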
Analyzed-by: Josh Poimboeuf Fixes: 644e0e8dc76b ("x86/ftrace: Add -mfentry support to x86_32 with DYNAMIC_FTRACE set") Reported-by: kernel test robot Signed-off-by: Steven Rostedt (VMware) Cc: Masami Hiramatsu Cc: Josh Poimboeuf Link: https://lists.01.org/pipermail/lkp/2017-April/006165.html Link: http://lkml.kernel.org/r/20170420172236.7af7f6e5@gandalf.local.home Signed-off-by: Thomas Gleixner --- arch/x86/kernel/ftrace_32.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/ftrace_32.S b/arch/x86/kernel/ftrace_32.S index 07f40359c9ea..722a145b4139 100644 --- a/arch/x86/kernel/ftrace_32.S +++ b/arch/x86/kernel/ftrace_32.S @@ -128,14 +128,14 @@ ENTRY(ftrace_regs_caller) pushl %edx pushl %ecx pushl %ebx -#ifdef CC_USING_FENTRY - /* Load 4 off of the parent ip addr into ebp */ - lea 14*4(%esp), %ebp -#endif movl 12*4(%esp), %eax /* Load ip (1st parameter) */ subl $MCOUNT_INSN_SIZE, %eax /* Adjust ip */ +#ifdef CC_USING_FENTRY + movl 15*4(%esp), %edx /* Load parent ip (2nd parameter) */ +#else movl 0x4(%ebp), %edx /* Load parent ip (2nd parameter) */ +#endif movl function_trace_op, %ecx /* Save ftrace_pos in 3rd parameter */ pushl %esp /* Save pt_regs as 4th parameter */ -- cgit v1.2.3 From b0d50c7b5d807ce6f7ba58e42b260e92bd7d88fb Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 25 Apr 2017 20:48:51 -0500 Subject: x86/unwind: Silence more entry-code related warnings Borislav Petkov reported the following unwinder warning: WARNING: kernel stack regs at ffffc9000024fea8 in udevadm:92 has bad 'bp' value 00007fffc4614d30 unwind stack type:0 next_sp: (null) mask:0x6 graph_idx:0 ffffc9000024fea8: 000055a6100e9b38 (0x55a6100e9b38) ffffc9000024feb0: 000055a6100e9b35 (0x55a6100e9b35) ffffc9000024feb8: 000055a6100e9f68 (0x55a6100e9f68) ffffc9000024fec0: 000055a6100e9f50 (0x55a6100e9f50) ffffc9000024fec8: 00007fffc4614d30 (0x7fffc4614d30) ffffc9000024fed0: 000055a6100eaf50 (0x55a6100eaf50) ffffc9000024fed8: 0000000000000000 ... ffffc9000024fee0: 0000000000000100 (0x100) ffffc9000024fee8: ffff8801187df488 (0xffff8801187df488) ffffc9000024fef0: 00007ffffffff000 (0x7ffffffff000) ffffc9000024fef8: 0000000000000000 ... ffffc9000024ff10: ffffc9000024fe98 (0xffffc9000024fe98) ffffc9000024ff18: 00007fffc4614d00 (0x7fffc4614d00) ffffc9000024ff20: ffffffffffffff10 (0xffffffffffffff10) ffffc9000024ff28: ffffffff811c6c1f (SyS_newlstat+0xf/0x10) ffffc9000024ff30: 0000000000000010 (0x10) ffffc9000024ff38: 0000000000000296 (0x296) ffffc9000024ff40: ffffc9000024ff50 (0xffffc9000024ff50) ffffc9000024ff48: 0000000000000018 (0x18) ffffc9000024ff50: ffffffff816b2e6a (entry_SYSCALL_64_fastpath+0x18/0xa8) ... It unwinded from an interrupt which came in right after entry code called into a C syscall handler, before it had a chance to set up the frame pointer, so regs->bp still had its user space value. Add a check to silence warnings in such a case, where an interrupt has occurred and regs->sp is almost at the end of the stack. Reported-by: Borislav Petkov Tested-by: Borislav Petkov Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: c32c47c68a0a ("x86/unwind: Warn on bad frame pointer") Link: http://lkml.kernel.org/r/c695f0d0d4c2cfe6542b90e2d0520e11eb901eb5.1493171120.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_frame.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index bda82df34c7b..ae0821f6c3a5 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -91,16 +91,26 @@ static bool in_entry_code(unsigned long ip) return false; } +static inline unsigned long *last_frame(struct unwind_state *state) +{ + return (unsigned long *)task_pt_regs(state->task) - 2; +} + #ifdef CONFIG_X86_32 #define GCC_REALIGN_WORDS 3 #else #define GCC_REALIGN_WORDS 1 #endif +static inline unsigned long *last_aligned_frame(struct unwind_state *state) +{ + return last_frame(state) - GCC_REALIGN_WORDS; +} + static bool is_last_task_frame(struct unwind_state *state) { - unsigned long *last_bp = (unsigned long *)task_pt_regs(state->task) - 2; - unsigned long *aligned_bp = last_bp - GCC_REALIGN_WORDS; + unsigned long *last_bp = last_frame(state); + unsigned long *aligned_bp = last_aligned_frame(state); /* * We have to check for the last task frame at two different locations @@ -277,10 +287,14 @@ bad_address: /* * Don't warn if the unwinder got lost due to an interrupt in entry - * code before the stack was set up: + * code or in the C handler before the first frame pointer got set up: */ if (state->got_irq && in_entry_code(state->ip)) goto the_end; + if (state->regs && + state->regs->sp >= (unsigned long)last_aligned_frame(state) && + state->regs->sp < (unsigned long)task_pt_regs(state->task)) + goto the_end; if (state->regs) { printk_deferred_once(KERN_WARNING -- cgit v1.2.3 From 262fa734a0023a43391f9bd4a4099487b8393f35 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 25 Apr 2017 20:48:52 -0500 Subject: x86/unwind: Dump all stacks in unwind_dump() Currently unwind_dump() dumps only the most recently accessed stack. But it has a few issues. In some cases, 'first_sp' can get out of sync with 'stack_info', causing unwind_dump() to start from the wrong address, flood the printk buffer, and eventually read a bad address. In other cases, dumping only the most recently accessed stack doesn't give enough data to diagnose the error. Fix both issues by dumping *all* stacks involved in the trace, not just the last one. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 8b5e99f02264 ("x86/unwind: Dump stack data on warnings") Link: http://lkml.kernel.org/r/016d6a9810d7d1bfc87ef8c0e6ee041c6744c909.1493171120.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/unwind_frame.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index ae0821f6c3a5..fec70fe3b1ec 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -30,6 +30,8 @@ static void unwind_dump(struct unwind_state *state) static bool dumped_before = false; bool prev_zero, zero = false; unsigned long word, *sp; + struct stack_info stack_info = {0}; + unsigned long visit_mask = 0; if (dumped_before) return; @@ -40,21 +42,27 @@ static void unwind_dump(struct unwind_state *state) state->stack_info.type, state->stack_info.next_sp, state->stack_mask, state->graph_idx); - for (sp = state->orig_sp; sp < state->stack_info.end; sp++) { - word = READ_ONCE_NOCHECK(*sp); + for (sp = state->orig_sp; sp; sp = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + if (get_stack_info(sp, state->task, &stack_info, &visit_mask)) + break; - prev_zero = zero; - zero = word == 0; + for (; sp < stack_info.end; sp++) { - if (zero) { - if (!prev_zero) - printk_deferred("%p: %0*x ...\n", - sp, BITS_PER_LONG/4, 0); - continue; - } + word = READ_ONCE_NOCHECK(*sp); + + prev_zero = zero; + zero = word == 0; - printk_deferred("%p: %0*lx (%pB)\n", - sp, BITS_PER_LONG/4, word, (void *)word); + if (zero) { + if (!prev_zero) + printk_deferred("%p: %0*x ...\n", + sp, BITS_PER_LONG/4, 0); + continue; + } + + printk_deferred("%p: %0*lx (%pB)\n", + sp, BITS_PER_LONG/4, word, (void *)word); + } } } @@ -216,7 +224,7 @@ static bool update_stack_state(struct unwind_state *state, } /* Save the original stack pointer for unwind_dump(): */ - if (!state->orig_sp || info->type != prev_type) + if (!state->orig_sp) state->orig_sp = frame; return true; -- cgit v1.2.3
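As a footnote on the PTR_ALIGN() pattern used in the last two unwinder fixes above: the macro rounds a pointer up to the next power-of-two boundary, so a half-word-aligned next_sp is nudged back onto a sizeof(long) boundary before get_stack_info() sees it. A small userspace sketch of the same arithmetic (the macros here are re-implementations for illustration only, and the sample address is made up):

	#include <stdio.h>
	#include <stdint.h>

	/* Userspace re-implementation of the kernel's PTR_ALIGN();
	 * like the kernel macro, it assumes 'a' is a power of two. */
	#define ALIGN_MASK(x, mask)	(((x) + (mask)) & ~(mask))
	#define PTR_ALIGN(p, a)		((void *)ALIGN_MASK((uintptr_t)(p), (uintptr_t)(a) - 1))

	int main(void)
	{
		/* A stack pointer left on a half-word boundary by old GCC: */
		uintptr_t sp = 0x217c3f6c;

		/* Rounds up to the next sizeof(long) boundary; on a 64-bit
		 * build this prints "0x217c3f6c -> 0x217c3f70". */
		printf("%#lx -> %p\n", (unsigned long)sp, PTR_ALIGN(sp, sizeof(long)));
		return 0;
	}

An already-aligned pointer is returned unchanged, which is why applying PTR_ALIGN() unconditionally in the stack-walk loops is safe.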