diff options
Diffstat (limited to 'arch/x86_64/kernel')
41 files changed, 1194 insertions, 675 deletions
diff --git a/arch/x86_64/kernel/Makefile b/arch/x86_64/kernel/Makefile index 4d94c51803d8..ff5d8c9b96d9 100644 --- a/arch/x86_64/kernel/Makefile +++ b/arch/x86_64/kernel/Makefile @@ -26,7 +26,7 @@ obj-y += io_apic.o mpparse.o genapic.o genapic_flat.o obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o crash.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_PM) += suspend.o -obj-$(CONFIG_SOFTWARE_SUSPEND) += suspend_asm.o +obj-$(CONFIG_HIBERNATION) += suspend_asm.o obj-$(CONFIG_CPU_FREQ) += cpufreq/ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o obj-$(CONFIG_IOMMU) += pci-gart.o aperture.o @@ -43,6 +43,7 @@ obj-$(CONFIG_PCI) += early-quirks.o obj-y += topology.o obj-y += intel_cacheinfo.o +obj-y += addon_cpuid_features.o obj-y += pcspeaker.o CFLAGS_vsyscall.o := $(PROFILING) -g0 @@ -53,6 +54,7 @@ cpuid-$(subst m,y,$(CONFIG_X86_CPUID)) += ../../i386/kernel/cpuid.o topology-y += ../../i386/kernel/topology.o microcode-$(subst m,y,$(CONFIG_MICROCODE)) += ../../i386/kernel/microcode.o intel_cacheinfo-y += ../../i386/kernel/cpu/intel_cacheinfo.o +addon_cpuid_features-y += ../../i386/kernel/cpu/addon_cpuid_features.o quirks-y += ../../i386/kernel/quirks.o i8237-y += ../../i386/kernel/i8237.o msr-$(subst m,y,$(CONFIG_X86_MSR)) += ../../i386/kernel/msr.o diff --git a/arch/x86_64/kernel/acpi/sleep.c b/arch/x86_64/kernel/acpi/sleep.c index 195b7034a148..79475d237071 100644 --- a/arch/x86_64/kernel/acpi/sleep.c +++ b/arch/x86_64/kernel/acpi/sleep.c @@ -51,11 +51,9 @@ Low-Level Sleep Support -------------------------------------------------------------------------- */ -#ifdef CONFIG_ACPI_SLEEP - /* address in low memory of the wakeup routine. 
*/ unsigned long acpi_wakeup_address = 0; -unsigned long acpi_video_flags; +unsigned long acpi_realmode_flags; extern char wakeup_start, wakeup_end; extern unsigned long acpi_copy_wakeup_routine(unsigned long); @@ -103,9 +101,11 @@ static int __init acpi_sleep_setup(char *str) { while ((str != NULL) && (*str != '\0')) { if (strncmp(str, "s3_bios", 7) == 0) - acpi_video_flags = 1; + acpi_realmode_flags |= 1; if (strncmp(str, "s3_mode", 7) == 0) - acpi_video_flags |= 2; + acpi_realmode_flags |= 2; + if (strncmp(str, "s3_beep", 7) == 0) + acpi_realmode_flags |= 4; str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); @@ -115,8 +115,6 @@ static int __init acpi_sleep_setup(char *str) __setup("acpi_sleep=", acpi_sleep_setup); -#endif /*CONFIG_ACPI_SLEEP */ - void acpi_pci_link_exit(void) { } diff --git a/arch/x86_64/kernel/acpi/wakeup.S b/arch/x86_64/kernel/acpi/wakeup.S index 8550a6ffa275..a06f2bcabef9 100644 --- a/arch/x86_64/kernel/acpi/wakeup.S +++ b/arch/x86_64/kernel/acpi/wakeup.S @@ -16,6 +16,21 @@ # cs = 0x1234, eip = 0x05 # +#define BEEP \ + inb $97, %al; \ + outb %al, $0x80; \ + movb $3, %al; \ + outb %al, $97; \ + outb %al, $0x80; \ + movb $-74, %al; \ + outb %al, $67; \ + outb %al, $0x80; \ + movb $-119, %al; \ + outb %al, $66; \ + outb %al, $0x80; \ + movb $15, %al; \ + outb %al, $66; + ALIGN .align 16 @@ -33,6 +48,13 @@ wakeup_code: movw %cs, %ax movw %ax, %ds # Make ds:0 point to wakeup_start movw %ax, %ss + + # Data segment must be set up before we can see whether to beep. 
+ testl $4, realmode_flags - wakeup_code + jz 1f + BEEP +1: + # Private stack is needed for ASUS board mov $(wakeup_stack - wakeup_code), %sp @@ -48,7 +70,7 @@ wakeup_code: testl %eax, %eax jnz no_longmode - testl $1, video_flags - wakeup_code + testl $1, realmode_flags - wakeup_code jz 1f lcall $0xc000,$3 movw %cs, %ax @@ -56,10 +78,10 @@ wakeup_code: movw %ax, %ss 1: - testl $2, video_flags - wakeup_code + testl $2, realmode_flags - wakeup_code jz 1f mov video_mode - wakeup_code, %ax - call mode_seta + call mode_set 1: movw $0xb800, %ax @@ -230,7 +252,7 @@ gdt_48a: real_magic: .quad 0 video_mode: .quad 0 -video_flags: .quad 0 +realmode_flags: .quad 0 .code16 bogus_real_magic: @@ -269,52 +291,31 @@ no_longmode: #define VIDEO_FIRST_V7 0x0900 # Setting of user mode (AX=mode ID) => CF=success + +# For now, we only handle VESA modes (0x0200..0x03ff). To handle other +# modes, we should probably compile in the video code from the boot +# directory. .code16 -mode_seta: +mode_set: movw %ax, %bx -#if 0 - cmpb $0xff, %ah - jz setalias - - testb $VIDEO_RECALC>>8, %ah - jnz _setrec - - cmpb $VIDEO_FIRST_RESOLUTION>>8, %ah - jnc setres - - cmpb $VIDEO_FIRST_SPECIAL>>8, %ah - jz setspc - - cmpb $VIDEO_FIRST_V7>>8, %ah - jz setv7 -#endif - - cmpb $VIDEO_FIRST_VESA>>8, %ah - jnc check_vesaa -#if 0 - orb %ah, %ah - jz setmenu -#endif - - decb %ah -# jz setbios Add bios modes later + subb $VIDEO_FIRST_VESA>>8, %bh + cmpb $2, %bh + jb check_vesa -setbada: clc +setbad: + clc ret -check_vesaa: - subb $VIDEO_FIRST_VESA>>8, %bh +check_vesa: orw $0x4000, %bx # Use linear frame buffer movw $0x4f02, %ax # VESA BIOS mode set call int $0x10 cmpw $0x004f, %ax # AL=4f if implemented - jnz _setbada # AH=0 if OK + jnz setbad # AH=0 if OK stc ret -_setbada: jmp setbada - wakeup_stack_begin: # Stack grows down .org 0xff0 @@ -346,8 +347,8 @@ ENTRY(acpi_copy_wakeup_routine) movl saved_video_mode, %edx movl %edx, video_mode - wakeup_start (,%rdi) - movl acpi_video_flags, %edx - movl %edx, 
video_flags - wakeup_start (,%rdi) + movl acpi_realmode_flags, %edx + movl %edx, realmode_flags - wakeup_start (,%rdi) movq $0x12345678, real_magic - wakeup_start (,%rdi) movq $0x123456789abcdef0, %rdx movq %rdx, saved_magic diff --git a/arch/x86_64/kernel/aperture.c b/arch/x86_64/kernel/aperture.c index a3d450d6c15b..8f681cae7bf7 100644 --- a/arch/x86_64/kernel/aperture.c +++ b/arch/x86_64/kernel/aperture.c @@ -20,7 +20,7 @@ #include <linux/ioport.h> #include <asm/e820.h> #include <asm/io.h> -#include <asm/proto.h> +#include <asm/iommu.h> #include <asm/pci-direct.h> #include <asm/dma.h> #include <asm/k8.h> @@ -214,7 +214,7 @@ void __init iommu_hole_init(void) if (iommu_aperture_disabled || !fix_aperture || !early_pci_allowed()) return; - printk("Checking aperture...\n"); + printk(KERN_INFO "Checking aperture...\n"); fix = 0; for (num = 24; num < 32; num++) { diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 1b0e07bb8728..925758dbca0c 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -92,8 +92,9 @@ unsigned int safe_apic_wait_icr_idle(void) void enable_NMI_through_LVT0 (void * dummy) { unsigned int v; - - v = APIC_DM_NMI; /* unmask and set to NMI */ + + /* unmask and set to NMI */ + v = APIC_DM_NMI; apic_write(APIC_LVT0, v); } @@ -120,7 +121,7 @@ void ack_bad_irq(unsigned int irq) * holds up an irq slot - in excessive cases (when multiple * unexpected vectors occur) that might lock up the APIC * completely. - * But don't ack when the APIC is disabled. -AK + * But don't ack when the APIC is disabled. -AK */ if (!disable_apic) ack_APIC_irq(); @@ -616,7 +617,7 @@ early_param("apic", apic_set_verbosity); * Detect and enable local APICs on non-SMP boards. * Original code written by Keir Fraser. * On AMD64 we trust the BIOS - if it says no APIC it is likely - * not correctly set up (usually the APIC timer won't work etc.) + * not correctly set up (usually the APIC timer won't work etc.) 
*/ static int __init detect_init_APIC (void) @@ -789,13 +790,11 @@ static void setup_APIC_timer(unsigned int clocks) local_irq_save(flags); /* wait for irq slice */ - if (hpet_address && hpet_use_timer) { - int trigger = hpet_readl(HPET_T0_CMP); - while (hpet_readl(HPET_COUNTER) >= trigger) - /* do nothing */ ; - while (hpet_readl(HPET_COUNTER) < trigger) - /* do nothing */ ; - } else { + if (hpet_address && hpet_use_timer) { + u32 trigger = hpet_readl(HPET_T0_CMP); + while (hpet_readl(HPET_T0_CMP) == trigger) + /* do nothing */ ; + } else { int c1, c2; outb_p(0x00, 0x43); c2 = inb_p(0x40); @@ -881,10 +880,10 @@ static unsigned int calibration_result; void __init setup_boot_APIC_clock (void) { - if (disable_apic_timer) { - printk(KERN_INFO "Disabling APIC timer\n"); - return; - } + if (disable_apic_timer) { + printk(KERN_INFO "Disabling APIC timer\n"); + return; + } printk(KERN_INFO "Using local APIC timer interrupts.\n"); using_apic_timer = 1; @@ -990,8 +989,8 @@ int setup_profiling_timer(unsigned int multiplier) return -EINVAL; } -void setup_APIC_extened_lvt(unsigned char lvt_off, unsigned char vector, - unsigned char msg_type, unsigned char mask) +void setup_APIC_extended_lvt(unsigned char lvt_off, unsigned char vector, + unsigned char msg_type, unsigned char mask) { unsigned long reg = (lvt_off << 4) + K8_APIC_EXT_LVT_BASE; unsigned int v = (mask << 16) | (msg_type << 8) | vector; @@ -1128,20 +1127,6 @@ asmlinkage void smp_spurious_interrupt(void) if (v & (1 << (SPURIOUS_APIC_VECTOR & 0x1f))) ack_APIC_irq(); -#if 0 - static unsigned long last_warning; - static unsigned long skipped; - - /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - if (time_before(last_warning+30*HZ,jiffies)) { - printk(KERN_INFO "spurious APIC interrupt on CPU#%d, %ld skipped.\n", - smp_processor_id(), skipped); - last_warning = jiffies; - skipped = 0; - } else { - skipped++; - } -#endif irq_exit(); } @@ -1173,11 +1158,11 @@ asmlinkage void smp_error_interrupt(void) 7: Illegal register 
address */ printk (KERN_DEBUG "APIC error on CPU%d: %02x(%02x)\n", - smp_processor_id(), v , v1); + smp_processor_id(), v , v1); irq_exit(); } -int disable_apic; +int disable_apic; /* * This initializes the IO-APIC and APIC hardware if this is @@ -1185,11 +1170,11 @@ int disable_apic; */ int __init APIC_init_uniprocessor (void) { - if (disable_apic) { + if (disable_apic) { printk(KERN_INFO "Apic disabled\n"); - return -1; + return -1; } - if (!cpu_has_apic) { + if (!cpu_has_apic) { disable_apic = 1; printk(KERN_INFO "Apic disabled by BIOS\n"); return -1; @@ -1211,8 +1196,8 @@ int __init APIC_init_uniprocessor (void) return 0; } -static __init int setup_disableapic(char *str) -{ +static __init int setup_disableapic(char *str) +{ disable_apic = 1; clear_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability); return 0; @@ -1220,10 +1205,10 @@ static __init int setup_disableapic(char *str) early_param("disableapic", setup_disableapic); /* same as disableapic, for compatibility */ -static __init int setup_nolapic(char *str) -{ +static __init int setup_nolapic(char *str) +{ return setup_disableapic(str); -} +} early_param("nolapic", setup_nolapic); static int __init parse_lapic_timer_c2_ok(char *arg) @@ -1233,13 +1218,13 @@ static int __init parse_lapic_timer_c2_ok(char *arg) } early_param("lapic_timer_c2_ok", parse_lapic_timer_c2_ok); -static __init int setup_noapictimer(char *str) -{ +static __init int setup_noapictimer(char *str) +{ if (str[0] != ' ' && str[0] != 0) return 0; disable_apic_timer = 1; return 1; -} +} static __init int setup_apicmaintimer(char *str) { @@ -1264,5 +1249,5 @@ static __init int setup_apicpmtimer(char *s) } __setup("apicpmtimer", setup_apicpmtimer); -__setup("noapictimer", setup_noapictimer); +__setup("noapictimer", setup_noapictimer); diff --git a/arch/x86_64/kernel/bugs.c b/arch/x86_64/kernel/bugs.c index c3c6b91566ed..4e5e9d364d63 100644 --- a/arch/x86_64/kernel/bugs.c +++ b/arch/x86_64/kernel/bugs.c @@ -8,6 +8,7 @@ #include <linux/kernel.h> 
#include <linux/init.h> #include <asm/alternative.h> +#include <asm/bugs.h> #include <asm/processor.h> #include <asm/mtrr.h> diff --git a/arch/x86_64/kernel/cpufreq/Kconfig b/arch/x86_64/kernel/cpufreq/Kconfig index c0749d2479f5..a3fd51926cbd 100644 --- a/arch/x86_64/kernel/cpufreq/Kconfig +++ b/arch/x86_64/kernel/cpufreq/Kconfig @@ -48,10 +48,6 @@ config X86_SPEEDSTEP_CENTRINO If in doubt, say N. -config X86_SPEEDSTEP_CENTRINO_ACPI - bool - depends on X86_SPEEDSTEP_CENTRINO - config X86_ACPI_CPUFREQ tristate "ACPI Processor P-States driver" select CPU_FREQ_TABLE @@ -73,7 +69,7 @@ comment "shared options" config X86_ACPI_CPUFREQ_PROC_INTF bool "/proc/acpi/processor/../performance interface (deprecated)" depends on PROC_FS - depends on X86_ACPI_CPUFREQ || X86_SPEEDSTEP_CENTRINO_ACPI || X86_POWERNOW_K8_ACPI + depends on X86_ACPI_CPUFREQ || X86_POWERNOW_K8_ACPI help This enables the deprecated /proc/acpi/processor/../performance interface. While it is helpful for debugging, the generic, diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index 13c6c37610e0..0f4d5e209e9b 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -194,37 +194,6 @@ unsigned long __init e820_end_of_ram(void) } /* - * Find the hole size in the range. - */ -unsigned long __init e820_hole_size(unsigned long start, unsigned long end) -{ - unsigned long ram = 0; - int i; - - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - unsigned long last, addr; - - if (ei->type != E820_RAM || - ei->addr+ei->size <= start || - ei->addr >= end) - continue; - - addr = round_up(ei->addr, PAGE_SIZE); - if (addr < start) - addr = start; - - last = round_down(ei->addr + ei->size, PAGE_SIZE); - if (last >= end) - last = end; - - if (last > addr) - ram += last - addr; - } - return ((end - start) - ram); -} - -/* * Mark e820 reserved areas as busy for the resource manager. 
*/ void __init e820_reserve_resources(void) @@ -289,47 +258,61 @@ void __init e820_mark_nosave_regions(void) } } -/* Walk the e820 map and register active regions within a node */ -void __init -e820_register_active_regions(int nid, unsigned long start_pfn, - unsigned long end_pfn) +/* + * Finds an active region in the address range from start_pfn to end_pfn and + * returns its range in ei_startpfn and ei_endpfn for the e820 entry. + */ +static int __init e820_find_active_region(const struct e820entry *ei, + unsigned long start_pfn, + unsigned long end_pfn, + unsigned long *ei_startpfn, + unsigned long *ei_endpfn) { - int i; - unsigned long ei_startpfn, ei_endpfn; - for (i = 0; i < e820.nr_map; i++) { - struct e820entry *ei = &e820.map[i]; - ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT; - ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) - >> PAGE_SHIFT; + *ei_startpfn = round_up(ei->addr, PAGE_SIZE) >> PAGE_SHIFT; + *ei_endpfn = round_down(ei->addr + ei->size, PAGE_SIZE) >> PAGE_SHIFT; - /* Skip map entries smaller than a page */ - if (ei_startpfn >= ei_endpfn) - continue; + /* Skip map entries smaller than a page */ + if (*ei_startpfn >= *ei_endpfn) + return 0; - /* Check if end_pfn_map should be updated */ - if (ei->type != E820_RAM && ei_endpfn > end_pfn_map) - end_pfn_map = ei_endpfn; + /* Check if end_pfn_map should be updated */ + if (ei->type != E820_RAM && *ei_endpfn > end_pfn_map) + end_pfn_map = *ei_endpfn; - /* Skip if map is outside the node */ - if (ei->type != E820_RAM || - ei_endpfn <= start_pfn || - ei_startpfn >= end_pfn) - continue; + /* Skip if map is outside the node */ + if (ei->type != E820_RAM || *ei_endpfn <= start_pfn || + *ei_startpfn >= end_pfn) + return 0; - /* Check for overlaps */ - if (ei_startpfn < start_pfn) - ei_startpfn = start_pfn; - if (ei_endpfn > end_pfn) - ei_endpfn = end_pfn; + /* Check for overlaps */ + if (*ei_startpfn < start_pfn) + *ei_startpfn = start_pfn; + if (*ei_endpfn > end_pfn) + *ei_endpfn = 
end_pfn; - /* Obey end_user_pfn to save on memmap */ - if (ei_startpfn >= end_user_pfn) - continue; - if (ei_endpfn > end_user_pfn) - ei_endpfn = end_user_pfn; + /* Obey end_user_pfn to save on memmap */ + if (*ei_startpfn >= end_user_pfn) + return 0; + if (*ei_endpfn > end_user_pfn) + *ei_endpfn = end_user_pfn; - add_active_range(nid, ei_startpfn, ei_endpfn); - } + return 1; +} + +/* Walk the e820 map and register active regions within a node */ +void __init +e820_register_active_regions(int nid, unsigned long start_pfn, + unsigned long end_pfn) +{ + unsigned long ei_startpfn; + unsigned long ei_endpfn; + int i; + + for (i = 0; i < e820.nr_map; i++) + if (e820_find_active_region(&e820.map[i], + start_pfn, end_pfn, + &ei_startpfn, &ei_endpfn)) + add_active_range(nid, ei_startpfn, ei_endpfn); } /* @@ -350,12 +333,35 @@ void __init add_memory_region(unsigned long start, unsigned long size, int type) e820.nr_map++; } +/* + * Find the hole size (in bytes) in the memory range. + * @start: starting address of the memory range to scan + * @end: ending address of the memory range to scan + */ +unsigned long __init e820_hole_size(unsigned long start, unsigned long end) +{ + unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long end_pfn = end >> PAGE_SHIFT; + unsigned long ei_startpfn; + unsigned long ei_endpfn; + unsigned long ram = 0; + int i; + + for (i = 0; i < e820.nr_map; i++) { + if (e820_find_active_region(&e820.map[i], + start_pfn, end_pfn, + &ei_startpfn, &ei_endpfn)) + ram += ei_endpfn - ei_startpfn; + } + return end - start - (ram << PAGE_SHIFT); +} + void __init e820_print_map(char *who) { int i; for (i = 0; i < e820.nr_map; i++) { - printk(" %s: %016Lx - %016Lx ", who, + printk(KERN_INFO " %s: %016Lx - %016Lx ", who, (unsigned long long) e820.map[i].addr, (unsigned long long) (e820.map[i].addr + e820.map[i].size)); switch (e820.map[i].type) { diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c index 
990d9c218a5d..13aa4fd728f3 100644 --- a/arch/x86_64/kernel/early-quirks.c +++ b/arch/x86_64/kernel/early-quirks.c @@ -14,6 +14,7 @@ #include <linux/pci_ids.h> #include <asm/pci-direct.h> #include <asm/proto.h> +#include <asm/iommu.h> #include <asm/dma.h> static void __init via_bugs(void) diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index 296d2b0c5d88..fd9aff3f3890 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -6,6 +6,7 @@ #include <asm/io.h> #include <asm/processor.h> #include <asm/fcntl.h> +#include <xen/hvc-console.h> /* Simple VGA output */ @@ -242,6 +243,10 @@ static int __init setup_early_printk(char *buf) simnow_init(buf + 6); early_console = &simnow_console; keep_early = 1; +#ifdef CONFIG_HVC_XEN + } else if (!strncmp(buf, "xen", 3)) { + early_console = &xenboot_console; +#endif } if (keep_early) diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index a67f87bf4015..1d232e5f5658 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -282,7 +282,7 @@ sysret_careful: sysret_signal: TRACE_IRQS_ON sti - testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx jz 1f /* Really a signal */ @@ -375,7 +375,7 @@ int_very_careful: jmp int_restore_rest int_signal: - testl $(_TIF_NOTIFY_RESUME|_TIF_SIGPENDING|_TIF_SINGLESTEP),%edx + testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx jz 1f movq %rsp,%rdi # &ptregs -> arg1 xorl %esi,%esi # oldset -> arg2 @@ -599,7 +599,7 @@ retint_careful: jmp retint_check retint_signal: - testl $(_TIF_SIGPENDING|_TIF_NOTIFY_RESUME|_TIF_SINGLESTEP),%edx + testl $(_TIF_SIGPENDING|_TIF_SINGLESTEP|_TIF_MCE_NOTIFY),%edx jz retint_swapgs TRACE_IRQS_ON sti diff --git a/arch/x86_64/kernel/head.S b/arch/x86_64/kernel/head.S index c63fc64e14e8..b6167fe3330e 100644 --- a/arch/x86_64/kernel/head.S +++ b/arch/x86_64/kernel/head.S @@ -25,7 +25,7 @@ */ 
.text - .section .bootstrap.text + .section .text.head .code64 .globl startup_64 startup_64: @@ -73,7 +73,11 @@ startup_64: addq %rbp, init_level4_pgt + (511*8)(%rip) addq %rbp, level3_ident_pgt + 0(%rip) + addq %rbp, level3_kernel_pgt + (510*8)(%rip) + addq %rbp, level3_kernel_pgt + (511*8)(%rip) + + addq %rbp, level2_fixmap_pgt + (506*8)(%rip) /* Add an Identity mapping if I am above 1G */ leaq _text(%rip), %rdi @@ -239,10 +243,16 @@ ENTRY(secondary_startup_64) lretq /* SMP bootup changes these two */ +#ifndef CONFIG_HOTPLUG_CPU + .pushsection .init.data +#endif .align 8 .globl initial_code initial_code: .quad x86_64_start_kernel +#ifndef CONFIG_HOTPLUG_CPU + .popsection +#endif .globl init_rsp init_rsp: .quad init_thread_union+THREAD_SIZE-8 @@ -314,7 +324,16 @@ NEXT_PAGE(level3_kernel_pgt) .fill 510,8,0 /* (2^48-(2*1024*1024*1024)-((2^39)*511))/(2^30) = 510 */ .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE - .fill 1,8,0 + .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + +NEXT_PAGE(level2_fixmap_pgt) + .fill 506,8,0 + .quad level1_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE + /* 8MB reserved for vsyscalls + a 2MB hole = 4 + 1 entries */ + .fill 5,8,0 + +NEXT_PAGE(level1_fixmap_pgt) + .fill 512,8,0 NEXT_PAGE(level2_ident_pgt) /* Since I easily can, map the first 1G. 
diff --git a/arch/x86_64/kernel/hpet.c b/arch/x86_64/kernel/hpet.c index b8286968662d..e2d1b912e154 100644 --- a/arch/x86_64/kernel/hpet.c +++ b/arch/x86_64/kernel/hpet.c @@ -133,7 +133,7 @@ struct clocksource clocksource_hpet = { .vread = vread_hpet, }; -int hpet_arch_init(void) +int __init hpet_arch_init(void) { unsigned int id; u64 tmp; @@ -190,7 +190,7 @@ int hpet_reenable(void) */ #define TICK_COUNT 100000000 -#define TICK_MIN 5000 +#define SMI_THRESHOLD 50000 #define MAX_TRIES 5 /* @@ -205,7 +205,7 @@ static void __init read_hpet_tsc(int *hpet, int *tsc) tsc1 = get_cycles_sync(); hpet1 = hpet_readl(HPET_COUNTER); tsc2 = get_cycles_sync(); - if (tsc2 - tsc1 > TICK_MIN) + if ((tsc2 - tsc1) < SMI_THRESHOLD) break; } *hpet = hpet1; @@ -439,7 +439,7 @@ int hpet_rtc_dropped_irq(void) return 1; } -irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs) +irqreturn_t hpet_rtc_interrupt(int irq, void *dev_id) { struct rtc_time curr_time; unsigned long rtc_int_flag = 0; diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index 4b326655b208..948cae646099 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -444,24 +444,6 @@ void __init init_ISA_irqs (void) } } -void apic_timer_interrupt(void); -void spurious_interrupt(void); -void error_interrupt(void); -void reschedule_interrupt(void); -void call_function_interrupt(void); -void irq_move_cleanup_interrupt(void); -void invalidate_interrupt0(void); -void invalidate_interrupt1(void); -void invalidate_interrupt2(void); -void invalidate_interrupt3(void); -void invalidate_interrupt4(void); -void invalidate_interrupt5(void); -void invalidate_interrupt6(void); -void invalidate_interrupt7(void); -void thermal_interrupt(void); -void threshold_interrupt(void); -void i8254_timer_resume(void); - static void setup_timer_hardware(void) { outb_p(0x34,0x43); /* binary, mode 2, LSB/MSB, ch 0 */ diff --git a/arch/x86_64/kernel/init_task.c b/arch/x86_64/kernel/init_task.c index 
3dc5854ba21e..4ff33d4f8551 100644 --- a/arch/x86_64/kernel/init_task.c +++ b/arch/x86_64/kernel/init_task.c @@ -44,7 +44,7 @@ EXPORT_SYMBOL(init_task); * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -DEFINE_PER_CPU(struct tss_struct, init_tss) ____cacheline_internodealigned_in_smp = INIT_TSS; +DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; /* Copies of the original ist values from the tss are only accessed during * debugging, no special alignment required. diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index 1c6c6f724573..966fa1062491 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -152,6 +152,32 @@ static inline void io_apic_modify(unsigned int apic, unsigned int value) writel(value, &io_apic->data); } +static int io_apic_level_ack_pending(unsigned int irq) +{ + struct irq_pin_list *entry; + unsigned long flags; + int pending = 0; + + spin_lock_irqsave(&ioapic_lock, flags); + entry = irq_2_pin + irq; + for (;;) { + unsigned int reg; + int pin; + + pin = entry->pin; + if (pin == -1) + break; + reg = io_apic_read(entry->apic, 0x10 + pin*2); + /* Is the remote IRR bit set? 
*/ + pending |= (reg >> 14) & 1; + if (!entry->next) + break; + entry = irq_2_pin + entry->next; + } + spin_unlock_irqrestore(&ioapic_lock, flags); + return pending; +} + /* * Synchronize the IO-APIC and the CPU by doing * a dummy read from the IO-APIC @@ -371,14 +397,12 @@ static void clear_IO_APIC (void) int skip_ioapic_setup; int ioapic_force; -/* dummy parsing: see setup.c */ - -static int __init disable_ioapic_setup(char *str) +static int __init parse_noapic(char *str) { - skip_ioapic_setup = 1; + disable_ioapic_setup(); return 0; } -early_param("noapic", disable_ioapic_setup); +early_param("noapic", parse_noapic); /* Actually the next is obsolete, but keep it for paranoid reasons -AK */ static int __init disable_timer_pin_setup(char *arg) @@ -774,12 +798,15 @@ static struct irq_chip ioapic_chip; static void ioapic_register_intr(int irq, unsigned long trigger) { - if (trigger) + if (trigger) { + irq_desc[irq].status |= IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); - else + } else { + irq_desc[irq].status &= ~IRQ_LEVEL; set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); + } } static void setup_IO_APIC_irq(int apic, int pin, unsigned int irq, @@ -1418,9 +1445,37 @@ static void ack_apic_level(unsigned int irq) ack_APIC_irq(); /* Now we can move and renable the irq */ - move_masked_irq(irq); - if (unlikely(do_unmask_irq)) + if (unlikely(do_unmask_irq)) { + /* Only migrate the irq if the ack has been received. + * + * On rare occasions the broadcast level triggered ack gets + * delayed going to ioapics, and if we reprogram the + * vector while Remote IRR is still set the irq will never + * fire again. + * + * To prevent this scenario we read the Remote IRR bit + * of the ioapic. This has two effects. + * - On any sane system the read of the ioapic will + * flush writes (and acks) going to the ioapic from + * this cpu. + * - We get to see if the ACK has actually been delivered. 
+ * + * Based on failed experiments of reprogramming the + * ioapic entry from outside of irq context starting + * with masking the ioapic entry and then polling until + * Remote IRR was clear before reprogramming the + * ioapic I don't trust the Remote IRR bit to be + * completey accurate. + * + * However there appears to be no other way to plug + * this race, so if the Remote IRR bit is not + * accurate and is causing problems then it is a hardware bug + * and you can go talk to the chipset vendor about it. + */ + if (!io_apic_level_ack_pending(irq)) + move_masked_irq(irq); unmask_IO_APIC_irq(irq); + } } static struct irq_chip ioapic_chip __read_mostly = { diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index d4a0d0ac9935..a30e004682e2 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -39,9 +39,9 @@ #include <linux/module.h> #include <linux/kdebug.h> -#include <asm/cacheflush.h> #include <asm/pgtable.h> #include <asm/uaccess.h> +#include <asm/alternative.h> void jprobe_return_end(void); static void __kprobes arch_copy_kprobe(struct kprobe *p); @@ -209,16 +209,12 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p) void __kprobes arch_arm_kprobe(struct kprobe *p) { - *p->addr = BREAKPOINT_INSTRUCTION; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + text_poke(p->addr, ((unsigned char []){BREAKPOINT_INSTRUCTION}), 1); } void __kprobes arch_disarm_kprobe(struct kprobe *p) { - *p->addr = p->opcode; - flush_icache_range((unsigned long) p->addr, - (unsigned long) p->addr + sizeof(kprobe_opcode_t)); + text_poke(p->addr, &p->opcode, 1); } void __kprobes arch_remove_kprobe(struct kprobe *p) diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index aa1d15991794..a66d607f5b92 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -18,6 +18,8 @@ #include <linux/capability.h> #include <linux/cpu.h> #include <linux/percpu.h> +#include 
<linux/poll.h> +#include <linux/thread_info.h> #include <linux/ctype.h> #include <linux/kmod.h> #include <linux/kdebug.h> @@ -26,6 +28,7 @@ #include <asm/mce.h> #include <asm/uaccess.h> #include <asm/smp.h> +#include <asm/idle.h> #define MISC_MCELOG_MINOR 227 #define NR_BANKS 6 @@ -34,13 +37,17 @@ atomic_t mce_entry; static int mce_dont_init; -/* 0: always panic, 1: panic if deadlock possible, 2: try to avoid panic, - 3: never panic or exit (for testing only) */ +/* + * Tolerant levels: + * 0: always panic on uncorrected errors, log corrected errors + * 1: panic or SIGBUS on uncorrected errors, log corrected errors + * 2: SIGBUS or log uncorrected errors (if possible), log corrected errors + * 3: never panic or SIGBUS, log all errors (for testing only) + */ static int tolerant = 1; static int banks; static unsigned long bank[NR_BANKS] = { [0 ... NR_BANKS-1] = ~0UL }; -static unsigned long console_logged; -static int notify_user; +static unsigned long notify_user; static int rip_msr; static int mce_bootlog = 1; static atomic_t mce_events; @@ -48,6 +55,8 @@ static atomic_t mce_events; static char trigger[128]; static char *trigger_argv[2] = { trigger, NULL }; +static DECLARE_WAIT_QUEUE_HEAD(mce_wait); + /* * Lockless MCE logging infrastructure. * This avoids deadlocks on printk locks without having to break locks. 
Also @@ -94,8 +103,7 @@ void mce_log(struct mce *mce) mcelog.entry[entry].finished = 1; wmb(); - if (!test_and_set_bit(0, &console_logged)) - notify_user = 1; + set_bit(0, &notify_user); } static void print_mce(struct mce *m) @@ -128,6 +136,7 @@ static void print_mce(struct mce *m) static void mce_panic(char *msg, struct mce *backup, unsigned long start) { int i; + oops_begin(); for (i = 0; i < MCE_LOG_LEN; i++) { unsigned long tsc = mcelog.entry[i].tsc; @@ -139,10 +148,7 @@ static void mce_panic(char *msg, struct mce *backup, unsigned long start) } if (backup) print_mce(backup); - if (tolerant >= 3) - printk("Fake panic: %s\n", msg); - else - panic(msg); + panic(msg); } static int mce_available(struct cpuinfo_x86 *c) @@ -167,17 +173,6 @@ static inline void mce_get_rip(struct mce *m, struct pt_regs *regs) } } -static void do_mce_trigger(void) -{ - static atomic_t mce_logged; - int events = atomic_read(&mce_events); - if (events != atomic_read(&mce_logged) && trigger[0]) { - /* Small race window, but should be harmless. */ - atomic_set(&mce_logged, events); - call_usermodehelper(trigger, trigger_argv, NULL, -1); - } -} - /* * The actual machine check handler */ @@ -185,11 +180,19 @@ static void do_mce_trigger(void) void do_machine_check(struct pt_regs * regs, long error_code) { struct mce m, panicm; - int nowayout = (tolerant < 1); - int kill_it = 0; u64 mcestart = 0; int i; int panicm_found = 0; + /* + * If no_way_out gets set, there is no safe way to recover from this + * MCE. If tolerant is cranked up, we'll try anyway. + */ + int no_way_out = 0; + /* + * If kill_it gets set, there might be a way to recover from this + * error. 
+ */ + int kill_it = 0; atomic_inc(&mce_entry); @@ -201,8 +204,9 @@ void do_machine_check(struct pt_regs * regs, long error_code) memset(&m, 0, sizeof(struct mce)); m.cpu = smp_processor_id(); rdmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + /* if the restart IP is not valid, we're done for */ if (!(m.mcgstatus & MCG_STATUS_RIPV)) - kill_it = 1; + no_way_out = 1; rdtscll(mcestart); barrier(); @@ -221,10 +225,18 @@ void do_machine_check(struct pt_regs * regs, long error_code) continue; if (m.status & MCI_STATUS_EN) { - /* In theory _OVER could be a nowayout too, but - assume any overflowed errors were no fatal. */ - nowayout |= !!(m.status & MCI_STATUS_PCC); - kill_it |= !!(m.status & MCI_STATUS_UC); + /* if PCC was set, there's no way out */ + no_way_out |= !!(m.status & MCI_STATUS_PCC); + /* + * If this error was uncorrectable and there was + * an overflow, we're in trouble. If no overflow, + * we might get away with just killing a task. + */ + if (m.status & MCI_STATUS_UC) { + if (tolerant < 1 || m.status & MCI_STATUS_OVER) + no_way_out = 1; + kill_it = 1; + } } if (m.status & MCI_STATUS_MISCV) @@ -235,7 +247,6 @@ void do_machine_check(struct pt_regs * regs, long error_code) mce_get_rip(&m, regs); if (error_code >= 0) rdtscll(m.tsc); - wrmsrl(MSR_IA32_MC0_STATUS + i*4, 0); if (error_code != -2) mce_log(&m); @@ -251,45 +262,59 @@ void do_machine_check(struct pt_regs * regs, long error_code) } /* Never do anything final in the polling timer */ - if (!regs) { - /* Normal interrupt context here. Call trigger for any new - events. */ - do_mce_trigger(); + if (!regs) goto out; - } /* If we didn't find an uncorrectable error, pick the last one (shouldn't happen, just being safe). */ if (!panicm_found) panicm = m; - if (nowayout) + + /* + * If we have decided that we just CAN'T continue, and the user + * has not set tolerant to an insane level, give up and die. 
+ */ + if (no_way_out && tolerant < 3) mce_panic("Machine check", &panicm, mcestart); - if (kill_it) { + + /* + * If the error seems to be unrecoverable, something should be + * done. Try to kill as little as possible. If we can kill just + * one task, do that. If the user has set the tolerance very + * high, don't try to do anything at all. + */ + if (kill_it && tolerant < 3) { int user_space = 0; - if (m.mcgstatus & MCG_STATUS_RIPV) + /* + * If the EIPV bit is set, it means the saved IP is the + * instruction which caused the MCE. + */ + if (m.mcgstatus & MCG_STATUS_EIPV) user_space = panicm.rip && (panicm.cs & 3); - - /* When the machine was in user space and the CPU didn't get - confused it's normally not necessary to panic, unless you - are paranoid (tolerant == 0) - - RED-PEN could be more tolerant for MCEs in idle, - but most likely they occur at boot anyways, where - it is best to just halt the machine. */ - if ((!user_space && (panic_on_oops || tolerant < 2)) || - (unsigned)current->pid <= 1) - mce_panic("Uncorrected machine check", &panicm, mcestart); - - /* do_exit takes an awful lot of locks and has as - slight risk of deadlocking. If you don't want that - don't set tolerant >= 2 */ - if (tolerant < 3) + + /* + * If we know that the error was in user space, send a + * SIGBUS. Otherwise, panic if tolerance is low. + * + * do_exit() takes an awful lot of locks and has a slight + * risk of deadlocking. + */ + if (user_space) { do_exit(SIGBUS); + } else if (panic_on_oops || tolerant < 2) { + mce_panic("Uncorrected machine check", + &panicm, mcestart); + } } + /* notify userspace ASAP */ + set_thread_flag(TIF_MCE_NOTIFY); + out: - /* Last thing done in the machine check exception to clear state. 
*/ + /* the last thing we do is clear state */ + for (i = 0; i < banks; i++) + wrmsrl(MSR_IA32_MC0_STATUS+4*i, 0); wrmsrl(MSR_IA32_MCG_STATUS, 0); out2: atomic_dec(&mce_entry); @@ -344,37 +369,69 @@ static void mcheck_timer(struct work_struct *work) on_each_cpu(mcheck_check_cpu, NULL, 1, 1); /* - * It's ok to read stale data here for notify_user and - * console_logged as we'll simply get the updated versions - * on the next mcheck_timer execution and atomic operations - * on console_logged act as synchronization for notify_user - * writes. + * Alert userspace if needed. If we logged an MCE, reduce the + * polling interval, otherwise increase the polling interval. */ - if (notify_user && console_logged) { + if (mce_notify_user()) { + next_interval = max(next_interval/2, HZ/100); + } else { + next_interval = min(next_interval*2, + (int)round_jiffies_relative(check_interval*HZ)); + } + + schedule_delayed_work(&mcheck_work, next_interval); +} + +/* + * This is only called from process context. This is where we do + * anything we need to alert userspace about new MCEs. This is called + * directly from the poller and also from entry.S and idle, thanks to + * TIF_MCE_NOTIFY. 
+ */ +int mce_notify_user(void) +{ + clear_thread_flag(TIF_MCE_NOTIFY); + if (test_and_clear_bit(0, &notify_user)) { static unsigned long last_print; unsigned long now = jiffies; - /* if we logged an MCE, reduce the polling interval */ - next_interval = max(next_interval/2, HZ/100); - notify_user = 0; - clear_bit(0, &console_logged); + wake_up_interruptible(&mce_wait); + if (trigger[0]) + call_usermodehelper(trigger, trigger_argv, NULL, + UMH_NO_WAIT); + if (time_after_eq(now, last_print + (check_interval*HZ))) { last_print = now; printk(KERN_INFO "Machine check events logged\n"); } - } else { - next_interval = min(next_interval*2, check_interval*HZ); + + return 1; } + return 0; +} - schedule_delayed_work(&mcheck_work, next_interval); +/* see if the idle task needs to notify userspace */ +static int +mce_idle_callback(struct notifier_block *nfb, unsigned long action, void *junk) +{ + /* IDLE_END should be safe - interrupts are back on */ + if (action == IDLE_END && test_thread_flag(TIF_MCE_NOTIFY)) + mce_notify_user(); + + return NOTIFY_OK; } +static struct notifier_block mce_idle_notifier = { + .notifier_call = mce_idle_callback, +}; static __init int periodic_mcheck_init(void) { next_interval = check_interval * HZ; if (next_interval) - schedule_delayed_work(&mcheck_work, next_interval); + schedule_delayed_work(&mcheck_work, + round_jiffies_relative(next_interval)); + idle_notifier_register(&mce_idle_notifier); return 0; } __initcall(periodic_mcheck_init); @@ -465,6 +522,40 @@ void __cpuinit mcheck_init(struct cpuinfo_x86 *c) * Character device to read and clear the MCE log. */ +static DEFINE_SPINLOCK(mce_state_lock); +static int open_count; /* #times opened */ +static int open_exclu; /* already open exclusive?
*/ + +static int mce_open(struct inode *inode, struct file *file) +{ + spin_lock(&mce_state_lock); + + if (open_exclu || (open_count && (file->f_flags & O_EXCL))) { + spin_unlock(&mce_state_lock); + return -EBUSY; + } + + if (file->f_flags & O_EXCL) + open_exclu = 1; + open_count++; + + spin_unlock(&mce_state_lock); + + return nonseekable_open(inode, file); +} + +static int mce_release(struct inode *inode, struct file *file) +{ + spin_lock(&mce_state_lock); + + open_count--; + open_exclu = 0; + + spin_unlock(&mce_state_lock); + + return 0; +} + static void collect_tscs(void *data) { unsigned long *cpu_tsc = (unsigned long *)data; @@ -532,6 +623,14 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff return err ? -EFAULT : buf - ubuf; } +static unsigned int mce_poll(struct file *file, poll_table *wait) +{ + poll_wait(file, &mce_wait, wait); + if (rcu_dereference(mcelog.next)) + return POLLIN | POLLRDNORM; + return 0; +} + static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned long arg) { int __user *p = (int __user *)arg; @@ -555,7 +654,10 @@ static int mce_ioctl(struct inode *i, struct file *f,unsigned int cmd, unsigned } static const struct file_operations mce_chrdev_ops = { + .open = mce_open, + .release = mce_release, .read = mce_read, + .poll = mce_poll, .ioctl = mce_ioctl, }; @@ -565,6 +667,20 @@ static struct miscdevice mce_log_device = { &mce_chrdev_ops, }; +static unsigned long old_cr4 __initdata; + +void __init stop_mce(void) +{ + old_cr4 = read_cr4(); + clear_in_cr4(X86_CR4_MCE); +} + +void __init restart_mce(void) +{ + if (old_cr4 & X86_CR4_MCE) + set_in_cr4(X86_CR4_MCE); +} + /* * Old style boot options parsing. Only for compatibility. 
*/ @@ -620,7 +736,8 @@ static void mce_restart(void) on_each_cpu(mce_init, NULL, 1, 1); next_interval = check_interval * HZ; if (next_interval) - schedule_delayed_work(&mcheck_work, next_interval); + schedule_delayed_work(&mcheck_work, + round_jiffies_relative(next_interval)); } static struct sysdev_class mce_sysclass = { diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index 03356e64f9c8..2f8a7f18b0fe 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -157,9 +157,9 @@ void __cpuinit mce_amd_feature_init(struct cpuinfo_x86 *c) high |= K8_APIC_EXT_LVT_ENTRY_THRESHOLD << 20; wrmsr(address, low, high); - setup_APIC_extened_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, - THRESHOLD_APIC_VECTOR, - K8_APIC_EXT_INT_MSG_FIX, 0); + setup_APIC_extended_lvt(K8_APIC_EXT_LVT_ENTRY_THRESHOLD, + THRESHOLD_APIC_VECTOR, + K8_APIC_EXT_INT_MSG_FIX, 0); threshold_defaults.address = address; threshold_restart_bank(&threshold_defaults, 0, 0); diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index 61ae57eb9e4c..8bf0ca03ac8e 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -32,7 +32,6 @@ /* Have we found an MP table */ int smp_found_config; -unsigned int __initdata maxcpus = NR_CPUS; /* * Various Linux-internal data structures created from the @@ -649,6 +648,20 @@ static int mp_find_ioapic(int gsi) return -1; } +static u8 uniq_ioapic_id(u8 id) +{ + int i; + DECLARE_BITMAP(used, 256); + bitmap_zero(used, 256); + for (i = 0; i < nr_ioapics; i++) { + struct mpc_config_ioapic *ia = &mp_ioapics[i]; + __set_bit(ia->mpc_apicid, used); + } + if (!test_bit(id, used)) + return id; + return find_first_zero_bit(used, 256); +} + void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) { int idx = 0; @@ -656,14 +669,14 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) if (bad_ioapic(address)) return; - idx = nr_ioapics++; + idx = nr_ioapics; mp_ioapics[idx].mpc_type = MP_IOAPIC; 
mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE; mp_ioapics[idx].mpc_apicaddr = address; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].mpc_apicid = id; + mp_ioapics[idx].mpc_apicid = uniq_ioapic_id(id); mp_ioapics[idx].mpc_apicver = 0; /* @@ -680,6 +693,8 @@ void __init mp_register_ioapic(u8 id, u32 address, u32 gsi_base) mp_ioapics[idx].mpc_apicaddr, mp_ioapic_routing[idx].gsi_start, mp_ioapic_routing[idx].gsi_end); + + nr_ioapics++; } void __init diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 931c64bad5e6..0ec6d2ddb931 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -85,7 +85,7 @@ int __init check_nmi_watchdog (void) int *counts; int cpu; - if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DEFAULT)) + if ((nmi_watchdog == NMI_NONE) || (nmi_watchdog == NMI_DISABLED)) return 0; if (!atomic_read(&nmi_active)) @@ -296,7 +296,7 @@ static DEFINE_PER_CPU(unsigned, last_irq_sum); static DEFINE_PER_CPU(local_t, alert_counter); static DEFINE_PER_CPU(int, nmi_touch); -void touch_nmi_watchdog (void) +void touch_nmi_watchdog(void) { if (nmi_watchdog > 0) { unsigned cpu; @@ -306,8 +306,10 @@ void touch_nmi_watchdog (void) * do it ourselves because the alert count increase is not * atomic. 
*/ - for_each_present_cpu (cpu) - per_cpu(nmi_touch, cpu) = 1; + for_each_present_cpu(cpu) { + if (per_cpu(nmi_touch, cpu) != 1) + per_cpu(nmi_touch, cpu) = 1; + } } touch_softlockup_watchdog(); @@ -382,11 +384,14 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) return rc; } +static unsigned ignore_nmis; + asmlinkage __kprobes void do_nmi(struct pt_regs * regs, long error_code) { nmi_enter(); add_pda(__nmi_count,1); - default_do_nmi(regs); + if (!ignore_nmis) + default_do_nmi(regs); nmi_exit(); } @@ -399,6 +404,18 @@ int do_nmi_callback(struct pt_regs * regs, int cpu) return 0; } +void stop_nmi(void) +{ + acpi_nmi_disable(); + ignore_nmis++; +} + +void restart_nmi(void) +{ + ignore_nmis--; + acpi_nmi_enable(); +} + #ifdef CONFIG_SYSCTL static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu) @@ -425,7 +442,7 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, if (!!old_state == !!nmi_watchdog_enabled) return 0; - if (atomic_read(&nmi_active) < 0) { + if (atomic_read(&nmi_active) < 0 || nmi_watchdog == NMI_DISABLED) { printk( KERN_WARNING "NMI watchdog is permanently disabled\n"); return -EIO; } diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 5bd20b542c1e..71da01e73f03 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -1,7 +1,7 @@ /* * Derived from arch/powerpc/kernel/iommu.c * - * Copyright (C) IBM Corporation, 2006 + * Copyright IBM Corporation, 2006-2007 * Copyright (C) 2006 Jon Mason <jdmason@kudzu.us> * * Author: Jon Mason <jdmason@kudzu.us> @@ -35,7 +35,7 @@ #include <linux/pci_ids.h> #include <linux/pci.h> #include <linux/delay.h> -#include <asm/proto.h> +#include <asm/iommu.h> #include <asm/calgary.h> #include <asm/tce.h> #include <asm/pci-direct.h> @@ -50,13 +50,7 @@ int use_calgary __read_mostly = 0; #endif /* CONFIG_CALGARY_DEFAULT_ENABLED */ #define PCI_DEVICE_ID_IBM_CALGARY 0x02a1 -#define 
PCI_VENDOR_DEVICE_ID_CALGARY \ - (PCI_VENDOR_ID_IBM | PCI_DEVICE_ID_IBM_CALGARY << 16) - -/* we need these for register space address calculation */ -#define START_ADDRESS 0xfe000000 -#define CHASSIS_BASE 0 -#define ONE_BASED_CHASSIS_NUM 1 +#define PCI_DEVICE_ID_IBM_CALIOC2 0x0308 /* register offsets inside the host bridge space */ #define CALGARY_CONFIG_REG 0x0108 @@ -80,6 +74,12 @@ int use_calgary __read_mostly = 0; #define PHB_MEM_2_SIZE_LOW 0x02E0 #define PHB_DOSHOLE_OFFSET 0x08E0 +/* CalIOC2 specific */ +#define PHB_SAVIOR_L2 0x0DB0 +#define PHB_PAGE_MIG_CTRL 0x0DA8 +#define PHB_PAGE_MIG_DEBUG 0x0DA0 +#define PHB_ROOT_COMPLEX_STATUS 0x0CB0 + /* PHB_CONFIG_RW */ #define PHB_TCE_ENABLE 0x20000000 #define PHB_SLOT_DISABLE 0x1C000000 @@ -92,7 +92,11 @@ int use_calgary __read_mostly = 0; /* CSR (Channel/DMA Status Register) */ #define CSR_AGENT_MASK 0xffe0ffff /* CCR (Calgary Configuration Register) */ -#define CCR_2SEC_TIMEOUT 0x000000000000000EUL +#define CCR_2SEC_TIMEOUT 0x000000000000000EUL +/* PMCR/PMDR (Page Migration Control/Debug Registers */ +#define PMR_SOFTSTOP 0x80000000 +#define PMR_SOFTSTOPFAULT 0x40000000 +#define PMR_HARDSTOP 0x20000000 #define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? 
*/ #define MAX_NUM_CHASSIS 8 /* max number of chassis */ @@ -155,9 +159,26 @@ struct calgary_bus_info { void __iomem *bbar; }; -static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; +static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); +static void calgary_tce_cache_blast(struct iommu_table *tbl); +static void calgary_dump_error_regs(struct iommu_table *tbl); +static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev); +static void calioc2_tce_cache_blast(struct iommu_table *tbl); +static void calioc2_dump_error_regs(struct iommu_table *tbl); + +static struct cal_chipset_ops calgary_chip_ops = { + .handle_quirks = calgary_handle_quirks, + .tce_cache_blast = calgary_tce_cache_blast, + .dump_error_regs = calgary_dump_error_regs +}; -static void tce_cache_blast(struct iommu_table *tbl); +static struct cal_chipset_ops calioc2_chip_ops = { + .handle_quirks = calioc2_handle_quirks, + .tce_cache_blast = calioc2_tce_cache_blast, + .dump_error_regs = calioc2_dump_error_regs +}; + +static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; /* enable this to stress test the chip's TCE cache */ #ifdef CONFIG_IOMMU_DEBUG @@ -187,6 +208,7 @@ static inline unsigned long verify_bit_range(unsigned long* bitmap, { return ~0UL; } + #endif /* CONFIG_IOMMU_DEBUG */ static inline unsigned int num_dma_pages(unsigned long dma, unsigned int dmalen) @@ -206,11 +228,12 @@ static inline int translate_phb(struct pci_dev* dev) } static void iommu_range_reserve(struct iommu_table *tbl, - unsigned long start_addr, unsigned int npages) + unsigned long start_addr, unsigned int npages) { unsigned long index; unsigned long end; unsigned long badbit; + unsigned long flags; index = start_addr >> PAGE_SHIFT; @@ -222,6 +245,8 @@ static void iommu_range_reserve(struct iommu_table *tbl, if (end > tbl->it_size) /* don't go off the table */ end = tbl->it_size; + spin_lock_irqsave(&tbl->it_lock, flags); + badbit = 
verify_bit_range(tbl->it_map, 0, index, end); if (badbit != ~0UL) { if (printk_ratelimit()) @@ -231,23 +256,29 @@ static void iommu_range_reserve(struct iommu_table *tbl, } set_bit_string(tbl->it_map, index, npages); + + spin_unlock_irqrestore(&tbl->it_lock, flags); } static unsigned long iommu_range_alloc(struct iommu_table *tbl, unsigned int npages) { + unsigned long flags; unsigned long offset; BUG_ON(npages == 0); + spin_lock_irqsave(&tbl->it_lock, flags); + offset = find_next_zero_string(tbl->it_map, tbl->it_hint, tbl->it_size, npages); if (offset == ~0UL) { - tce_cache_blast(tbl); + tbl->chip_ops->tce_cache_blast(tbl); offset = find_next_zero_string(tbl->it_map, 0, tbl->it_size, npages); if (offset == ~0UL) { printk(KERN_WARNING "Calgary: IOMMU full.\n"); + spin_unlock_irqrestore(&tbl->it_lock, flags); if (panic_on_overflow) panic("Calgary: fix the allocator.\n"); else @@ -259,17 +290,17 @@ static unsigned long iommu_range_alloc(struct iommu_table *tbl, tbl->it_hint = offset + npages; BUG_ON(tbl->it_hint > tbl->it_size); + spin_unlock_irqrestore(&tbl->it_lock, flags); + return offset; } static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr, unsigned int npages, int direction) { - unsigned long entry, flags; + unsigned long entry; dma_addr_t ret = bad_dma_address; - spin_lock_irqsave(&tbl->it_lock, flags); - entry = iommu_range_alloc(tbl, npages); if (unlikely(entry == bad_dma_address)) @@ -282,23 +313,21 @@ static dma_addr_t iommu_alloc(struct iommu_table *tbl, void *vaddr, tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, direction); - spin_unlock_irqrestore(&tbl->it_lock, flags); - return ret; error: - spin_unlock_irqrestore(&tbl->it_lock, flags); printk(KERN_WARNING "Calgary: failed to allocate %u pages in " "iommu %p\n", npages, tbl); return bad_dma_address; } -static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, +static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int npages) { 
unsigned long entry; unsigned long badbit; unsigned long badend; + unsigned long flags; /* were we called with bad_dma_address? */ badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); @@ -315,6 +344,8 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, tce_free(tbl, entry, npages); + spin_lock_irqsave(&tbl->it_lock, flags); + badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); if (badbit != ~0UL) { if (printk_ratelimit()) @@ -324,23 +355,39 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, } __clear_bit_string(tbl->it_map, entry, npages); + + spin_unlock_irqrestore(&tbl->it_lock, flags); } -static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, - unsigned int npages) +static inline struct iommu_table *find_iommu_table(struct device *dev) { - unsigned long flags; + struct pci_dev *pdev; + struct pci_bus *pbus; + struct iommu_table *tbl; - spin_lock_irqsave(&tbl->it_lock, flags); + pdev = to_pci_dev(dev); - __iommu_free(tbl, dma_addr, npages); + pbus = pdev->bus; - spin_unlock_irqrestore(&tbl->it_lock, flags); + /* is the device behind a bridge? 
Look for the root bus */ + while (pbus->parent) + pbus = pbus->parent; + + tbl = pci_iommu(pbus); + + BUG_ON(tbl && (tbl->it_busno != pbus->number)); + + return tbl; } -static void __calgary_unmap_sg(struct iommu_table *tbl, +static void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, int nelems, int direction) { + struct iommu_table *tbl = find_iommu_table(dev); + + if (!translate_phb(to_pci_dev(dev))) + return; + while (nelems--) { unsigned int npages; dma_addr_t dma = sglist->dma_address; @@ -350,33 +397,17 @@ static void __calgary_unmap_sg(struct iommu_table *tbl, break; npages = num_dma_pages(dma, dmalen); - __iommu_free(tbl, dma, npages); + iommu_free(tbl, dma, npages); sglist++; } } -void calgary_unmap_sg(struct device *dev, struct scatterlist *sglist, - int nelems, int direction) -{ - unsigned long flags; - struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; - - if (!translate_phb(to_pci_dev(dev))) - return; - - spin_lock_irqsave(&tbl->it_lock, flags); - - __calgary_unmap_sg(tbl, sglist, nelems, direction); - - spin_unlock_irqrestore(&tbl->it_lock, flags); -} - static int calgary_nontranslate_map_sg(struct device* dev, struct scatterlist *sg, int nelems, int direction) { int i; - for (i = 0; i < nelems; i++ ) { + for (i = 0; i < nelems; i++ ) { struct scatterlist *s = &sg[i]; BUG_ON(!s->page); s->dma_address = virt_to_bus(page_address(s->page) +s->offset); @@ -385,11 +416,10 @@ static int calgary_nontranslate_map_sg(struct device* dev, return nelems; } -int calgary_map_sg(struct device *dev, struct scatterlist *sg, +static int calgary_map_sg(struct device *dev, struct scatterlist *sg, int nelems, int direction) { - struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; - unsigned long flags; + struct iommu_table *tbl = find_iommu_table(dev); unsigned long vaddr; unsigned int npages; unsigned long entry; @@ -398,8 +428,6 @@ int calgary_map_sg(struct device *dev, struct scatterlist *sg, if (!translate_phb(to_pci_dev(dev))) 
return calgary_nontranslate_map_sg(dev, sg, nelems, direction); - spin_lock_irqsave(&tbl->it_lock, flags); - for (i = 0; i < nelems; i++ ) { struct scatterlist *s = &sg[i]; BUG_ON(!s->page); @@ -423,26 +451,23 @@ int calgary_map_sg(struct device *dev, struct scatterlist *sg, s->dma_length = s->length; } - spin_unlock_irqrestore(&tbl->it_lock, flags); - return nelems; error: - __calgary_unmap_sg(tbl, sg, nelems, direction); + calgary_unmap_sg(dev, sg, nelems, direction); for (i = 0; i < nelems; i++) { sg[i].dma_address = bad_dma_address; sg[i].dma_length = 0; } - spin_unlock_irqrestore(&tbl->it_lock, flags); return 0; } -dma_addr_t calgary_map_single(struct device *dev, void *vaddr, +static dma_addr_t calgary_map_single(struct device *dev, void *vaddr, size_t size, int direction) { dma_addr_t dma_handle = bad_dma_address; unsigned long uaddr; unsigned int npages; - struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; + struct iommu_table *tbl = find_iommu_table(dev); uaddr = (unsigned long)vaddr; npages = num_dma_pages(uaddr, size); @@ -455,10 +480,10 @@ dma_addr_t calgary_map_single(struct device *dev, void *vaddr, return dma_handle; } -void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, +static void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, size_t size, int direction) { - struct iommu_table *tbl = to_pci_dev(dev)->bus->self->sysdata; + struct iommu_table *tbl = find_iommu_table(dev); unsigned int npages; if (!translate_phb(to_pci_dev(dev))) @@ -468,15 +493,13 @@ void calgary_unmap_single(struct device *dev, dma_addr_t dma_handle, iommu_free(tbl, dma_handle, npages); } -void* calgary_alloc_coherent(struct device *dev, size_t size, +static void* calgary_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flag) { void *ret = NULL; dma_addr_t mapping; unsigned int npages, order; - struct iommu_table *tbl; - - tbl = to_pci_dev(dev)->bus->self->sysdata; + struct iommu_table *tbl = 
find_iommu_table(dev); size = PAGE_ALIGN(size); /* size rounded up to full pages */ npages = size >> PAGE_SHIFT; @@ -552,7 +575,22 @@ static inline void __iomem* calgary_reg(void __iomem *bar, unsigned long offset) return (void __iomem*)target; } -static void tce_cache_blast(struct iommu_table *tbl) +static inline int is_calioc2(unsigned short device) +{ + return (device == PCI_DEVICE_ID_IBM_CALIOC2); +} + +static inline int is_calgary(unsigned short device) +{ + return (device == PCI_DEVICE_ID_IBM_CALGARY); +} + +static inline int is_cal_pci_dev(unsigned short device) +{ + return (is_calgary(device) || is_calioc2(device)); +} + +static void calgary_tce_cache_blast(struct iommu_table *tbl) { u64 val; u32 aer; @@ -589,6 +627,85 @@ static void tce_cache_blast(struct iommu_table *tbl) (void)readl(target); /* flush */ } +static void calioc2_tce_cache_blast(struct iommu_table *tbl) +{ + void __iomem *bbar = tbl->bbar; + void __iomem *target; + u64 val64; + u32 val; + int i = 0; + int count = 1; + unsigned char bus = tbl->it_busno; + +begin: + printk(KERN_DEBUG "Calgary: CalIOC2 bus 0x%x entering tce cache blast " + "sequence - count %d\n", bus, count); + + /* 1. using the Page Migration Control reg set SoftStop */ + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "1a. read 0x%x [LE] from %p\n", val, target); + val |= PMR_SOFTSTOP; + printk(KERN_DEBUG "1b. writing 0x%x [LE] to %p\n", val, target); + writel(cpu_to_be32(val), target); + + /* 2. poll split queues until all DMA activity is done */ + printk(KERN_DEBUG "2a. starting to poll split queues\n"); + target = calgary_reg(bbar, split_queue_offset(bus)); + do { + val64 = readq(target); + i++; + } while ((val64 & 0xff) != 0xff && i < 100); + if (i == 100) + printk(KERN_WARNING "CalIOC2: PCI bus not quiesced, " + "continuing anyway\n"); + + /* 3. 
poll Page Migration DEBUG for SoftStopFault */ + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "3. read 0x%x [LE] from %p\n", val, target); + + /* 4. if SoftStopFault - goto (1) */ + if (val & PMR_SOFTSTOPFAULT) { + if (++count < 100) + goto begin; + else { + printk(KERN_WARNING "CalIOC2: too many SoftStopFaults, " + "aborting TCE cache flush sequence!\n"); + return; /* pray for the best */ + } + } + + /* 5. Slam into HardStop by reading PHB_PAGE_MIG_CTRL */ + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + printk(KERN_DEBUG "5a. slamming into HardStop by reading %p\n", target); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "5b. read 0x%x [LE] from %p\n", val, target); + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_DEBUG); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "5c. read 0x%x [LE] from %p (debug)\n", val, target); + + /* 6. invalidate TCE cache */ + printk(KERN_DEBUG "6. invalidating TCE cache\n"); + target = calgary_reg(bbar, tar_offset(bus)); + writeq(tbl->tar_val, target); + + /* 7. Re-read PMCR */ + printk(KERN_DEBUG "7a. Re-reading PMCR\n"); + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "7b. read 0x%x [LE] from %p\n", val, target); + + /* 8. Remove HardStop */ + printk(KERN_DEBUG "8a. removing HardStop from PMCR\n"); + target = calgary_reg(bbar, phb_offset(bus) | PHB_PAGE_MIG_CTRL); + val = 0; + printk(KERN_DEBUG "8b. writing 0x%x [LE] to %p\n", val, target); + writel(cpu_to_be32(val), target); + val = be32_to_cpu(readl(target)); + printk(KERN_DEBUG "8c. 
read 0x%x [LE] from %p\n", val, target); +} + static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, u64 limit) { @@ -598,7 +715,7 @@ static void __init calgary_reserve_mem_region(struct pci_dev *dev, u64 start, limit++; numpages = ((limit - start) >> PAGE_SHIFT); - iommu_range_reserve(dev->sysdata, start, numpages); + iommu_range_reserve(pci_iommu(dev->bus), start, numpages); } static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) @@ -606,7 +723,7 @@ static void __init calgary_reserve_peripheral_mem_1(struct pci_dev *dev) void __iomem *target; u64 low, high, sizelow; u64 start, limit; - struct iommu_table *tbl = dev->sysdata; + struct iommu_table *tbl = pci_iommu(dev->bus); unsigned char busnum = dev->bus->number; void __iomem *bbar = tbl->bbar; @@ -630,7 +747,7 @@ static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev) u32 val32; u64 low, high, sizelow, sizehigh; u64 start, limit; - struct iommu_table *tbl = dev->sysdata; + struct iommu_table *tbl = pci_iommu(dev->bus); unsigned char busnum = dev->bus->number; void __iomem *bbar = tbl->bbar; @@ -666,14 +783,20 @@ static void __init calgary_reserve_regions(struct pci_dev *dev) { unsigned int npages; u64 start; - struct iommu_table *tbl = dev->sysdata; + struct iommu_table *tbl = pci_iommu(dev->bus); /* reserve EMERGENCY_PAGES from bad_dma_address and up */ iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); /* avoid the BIOS/VGA first 640KB-1MB region */ - start = (640 * 1024); - npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; + /* for CalIOC2 - avoid the entire first MB */ + if (is_calgary(dev->device)) { + start = (640 * 1024); + npages = ((1024 - 640) * 1024) >> PAGE_SHIFT; + } else { /* calioc2 */ + start = 0; + npages = (1 * 1024 * 1024) >> PAGE_SHIFT; + } iommu_range_reserve(tbl, start, npages); /* reserve the two PCI peripheral memory regions in IO space */ @@ -694,10 +817,17 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void 
__iomem *bbar) if (ret) return ret; - tbl = dev->sysdata; + tbl = pci_iommu(dev->bus); tbl->it_base = (unsigned long)bus_info[dev->bus->number].tce_space; tce_free(tbl, 0, tbl->it_size); + if (is_calgary(dev->device)) + tbl->chip_ops = &calgary_chip_ops; + else if (is_calioc2(dev->device)) + tbl->chip_ops = &calioc2_chip_ops; + else + BUG(); + calgary_reserve_regions(dev); /* set TARs for each PHB */ @@ -706,15 +836,15 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) /* zero out all TAR bits under sw control */ val64 &= ~TAR_SW_BITS; - - tbl = dev->sysdata; table_phys = (u64)__pa(tbl->it_base); + val64 |= table_phys; BUG_ON(specified_table_size > TCE_TABLE_SIZE_8M); val64 |= (u64) specified_table_size; tbl->tar_val = cpu_to_be64(val64); + writeq(tbl->tar_val, target); readq(target); /* flush */ @@ -724,7 +854,7 @@ static int __init calgary_setup_tar(struct pci_dev *dev, void __iomem *bbar) static void __init calgary_free_bus(struct pci_dev *dev) { u64 val64; - struct iommu_table *tbl = dev->sysdata; + struct iommu_table *tbl = pci_iommu(dev->bus); void __iomem *target; unsigned int bitmapsz; @@ -739,16 +869,81 @@ static void __init calgary_free_bus(struct pci_dev *dev) tbl->it_map = NULL; kfree(tbl); - dev->sysdata = NULL; + + set_pci_iommu(dev->bus, NULL); /* Can't free bootmem allocated memory after system is up :-( */ bus_info[dev->bus->number].tce_space = NULL; } +static void calgary_dump_error_regs(struct iommu_table *tbl) +{ + void __iomem *bbar = tbl->bbar; + void __iomem *target; + u32 csr, plssr; + + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_CSR_OFFSET); + csr = be32_to_cpu(readl(target)); + + target = calgary_reg(bbar, phb_offset(tbl->it_busno) | PHB_PLSSR_OFFSET); + plssr = be32_to_cpu(readl(target)); + + /* If no error, the agent ID in the CSR is not valid */ + printk(KERN_EMERG "Calgary: DMA error on Calgary PHB 0x%x, " + "0x%08x@CSR 0x%08x@PLSSR\n", tbl->it_busno, csr, plssr); +} + +static void 
calioc2_dump_error_regs(struct iommu_table *tbl) +{ + void __iomem *bbar = tbl->bbar; + u32 csr, csmr, plssr, mck, rcstat; + void __iomem *target; + unsigned long phboff = phb_offset(tbl->it_busno); + unsigned long erroff; + u32 errregs[7]; + int i; + + /* dump CSR */ + target = calgary_reg(bbar, phboff | PHB_CSR_OFFSET); + csr = be32_to_cpu(readl(target)); + /* dump PLSSR */ + target = calgary_reg(bbar, phboff | PHB_PLSSR_OFFSET); + plssr = be32_to_cpu(readl(target)); + /* dump CSMR */ + target = calgary_reg(bbar, phboff | 0x290); + csmr = be32_to_cpu(readl(target)); + /* dump mck */ + target = calgary_reg(bbar, phboff | 0x800); + mck = be32_to_cpu(readl(target)); + + printk(KERN_EMERG "Calgary: DMA error on CalIOC2 PHB 0x%x\n", + tbl->it_busno); + + printk(KERN_EMERG "Calgary: 0x%08x@CSR 0x%08x@PLSSR 0x%08x@CSMR 0x%08x@MCK\n", + csr, plssr, csmr, mck); + + /* dump rest of error regs */ + printk(KERN_EMERG "Calgary: "); + for (i = 0; i < ARRAY_SIZE(errregs); i++) { + /* err regs are at 0x810 - 0x870 */ + erroff = (0x810 + (i * 0x10)); + target = calgary_reg(bbar, phboff | erroff); + errregs[i] = be32_to_cpu(readl(target)); + printk("0x%08x@0x%lx ", errregs[i], erroff); + } + printk("\n"); + + /* root complex status */ + target = calgary_reg(bbar, phboff | PHB_ROOT_COMPLEX_STATUS); + rcstat = be32_to_cpu(readl(target)); + printk(KERN_EMERG "Calgary: 0x%08x@0x%x\n", rcstat, + PHB_ROOT_COMPLEX_STATUS); +} + static void calgary_watchdog(unsigned long data) { struct pci_dev *dev = (struct pci_dev *)data; - struct iommu_table *tbl = dev->sysdata; + struct iommu_table *tbl = pci_iommu(dev->bus); void __iomem *bbar = tbl->bbar; u32 val32; void __iomem *target; @@ -758,13 +953,14 @@ static void calgary_watchdog(unsigned long data) /* If no error, the agent ID in the CSR is not valid */ if (val32 & CSR_AGENT_MASK) { - printk(KERN_EMERG "calgary_watchdog: DMA error on PHB %#x, " - "CSR = %#x\n", dev->bus->number, val32); + tbl->chip_ops->dump_error_regs(tbl); + + /* reset 
error */ writel(0, target); /* Disable bus that caused the error */ target = calgary_reg(bbar, phb_offset(tbl->it_busno) | - PHB_CONFIG_RW_OFFSET); + PHB_CONFIG_RW_OFFSET); val32 = be32_to_cpu(readl(target)); val32 |= PHB_SLOT_DISABLE; writel(cpu_to_be32(val32), target); @@ -775,8 +971,8 @@ static void calgary_watchdog(unsigned long data) } } -static void __init calgary_increase_split_completion_timeout(void __iomem *bbar, - unsigned char busnum) +static void __init calgary_set_split_completion_timeout(void __iomem *bbar, + unsigned char busnum, unsigned long timeout) { u64 val64; void __iomem *target; @@ -802,11 +998,40 @@ static void __init calgary_increase_split_completion_timeout(void __iomem *bbar, /* zero out this PHB's timer bits */ mask = ~(0xFUL << phb_shift); val64 &= mask; - val64 |= (CCR_2SEC_TIMEOUT << phb_shift); + val64 |= (timeout << phb_shift); writeq(cpu_to_be64(val64), target); readq(target); /* flush */ } +static void calioc2_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) +{ + unsigned char busnum = dev->bus->number; + void __iomem *bbar = tbl->bbar; + void __iomem *target; + u32 val; + + /* + * CalIOC2 designers recommend setting bit 8 in 0xnDB0 to 1 + */ + target = calgary_reg(bbar, phb_offset(busnum) | PHB_SAVIOR_L2); + val = cpu_to_be32(readl(target)); + val |= 0x00800000; + writel(cpu_to_be32(val), target); +} + +static void calgary_handle_quirks(struct iommu_table *tbl, struct pci_dev *dev) +{ + unsigned char busnum = dev->bus->number; + + /* + * Give split completion a longer timeout on bus 1 for aic94xx + * http://bugzilla.kernel.org/show_bug.cgi?id=7180 + */ + if (is_calgary(dev->device) && (busnum == 1)) + calgary_set_split_completion_timeout(tbl->bbar, busnum, + CCR_2SEC_TIMEOUT); +} + static void __init calgary_enable_translation(struct pci_dev *dev) { u32 val32; @@ -816,7 +1041,7 @@ static void __init calgary_enable_translation(struct pci_dev *dev) struct iommu_table *tbl; busnum = dev->bus->number; - tbl = 
dev->sysdata; + tbl = pci_iommu(dev->bus); bbar = tbl->bbar; /* enable TCE in PHB Config Register */ @@ -824,20 +1049,15 @@ static void __init calgary_enable_translation(struct pci_dev *dev) val32 = be32_to_cpu(readl(target)); val32 |= PHB_TCE_ENABLE | PHB_DAC_DISABLE | PHB_MCSR_ENABLE; - printk(KERN_INFO "Calgary: enabling translation on PHB %#x\n", busnum); + printk(KERN_INFO "Calgary: enabling translation on %s PHB %#x\n", + (dev->device == PCI_DEVICE_ID_IBM_CALGARY) ? + "Calgary" : "CalIOC2", busnum); printk(KERN_INFO "Calgary: errant DMAs will now be prevented on this " "bus.\n"); writel(cpu_to_be32(val32), target); readl(target); /* flush */ - /* - * Give split completion a longer timeout on bus 1 for aic94xx - * http://bugzilla.kernel.org/show_bug.cgi?id=7180 - */ - if (busnum == 1) - calgary_increase_split_completion_timeout(bbar, busnum); - init_timer(&tbl->watchdog_timer); tbl->watchdog_timer.function = &calgary_watchdog; tbl->watchdog_timer.data = (unsigned long)dev; @@ -853,7 +1073,7 @@ static void __init calgary_disable_translation(struct pci_dev *dev) struct iommu_table *tbl; busnum = dev->bus->number; - tbl = dev->sysdata; + tbl = pci_iommu(dev->bus); bbar = tbl->bbar; /* disable TCE in PHB Config Register */ @@ -871,13 +1091,19 @@ static void __init calgary_disable_translation(struct pci_dev *dev) static void __init calgary_init_one_nontraslated(struct pci_dev *dev) { pci_dev_get(dev); - dev->sysdata = NULL; - dev->bus->self = dev; + set_pci_iommu(dev->bus, NULL); + + /* is the device behind a bridge? 
*/ + if (dev->bus->parent) + dev->bus->parent->self = dev; + else + dev->bus->self = dev; } static int __init calgary_init_one(struct pci_dev *dev) { void __iomem *bbar; + struct iommu_table *tbl; int ret; BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM); @@ -888,7 +1114,18 @@ static int __init calgary_init_one(struct pci_dev *dev) goto done; pci_dev_get(dev); - dev->bus->self = dev; + + if (dev->bus->parent) { + if (dev->bus->parent->self) + printk(KERN_WARNING "Calgary: IEEEE, dev %p has " + "bus->parent->self!\n", dev); + dev->bus->parent->self = dev; + } else + dev->bus->self = dev; + + tbl = pci_iommu(dev->bus); + tbl->chip_ops->handle_quirks(tbl, dev); + calgary_enable_translation(dev); return 0; @@ -924,11 +1161,18 @@ static int __init calgary_locate_bbars(void) target = calgary_reg(bbar, offset); val = be32_to_cpu(readl(target)); + start_bus = (u8)((val & 0x00FF0000) >> 16); end_bus = (u8)((val & 0x0000FF00) >> 8); - for (bus = start_bus; bus <= end_bus; bus++) { - bus_info[bus].bbar = bbar; - bus_info[bus].phbid = phb; + + if (end_bus) { + for (bus = start_bus; bus <= end_bus; bus++) { + bus_info[bus].bbar = bbar; + bus_info[bus].phbid = phb; + } + } else { + bus_info[start_bus].bbar = bbar; + bus_info[start_bus].phbid = phb; } } } @@ -948,22 +1192,24 @@ static int __init calgary_init(void) { int ret; struct pci_dev *dev = NULL; + void *tce_space; ret = calgary_locate_bbars(); if (ret) return ret; do { - dev = pci_get_device(PCI_VENDOR_ID_IBM, - PCI_DEVICE_ID_IBM_CALGARY, - dev); + dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); if (!dev) break; + if (!is_cal_pci_dev(dev->device)) + continue; if (!translate_phb(dev)) { calgary_init_one_nontraslated(dev); continue; } - if (!bus_info[dev->bus->number].tce_space && !translate_empty_slots) + tce_space = bus_info[dev->bus->number].tce_space; + if (!tce_space && !translate_empty_slots) continue; ret = calgary_init_one(dev); @@ -976,10 +1222,11 @@ static int __init calgary_init(void) error: do { dev = 
pci_get_device_reverse(PCI_VENDOR_ID_IBM, - PCI_DEVICE_ID_IBM_CALGARY, - dev); + PCI_ANY_ID, dev); if (!dev) break; + if (!is_cal_pci_dev(dev->device)) + continue; if (!translate_phb(dev)) { pci_dev_put(dev); continue; @@ -1057,9 +1304,29 @@ static int __init build_detail_arrays(void) return 0; } -void __init detect_calgary(void) +static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev) { + int dev; u32 val; + + if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { + /* + * FIXME: properly scan for devices across the + * PCI-to-PCI bridge on every CalIOC2 port. + */ + return 1; + } + + for (dev = 1; dev < 8; dev++) { + val = read_pci_config(bus, dev, 0, 0); + if (val != 0xffffffff) + break; + } + return (val != 0xffffffff); +} + +void __init detect_calgary(void) +{ int bus; void *tbl; int calgary_found = 0; @@ -1116,29 +1383,26 @@ void __init detect_calgary(void) specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { - int dev; struct calgary_bus_info *info = &bus_info[bus]; + unsigned short pci_device; + u32 val; + + val = read_pci_config(bus, 0, 0, 0); + pci_device = (val & 0xFFFF0000) >> 16; - if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) + if (!is_cal_pci_dev(pci_device)) continue; if (info->translation_disabled) continue; - /* - * Scan the slots of the PCI bus to see if there is a device present. - * The parent bus will be the zero-ith device, so start at 1. 
- */ - for (dev = 1; dev < 8; dev++) { - val = read_pci_config(bus, dev, 0, 0); - if (val != 0xffffffff || translate_empty_slots) { - tbl = alloc_tce_table(); - if (!tbl) - goto cleanup; - info->tce_space = tbl; - calgary_found = 1; - break; - } + if (calgary_bus_has_devices(bus, pci_device) || + translate_empty_slots) { + tbl = alloc_tce_table(); + if (!tbl) + goto cleanup; + info->tce_space = tbl; + calgary_found = 1; } } @@ -1249,3 +1513,66 @@ static int __init calgary_parse_options(char *p) return 1; } __setup("calgary=", calgary_parse_options); + +static void __init calgary_fixup_one_tce_space(struct pci_dev *dev) +{ + struct iommu_table *tbl; + unsigned int npages; + int i; + + tbl = pci_iommu(dev->bus); + + for (i = 0; i < 4; i++) { + struct resource *r = &dev->resource[PCI_BRIDGE_RESOURCES + i]; + + /* Don't give out TCEs that map MEM resources */ + if (!(r->flags & IORESOURCE_MEM)) + continue; + + /* 0-based? we reserve the whole 1st MB anyway */ + if (!r->start) + continue; + + /* cover the whole region */ + npages = (r->end - r->start) >> PAGE_SHIFT; + npages++; + + iommu_range_reserve(tbl, r->start, npages); + } +} + +static int __init calgary_fixup_tce_spaces(void) +{ + struct pci_dev *dev = NULL; + void *tce_space; + + if (no_iommu || swiotlb || !calgary_detected) + return -ENODEV; + + printk(KERN_DEBUG "Calgary: fixing up tce spaces\n"); + + do { + dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_ANY_ID, dev); + if (!dev) + break; + if (!is_cal_pci_dev(dev->device)) + continue; + if (!translate_phb(dev)) + continue; + + tce_space = bus_info[dev->bus->number].tce_space; + if (!tce_space) + continue; + + calgary_fixup_one_tce_space(dev); + + } while (1); + + return 0; +} + +/* + * We need to be called after pcibios_assign_resources (fs_initcall level) + * and before device_initcall. 
+ */ +rootfs_initcall(calgary_fixup_tce_spaces); diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index 9f80aad3fe2d..29711445c818 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -8,7 +8,7 @@ #include <linux/pci.h> #include <linux/module.h> #include <asm/io.h> -#include <asm/proto.h> +#include <asm/iommu.h> #include <asm/calgary.h> int iommu_merge __read_mostly = 0; @@ -22,8 +22,7 @@ EXPORT_SYMBOL(bad_dma_address); int iommu_bio_merge __read_mostly = 0; EXPORT_SYMBOL(iommu_bio_merge); -int iommu_sac_force __read_mostly = 0; -EXPORT_SYMBOL(iommu_sac_force); +static int iommu_sac_force __read_mostly = 0; int no_iommu __read_mostly; #ifdef CONFIG_IOMMU_DEBUG @@ -83,6 +82,10 @@ dma_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, if (dma_mask == 0) dma_mask = DMA_32BIT_MASK; + /* Device not DMA able */ + if (dev->dma_mask == NULL) + return NULL; + /* Don't invoke OOM killer */ gfp |= __GFP_NORETRY; @@ -322,6 +325,11 @@ static int __init pci_iommu_init(void) return 0; } +void pci_iommu_shutdown(void) +{ + gart_iommu_shutdown(); +} + #ifdef CONFIG_PCI /* Many VIA bridges seem to corrupt data for DAC. 
Disable it here */ diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index ae091cdc1a4d..4918c575d582 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -28,6 +28,7 @@ #include <asm/mtrr.h> #include <asm/pgtable.h> #include <asm/proto.h> +#include <asm/iommu.h> #include <asm/cacheflush.h> #include <asm/swiotlb.h> #include <asm/dma.h> @@ -235,7 +236,7 @@ static dma_addr_t gart_map_simple(struct device *dev, char *buf, } /* Map a single area into the IOMMU */ -dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) +static dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) { unsigned long phys_mem, bus; @@ -253,7 +254,7 @@ dma_addr_t gart_map_single(struct device *dev, void *addr, size_t size, int dir) /* * Free a DMA mapping. */ -void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, +static void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size, int direction) { unsigned long iommu_page; @@ -275,7 +276,7 @@ void gart_unmap_single(struct device *dev, dma_addr_t dma_addr, /* * Wrapper for pci_unmap_single working with scatterlists. 
*/ -void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) +static void gart_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) { int i; @@ -571,6 +572,26 @@ static const struct dma_mapping_ops gart_dma_ops = { .unmap_sg = gart_unmap_sg, }; +void gart_iommu_shutdown(void) +{ + struct pci_dev *dev; + int i; + + if (no_agp && (dma_ops != &gart_dma_ops)) + return; + + for (i = 0; i < num_k8_northbridges; i++) { + u32 ctl; + + dev = k8_northbridges[i]; + pci_read_config_dword(dev, 0x90, &ctl); + + ctl &= ~1; + + pci_write_config_dword(dev, 0x90, ctl); + } +} + void __init gart_iommu_init(void) { struct agp_kern_info info; diff --git a/arch/x86_64/kernel/pci-nommu.c b/arch/x86_64/kernel/pci-nommu.c index 6dade0c867cc..2a34c6c025a9 100644 --- a/arch/x86_64/kernel/pci-nommu.c +++ b/arch/x86_64/kernel/pci-nommu.c @@ -6,7 +6,7 @@ #include <linux/string.h> #include <linux/dma-mapping.h> -#include <asm/proto.h> +#include <asm/iommu.h> #include <asm/processor.h> #include <asm/dma.h> @@ -34,7 +34,7 @@ nommu_map_single(struct device *hwdev, void *ptr, size_t size, return bus; } -void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size, +static void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size, int direction) { } @@ -54,7 +54,7 @@ void nommu_unmap_single(struct device *dev, dma_addr_t addr,size_t size, * Device ownership issues as mentioned above for pci_map_single are * the same here. */ -int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, +static int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, int nents, int direction) { int i; @@ -74,7 +74,7 @@ int nommu_map_sg(struct device *hwdev, struct scatterlist *sg, * Again, cpu read rules concerning calls here are the same as for * pci_unmap_single() above. 
*/ -void nommu_unmap_sg(struct device *dev, struct scatterlist *sg, +static void nommu_unmap_sg(struct device *dev, struct scatterlist *sg, int nents, int dir) { } diff --git a/arch/x86_64/kernel/pci-swiotlb.c b/arch/x86_64/kernel/pci-swiotlb.c index 4b4569abc60c..b2f405ea7c85 100644 --- a/arch/x86_64/kernel/pci-swiotlb.c +++ b/arch/x86_64/kernel/pci-swiotlb.c @@ -5,7 +5,7 @@ #include <linux/module.h> #include <linux/dma-mapping.h> -#include <asm/proto.h> +#include <asm/iommu.h> #include <asm/swiotlb.h> #include <asm/dma.h> diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 5909039f37aa..98956555450b 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -23,6 +23,7 @@ #include <linux/sched.h> #include <linux/kernel.h> #include <linux/mm.h> +#include <linux/fs.h> #include <linux/elfcore.h> #include <linux/smp.h> #include <linux/slab.h> @@ -278,7 +279,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) */ if (!pm_idle) { if (!printed) { - printk("using mwait in idle threads.\n"); + printk(KERN_INFO "using mwait in idle threads.\n"); printed = 1; } pm_idle = mwait_idle; @@ -305,6 +306,7 @@ early_param("idle", idle_setup); void __show_regs(struct pt_regs * regs) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs; + unsigned long d0, d1, d2, d3, d6, d7; unsigned int fsindex,gsindex; unsigned int ds,cs,es; @@ -340,15 +342,24 @@ void __show_regs(struct pt_regs * regs) rdmsrl(MSR_GS_BASE, gs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); - asm("movq %%cr0, %0": "=r" (cr0)); - asm("movq %%cr2, %0": "=r" (cr2)); - asm("movq %%cr3, %0": "=r" (cr3)); - asm("movq %%cr4, %0": "=r" (cr4)); + cr0 = read_cr0(); + cr2 = read_cr2(); + cr3 = read_cr3(); + cr4 = read_cr4(); printk("FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", fs,fsindex,gs,gsindex,shadowgs); printk("CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, es, cr0); printk("CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, cr4); + + 
get_debugreg(d0, 0); + get_debugreg(d1, 1); + get_debugreg(d2, 2); + printk("DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); + get_debugreg(d3, 3); + get_debugreg(d6, 6); + get_debugreg(d7, 7); + printk("DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); } void show_regs(struct pt_regs *regs) diff --git a/arch/x86_64/kernel/ptrace.c b/arch/x86_64/kernel/ptrace.c index 9409117b9f19..eea3702427b4 100644 --- a/arch/x86_64/kernel/ptrace.c +++ b/arch/x86_64/kernel/ptrace.c @@ -102,16 +102,25 @@ unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *r u32 *desc; unsigned long base; - down(&child->mm->context.sem); - desc = child->mm->context.ldt + (seg & ~7); - base = (desc[0] >> 16) | ((desc[1] & 0xff) << 16) | (desc[1] & 0xff000000); + seg &= ~7UL; - /* 16-bit code segment? */ - if (!((desc[1] >> 22) & 1)) - addr &= 0xffff; - addr += base; + down(&child->mm->context.sem); + if (unlikely((seg >> 3) >= child->mm->context.size)) + addr = -1L; /* bogus selector, access would fault */ + else { + desc = child->mm->context.ldt + seg; + base = ((desc[0] >> 16) | + ((desc[1] & 0xff) << 16) | + (desc[1] & 0xff000000)); + + /* 16-bit code segment? */ + if (!((desc[1] >> 22) & 1)) + addr &= 0xffff; + addr += base; + } up(&child->mm->context.sem); } + return addr; } @@ -223,10 +232,6 @@ static int putreg(struct task_struct *child, { unsigned long tmp; - /* Some code in the 64bit emulation may not be 64bit clean. - Don't take any chances. */ - if (test_tsk_thread_flag(child, TIF_IA32)) - value &= 0xffffffff; switch (regno) { case offsetof(struct user_regs_struct,fs): if (value && (value & 3) != 3) @@ -313,17 +318,9 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) switch (request) { /* when I and D space are separate, these will need to be fixed. */ case PTRACE_PEEKTEXT: /* read word at location addr. 
*/ - case PTRACE_PEEKDATA: { - unsigned long tmp; - int copied; - - copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0); - ret = -EIO; - if (copied != sizeof(tmp)) - break; - ret = put_user(tmp,(unsigned long __user *) data); + case PTRACE_PEEKDATA: + ret = generic_ptrace_peekdata(child, addr, data); break; - } /* read the word at location addr in the USER area. */ case PTRACE_PEEKUSR: { @@ -367,10 +364,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) /* when I and D space are separate, this will have to be fixed. */ case PTRACE_POKETEXT: /* write the word at location addr. */ case PTRACE_POKEDATA: - ret = 0; - if (access_process_vm(child, addr, &data, sizeof(data), 1) == sizeof(data)) - break; - ret = -EIO; + ret = generic_ptrace_pokedata(child, addr, data); break; case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ diff --git a/arch/x86_64/kernel/reboot.c b/arch/x86_64/kernel/reboot.c index 7503068e788d..368db2b9c5ac 100644 --- a/arch/x86_64/kernel/reboot.c +++ b/arch/x86_64/kernel/reboot.c @@ -16,6 +16,7 @@ #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <asm/apic.h> +#include <asm/iommu.h> /* * Power off function, if any @@ -81,6 +82,7 @@ static inline void kb_wait(void) void machine_shutdown(void) { unsigned long flags; + /* Stop the cpus and apics */ #ifdef CONFIG_SMP int reboot_cpu_id; @@ -111,6 +113,8 @@ void machine_shutdown(void) disable_IO_APIC(); local_irq_restore(flags); + + pci_iommu_shutdown(); } void machine_emergency_restart(void) diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index eb6524f3ac29..af838f6b0b7f 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -575,6 +575,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) level = cpuid_eax(1); if (c->x86 == 15 && ((level >= 0x0f48 && level < 0x0f50) || level >= 0x0f58)) set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); + if (c->x86 == 0x10) + 
set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); /* Enable workaround for FXSAVE leak */ if (c->x86 >= 6) @@ -600,8 +602,14 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000008) amd_detect_cmp(c); - /* Fix cpuid4 emulation for more */ - num_cache_leaves = 3; + if (c->extended_cpuid_level >= 0x80000006 && + (cpuid_edx(0x80000006) & 0xf000)) + num_cache_leaves = 4; + else + num_cache_leaves = 3; + + if (c->x86 == 0xf || c->x86 == 0x10 || c->x86 == 0x11) + set_bit(X86_FEATURE_K8, &c->x86_capability); /* RDTSC can be speculated around */ clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); @@ -846,6 +854,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c) c->x86_capability[2] = cpuid_edx(0x80860001); } + init_scattered_cpuid_features(c); + c->apicid = phys_pkg_id(0); /* @@ -931,7 +941,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) "fpu", "vme", "de", "pse", "tsc", "msr", "pae", "mce", "cx8", "apic", NULL, "sep", "mtrr", "pge", "mca", "cmov", "pat", "pse36", "pn", "clflush", NULL, "dts", "acpi", "mmx", - "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", NULL, + "fxsr", "sse", "sse2", "ss", "ht", "tm", "ia64", "pbe", /* AMD-defined */ NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -947,10 +957,11 @@ static int show_cpuinfo(struct seq_file *m, void *v) NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* Other (Linux-defined) */ - "cxmmx", NULL, "cyrix_arr", "centaur_mcr", NULL, - "constant_tsc", NULL, NULL, - "up", NULL, NULL, NULL, NULL, NULL, NULL, NULL, - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "cxmmx", "k6_mtrr", "cyrix_arr", "centaur_mcr", + NULL, NULL, NULL, NULL, + "constant_tsc", "up", NULL, "arch_perfmon", + "pebs", "bts", NULL, "sync_rdtsc", + "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* Intel-defined (#2) */ @@ -961,7 +972,7 @@ static int show_cpuinfo(struct seq_file *m, void *v) /* VIA/Cyrix/Centaur-defined */ NULL, NULL, 
"rng", "rng_en", NULL, NULL, "ace", "ace_en", - NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "ace2", "ace2_en", "phe", "phe_en", "pmm", "pmm_en", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, @@ -972,6 +983,12 @@ static int show_cpuinfo(struct seq_file *m, void *v) "osvw", "ibs", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + + /* Auxiliary (Linux-defined) */ + "ida", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, }; static char *x86_power_flags[] = { "ts", /* temperature sensor */ diff --git a/arch/x86_64/kernel/signal.c b/arch/x86_64/kernel/signal.c index 290f5d8037cd..739175b01e06 100644 --- a/arch/x86_64/kernel/signal.c +++ b/arch/x86_64/kernel/signal.c @@ -26,6 +26,7 @@ #include <asm/i387.h> #include <asm/proto.h> #include <asm/ia32_unistd.h> +#include <asm/mce.h> /* #define DEBUG_SIG 1 */ @@ -472,6 +473,12 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) clear_thread_flag(TIF_SINGLESTEP); } +#ifdef CONFIG_X86_MCE + /* notify userspace of pending MCEs */ + if (thread_info_flags & _TIF_MCE_NOTIFY) + mce_notify_user(); +#endif /* CONFIG_X86_MCE */ + /* deal with pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING|_TIF_RESTORE_SIGMASK)) do_signal(regs); @@ -480,7 +487,7 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) void signal_fault(struct pt_regs *regs, void __user *frame, char *where) { struct task_struct *me = current; - if (exception_trace) + if (show_unhandled_signals && printk_ratelimit()) printk("%s[%d] bad frame in %s frame:%p rip:%lx rsp:%lx orax:%lx\n", me->comm,me->pid,where,frame,regs->rip,regs->rsp,regs->orig_rax); diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 
2ff468591625..df4a82812adb 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -357,7 +357,7 @@ __smp_call_function_single(int cpu, void (*func) (void *info), void *info, } /* - * smp_call_function_single - Run a function on another CPU + * smp_call_function_single - Run a function on a specific CPU * @func: The function to run. This must be fast and non-blocking. * @info: An arbitrary pointer to pass to the function. * @nonatomic: Currently unused. @@ -374,17 +374,21 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info, { /* prevent preemption and reschedule on another processor */ int me = get_cpu(); + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + if (cpu == me) { + local_irq_disable(); + func(info); + local_irq_enable(); put_cpu(); return 0; } - /* Can deadlock when called with interrupts disabled */ - WARN_ON(irqs_disabled()); - - spin_lock_bh(&call_lock); + spin_lock(&call_lock); __smp_call_function_single(cpu, func, info, nonatomic, wait); - spin_unlock_bh(&call_lock); + spin_unlock(&call_lock); put_cpu(); return 0; } diff --git a/arch/x86_64/kernel/suspend.c b/arch/x86_64/kernel/suspend.c index 6a5a98f2a75c..573c0a6e0ac6 100644 --- a/arch/x86_64/kernel/suspend.c +++ b/arch/x86_64/kernel/suspend.c @@ -55,11 +55,11 @@ void __save_processor_state(struct saved_context *ctxt) * control registers */ rdmsrl(MSR_EFER, ctxt->efer); - asm volatile ("movq %%cr0, %0" : "=r" (ctxt->cr0)); - asm volatile ("movq %%cr2, %0" : "=r" (ctxt->cr2)); - asm volatile ("movq %%cr3, %0" : "=r" (ctxt->cr3)); - asm volatile ("movq %%cr4, %0" : "=r" (ctxt->cr4)); - asm volatile ("movq %%cr8, %0" : "=r" (ctxt->cr8)); + ctxt->cr0 = read_cr0(); + ctxt->cr2 = read_cr2(); + ctxt->cr3 = read_cr3(); + ctxt->cr4 = read_cr4(); + ctxt->cr8 = read_cr8(); } void save_processor_state(void) @@ -81,11 +81,11 @@ void __restore_processor_state(struct saved_context *ctxt) * control registers */ wrmsrl(MSR_EFER, 
ctxt->efer); - asm volatile ("movq %0, %%cr8" :: "r" (ctxt->cr8)); - asm volatile ("movq %0, %%cr4" :: "r" (ctxt->cr4)); - asm volatile ("movq %0, %%cr3" :: "r" (ctxt->cr3)); - asm volatile ("movq %0, %%cr2" :: "r" (ctxt->cr2)); - asm volatile ("movq %0, %%cr0" :: "r" (ctxt->cr0)); + write_cr8(ctxt->cr8); + write_cr4(ctxt->cr4); + write_cr3(ctxt->cr3); + write_cr2(ctxt->cr2); + write_cr0(ctxt->cr0); /* * now restore the descriptor tables to their proper values @@ -146,7 +146,7 @@ void fix_processor_context(void) } -#ifdef CONFIG_SOFTWARE_SUSPEND +#ifdef CONFIG_HIBERNATION /* Defined in arch/x86_64/kernel/suspend_asm.S */ extern int restore_image(void); @@ -236,4 +236,4 @@ int pfn_is_nosave(unsigned long pfn) unsigned long nosave_end_pfn = PAGE_ALIGN(__pa_symbol(&__nosave_end)) >> PAGE_SHIFT; return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn); } -#endif /* CONFIG_SOFTWARE_SUSPEND */ +#endif /* CONFIG_HIBERNATION */ diff --git a/arch/x86_64/kernel/sys_x86_64.c b/arch/x86_64/kernel/sys_x86_64.c index d067d9a2ad27..4770b7a2052c 100644 --- a/arch/x86_64/kernel/sys_x86_64.c +++ b/arch/x86_64/kernel/sys_x86_64.c @@ -6,6 +6,7 @@ #include <linux/sched.h> #include <linux/syscalls.h> #include <linux/mm.h> +#include <linux/fs.h> #include <linux/smp.h> #include <linux/sem.h> #include <linux/msg.h> diff --git a/arch/x86_64/kernel/tce.c b/arch/x86_64/kernel/tce.c index f61fb8e4f129..e3f2569b2c44 100644 --- a/arch/x86_64/kernel/tce.c +++ b/arch/x86_64/kernel/tce.c @@ -131,14 +131,14 @@ done: return ret; } -int build_tce_table(struct pci_dev *dev, void __iomem *bbar) +int __init build_tce_table(struct pci_dev *dev, void __iomem *bbar) { struct iommu_table *tbl; int ret; - if (dev->sysdata) { - printk(KERN_ERR "Calgary: dev %p has sysdata %p\n", - dev, dev->sysdata); + if (pci_iommu(dev->bus)) { + printk(KERN_ERR "Calgary: dev %p has sysdata->iommu %p\n", + dev, pci_iommu(dev->bus)); BUG(); } @@ -155,11 +155,7 @@ int build_tce_table(struct pci_dev *dev, void __iomem *bbar) 
tbl->bbar = bbar; - /* - * NUMA is already using the bus's sysdata pointer, so we use - * the bus's pci_dev's sysdata instead. - */ - dev->sysdata = tbl; + set_pci_iommu(dev->bus, tbl); return 0; @@ -169,7 +165,7 @@ done: return ret; } -void* alloc_tce_table(void) +void * __init alloc_tce_table(void) { unsigned int size; @@ -179,7 +175,7 @@ void* alloc_tce_table(void) return __alloc_bootmem_low(size, size, 0); } -void free_tce_table(void *tbl) +void __init free_tce_table(void *tbl) { unsigned int size; diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 4a0895bacf51..6d48a4e826d9 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -33,6 +33,7 @@ #include <acpi/acpi_bus.h> #endif #include <asm/8253pit.h> +#include <asm/i8253.h> #include <asm/pgtable.h> #include <asm/vsyscall.h> #include <asm/timex.h> @@ -44,12 +45,14 @@ #include <asm/hpet.h> #include <asm/mpspec.h> #include <asm/nmi.h> +#include <asm/vgtod.h> static char *timename = NULL; DEFINE_SPINLOCK(rtc_lock); EXPORT_SYMBOL(rtc_lock); DEFINE_SPINLOCK(i8253_lock); +EXPORT_SYMBOL(i8253_lock); volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; @@ -79,8 +82,9 @@ EXPORT_SYMBOL(profile_pc); * sheet for details. 
*/ -static void set_rtc_mmss(unsigned long nowtime) +static int set_rtc_mmss(unsigned long nowtime) { + int retval = 0; int real_seconds, real_minutes, cmos_minutes; unsigned char control, freq_select; @@ -120,6 +124,7 @@ static void set_rtc_mmss(unsigned long nowtime) if (abs(real_minutes - cmos_minutes) >= 30) { printk(KERN_WARNING "time.c: can't update CMOS clock " "from %d to %d\n", cmos_minutes, real_minutes); + retval = -1; } else { BIN_TO_BCD(real_seconds); BIN_TO_BCD(real_minutes); @@ -139,12 +144,17 @@ static void set_rtc_mmss(unsigned long nowtime) CMOS_WRITE(freq_select, RTC_FREQ_SELECT); spin_unlock(&rtc_lock); + + return retval; } +int update_persistent_clock(struct timespec now) +{ + return set_rtc_mmss(now.tv_sec); +} void main_timer_handler(void) { - static unsigned long rtc_update = 0; /* * Here we are in the timer irq handler. We have irqs locally disabled (so we * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running @@ -172,20 +182,6 @@ void main_timer_handler(void) if (!using_apic_timer) smp_local_timer_interrupt(); -/* - * If we have an externally synchronized Linux clock, then update CMOS clock - * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy - * closest to exactly 500 ms before the next second. If the update fails, we - * don't care, as it'll be updated on the next turn, and the problem (time way - * off) isn't likely to go away much sooner anyway. 
- */ - - if (ntp_synced() && xtime.tv_sec > rtc_update && - abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) { - set_rtc_mmss(xtime.tv_sec); - rtc_update = xtime.tv_sec + 660; - } - write_sequnlock(&xtime_lock); } @@ -199,7 +195,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -static unsigned long get_cmos_time(void) +unsigned long read_persistent_clock(void) { unsigned int year, mon, day, hour, min, sec; unsigned long flags; @@ -226,7 +222,7 @@ static unsigned long get_cmos_time(void) /* * We know that x86-64 always uses BCD format, no need to check the * config register. - */ + */ BCD_TO_BIN(sec); BCD_TO_BIN(min); @@ -239,11 +235,11 @@ static unsigned long get_cmos_time(void) BCD_TO_BIN(century); year += century * 100; printk(KERN_INFO "Extended CMOS year: %d\n", century * 100); - } else { + } else { /* * x86-64 systems only exists since 2002. * This will work up to Dec 31, 2100 - */ + */ year += 2000; } @@ -255,45 +251,45 @@ static unsigned long get_cmos_time(void) #define TICK_COUNT 100000000 static unsigned int __init tsc_calibrate_cpu_khz(void) { - int tsc_start, tsc_now; - int i, no_ctr_free; - unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; - unsigned long flags; - - for (i = 0; i < 4; i++) - if (avail_to_resrv_perfctr_nmi_bit(i)) - break; - no_ctr_free = (i == 4); - if (no_ctr_free) { - i = 3; - rdmsrl(MSR_K7_EVNTSEL3, evntsel3); - wrmsrl(MSR_K7_EVNTSEL3, 0); - rdmsrl(MSR_K7_PERFCTR3, pmc3); - } else { - reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); - reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - local_irq_save(flags); - /* start meauring cycles, incrementing from 0 */ - wrmsrl(MSR_K7_PERFCTR0 + i, 0); - wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); - rdtscl(tsc_start); - do { - rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); - tsc_now = get_cycles_sync(); - } while ((tsc_now - tsc_start) < TICK_COUNT); - - local_irq_restore(flags); - if (no_ctr_free) { - wrmsrl(MSR_K7_EVNTSEL3, 0); - wrmsrl(MSR_K7_PERFCTR3, pmc3); - 
wrmsrl(MSR_K7_EVNTSEL3, evntsel3); - } else { - release_perfctr_nmi(MSR_K7_PERFCTR0 + i); - release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); - } - - return pmc_now * tsc_khz / (tsc_now - tsc_start); + int tsc_start, tsc_now; + int i, no_ctr_free; + unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0; + unsigned long flags; + + for (i = 0; i < 4; i++) + if (avail_to_resrv_perfctr_nmi_bit(i)) + break; + no_ctr_free = (i == 4); + if (no_ctr_free) { + i = 3; + rdmsrl(MSR_K7_EVNTSEL3, evntsel3); + wrmsrl(MSR_K7_EVNTSEL3, 0); + rdmsrl(MSR_K7_PERFCTR3, pmc3); + } else { + reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i); + reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + local_irq_save(flags); + /* start meauring cycles, incrementing from 0 */ + wrmsrl(MSR_K7_PERFCTR0 + i, 0); + wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76); + rdtscl(tsc_start); + do { + rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now); + tsc_now = get_cycles_sync(); + } while ((tsc_now - tsc_start) < TICK_COUNT); + + local_irq_restore(flags); + if (no_ctr_free) { + wrmsrl(MSR_K7_EVNTSEL3, 0); + wrmsrl(MSR_K7_PERFCTR3, pmc3); + wrmsrl(MSR_K7_EVNTSEL3, evntsel3); + } else { + release_perfctr_nmi(MSR_K7_PERFCTR0 + i); + release_evntsel_nmi(MSR_K7_EVNTSEL0 + i); + } + + return pmc_now * tsc_khz / (tsc_now - tsc_start); } /* @@ -321,7 +317,7 @@ static unsigned int __init pit_calibrate_tsc(void) end = get_cycles_sync(); spin_unlock_irqrestore(&i8253_lock, flags); - + return (end - start) / 50; } @@ -366,25 +362,20 @@ static struct irqaction irq0 = { .handler = timer_interrupt, .flags = IRQF_DISABLED | IRQF_IRQPOLL, .mask = CPU_MASK_NONE, - .name = "timer" + .name = "timer" }; void __init time_init(void) { if (nohpet) hpet_address = 0; - xtime.tv_sec = get_cmos_time(); - xtime.tv_nsec = 0; - - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); if (hpet_arch_init()) hpet_address = 0; if (hpet_use_timer) { /* set tick_nsec to use the proper rate for HPET */ - tick_nsec = TICK_NSEC_HPET; + tick_nsec = 
TICK_NSEC_HPET; tsc_khz = hpet_calibrate_tsc(); timename = "HPET"; } else { @@ -415,54 +406,21 @@ void __init time_init(void) setup_irq(0, &irq0); } - -static long clock_cmos_diff; -static unsigned long sleep_start; - /* * sysfs support for the timer. */ static int timer_suspend(struct sys_device *dev, pm_message_t state) { - /* - * Estimate time zone so that set_time can update the clock - */ - long cmos_time = get_cmos_time(); - - clock_cmos_diff = -cmos_time; - clock_cmos_diff += get_seconds(); - sleep_start = cmos_time; return 0; } static int timer_resume(struct sys_device *dev) { - unsigned long flags; - unsigned long sec; - unsigned long ctime = get_cmos_time(); - long sleep_length = (ctime - sleep_start) * HZ; - - if (sleep_length < 0) { - printk(KERN_WARNING "Time skew detected in timer resume!\n"); - /* The time after the resume must not be earlier than the time - * before the suspend or some nasty things will happen - */ - sleep_length = 0; - ctime = sleep_start; - } if (hpet_address) hpet_reenable(); else i8254_timer_resume(); - - sec = ctime + clock_cmos_diff; - write_seqlock_irqsave(&xtime_lock,flags); - xtime.tv_sec = sec; - xtime.tv_nsec = 0; - jiffies += sleep_length; - write_sequnlock_irqrestore(&xtime_lock,flags); - touch_softlockup_watchdog(); return 0; } diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index aac1c0be54c6..03888420775d 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -34,6 +34,10 @@ #include <linux/bug.h> #include <linux/kdebug.h> +#if defined(CONFIG_EDAC) +#include <linux/edac.h> +#endif + #include <asm/system.h> #include <asm/io.h> #include <asm/atomic.h> @@ -330,6 +334,7 @@ static int print_trace_stack(void *data, char *name) static void print_trace_address(void *data, unsigned long addr) { + touch_nmi_watchdog(); printk_address(addr); } @@ -518,6 +523,7 @@ void __kprobes __die(const char * str, struct pt_regs * regs, long err) printk("\n"); notify_die(DIE_OOPS, str, regs, err, 
current->thread.trap_no, SIGSEGV); show_registers(regs); + add_taint(TAINT_DIE); /* Executive summary in case the oops scrolled away */ printk(KERN_ALERT "RIP "); printk_address(regs->rip); @@ -531,7 +537,7 @@ void die(const char * str, struct pt_regs * regs, long err) unsigned long flags = oops_begin(); if (!user_mode(regs)) - report_bug(regs->rip); + report_bug(regs->rip, regs); __die(str, regs, err); oops_end(flags); @@ -578,7 +584,8 @@ static void __kprobes do_trap(int trapnr, int signr, char *str, tsk->thread.error_code = error_code; tsk->thread.trap_no = trapnr; - if (exception_trace && unhandled_signal(tsk, signr)) + if (show_unhandled_signals && unhandled_signal(tsk, signr) && + printk_ratelimit()) printk(KERN_INFO "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", tsk->comm, tsk->pid, str, @@ -682,7 +689,8 @@ asmlinkage void __kprobes do_general_protection(struct pt_regs * regs, tsk->thread.error_code = error_code; tsk->thread.trap_no = 13; - if (exception_trace && unhandled_signal(tsk, SIGSEGV)) + if (show_unhandled_signals && unhandled_signal(tsk, SIGSEGV) && + printk_ratelimit()) printk(KERN_INFO "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", tsk->comm, tsk->pid, @@ -717,6 +725,13 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) reason); printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); +#if defined(CONFIG_EDAC) + if(edac_handler_set()) { + edac_atomic_assert_error(); + return; + } +#endif + if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); diff --git a/arch/x86_64/kernel/tsc.c b/arch/x86_64/kernel/tsc.c index 48f9a8e6aa91..2a59bde663f2 100644 --- a/arch/x86_64/kernel/tsc.c +++ b/arch/x86_64/kernel/tsc.c @@ -44,7 +44,7 @@ unsigned long long sched_clock(void) static int tsc_unstable; -static inline int check_tsc_unstable(void) +inline int check_tsc_unstable(void) { return tsc_unstable; } @@ -61,25 +61,9 @@ static inline int check_tsc_unstable(void) * first tick after the change will be slightly 
wrong. */ -#include <linux/workqueue.h> - -static unsigned int cpufreq_delayed_issched = 0; -static unsigned int cpufreq_init = 0; -static struct work_struct cpufreq_delayed_get_work; - -static void handle_cpufreq_delayed_get(struct work_struct *v) -{ - unsigned int cpu; - for_each_online_cpu(cpu) { - cpufreq_get(cpu); - } - cpufreq_delayed_issched = 0; -} - -static unsigned int ref_freq = 0; -static unsigned long loops_per_jiffy_ref = 0; - -static unsigned long tsc_khz_ref = 0; +static unsigned int ref_freq; +static unsigned long loops_per_jiffy_ref; +static unsigned long tsc_khz_ref; static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data) @@ -125,10 +109,8 @@ static struct notifier_block time_cpufreq_notifier_block = { static int __init cpufreq_tsc(void) { - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get); - if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER)) - cpufreq_init = 1; + cpufreq_register_notifier(&time_cpufreq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); return 0; } @@ -136,8 +118,6 @@ core_initcall(cpufreq_tsc); #endif -static int tsc_unstable = 0; - /* * Make an educated guess if the TSC is trustworthy and synchronized * over all CPUs. 
@@ -153,17 +133,18 @@ __cpuinit int unsynchronized_tsc(void) #endif /* Most intel systems have synchronized TSCs except for multi node systems */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { #ifdef CONFIG_ACPI /* But TSC doesn't tick in C3 so don't use it there */ - if (acpi_gbl_FADT.header.length > 0 && acpi_gbl_FADT.C3latency < 1000) + if (acpi_gbl_FADT.header.length > 0 && + acpi_gbl_FADT.C3latency < 1000) return 1; #endif - return 0; + return 0; } - /* Assume multi socket systems are not synchronized */ - return num_present_cpus() > 1; + /* Assume multi socket systems are not synchronized */ + return num_present_cpus() > 1; } int __init notsc_setup(char *s) diff --git a/arch/x86_64/kernel/verify_cpu.S b/arch/x86_64/kernel/verify_cpu.S index e035f5948199..45b6f8a975a1 100644 --- a/arch/x86_64/kernel/verify_cpu.S +++ b/arch/x86_64/kernel/verify_cpu.S @@ -37,20 +37,6 @@ verify_cpu: pushl $0 # Kill any dangerous flags popfl - /* minimum CPUID flags for x86-64 as defined by AMD */ -#define M(x) (1<<(x)) -#define M2(a,b) M(a)|M(b) -#define M4(a,b,c,d) M(a)|M(b)|M(c)|M(d) - -#define SSE_MASK \ - (M2(X86_FEATURE_XMM,X86_FEATURE_XMM2)) -#define REQUIRED_MASK1 \ - (M4(X86_FEATURE_FPU,X86_FEATURE_PSE,X86_FEATURE_TSC,X86_FEATURE_MSR)|\ - M4(X86_FEATURE_PAE,X86_FEATURE_CX8,X86_FEATURE_PGE,X86_FEATURE_CMOV)|\ - M(X86_FEATURE_FXSR)) -#define REQUIRED_MASK2 \ - (M(X86_FEATURE_LM - 32)) - pushfl # standard way to check for cpuid popl %eax movl %eax,%ebx @@ -79,8 +65,8 @@ verify_cpu: verify_cpu_noamd: movl $0x1,%eax # Does the cpu have what it takes cpuid - andl $REQUIRED_MASK1,%edx - xorl $REQUIRED_MASK1,%edx + andl $REQUIRED_MASK0,%edx + xorl $REQUIRED_MASK0,%edx jnz verify_cpu_no_longmode movl $0x80000000,%eax # See if extended cpuid is implemented @@ -90,8 +76,8 @@ verify_cpu_noamd: movl $0x80000001,%eax # Does the cpu have what it takes cpuid - andl $REQUIRED_MASK2,%edx - xorl $REQUIRED_MASK2,%edx + andl 
$REQUIRED_MASK1,%edx + xorl $REQUIRED_MASK1,%edx jnz verify_cpu_no_longmode verify_cpu_sse_test: diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index dbccfda8364f..ba8ea97abd21 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -28,7 +28,7 @@ SECTIONS _text = .; /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { /* First the code that has to be first for bootstrapping */ - *(.bootstrap.text) + *(.text.head) _stext = .; /* Then the rest */ TEXT_TEXT @@ -48,10 +48,19 @@ SECTIONS __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { *(__ex_table) } __stop___ex_table = .; - BUG_TABLE + NOTES :text :note + + BUG_TABLE :text RODATA + . = ALIGN(4); + .tracedata : AT(ADDR(.tracedata) - LOAD_OFFSET) { + __tracedata_start = .; + *(.tracedata) + __tracedata_end = .; + } + . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { @@ -91,6 +100,9 @@ SECTIONS .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { *(.vsyscall_gtod_data) } vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) + { *(.vsyscall_clock) } + vsyscall_clock = VVIRT(.vsyscall_clock); .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) @@ -131,20 +143,11 @@ SECTIONS /* might get freed after init */ . = ALIGN(4096); __smp_alt_begin = .; - __smp_alt_instructions = .; - .smp_altinstructions : AT(ADDR(.smp_altinstructions) - LOAD_OFFSET) { - *(.smp_altinstructions) - } - __smp_alt_instructions_end = .; - . = ALIGN(8); __smp_locks = .; .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { *(.smp_locks) } __smp_locks_end = .; - .smp_altinstr_replacement : AT(ADDR(.smp_altinstr_replacement) - LOAD_OFFSET) { - *(.smp_altinstr_replacement) - } . 
= ALIGN(4096); __smp_alt_end = .; @@ -187,6 +190,12 @@ SECTIONS .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { *(.exit.text) } .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { *(.exit.data) } +/* vdso blob that is mapped into user space */ + vdso_start = . ; + .vdso : AT(ADDR(.vdso) - LOAD_OFFSET) { *(.vdso) } + . = ALIGN(4096); + vdso_end = .; + #ifdef CONFIG_BLK_DEV_INITRD . = ALIGN(4096); __initramfs_start = .; @@ -194,10 +203,8 @@ SECTIONS __initramfs_end = .; #endif - . = ALIGN(4096); - __per_cpu_start = .; - .data.percpu : AT(ADDR(.data.percpu) - LOAD_OFFSET) { *(.data.percpu) } - __per_cpu_end = .; + PERCPU(4096) + . = ALIGN(4096); __init_end = .; diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index 57660d58d500..06c34949bfdc 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -42,6 +42,7 @@ #include <asm/segment.h> #include <asm/desc.h> #include <asm/topology.h> +#include <asm/vgtod.h> #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) #define __syscall_clobber "r11","rcx","memory" @@ -57,26 +58,9 @@ * - writen by timer interrupt or systcl (/proc/sys/kernel/vsyscall64) * Try to keep this structure as small as possible to avoid cache line ping pongs */ -struct vsyscall_gtod_data_t { - seqlock_t lock; - - /* open coded 'struct timespec' */ - time_t wall_time_sec; - u32 wall_time_nsec; - - int sysctl_enabled; - struct timezone sys_tz; - struct { /* extract of a clocksource struct */ - cycle_t (*vread)(void); - cycle_t cycle_last; - cycle_t mask; - u32 mult; - u32 shift; - } clock; -}; int __vgetcpu_mode __section_vgetcpu_mode; -struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data = +struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = { .lock = SEQLOCK_UNLOCKED, .sysctl_enabled = 1, @@ -96,6 +80,8 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) vsyscall_gtod_data.wall_time_sec = 
wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; vsyscall_gtod_data.sys_tz = sys_tz; + vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; + vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } |