10 files changed, 480 insertions, 100 deletions
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 8e41071704a5..49ad773f4b9f 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -1,6 +1,7 @@
 #ifndef _ASM_X86_AMD_NB_H
 #define _ASM_X86_AMD_NB_H
 
+#include <linux/ioport.h>
 #include <linux/pci.h>
 
 struct amd_nb_bus_dev_range {
@@ -13,6 +14,7 @@ extern const struct pci_device_id amd_nb_misc_ids[];
 extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
 
 extern bool early_is_amd_nb(u32 value);
+extern struct resource *amd_get_mmconfig_range(struct resource *res);
 extern int amd_cache_northbridges(void);
 extern void amd_flush_garts(void);
 extern int amd_numa_init(void);
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 8e862aaf0d90..1b82f7e87393 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -65,7 +65,7 @@
  * UV2: Bit 19 selects between
  *  (0): 10 microsecond timebase and
  *  (1): 80 microseconds
- *  we're using 655us, similar to UV1: 65 units of 10us
+ *  we're using 560us, similar to UV1: 65 units of 10us
  */
 #define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL)
 #define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (15UL)
@@ -167,6 +167,7 @@
 #define FLUSH_RETRY_TIMEOUT		2
 #define FLUSH_GIVEUP			3
 #define FLUSH_COMPLETE			4
+#define FLUSH_RETRY_BUSYBUG		5
 
 /*
  * tuning the action when the numalink network is extremely delayed
@@ -235,10 +236,10 @@ struct bau_msg_payload {
 
 
 /*
- * Message header:  16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
+ * UV1 Message header:  16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
  * see table 4.2.3.0.1 in broacast_assist spec.
  */
-struct bau_msg_header {
+struct uv1_bau_msg_header {
 	unsigned int	dest_subnodeid:6;	/* must be 0x10, for the LB */
 	/* bits 5:0 */
 	unsigned int	base_dest_nasid:15;	/* nasid of the first bit */
@@ -318,19 +319,87 @@ struct bau_msg_header {
 };
 
 /*
+ * UV2 Message header:  16 bytes (128 bits) (bytes 0x30-0x3f of descriptor)
+ * see figure 9-2 of harp_sys.pdf
+ */
+struct uv2_bau_msg_header {
+	unsigned int	base_dest_nasid:15;	/* nasid of the first bit */
+	/* bits 14:0 */				/* in uvhub map */
+	unsigned int	dest_subnodeid:5;	/* must be 0x10, for the LB */
+	/* bits 19:15 */
+	unsigned int	rsvd_1:1;		/* must be zero */
+	/* bit 20 */
+	/* Address bits 59:21 */
+	/* bits 25:2 of address (44:21) are payload */
+	/* these next 24 bits become bytes 12-14 of msg */
+	/* bits 28:21 land in byte 12 */
+	unsigned int	replied_to:1;		/* sent as 0 by the source to
+						   byte 12 */
+	/* bit 21 */
+	unsigned int	msg_type:3;		/* software type of the
+						   message */
+	/* bits 24:22 */
+	unsigned int	canceled:1;		/* message canceled, resource
+						   is to be freed*/
+	/* bit 25 */
+	unsigned int	payload_1:3;		/* not currently used */
+	/* bits 28:26 */
+
+	/* bits 36:29 land in byte 13 */
+	unsigned int	payload_2a:3;		/* not currently used */
+	unsigned int	payload_2b:5;		/* not currently used */
+	/* bits 36:29 */
+
+	/* bits 44:37 land in byte 14 */
+	unsigned int	payload_3:8;		/* not currently used */
+	/* bits 44:37 */
+
+	unsigned int	rsvd_2:7;		/* reserved */
+	/* bits 51:45 */
+	unsigned int	swack_flag:1;		/* software acknowledge flag */
+	/* bit 52 */
+	unsigned int	rsvd_3a:3;		/* must be zero */
+	unsigned int	rsvd_3b:8;		/* must be zero */
+	unsigned int	rsvd_3c:8;		/* must be zero */
+	unsigned int	rsvd_3d:3;		/* must be zero */
+	/* bits 74:53 */
+	unsigned int	fairness:3;		/* usually zero */
+	/* bits 77:75 */
+
+	unsigned int	sequence:16;		/* message sequence number */
+	/* bits 93:78  Suppl_A  */
+	unsigned int	chaining:1;		/* next descriptor is part of
+						   this activation*/
+	/* bit 94 */
+	unsigned int	multilevel:1;		/* multi-level multicast
+						   format */
+	/* bit 95 */
+	unsigned int	rsvd_4:24;		/* ordered / source node /
+						   source subnode / aging
+						   must be zero */
+	/* bits 119:96 */
+	unsigned int	command:8;		/* message type */
+	/* bits 127:120 */
+};
+
+/*
  * The activation descriptor:
  * The format of the message to send, plus all accompanying control
  * Should be 64 bytes
  */
 struct bau_desc {
-	struct pnmask			distribution;
+	struct pnmask				distribution;
 	/*
 	 * message template, consisting of header and payload:
 	 */
-	struct bau_msg_header		header;
-	struct bau_msg_payload		payload;
+	union bau_msg_header {
+		struct uv1_bau_msg_header	uv1_hdr;
+		struct uv2_bau_msg_header	uv2_hdr;
+	} header;
+
+	struct bau_msg_payload			payload;
 };
-/*
+/* UV1:
  *   -payload--    ---------header------
  *   bytes 0-11    bits 41-56  bits 58-81
  *       A           B  (2)      C (3)
@@ -340,6 +409,16 @@ struct bau_desc {
  *   bytes 0-11  bytes 12-14  bytes 16-17  (byte 15 filled in by hw as vector)
  *   ------------payload queue-----------
  */
+/* UV2:
+ *   -payload--    ---------header------
+ *   bytes 0-11    bits 70-78  bits 21-44
+ *       A           B  (2)      C (3)
+ *
+ *            A/B/C are moved to:
+ *       A            C          B
+ *   bytes 0-11  bytes 12-14  bytes 16-17  (byte 15 filled in by hw as vector)
+ *   ------------payload queue-----------
+ */
 
 /*
  * The payload queue on the destination side is an array of these.
@@ -385,7 +464,6 @@ struct bau_pq_entry {
 struct msg_desc {
 	struct bau_pq_entry	*msg;
 	int			msg_slot;
-	int			swack_slot;
 	struct bau_pq_entry	*queue_first;
 	struct bau_pq_entry	*queue_last;
 };
@@ -439,6 +517,9 @@ struct ptc_stats {
 	unsigned long	s_retry_messages;	/* retry broadcasts */
 	unsigned long	s_bau_reenabled;	/* for bau enable/disable */
 	unsigned long	s_bau_disabled;		/* for bau enable/disable */
+	unsigned long	s_uv2_wars;		/* uv2 workaround, perm. busy */
+	unsigned long	s_uv2_wars_hw;		/* uv2 workaround, hiwater */
+	unsigned long	s_uv2_war_waits;	/* uv2 workaround, long waits */
 	/* destination statistics */
 	unsigned long	d_alltlb;		/* times all tlb's on this
 						   cpu were flushed */
@@ -511,9 +592,12 @@ struct bau_control {
 	short			osnode;
 	short			uvhub_cpu;
 	short			uvhub;
+	short			uvhub_version;
 	short			cpus_in_socket;
 	short			cpus_in_uvhub;
 	short			partition_base_pnode;
+	short			using_desc; /* an index, like uvhub_cpu */
+	unsigned int		inuse_map;
 	unsigned short		message_number;
 	unsigned short		uvhub_quiesce;
 	short			socket_acknowledge_count[DEST_Q_SIZE];
@@ -531,6 +615,7 @@ struct bau_control {
 	int			cong_response_us;
 	int			cong_reps;
 	int			cong_period;
+	unsigned long		clocks_per_100_usec;
 	cycles_t		period_time;
 	long			period_requests;
 	struct hub_and_pnode	*thp;
@@ -591,6 +676,11 @@ static inline void write_mmr_sw_ack(unsigned long mr)
 	uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
 }
 
+static inline void write_gmmr_sw_ack(int pnode, unsigned long mr)
+{
+	write_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
+}
+
 static inline unsigned long read_mmr_sw_ack(void)
 {
 	return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
index 4c39baa8facc..bae1efe6d515 100644
--- a/arch/x86/kernel/amd_nb.c
+++ b/arch/x86/kernel/amd_nb.c
@@ -119,6 +119,37 @@ bool __init early_is_amd_nb(u32 device)
 	return false;
 }
 
+struct resource *amd_get_mmconfig_range(struct resource *res)
+{
+	u32 address;
+	u64 base, msr;
+	unsigned segn_busn_bits;
+
+	if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
+		return NULL;
+
+	/* assume all cpus from fam10h have mmconfig */
+        if (boot_cpu_data.x86 < 0x10)
+		return NULL;
+
+	address = MSR_FAM10H_MMIO_CONF_BASE;
+	rdmsrl(address, msr);
+
+	/* mmconfig is not enabled */
+	if (!(msr & FAM10H_MMIO_CONF_ENABLE))
+		return NULL;
+
+	base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
+
+	segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
+			 FAM10H_MMIO_CONF_BUSRANGE_MASK;
+
+	res->flags = IORESOURCE_MEM;
+	res->start = base;
+	res->end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
+	return res;
+}
+
 int amd_get_subcaches(int cpu)
 {
 	struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 9d59bbacd4e3..79b05b88aa19 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -769,7 +769,12 @@ void __init uv_system_init(void)
 	for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
 		uv_possible_blades +=
 		  hweight64(uv_read_local_mmr( UVH_NODE_PRESENT_TABLE + i * 8));
-	printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
+
+	/* uv_num_possible_blades() is really the hub count */
+	printk(KERN_INFO "UV: Found %d blades, %d hubs\n",
+			is_uv1_hub() ? uv_num_possible_blades() :
+			(uv_num_possible_blades() + 1) / 2,
+			uv_num_possible_blades());
 
 	bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
 	uv_blade_info = kzalloc(bytes, GFP_KERNEL);
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index 4b5ba85eb5c9..845df6835f9f 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -75,9 +75,9 @@ static unsigned long mmap_rnd(void)
 	*/
 	if (current->flags & PF_RANDOMIZE) {
 		if (mmap_is_ia32())
-			rnd = (long)get_random_int() % (1<<8);
+			rnd = get_random_int() % (1<<8);
 		else
-			rnd = (long)(get_random_int() % (1<<28));
+			rnd = get_random_int() % (1<<28);
 	}
 	return rnd << PAGE_SHIFT;
 }
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
index 81dbfdeb080d..7efd0c615d58 100644
--- a/arch/x86/mm/srat.c
+++ b/arch/x86/mm/srat.c
@@ -104,6 +104,8 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
 	if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
 		return;
 	pxm = pa->proximity_domain_lo;
+	if (acpi_srat_revision >= 2)
+		pxm |= *((unsigned int*)pa->proximity_domain_hi) << 8;
 	node = setup_node(pxm);
 	if (node < 0) {
 		printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
@@ -155,6 +157,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
 	start = ma->base_address;
 	end = start + ma->length;
 	pxm = ma->proximity_domain;
+	if (acpi_srat_revision <= 1)
+		pxm &= 0xff;
 	node = setup_node(pxm);
 	if (node < 0) {
 		printk(KERN_ERR "SRAT: Too many proximity domains.\n");
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index 6b8759f7634e..d24d3da72926 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -18,8 +18,9 @@ obj-$(CONFIG_X86_NUMAQ)		+= numaq_32.o
 obj-$(CONFIG_X86_MRST)		+= mrst.o
 
 obj-y				+= common.o early.o
-obj-y				+= amd_bus.o bus_numa.o
+obj-y				+= bus_numa.o
 
+obj-$(CONFIG_AMD_NB)		+= amd_bus.o
 obj-$(CONFIG_PCI_CNB20LE_QUIRK)	+= broadcom_bus.o
 
 ifeq ($(CONFIG_PCI_DEBUG),y)
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 404f21a3ff9e..f8348ab10324 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -149,7 +149,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
 	struct acpi_resource_address64 addr;
 	acpi_status status;
 	unsigned long flags;
-	u64 start, end;
+	u64 start, orig_end, end;
 
 	status = resource_to_addr(acpi_res, &addr);
 	if (!ACPI_SUCCESS(status))
@@ -165,7 +165,21 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
 		return AE_OK;
 
 	start = addr.minimum + addr.translation_offset;
-	end = addr.maximum + addr.translation_offset;
+	orig_end = end = addr.maximum + addr.translation_offset;
+
+	/* Exclude non-addressable range or non-addressable portion of range */
+	end = min(end, (u64)iomem_resource.end);
+	if (end <= start) {
+		dev_info(&info->bridge->dev,
+			"host bridge window [%#llx-%#llx] "
+			"(ignored, not CPU addressable)\n", start, orig_end);
+		return AE_OK;
+	} else if (orig_end != end) {
+		dev_info(&info->bridge->dev,
+			"host bridge window [%#llx-%#llx] "
+			"([%#llx-%#llx] ignored, not CPU addressable)\n",
+			start, orig_end, end + 1, orig_end);
+	}
 
 	res = &info->res[info->res_num];
 	res->name = info->name;
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 026e4931d162..385a940b5422 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -30,34 +30,6 @@ static struct pci_hostbridge_probe pci_probes[] __initdata = {
 	{ 0, 0x18, PCI_VENDOR_ID_AMD, 0x1300 },
 };
 
-static u64 __initdata fam10h_mmconf_start;
-static u64 __initdata fam10h_mmconf_end;
-static void __init get_pci_mmcfg_amd_fam10h_range(void)
-{
-	u32 address;
-	u64 base, msr;
-	unsigned segn_busn_bits;
-
-	/* assume all cpus from fam10h have mmconf */
-        if (boot_cpu_data.x86 < 0x10)
-		return;
-
-	address = MSR_FAM10H_MMIO_CONF_BASE;
-	rdmsrl(address, msr);
-
-	/* mmconfig is not enable */
-	if (!(msr & FAM10H_MMIO_CONF_ENABLE))
-		return;
-
-	base = msr & (FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT);
-
-	segn_busn_bits = (msr >> FAM10H_MMIO_CONF_BUSRANGE_SHIFT) &
-			 FAM10H_MMIO_CONF_BUSRANGE_MASK;
-
-	fam10h_mmconf_start = base;
-	fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
-}
-
 #define RANGE_NUM 16
 
 /**
@@ -85,6 +57,9 @@ static int __init early_fill_mp_bus_info(void)
 	u64 val;
 	u32 address;
 	bool found;
+	struct resource fam10h_mmconf_res, *fam10h_mmconf;
+	u64 fam10h_mmconf_start;
+	u64 fam10h_mmconf_end;
 
 	if (!early_pci_allowed())
 		return -1;
@@ -211,12 +186,17 @@ static int __init early_fill_mp_bus_info(void)
 		subtract_range(range, RANGE_NUM, 0, end);
 
 	/* get mmconfig */
-	get_pci_mmcfg_amd_fam10h_range();
+	fam10h_mmconf = amd_get_mmconfig_range(&fam10h_mmconf_res);
 	/* need to take out mmconf range */
-	if (fam10h_mmconf_end) {
-		printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end);
+	if (fam10h_mmconf) {
+		printk(KERN_DEBUG "Fam 10h mmconf %pR\n", fam10h_mmconf);
+		fam10h_mmconf_start = fam10h_mmconf->start;
+		fam10h_mmconf_end = fam10h_mmconf->end;
 		subtract_range(range, RANGE_NUM, fam10h_mmconf_start,
 				 fam10h_mmconf_end + 1);
+	} else {
+		fam10h_mmconf_start = 0;
+		fam10h_mmconf_end = 0;
 	}
 
 	/* mmio resource */
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index 5b552198f774..9010ca715c03 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -157,13 +157,14 @@ static int __init uvhub_to_first_apicid(int uvhub)
  * clear of the Timeout bit (as well) will free the resource. No reply will
  * be sent (the hardware will only do one reply per message).
  */
-static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp)
+static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp,
+						int do_acknowledge)
 {
 	unsigned long dw;
 	struct bau_pq_entry *msg;
 
 	msg = mdp->msg;
-	if (!msg->canceled) {
+	if (!msg->canceled && do_acknowledge) {
 		dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
 		write_mmr_sw_ack(dw);
 	}
@@ -212,8 +213,8 @@ static void bau_process_retry_msg(struct msg_desc *mdp,
 			if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
 				unsigned long mr;
 				/*
-				 * is the resource timed out?
-				 * make everyone ignore the cancelled message.
+				 * Is the resource timed out?
+				 * Make everyone ignore the cancelled message.
 				 */
 				msg2->canceled = 1;
 				stat->d_canceled++;
@@ -231,8 +232,8 @@ static void bau_process_retry_msg(struct msg_desc *mdp,
  * Do all the things a cpu should do for a TLB shootdown message.
  * Other cpu's may come here at the same time for this message.
  */
-static void bau_process_message(struct msg_desc *mdp,
-					struct bau_control *bcp)
+static void bau_process_message(struct msg_desc *mdp, struct bau_control *bcp,
+						int do_acknowledge)
 {
 	short socket_ack_count = 0;
 	short *sp;
@@ -284,8 +285,9 @@ static void bau_process_message(struct msg_desc *mdp,
 		if (msg_ack_count == bcp->cpus_in_uvhub) {
 			/*
 			 * All cpus in uvhub saw it; reply
+			 * (unless we are in the UV2 workaround)
 			 */
-			reply_to_message(mdp, bcp);
+			reply_to_message(mdp, bcp, do_acknowledge);
 		}
 	}
 
@@ -491,27 +493,138 @@ static int uv1_wait_completion(struct bau_desc *bau_desc,
 /*
  * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register.
  */
-static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu)
+static unsigned long uv2_read_status(unsigned long offset, int rshft, int desc)
 {
 	unsigned long descriptor_status;
 	unsigned long descriptor_status2;
 
 	descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK);
-	descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL;
+	descriptor_status2 = (read_mmr_uv2_status() >> desc) & 0x1UL;
 	descriptor_status = (descriptor_status << 1) | descriptor_status2;
 	return descriptor_status;
 }
 
+/*
+ * Return whether the status of the descriptor that is normally used for this
+ * cpu (the one indexed by its hub-relative cpu number) is busy.
+ * The status of the original 32 descriptors is always reflected in the 64
+ * bits of UVH_LB_BAU_SB_ACTIVATION_STATUS_0.
+ * The bit provided by the activation_status_2 register is irrelevant to
+ * the status if it is only being tested for busy or not busy.
+ */
+int normal_busy(struct bau_control *bcp)
+{
+	int cpu = bcp->uvhub_cpu;
+	int mmr_offset;
+	int right_shift;
+
+	mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
+	right_shift = cpu * UV_ACT_STATUS_SIZE;
+	return (((((read_lmmr(mmr_offset) >> right_shift) &
+				UV_ACT_STATUS_MASK)) << 1) == UV2H_DESC_BUSY);
+}
+
+/*
+ * Entered when a bau descriptor has gone into a permanent busy wait because
+ * of a hardware bug.
+ * Workaround the bug.
+ */
+int handle_uv2_busy(struct bau_control *bcp)
+{
+	int busy_one = bcp->using_desc;
+	int normal = bcp->uvhub_cpu;
+	int selected = -1;
+	int i;
+	unsigned long descriptor_status;
+	unsigned long status;
+	int mmr_offset;
+	struct bau_desc *bau_desc_old;
+	struct bau_desc *bau_desc_new;
+	struct bau_control *hmaster = bcp->uvhub_master;
+	struct ptc_stats *stat = bcp->statp;
+	cycles_t ttm;
+
+	stat->s_uv2_wars++;
+	spin_lock(&hmaster->uvhub_lock);
+	/* try for the original first */
+	if (busy_one != normal) {
+		if (!normal_busy(bcp))
+			selected = normal;
+	}
+	if (selected < 0) {
+		/* can't use the normal, select an alternate */
+		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
+		descriptor_status = read_lmmr(mmr_offset);
+
+		/* scan available descriptors 32-63 */
+		for (i = 0; i < UV_CPUS_PER_AS; i++) {
+			if ((hmaster->inuse_map & (1 << i)) == 0) {
+				status = ((descriptor_status >>
+						(i * UV_ACT_STATUS_SIZE)) &
+						UV_ACT_STATUS_MASK) << 1;
+				if (status != UV2H_DESC_BUSY) {
+					selected = i + UV_CPUS_PER_AS;
+					break;
+				}
+			}
+		}
+	}
+
+	if (busy_one != normal)
+		/* mark the busy alternate as not in-use */
+		hmaster->inuse_map &= ~(1 << (busy_one - UV_CPUS_PER_AS));
+
+	if (selected >= 0) {
+		/* switch to the selected descriptor */
+		if (selected != normal) {
+			/* set the selected alternate as in-use */
+			hmaster->inuse_map |=
+					(1 << (selected - UV_CPUS_PER_AS));
+			if (selected > stat->s_uv2_wars_hw)
+				stat->s_uv2_wars_hw = selected;
+		}
+		bau_desc_old = bcp->descriptor_base;
+		bau_desc_old += (ITEMS_PER_DESC * busy_one);
+		bcp->using_desc = selected;
+		bau_desc_new = bcp->descriptor_base;
+		bau_desc_new += (ITEMS_PER_DESC * selected);
+		*bau_desc_new = *bau_desc_old;
+	} else {
+		/*
+		 * All are busy. Wait for the normal one for this cpu to
+		 * free up.
+		 */
+		stat->s_uv2_war_waits++;
+		spin_unlock(&hmaster->uvhub_lock);
+		ttm = get_cycles();
+		do {
+			cpu_relax();
+		} while (normal_busy(bcp));
+		spin_lock(&hmaster->uvhub_lock);
+		/* switch to the original descriptor */
+		bcp->using_desc = normal;
+		bau_desc_old = bcp->descriptor_base;
+		bau_desc_old += (ITEMS_PER_DESC * bcp->using_desc);
+		bcp->using_desc = (ITEMS_PER_DESC * normal);
+		bau_desc_new = bcp->descriptor_base;
+		bau_desc_new += (ITEMS_PER_DESC * normal);
+		*bau_desc_new = *bau_desc_old; /* copy the entire descriptor */
+	}
+	spin_unlock(&hmaster->uvhub_lock);
+	return FLUSH_RETRY_BUSYBUG;
+}
+
 static int uv2_wait_completion(struct bau_desc *bau_desc,
 				unsigned long mmr_offset, int right_shift,
 				struct bau_control *bcp, long try)
 {
 	unsigned long descriptor_stat;
 	cycles_t ttm;
-	int cpu = bcp->uvhub_cpu;
+	int desc = bcp->using_desc;
+	long busy_reps = 0;
 	struct ptc_stats *stat = bcp->statp;
 
-	descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
+	descriptor_stat = uv2_read_status(mmr_offset, right_shift, desc);
 
 	/* spin on the status MMR, waiting for it to go idle */
 	while (descriptor_stat != UV2H_DESC_IDLE) {
@@ -542,12 +655,23 @@ static int uv2_wait_completion(struct bau_desc *bau_desc,
 			bcp->conseccompletes = 0;
 			return FLUSH_RETRY_TIMEOUT;
 		} else {
+			busy_reps++;
+			if (busy_reps > 1000000) {
+				/* not to hammer on the clock */
+				busy_reps = 0;
+				ttm = get_cycles();
+				if ((ttm - bcp->send_message) >
+					(bcp->clocks_per_100_usec)) {
+					return handle_uv2_busy(bcp);
+				}
+			}
 			/*
 			 * descriptor_stat is still BUSY
 			 */
 			cpu_relax();
 		}
-		descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
+		descriptor_stat = uv2_read_status(mmr_offset, right_shift,
+									desc);
 	}
 	bcp->conseccompletes++;
 	return FLUSH_COMPLETE;
@@ -563,17 +687,17 @@ static int wait_completion(struct bau_desc *bau_desc,
 {
 	int right_shift;
 	unsigned long mmr_offset;
-	int cpu = bcp->uvhub_cpu;
+	int desc = bcp->using_desc;
 
-	if (cpu < UV_CPUS_PER_AS) {
+	if (desc < UV_CPUS_PER_AS) {
 		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
-		right_shift = cpu * UV_ACT_STATUS_SIZE;
+		right_shift = desc * UV_ACT_STATUS_SIZE;
 	} else {
 		mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
-		right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
+		right_shift = ((desc - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
 	}
 
-	if (is_uv1_hub())
+	if (bcp->uvhub_version == 1)
 		return uv1_wait_completion(bau_desc, mmr_offset, right_shift,
 								bcp, try);
 	else
@@ -752,19 +876,22 @@ static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
  * Returns 1 if it gives up entirely and the original cpu mask is to be
  * returned to the kernel.
  */
-int uv_flush_send_and_wait(struct bau_desc *bau_desc,
-			struct cpumask *flush_mask, struct bau_control *bcp)
+int uv_flush_send_and_wait(struct cpumask *flush_mask, struct bau_control *bcp)
 {
 	int seq_number = 0;
 	int completion_stat = 0;
+	int uv1 = 0;
 	long try = 0;
 	unsigned long index;
 	cycles_t time1;
 	cycles_t time2;
 	struct ptc_stats *stat = bcp->statp;
 	struct bau_control *hmaster = bcp->uvhub_master;
+	struct uv1_bau_msg_header *uv1_hdr = NULL;
+	struct uv2_bau_msg_header *uv2_hdr = NULL;
+	struct bau_desc *bau_desc;
 
-	if (is_uv1_hub())
+	if (bcp->uvhub_version == 1)
 		uv1_throttle(hmaster, stat);
 
 	while (hmaster->uvhub_quiesce)
@@ -772,22 +899,39 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
 
 	time1 = get_cycles();
 	do {
-		if (try == 0) {
-			bau_desc->header.msg_type = MSG_REGULAR;
+		bau_desc = bcp->descriptor_base;
+		bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
+		if (bcp->uvhub_version == 1) {
+			uv1 = 1;
+			uv1_hdr = &bau_desc->header.uv1_hdr;
+		} else
+			uv2_hdr = &bau_desc->header.uv2_hdr;
+		if ((try == 0) || (completion_stat == FLUSH_RETRY_BUSYBUG)) {
+			if (uv1)
+				uv1_hdr->msg_type = MSG_REGULAR;
+			else
+				uv2_hdr->msg_type = MSG_REGULAR;
 			seq_number = bcp->message_number++;
 		} else {
-			bau_desc->header.msg_type = MSG_RETRY;
+			if (uv1)
+				uv1_hdr->msg_type = MSG_RETRY;
+			else
+				uv2_hdr->msg_type = MSG_RETRY;
 			stat->s_retry_messages++;
 		}
 
-		bau_desc->header.sequence = seq_number;
-		index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
+		if (uv1)
+			uv1_hdr->sequence = seq_number;
+		else
+			uv2_hdr->sequence = seq_number;
+		index = (1UL << AS_PUSH_SHIFT) | bcp->using_desc;
 		bcp->send_message = get_cycles();
 
 		write_mmr_activation(index);
 
 		try++;
 		completion_stat = wait_completion(bau_desc, bcp, try);
+		/* UV2: wait_completion() may change the bcp->using_desc */
 
 		handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
 
@@ -798,6 +942,7 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
 		}
 		cpu_relax();
 	} while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
+		 (completion_stat == FLUSH_RETRY_BUSYBUG) ||
 		 (completion_stat == FLUSH_RETRY_TIMEOUT));
 
 	time2 = get_cycles();
@@ -812,6 +957,7 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
 	record_send_stats(time1, time2, bcp, stat, completion_stat, try);
 
 	if (completion_stat == FLUSH_GIVEUP)
+		/* FLUSH_GIVEUP will fall back to using IPI's for tlb flush */
 		return 1;
 	return 0;
 }
@@ -967,7 +1113,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 		stat->s_ntargself++;
 
 	bau_desc = bcp->descriptor_base;
-	bau_desc += ITEMS_PER_DESC * bcp->uvhub_cpu;
+	bau_desc += (ITEMS_PER_DESC * bcp->using_desc);
 	bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
 	if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
 		return NULL;
@@ -980,13 +1126,86 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
 	 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
 	 * or 1 if it gave up and the original cpumask should be returned.
 	 */
-	if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
+	if (!uv_flush_send_and_wait(flush_mask, bcp))
 		return NULL;
 	else
 		return cpumask;
 }
 
 /*
+ * Search the message queue for any 'other' message with the same software
+ * acknowledge resource bit vector.
+ */
+struct bau_pq_entry *find_another_by_swack(struct bau_pq_entry *msg,
+			struct bau_control *bcp, unsigned char swack_vec)
+{
+	struct bau_pq_entry *msg_next = msg + 1;
+
+	if (msg_next > bcp->queue_last)
+		msg_next = bcp->queue_first;
+	while ((msg_next->swack_vec != 0) && (msg_next != msg)) {
+		if (msg_next->swack_vec == swack_vec)
+			return msg_next;
+		msg_next++;
+		if (msg_next > bcp->queue_last)
+			msg_next = bcp->queue_first;
+	}
+	return NULL;
+}
+
+/*
+ * UV2 needs to work around a bug in which an arriving message has not
+ * set a bit in the UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE register.
+ * Such a message must be ignored.
+ */
+void process_uv2_message(struct msg_desc *mdp, struct bau_control *bcp)
+{
+	unsigned long mmr_image;
+	unsigned char swack_vec;
+	struct bau_pq_entry *msg = mdp->msg;
+	struct bau_pq_entry *other_msg;
+
+	mmr_image = read_mmr_sw_ack();
+	swack_vec = msg->swack_vec;
+
+	if ((swack_vec & mmr_image) == 0) {
+		/*
+		 * This message was assigned a swack resource, but no
+		 * reserved acknowlegment is pending.
+		 * The bug has prevented this message from setting the MMR.
+		 * And no other message has used the same sw_ack resource.
+		 * Do the requested shootdown but do not reply to the msg.
+		 * (the 0 means make no acknowledge)
+		 */
+		bau_process_message(mdp, bcp, 0);
+		return;
+	}
+
+	/*
+	 * Some message has set the MMR 'pending' bit; it might have been
+	 * another message.  Look for that message.
+	 */
+	other_msg = find_another_by_swack(msg, bcp, msg->swack_vec);
+	if (other_msg) {
+		/* There is another.  Do not ack the current one. */
+		bau_process_message(mdp, bcp, 0);
+		/*
+		 * Let the natural processing of that message acknowledge
+		 * it. Don't get the processing of sw_ack's out of order.
+		 */
+		return;
+	}
+
+	/*
+	 * There is no other message using this sw_ack, so it is safe to
+	 * acknowledge it.
+	 */
+	bau_process_message(mdp, bcp, 1);
+
+	return;
+}
+
+/*
  * The BAU message interrupt comes here. (registered by set_intr_gate)
  * See entry_64.S
  *
@@ -1022,9 +1241,11 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
 		count++;
 
 		msgdesc.msg_slot = msg - msgdesc.queue_first;
-		msgdesc.swack_slot = ffs(msg->swack_vec) - 1;
 		msgdesc.msg = msg;
-		bau_process_message(&msgdesc, bcp);
+		if (bcp->uvhub_version == 2)
+			process_uv2_message(&msgdesc, bcp);
+		else
+			bau_process_message(&msgdesc, bcp, 1);
 
 		msg++;
 		if (msg > msgdesc.queue_last)
@@ -1083,7 +1304,7 @@ static void __init enable_timeouts(void)
 		 */
 		mmr_image |= (1L << SOFTACK_MSHIFT);
 		if (is_uv2_hub()) {
-			mmr_image |= (1L << UV2_LEG_SHFT);
+			mmr_image &= ~(1L << UV2_LEG_SHFT);
 			mmr_image |= (1L << UV2_EXT_SHFT);
 		}
 		write_mmr_misc_control(pnode, mmr_image);
@@ -1142,7 +1363,7 @@ static int ptc_seq_show(struct seq_file *file, void *data)
 		seq_printf(file,
 			"all one mult none retry canc nocan reset rcan ");
 		seq_printf(file,
-			"disable enable\n");
+			"disable enable wars warshw warwaits\n");
 	}
 	if (cpu < num_possible_cpus() && cpu_online(cpu)) {
 		stat = &per_cpu(ptcstats, cpu);
@@ -1173,8 +1394,10 @@ static int ptc_seq_show(struct seq_file *file, void *data)
 			   stat->d_nomsg, stat->d_retries, stat->d_canceled,
 			   stat->d_nocanceled, stat->d_resets,
 			   stat->d_rcanceled);
-		seq_printf(file, "%ld %ld\n",
-			stat->s_bau_disabled, stat->s_bau_reenabled);
+		seq_printf(file, "%ld %ld %ld %ld %ld\n",
+			stat->s_bau_disabled, stat->s_bau_reenabled,
+			stat->s_uv2_wars, stat->s_uv2_wars_hw,
+			stat->s_uv2_war_waits);
 	}
 	return 0;
 }
@@ -1432,12 +1655,15 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
 {
 	int i;
 	int cpu;
+	int uv1 = 0;
 	unsigned long gpa;
 	unsigned long m;
 	unsigned long n;
 	size_t dsize;
 	struct bau_desc *bau_desc;
 	struct bau_desc *bd2;
+	struct uv1_bau_msg_header *uv1_hdr;
+	struct uv2_bau_msg_header *uv2_hdr;
 	struct bau_control *bcp;
 
 	/*
@@ -1451,6 +1677,8 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
 	gpa = uv_gpa(bau_desc);
 	n = uv_gpa_to_gnode(gpa);
 	m = uv_gpa_to_offset(gpa);
+	if (is_uv1_hub())
+		uv1 = 1;
 
 	/* the 14-bit pnode */
 	write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m));
@@ -1461,21 +1689,33 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode)
 	 */
 	for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) {
 		memset(bd2, 0, sizeof(struct bau_desc));
-		bd2->header.swack_flag =	1;
-		/*
-		 * The base_dest_nasid set in the message header is the nasid
-		 * of the first uvhub in the partition. The bit map will
-		 * indicate destination pnode numbers relative to that base.
-		 * They may not be consecutive if nasid striding is being used.
-		 */
-		bd2->header.base_dest_nasid =	UV_PNODE_TO_NASID(base_pnode);
-		bd2->header.dest_subnodeid =	UV_LB_SUBNODEID;
-		bd2->header.command =		UV_NET_ENDPOINT_INTD;
-		bd2->header.int_both =		1;
-		/*
-		 * all others need to be set to zero:
-		 *   fairness chaining multilevel count replied_to
-		 */
+		if (uv1) {
+			uv1_hdr = &bd2->header.uv1_hdr;
+			uv1_hdr->swack_flag =	1;
+			/*
+			 * The base_dest_nasid set in the message header
+			 * is the nasid of the first uvhub in the partition.
+			 * The bit map will indicate destination pnode numbers
+			 * relative to that base. They may not be consecutive
+			 * if nasid striding is being used.
+			 */
+			uv1_hdr->base_dest_nasid =
+						UV_PNODE_TO_NASID(base_pnode);
+			uv1_hdr->dest_subnodeid =	UV_LB_SUBNODEID;
+			uv1_hdr->command =		UV_NET_ENDPOINT_INTD;
+			uv1_hdr->int_both =		1;
+			/*
+			 * all others need to be set to zero:
+			 *   fairness chaining multilevel count replied_to
+			 */
+		} else {
+			uv2_hdr = &bd2->header.uv2_hdr;
+			uv2_hdr->swack_flag =	1;
+			uv2_hdr->base_dest_nasid =
+						UV_PNODE_TO_NASID(base_pnode);
+			uv2_hdr->dest_subnodeid =	UV_LB_SUBNODEID;
+			uv2_hdr->command =		UV_NET_ENDPOINT_INTD;
+		}
 	}
 	for_each_present_cpu(cpu) {
 		if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
@@ -1531,6 +1771,7 @@ static void pq_init(int node, int pnode)
 	write_mmr_payload_first(pnode, pn_first);
 	write_mmr_payload_tail(pnode, first);
 	write_mmr_payload_last(pnode, last);
+	write_gmmr_sw_ack(pnode, 0xffffUL);
 
 	/* in effect, all msg_type's are set to MSG_NOOP */
 	memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);
@@ -1584,14 +1825,14 @@ static int calculate_destination_timeout(void)
 		ts_ns = base * mult1 * mult2;
 		ret = ts_ns / 1000;
 	} else {
-		/* 4 bits  0/1 for 10/80us, 3 bits of multiplier */
-		mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
+		/* 4 bits  0/1 for 10/80us base, 3 bits of multiplier */
+		mmr_image = uv_read_local_mmr(UVH_LB_BAU_MISC_CONTROL);
 		mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT;
 		if (mmr_image & (1L << UV2_ACK_UNITS_SHFT))
-			mult1 = 80;
+			base = 80;
 		else
-			mult1 = 10;
-		base = mmr_image & UV2_ACK_MASK;
+			base = 10;
+		mult1 = mmr_image & UV2_ACK_MASK;
 		ret = mult1 * base;
 	}
 	return ret;
@@ -1618,6 +1859,7 @@ static void __init init_per_cpu_tunables(void)
 		bcp->cong_response_us		= congested_respns_us;
 		bcp->cong_reps			= congested_reps;
 		bcp->cong_period		= congested_period;
+		bcp->clocks_per_100_usec =	usec_2_cycles(100);
 	}
 }
 
@@ -1728,8 +1970,17 @@ static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
 		bcp->cpus_in_socket = sdp->num_cpus;
 		bcp->socket_master = *smasterp;
 		bcp->uvhub = bdp->uvhub;
+		if (is_uv1_hub())
+			bcp->uvhub_version = 1;
+		else if (is_uv2_hub())
+			bcp->uvhub_version = 2;
+		else {
+			printk(KERN_EMERG "uvhub version not 1 or 2\n");
+			return 1;
+		}
 		bcp->uvhub_master = *hmasterp;
 		bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
+		bcp->using_desc = bcp->uvhub_cpu;
 		if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
 			printk(KERN_EMERG "%d cpus per uvhub invalid\n",
 				bcp->uvhub_cpu);
@@ -1845,6 +2096,8 @@ static int __init uv_bau_init(void)
 			uv_base_pnode = uv_blade_to_pnode(uvhub);
 	}
 
+	enable_timeouts();
+
 	if (init_per_cpu(nuvhubs, uv_base_pnode)) {
 		nobau = 1;
 		return 0;
@@ -1855,7 +2108,6 @@ static int __init uv_bau_init(void)
 		if (uv_blade_nr_possible_cpus(uvhub))
 			init_uvhub(uvhub, vector, uv_base_pnode);
 
-	enable_timeouts();
 	alloc_intr_gate(vector, uv_bau_message_intr1);
 
 	for_each_possible_blade(uvhub) {
@@ -1867,7 +2119,8 @@ static int __init uv_bau_init(void)
 			val = 1L << 63;
 			write_gmmr_activation(pnode, val);
 			mmr = 1; /* should be 1 to broadcast to both sockets */
-			write_mmr_data_broadcast(pnode, mmr);
+			if (!is_uv1_hub())
+				write_mmr_data_broadcast(pnode, mmr);
 		}
 	}