author    Jason Liu <r64343@freescale.com>  2013-02-25 19:55:04 +0800
committer Jason Liu <r64343@freescale.com>  2013-03-01 17:17:44 +0800
commit    e83a0cfe4c5c2ab2045bc6d93b34323faafdcc4c
tree      1d45d4839d7be12fc9c11b5fa209d3b172aef6dd /kernel
parent    751d8fb681ec42f803283861db245a57233a9b07
timer: fix too many retries on the per-cpu event device
There are a large number of retries on the per-cpu event device when
running 'cat /proc/timer_list', as shown below:

    root@~$ cat /proc/timer_list
    Timer List Version: v0.6
    HRTIMER_MAX_CLOCK_BASES: 3
    now at 3297691988044 nsecs

    Tick Device: mode:     1
    Per CPU device: 0
    Clock Event Device: local_timer
     max_delta_ns:   8624432320
     min_delta_ns:   1000
     mult:           2138893713
     shift:          32
     mode:           3
     next_event:     3297700000000 nsecs
     set_next_event: twd_set_next_event
     set_mode:       twd_set_mode
     event_handler:  hrtimer_interrupt
     retries:        36383

The reason is that the local timer stops when the CPU enters the C3
state, so we need to switch from the local timer to the broadcast (bc)
timer when entering that state and switch back when exiting from it.
The code looks like this:

    void arch_idle(void)
    {
        ....
        clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);

        enter_the_wait_mode();

        clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
    }

When the broadcast timer interrupt arrives (this interrupt just wakes
up the ARM core, which has no chance to handle it since local irqs are
disabled; in fact they are disabled in cpu_idle() in
arch/arm/kernel/process.c), it wakes up the CPU, which then runs:

    clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
    -> tick_broadcast_oneshot_control(...);
       -> tick_program_event(dev->next_event, 1);
          -> tick_dev_program_event(dev, expires, force);
             -> for (i = 0;;) {
                    int ret = clockevents_program_event(dev, expires, now);
                    if (!ret || !force)
                        return ret;
                    dev->retries++;
                    ....
                    now = ktime_get();
                    expires = ktime_add_ns(now, dev->min_delta_ns);
                }

with clockevents_program_event(dev, expires, now) doing:

    delta = ktime_to_ns(ktime_sub(expires, now));
    if (delta <= 0)
        return -ETIME;

When the bc timer interrupt arrives, the last local timer has expired
as well, so clockevents_program_event() returns -ETIME, and
dev->retries is bumped on every retry to program the already expired
timer.

Worse, if after reprogramming the expired timer the CPU re-enters idle
quickly, before the reprogrammed timer expires, the system will
ping-pong forever if no other interrupt arrives.

We found this ping-pong issue during video playback testing: the
system freezes and the video stops playing for some time until another
interrupt occurs and breaks the error condition.

For details, refer to the LKML thread posted by Jason Liu:
https://lkml.org/lkml/2013/2/20/216

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Jason Liu <r64343@freescale.com>
Tested-by: Jason Liu <r64343@freescale.com>
Tested-by: Santosh Shilimkar <santosh.shilimkar@ti.com>
Tested-by: Lorenzo Pieralisi <lorenzo.pieralisi@arm.com>
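[Editor's note] The patch below also adds tick_check_broadcast_pending()
for platform idle code to call before going idle with interrupts
disabled; its caller is not part of this diff. A minimal sketch of how
an idle hook could use it, assuming an arch_idle() like the one quoted
above (enter_the_wait_mode() stands in for the platform's low-power
entry and is hypothetical here):

    void arch_idle(void)
    {
        int cpu = smp_processor_id();

        /*
         * A broadcast wakeup for this cpu is already pending: return
         * and poll for the IPI instead of re-entering the C3 state we
         * would immediately be woken from again.
         */
        if (tick_check_broadcast_pending())
            return;

        clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
        enter_the_wait_mode();
        clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
    }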
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/time/tick-broadcast.c  87
1 file changed, 83 insertions(+), 4 deletions(-)
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 7a90d021b79a..bc361151e22f 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -360,6 +360,8 @@ int tick_resume_broadcast(void)
/* FIXME: use cpumask_var_t. */
static DECLARE_BITMAP(tick_broadcast_oneshot_mask, NR_CPUS);
+static DECLARE_BITMAP(tick_broadcast_pending, NR_CPUS);
+static DECLARE_BITMAP(tick_force_broadcast_mask, NR_CPUS);
/*
* Exposed for debugging: see timer_list.c
@@ -376,6 +378,15 @@ static int tick_broadcast_set_event(ktime_t expires, int force)
return tick_dev_program_event(bc, expires, force);
}
+/*
+ * Called before going idle with interrupts disabled. Checks whether a
+ * broadcast event from the other core is about to happen.
+ */
+int tick_check_broadcast_pending(void)
+{
+ return test_bit(smp_processor_id(), tick_force_broadcast_mask);
+}
+
int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
{
clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -413,12 +424,24 @@ again:
/* Find all expired events */
for_each_cpu(cpu, tick_get_broadcast_oneshot_mask()) {
td = &per_cpu(tick_cpu_device, cpu);
- if (td->evtdev->next_event.tv64 <= now.tv64)
+ if (td->evtdev->next_event.tv64 <= now.tv64) {
cpumask_set_cpu(cpu, to_cpumask(tmpmask));
- else if (td->evtdev->next_event.tv64 < next_event.tv64)
+ /*
+ * Mark the remote cpu in the pending mask, so
+ * it can avoid reprogramming the cpu local
+ * timer in tick_broadcast_oneshot_control().
+ */
+ set_bit(cpu, tick_broadcast_pending);
+ } else if (td->evtdev->next_event.tv64 < next_event.tv64)
next_event.tv64 = td->evtdev->next_event.tv64;
}
+ /* Take care of enforced broadcast requests */
+ for_each_cpu(cpu, to_cpumask(tick_force_broadcast_mask)) {
+ set_bit(cpu, tmpmask);
+ clear_bit(cpu, tick_force_broadcast_mask);
+ }
+
/*
* Wakeup the cpus which have an expired event.
*/
@@ -454,6 +477,7 @@ void tick_broadcast_oneshot_control(unsigned long reason)
struct clock_event_device *bc, *dev;
struct tick_device *td;
unsigned long flags;
+ ktime_t now;
int cpu;
/*
@@ -478,6 +502,8 @@ void tick_broadcast_oneshot_control(unsigned long reason)
raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
+ WARN_ON_ONCE(test_bit(cpu, tick_broadcast_pending));
+ WARN_ON_ONCE(test_bit(cpu, tick_force_broadcast_mask));
if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
@@ -489,10 +515,63 @@ void tick_broadcast_oneshot_control(unsigned long reason)
cpumask_clear_cpu(cpu,
tick_get_broadcast_oneshot_mask());
clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT);
- if (dev->next_event.tv64 != KTIME_MAX)
- tick_program_event(dev->next_event, 1);
+ if (dev->next_event.tv64 == KTIME_MAX)
+ goto out;
+ /*
+ * The cpu handling the broadcast timer marked
+ * this cpu in the broadcast pending mask and
+ * fired the broadcast IPI. So we are going to
+ * handle the expired event anyway via the
+ * broadcast IPI handler. No need to reprogram
+ * the timer with an already expired event.
+ */
+ if (test_and_clear_bit(cpu, tick_broadcast_pending))
+ goto out;
+ /*
+ * If the pending bit is not set, then we are
+ * either the CPU handling the broadcast
+ * interrupt or we got woken by something else.
+ *
+ * We are no longer in the broadcast mask, so
+ * if the cpu local expiry time is already
+ * reached, we would reprogram the cpu local
+ * timer with an already expired event.
+ *
+ * This can lead to a ping-pong when we return
+ * to idle and therefore rearm the broadcast
+ * timer before the cpu local timer was able
+ * to fire. This happens because the forced
+ * reprogramming makes sure that the event
+ * will happen in the future and depending on
+ * the min_delta setting this might be far
+ * enough out that the ping-pong starts.
+ *
+ * If the cpu local next_event has expired
+ * then we know that the broadcast timer
+ * next_event has expired as well and
+ * broadcast is about to be handled. So we
+ * avoid reprogramming and enforce that the
+ * broadcast handler, which did not run yet,
+ * will invoke the cpu local handler.
+ *
+ * We cannot call the handler directly from
+ * here, because we might be in a NOHZ phase
+ * and we did not go through the irq_enter()
+ * nohz fixups.
+ */
+ now = ktime_get();
+ if (dev->next_event.tv64 <= now.tv64) {
+ set_bit(cpu, tick_force_broadcast_mask);
+ goto out;
+ }
+ /*
+ * We got woken by something else. Reprogram
+ * the cpu local timer device.
+ */
+ tick_program_event(dev->next_event, 1);
}
}
+out:
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
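[Editor's note] With this change, broadcast exit no longer reprograms a
cpu local timer whose event has already expired: the expired case is
either absorbed via tick_broadcast_pending or deferred to the broadcast
handler via tick_force_broadcast_mask. After a comparable idle workload,
re-checking 'cat /proc/timer_list' should show the retries counter for
local_timer staying essentially flat instead of climbing into the tens
of thousands as in the report above.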