From cf9efce0ce3136fa076f53e53154e98455229514 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Thu, 26 Aug 2010 19:56:43 +0000
Subject: powerpc: Account time using timebase rather than PURR

Currently, when CONFIG_VIRT_CPU_ACCOUNTING is enabled, we use the
PURR register for measuring the user and system time used by
processes, as well as other related times such as hardirq and
softirq times.  This turns out to be quite confusing for users
because it means that a program will often be measured as taking
less time when run on a multi-threaded processor (SMT2 or SMT4 mode)
than it does when run on a single-threaded processor (ST mode), even
though the program takes longer to finish.  The discrepancy is
accounted for as stolen time, which is also confusing, particularly
when there are no other partitions running.

This changes the accounting to use the timebase instead, meaning that
the reported user and system times are the actual number of real-time
seconds that the program was executing on the processor thread,
regardless of which SMT mode the processor is in.  Thus a program will
generally show greater user and system times when run on a
multi-threaded processor than on a single-threaded processor.

On pSeries systems on POWER5 or later processors, we measure the
stolen time (time when this partition wasn't running) using the
hypervisor dispatch trace log.  We check for new entries in the
log on every entry from user mode and on every transition from
kernel process context to soft or hard IRQ context (i.e. when
account_system_vtime() gets called).  So that we can correctly
distinguish time stolen from user time and time stolen from system
time, without having to check the log on every exit to user mode,
we store separate timestamps for exit to user mode and entry from
user mode.

On systems that have a SPURR (POWER6 and POWER7), we read the SPURR
in account_system_vtime() (as before), and then apportion the SPURR
ticks since the last time we read it between scaled user time and
scaled system time according to the relative proportions of user
time and system time over the same interval.  This avoids having to
read the SPURR on every kernel entry and exit.  On systems that have
PURR but not SPURR (i.e., POWER5), we do the same using the PURR
rather than the SPURR.

This disables the DTL user interface in /sys/debug/kernel/powerpc/dtl
for now since it conflicts with the use of the dispatch trace log
by the time accounting code.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/time.c | 268 +++++++++++++++++++++------------------------
 1 file changed, 127 insertions(+), 141 deletions(-)

(limited to 'arch/powerpc/kernel/time.c')

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 8533b3b83f5d..fca20643c368 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -164,8 +164,6 @@ unsigned long ppc_proc_freq;
 EXPORT_SYMBOL(ppc_proc_freq);
 unsigned long ppc_tb_freq;
 
-static DEFINE_PER_CPU(u64, last_jiffy);
-
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 /*
  * Factors for converting from cputime_t (timebase ticks) to
@@ -200,62 +198,151 @@ static void calc_cputime_factors(void)
 }
 
 /*
- * Read the PURR on systems that have it, otherwise the timebase.
+ * Read the SPURR on systems that have it, otherwise the PURR,
+ * or if that doesn't exist return the timebase value passed in.
  */
-static u64 read_purr(void)
+static u64 read_spurr(u64 tb)
 {
+	if (cpu_has_feature(CPU_FTR_SPURR))
+		return mfspr(SPRN_SPURR);
 	if (cpu_has_feature(CPU_FTR_PURR))
 		return mfspr(SPRN_PURR);
-	return mftb();
+	return tb;
 }
 
+#ifdef CONFIG_PPC_SPLPAR
+
 /*
- * Read the SPURR on systems that have it, otherwise the purr
+ * Scan the dispatch trace log and count up the stolen time.
+ * Should be called with interrupts disabled.
  */
-static u64 read_spurr(u64 purr)
+static u64 scan_dispatch_log(u64 stop_tb)
 {
-	/*
-	 * cpus without PURR won't have a SPURR
-	 * We already know the former when we use this, so tell gcc
-	 */
-	if (cpu_has_feature(CPU_FTR_PURR) && cpu_has_feature(CPU_FTR_SPURR))
-		return mfspr(SPRN_SPURR);
-	return purr;
+	unsigned long i = local_paca->dtl_ridx;
+	struct dtl_entry *dtl = local_paca->dtl_curr;
+	struct dtl_entry *dtl_end = local_paca->dispatch_log_end;
+	struct lppaca *vpa = local_paca->lppaca_ptr;
+	u64 tb_delta;
+	u64 stolen = 0;
+	u64 dtb;
+
+	if (i == vpa->dtl_idx)
+		return 0;
+	while (i < vpa->dtl_idx) {
+		dtb = dtl->timebase;
+		tb_delta = dtl->enqueue_to_dispatch_time +
+			dtl->ready_to_enqueue_time;
+		barrier();
+		if (i + N_DISPATCH_LOG < vpa->dtl_idx) {
+			/* buffer has overflowed */
+			i = vpa->dtl_idx - N_DISPATCH_LOG;
+			dtl = local_paca->dispatch_log + (i % N_DISPATCH_LOG);
+			continue;
+		}
+		if (dtb > stop_tb)
+			break;
+		stolen += tb_delta;
+		++i;
+		++dtl;
+		if (dtl == dtl_end)
+			dtl = local_paca->dispatch_log;
+	}
+	local_paca->dtl_ridx = i;
+	local_paca->dtl_curr = dtl;
+	return stolen;
 }
 
+/*
+ * Accumulate stolen time by scanning the dispatch trace log.
+ * Called on entry from user mode.
+ */
+void accumulate_stolen_time(void)
+{
+	u64 sst, ust;
+
+	sst = scan_dispatch_log(get_paca()->starttime_user);
+	ust = scan_dispatch_log(get_paca()->starttime);
+	get_paca()->system_time -= sst;
+	get_paca()->user_time -= ust;
+	get_paca()->stolen_time += ust + sst;
+}
+
+static inline u64 calculate_stolen_time(u64 stop_tb)
+{
+	u64 stolen = 0;
+
+	if (get_paca()->dtl_ridx != get_paca()->lppaca_ptr->dtl_idx) {
+		stolen = scan_dispatch_log(stop_tb);
+		get_paca()->system_time -= stolen;
+	}
+
+	stolen += get_paca()->stolen_time;
+	get_paca()->stolen_time = 0;
+	return stolen;
+}
+
+#else /* CONFIG_PPC_SPLPAR */
+static inline u64 calculate_stolen_time(u64 stop_tb)
+{
+	return 0;
+}
+
+#endif /* CONFIG_PPC_SPLPAR */
+
 /*
  * Account time for a transition between system, hard irq
  * or soft irq state.
  */
 void account_system_vtime(struct task_struct *tsk)
 {
-	u64 now, nowscaled, delta, deltascaled, sys_time;
+	u64 now, nowscaled, delta, deltascaled;
 	unsigned long flags;
+	u64 stolen, udelta, sys_scaled, user_scaled;
 
 	local_irq_save(flags);
-	now = read_purr();
+	now = mftb();
 	nowscaled = read_spurr(now);
-	delta = now - get_paca()->startpurr;
+	get_paca()->system_time += now - get_paca()->starttime;
+	get_paca()->starttime = now;
 	deltascaled = nowscaled - get_paca()->startspurr;
-	get_paca()->startpurr = now;
 	get_paca()->startspurr = nowscaled;
-	if (!in_interrupt()) {
-		/* deltascaled includes both user and system time.
-		 * Hence scale it based on the purr ratio to estimate
-		 * the system time */
-		sys_time = get_paca()->system_time;
-		if (get_paca()->user_time)
-			deltascaled = deltascaled * sys_time /
-			     (sys_time + get_paca()->user_time);
-		delta += sys_time;
-		get_paca()->system_time = 0;
+
+	stolen = calculate_stolen_time(now);
+
+	delta = get_paca()->system_time;
+	get_paca()->system_time = 0;
+	udelta = get_paca()->user_time - get_paca()->utime_sspurr;
+	get_paca()->utime_sspurr = get_paca()->user_time;
+
+	/*
+	 * Because we don't read the SPURR on every kernel entry/exit,
+	 * deltascaled includes both user and system SPURR ticks.
+	 * Apportion these ticks to system SPURR ticks and user
+	 * SPURR ticks in the same ratio as the system time (delta)
+	 * and user time (udelta) values obtained from the timebase
+	 * over the same interval.  The system ticks get accounted here;
+	 * the user ticks get saved up in paca->user_time_scaled to be
+	 * used by account_process_tick.
+	 */
+	sys_scaled = delta;
+	user_scaled = udelta;
+	if (deltascaled != delta + udelta) {
+		if (udelta) {
+			sys_scaled = deltascaled * delta / (delta + udelta);
+			user_scaled = deltascaled - sys_scaled;
+		} else {
+			sys_scaled = deltascaled;
+		}
+	}
+	get_paca()->user_time_scaled += user_scaled;
+
+	if (in_irq() || idle_task(smp_processor_id()) != tsk) {
+		account_system_time(tsk, 0, delta, sys_scaled);
+		if (stolen)
+			account_steal_time(stolen);
+	} else {
+		account_idle_time(delta + stolen);
 	}
-	if (in_irq() || idle_task(smp_processor_id()) != tsk)
-		account_system_time(tsk, 0, delta, deltascaled);
-	else
-		account_idle_time(delta);
-	__get_cpu_var(cputime_last_delta) = delta;
-	__get_cpu_var(cputime_scaled_last_delta) = deltascaled;
 	local_irq_restore(flags);
 }
 EXPORT_SYMBOL_GPL(account_system_vtime);
@@ -265,125 +352,26 @@ EXPORT_SYMBOL_GPL(account_system_vtime);
  * by the exception entry and exit code to the generic process
  * user and system time records.
  * Must be called with interrupts disabled.
+ * Assumes that account_system_vtime() has been called recently
+ * (i.e. since the last entry from usermode) so that
+ * get_paca()->user_time_scaled is up to date.
  */
 void account_process_tick(struct task_struct *tsk, int user_tick)
 {
 	cputime_t utime, utimescaled;
 
 	utime = get_paca()->user_time;
+	utimescaled = get_paca()->user_time_scaled;
 	get_paca()->user_time = 0;
-	utimescaled = cputime_to_scaled(utime);
+	get_paca()->user_time_scaled = 0;
+	get_paca()->utime_sspurr = 0;
 	account_user_time(tsk, utime, utimescaled);
 }
 
-/*
- * Stuff for accounting stolen time.
- */
-struct cpu_purr_data {
-	int	initialized;			/* thread is running */
-	u64	tb;			/* last TB value read */
-	u64	purr;			/* last PURR value read */
-	u64	spurr;			/* last SPURR value read */
-};
-
-/*
- * Each entry in the cpu_purr_data array is manipulated only by its
- * "owner" cpu -- usually in the timer interrupt but also occasionally
- * in process context for cpu online.  As long as cpus do not touch
- * each others' cpu_purr_data, disabling local interrupts is
- * sufficient to serialize accesses.
- */
-static DEFINE_PER_CPU(struct cpu_purr_data, cpu_purr_data);
-
-static void snapshot_tb_and_purr(void *data)
-{
-	unsigned long flags;
-	struct cpu_purr_data *p = &__get_cpu_var(cpu_purr_data);
-
-	local_irq_save(flags);
-	p->tb = get_tb_or_rtc();
-	p->purr = mfspr(SPRN_PURR);
-	wmb();
-	p->initialized = 1;
-	local_irq_restore(flags);
-}
-
-/*
- * Called during boot when all cpus have come up.
- */
-void snapshot_timebases(void)
-{
-	if (!cpu_has_feature(CPU_FTR_PURR))
-		return;
-	on_each_cpu(snapshot_tb_and_purr, NULL, 1);
-}
-
-/*
- * Must be called with interrupts disabled.
- */
-void calculate_steal_time(void)
-{
-	u64 tb, purr;
-	s64 stolen;
-	struct cpu_purr_data *pme;
-
-	pme = &__get_cpu_var(cpu_purr_data);
-	if (!pme->initialized)
-		return;		/* !CPU_FTR_PURR or early in early boot */
-	tb = mftb();
-	purr = mfspr(SPRN_PURR);
-	stolen = (tb - pme->tb) - (purr - pme->purr);
-	if (stolen > 0) {
-		if (idle_task(smp_processor_id()) != current)
-			account_steal_time(stolen);
-		else
-			account_idle_time(stolen);
-	}
-	pme->tb = tb;
-	pme->purr = purr;
-}
-
-#ifdef CONFIG_PPC_SPLPAR
-/*
- * Must be called before the cpu is added to the online map when
- * a cpu is being brought up at runtime.
- */
-static void snapshot_purr(void)
-{
-	struct cpu_purr_data *pme;
-	unsigned long flags;
-
-	if (!cpu_has_feature(CPU_FTR_PURR))
-		return;
-	local_irq_save(flags);
-	pme = &__get_cpu_var(cpu_purr_data);
-	pme->tb = mftb();
-	pme->purr = mfspr(SPRN_PURR);
-	pme->initialized = 1;
-	local_irq_restore(flags);
-}
-
-#endif /* CONFIG_PPC_SPLPAR */
-
 #else /* ! CONFIG_VIRT_CPU_ACCOUNTING */
 #define calc_cputime_factors()
-#define calculate_steal_time()		do { } while (0)
 #endif
 
-#if !(defined(CONFIG_VIRT_CPU_ACCOUNTING) && defined(CONFIG_PPC_SPLPAR))
-#define snapshot_purr()			do { } while (0)
-#endif
-
-/*
- * Called when a cpu comes up after the system has finished booting,
- * i.e. as a result of a hotplug cpu action.
- */
-void snapshot_timebase(void)
-{
-	__get_cpu_var(last_jiffy) = get_tb_or_rtc();
-	snapshot_purr();
-}
-
 void __delay(unsigned long loops)
 {
 	unsigned long start;
@@ -585,8 +573,6 @@ void timer_interrupt(struct pt_regs * regs)
 	old_regs = set_irq_regs(regs);
 	irq_enter();
 
-	calculate_steal_time();
-
 	if (test_perf_event_pending()) {
 		clear_perf_event_pending();
 		perf_event_do_pending();
-- 
cgit v1.2.3


From 872e439a45ed4a4bd499bc55cb0dffa74027f749 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Tue, 31 Aug 2010 01:59:53 +0000
Subject: powerpc/pseries: Re-enable dispatch trace log userspace interface

Since the cpu accounting code uses the hypervisor dispatch trace log
now when CONFIG_VIRT_CPU_ACCOUNTING = y, the previous commit disabled
access to it via files in the /sys/kernel/debug/powerpc/dtl/ directory
in that case.  This restores those files.

To do this, we now have a hook that the cpu accounting code will call
as it processes each entry from the hypervisor dispatch trace log.
The code in dtl.c now uses that to fill up its ring buffer, rather
than having the hypervisor fill the ring buffer directly.

This also fixes dtl_file_read() to handle overflow conditions a bit
better and adds a spinlock to ensure that race conditions (multiple
processes opening or reading the file concurrently) are handled
correctly.

Signed-off-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
---
 arch/powerpc/kernel/time.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc/kernel/time.c')

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index fca20643c368..bcb738b9ff8c 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -183,6 +183,8 @@ DEFINE_PER_CPU(unsigned long, cputime_scaled_last_delta);
 
 cputime_t cputime_one_jiffy;
 
+void (*dtl_consumer)(struct dtl_entry *, u64);
+
 static void calc_cputime_factors(void)
 {
 	struct div_result res;
@@ -218,7 +220,7 @@ static u64 read_spurr(u64 tb)
  */
 static u64 scan_dispatch_log(u64 stop_tb)
 {
-	unsigned long i = local_paca->dtl_ridx;
+	u64 i = local_paca->dtl_ridx;
 	struct dtl_entry *dtl = local_paca->dtl_curr;
 	struct dtl_entry *dtl_end = local_paca->dispatch_log_end;
 	struct lppaca *vpa = local_paca->lppaca_ptr;
@@ -229,6 +231,8 @@ static u64 scan_dispatch_log(u64 stop_tb)
 	if (i == vpa->dtl_idx)
 		return 0;
 	while (i < vpa->dtl_idx) {
+		if (dtl_consumer)
+			dtl_consumer(dtl, i);
 		dtb = dtl->timebase;
 		tb_delta = dtl->enqueue_to_dispatch_time +
 			dtl->ready_to_enqueue_time;
-- 
cgit v1.2.3


From 55ec2fca3e99f83b5c674e9aba713d848392f6cc Mon Sep 17 00:00:00 2001
From: Timur Tabi <timur@freescale.com>
Date: Mon, 20 Sep 2010 11:23:41 -0500
Subject: powerpc: export ppc_proc_freq and ppc_tb_freq as GPL symbols

Export the global variable 'ppc_tb_freq', so that modules (like the Book-E
watchdog driver) can use it.  To maintain consistency, ppc_proc_freq is
changed to a GPL-only export.  This is okay, because any module that needs
this symbol should be an actual Linux driver, which must be GPL-licensed.

Signed-off-by: Timur Tabi <timur@freescale.com>
Acked-by: Josh Boyer <jwboyer@linux.vnet.ibm.com>
Signed-off-by: Kumar Gala <galak@kernel.crashing.org>
---
 arch/powerpc/kernel/time.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc/kernel/time.c')

diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index bcb738b9ff8c..644f9188d8e7 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -161,8 +161,9 @@ extern struct timezone sys_tz;
 static long timezone_offset;
 
 unsigned long ppc_proc_freq;
-EXPORT_SYMBOL(ppc_proc_freq);
+EXPORT_SYMBOL_GPL(ppc_proc_freq);
 unsigned long ppc_tb_freq;
+EXPORT_SYMBOL_GPL(ppc_tb_freq);
 
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
 /*
-- 
cgit v1.2.3