Diffstat (limited to 'kernel')
-rw-r--r--  kernel/cgroup.c              | 231
-rw-r--r--  kernel/cgroup_freezer.c      |   8
-rw-r--r--  kernel/cpu.c                 |  27
-rw-r--r--  kernel/cpuset.c              |  35
-rw-r--r--  kernel/debug/debug_core.c    |  12
-rw-r--r--  kernel/debug/kdb/kdb_io.c    |  12
-rw-r--r--  kernel/fork.c                |  19
-rw-r--r--  kernel/gcov/Kconfig          |   8
-rw-r--r--  kernel/gcov/gcc_3_4.c        | 100
-rw-r--r--  kernel/gcov/gcov.h           |  53
-rw-r--r--  kernel/irq/pm.c              |   7
-rw-r--r--  kernel/irq/resend.c          |  19
-rw-r--r--  kernel/kthread.c             |  11
-rw-r--r--  kernel/module.c              |   2
-rw-r--r--  kernel/panic.c               |  13
-rw-r--r--  kernel/power/Kconfig         |  54
-rw-r--r--  kernel/power/Makefile        |   3
-rw-r--r--  kernel/power/autosleep.c     | 127
-rw-r--r--  kernel/power/hibernate.c     |  13
-rw-r--r--  kernel/power/main.c          | 160
-rw-r--r--  kernel/power/power.h         |  27
-rw-r--r--  kernel/power/qos.c           |  63
-rw-r--r--  kernel/power/suspend.c       |  15
-rw-r--r--  kernel/power/suspend_time.c  | 111
-rw-r--r--  kernel/power/swap.c          |  62
-rw-r--r--  kernel/power/wakelock.c      | 259
-rw-r--r--  kernel/printk.c              |  58
-rw-r--r--  kernel/sched/core.c          |  57
-rw-r--r--  kernel/sched/debug.c         |   3
-rw-r--r--  kernel/sched/rt.c            |  16
-rw-r--r--  kernel/sched/sched.h         |  40
-rw-r--r--  kernel/sysctl.c              |   8
-rw-r--r--  kernel/time/alarmtimer.c     |  24
-rw-r--r--  kernel/time/timekeeping.c    |   2
-rw-r--r--  kernel/trace/Kconfig         |  33
-rw-r--r--  kernel/trace/Makefile        |   2
-rw-r--r--  kernel/trace/tracedump.c     | 682
-rw-r--r--  kernel/trace/tracelevel.c    | 142
38 files changed, 2335 insertions(+), 183 deletions(-)
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ed64ccac67c9..145deeb69bc3 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -60,6 +60,7 @@
#include <linux/eventfd.h>
#include <linux/poll.h>
#include <linux/flex_array.h> /* used in cgroup_attach_proc */
+#include <linux/capability.h>
#include <linux/atomic.h>
@@ -287,6 +288,33 @@ static void cgroup_release_agent(struct work_struct *work);
static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
+/*
+ * A wait queue for tasks doing rmdir() on a cgroup. A task will sleep when
+ * cgroup->count == 0 && list_empty(&cgroup->children) but some subsystem
+ * still holds a reference to css->refcnt. In general, this refcnt is
+ * expected to drop to zero soon.
+ *
+ * The CGRP_WAIT_ON_RMDIR flag is set under the cgroup's inode->i_mutex.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+
+static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
+{
+ if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+ wake_up_all(&cgroup_rmdir_waitq);
+}
+
+void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
+{
+ css_get(css);
+}
+
+void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
+{
+ cgroup_wakeup_rmdir_waiter(css->cgroup);
+ css_put(css);
+}
+
/* Link structure for associating css_set objects with cgroups */
struct cg_cgroup_link {
/*
@@ -346,52 +374,43 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
return &css_set_table[index];
}
-/* We don't maintain the lists running through each css_set to its
- * task until after the first call to cgroup_iter_start(). This
- * reduces the fork()/exit() overhead for people who have cgroups
- * compiled into their kernel but not actually in use */
-static int use_task_css_set_links __read_mostly;
-
-static void __put_css_set(struct css_set *cg, int taskexit)
+static void free_css_set_work(struct work_struct *work)
{
+ struct css_set *cg = container_of(work, struct css_set, work);
struct cg_cgroup_link *link;
struct cg_cgroup_link *saved_link;
- /*
- * Ensure that the refcount doesn't hit zero while any readers
- * can see it. Similar to atomic_dec_and_lock(), but for an
- * rwlock
- */
- if (atomic_add_unless(&cg->refcount, -1, 1))
- return;
- write_lock(&css_set_lock);
- if (!atomic_dec_and_test(&cg->refcount)) {
- write_unlock(&css_set_lock);
- return;
- }
-
- /* This css_set is dead. unlink it and release cgroup refcounts */
- hlist_del(&cg->hlist);
- css_set_count--;
+ write_lock(&css_set_lock);
list_for_each_entry_safe(link, saved_link, &cg->cg_links,
cg_link_list) {
struct cgroup *cgrp = link->cgrp;
list_del(&link->cg_link_list);
list_del(&link->cgrp_link_list);
- if (atomic_dec_and_test(&cgrp->count) &&
- notify_on_release(cgrp)) {
- if (taskexit)
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
+ if (atomic_dec_and_test(&cgrp->count)) {
check_for_release(cgrp);
+ cgroup_wakeup_rmdir_waiter(cgrp);
}
-
kfree(link);
}
-
write_unlock(&css_set_lock);
- kfree_rcu(cg, rcu_head);
+
+ kfree(cg);
+}
+
+static void free_css_set_rcu(struct rcu_head *obj)
+{
+ struct css_set *cg = container_of(obj, struct css_set, rcu_head);
+
+ INIT_WORK(&cg->work, free_css_set_work);
+ schedule_work(&cg->work);
}
+/* We don't maintain the lists running through each css_set to its
+ * task until after the first call to cgroup_iter_start(). This
+ * reduces the fork()/exit() overhead for people who have cgroups
+ * compiled into their kernel but not actually in use */
+static int use_task_css_set_links __read_mostly;
+
/*
* refcounted get/put for css_set objects
*/
@@ -400,16 +419,34 @@ static inline void get_css_set(struct css_set *cg)
atomic_inc(&cg->refcount);
}
-static inline void put_css_set(struct css_set *cg)
+static void put_css_set(struct css_set *cg)
{
- __put_css_set(cg, 0);
-}
+ /*
+ * Ensure that the refcount doesn't hit zero while any readers
+ * can see it. Similar to atomic_dec_and_lock(), but for an
+ * rwlock
+ */
+ if (atomic_add_unless(&cg->refcount, -1, 1))
+ return;
+ write_lock(&css_set_lock);
+ if (!atomic_dec_and_test(&cg->refcount)) {
+ write_unlock(&css_set_lock);
+ return;
+ }
-static inline void put_css_set_taskexit(struct css_set *cg)
-{
- __put_css_set(cg, 1);
+ hlist_del(&cg->hlist);
+ css_set_count--;
+
+ write_unlock(&css_set_lock);
+ call_rcu(&cg->rcu_head, free_css_set_rcu);
}
/*
* compare_css_sets - helper function for find_existing_css_set().
* @cg: candidate css_set being tested
@@ -739,9 +776,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* cgroup_attach_task(), which overwrites one tasks cgroup pointer with
* another. It does so using cgroup_mutex, however there are
* several performance critical places that need to reference
- * task->cgroup without the expense of grabbing a system global
+ * task->cgroups without the expense of grabbing a system global
* mutex. Therefore except as noted below, when dereferencing or, as
- * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use
+ * in cgroup_attach_task(), modifying a task's cgroups pointer we use
* task_lock(), which acts on a spinlock (task->alloc_lock) already in
* the task_struct routinely used for such matters.
*
@@ -931,33 +968,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
}
/*
- * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
- * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
- * reference to css->refcnt. In general, this refcnt is expected to goes down
- * to zero, soon.
- *
- * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex;
- */
-static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
-
-static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp)
-{
- if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
- wake_up_all(&cgroup_rmdir_waitq);
-}
-
-void cgroup_exclude_rmdir(struct cgroup_subsys_state *css)
-{
- css_get(css);
-}
-
-void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
-{
- cgroup_wakeup_rmdir_waiter(css->cgroup);
- css_put(css);
-}
-
-/*
* Call with cgroup_mutex held. Drops reference counts on modules, including
* any duplicate ones that parse_cgroupfs_options took. If this function
* returns an error, no reference counts are touched.
@@ -1889,6 +1899,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
struct cgroupfs_root *root = cgrp->root;
struct cgroup_taskset tset = { };
struct css_set *newcg;
+ struct css_set *cg;
/* @tsk either already exited or can't exit until the end */
if (tsk->flags & PF_EXITING)
@@ -1915,6 +1926,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
failed_ss = ss;
goto out;
}
+ } else if (!capable(CAP_SYS_ADMIN)) {
+ const struct cred *cred = current_cred(), *tcred;
+
+ /* No can_attach() - check perms generically */
+ tcred = __task_cred(tsk);
+ if (cred->euid != tcred->uid &&
+ cred->euid != tcred->suid) {
+ return -EACCES;
+ }
}
}
@@ -1924,14 +1944,20 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
goto out;
}
+ task_lock(tsk);
+ cg = tsk->cgroups;
+ get_css_set(cg);
+ task_unlock(tsk);
+
cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg);
for_each_subsys(root, ss) {
if (ss->attach)
ss->attach(cgrp, &tset);
}
-
- synchronize_rcu();
+ set_bit(CGRP_RELEASABLE, &cgrp->flags);
+ /* put_css_set will not destroy cg until after an RCU grace period */
+ put_css_set(cg);
/*
* wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -2132,6 +2158,24 @@ out_free_group_list:
return retval;
}
+static int cgroup_allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys *ss;
+ int ret;
+
+ for_each_subsys(cgrp->root, ss) {
+ if (ss->allow_attach) {
+ ret = ss->allow_attach(cgrp, tset);
+ if (ret)
+ return ret;
+ } else {
+ return -EACCES;
+ }
+ }
+
+ return 0;
+}
+
/*
* Find the task_struct of the task to attach by vpid and pass it along to the
* function to attach either it or all tasks in its threadgroup. Will lock
@@ -2163,9 +2207,19 @@ retry_find_task:
if (cred->euid &&
cred->euid != tcred->uid &&
cred->euid != tcred->suid) {
- rcu_read_unlock();
- ret = -EACCES;
- goto out_unlock_cgroup;
+ /*
+ * if the default permission check fails, give each
+ * cgroup a chance to extend the permission check
+ */
+ struct cgroup_taskset tset = { };
+ tset.single.task = tsk;
+ tset.single.cgrp = cgrp;
+ ret = cgroup_allow_attach(cgrp, &tset);
+ if (ret) {
+ rcu_read_unlock();
+ cgroup_unlock();
+ return ret;
+ }
}
} else
tsk = current;
@@ -3784,6 +3838,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
if (err < 0)
goto err_remove;
+ set_bit(CGRP_RELEASABLE, &parent->flags);
+
/* The cgroup directory was pre-locked for us */
BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex));
@@ -3915,6 +3971,21 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
return !failed;
}
+/* checks if all of the css_sets attached to a cgroup have a refcount of 0.
+ * Must be called with css_set_lock held */
+static int cgroup_css_sets_empty(struct cgroup *cgrp)
+{
+ struct cg_cgroup_link *link;
+
+ list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
+ struct css_set *cg = link->cg;
+ if (atomic_read(&cg->refcount) > 0)
+ return 0;
+ }
+
+ return 1;
+}
+
static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
{
struct cgroup *cgrp = dentry->d_fsdata;
@@ -3927,7 +3998,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
/* the vfs holds both inode->i_mutex already */
again:
mutex_lock(&cgroup_mutex);
- if (atomic_read(&cgrp->count) != 0) {
+ if (!cgroup_css_sets_empty(cgrp)) {
mutex_unlock(&cgroup_mutex);
return -EBUSY;
}
@@ -3960,7 +4031,7 @@ again:
mutex_lock(&cgroup_mutex);
parent = cgrp->parent;
- if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
+ if (!cgroup_css_sets_empty(cgrp) || !list_empty(&cgrp->children)) {
clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
mutex_unlock(&cgroup_mutex);
return -EBUSY;
@@ -4000,7 +4071,6 @@ again:
cgroup_d_remove_dir(d);
dput(d);
- set_bit(CGRP_RELEASABLE, &parent->flags);
check_for_release(parent);
/*
@@ -4631,7 +4701,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
task_unlock(tsk);
if (cg)
- put_css_set_taskexit(cg);
+ put_css_set(cg);
}
/**
@@ -4685,6 +4755,14 @@ static void check_for_release(struct cgroup *cgrp)
}
/* Caller must verify that the css is not for root cgroup */
+void __css_get(struct cgroup_subsys_state *css, int count)
+{
+ atomic_add(count, &css->refcnt);
+ set_bit(CGRP_RELEASABLE, &css->cgroup->flags);
+}
+EXPORT_SYMBOL_GPL(__css_get);
+
+/* Caller must verify that the css is not for root cgroup */
void __css_put(struct cgroup_subsys_state *css, int count)
{
struct cgroup *cgrp = css->cgroup;
@@ -4692,10 +4770,7 @@ void __css_put(struct cgroup_subsys_state *css, int count)
rcu_read_lock();
val = atomic_sub_return(count, &css->refcnt);
if (val == 1) {
- if (notify_on_release(cgrp)) {
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
- }
+ check_for_release(cgrp);
cgroup_wakeup_rmdir_waiter(cgrp);
}
rcu_read_unlock();
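
For context, a controller opts in to the relaxed permission model above by providing the new ->allow_attach() callback; cgroup_allow_attach() returns -EACCES for any hierarchy whose subsystems lack one. The sketch below is illustrative rather than part of this hunk (kernel/sched/core.c in the diffstat carries the real cpu-controller hook, which is not shown in this section), and assumes the cgroup_taskset_for_each() iterator from this kernel generation:

static int mysys_allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
{
	const struct cred *cred = current_cred(), *tcred;
	struct task_struct *task;

	/* permit same-uid (or saved-uid) callers, like the generic check */
	cgroup_taskset_for_each(task, cgrp, tset) {
		tcred = __task_cred(task);
		if (current != task && !capable(CAP_SYS_ADMIN) &&
		    cred->euid != tcred->uid && cred->euid != tcred->suid)
			return -EACCES;
	}
	return 0;
}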
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index f86e93920b62..5c248e507a6e 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -168,6 +168,14 @@ static int freezer_can_attach(struct cgroup *new_cgroup,
struct freezer *freezer;
struct task_struct *task;
+ if ((current != task) && (!capable(CAP_SYS_ADMIN))) {
+ const struct cred *cred = current_cred(), *tcred;
+
+ tcred = __task_cred(task);
+ if (cred->euid != tcred->uid && cred->euid != tcred->suid)
+ return -EPERM;
+ }
+
/*
* Anything frozen can't move or be moved to/from.
*/
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 2060c6e57027..acf5d8047458 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -16,6 +16,7 @@
#include <linux/mutex.h>
#include <linux/gfp.h>
#include <linux/suspend.h>
+#include <trace/events/power.h>
#ifdef CONFIG_SMP
/* Serializes the updates to cpu_online_mask, cpu_present_mask */
@@ -273,6 +274,8 @@ int __ref cpu_down(unsigned int cpu)
{
int err;
+ trace_cpu_hotplug(cpu, POWER_CPU_DOWN_START);
+
cpu_maps_update_begin();
if (cpu_hotplug_disabled) {
@@ -284,6 +287,7 @@ int __ref cpu_down(unsigned int cpu)
out:
cpu_maps_update_done();
+ trace_cpu_hotplug(cpu, POWER_CPU_DOWN_DONE);
return err;
}
EXPORT_SYMBOL(cpu_down);
@@ -334,6 +338,8 @@ int __cpuinit cpu_up(unsigned int cpu)
pg_data_t *pgdat;
#endif
+ trace_cpu_hotplug(cpu, POWER_CPU_UP_START);
+
if (!cpu_possible(cpu)) {
printk(KERN_ERR "can't online cpu %d because it is not "
"configured as may-hotadd at boot time\n", cpu);
@@ -377,6 +383,7 @@ int __cpuinit cpu_up(unsigned int cpu)
out:
cpu_maps_update_done();
+ trace_cpu_hotplug(cpu, POWER_CPU_UP_DONE);
return err;
}
EXPORT_SYMBOL_GPL(cpu_up);
@@ -668,3 +675,23 @@ void init_cpu_online(const struct cpumask *src)
{
cpumask_copy(to_cpumask(cpu_online_bits), src);
}
+
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+void idle_notifier_register(struct notifier_block *n)
+{
+ atomic_notifier_chain_register(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_register);
+
+void idle_notifier_unregister(struct notifier_block *n)
+{
+ atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_unregister);
+
+void idle_notifier_call_chain(unsigned long val)
+{
+ atomic_notifier_call_chain(&idle_notifier, val, NULL);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_call_chain);
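
The three exports above wrap an atomic notifier chain into a small idle pub/sub API: the platform's idle loop calls idle_notifier_call_chain() on idle entry/exit, and interested drivers subscribe. A hypothetical subscriber (names are illustrative; the meaning of 'val' is whatever event code the caller passes in):

#include <linux/init.h>
#include <linux/notifier.h>

static int my_idle_notify(struct notifier_block *nb, unsigned long val,
			  void *unused)
{
	/* runs in the idle path with the 'val' event from the caller;
	 * the chain is atomic, so this must not sleep */
	return NOTIFY_OK;
}

static struct notifier_block my_idle_nb = {
	.notifier_call = my_idle_notify,
};

static int __init my_driver_init(void)
{
	idle_notifier_register(&my_idle_nb);
	return 0;
}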
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 14f7070b4ba2..48b90d30797f 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1362,6 +1362,41 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
+/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
+static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
+ struct task_struct *tsk)
+{
+ struct cpuset *cs = cgroup_cs(cont);
+
+ if ((current != tsk) && (!capable(CAP_SYS_ADMIN))) {
+ const struct cred *cred = current_cred(), *tcred;
+
+ tcred = __task_cred(tsk);
+ if (cred->euid != tcred->uid && cred->euid != tcred->suid)
+ return -EPERM;
+ }
+
+ if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
+ return -ENOSPC;
+
+ /*
+ * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
+ * cannot change their cpu affinity and isolating such threads by their
+ * set of allowed nodes is unnecessary. Thus, cpusets are not
+ * applicable for such threads. This prevents checking for success of
+ * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
+ * be changed.
+ */
+ if (tsk->flags & PF_THREAD_BOUND)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
+{
+ return security_task_setscheduler(task);
+}
+
/*
* Protected by cgroup_lock. The nodemasks must be stored globally because
* dynamically allocating them is not allowed in can_attach, and they must
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 0557f24c6bca..35b94acec621 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -85,6 +85,10 @@ static int kgdb_use_con;
bool dbg_is_early = true;
/* Next cpu to become the master debug core */
int dbg_switch_cpu;
+/* Flag for entering kdb when a panic occurs */
+static bool break_on_panic = true;
+/* Flag for entering kdb when an exception occurs */
+static bool break_on_exception = true;
/* Use kdb or gdbserver mode */
int dbg_kdb_mode = 1;
@@ -99,6 +103,8 @@ early_param("kgdbcon", opt_kgdb_con);
module_param(kgdb_use_con, int, 0644);
module_param(kgdbreboot, int, 0644);
+module_param(break_on_panic, bool, 0644);
+module_param(break_on_exception, bool, 0644);
/*
* Holds information about breakpoints in a kernel. These breakpoints are
@@ -673,6 +679,9 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
struct kgdb_state kgdb_var;
struct kgdb_state *ks = &kgdb_var;
+ if (unlikely(signo != SIGTRAP && !break_on_exception))
+ return 1;
+
ks->cpu = raw_smp_processor_id();
ks->ex_vector = evector;
ks->signo = signo;
@@ -759,6 +768,9 @@ static int kgdb_panic_event(struct notifier_block *self,
unsigned long val,
void *data)
{
+ if (!break_on_panic)
+ return NOTIFY_DONE;
+
if (dbg_kdb_mode)
kdb_printf("PANIC: %s\n", (char *)data);
kgdb_breakpoint();
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index bb9520f0f6ff..18a4cb33c52b 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -216,7 +216,7 @@ static char *kdb_read(char *buffer, size_t bufsize)
int i;
int diag, dtab_count;
int key;
-
+ static int last_crlf;
diag = kdbgetintenv("DTABCOUNT", &dtab_count);
if (diag)
@@ -237,6 +237,9 @@ poll_again:
return buffer;
if (key != 9)
tab = 0;
+ if (key != 10 && key != 13)
+ last_crlf = 0;
+
switch (key) {
case 8: /* backspace */
if (cp > buffer) {
@@ -254,7 +257,12 @@ poll_again:
*cp = tmp;
}
break;
- case 13: /* enter */
+ case 10: /* new line */
+ case 13: /* carriage return */
+ /* handle \n after \r */
+ if (last_crlf && last_crlf != key)
+ break;
+ last_crlf = key;
*lastchar++ = '\n';
*lastchar++ = '\0';
if (!KDB_STATE(KGDB_TRANS)) {
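
To make the last_crlf bookkeeping concrete: a terminal that sends "\r\n" delivers key 13 followed by key 10. The 13 terminates the line and records last_crlf = 13; the trailing 10 then satisfies "last_crlf && last_crlf != key" and is discarded instead of submitting an empty second command. A bare "\n" (or bare "\r") still ends a line on its own, and any other key resets last_crlf. The variable is static because kdb_read() returns when a line terminates, so the state has to survive into the next call.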
diff --git a/kernel/fork.c b/kernel/fork.c
index 81633337aee1..bc3398ee1d79 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -158,6 +158,9 @@ struct kmem_cache *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
+/* Notifier list called when a task struct is freed */
+static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
+
static void account_kernel_stack(struct thread_info *ti, int account)
{
struct zone *zone = page_zone(virt_to_page(ti));
@@ -188,6 +191,18 @@ static inline void put_signal_struct(struct signal_struct *sig)
free_signal_struct(sig);
}
+int task_free_register(struct notifier_block *n)
+{
+ return atomic_notifier_chain_register(&task_free_notifier, n);
+}
+EXPORT_SYMBOL(task_free_register);
+
+int task_free_unregister(struct notifier_block *n)
+{
+ return atomic_notifier_chain_unregister(&task_free_notifier, n);
+}
+EXPORT_SYMBOL(task_free_unregister);
+
void __put_task_struct(struct task_struct *tsk)
{
WARN_ON(!tsk->exit_state);
@@ -199,6 +214,7 @@ void __put_task_struct(struct task_struct *tsk)
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
+ atomic_notifier_call_chain(&task_free_notifier, 0, tsk);
if (!profile_handoff_task(tsk))
free_task(tsk);
}
@@ -677,7 +693,8 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode)
mm = get_task_mm(task);
if (mm && mm != current->mm &&
- !ptrace_may_access(task, mode)) {
+ !ptrace_may_access(task, mode) &&
+ !capable(CAP_SYS_RESOURCE)) {
mmput(mm);
mm = ERR_PTR(-EACCES);
}
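
The task-free chain added above mirrors the idle notifier pattern, with the dying task passed as the notifier data; the chain is atomic, so callbacks run in whatever context __put_task_struct() was invoked from and must not sleep. A hypothetical consumer (illustrative only):

static int my_task_free_notify(struct notifier_block *nb,
			       unsigned long unused, void *data)
{
	struct task_struct *tsk = data;	/* the task being freed */

	/* drop any per-task state keyed by tsk; no sleeping here */
	return NOTIFY_OK;
}

static struct notifier_block my_task_free_nb = {
	.notifier_call = my_task_free_notify,
};

/* registered once at init time: task_free_register(&my_task_free_nb); */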
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index a92028196cc1..824b741925bb 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -35,7 +35,7 @@ config GCOV_KERNEL
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
depends on GCOV_KERNEL
- depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
+ depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE || ARM
default n
---help---
This option activates profiling for the entire kernel.
@@ -46,4 +46,10 @@ config GCOV_PROFILE_ALL
larger and run slower. Also be sure to exclude files from profiling
which are not linked to the kernel image to prevent linker errors.
+config GCOV_CTORS
+ string
+ depends on CONSTRUCTORS
+ default ".init_array" if ARM && AEABI
+ default ".ctors"
+
endmenu
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
index ae5bb4260033..bc78336bc345 100644
--- a/kernel/gcov/gcc_3_4.c
+++ b/kernel/gcov/gcc_3_4.c
@@ -297,16 +297,30 @@ void gcov_iter_start(struct gcov_iterator *iter)
}
/* Mapping of logical record number to actual file content. */
-#define RECORD_FILE_MAGIC 0
-#define RECORD_GCOV_VERSION 1
-#define RECORD_TIME_STAMP 2
-#define RECORD_FUNCTION_TAG 3
-#define RECORD_FUNCTON_TAG_LEN 4
-#define RECORD_FUNCTION_IDENT 5
-#define RECORD_FUNCTION_CHECK 6
-#define RECORD_COUNT_TAG 7
-#define RECORD_COUNT_LEN 8
-#define RECORD_COUNT 9
+#define RECORD_FILE_MAGIC 0
+#define RECORD_GCOV_VERSION 1
+#define RECORD_TIME_STAMP 2
+#define RECORD_FUNCTION_TAG 3
+#define RECORD_FUNCTON_TAG_LEN 4
+#define RECORD_FUNCTION_IDENT 5
+#define RECORD_FUNCTION_CHECK_LINE 6
+#define RECORD_FUNCTION_CHECK_CFG 7
+#define RECORD_FUNCTION_NAME_LEN 8
+#define RECORD_FUNCTION_NAME 9
+#define RECORD_COUNT_TAG 10
+#define RECORD_COUNT_LEN 11
+#define RECORD_COUNT 12
+
+/* Return length of string encoded in GCOV format. */
+static size_t
+sizeof_str(const char *str)
+{
+ size_t len;
+ len = (str) ? strlen(str) : 0;
+ if (len == 0)
+ return 1;
+ return 1 + ((len + 4) >> 2);
+}
/**
* gcov_iter_next - advance file iterator to next logical record
@@ -323,6 +337,9 @@ int gcov_iter_next(struct gcov_iterator *iter)
case RECORD_FUNCTON_TAG_LEN:
case RECORD_FUNCTION_IDENT:
case RECORD_COUNT_TAG:
+ case RECORD_FUNCTION_CHECK_LINE:
+ case RECORD_FUNCTION_CHECK_CFG:
+ case RECORD_FUNCTION_NAME_LEN:
/* Advance to next record */
iter->record++;
break;
@@ -332,7 +349,7 @@ int gcov_iter_next(struct gcov_iterator *iter)
/* fall through */
case RECORD_COUNT_LEN:
if (iter->count < get_func(iter)->n_ctrs[iter->type]) {
- iter->record = 9;
+ iter->record = 12;
break;
}
/* Advance to next counter type */
@@ -340,9 +357,9 @@ int gcov_iter_next(struct gcov_iterator *iter)
iter->count = 0;
iter->type++;
/* fall through */
- case RECORD_FUNCTION_CHECK:
+ case RECORD_FUNCTION_NAME:
if (iter->type < iter->num_types) {
- iter->record = 7;
+ iter->record = 10;
break;
}
/* Advance to next function */
@@ -395,6 +412,34 @@ static int seq_write_gcov_u64(struct seq_file *seq, u64 v)
data[1] = (v >> 32);
return seq_write(seq, data, sizeof(data));
}
+/**
+ * seq_write_gcov_str - write string in gcov format to seq_file
+ * @seq: seq_file handle
+ * @str: string to be stored
+ *
+ * String format defined by gcc: the payload is the string, NUL
+ * terminated and padded with NUL bytes to the next 32 bit boundary.
+ * The preceding length word (the number of 32 bit payload words) is
+ * written separately by the caller (see RECORD_FUNCTION_NAME_LEN).
+ */
+static int seq_write_gcov_str(struct seq_file *seq, const char *str)
+{
+ if (str) {
+ size_t len;
+ int str_off;
+ u32 data;
+ len = strlen(str);
+ for (str_off = 0; str_off < (sizeof_str(str) - 2) ; str_off++) {
+ memcpy(&data, (str + str_off * 4), 4);
+ seq_write(seq, &data, sizeof(data));
+ }
+ data = 0;
+ memcpy(&data, (str + str_off * 4), (len - str_off * 4));
+ return seq_write(seq, &data, sizeof(data));
+ } else {
+ return 0;
+ }
+}
/**
* gcov_iter_write - write data for current pos to seq_file
@@ -421,13 +466,36 @@ int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq)
rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION);
break;
case RECORD_FUNCTON_TAG_LEN:
- rc = seq_write_gcov_u32(seq, 2);
+#ifdef GCOV_FN_INFO_HAS_NAME_FIELD
+ rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION_LENGTH +
+ (sizeof_str(get_func(iter)->name)));
+#else
+ rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION_LENGTH);
+#endif
break;
case RECORD_FUNCTION_IDENT:
rc = seq_write_gcov_u32(seq, get_func(iter)->ident);
break;
- case RECORD_FUNCTION_CHECK:
- rc = seq_write_gcov_u32(seq, get_func(iter)->checksum);
+ case RECORD_FUNCTION_CHECK_LINE:
+ rc = seq_write_gcov_u32(seq, get_func(iter)->lineno_checksum);
+ break;
+ case RECORD_FUNCTION_CHECK_CFG:
+ rc = seq_write_gcov_u32(seq, get_func(iter)->cfg_checksum);
+ break;
+ case RECORD_FUNCTION_NAME_LEN:
+#ifdef GCOV_FN_INFO_HAS_NAME_FIELD
+ rc = seq_write_gcov_u32(seq,
+ (sizeof_str(get_func(iter)->name) - 1));
+#else
+ rc = 0;
+#endif
+ break;
+ case RECORD_FUNCTION_NAME:
+#ifdef GCOV_FN_INFO_HAS_NAME_FIELD
+ rc = seq_write_gcov_str(seq, get_func(iter)->name);
+#else
+ rc = 0;
+#endif
break;
case RECORD_COUNT_TAG:
rc = seq_write_gcov_u32(seq,
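
A worked example of the string encoding implemented by sizeof_str() and seq_write_gcov_str() above: for a function named "main", strlen() is 4, so sizeof_str() returns 1 + ((4 + 4) >> 2) = 3 words. RECORD_FUNCTION_NAME_LEN therefore emits a length word of 3 - 1 = 2, and seq_write_gcov_str() writes one full word ("main") plus one all-zero padding word, exactly the two payload words the length promised. Rounding with +4 rather than +3 guarantees at least one terminating NUL byte even when the length is already a multiple of four.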
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
index 060073ebf7a6..8c5130a5c1b5 100644
--- a/kernel/gcov/gcov.h
+++ b/kernel/gcov/gcov.h
@@ -17,13 +17,21 @@
#include <linux/types.h>
/*
- * Profiling data types used for gcc 3.4 and above - these are defined by
+ * GCC 4.6 drops the 'name' field from 'struct gcov_fn_info'.
+ */
+#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)
+#define GCOV_FN_INFO_HAS_NAME_FIELD
+#endif
+
+/*
+ * Profiling data types used for at least gcc 4.4 and 4.6 - these are defined by
* gcc and need to be kept as close to the original definition as possible to
* remain compatible.
*/
-#define GCOV_COUNTERS 5
+#define GCOV_COUNTERS 10
#define GCOV_DATA_MAGIC ((unsigned int) 0x67636461)
#define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000)
+#define GCOV_TAG_FUNCTION_LENGTH 3
#define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000)
#define GCOV_TAG_FOR_COUNTER(count) \
(GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17))
@@ -34,10 +42,38 @@ typedef long gcov_type;
typedef long long gcov_type;
#endif
+/*
+ * Source module info. The data structure is used in both runtime and
+ * profile-use phase.
+ */
+struct gcov_module_info {
+ unsigned int ident;
+/*
+ * This is overloaded to mean two things:
+ * (1) means FDO/LIPO in instrumented binary.
+ * (2) means IS_PRIMARY in persistent file or memory copy used in profile-use.
+ */
+ unsigned int is_primary;
+ unsigned int is_exported;
+ unsigned int lang;
+ char *da_filename;
+ char *source_filename;
+ unsigned int num_quote_paths;
+ unsigned int num_bracket_paths;
+ unsigned int num_cpp_defines;
+ unsigned int num_cpp_includes;
+ unsigned int num_cl_args;
+ char *string_array[1];
+};
+
+
/**
* struct gcov_fn_info - profiling meta data per function
* @ident: object file-unique function identifier
- * @checksum: function checksum
+ * @lineno_checksum: function lineno checksum
+ * @cfg_checksum: function cfg checksum
+ * @dc_offset: direct call offset
+ * @name: function name
* @n_ctrs: number of values per counter type belonging to this function
*
* This data is generated by gcc during compilation and doesn't change
@@ -45,7 +81,12 @@ typedef long long gcov_type;
*/
struct gcov_fn_info {
unsigned int ident;
- unsigned int checksum;
+ unsigned int lineno_checksum;
+ unsigned int cfg_checksum;
+ unsigned int dc_offset;
+#ifdef GCOV_FN_INFO_HAS_NAME_FIELD
+ const char *name;
+#endif
unsigned int n_ctrs[0];
};
@@ -67,9 +108,11 @@ struct gcov_ctr_info {
/**
* struct gcov_info - profiling data per object file
* @version: gcov version magic indicating the gcc version used for compilation
+ * @modinfo: additional module information
* @next: list head for a singly-linked list
* @stamp: time stamp
* @filename: name of the associated gcov data file
+ * @eof_pos: end position of profile data
* @n_functions: number of instrumented functions
* @functions: function data
* @ctr_mask: mask specifying which counter types are active
@@ -80,9 +123,11 @@ struct gcov_ctr_info {
*/
struct gcov_info {
unsigned int version;
+ struct gcov_module_info *mod_info;
struct gcov_info *next;
unsigned int stamp;
const char *filename;
+ unsigned int eof_pos;
unsigned int n_functions;
const struct gcov_fn_info *functions;
unsigned int ctr_mask;
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 15e53b1766a6..fe4b09cf829c 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -104,8 +104,13 @@ int check_wakeup_irqs(void)
for_each_irq_desc(irq, desc) {
if (irqd_is_wakeup_set(&desc->irq_data)) {
- if (desc->istate & IRQS_PENDING)
+ if (desc->istate & IRQS_PENDING) {
+ pr_info("Wakeup IRQ %d %s pending, suspend aborted\n",
+ irq,
+ desc->action && desc->action->name ?
+ desc->action->name : "");
return -EBUSY;
+ }
continue;
}
/*
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index 14dd5761e8c9..ef60772d2feb 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -55,17 +55,18 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
*/
void check_irq_resend(struct irq_desc *desc, unsigned int irq)
{
- /*
- * We do not resend level type interrupts. Level type
- * interrupts are resent by hardware when they are still
- * active.
- */
- if (irq_settings_is_level(desc))
- return;
- if (desc->istate & IRQS_REPLAY)
- return;
if (desc->istate & IRQS_PENDING) {
desc->istate &= ~IRQS_PENDING;
+ /*
+ * We do not resend level type interrupts. Level type
+ * interrupts are resent by hardware when they are still
+ * active.
+ */
+ if (irq_settings_is_level(desc))
+ return;
+ if (desc->istate & IRQS_REPLAY)
+ return;
+
desc->istate |= IRQS_REPLAY;
if (!desc->irq_data.chip->irq_retrigger ||
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 3d3de633702e..b68236b45ba9 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -16,6 +16,7 @@
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/freezer.h>
+#include <linux/preempt.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -113,7 +114,17 @@ static int kthread(void *_create)
/* OK, tell user we're spawned, wait for stop or wakeup */
__set_current_state(TASK_UNINTERRUPTIBLE);
create->result = current;
+
+ /*
+ * Disable preemption so we enter TASK_UNINTERRUPTIBLE after
+ * complete() instead of possibly being preempted. This speeds
+ * up clients that do a kthread_bind() directly after
+ * creation.
+ */
+ preempt_disable();
complete(&create->done);
+ preempt_enable_no_resched();
+
schedule();
ret = -EINTR;
diff --git a/kernel/module.c b/kernel/module.c
index 78ac6ec1e425..b084bf116fc4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2600,7 +2600,7 @@ static void find_module_sections(struct module *mod, struct load_info *info)
mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl");
#endif
#ifdef CONFIG_CONSTRUCTORS
- mod->ctors = section_objs(info, ".ctors",
+ mod->ctors = section_objs(info, CONFIG_GCOV_CTORS,
sizeof(*mod->ctors), &mod->num_ctors);
#endif
diff --git a/kernel/panic.c b/kernel/panic.c
index 9ed023b8333a..90fd443165df 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -27,13 +27,19 @@
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
+/* Machine specific panic information string */
+char *mach_panic_string;
+
int panic_on_oops;
static unsigned long tainted_mask;
static int pause_on_oops;
static int pause_on_oops_flag;
static DEFINE_SPINLOCK(pause_on_oops_lock);
-int panic_timeout;
+#ifndef CONFIG_PANIC_TIMEOUT
+#define CONFIG_PANIC_TIMEOUT 0
+#endif
+int panic_timeout = CONFIG_PANIC_TIMEOUT;
EXPORT_SYMBOL_GPL(panic_timeout);
ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
@@ -375,6 +381,11 @@ late_initcall(init_oops_id);
void print_oops_end_marker(void)
{
init_oops_id();
+
+ if (mach_panic_string)
+ printk(KERN_WARNING "Board Information: %s\n",
+ mach_panic_string);
+
printk(KERN_WARNING "---[ end trace %016llx ]---\n",
(unsigned long long)oops_id);
}
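
mach_panic_string is intended to be filled in by machine/board code early in boot so that print_oops_end_marker() can append it to every oops trailer. A hypothetical board file (the function name and string are illustrative):

extern char *mach_panic_string;

static int __init myboard_panic_string_init(void)
{
	mach_panic_string = "myboard rev B";
	return 0;
}
arch_initcall(myboard_panic_string_init);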
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index deb5461e3216..815da3c5cdd4 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -18,6 +18,14 @@ config SUSPEND_FREEZER
Turning OFF this setting is NOT recommended! If in doubt, say Y.
+config HAS_WAKELOCK
+ bool
+ default y
+
+config WAKELOCK
+ bool
+ default y
+
config HIBERNATE_CALLBACKS
bool
@@ -75,20 +83,20 @@ config PM_STD_PARTITION
default ""
---help---
The default resume partition is the partition that the suspend-
- to-disk implementation will look for a suspended disk image.
+ to-disk implementation will look for a suspended disk image.
- The partition specified here will be different for almost every user.
+ The partition specified here will be different for almost every user.
It should be a valid swap partition (at least for now) that is turned
- on before suspending.
+ on before suspending.
The partition specified can be overridden by specifying:
- resume=/dev/<other device>
+ resume=/dev/<other device>
- which will set the resume partition to the device specified.
+ which will set the resume partition to the device specified.
Note there is currently not a way to specify which device to save the
- suspended image to. It will simply pick the first available swap
+ suspended image to. It will simply pick the first available swap
device.
config PM_SLEEP
@@ -103,6 +111,33 @@ config PM_SLEEP_SMP
select HOTPLUG
select HOTPLUG_CPU
+config PM_AUTOSLEEP
+ bool "Opportunistic sleep"
+ depends on PM_SLEEP
+ default n
+ ---help---
+ Allow the kernel to trigger a system transition into a global sleep
+ state automatically whenever there are no active wakeup sources.
+
+config PM_WAKELOCKS
+ bool "User space wakeup sources interface"
+ depends on PM_SLEEP
+ default n
+ ---help---
+ Allow user space to create, activate and deactivate wakeup source
+ objects with the help of a sysfs-based interface.
+
+config PM_WAKELOCKS_LIMIT
+ int "Maximum number of user space wakeup sources (0 = no limit)"
+ range 0 100000
+ default 100
+ depends on PM_WAKELOCKS
+
+config PM_WAKELOCKS_GC
+ bool "Garbage collector for user space wakeup sources"
+ depends on PM_WAKELOCKS
+ default y
+
config PM_RUNTIME
bool "Run-time PM core functionality"
depends on !IA64_HP_SIM
@@ -243,3 +278,10 @@ config PM_GENERIC_DOMAINS_RUNTIME
config CPU_PM
bool
depends on SUSPEND || CPU_IDLE
+
+config SUSPEND_TIME
+ bool "Log time spent in suspend"
+ ---help---
+ Prints the time spent in suspend in the kernel log, and
+ keeps statistics on the time spent in suspend in
+ /sys/kernel/debug/suspend_time
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 66d808ec5252..8450b85d33c0 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -9,5 +9,8 @@ obj-$(CONFIG_SUSPEND) += suspend.o
obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o
obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \
block_io.o
+obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o
+obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
+obj-$(CONFIG_SUSPEND_TIME) += suspend_time.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
new file mode 100644
index 000000000000..ca304046d9e2
--- /dev/null
+++ b/kernel/power/autosleep.c
@@ -0,0 +1,127 @@
+/*
+ * kernel/power/autosleep.c
+ *
+ * Opportunistic sleep support.
+ *
+ * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl>
+ */
+
+#include <linux/device.h>
+#include <linux/mutex.h>
+#include <linux/pm_wakeup.h>
+
+#include "power.h"
+
+static suspend_state_t autosleep_state;
+static struct workqueue_struct *autosleep_wq;
+/*
+ * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source
+ * is active, otherwise a deadlock with try_to_suspend() is possible.
+ * Alternatively mutex_lock_interruptible() can be used. This will then fail
+ * if an auto_sleep cycle tries to freeze processes.
+ */
+static DEFINE_MUTEX(autosleep_lock);
+static struct wakeup_source *autosleep_ws;
+
+static void try_to_suspend(struct work_struct *work)
+{
+ unsigned int initial_count, final_count;
+
+ if (!pm_get_wakeup_count(&initial_count, true))
+ goto out;
+
+ mutex_lock(&autosleep_lock);
+
+ if (!pm_save_wakeup_count(initial_count)) {
+ mutex_unlock(&autosleep_lock);
+ goto out;
+ }
+
+ if (autosleep_state == PM_SUSPEND_ON) {
+ mutex_unlock(&autosleep_lock);
+ return;
+ }
+ if (autosleep_state >= PM_SUSPEND_MAX)
+ hibernate();
+ else
+ pm_suspend(autosleep_state);
+
+ mutex_unlock(&autosleep_lock);
+
+ if (!pm_get_wakeup_count(&final_count, false))
+ goto out;
+
+ /*
+ * If the wakeup occurred for an unknown reason, wait to prevent the
+ * system from trying to suspend and waking up in a tight loop.
+ */
+ if (final_count == initial_count)
+ schedule_timeout_uninterruptible(HZ / 2);
+
+ out:
+ queue_up_suspend_work();
+}
+
+static DECLARE_WORK(suspend_work, try_to_suspend);
+
+void queue_up_suspend_work(void)
+{
+ if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON)
+ queue_work(autosleep_wq, &suspend_work);
+}
+
+suspend_state_t pm_autosleep_state(void)
+{
+ return autosleep_state;
+}
+
+int pm_autosleep_lock(void)
+{
+ return mutex_lock_interruptible(&autosleep_lock);
+}
+
+void pm_autosleep_unlock(void)
+{
+ mutex_unlock(&autosleep_lock);
+}
+
+int pm_autosleep_set_state(suspend_state_t state)
+{
+
+#ifndef CONFIG_HIBERNATION
+ if (state >= PM_SUSPEND_MAX)
+ return -EINVAL;
+#endif
+
+ __pm_stay_awake(autosleep_ws);
+
+ mutex_lock(&autosleep_lock);
+
+ autosleep_state = state;
+
+ __pm_relax(autosleep_ws);
+
+ if (state > PM_SUSPEND_ON) {
+ pm_wakep_autosleep_enabled(true);
+ queue_up_suspend_work();
+ } else {
+ pm_wakep_autosleep_enabled(false);
+ }
+
+ mutex_unlock(&autosleep_lock);
+ return 0;
+}
+
+int __init pm_autosleep_init(void)
+{
+ autosleep_ws = wakeup_source_register("autosleep");
+ if (!autosleep_ws)
+ return -ENOMEM;
+
+ autosleep_wq = alloc_ordered_workqueue("autosleep", 0);
+ if (autosleep_wq)
+ return 0;
+
+ wakeup_source_unregister(autosleep_ws);
+ return -ENOMEM;
+}
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 52a18173c845..586521aa2baf 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -25,6 +25,8 @@
#include <linux/freezer.h>
#include <linux/gfp.h>
#include <linux/syscore_ops.h>
+#include <linux/ctype.h>
+#include <linux/genhd.h>
#include <scsi/scsi_scan.h>
#include "power.h"
@@ -728,6 +730,17 @@ static int software_resume(void)
/* Check if the device is there */
swsusp_resume_device = name_to_dev_t(resume_file);
+
+ /*
+ * name_to_dev_t() cannot verify the partition if resume_file is in
+ * integer format (e.g. major:minor).
+ */
+ if (isdigit(resume_file[0]) && resume_wait) {
+ int partno;
+ while (!get_gendisk(swsusp_resume_device, &partno))
+ msleep(10);
+ }
+
if (!swsusp_resume_device) {
/*
* Some device discovery might still be in progress; we need
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 1c12581f1c62..428f8a034e96 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -269,8 +269,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
return (s - buf);
}
-static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
- const char *buf, size_t n)
+static suspend_state_t decode_state(const char *buf, size_t n)
{
#ifdef CONFIG_SUSPEND
suspend_state_t state = PM_SUSPEND_STANDBY;
@@ -278,27 +277,48 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
#endif
char *p;
int len;
- int error = -EINVAL;
p = memchr(buf, '\n', n);
len = p ? p - buf : n;
- /* First, check if we are requested to hibernate */
- if (len == 4 && !strncmp(buf, "disk", len)) {
- error = hibernate();
- goto Exit;
- }
+ /* Check hibernation first. */
+ if (len == 4 && !strncmp(buf, "disk", len))
+ return PM_SUSPEND_MAX;
#ifdef CONFIG_SUSPEND
- for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) {
- if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) {
- error = pm_suspend(state);
- break;
- }
- }
+ for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++)
+ if (*s && len == strlen(*s) && !strncmp(buf, *s, len))
+ return state;
#endif
- Exit:
+ return PM_SUSPEND_ON;
+}
+
+static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ suspend_state_t state;
+ int error;
+
+ error = pm_autosleep_lock();
+ if (error)
+ return error;
+
+ if (pm_autosleep_state() > PM_SUSPEND_ON) {
+ error = -EBUSY;
+ goto out;
+ }
+
+ state = decode_state(buf, n);
+ if (state < PM_SUSPEND_MAX)
+ error = pm_suspend(state);
+ else if (state == PM_SUSPEND_MAX)
+ error = hibernate();
+ else
+ error = -EINVAL;
+
+ out:
+ pm_autosleep_unlock();
return error ? error : n;
}
@@ -339,7 +359,8 @@ static ssize_t wakeup_count_show(struct kobject *kobj,
{
unsigned int val;
- return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR;
+ return pm_get_wakeup_count(&val, true) ?
+ sprintf(buf, "%u\n", val) : -EINTR;
}
static ssize_t wakeup_count_store(struct kobject *kobj,
@@ -347,15 +368,106 @@ static ssize_t wakeup_count_store(struct kobject *kobj,
const char *buf, size_t n)
{
unsigned int val;
+ int error;
+
+ error = pm_autosleep_lock();
+ if (error)
+ return error;
+
+ if (pm_autosleep_state() > PM_SUSPEND_ON) {
+ error = -EBUSY;
+ goto out;
+ }
+ error = -EINVAL;
if (sscanf(buf, "%u", &val) == 1) {
if (pm_save_wakeup_count(val))
- return n;
+ error = n;
}
- return -EINVAL;
+
+ out:
+ pm_autosleep_unlock();
+ return error;
}
power_attr(wakeup_count);
+
+#ifdef CONFIG_PM_AUTOSLEEP
+static ssize_t autosleep_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ suspend_state_t state = pm_autosleep_state();
+
+ if (state == PM_SUSPEND_ON)
+ return sprintf(buf, "off\n");
+
+#ifdef CONFIG_SUSPEND
+ if (state < PM_SUSPEND_MAX)
+ return sprintf(buf, "%s\n", valid_state(state) ?
+ pm_states[state] : "error");
+#endif
+#ifdef CONFIG_HIBERNATION
+ return sprintf(buf, "disk\n");
+#else
+ return sprintf(buf, "error");
+#endif
+}
+
+static ssize_t autosleep_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ suspend_state_t state = decode_state(buf, n);
+ int error;
+
+ if (state == PM_SUSPEND_ON
+ && strcmp(buf, "off") && strcmp(buf, "off\n"))
+ return -EINVAL;
+
+ error = pm_autosleep_set_state(state);
+ return error ? error : n;
+}
+
+power_attr(autosleep);
+#endif /* CONFIG_PM_AUTOSLEEP */
+
+#ifdef CONFIG_PM_WAKELOCKS
+static ssize_t wake_lock_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return pm_show_wakelocks(buf, true);
+}
+
+static ssize_t wake_lock_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ int error = pm_wake_lock(buf);
+ return error ? error : n;
+}
+
+power_attr(wake_lock);
+
+static ssize_t wake_unlock_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return pm_show_wakelocks(buf, false);
+}
+
+static ssize_t wake_unlock_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ int error = pm_wake_unlock(buf);
+ return error ? error : n;
+}
+
+power_attr(wake_unlock);
+
+#endif /* CONFIG_PM_WAKELOCKS */
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_PM_TRACE
@@ -409,6 +521,13 @@ static struct attribute * g[] = {
#ifdef CONFIG_PM_SLEEP
&pm_async_attr.attr,
&wakeup_count_attr.attr,
+#ifdef CONFIG_PM_AUTOSLEEP
+ &autosleep_attr.attr,
+#endif
+#ifdef CONFIG_PM_WAKELOCKS
+ &wake_lock_attr.attr,
+ &wake_unlock_attr.attr,
+#endif
#ifdef CONFIG_PM_DEBUG
&pm_test_attr.attr,
#endif
@@ -444,7 +563,10 @@ static int __init pm_init(void)
power_kobj = kobject_create_and_add("power", NULL);
if (!power_kobj)
return -ENOMEM;
- return sysfs_create_group(power_kobj, &attr_group);
+ error = sysfs_create_group(power_kobj, &attr_group);
+ if (error)
+ return error;
+ return pm_autosleep_init();
}
core_initcall(pm_init);
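
Taken together, the new sysfs files give user space the Android-style suspend workflow: hold a wakelock while work is pending, then let autosleep opportunistically suspend. A minimal user-space sketch (illustrative; the "name timeout-in-nanoseconds" wake_lock syntax is assumed from the Android-compatible interface that wakelock.c implements below):

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static void write_str(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);

	if (fd >= 0) {
		write(fd, val, strlen(val));
		close(fd);
	}
}

int main(void)
{
	write_str("/sys/power/wake_lock", "mylock 5000000000");	/* 5 s */
	write_str("/sys/power/autosleep", "mem");
	return 0;
}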
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 98f3622d7407..b0bd4beaebfe 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -264,3 +264,30 @@ static inline void suspend_thaw_processes(void)
{
}
#endif
+
+#ifdef CONFIG_PM_AUTOSLEEP
+
+/* kernel/power/autosleep.c */
+extern int pm_autosleep_init(void);
+extern int pm_autosleep_lock(void);
+extern void pm_autosleep_unlock(void);
+extern suspend_state_t pm_autosleep_state(void);
+extern int pm_autosleep_set_state(suspend_state_t state);
+
+#else /* !CONFIG_PM_AUTOSLEEP */
+
+static inline int pm_autosleep_init(void) { return 0; }
+static inline int pm_autosleep_lock(void) { return 0; }
+static inline void pm_autosleep_unlock(void) {}
+static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; }
+
+#endif /* !CONFIG_PM_AUTOSLEEP */
+
+#ifdef CONFIG_PM_WAKELOCKS
+
+/* kernel/power/wakelock.c */
+extern ssize_t pm_show_wakelocks(char *buf, bool show_active);
+extern int pm_wake_lock(const char *buf);
+extern int pm_wake_unlock(const char *buf);
+
+#endif /* !CONFIG_PM_WAKELOCKS */
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
index 6a031e684026..834fbfc61398 100644
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -101,11 +101,72 @@ static struct pm_qos_object network_throughput_pm_qos = {
};
+static BLOCKING_NOTIFIER_HEAD(min_online_cpus_notifier);
+static struct pm_qos_constraints min_online_cpus_constraints = {
+ .list = PLIST_HEAD_INIT(min_online_cpus_constraints.list),
+ .target_value = PM_QOS_MIN_ONLINE_CPUS_DEFAULT_VALUE,
+ .default_value = PM_QOS_MIN_ONLINE_CPUS_DEFAULT_VALUE,
+ .type = PM_QOS_MAX,
+ .notifiers = &min_online_cpus_notifier,
+};
+static struct pm_qos_object min_online_cpus_pm_qos = {
+ .constraints = &min_online_cpus_constraints,
+ .name = "min_online_cpus",
+};
+
+
+static BLOCKING_NOTIFIER_HEAD(max_online_cpus_notifier);
+static struct pm_qos_constraints max_online_cpus_constraints = {
+ .list = PLIST_HEAD_INIT(max_online_cpus_constraints.list),
+ .target_value = PM_QOS_MAX_ONLINE_CPUS_DEFAULT_VALUE,
+ .default_value = PM_QOS_MAX_ONLINE_CPUS_DEFAULT_VALUE,
+ .type = PM_QOS_MIN,
+ .notifiers = &max_online_cpus_notifier,
+};
+static struct pm_qos_object max_online_cpus_pm_qos = {
+ .constraints = &max_online_cpus_constraints,
+ .name = "max_online_cpus",
+
+};
+
+
+static BLOCKING_NOTIFIER_HEAD(cpu_freq_min_notifier);
+static struct pm_qos_constraints cpu_freq_min_constraints = {
+ .list = PLIST_HEAD_INIT(cpu_freq_min_constraints.list),
+ .target_value = PM_QOS_CPU_FREQ_MIN_DEFAULT_VALUE,
+ .default_value = PM_QOS_CPU_FREQ_MIN_DEFAULT_VALUE,
+ .type = PM_QOS_MAX,
+ .notifiers = &cpu_freq_min_notifier,
+};
+static struct pm_qos_object cpu_freq_min_pm_qos = {
+ .constraints = &cpu_freq_min_constraints,
+ .name = "cpu_freq_min",
+};
+
+
+static BLOCKING_NOTIFIER_HEAD(cpu_freq_max_notifier);
+static struct pm_qos_constraints cpu_freq_max_constraints = {
+ .list = PLIST_HEAD_INIT(cpu_freq_max_constraints.list),
+ .target_value = PM_QOS_CPU_FREQ_MAX_DEFAULT_VALUE,
+ .default_value = PM_QOS_CPU_FREQ_MAX_DEFAULT_VALUE,
+ .type = PM_QOS_MIN,
+ .notifiers = &cpu_freq_max_notifier,
+};
+static struct pm_qos_object cpu_freq_max_pm_qos = {
+ .constraints = &cpu_freq_max_constraints,
+ .name = "cpu_freq_max",
+};
+
+
static struct pm_qos_object *pm_qos_array[] = {
&null_pm_qos,
&cpu_dma_pm_qos,
&network_lat_pm_qos,
- &network_throughput_pm_qos
+ &network_throughput_pm_qos,
+ &min_online_cpus_pm_qos,
+ &max_online_cpus_pm_qos,
+ &cpu_freq_min_pm_qos,
+ &cpu_freq_max_pm_qos
};
static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index c8b7446b27df..7a2bb5beda6c 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -24,6 +24,7 @@
#include <linux/export.h>
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
+#include <linux/rtc.h>
#include <linux/ftrace.h>
#include <trace/events/power.h>
@@ -303,6 +304,18 @@ static int enter_state(suspend_state_t state)
return error;
}
+static void pm_suspend_marker(char *annotation)
+{
+ struct timespec ts;
+ struct rtc_time tm;
+
+ getnstimeofday(&ts);
+ rtc_time_to_tm(ts.tv_sec, &tm);
+ pr_info("PM: suspend %s %d-%02d-%02d %02d:%02d:%02d.%09lu UTC\n",
+ annotation, tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec, ts.tv_nsec);
+}
+
/**
* pm_suspend - Externally visible function for suspending the system.
* @state: System sleep state to enter.
@@ -317,6 +330,7 @@ int pm_suspend(suspend_state_t state)
if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
return -EINVAL;
+ pm_suspend_marker("entry");
error = enter_state(state);
if (error) {
suspend_stats.fail++;
@@ -324,6 +338,7 @@ int pm_suspend(suspend_state_t state)
} else {
suspend_stats.success++;
}
+ pm_suspend_marker("exit");
return error;
}
EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/suspend_time.c b/kernel/power/suspend_time.c
new file mode 100644
index 000000000000..d2a65da9f22c
--- /dev/null
+++ b/kernel/power/suspend_time.c
@@ -0,0 +1,111 @@
+/*
+ * debugfs file to track time spent in suspend
+ *
+ * Copyright (c) 2011, Google, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/seq_file.h>
+#include <linux/syscore_ops.h>
+#include <linux/time.h>
+
+static struct timespec suspend_time_before;
+static unsigned int time_in_suspend_bins[32];
+
+#ifdef CONFIG_DEBUG_FS
+static int suspend_time_debug_show(struct seq_file *s, void *data)
+{
+ int bin;
+ seq_printf(s, "time (secs) count\n");
+ seq_printf(s, "------------------\n");
+ for (bin = 0; bin < 32; bin++) {
+ if (time_in_suspend_bins[bin] == 0)
+ continue;
+ seq_printf(s, "%4d - %4d %4u\n",
+ bin ? 1 << (bin - 1) : 0, 1 << bin,
+ time_in_suspend_bins[bin]);
+ }
+ return 0;
+}
+
+static int suspend_time_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, suspend_time_debug_show, NULL);
+}
+
+static const struct file_operations suspend_time_debug_fops = {
+ .open = suspend_time_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init suspend_time_debug_init(void)
+{
+ struct dentry *d;
+
+ d = debugfs_create_file("suspend_time", 0755, NULL, NULL,
+ &suspend_time_debug_fops);
+ if (!d) {
+ pr_err("Failed to create suspend_time debug file\n");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+late_initcall(suspend_time_debug_init);
+#endif
+
+static int suspend_time_syscore_suspend(void)
+{
+ read_persistent_clock(&suspend_time_before);
+
+ return 0;
+}
+
+static void suspend_time_syscore_resume(void)
+{
+ struct timespec after;
+
+ read_persistent_clock(&after);
+
+ after = timespec_sub(after, suspend_time_before);
+
+ time_in_suspend_bins[fls(after.tv_sec)]++;
+
+ pr_info("Suspended for %lu.%03lu seconds\n", after.tv_sec,
+ after.tv_nsec / NSEC_PER_MSEC);
+}
+
+static struct syscore_ops suspend_time_syscore_ops = {
+ .suspend = suspend_time_syscore_suspend,
+ .resume = suspend_time_syscore_resume,
+};
+
+static int suspend_time_syscore_init(void)
+{
+ register_syscore_ops(&suspend_time_syscore_ops);
+
+ return 0;
+}
+
+static void suspend_time_syscore_exit(void)
+{
+ unregister_syscore_ops(&suspend_time_syscore_ops);
+}
+module_init(suspend_time_syscore_init);
+module_exit(suspend_time_syscore_exit);
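
As a worked example of the binning above: a 37-second suspend has fls(37) = 6 (37 = 0b100101), so it increments time_in_suspend_bins[6] and appears on the "32 - 64" row of the debugfs histogram, while a sub-second suspend lands in bin 0, printed as "0 - 1".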
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index eef311a58a64..11e22c068e8b 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -6,7 +6,7 @@
*
* Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz>
* Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
- * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com>
+ * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com>
*
* This file is released under the GPLv2.
*
@@ -282,14 +282,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
return -ENOSPC;
if (bio_chain) {
- src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+ src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN |
+ __GFP_NORETRY);
if (src) {
copy_page(src, buf);
} else {
ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */
if (ret)
return ret;
- src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
+ src = (void *)__get_free_page(__GFP_WAIT |
+ __GFP_NOWARN |
+ __GFP_NORETRY);
if (src) {
copy_page(src, buf);
} else {
@@ -367,12 +370,17 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
clear_page(handle->cur);
handle->cur_swap = offset;
handle->k = 0;
- }
- if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
- error = hib_wait_on_bio_chain(bio_chain);
- if (error)
- goto out;
- handle->reqd_free_pages = reqd_free_pages();
+
+ if (bio_chain && low_free_pages() <= handle->reqd_free_pages) {
+ error = hib_wait_on_bio_chain(bio_chain);
+ if (error)
+ goto out;
+ /*
+ * Recalculate the number of required free pages, to
+ * make sure we never take more than half.
+ */
+ handle->reqd_free_pages = reqd_free_pages();
+ }
}
out:
return error;
@@ -419,8 +427,9 @@ static int swap_writer_finish(struct swap_map_handle *handle,
/* Maximum number of threads for compression/decompression. */
#define LZO_THREADS 3
-/* Maximum number of pages for read buffering. */
-#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8)
+/* Minimum/maximum number of pages for read buffering. */
+#define LZO_MIN_RD_PAGES 1024
+#define LZO_MAX_RD_PAGES 8192
/**
@@ -631,12 +640,6 @@ static int save_image_lzo(struct swap_map_handle *handle,
}
/*
- * Adjust number of free pages after all allocations have been done.
- * We don't want to run out of pages when writing.
- */
- handle->reqd_free_pages = reqd_free_pages();
-
- /*
* Start the CRC32 thread.
*/
init_waitqueue_head(&crc->go);
@@ -657,6 +660,12 @@ static int save_image_lzo(struct swap_map_handle *handle,
goto out_clean;
}
+ /*
+ * Adjust the number of required free pages after all allocations have
+ * been done. We don't want to run out of pages when writing.
+ */
+ handle->reqd_free_pages = reqd_free_pages();
+
printk(KERN_INFO
"PM: Using %u thread(s) for compression.\n"
"PM: Compressing and saving image data (%u pages) ... ",
@@ -1067,7 +1076,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
unsigned i, thr, run_threads, nr_threads;
unsigned ring = 0, pg = 0, ring_size = 0,
have = 0, want, need, asked = 0;
- unsigned long read_pages;
+ unsigned long read_pages = 0;
unsigned char **page = NULL;
struct dec_data *data = NULL;
struct crc_data *crc = NULL;
@@ -1079,7 +1088,7 @@ static int load_image_lzo(struct swap_map_handle *handle,
nr_threads = num_online_cpus() - 1;
nr_threads = clamp_val(nr_threads, 1, LZO_THREADS);
- page = vmalloc(sizeof(*page) * LZO_READ_PAGES);
+ page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES);
if (!page) {
printk(KERN_ERR "PM: Failed to allocate LZO page\n");
ret = -ENOMEM;
@@ -1144,15 +1153,22 @@ static int load_image_lzo(struct swap_map_handle *handle,
}
/*
- * Adjust number of pages for read buffering, in case we are short.
+ * Set the number of pages for read buffering.
+ * This is complete guesswork, because we'll only know the real
+ * picture once prepare_image() is called, which is much later on
+ * during the image load phase. We'll assume the worst case and
+ * say that none of the image pages are from high memory.
*/
- read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1;
- read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES);
+ if (low_free_pages() > snapshot_get_image_size())
+ read_pages = (low_free_pages() - snapshot_get_image_size()) / 2;
+ read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES);
for (i = 0; i < read_pages; i++) {
page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ?
__GFP_WAIT | __GFP_HIGH :
- __GFP_WAIT);
+ __GFP_WAIT | __GFP_NOWARN |
+ __GFP_NORETRY);
+
if (!page[i]) {
if (i < LZO_CMP_PAGES) {
ring_size = i;
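
A quick sanity check of the new read-buffer sizing: with, say, 200000 free lowmem pages and a 120000-page image, read_pages = (200000 - 120000) / 2 = 40000, which the clamp reduces to LZO_MAX_RD_PAGES = 8192 (32 MB of buffering with 4 KiB pages). On a tight system where low_free_pages() does not exceed the image size, read_pages stays 0 and the clamp raises it to LZO_MIN_RD_PAGES = 1024.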
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
new file mode 100644
index 000000000000..c8fba3380076
--- /dev/null
+++ b/kernel/power/wakelock.c
@@ -0,0 +1,259 @@
+/*
+ * kernel/power/wakelock.c
+ *
+ * User space wakeup sources support.
+ *
+ * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl>
+ *
+ * This code is based on the analogous interface allowing user space to
+ * manipulate wakelocks on Android.
+ */
+
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/hrtimer.h>
+#include <linux/list.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+
+static DEFINE_MUTEX(wakelocks_lock);
+
+struct wakelock {
+ char *name;
+ struct rb_node node;
+ struct wakeup_source ws;
+#ifdef CONFIG_PM_WAKELOCKS_GC
+ struct list_head lru;
+#endif
+};
+
+static struct rb_root wakelocks_tree = RB_ROOT;
+
+ssize_t pm_show_wakelocks(char *buf, bool show_active)
+{
+ struct rb_node *node;
+ struct wakelock *wl;
+ char *str = buf;
+ char *end = buf + PAGE_SIZE;
+
+ mutex_lock(&wakelocks_lock);
+
+ for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) {
+ wl = rb_entry(node, struct wakelock, node);
+ if (wl->ws.active == show_active)
+ str += scnprintf(str, end - str, "%s ", wl->name);
+ }
+ if (str > buf)
+ str--;
+
+ str += scnprintf(str, end - str, "\n");
+
+ mutex_unlock(&wakelocks_lock);
+ return (str - buf);
+}
+
+#if CONFIG_PM_WAKELOCKS_LIMIT > 0
+static unsigned int number_of_wakelocks;
+
+static inline bool wakelocks_limit_exceeded(void)
+{
+ return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT;
+}
+
+static inline void increment_wakelocks_number(void)
+{
+ number_of_wakelocks++;
+}
+
+static inline void decrement_wakelocks_number(void)
+{
+ number_of_wakelocks--;
+}
+#else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */
+static inline bool wakelocks_limit_exceeded(void) { return false; }
+static inline void increment_wakelocks_number(void) {}
+static inline void decrement_wakelocks_number(void) {}
+#endif /* CONFIG_PM_WAKELOCKS_LIMIT */
+
+#ifdef CONFIG_PM_WAKELOCKS_GC
+#define WL_GC_COUNT_MAX 100
+#define WL_GC_TIME_SEC 300
+
+static LIST_HEAD(wakelocks_lru_list);
+static unsigned int wakelocks_gc_count;
+
+static inline void wakelocks_lru_add(struct wakelock *wl)
+{
+ list_add(&wl->lru, &wakelocks_lru_list);
+}
+
+static inline void wakelocks_lru_most_recent(struct wakelock *wl)
+{
+ list_move(&wl->lru, &wakelocks_lru_list);
+}
+
+static void wakelocks_gc(void)
+{
+ struct wakelock *wl, *aux;
+ ktime_t now;
+
+ if (++wakelocks_gc_count <= WL_GC_COUNT_MAX)
+ return;
+
+ now = ktime_get();
+ list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) {
+ u64 idle_time_ns;
+ bool active;
+
+ spin_lock_irq(&wl->ws.lock);
+ idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time));
+ active = wl->ws.active;
+ spin_unlock_irq(&wl->ws.lock);
+
+ if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC))
+ break;
+
+ if (!active) {
+ wakeup_source_remove(&wl->ws);
+ rb_erase(&wl->node, &wakelocks_tree);
+ list_del(&wl->lru);
+ kfree(wl->name);
+ kfree(wl);
+ decrement_wakelocks_number();
+ }
+ }
+ wakelocks_gc_count = 0;
+}
+#else /* !CONFIG_PM_WAKELOCKS_GC */
+static inline void wakelocks_lru_add(struct wakelock *wl) {}
+static inline void wakelocks_lru_most_recent(struct wakelock *wl) {}
+static inline void wakelocks_gc(void) {}
+#endif /* !CONFIG_PM_WAKELOCKS_GC */
+
+static struct wakelock *wakelock_lookup_add(const char *name, size_t len,
+ bool add_if_not_found)
+{
+ struct rb_node **node = &wakelocks_tree.rb_node;
+ struct rb_node *parent = *node;
+ struct wakelock *wl;
+
+ while (*node) {
+ int diff;
+
+ parent = *node;
+ wl = rb_entry(*node, struct wakelock, node);
+ diff = strncmp(name, wl->name, len);
+ if (diff == 0) {
+ if (wl->name[len])
+ diff = -1;
+ else
+ return wl;
+ }
+ if (diff < 0)
+ node = &(*node)->rb_left;
+ else
+ node = &(*node)->rb_right;
+ }
+ if (!add_if_not_found)
+ return ERR_PTR(-EINVAL);
+
+ if (wakelocks_limit_exceeded())
+ return ERR_PTR(-ENOSPC);
+
+ /* Not found, we have to add a new one. */
+ wl = kzalloc(sizeof(*wl), GFP_KERNEL);
+ if (!wl)
+ return ERR_PTR(-ENOMEM);
+
+ wl->name = kstrndup(name, len, GFP_KERNEL);
+ if (!wl->name) {
+ kfree(wl);
+ return ERR_PTR(-ENOMEM);
+ }
+ wl->ws.name = wl->name;
+ wakeup_source_add(&wl->ws);
+ rb_link_node(&wl->node, parent, node);
+ rb_insert_color(&wl->node, &wakelocks_tree);
+ wakelocks_lru_add(wl);
+ increment_wakelocks_number();
+ return wl;
+}
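The lookup key passed in is not NUL-terminated (len delimits it), so a strncmp() match only counts when the stored name also ends at len; otherwise the stored name is longer and the key sorts before it. A stand-alone sketch of that ordering, mirroring the loop above:

    #include <string.h>

    /* Returns <0, 0 or >0 when the len-byte key sorts before, matches,
     * or sorts after the NUL-terminated stored name. */
    static int wakelock_cmp(const char *key, size_t len, const char *stored)
    {
            int diff = strncmp(key, stored, len);

            if (diff == 0 && stored[len] != '\0')
                    diff = -1;      /* stored name is longer: key sorts first */
            return diff;
    }

    /* wakelock_cmp("abc", 3, "abc")  == 0  -> match
     * wakelock_cmp("abc", 3, "abcd") <  0  -> descend left
     * wakelock_cmp("abd", 3, "abc")  >  0  -> descend right */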
+
+int pm_wake_lock(const char *buf)
+{
+ const char *str = buf;
+ struct wakelock *wl;
+ u64 timeout_ns = 0;
+ size_t len;
+ int ret = 0;
+
+ while (*str && !isspace(*str))
+ str++;
+
+ len = str - buf;
+ if (!len)
+ return -EINVAL;
+
+ if (*str && *str != '\n') {
+ /* Find out if there's a valid timeout string appended. */
+ ret = kstrtou64(skip_spaces(str), 10, &timeout_ns);
+ if (ret)
+ return -EINVAL;
+ }
+
+ mutex_lock(&wakelocks_lock);
+
+ wl = wakelock_lookup_add(buf, len, true);
+ if (IS_ERR(wl)) {
+ ret = PTR_ERR(wl);
+ goto out;
+ }
+ if (timeout_ns) {
+ u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1;
+
+ do_div(timeout_ms, NSEC_PER_MSEC);
+ __pm_wakeup_event(&wl->ws, timeout_ms);
+ } else {
+ __pm_stay_awake(&wl->ws);
+ }
+
+ wakelocks_lru_most_recent(wl);
+
+ out:
+ mutex_unlock(&wakelocks_lock);
+ return ret;
+}
+
+int pm_wake_unlock(const char *buf)
+{
+ struct wakelock *wl;
+ size_t len;
+ int ret = 0;
+
+ len = strlen(buf);
+ if (!len)
+ return -EINVAL;
+
+ if (buf[len-1] == '\n')
+ len--;
+
+ if (!len)
+ return -EINVAL;
+
+ mutex_lock(&wakelocks_lock);
+
+ wl = wakelock_lookup_add(buf, len, false);
+ if (IS_ERR(wl)) {
+ ret = PTR_ERR(wl);
+ goto out;
+ }
+ __pm_relax(&wl->ws);
+
+ wakelocks_lru_most_recent(wl);
+ wakelocks_gc();
+
+ out:
+ mutex_unlock(&wakelocks_lock);
+ return ret;
+}
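For context, pm_wake_lock() and pm_wake_unlock() are driven from user space; assuming the /sys/power/wake_lock and /sys/power/wake_unlock attributes that this series wires up in kernel/power/main.c, a minimal user-space sketch looks like:

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    static int write_str(const char *path, const char *s)
    {
            int fd = open(path, O_WRONLY);

            if (fd < 0 || write(fd, s, strlen(s)) < 0) {
                    perror(path);
                    if (fd >= 0)
                            close(fd);
                    return -1;
            }
            return close(fd);
    }

    int main(void)
    {
            /* Acquire "mylock" with a 5 s (5e9 ns) auto-expiring timeout... */
            write_str("/sys/power/wake_lock", "mylock 5000000000");
            /* ...or release it explicitly before the timeout fires. */
            write_str("/sys/power/wake_unlock", "mylock");
            return 0;
    }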
diff --git a/kernel/printk.c b/kernel/printk.c
index b663c2c95d39..7a8b101b237a 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -56,6 +56,10 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
+#ifdef CONFIG_DEBUG_LL
+extern void printascii(char *);
+#endif
+
/* printk's without a loglevel use this.. */
#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
@@ -293,6 +297,53 @@ static inline void boot_delay_msec(void)
}
#endif
+/*
+ * Return the number of unread characters in the log buffer.
+ */
+static int log_buf_get_len(void)
+{
+ return logged_chars;
+}
+
+/*
+ * Clear the ring buffer.
+ */
+void log_buf_clear(void)
+{
+ logged_chars = 0;
+}
+
+/*
+ * Copy a range of characters from the log buffer.
+ */
+int log_buf_copy(char *dest, int idx, int len)
+{
+ int ret, max;
+ bool took_lock = false;
+
+ if (!oops_in_progress) {
+ raw_spin_lock_irq(&logbuf_lock);
+ took_lock = true;
+ }
+
+ max = log_buf_get_len();
+ if (idx < 0 || idx >= max) {
+ ret = -1;
+ } else {
+ if (len > max - idx)
+ len = max - idx;
+ ret = len;
+ idx += (log_end - max);
+ while (len-- > 0)
+ dest[len] = LOG_BUF(idx + len);
+ }
+
+ if (took_lock)
+ raw_spin_unlock_irq(&logbuf_lock);
+
+ return ret;
+}
+
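A hypothetical in-kernel caller would drain the buffer in chunks, treating the return value as the byte count copied and -1 as end of data (sketch only; note the destination is a kernel buffer and is not NUL-terminated):

    static void drain_log_buf(void)
    {
            char chunk[256];
            int idx = 0, n;

            while ((n = log_buf_copy(chunk, idx, sizeof(chunk))) > 0) {
                    /* consume n bytes starting at chunk[0], e.g. feed
                     * them to a crash-dump or debug channel */
                    idx += n;
            }
    }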
#ifdef CONFIG_SECURITY_DMESG_RESTRICT
int dmesg_restrict = 1;
#else
@@ -884,6 +935,10 @@ asmlinkage int vprintk(const char *fmt, va_list args)
printed_len += vscnprintf(printk_buf + printed_len,
sizeof(printk_buf) - printed_len, fmt, args);
+#ifdef CONFIG_DEBUG_LL
+ printascii(printk_buf);
+#endif
+
p = printk_buf;
/* Read log level and handle special printk prefix */
@@ -959,7 +1014,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* Try to acquire and then immediately release the
* console semaphore. The release will do all the
* actual magic (print out buffers, wake up klogd,
- * etc).
+ * etc).
*
* The console_trylock_for_printk() function
* will release 'logbuf_lock' regardless of whether it
@@ -1161,7 +1216,6 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self,
switch (action) {
case CPU_ONLINE:
case CPU_DEAD:
- case CPU_DYING:
case CPU_DOWN_FAILED:
case CPU_UP_CANCELED:
console_lock();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 817bf7018834..eb15edd08a23 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2161,6 +2161,32 @@ unsigned long this_cpu_load(void)
return this->cpu_load[0];
}
+unsigned long avg_nr_running(void)
+{
+ unsigned long i, sum = 0;
+ unsigned int seqcnt, ave_nr_running;
+
+ for_each_online_cpu(i) {
+ struct rq *q = cpu_rq(i);
+
+ /*
+ * Update the average to avoid reading a stale value if there were
+ * no run-queue changes for a long time. On the other hand if
+ * the changes are happening right now, just read current value
+ * directly.
+ */
+ seqcnt = read_seqcount_begin(&q->ave_seqcnt);
+ ave_nr_running = do_avg_nr_running(q);
+ if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) {
+ read_seqcount_begin(&q->ave_seqcnt);
+ ave_nr_running = q->ave_nr_running;
+ }
+
+ sum += ave_nr_running;
+ }
+
+ return sum;
+}
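avg_nr_running() yields a system-wide, time-smoothed run-queue depth in FSHIFT fixed point. A hypothetical consumer (say, a load-based governor), sketched under that assumption:

    /* True when the smoothed number of runnable tasks exceeds one per
     * online CPU; avg_nr_running() is scaled by 1 << FSHIFT (see
     * NR_AVE_SCALE in kernel/sched/sched.h). */
    static bool system_oversubscribed(void)
    {
            return avg_nr_running() >
                   ((unsigned long)num_online_cpus() << FSHIFT);
    }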
/*
* Global load-average calculations
@@ -7203,13 +7229,24 @@ static inline int preempt_count_equals(int preempt_offset)
return (nested == preempt_offset);
}
+static int __might_sleep_init_called;
+int __init __might_sleep_init(void)
+{
+ __might_sleep_init_called = 1;
+ return 0;
+}
+early_initcall(__might_sleep_init);
+
void __might_sleep(const char *file, int line, int preempt_offset)
{
static unsigned long prev_jiffy; /* ratelimiting */
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ oops_in_progress)
+ return;
+ if (system_state != SYSTEM_RUNNING &&
+ (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
@@ -7762,6 +7799,23 @@ static void cpu_cgroup_destroy(struct cgroup *cgrp)
sched_destroy_group(tg);
}
+static int
+cpu_cgroup_allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
+{
+ const struct cred *cred = current_cred(), *tcred;
+ struct task_struct *task;
+
+ cgroup_taskset_for_each(task, cgrp, tset) {
+ tcred = __task_cred(task);
+
+ if ((current != task) && !capable(CAP_SYS_NICE) &&
+ cred->euid != tcred->uid && cred->euid != tcred->suid)
+ return -EACCES;
+ }
+
+ return 0;
+}
+
static int cpu_cgroup_can_attach(struct cgroup *cgrp,
struct cgroup_taskset *tset)
{
@@ -8123,6 +8177,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.destroy = cpu_cgroup_destroy,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
+ .allow_attach = cpu_cgroup_allow_attach,
.exit = cpu_cgroup_exit,
.populate = cpu_cgroup_populate,
.subsys_id = cpu_cgroup_subsys_id,
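The practical effect of allow_attach: a task may always move itself, and a writer holding CAP_SYS_NICE or whose euid matches the target's uid/suid may move others; everyone else gets -EACCES. A hypothetical user-space probe (v1 hierarchy mount point assumed):

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            int fd = open("/sys/fs/cgroup/cpu/bg/tasks", O_WRONLY);
            char pid[16];
            int n;

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            n = snprintf(pid, sizeof(pid), "%d", getpid());
            /* Moving ourselves always passes the check above; moving an
             * unrelated task without CAP_SYS_NICE or a matching euid
             * fails with EACCES. */
            if (write(fd, pid, n) < 0)
                    perror("write");
            close(fd);
            return 0;
    }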
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 09acaa15161d..06d172eb5cea 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -266,6 +266,9 @@ static void print_cpu(struct seq_file *m, int cpu)
SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x))
P(nr_running);
+ SEQ_printf(m, " .%-30s: %d.%03d \n", "ave_nr_running",
+ rq->ave_nr_running / FIXED_1,
+ ((rq->ave_nr_running % FIXED_1) * 1000) / FIXED_1);
SEQ_printf(m, " .%-30s: %lu\n", "load",
rq->load.weight);
P(nr_switches);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 44af55e6d5d0..be427c5bc4d7 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -685,6 +685,7 @@ balanced:
* runtime - in which case borrowing doesn't make sense.
*/
rt_rq->rt_runtime = RUNTIME_INF;
+ rt_rq->rt_throttled = 0;
raw_spin_unlock(&rt_rq->rt_runtime_lock);
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
@@ -1983,6 +1984,8 @@ static void watchdog(struct rq *rq, struct task_struct *p)
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
+ struct sched_rt_entity *rt_se = &p->rt;
+
update_curr_rt(rq);
watchdog(rq, p);
@@ -2000,12 +2003,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
p->rt.time_slice = RR_TIMESLICE;
/*
- * Requeue to the end of queue if we are not the only element
- * on the queue:
+ * Requeue to the end of queue if we (and all of our ancestors) are not
+ * the only element on the queue
*/
- if (p->rt.run_list.prev != p->rt.run_list.next) {
- requeue_task_rt(rq, p, 0);
- set_tsk_need_resched(p);
+ for_each_sched_rt_entity(rt_se) {
+ if (rt_se->run_list.prev != rt_se->run_list.next) {
+ requeue_task_rt(rq, p, 0);
+ set_tsk_need_resched(p);
+ return;
+ }
}
}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 116ced06ecc0..ef5a1ff65196 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -363,6 +363,11 @@ struct rq {
#endif
int skip_clock_update;
+ /* time-based average load */
+ u64 nr_last_stamp;
+ unsigned int ave_nr_running;
+ seqcount_t ave_seqcnt;
+
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
unsigned long nr_load_updates;
@@ -914,14 +919,49 @@ extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
#endif
+/* Decay period for the run-queue average, as a power of two:
+ * 1 << 27 = 134217728 ns ~= 134.2 ms
+ * 1 << 26 = 67108864 ns ~= 67.1 ms
+ * 1 << 25 = 33554432 ns ~= 33.5 ms
+ * 1 << 24 = 16777216 ns ~= 16.8 ms
+ */
+#define NR_AVE_PERIOD_EXP 27
+#define NR_AVE_SCALE(x) ((x) << FSHIFT)
+#define NR_AVE_PERIOD (1 << NR_AVE_PERIOD_EXP)
+#define NR_AVE_DIV_PERIOD(x) ((x) >> NR_AVE_PERIOD_EXP)
+
+static inline unsigned int do_avg_nr_running(struct rq *rq)
+{
+ s64 nr, deltax;
+ unsigned int ave_nr_running = rq->ave_nr_running;
+
+ deltax = rq->clock_task - rq->nr_last_stamp;
+ nr = NR_AVE_SCALE(rq->nr_running);
+
+ if (deltax > NR_AVE_PERIOD)
+ ave_nr_running = nr;
+ else
+ ave_nr_running +=
+ NR_AVE_DIV_PERIOD(deltax * (nr - ave_nr_running));
+
+ return ave_nr_running;
+}
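do_avg_nr_running() is a fixed-point exponential moving average: avg += delta * (x - avg) / period, with the divide done by shift. The same arithmetic in a stand-alone sketch (FSHIFT assumed to be 11, as in <linux/sched.h>):

    #include <stdio.h>
    #include <stdint.h>

    #define FSHIFT            11            /* assumed, as in <linux/sched.h> */
    #define NR_AVE_PERIOD_EXP 27
    #define NR_AVE_PERIOD     (1LL << NR_AVE_PERIOD_EXP)

    static unsigned int update_avg(unsigned int avg, unsigned int nr_running,
                                   int64_t delta_ns)
    {
            int64_t nr = (int64_t)nr_running << FSHIFT;

            if (delta_ns > NR_AVE_PERIOD)
                    return nr;      /* quiet long enough: snap to the new value */
            return avg + ((delta_ns * (nr - avg)) >> NR_AVE_PERIOD_EXP);
    }

    int main(void)
    {
            /* 2 runnable tasks observed for half the decay period (1 << 26 ns):
             * the average moves halfway from 0 toward 2.0 (4096 fixed point). */
            unsigned int avg = update_avg(0, 2, 1LL << 26);

            printf("%u.%03u\n", avg >> FSHIFT,
                   ((avg & ((1 << FSHIFT) - 1)) * 1000) >> FSHIFT);  /* 1.000 */
            return 0;
    }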
+
static inline void inc_nr_running(struct rq *rq)
{
+ write_seqcount_begin(&rq->ave_seqcnt);
+ rq->ave_nr_running = do_avg_nr_running(rq);
+ rq->nr_last_stamp = rq->clock_task;
rq->nr_running++;
+ write_seqcount_end(&rq->ave_seqcnt);
}
static inline void dec_nr_running(struct rq *rq)
{
+ write_seqcount_begin(&rq->ave_seqcnt);
+ rq->ave_nr_running = do_avg_nr_running(rq);
+ rq->nr_last_stamp = rq->clock_task;
rq->nr_running--;
+ write_seqcount_end(&rq->ave_seqcnt);
}
extern void update_rq_clock(struct rq *rq);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4ab11879aeb4..49f472582722 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -102,6 +102,7 @@ extern char core_pattern[];
extern unsigned int core_pipe_limit;
extern int pid_max;
extern int min_free_kbytes;
+extern int min_free_order_shift;
extern int pid_max_min, pid_max_max;
extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
@@ -1199,6 +1200,13 @@ static struct ctl_table vm_table[] = {
.extra1 = &zero,
},
{
+ .procname = "min_free_order_shift",
+ .data = &min_free_order_shift,
+ .maxlen = sizeof(min_free_order_shift),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec
+ },
+ {
.procname = "percpu_pagelist_fraction",
.data = &percpu_pagelist_fraction,
.maxlen = sizeof(percpu_pagelist_fraction),
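Entries in vm_table surface under /proc/sys/vm/, so the new knob is readable and (as root) writable at /proc/sys/vm/min_free_order_shift. A minimal reader:

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/sys/vm/min_free_order_shift", "r");
            int shift;

            if (f && fscanf(f, "%d", &shift) == 1)
                    printf("min_free_order_shift = %d\n", shift);
            if (f)
                    fclose(f);
            /* Writable as root, e.g.: echo 4 > /proc/sys/vm/min_free_order_shift */
            return 0;
    }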
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 8a538c55fc7b..0c079010527f 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -46,6 +46,8 @@ static struct alarm_base {
static ktime_t freezer_delta;
static DEFINE_SPINLOCK(freezer_delta_lock);
+static struct wakeup_source *ws;
+
#ifdef CONFIG_RTC_CLASS
/* rtc timer and device for setting alarm wakeups at suspend */
static struct rtc_timer rtctimer;
@@ -59,7 +61,7 @@ static DEFINE_SPINLOCK(rtcdev_lock);
* If one has not already been chosen, it checks to see if a
* functional rtc device is available.
*/
-static struct rtc_device *alarmtimer_get_rtcdev(void)
+struct rtc_device *alarmtimer_get_rtcdev(void)
{
unsigned long flags;
struct rtc_device *ret;
@@ -115,10 +117,6 @@ static void alarmtimer_rtc_interface_remove(void)
class_interface_unregister(&alarmtimer_rtc_interface);
}
#else
-static inline struct rtc_device *alarmtimer_get_rtcdev(void)
-{
- return NULL;
-}
#define rtcdev (NULL)
static inline int alarmtimer_rtc_interface_setup(void) { return 0; }
static inline void alarmtimer_rtc_interface_remove(void) { }
@@ -250,6 +248,7 @@ static int alarmtimer_suspend(struct device *dev)
unsigned long flags;
struct rtc_device *rtc;
int i;
+ int ret;
spin_lock_irqsave(&freezer_delta_lock, flags);
min = freezer_delta;
@@ -279,8 +278,10 @@ static int alarmtimer_suspend(struct device *dev)
if (min.tv64 == 0)
return 0;
- /* XXX - Should we enforce a minimum sleep time? */
- WARN_ON(min.tv64 < NSEC_PER_SEC);
+ if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) {
+ __pm_wakeup_event(ws, 2 * MSEC_PER_SEC);
+ return -EBUSY;
+ }
/* Setup an rtc timer to fire that far in the future */
rtc_timer_cancel(rtc, &rtctimer);
@@ -288,9 +289,11 @@ static int alarmtimer_suspend(struct device *dev)
now = rtc_tm_to_ktime(tm);
now = ktime_add(now, min);
- rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
-
- return 0;
+ /* Set the alarm; if it is already in the past, briefly reject suspend so the expiry is handled */
+ ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0));
+ if (ret < 0)
+ __pm_wakeup_event(ws, 1 * MSEC_PER_SEC);
+ return ret;
}
#else
static int alarmtimer_suspend(struct device *dev)
@@ -821,6 +824,7 @@ static int __init alarmtimer_init(void)
error = PTR_ERR(pdev);
goto out_drv;
}
+ ws = wakeup_source_register("alarmtimer");
return 0;
out_drv:
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 7c50de83b6fd..f3a4dd993ca0 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1323,8 +1323,6 @@ ktime_t ktime_get_monotonic_offset(void)
return timespec_to_ktime(wtom);
}
-EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset);
-
/**
* xtime_update() - advances the timekeeping infrastructure
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index a1d2849f2473..e2a3f7207cca 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -487,6 +487,39 @@ config RING_BUFFER_BENCHMARK
If unsure, say N.
+config TRACELEVEL
+ bool "Add capability to prioritize traces"
+ depends on EVENT_TRACING
+ help
+ This option allows subsystem programmers to add priorities to trace
+ events by calling tracelevel_register. High-priority traces are
+ enabled automatically at kernel boot, and users can change the
+ trace level via a kernel parameter.
+
+config TRACEDUMP
+ bool "Dumping functionality for ftrace"
+ depends on FUNCTION_TRACER
+ help
+ This option adds functionality to dump tracing data in several forms.
+ Data can be dumped in ascii form or as raw pages from the tracing
+ ring buffers, along with the saved cmdlines. The format is selected
+ by the module parameter format_ascii. Data will be compressed
+ using zlib.
+
+config TRACEDUMP_PANIC
+ bool "Tracedump to console on panic"
+ depends on TRACEDUMP
+ help
+ With this option, tracedump will automatically dump to the console
+ on a kernel panic.
+
+config TRACEDUMP_PROCFS
+ bool "Tracedump via proc file"
+ depends on TRACEDUMP
+ help
+ With this option, tracedump can be dumped from user space by reading
+ from /proc/tracedump.
+
endif # FTRACE
endif # TRACING_SUPPORT
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 5f39a07fe5ea..5eca10525b5b 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -61,5 +61,7 @@ endif
ifeq ($(CONFIG_TRACING),y)
obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
endif
+obj-$(CONFIG_TRACELEVEL) += tracelevel.o
+obj-$(CONFIG_TRACEDUMP) += tracedump.o
libftrace-y := ftrace.o
diff --git a/kernel/trace/tracedump.c b/kernel/trace/tracedump.c
new file mode 100644
index 000000000000..a83532bc36da
--- /dev/null
+++ b/kernel/trace/tracedump.c
@@ -0,0 +1,682 @@
+/*
+ * kernel/trace/tracedump.c
+ *
+ * Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/console.h>
+#include <linux/cpumask.h>
+#include <linux/init.h>
+#include <linux/irqflags.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/mutex.h>
+#include <linux/notifier.h>
+#include <linux/proc_fs.h>
+#include <linux/ring_buffer.h>
+#include <linux/sched.h>
+#include <linux/smp.h>
+#include <linux/string.h>
+#include <linux/threads.h>
+#include <linux/tracedump.h>
+#include <linux/uaccess.h>
+#include <linux/vmalloc.h>
+#include <linux/zlib.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+#define CPU_MAX (NR_CPUS-1)
+
+#define TRYM(fn, ...) do { \
+ int try_error = (fn); \
+ if (try_error < 0) { \
+ printk(__VA_ARGS__); \
+ return try_error; \
+ } \
+} while (0)
+
+#define TRY(fn) TRYM(fn, TAG "Caught error from %s in %s\n", #fn, __func__)
+
+/* Stolen from printk.c */
+#define for_each_console(con) \
+ for (con = console_drivers; con != NULL; con = con->next)
+
+#define TAG KERN_ERR "tracedump: "
+
+#define TD_MIN_CONSUME 2000
+#define TD_COMPRESS_CHUNK 0x8000
+
+static DEFINE_MUTEX(tracedump_proc_lock);
+
+static const char MAGIC_NUMBER[9] = "TRACEDUMP";
+static const char CPU_DELIM[7] = "CPU_END";
+#define CMDLINE_DELIM "|"
+
+/* Type of output */
+static bool current_format;
+static bool format_ascii;
+module_param(format_ascii, bool, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(format_ascii, "Dump ascii or raw data");
+
+/* Max size of output */
+static uint panic_size = 0x80000;
+module_param(panic_size, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(panic_size, "Max dump size during kernel panic (bytes)");
+
+static uint compress_level = 9;
+module_param(compress_level, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(compress_level, "Level of compression to use. [0-9]");
+
+static char out_buf[TD_COMPRESS_CHUNK];
+static z_stream stream;
+static int compress_done;
+static int flush;
+
+static int old_trace_flags;
+
+static struct trace_iterator iter;
+static struct pager_s {
+ struct trace_array *tr;
+ void *spare;
+ int cpu;
+ int len;
+ char __user *ubuf;
+} pager;
+
+static char cmdline_buf[16+TASK_COMM_LEN];
+
+static int print_to_console(const char *buf, size_t len)
+{
+ struct console *con;
+
+ /* Stolen from printk.c */
+ for_each_console(con) {
+ if ((con->flags & CON_ENABLED) && con->write &&
+ (cpu_online(smp_processor_id()) ||
+ (con->flags & CON_ANYTIME)))
+ con->write(con, buf, len);
+ }
+ return 0;
+}
+
+static int print_to_user(const char *buf, size_t len)
+{
+ int size;
+ size = copy_to_user(pager.ubuf, buf, len);
+ if (size > 0) {
+ printk(TAG "Failed to copy to user %d bytes\n", size);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+static int print(const char *buf, size_t len, int print_to)
+{
+ if (print_to == TD_PRINT_CONSOLE)
+ TRY(print_to_console(buf, len));
+ else if (print_to == TD_PRINT_USER)
+ TRY(print_to_user(buf, len));
+ return 0;
+}
+
+/* print_magic will print MAGIC_NUMBER using the
+ * print function selected by print_to.
+ */
+static inline ssize_t print_magic(int print_to)
+{
+ print(MAGIC_NUMBER, sizeof(MAGIC_NUMBER), print_to);
+ return sizeof(MAGIC_NUMBER);
+}
+
+static int iter_init(void)
+{
+ int cpu;
+
+ /* Make iter point to global ring buffer used in trace. */
+ trace_init_global_iter(&iter);
+
+ /* Disable tracing */
+ for_each_tracing_cpu(cpu) {
+ atomic_inc(&iter.tr->data[cpu]->disabled);
+ }
+
+ /* Save flags */
+ old_trace_flags = trace_flags;
+
+ /* Don't look at memory in panic mode. */
+ trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
+
+ /* Prepare ring buffer iter */
+ for_each_tracing_cpu(cpu) {
+ iter.buffer_iter[cpu] =
+ ring_buffer_read_prepare(iter.tr->buffer, cpu);
+ }
+ ring_buffer_read_prepare_sync();
+ for_each_tracing_cpu(cpu) {
+ ring_buffer_read_start(iter.buffer_iter[cpu]);
+ tracing_iter_reset(&iter, cpu);
+ }
+ return 0;
+}
+
+/* iter_next gets the next entry in the ring buffer, ordered by time.
+ * If there are no more entries, returns 0.
+ */
+static ssize_t iter_next(void)
+{
+ /* Zero out the iterator's seq */
+ memset(&iter.seq, 0,
+ sizeof(struct trace_iterator) -
+ offsetof(struct trace_iterator, seq));
+
+ while (!trace_empty(&iter)) {
+ if (trace_find_next_entry_inc(&iter) == NULL) {
+ printk(TAG "trace_find_next_entry failed!\n");
+ return -EINVAL;
+ }
+
+ /* Copy the ring buffer data to iterator's seq */
+ print_trace_line(&iter);
+ if (iter.seq.len != 0)
+ return iter.seq.len;
+ }
+ return 0;
+}
+
+static int iter_deinit(void)
+{
+ int cpu;
+ /* Enable tracing */
+ for_each_tracing_cpu(cpu) {
+ ring_buffer_read_finish(iter.buffer_iter[cpu]);
+ }
+ for_each_tracing_cpu(cpu) {
+ atomic_dec(&iter.tr->data[cpu]->disabled);
+ }
+
+ /* Restore flags */
+ trace_flags = old_trace_flags;
+ return 0;
+}
+
+static int pager_init(void)
+{
+ int cpu;
+
+ /* Need to do this to get a pointer to global_trace (iter.tr).
+ Lame, I know. */
+ trace_init_global_iter(&iter);
+
+ /* Turn off tracing */
+ for_each_tracing_cpu(cpu) {
+ atomic_inc(&iter.tr->data[cpu]->disabled);
+ }
+
+ memset(&pager, 0, sizeof(pager));
+ pager.tr = iter.tr;
+ pager.len = TD_COMPRESS_CHUNK;
+
+ return 0;
+}
+
+/* pager_next_cpu moves the pager to the next cpu.
+ * Returns 0 if pager is done, else 1.
+ */
+static ssize_t pager_next_cpu(void)
+{
+ if (pager.cpu <= CPU_MAX) {
+ pager.cpu += 1;
+ return 1;
+ }
+
+ return 0;
+}
+
+/* pager_next gets the next page of data from the ring buffer
+ * of the current cpu. Returns page size or 0 if no more data.
+ */
+static ssize_t pager_next(void)
+{
+ int ret;
+
+ if (pager.cpu > CPU_MAX)
+ return 0;
+
+ if (!pager.spare)
+ pager.spare = ring_buffer_alloc_read_page(pager.tr->buffer, pager.cpu);
+ if (!pager.spare) {
+ printk(TAG "ring_buffer_alloc_read_page failed!");
+ return -ENOMEM;
+ }
+
+ ret = ring_buffer_read_page(pager.tr->buffer,
+ &pager.spare,
+ pager.len,
+ pager.cpu, 0);
+ if (ret < 0)
+ return 0;
+
+ return PAGE_SIZE;
+}
+
+static int pager_deinit(void)
+{
+ int cpu;
+ if (pager.spare != NULL)
+ ring_buffer_free_read_page(pager.tr->buffer, pager.spare);
+
+ for_each_tracing_cpu(cpu) {
+ atomic_dec(&iter.tr->data[cpu]->disabled);
+ }
+ return 0;
+}
+
+/* cmdline_next gets the next saved cmdline from the trace and
+ * puts it in cmdline_buf. Returns the size of the cmdline, or 0 once all
+ * cmdlines have been returned; it resets itself on a subsequent call.
+ */
+static ssize_t cmdline_next(void)
+{
+ static int pid;
+ ssize_t size = 0;
+
+ if (pid >= PID_MAX_DEFAULT)
+ pid = -1;
+
+ while (size == 0 && pid < PID_MAX_DEFAULT) {
+ pid++;
+ trace_find_cmdline(pid, cmdline_buf);
+ if (!strncmp(cmdline_buf, "<...>", 5))
+ continue;
+
+ sprintf(&cmdline_buf[strlen(cmdline_buf)], " %d"
+ CMDLINE_DELIM, pid);
+ size = strlen(cmdline_buf);
+ }
+ return size;
+}
+
+/* consume_events removes the first 'num' entries from the ring buffer. */
+static int consume_events(size_t num)
+{
+ TRY(iter_init());
+ for (; num > 0 && !trace_empty(&iter); num--) {
+ trace_find_next_entry_inc(&iter);
+ ring_buffer_consume(iter.tr->buffer, iter.cpu, &iter.ts,
+ &iter.lost_events);
+ }
+ TRY(iter_deinit());
+ return 0;
+}
+
+static int data_init(void)
+{
+ if (current_format)
+ TRY(iter_init());
+ else
+ TRY(pager_init());
+ return 0;
+}
+
+/* data_next will figure out the right 'next' function to
+ * call and will select the right buffer to pass back
+ * to compress_next.
+ *
+ * iter_next should be used to get data entry-by-entry, ordered
+ * by time, which is what we need in order to convert it to ascii.
+ *
+ * pager_next will return a full page of raw data at a time, one
+ * CPU at a time. pager_next_cpu must be called to get the next CPU.
+ * cmdline_next will get the next saved cmdline
+ */
+static ssize_t data_next(const char **buf)
+{
+ ssize_t size;
+
+ if (current_format) {
+ TRY(size = iter_next());
+ *buf = iter.seq.buffer;
+ } else {
+ TRY(size = pager_next());
+ *buf = pager.spare;
+ if (size == 0) {
+ if (pager_next_cpu()) {
+ size = sizeof(CPU_DELIM);
+ *buf = CPU_DELIM;
+ } else {
+ TRY(size = cmdline_next());
+ *buf = cmdline_buf;
+ }
+ }
+ }
+ return size;
+}
+
+static int data_deinit(void)
+{
+ if (current_format)
+ TRY(iter_deinit());
+ else
+ TRY(pager_deinit());
+ return 0;
+}
+
+static int compress_init(void)
+{
+ int workspacesize, ret;
+
+ compress_done = 0;
+ flush = Z_NO_FLUSH;
+ stream.data_type = current_format ? Z_ASCII : Z_BINARY;
+ workspacesize = zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL);
+ stream.workspace = vmalloc(workspacesize);
+ if (!stream.workspace) {
+ printk(TAG "Could not allocate "
+ "enough memory for zlib!\n");
+ return -ENOMEM;
+ }
+ memset(stream.workspace, 0, workspacesize);
+
+ ret = zlib_deflateInit(&stream, compress_level);
+ if (ret != Z_OK) {
+ printk(TAG "%s\n", stream.msg);
+ return ret;
+ }
+ stream.avail_in = 0;
+ stream.avail_out = 0;
+ TRY(data_init());
+ return 0;
+}
+
+/* compress_next will compress up to min(max_out, TD_COMPRESS_CHUNK) bytes
+ * of data into the output buffer. It gets the data by calling data_next.
+ * It will return the most data it possibly can. If it returns 0, then
+ * there is no more data.
+ *
+ * By the way that zlib works, each call to zlib_deflate will possibly
+ * consume up to avail_in bytes from next_in, and will fill up to
+ * avail_out bytes in next_out. Once flush == Z_FINISH, it cannot take
+ * any more input. It will output until it is finished, and will return
+ * Z_STREAM_END.
+ */
+static ssize_t compress_next(size_t max_out)
+{
+ ssize_t ret;
+ max_out = min(max_out, (size_t)TD_COMPRESS_CHUNK);
+ stream.next_out = out_buf;
+ stream.avail_out = max_out;
+ while (stream.avail_out > 0 && !compress_done) {
+ if (stream.avail_in == 0 && flush != Z_FINISH) {
+ TRY(stream.avail_in =
+ data_next((const char **)&stream.next_in));
+ flush = (stream.avail_in == 0) ? Z_FINISH : Z_NO_FLUSH;
+ }
+ if (stream.next_in != NULL) {
+ TRYM((ret = zlib_deflate(&stream, flush)),
+ "zlib: %s\n", stream.msg);
+ compress_done = (ret == Z_STREAM_END);
+ }
+ }
+ ret = max_out - stream.avail_out;
+ return ret;
+}
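compress_next() follows the canonical zlib streaming pattern. The same loop against ordinary user-space zlib, compressing a single input buffer in CHUNK-sized output slices (a sketch with error handling trimmed):

    #include <stdio.h>
    #include <string.h>
    #include <zlib.h>

    #define CHUNK 0x8000

    static int deflate_buf(unsigned char *in, size_t in_len)
    {
            z_stream strm;
            unsigned char out[CHUNK];
            int ret;

            memset(&strm, 0, sizeof(strm));
            if (deflateInit(&strm, 9) != Z_OK)
                    return -1;

            strm.next_in = in;
            strm.avail_in = (uInt)in_len;
            do {
                    strm.next_out = out;
                    strm.avail_out = CHUNK;
                    /* One input buffer only, so we can flush with Z_FINISH
                     * from the start; deflate() drains until Z_STREAM_END. */
                    ret = deflate(&strm, Z_FINISH);
                    fwrite(out, 1, CHUNK - strm.avail_out, stdout);
            } while (ret != Z_STREAM_END);

            deflateEnd(&strm);
            return 0;
    }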
+
+static int compress_deinit(void)
+{
+ TRY(data_deinit());
+
+ zlib_deflateEnd(&stream);
+ vfree(stream.workspace);
+
+ /* TODO: remove */
+ printk(TAG "Total in: %ld\n", stream.total_in);
+ printk(TAG "Total out: %ld\n", stream.total_out);
+ return stream.total_out;
+}
+
+static int compress_reset(void)
+{
+ TRY(compress_deinit());
+ TRY(compress_init());
+ return 0;
+}
+
+/* tracedump_init initializes all tracedump components.
+ * Call this before tracedump_next.
+ */
+int tracedump_init(void)
+{
+ TRY(compress_init());
+ return 0;
+}
+
+/* tracedump_next will print up to max_out data from the tracing ring
+ * buffers using the print function selected by print_to. The data is
+ * compressed using zlib.
+ *
+ * The output type of the data is specified by the format_ascii module
+ * parameter. If format_ascii == 1, human-readable data will be output.
+ * Otherwise, it will output raw data from the ring buffer in cpu order,
+ * followed by the saved_cmdlines data.
+ */
+ssize_t tracedump_next(size_t max_out, int print_to)
+{
+ ssize_t size;
+ TRY(size = compress_next(max_out));
+ print(out_buf, size, print_to);
+ return size;
+}
+
+/* tracedump_all will print all data in the tracing ring buffers using
+ * the print function selected by print_to. The data is compressed using
+ * zlib, and is surrounded by MAGIC_NUMBER.
+ *
+ * The output type of the data is specified by the format_ascii module
+ * parameter. If format_ascii == 1, human-readable data will be output.
+ * Otherwise, it will output raw data from the ring buffer in cpu order,
+ * followed by the saved_cmdlines data.
+ */
+ssize_t tracedump_all(int print_to)
+{
+ ssize_t ret, size = 0;
+ TRY(size += print_magic(print_to));
+
+ do {
+ /* Here the size used doesn't really matter,
+ * since we're dumping everything. */
+ TRY(ret = tracedump_next(0xFFFFFFFF, print_to));
+ size += ret;
+ } while (ret > 0);
+
+ TRY(size += print_magic(print_to));
+
+ return size;
+}
+
+/* tracedump_deinit deinitializes all tracedump components.
+ * This must be called, even on error.
+ */
+int tracedump_deinit(void)
+{
+ TRY(compress_deinit());
+ return 0;
+}
+
+/* tracedump_reset reinitializes all tracedump components. */
+int tracedump_reset(void)
+{
+ TRY(compress_reset());
+ return 0;
+}
+
+/* tracedump_open opens the tracedump file for reading. */
+static int tracedump_open(struct inode *inode, struct file *file)
+{
+ int ret;
+ mutex_lock(&tracedump_proc_lock);
+ current_format = format_ascii;
+ ret = tracedump_init();
+ if (ret < 0)
+ goto err;
+
+ ret = nonseekable_open(inode, file);
+ if (ret < 0)
+ goto err;
+ return ret;
+
+err:
+ mutex_unlock(&tracedump_proc_lock);
+ return ret;
+}
+
+/* tracedump_read reads data from tracedump_next and prints
+ * it to userspace. It will surround the data with MAGIC_NUMBER.
+ */
+static ssize_t tracedump_read(struct file *file, char __user *buf,
+ size_t len, loff_t *offset)
+{
+ static int done;
+ ssize_t size = 0;
+
+ pager.ubuf = buf;
+
+ if (*offset == 0) {
+ done = 0;
+ TRY(size = print_magic(TD_PRINT_USER));
+ } else if (!done) {
+ TRY(size = tracedump_next(len, TD_PRINT_USER));
+ if (size == 0) {
+ TRY(size = print_magic(TD_PRINT_USER));
+ done = 1;
+ }
+ }
+
+ *offset += size;
+
+ return size;
+}
+
+static int tracedump_release(struct inode *inode, struct file *file)
+{
+ int ret;
+ ret = tracedump_deinit();
+ mutex_unlock(&tracedump_proc_lock);
+ return ret;
+}
+
+/* tracedump_dump dumps all tracing data from the tracing ring buffers
+ * to all consoles. For details about the output format, see
+ * tracedump_all.
+ *
+ * At most max_out bytes are dumped. To accomplish this,
+ * tracedump_dump calls tracedump_all several times without writing the data,
+ * each time tossing out old data until it reaches its goal.
+ *
+ * Note: dumping raw pages currently does NOT follow the size limit.
+ */
+
+int tracedump_dump(size_t max_out)
+{
+ ssize_t size;
+ size_t consume;
+
+ printk(TAG "\n");
+
+ tracedump_init();
+
+ if (format_ascii) {
+ size = tracedump_all(TD_NO_PRINT);
+ if (size < 0) {
+ printk(TAG "failed to dump\n");
+ goto out;
+ }
+ while (size > max_out) {
+ TRY(tracedump_deinit());
+ /* Events take more or less 60 ascii bytes each,
+ not counting compression */
+ consume = TD_MIN_CONSUME + (size - max_out) /
+ (60 / (compress_level + 1));
+ TRY(consume_events(consume));
+ TRY(tracedump_init());
+ size = tracedump_all(TD_NO_PRINT);
+ if (size < 0) {
+ printk(TAG "failed to dump\n");
+ goto out;
+ }
+ }
+
+ TRY(tracedump_reset());
+ }
+ size = tracedump_all(TD_PRINT_CONSOLE);
+ if (size < 0) {
+ printk(TAG "failed to dump\n");
+ goto out;
+ }
+
+out:
+ tracedump_deinit();
+ printk(KERN_INFO "\n" TAG " end\n");
+ return size;
+}
+
+static const struct file_operations tracedump_fops = {
+ .owner = THIS_MODULE,
+ .open = tracedump_open,
+ .read = tracedump_read,
+ .release = tracedump_release,
+};
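With CONFIG_TRACEDUMP_PROCFS enabled, the dump can be captured from user space; the payload between the two "TRACEDUMP" magic markers is a zlib stream. A minimal reader that saves the raw dump for offline decompression:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char buf[4096];
            ssize_t n;
            int in = open("/proc/tracedump", O_RDONLY);
            int out = open("tracedump.bin", O_WRONLY | O_CREAT | O_TRUNC, 0644);

            if (in < 0 || out < 0) {
                    perror("open");
                    return 1;
            }
            while ((n = read(in, buf, sizeof(buf))) > 0)
                    if (write(out, buf, n) != n) {
                            perror("write");
                            break;
                    }
            close(in);
            close(out);
            return 0;
    }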
+
+#ifdef CONFIG_TRACEDUMP_PANIC
+static int tracedump_panic_handler(struct notifier_block *this,
+ unsigned long event, void *unused)
+{
+ tracedump_dump(panic_size);
+ return 0;
+}
+
+static struct notifier_block tracedump_panic_notifier = {
+ .notifier_call = tracedump_panic_handler,
+ .next = NULL,
+ .priority = 150 /* priority: INT_MAX >= x >= 0 */
+};
+#endif
+
+static int __init tracedump_initcall(void)
+{
+#ifdef CONFIG_TRACEDUMP_PROCFS
+ struct proc_dir_entry *entry;
+
+ /* Create a procfs file for easy dumping */
+ entry = create_proc_entry("tracedump", S_IFREG | S_IRUGO, NULL);
+ if (!entry)
+ printk(TAG "failed to create proc entry\n");
+ else
+ entry->proc_fops = &tracedump_fops;
+#endif
+
+#ifdef CONFIG_TRACEDUMP_PANIC
+ /* Automatically dump to console on a kernel panic */
+ atomic_notifier_chain_register(&panic_notifier_list,
+ &tracedump_panic_notifier);
+#endif
+ return 0;
+}
+
+early_initcall(tracedump_initcall);
diff --git a/kernel/trace/tracelevel.c b/kernel/trace/tracelevel.c
new file mode 100644
index 000000000000..9f8b8eedbb58
--- /dev/null
+++ b/kernel/trace/tracelevel.c
@@ -0,0 +1,142 @@
+/*
+ * kernel/trace/tracelevel.c
+ *
+ * Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ */
+
+#include <linux/ftrace_event.h>
+#include <linux/list.h>
+#include <linux/moduleparam.h>
+#include <linux/mutex.h>
+#include <linux/tracelevel.h>
+#include <linux/vmalloc.h>
+
+#include "trace.h"
+
+#define TAG KERN_ERR "tracelevel: "
+
+struct tracelevel_record {
+ struct list_head list;
+ char *name;
+ int level;
+};
+
+static LIST_HEAD(tracelevel_list);
+
+static bool started;
+static unsigned int tracelevel_level = TRACELEVEL_DEFAULT;
+
+static DEFINE_MUTEX(tracelevel_record_lock);
+
+/* tracelevel_set_event sets a single event if set = 1, or
+ * clears an event if set = 0.
+ */
+static int tracelevel_set_event(struct tracelevel_record *evt, bool set)
+{
+ if (trace_set_clr_event(NULL, evt->name, set) < 0) {
+ printk(TAG "failed to set event %s\n", evt->name);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* Registers an event. If possible, it also sets it.
+ * If not, we'll set it in tracelevel_init.
+ */
+int __tracelevel_register(char *name, unsigned int level)
+{
+ struct tracelevel_record *evt = (struct tracelevel_record *)
+ vmalloc(sizeof(struct tracelevel_record));
+ if (!evt) {
+ printk(TAG "failed to allocate tracelevel_record for %s\n",
+ name);
+ return -ENOMEM;
+ }
+
+ evt->name = name;
+ evt->level = level;
+
+ mutex_lock(&tracelevel_record_lock);
+ list_add(&evt->list, &tracelevel_list);
+ mutex_unlock(&tracelevel_record_lock);
+
+ if (level >= tracelevel_level && started)
+ tracelevel_set_event(evt, 1);
+ return 0;
+}
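A subsystem would typically register its events once from an initcall. The public wrapper lives in <linux/tracelevel.h>, which is not part of this diff, so the sketch below calls __tracelevel_register() directly and assumes the TRACELEVEL_MAX constant referenced by tracelevel_set_level() below:

    /* Hypothetical caller: register one of this driver's trace events at
     * the highest priority so it is enabled automatically at boot. */
    static char mydrv_evt_name[] = "mydrv_event";

    static int __init mydrv_trace_init(void)
    {
            return __tracelevel_register(mydrv_evt_name, TRACELEVEL_MAX);
    }
    late_initcall(mydrv_trace_init);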
+
+/* tracelevel_set_level sets the global level, clears events
+ * lower than that level, and enables events greater or equal.
+ */
+int tracelevel_set_level(int level)
+{
+ struct tracelevel_record *evt = NULL;
+
+ if (level < 0 || level > TRACELEVEL_MAX)
+ return -EINVAL;
+ tracelevel_level = level;
+
+ mutex_lock(&tracelevel_record_lock);
+ list_for_each_entry(evt, &tracelevel_list, list) {
+ if (evt->level >= level)
+ tracelevel_set_event(evt, 1);
+ else
+ tracelevel_set_event(evt, 0);
+ }
+ mutex_unlock(&tracelevel_record_lock);
+ return 0;
+}
+
+static int param_set_level(const char *val, const struct kernel_param *kp)
+{
+ long level;
+ int ret;
+
+ ret = strict_strtol(val, 0, &level);
+ if (ret < 0)
+ return ret;
+ return tracelevel_set_level(level);
+}
+
+static int param_get_level(char *buffer, const struct kernel_param *kp)
+{
+ return param_get_int(buffer, kp);
+}
+
+static struct kernel_param_ops tracelevel_level_ops = {
+ .set = param_set_level,
+ .get = param_get_level
+};
+
+module_param_cb(level, &tracelevel_level_ops, &tracelevel_level, 0644);
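Because the parameter is registered through module_param_cb(), the level can be given on the kernel command line (tracelevel.level=N) or changed at run time via sysfs. A hypothetical run-time setter, assuming tracelevel is built in:

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/sys/module/tracelevel/parameters/level", "w");

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            fprintf(f, "%d\n", 2); /* enable events registered with priority >= 2 */
            fclose(f);
            return 0;
    }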
+
+/* Turn on the tracing that has been registered thus far. */
+static int __init tracelevel_init(void)
+{
+ int ret;
+ started = true;
+
+ /* The ring buffer is initialized to 1 page until the user sets a tracer.
+ * Since we're doing this manually, we need to ask for an expanded buffer.
+ */
+ ret = tracing_update_buffers();
+ if (ret < 0)
+ return ret;
+
+ return tracelevel_set_level(tracelevel_level);
+}
+
+/* Tracing mechanism is set up during fs_initcall. */
+fs_initcall_sync(tracelevel_init);