Diffstat (limited to 'kernel')
38 files changed, 2335 insertions, 183 deletions
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index ed64ccac67c9..145deeb69bc3 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -60,6 +60,7 @@ #include <linux/eventfd.h> #include <linux/poll.h> #include <linux/flex_array.h> /* used in cgroup_attach_proc */ +#include <linux/capability.h> #include <linux/atomic.h> @@ -287,6 +288,33 @@ static void cgroup_release_agent(struct work_struct *work); static DECLARE_WORK(release_agent_work, cgroup_release_agent); static void check_for_release(struct cgroup *cgrp); +/* + * A queue for waiters to do rmdir() cgroup. A tasks will sleep when + * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some + * reference to css->refcnt. In general, this refcnt is expected to goes down + * to zero, soon. + * + * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; + */ +static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); + +static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) +{ + if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) + wake_up_all(&cgroup_rmdir_waitq); +} + +void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) +{ + css_get(css); +} + +void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) +{ + cgroup_wakeup_rmdir_waiter(css->cgroup); + css_put(css); +} + /* Link structure for associating css_set objects with cgroups */ struct cg_cgroup_link { /* @@ -346,52 +374,43 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[]) return &css_set_table[index]; } -/* We don't maintain the lists running through each css_set to its - * task until after the first call to cgroup_iter_start(). This - * reduces the fork()/exit() overhead for people who have cgroups - * compiled into their kernel but not actually in use */ -static int use_task_css_set_links __read_mostly; - -static void __put_css_set(struct css_set *cg, int taskexit) +static void free_css_set_work(struct work_struct *work) { + struct css_set *cg = container_of(work, struct css_set, work); struct cg_cgroup_link *link; struct cg_cgroup_link *saved_link; - /* - * Ensure that the refcount doesn't hit zero while any readers - * can see it. Similar to atomic_dec_and_lock(), but for an - * rwlock - */ - if (atomic_add_unless(&cg->refcount, -1, 1)) - return; - write_lock(&css_set_lock); - if (!atomic_dec_and_test(&cg->refcount)) { - write_unlock(&css_set_lock); - return; - } - - /* This css_set is dead. unlink it and release cgroup refcounts */ - hlist_del(&cg->hlist); - css_set_count--; + write_lock(&css_set_lock); list_for_each_entry_safe(link, saved_link, &cg->cg_links, cg_link_list) { struct cgroup *cgrp = link->cgrp; list_del(&link->cg_link_list); list_del(&link->cgrp_link_list); - if (atomic_dec_and_test(&cgrp->count) && - notify_on_release(cgrp)) { - if (taskexit) - set_bit(CGRP_RELEASABLE, &cgrp->flags); + if (atomic_dec_and_test(&cgrp->count)) { check_for_release(cgrp); + cgroup_wakeup_rmdir_waiter(cgrp); } - kfree(link); } - write_unlock(&css_set_lock); - kfree_rcu(cg, rcu_head); + + kfree(cg); +} + +static void free_css_set_rcu(struct rcu_head *obj) +{ + struct css_set *cg = container_of(obj, struct css_set, rcu_head); + + INIT_WORK(&cg->work, free_css_set_work); + schedule_work(&cg->work); } +/* We don't maintain the lists running through each css_set to its + * task until after the first call to cgroup_iter_start(). 
This + * reduces the fork()/exit() overhead for people who have cgroups + * compiled into their kernel but not actually in use */ +static int use_task_css_set_links __read_mostly; + /* * refcounted get/put for css_set objects */ @@ -400,16 +419,34 @@ static inline void get_css_set(struct css_set *cg) atomic_inc(&cg->refcount); } -static inline void put_css_set(struct css_set *cg) +static void put_css_set(struct css_set *cg) { - __put_css_set(cg, 0); -} + /* + * Ensure that the refcount doesn't hit zero while any readers + * can see it. Similar to atomic_dec_and_lock(), but for an + * rwlock + */ + if (atomic_add_unless(&cg->refcount, -1, 1)) + return; + write_lock(&css_set_lock); + if (!atomic_dec_and_test(&cg->refcount)) { + write_unlock(&css_set_lock); + return; + } -static inline void put_css_set_taskexit(struct css_set *cg) -{ - __put_css_set(cg, 1); + hlist_del(&cg->hlist); + css_set_count--; + + write_unlock(&css_set_lock); + call_rcu(&cg->rcu_head, free_css_set_rcu); } +/* We don't maintain the lists running through each css_set to its + * task until after the first call to cgroup_iter_start(). This + * reduces the fork()/exit() overhead for people who have cgroups + * compiled into their kernel but not actually in use */ +static int use_task_css_set_links __read_mostly; + /* * compare_css_sets - helper function for find_existing_css_set(). * @cg: candidate css_set being tested @@ -739,9 +776,9 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task, * cgroup_attach_task(), which overwrites one tasks cgroup pointer with * another. It does so using cgroup_mutex, however there are * several performance critical places that need to reference - * task->cgroup without the expense of grabbing a system global + * task->cgroups without the expense of grabbing a system global * mutex. Therefore except as noted below, when dereferencing or, as - * in cgroup_attach_task(), modifying a task'ss cgroup pointer we use + * in cgroup_attach_task(), modifying a task's cgroups pointer we use * task_lock(), which acts on a spinlock (task->alloc_lock) already in * the task_struct routinely used for such matters. * @@ -931,33 +968,6 @@ static void cgroup_d_remove_dir(struct dentry *dentry) } /* - * A queue for waiters to do rmdir() cgroup. A tasks will sleep when - * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some - * reference to css->refcnt. In general, this refcnt is expected to goes down - * to zero, soon. - * - * CGRP_WAIT_ON_RMDIR flag is set under cgroup's inode->i_mutex; - */ -static DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq); - -static void cgroup_wakeup_rmdir_waiter(struct cgroup *cgrp) -{ - if (unlikely(test_and_clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags))) - wake_up_all(&cgroup_rmdir_waitq); -} - -void cgroup_exclude_rmdir(struct cgroup_subsys_state *css) -{ - css_get(css); -} - -void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css) -{ - cgroup_wakeup_rmdir_waiter(css->cgroup); - css_put(css); -} - -/* * Call with cgroup_mutex held. Drops reference counts on modules, including * any duplicate ones that parse_cgroupfs_options took. If this function * returns an error, no reference counts are touched. 
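The teardown rewritten above frees a css_set in two deferred stages: put_css_set() drops the last reference and calls call_rcu(), and the RCU callback does nothing but queue a work item, so the heavier cleanup (retaking css_set_lock, unlinking and freeing the cg_cgroup_links, waking rmdir waiters) runs later in process context rather than in the softirq context where RCU callbacks execute. A minimal sketch of that same pattern with hypothetical names (struct my_obj, my_obj_put()); it illustrates the technique, it is not the cgroup code itself:

#include <linux/atomic.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_obj {
	atomic_t refcount;
	struct rcu_head rcu_head;
	struct work_struct work;
};

static void my_obj_free_work(struct work_struct *work)
{
	struct my_obj *obj = container_of(work, struct my_obj, work);

	/* Process context: can take locks and do unbounded cleanup. */
	kfree(obj);
}

static void my_obj_free_rcu(struct rcu_head *head)
{
	struct my_obj *obj = container_of(head, struct my_obj, rcu_head);

	/* Softirq context: just hand the real work off to a worker. */
	INIT_WORK(&obj->work, my_obj_free_work);
	schedule_work(&obj->work);
}

static void my_obj_put(struct my_obj *obj)
{
	if (atomic_dec_and_test(&obj->refcount))
		call_rcu(&obj->rcu_head, my_obj_free_rcu);
}

Readers under rcu_read_lock() are guaranteed the object outlives their critical section, which is what lets cgroup_attach_task() below drop its synchronize_rcu() call in favor of a plain put_css_set() of the old css_set.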
@@ -1889,6 +1899,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) struct cgroupfs_root *root = cgrp->root; struct cgroup_taskset tset = { }; struct css_set *newcg; + struct css_set *cg; /* @tsk either already exited or can't exit until the end */ if (tsk->flags & PF_EXITING) @@ -1915,6 +1926,15 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) failed_ss = ss; goto out; } + } else if (!capable(CAP_SYS_ADMIN)) { + const struct cred *cred = current_cred(), *tcred; + + /* No can_attach() - check perms generically */ + tcred = __task_cred(tsk); + if (cred->euid != tcred->uid && + cred->euid != tcred->suid) { + return -EACCES; + } } } @@ -1924,14 +1944,20 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) goto out; } + task_lock(tsk); + cg = tsk->cgroups; + get_css_set(cg); + task_unlock(tsk); + cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); for_each_subsys(root, ss) { if (ss->attach) ss->attach(cgrp, &tset); } - - synchronize_rcu(); + set_bit(CGRP_RELEASABLE, &cgrp->flags); + /* put_css_set will not destroy cg until after an RCU grace period */ + put_css_set(cg); /* * wake up rmdir() waiter. the rmdir should fail since the cgroup @@ -2132,6 +2158,24 @@ out_free_group_list: return retval; } +static int cgroup_allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +{ + struct cgroup_subsys *ss; + int ret; + + for_each_subsys(cgrp->root, ss) { + if (ss->allow_attach) { + ret = ss->allow_attach(cgrp, tset); + if (ret) + return ret; + } else { + return -EACCES; + } + } + + return 0; +} + /* * Find the task_struct of the task to attach by vpid and pass it along to the * function to attach either it or all tasks in its threadgroup. Will lock @@ -2163,9 +2207,19 @@ retry_find_task: if (cred->euid && cred->euid != tcred->uid && cred->euid != tcred->suid) { - rcu_read_unlock(); - ret = -EACCES; - goto out_unlock_cgroup; + /* + * if the default permission check fails, give each + * cgroup a chance to extend the permission check + */ + struct cgroup_taskset tset = { }; + tset.single.task = tsk; + tset.single.cgrp = cgrp; + ret = cgroup_allow_attach(cgrp, &tset); + if (ret) { + rcu_read_unlock(); + cgroup_unlock(); + return ret; + } } } else tsk = current; @@ -3784,6 +3838,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, if (err < 0) goto err_remove; + set_bit(CGRP_RELEASABLE, &parent->flags); + /* The cgroup directory was pre-locked for us */ BUG_ON(!mutex_is_locked(&cgrp->dentry->d_inode->i_mutex)); @@ -3915,6 +3971,21 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp) return !failed; } +/* checks if all of the css_sets attached to a cgroup have a refcount of 0. 
+ * Must be called with css_set_lock held */ +static int cgroup_css_sets_empty(struct cgroup *cgrp) +{ + struct cg_cgroup_link *link; + + list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { + struct css_set *cg = link->cg; + if (atomic_read(&cg->refcount) > 0) + return 0; + } + + return 1; +} + static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) { struct cgroup *cgrp = dentry->d_fsdata; @@ -3927,7 +3998,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) /* the vfs holds both inode->i_mutex already */ again: mutex_lock(&cgroup_mutex); - if (atomic_read(&cgrp->count) != 0) { + if (!cgroup_css_sets_empty(cgrp)) { mutex_unlock(&cgroup_mutex); return -EBUSY; } @@ -3960,7 +4031,7 @@ again: mutex_lock(&cgroup_mutex); parent = cgrp->parent; - if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) { + if (!cgroup_css_sets_empty(cgrp) || !list_empty(&cgrp->children)) { clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags); mutex_unlock(&cgroup_mutex); return -EBUSY; @@ -4000,7 +4071,6 @@ again: cgroup_d_remove_dir(d); dput(d); - set_bit(CGRP_RELEASABLE, &parent->flags); check_for_release(parent); /* @@ -4631,7 +4701,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks) task_unlock(tsk); if (cg) - put_css_set_taskexit(cg); + put_css_set(cg); } /** @@ -4685,6 +4755,14 @@ static void check_for_release(struct cgroup *cgrp) } /* Caller must verify that the css is not for root cgroup */ +void __css_get(struct cgroup_subsys_state *css, int count) +{ + atomic_add(count, &css->refcnt); + set_bit(CGRP_RELEASABLE, &css->cgroup->flags); +} +EXPORT_SYMBOL_GPL(__css_get); + +/* Caller must verify that the css is not for root cgroup */ void __css_put(struct cgroup_subsys_state *css, int count) { struct cgroup *cgrp = css->cgroup; @@ -4692,10 +4770,7 @@ void __css_put(struct cgroup_subsys_state *css, int count) rcu_read_lock(); val = atomic_sub_return(count, &css->refcnt); if (val == 1) { - if (notify_on_release(cgrp)) { - set_bit(CGRP_RELEASABLE, &cgrp->flags); - check_for_release(cgrp); - } + check_for_release(cgrp); cgroup_wakeup_rmdir_waiter(cgrp); } rcu_read_unlock(); diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index f86e93920b62..5c248e507a6e 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -168,6 +168,14 @@ static int freezer_can_attach(struct cgroup *new_cgroup, struct freezer *freezer; struct task_struct *task; + if ((current != task) && (!capable(CAP_SYS_ADMIN))) { + const struct cred *cred = current_cred(), *tcred; + + tcred = __task_cred(task); + if (cred->euid != tcred->uid && cred->euid != tcred->suid) + return -EPERM; + } + /* * Anything frozen can't move or be moved to/from. 
*/ diff --git a/kernel/cpu.c b/kernel/cpu.c index 2060c6e57027..acf5d8047458 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -16,6 +16,7 @@ #include <linux/mutex.h> #include <linux/gfp.h> #include <linux/suspend.h> +#include <trace/events/power.h> #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ @@ -273,6 +274,8 @@ int __ref cpu_down(unsigned int cpu) { int err; + trace_cpu_hotplug(cpu, POWER_CPU_DOWN_START); + cpu_maps_update_begin(); if (cpu_hotplug_disabled) { @@ -284,6 +287,7 @@ int __ref cpu_down(unsigned int cpu) out: cpu_maps_update_done(); + trace_cpu_hotplug(cpu, POWER_CPU_DOWN_DONE); return err; } EXPORT_SYMBOL(cpu_down); @@ -334,6 +338,8 @@ int __cpuinit cpu_up(unsigned int cpu) pg_data_t *pgdat; #endif + trace_cpu_hotplug(cpu, POWER_CPU_UP_START); + if (!cpu_possible(cpu)) { printk(KERN_ERR "can't online cpu %d because it is not " "configured as may-hotadd at boot time\n", cpu); @@ -377,6 +383,7 @@ int __cpuinit cpu_up(unsigned int cpu) out: cpu_maps_update_done(); + trace_cpu_hotplug(cpu, POWER_CPU_UP_DONE); return err; } EXPORT_SYMBOL_GPL(cpu_up); @@ -668,3 +675,23 @@ void init_cpu_online(const struct cpumask *src) { cpumask_copy(to_cpumask(cpu_online_bits), src); } + +static ATOMIC_NOTIFIER_HEAD(idle_notifier); + +void idle_notifier_register(struct notifier_block *n) +{ + atomic_notifier_chain_register(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_register); + +void idle_notifier_unregister(struct notifier_block *n) +{ + atomic_notifier_chain_unregister(&idle_notifier, n); +} +EXPORT_SYMBOL_GPL(idle_notifier_unregister); + +void idle_notifier_call_chain(unsigned long val) +{ + atomic_notifier_call_chain(&idle_notifier, val, NULL); +} +EXPORT_SYMBOL_GPL(idle_notifier_call_chain); diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 14f7070b4ba2..48b90d30797f 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1362,6 +1362,41 @@ static int fmeter_getrate(struct fmeter *fmp) return val; } +/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */ +static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont, + struct task_struct *tsk) +{ + struct cpuset *cs = cgroup_cs(cont); + + if ((current != task) && (!capable(CAP_SYS_ADMIN))) { + const struct cred *cred = current_cred(), *tcred; + + if (cred->euid != tcred->uid && cred->euid != tcred->suid) + return -EPERM; + } + + if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) + return -ENOSPC; + + /* + * Kthreads bound to specific cpus cannot be moved to a new cpuset; we + * cannot change their cpu affinity and isolating such threads by their + * set of allowed nodes is unnecessary. Thus, cpusets are not + * applicable for such threads. This prevents checking for success of + * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may + * be changed. + */ + if (tsk->flags & PF_THREAD_BOUND) + return -EINVAL; + + return 0; +} + +static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task) +{ + return security_task_setscheduler(task); +} + /* * Protected by cgroup_lock. 
The nodemasks must be stored globally because * dynamically allocating them is not allowed in can_attach, and they must diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 0557f24c6bca..35b94acec621 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -85,6 +85,10 @@ static int kgdb_use_con; bool dbg_is_early = true; /* Next cpu to become the master debug core */ int dbg_switch_cpu; +/* Flag for entering kdb when a panic occurs */ +static bool break_on_panic = true; +/* Flag for entering kdb when an exception occurs */ +static bool break_on_exception = true; /* Use kdb or gdbserver mode */ int dbg_kdb_mode = 1; @@ -99,6 +103,8 @@ early_param("kgdbcon", opt_kgdb_con); module_param(kgdb_use_con, int, 0644); module_param(kgdbreboot, int, 0644); +module_param(break_on_panic, bool, 0644); +module_param(break_on_exception, bool, 0644); /* * Holds information about breakpoints in a kernel. These breakpoints are @@ -673,6 +679,9 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs) struct kgdb_state kgdb_var; struct kgdb_state *ks = &kgdb_var; + if (unlikely(signo != SIGTRAP && !break_on_exception)) + return 1; + ks->cpu = raw_smp_processor_id(); ks->ex_vector = evector; ks->signo = signo; @@ -759,6 +768,9 @@ static int kgdb_panic_event(struct notifier_block *self, unsigned long val, void *data) { + if (!break_on_panic) + return NOTIFY_DONE; + if (dbg_kdb_mode) kdb_printf("PANIC: %s\n", (char *)data); kgdb_breakpoint(); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index bb9520f0f6ff..18a4cb33c52b 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -216,7 +216,7 @@ static char *kdb_read(char *buffer, size_t bufsize) int i; int diag, dtab_count; int key; - + static int last_crlf; diag = kdbgetintenv("DTABCOUNT", &dtab_count); if (diag) @@ -237,6 +237,9 @@ poll_again: return buffer; if (key != 9) tab = 0; + if (key != 10 && key != 13) + last_crlf = 0; + switch (key) { case 8: /* backspace */ if (cp > buffer) { @@ -254,7 +257,12 @@ poll_again: *cp = tmp; } break; - case 13: /* enter */ + case 10: /* new line */ + case 13: /* carriage return */ + /* handle \n after \r */ + if (last_crlf && last_crlf != key) + break; + last_crlf = key; *lastchar++ = '\n'; *lastchar++ = '\0'; if (!KDB_STATE(KGDB_TRANS)) { diff --git a/kernel/fork.c b/kernel/fork.c index 81633337aee1..bc3398ee1d79 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -158,6 +158,9 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; +/* Notifier list called when a task struct is freed */ +static ATOMIC_NOTIFIER_HEAD(task_free_notifier); + static void account_kernel_stack(struct thread_info *ti, int account) { struct zone *zone = page_zone(virt_to_page(ti)); @@ -188,6 +191,18 @@ static inline void put_signal_struct(struct signal_struct *sig) free_signal_struct(sig); } +int task_free_register(struct notifier_block *n) +{ + return atomic_notifier_chain_register(&task_free_notifier, n); +} +EXPORT_SYMBOL(task_free_register); + +int task_free_unregister(struct notifier_block *n) +{ + return atomic_notifier_chain_unregister(&task_free_notifier, n); +} +EXPORT_SYMBOL(task_free_unregister); + void __put_task_struct(struct task_struct *tsk) { WARN_ON(!tsk->exit_state); @@ -199,6 +214,7 @@ void __put_task_struct(struct task_struct *tsk) delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); + atomic_notifier_call_chain(&task_free_notifier, 0, tsk); if 
(!profile_handoff_task(tsk)) free_task(tsk); } @@ -677,7 +693,8 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) mm = get_task_mm(task); if (mm && mm != current->mm && - !ptrace_may_access(task, mode)) { + !ptrace_may_access(task, mode) && + !capable(CAP_SYS_RESOURCE)) { mmput(mm); mm = ERR_PTR(-EACCES); } diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig index a92028196cc1..824b741925bb 100644 --- a/kernel/gcov/Kconfig +++ b/kernel/gcov/Kconfig @@ -35,7 +35,7 @@ config GCOV_KERNEL config GCOV_PROFILE_ALL bool "Profile entire Kernel" depends on GCOV_KERNEL - depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE + depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE || ARM default n ---help--- This options activates profiling for the entire kernel. @@ -46,4 +46,10 @@ config GCOV_PROFILE_ALL larger and run slower. Also be sure to exclude files from profiling which are not linked to the kernel image to prevent linker errors. +config GCOV_CTORS + string + depends on CONSTRUCTORS + default ".init_array" if ARM && AEABI + default ".ctors" + endmenu diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c index ae5bb4260033..bc78336bc345 100644 --- a/kernel/gcov/gcc_3_4.c +++ b/kernel/gcov/gcc_3_4.c @@ -297,16 +297,30 @@ void gcov_iter_start(struct gcov_iterator *iter) } /* Mapping of logical record number to actual file content. */ -#define RECORD_FILE_MAGIC 0 -#define RECORD_GCOV_VERSION 1 -#define RECORD_TIME_STAMP 2 -#define RECORD_FUNCTION_TAG 3 -#define RECORD_FUNCTON_TAG_LEN 4 -#define RECORD_FUNCTION_IDENT 5 -#define RECORD_FUNCTION_CHECK 6 -#define RECORD_COUNT_TAG 7 -#define RECORD_COUNT_LEN 8 -#define RECORD_COUNT 9 +#define RECORD_FILE_MAGIC 0 +#define RECORD_GCOV_VERSION 1 +#define RECORD_TIME_STAMP 2 +#define RECORD_FUNCTION_TAG 3 +#define RECORD_FUNCTON_TAG_LEN 4 +#define RECORD_FUNCTION_IDENT 5 +#define RECORD_FUNCTION_CHECK_LINE 6 +#define RECORD_FUNCTION_CHECK_CFG 7 +#define RECORD_FUNCTION_NAME_LEN 8 +#define RECORD_FUNCTION_NAME 9 +#define RECORD_COUNT_TAG 10 +#define RECORD_COUNT_LEN 11 +#define RECORD_COUNT 12 + +/* Return length of string encoded in GCOV format. */ +static size_t +sizeof_str(const char *str) +{ + size_t len; + len = (str) ? 
strlen(str) : 0; + if (len == 0) + return 1; + return 1 + ((len + 4) >> 2); +} /** * gcov_iter_next - advance file iterator to next logical record @@ -323,6 +337,9 @@ int gcov_iter_next(struct gcov_iterator *iter) case RECORD_FUNCTON_TAG_LEN: case RECORD_FUNCTION_IDENT: case RECORD_COUNT_TAG: + case RECORD_FUNCTION_CHECK_LINE: + case RECORD_FUNCTION_CHECK_CFG: + case RECORD_FUNCTION_NAME_LEN: /* Advance to next record */ iter->record++; break; @@ -332,7 +349,7 @@ int gcov_iter_next(struct gcov_iterator *iter) /* fall through */ case RECORD_COUNT_LEN: if (iter->count < get_func(iter)->n_ctrs[iter->type]) { - iter->record = 9; + iter->record = 12; break; } /* Advance to next counter type */ @@ -340,9 +357,9 @@ int gcov_iter_next(struct gcov_iterator *iter) iter->count = 0; iter->type++; /* fall through */ - case RECORD_FUNCTION_CHECK: + case RECORD_FUNCTION_NAME: if (iter->type < iter->num_types) { - iter->record = 7; + iter->record = 10; break; } /* Advance to next function */ @@ -395,6 +412,34 @@ static int seq_write_gcov_u64(struct seq_file *seq, u64 v) data[1] = (v >> 32); return seq_write(seq, data, sizeof(data)); } +/** + * seq_write_gcov_str - write string in gcov format to seq_file + * @seq: seq_file handle + * @str: string to be stored + * + * Number format defined by gcc: numbers are recorded in the 32 bit + * unsigned binary form of the endianness of the machine generating the + * file. 64 bit numbers are stored as two 32 bit numbers, the low part + * first. + */ +static int seq_write_gcov_str(struct seq_file *seq, const char *str) +{ + if (str) { + size_t len; + int str_off; + u32 data; + len = strlen(str); + for (str_off = 0; str_off < (sizeof_str(str) - 2) ; str_off++) { + memcpy(&data, (str + str_off * 4), 4); + seq_write(seq, &data, sizeof(data)); + } + data = 0; + memcpy(&data, (str + str_off * 4), (len - str_off * 4)); + return seq_write(seq, &data, sizeof(data)); + } else { + return 0; + } +} /** * gcov_iter_write - write data for current pos to seq_file @@ -421,13 +466,36 @@ int gcov_iter_write(struct gcov_iterator *iter, struct seq_file *seq) rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION); break; case RECORD_FUNCTON_TAG_LEN: - rc = seq_write_gcov_u32(seq, 2); +#ifdef GCOV_FN_INFO_HAS_NAME_FIELD + rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION_LENGTH + + (sizeof_str(get_func(iter)->name))); +#else + rc = seq_write_gcov_u32(seq, GCOV_TAG_FUNCTION_LENGTH); +#endif break; case RECORD_FUNCTION_IDENT: rc = seq_write_gcov_u32(seq, get_func(iter)->ident); break; - case RECORD_FUNCTION_CHECK: - rc = seq_write_gcov_u32(seq, get_func(iter)->checksum); + case RECORD_FUNCTION_CHECK_LINE: + rc = seq_write_gcov_u32(seq, get_func(iter)->lineno_checksum); + break; + case RECORD_FUNCTION_CHECK_CFG: + rc = seq_write_gcov_u32(seq, get_func(iter)->cfg_checksum); + break; + case RECORD_FUNCTION_NAME_LEN: +#ifdef GCOV_FN_INFO_HAS_NAME_FIELD + rc = seq_write_gcov_u32(seq, + (sizeof_str(get_func(iter)->name) - 1)); +#else + rc = 0; +#endif + break; + case RECORD_FUNCTION_NAME: +#ifdef GCOV_FN_INFO_HAS_NAME_FIELD + rc = seq_write_gcov_str(seq, get_func(iter)->name); +#else + rc = 0; +#endif break; case RECORD_COUNT_TAG: rc = seq_write_gcov_u32(seq, diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h index 060073ebf7a6..8c5130a5c1b5 100644 --- a/kernel/gcov/gcov.h +++ b/kernel/gcov/gcov.h @@ -17,13 +17,21 @@ #include <linux/types.h> /* - * Profiling data types used for gcc 3.4 and above - these are defined by + * GCC 4.6 drops the 'name' field from 'struct gcov_fn_info'. 
+ */ +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6) +#define GCOV_FN_INFO_HAS_NAME_FIELD +#endif + +/* + * Profiling data types used for at least gcc 4.4 and 4.6 - these are defined by * gcc and need to be kept as close to the original definition as possible to * remain compatible. */ -#define GCOV_COUNTERS 5 +#define GCOV_COUNTERS 10 #define GCOV_DATA_MAGIC ((unsigned int) 0x67636461) #define GCOV_TAG_FUNCTION ((unsigned int) 0x01000000) +#define GCOV_TAG_FUNCTION_LENGTH 3 #define GCOV_TAG_COUNTER_BASE ((unsigned int) 0x01a10000) #define GCOV_TAG_FOR_COUNTER(count) \ (GCOV_TAG_COUNTER_BASE + ((unsigned int) (count) << 17)) @@ -34,10 +42,38 @@ typedef long gcov_type; typedef long long gcov_type; #endif +/* + * Source module info. The data structure is used in both runtime and + * profile-use phase. + */ +struct gcov_module_info { + unsigned int ident; +/* + * This is overloaded to mean two things: + * (1) means FDO/LIPO in instrumented binary. + * (2) means IS_PRIMARY in persistent file or memory copy used in profile-use. + */ + unsigned int is_primary; + unsigned int is_exported; + unsigned int lang; + char *da_filename; + char *source_filename; + unsigned int num_quote_paths; + unsigned int num_bracket_paths; + unsigned int num_cpp_defines; + unsigned int num_cpp_includes; + unsigned int num_cl_args; + char *string_array[1]; +}; + + /** * struct gcov_fn_info - profiling meta data per function * @ident: object file-unique function identifier - * @checksum: function checksum + * @lineno_checksum: function lineno checksum + * @cfg_checksum: function cfg checksum + * @dc_offset: direct call offset + * @name: function name * @n_ctrs: number of values per counter type belonging to this function * * This data is generated by gcc during compilation and doesn't change @@ -45,7 +81,12 @@ typedef long long gcov_type; */ struct gcov_fn_info { unsigned int ident; - unsigned int checksum; + unsigned int lineno_checksum; + unsigned int cfg_checksum; + unsigned int dc_offset; +#ifdef GCOV_FN_INFO_HAS_NAME_FIELD + const char *name; +#endif unsigned int n_ctrs[0]; }; @@ -67,9 +108,11 @@ struct gcov_ctr_info { /** * struct gcov_info - profiling data per object file * @version: gcov version magic indicating the gcc version used for compilation + * @modinfo: additional module information * @next: list head for a singly-linked list * @stamp: time stamp * @filename: name of the associated gcov data file + * @eof_pos: end position of profile data * @n_functions: number of instrumented functions * @functions: function data * @ctr_mask: mask specifying which counter types are active @@ -80,9 +123,11 @@ struct gcov_ctr_info { */ struct gcov_info { unsigned int version; + struct gcov_module_info *mod_info; struct gcov_info *next; unsigned int stamp; const char *filename; + unsigned int eof_pos; unsigned int n_functions; const struct gcov_fn_info *functions; unsigned int ctr_mask; diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index 15e53b1766a6..fe4b09cf829c 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -104,8 +104,13 @@ int check_wakeup_irqs(void) for_each_irq_desc(irq, desc) { if (irqd_is_wakeup_set(&desc->irq_data)) { - if (desc->istate & IRQS_PENDING) + if (desc->istate & IRQS_PENDING) { + pr_info("Wakeup IRQ %d %s pending, suspend aborted\n", + irq, + desc->action && desc->action->name ? 
+ desc->action->name : ""); return -EBUSY; + } continue; } /* diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c index 14dd5761e8c9..ef60772d2feb 100644 --- a/kernel/irq/resend.c +++ b/kernel/irq/resend.c @@ -55,17 +55,18 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0); */ void check_irq_resend(struct irq_desc *desc, unsigned int irq) { - /* - * We do not resend level type interrupts. Level type - * interrupts are resent by hardware when they are still - * active. - */ - if (irq_settings_is_level(desc)) - return; - if (desc->istate & IRQS_REPLAY) - return; if (desc->istate & IRQS_PENDING) { desc->istate &= ~IRQS_PENDING; + /* + * We do not resend level type interrupts. Level type + * interrupts are resent by hardware when they are still + * active. + */ + if (irq_settings_is_level(desc)) + return; + if (desc->istate & IRQS_REPLAY) + return; + desc->istate |= IRQS_REPLAY; if (!desc->irq_data.chip->irq_retrigger || diff --git a/kernel/kthread.c b/kernel/kthread.c index 3d3de633702e..b68236b45ba9 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -16,6 +16,7 @@ #include <linux/mutex.h> #include <linux/slab.h> #include <linux/freezer.h> +#include <linux/preempt.h> #include <trace/events/sched.h> static DEFINE_SPINLOCK(kthread_create_lock); @@ -113,7 +114,17 @@ static int kthread(void *_create) /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; + + /* + * Disable preemption so we enter TASK_UNINTERRUPTIBLE after + * complete() instead of possibly being preempted. This speeds + * up clients that do a kthread_bind() directly after + * creation. + */ + preempt_disable(); complete(&create->done); + preempt_enable_no_resched(); + schedule(); ret = -EINTR; diff --git a/kernel/module.c b/kernel/module.c index 78ac6ec1e425..b084bf116fc4 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2600,7 +2600,7 @@ static void find_module_sections(struct module *mod, struct load_info *info) mod->unused_gpl_crcs = section_addr(info, "__kcrctab_unused_gpl"); #endif #ifdef CONFIG_CONSTRUCTORS - mod->ctors = section_objs(info, ".ctors", + mod->ctors = section_objs(info, CONFIG_GCOV_CTORS, sizeof(*mod->ctors), &mod->num_ctors); #endif diff --git a/kernel/panic.c b/kernel/panic.c index 9ed023b8333a..90fd443165df 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -27,13 +27,19 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 +/* Machine specific panic information string */ +char *mach_panic_string; + int panic_on_oops; static unsigned long tainted_mask; static int pause_on_oops; static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); -int panic_timeout; +#ifndef CONFIG_PANIC_TIMEOUT +#define CONFIG_PANIC_TIMEOUT 0 +#endif +int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); ATOMIC_NOTIFIER_HEAD(panic_notifier_list); @@ -375,6 +381,11 @@ late_initcall(init_oops_id); void print_oops_end_marker(void) { init_oops_id(); + + if (mach_panic_string) + printk(KERN_WARNING "Board Information: %s\n", + mach_panic_string); + printk(KERN_WARNING "---[ end trace %016llx ]---\n", (unsigned long long)oops_id); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index deb5461e3216..815da3c5cdd4 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -18,6 +18,14 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. 
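Stepping back to the kthread.c hunk above: preemption is disabled around complete(&create->done) so the new thread reaches schedule() still in TASK_UNINTERRUPTIBLE instead of being preempted by the creator it just woke. That matters because kthread_bind() expects a thread that is not running, and callers frequently bind immediately after creation. A hedged sketch of that caller pattern, with a hypothetical thread function (my_thread_fn) and helper (start_bound_thread), using the standard kthread API:

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int my_thread_fn(void *data)
{
	set_current_state(TASK_INTERRUPTIBLE);
	while (!kthread_should_stop()) {
		schedule();		/* real per-cpu work would go here */
		set_current_state(TASK_INTERRUPTIBLE);
	}
	__set_current_state(TASK_RUNNING);
	return 0;
}

static struct task_struct *start_bound_thread(int cpu)
{
	struct task_struct *tsk;

	tsk = kthread_create(my_thread_fn, NULL, "my_thread/%d", cpu);
	if (IS_ERR(tsk))
		return tsk;

	/* kthread_create() leaves the thread sleeping, so binding is safe. */
	kthread_bind(tsk, cpu);
	wake_up_process(tsk);
	return tsk;
}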
+config HAS_WAKELOCK + bool + default y + +config WAKELOCK + bool + default y + config HIBERNATE_CALLBACKS bool @@ -75,20 +83,20 @@ config PM_STD_PARTITION default "" ---help--- The default resume partition is the partition that the suspend- - to-disk implementation will look for a suspended disk image. + to-disk implementation will look for a suspended disk image. - The partition specified here will be different for almost every user. + The partition specified here will be different for almost every user. It should be a valid swap partition (at least for now) that is turned - on before suspending. + on before suspending. The partition specified can be overridden by specifying: - resume=/dev/<other device> + resume=/dev/<other device> - which will set the resume partition to the device specified. + which will set the resume partition to the device specified. Note there is currently not a way to specify which device to save the - suspended image to. It will simply pick the first available swap + suspended image to. It will simply pick the first available swap device. config PM_SLEEP @@ -103,6 +111,33 @@ config PM_SLEEP_SMP select HOTPLUG select HOTPLUG_CPU +config PM_AUTOSLEEP + bool "Opportunistic sleep" + depends on PM_SLEEP + default n + ---help--- + Allow the kernel to trigger a system transition into a global sleep + state automatically whenever there are no active wakeup sources. + +config PM_WAKELOCKS + bool "User space wakeup sources interface" + depends on PM_SLEEP + default n + ---help--- + Allow user space to create, activate and deactivate wakeup source + objects with the help of a sysfs-based interface. + +config PM_WAKELOCKS_LIMIT + int "Maximum number of user space wakeup sources (0 = no limit)" + range 0 100000 + default 100 + depends on PM_WAKELOCKS + +config PM_WAKELOCKS_GC + bool "Garbage collector for user space wakeup sources" + depends on PM_WAKELOCKS + default y + config PM_RUNTIME bool "Run-time PM core functionality" depends on !IA64_HP_SIM @@ -243,3 +278,10 @@ config PM_GENERIC_DOMAINS_RUNTIME config CPU_PM bool depends on SUSPEND || CPU_IDLE + +config SUSPEND_TIME + bool "Log time spent in suspend" + ---help--- + Prints the time spent in suspend in the kernel log, and + keeps statistics on the time spent in suspend in + /sys/kernel/debug/suspend_time diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 66d808ec5252..8450b85d33c0 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -9,5 +9,8 @@ obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o +obj-$(CONFIG_PM_AUTOSLEEP) += autosleep.o +obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o +obj-$(CONFIG_SUSPEND_TIME) += suspend_time.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c new file mode 100644 index 000000000000..ca304046d9e2 --- /dev/null +++ b/kernel/power/autosleep.c @@ -0,0 +1,127 @@ +/* + * kernel/power/autosleep.c + * + * Opportunistic sleep support. + * + * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> + */ + +#include <linux/device.h> +#include <linux/mutex.h> +#include <linux/pm_wakeup.h> + +#include "power.h" + +static suspend_state_t autosleep_state; +static struct workqueue_struct *autosleep_wq; +/* + * Note: it is only safe to mutex_lock(&autosleep_lock) if a wakeup_source + * is active, otherwise a deadlock with try_to_suspend() is possible. + * Alternatively mutex_lock_interruptible() can be used. 
This will then fail + * if an auto_sleep cycle tries to freeze processes. + */ +static DEFINE_MUTEX(autosleep_lock); +static struct wakeup_source *autosleep_ws; + +static void try_to_suspend(struct work_struct *work) +{ + unsigned int initial_count, final_count; + + if (!pm_get_wakeup_count(&initial_count, true)) + goto out; + + mutex_lock(&autosleep_lock); + + if (!pm_save_wakeup_count(initial_count)) { + mutex_unlock(&autosleep_lock); + goto out; + } + + if (autosleep_state == PM_SUSPEND_ON) { + mutex_unlock(&autosleep_lock); + return; + } + if (autosleep_state >= PM_SUSPEND_MAX) + hibernate(); + else + pm_suspend(autosleep_state); + + mutex_unlock(&autosleep_lock); + + if (!pm_get_wakeup_count(&final_count, false)) + goto out; + + /* + * If the wakeup occured for an unknown reason, wait to prevent the + * system from trying to suspend and waking up in a tight loop. + */ + if (final_count == initial_count) + schedule_timeout_uninterruptible(HZ / 2); + + out: + queue_up_suspend_work(); +} + +static DECLARE_WORK(suspend_work, try_to_suspend); + +void queue_up_suspend_work(void) +{ + if (!work_pending(&suspend_work) && autosleep_state > PM_SUSPEND_ON) + queue_work(autosleep_wq, &suspend_work); +} + +suspend_state_t pm_autosleep_state(void) +{ + return autosleep_state; +} + +int pm_autosleep_lock(void) +{ + return mutex_lock_interruptible(&autosleep_lock); +} + +void pm_autosleep_unlock(void) +{ + mutex_unlock(&autosleep_lock); +} + +int pm_autosleep_set_state(suspend_state_t state) +{ + +#ifndef CONFIG_HIBERNATION + if (state >= PM_SUSPEND_MAX) + return -EINVAL; +#endif + + __pm_stay_awake(autosleep_ws); + + mutex_lock(&autosleep_lock); + + autosleep_state = state; + + __pm_relax(autosleep_ws); + + if (state > PM_SUSPEND_ON) { + pm_wakep_autosleep_enabled(true); + queue_up_suspend_work(); + } else { + pm_wakep_autosleep_enabled(false); + } + + mutex_unlock(&autosleep_lock); + return 0; +} + +int __init pm_autosleep_init(void) +{ + autosleep_ws = wakeup_source_register("autosleep"); + if (!autosleep_ws) + return -ENOMEM; + + autosleep_wq = alloc_ordered_workqueue("autosleep", 0); + if (autosleep_wq) + return 0; + + wakeup_source_unregister(autosleep_ws); + return -ENOMEM; +} diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index 52a18173c845..586521aa2baf 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -25,6 +25,8 @@ #include <linux/freezer.h> #include <linux/gfp.h> #include <linux/syscore_ops.h> +#include <linux/ctype.h> +#include <linux/genhd.h> #include <scsi/scsi_scan.h> #include "power.h" @@ -728,6 +730,17 @@ static int software_resume(void) /* Check if the device is there */ swsusp_resume_device = name_to_dev_t(resume_file); + + /* + * name_to_dev_t is ineffective to verify parition if resume_file is in + * integer format. (e.g. 
major:minor) + */ + if (isdigit(resume_file[0]) && resume_wait) { + int partno; + while (!get_gendisk(swsusp_resume_device, &partno)) + msleep(10); + } + if (!swsusp_resume_device) { /* * Some device discovery might still be in progress; we need diff --git a/kernel/power/main.c b/kernel/power/main.c index 1c12581f1c62..428f8a034e96 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -269,8 +269,7 @@ static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, return (s - buf); } -static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) +static suspend_state_t decode_state(const char *buf, size_t n) { #ifdef CONFIG_SUSPEND suspend_state_t state = PM_SUSPEND_STANDBY; @@ -278,27 +277,48 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, #endif char *p; int len; - int error = -EINVAL; p = memchr(buf, '\n', n); len = p ? p - buf : n; - /* First, check if we are requested to hibernate */ - if (len == 4 && !strncmp(buf, "disk", len)) { - error = hibernate(); - goto Exit; - } + /* Check hibernation first. */ + if (len == 4 && !strncmp(buf, "disk", len)) + return PM_SUSPEND_MAX; #ifdef CONFIG_SUSPEND - for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) { - if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { - error = pm_suspend(state); - break; - } - } + for (s = &pm_states[state]; state < PM_SUSPEND_MAX; s++, state++) + if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) + return state; #endif - Exit: + return PM_SUSPEND_ON; +} + +static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + suspend_state_t state; + int error; + + error = pm_autosleep_lock(); + if (error) + return error; + + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } + + state = decode_state(buf, n); + if (state < PM_SUSPEND_MAX) + error = pm_suspend(state); + else if (state == PM_SUSPEND_MAX) + error = hibernate(); + else + error = -EINVAL; + + out: + pm_autosleep_unlock(); return error ? error : n; } @@ -339,7 +359,8 @@ static ssize_t wakeup_count_show(struct kobject *kobj, { unsigned int val; - return pm_get_wakeup_count(&val) ? sprintf(buf, "%u\n", val) : -EINTR; + return pm_get_wakeup_count(&val, true) ? + sprintf(buf, "%u\n", val) : -EINTR; } static ssize_t wakeup_count_store(struct kobject *kobj, @@ -347,15 +368,106 @@ static ssize_t wakeup_count_store(struct kobject *kobj, const char *buf, size_t n) { unsigned int val; + int error; + + error = pm_autosleep_lock(); + if (error) + return error; + + if (pm_autosleep_state() > PM_SUSPEND_ON) { + error = -EBUSY; + goto out; + } + error = -EINVAL; if (sscanf(buf, "%u", &val) == 1) { if (pm_save_wakeup_count(val)) - return n; + error = n; } - return -EINVAL; + + out: + pm_autosleep_unlock(); + return error; } power_attr(wakeup_count); + +#ifdef CONFIG_PM_AUTOSLEEP +static ssize_t autosleep_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + suspend_state_t state = pm_autosleep_state(); + + if (state == PM_SUSPEND_ON) + return sprintf(buf, "off\n"); + +#ifdef CONFIG_SUSPEND + if (state < PM_SUSPEND_MAX) + return sprintf(buf, "%s\n", valid_state(state) ? 
+ pm_states[state] : "error"); +#endif +#ifdef CONFIG_HIBERNATION + return sprintf(buf, "disk\n"); +#else + return sprintf(buf, "error"); +#endif +} + +static ssize_t autosleep_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + suspend_state_t state = decode_state(buf, n); + int error; + + if (state == PM_SUSPEND_ON + && strcmp(buf, "off") && strcmp(buf, "off\n")) + return -EINVAL; + + error = pm_autosleep_set_state(state); + return error ? error : n; +} + +power_attr(autosleep); +#endif /* CONFIG_PM_AUTOSLEEP */ + +#ifdef CONFIG_PM_WAKELOCKS +static ssize_t wake_lock_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return pm_show_wakelocks(buf, true); +} + +static ssize_t wake_lock_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int error = pm_wake_lock(buf); + return error ? error : n; +} + +power_attr(wake_lock); + +static ssize_t wake_unlock_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return pm_show_wakelocks(buf, false); +} + +static ssize_t wake_unlock_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int error = pm_wake_unlock(buf); + return error ? error : n; +} + +power_attr(wake_unlock); + +#endif /* CONFIG_PM_WAKELOCKS */ #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_PM_TRACE @@ -409,6 +521,13 @@ static struct attribute * g[] = { #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, &wakeup_count_attr.attr, +#ifdef CONFIG_PM_AUTOSLEEP + &autosleep_attr.attr, +#endif +#ifdef CONFIG_PM_WAKELOCKS + &wake_lock_attr.attr, + &wake_unlock_attr.attr, +#endif #ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, #endif @@ -444,7 +563,10 @@ static int __init pm_init(void) power_kobj = kobject_create_and_add("power", NULL); if (!power_kobj) return -ENOMEM; - return sysfs_create_group(power_kobj, &attr_group); + error = sysfs_create_group(power_kobj, &attr_group); + if (error) + return error; + return pm_autosleep_init(); } core_initcall(pm_init); diff --git a/kernel/power/power.h b/kernel/power/power.h index 98f3622d7407..b0bd4beaebfe 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -264,3 +264,30 @@ static inline void suspend_thaw_processes(void) { } #endif + +#ifdef CONFIG_PM_AUTOSLEEP + +/* kernel/power/autosleep.c */ +extern int pm_autosleep_init(void); +extern int pm_autosleep_lock(void); +extern void pm_autosleep_unlock(void); +extern suspend_state_t pm_autosleep_state(void); +extern int pm_autosleep_set_state(suspend_state_t state); + +#else /* !CONFIG_PM_AUTOSLEEP */ + +static inline int pm_autosleep_init(void) { return 0; } +static inline int pm_autosleep_lock(void) { return 0; } +static inline void pm_autosleep_unlock(void) {} +static inline suspend_state_t pm_autosleep_state(void) { return PM_SUSPEND_ON; } + +#endif /* !CONFIG_PM_AUTOSLEEP */ + +#ifdef CONFIG_PM_WAKELOCKS + +/* kernel/power/wakelock.c */ +extern ssize_t pm_show_wakelocks(char *buf, bool show_active); +extern int pm_wake_lock(const char *buf); +extern int pm_wake_unlock(const char *buf); + +#endif /* !CONFIG_PM_WAKELOCKS */ diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 6a031e684026..834fbfc61398 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -101,11 +101,72 @@ static struct pm_qos_object network_throughput_pm_qos = { }; +static BLOCKING_NOTIFIER_HEAD(min_online_cpus_notifier); +static struct pm_qos_constraints min_online_cpus_constraints = { + .list = PLIST_HEAD_INIT(min_online_cpus_constraints.list), + 
.target_value = PM_QOS_MIN_ONLINE_CPUS_DEFAULT_VALUE, + .default_value = PM_QOS_MIN_ONLINE_CPUS_DEFAULT_VALUE, + .type = PM_QOS_MAX, + .notifiers = &min_online_cpus_notifier, +}; +static struct pm_qos_object min_online_cpus_pm_qos = { + .constraints = &min_online_cpus_constraints, + .name = "min_online_cpus", +}; + + +static BLOCKING_NOTIFIER_HEAD(max_online_cpus_notifier); +static struct pm_qos_constraints max_online_cpus_constraints = { + .list = PLIST_HEAD_INIT(max_online_cpus_constraints.list), + .target_value = PM_QOS_MAX_ONLINE_CPUS_DEFAULT_VALUE, + .default_value = PM_QOS_MAX_ONLINE_CPUS_DEFAULT_VALUE, + .type = PM_QOS_MIN, + .notifiers = &max_online_cpus_notifier, +}; +static struct pm_qos_object max_online_cpus_pm_qos = { + .constraints = &max_online_cpus_constraints, + .name = "max_online_cpus", + +}; + + +static BLOCKING_NOTIFIER_HEAD(cpu_freq_min_notifier); +static struct pm_qos_constraints cpu_freq_min_constraints = { + .list = PLIST_HEAD_INIT(cpu_freq_min_constraints.list), + .target_value = PM_QOS_CPU_FREQ_MIN_DEFAULT_VALUE, + .default_value = PM_QOS_CPU_FREQ_MIN_DEFAULT_VALUE, + .type = PM_QOS_MAX, + .notifiers = &cpu_freq_min_notifier, +}; +static struct pm_qos_object cpu_freq_min_pm_qos = { + .constraints = &cpu_freq_min_constraints, + .name = "cpu_freq_min", +}; + + +static BLOCKING_NOTIFIER_HEAD(cpu_freq_max_notifier); +static struct pm_qos_constraints cpu_freq_max_constraints = { + .list = PLIST_HEAD_INIT(cpu_freq_max_constraints.list), + .target_value = PM_QOS_CPU_FREQ_MAX_DEFAULT_VALUE, + .default_value = PM_QOS_CPU_FREQ_MAX_DEFAULT_VALUE, + .type = PM_QOS_MIN, + .notifiers = &cpu_freq_max_notifier, +}; +static struct pm_qos_object cpu_freq_max_pm_qos = { + .constraints = &cpu_freq_max_constraints, + .name = "cpu_freq_max", +}; + + static struct pm_qos_object *pm_qos_array[] = { &null_pm_qos, &cpu_dma_pm_qos, &network_lat_pm_qos, - &network_throughput_pm_qos + &network_throughput_pm_qos, + &min_online_cpus_pm_qos, + &max_online_cpus_pm_qos, + &cpu_freq_min_pm_qos, + &cpu_freq_max_pm_qos }; static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index c8b7446b27df..7a2bb5beda6c 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -24,6 +24,7 @@ #include <linux/export.h> #include <linux/suspend.h> #include <linux/syscore_ops.h> +#include <linux/rtc.h> #include <linux/ftrace.h> #include <trace/events/power.h> @@ -303,6 +304,18 @@ static int enter_state(suspend_state_t state) return error; } +static void pm_suspend_marker(char *annotation) +{ + struct timespec ts; + struct rtc_time tm; + + getnstimeofday(&ts); + rtc_time_to_tm(ts.tv_sec, &tm); + pr_info("PM: suspend %s %d-%02d-%02d %02d:%02d:%02d.%09lu UTC\n", + annotation, tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday, + tm.tm_hour, tm.tm_min, tm.tm_sec, ts.tv_nsec); +} + /** * pm_suspend - Externally visible function for suspending the system. * @state: System sleep state to enter. 
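For orientation before the remaining power/ hunks: the wake_lock, wake_unlock and autosleep attributes wired up in main.c above define the whole user space contract. A name (with an optional timeout in nanoseconds) written to /sys/power/wake_lock creates or activates a wakeup source, the bare name written to /sys/power/wake_unlock releases it, and a sleep state written to /sys/power/autosleep ("mem", "disk", or "off") controls opportunistic suspend. A rough user-space sketch against those paths, with error handling trimmed:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static void sysfs_write(const char *path, const char *val)
{
	int fd = open(path, O_WRONLY);
	ssize_t ret;

	if (fd < 0)
		return;
	ret = write(fd, val, strlen(val));
	(void)ret;		/* sketch: short writes ignored */
	close(fd);
}

int main(void)
{
	/* Hold "my_lock" with a 5 s (5e9 ns) auto-release timeout. */
	sysfs_write("/sys/power/wake_lock", "my_lock 5000000000");

	/* Let the kernel suspend to mem whenever no source is active. */
	sysfs_write("/sys/power/autosleep", "mem");

	/* ... work that must finish before the system may suspend ... */

	sysfs_write("/sys/power/wake_unlock", "my_lock");
	return 0;
}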
@@ -317,6 +330,7 @@ int pm_suspend(suspend_state_t state) if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX) return -EINVAL; + pm_suspend_marker("entry"); error = enter_state(state); if (error) { suspend_stats.fail++; @@ -324,6 +338,7 @@ int pm_suspend(suspend_state_t state) } else { suspend_stats.success++; } + pm_suspend_marker("exit"); return error; } EXPORT_SYMBOL(pm_suspend); diff --git a/kernel/power/suspend_time.c b/kernel/power/suspend_time.c new file mode 100644 index 000000000000..d2a65da9f22c --- /dev/null +++ b/kernel/power/suspend_time.c @@ -0,0 +1,111 @@ +/* + * debugfs file to track time spent in suspend + * + * Copyright (c) 2011, Google, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/debugfs.h> +#include <linux/err.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/seq_file.h> +#include <linux/syscore_ops.h> +#include <linux/time.h> + +static struct timespec suspend_time_before; +static unsigned int time_in_suspend_bins[32]; + +#ifdef CONFIG_DEBUG_FS +static int suspend_time_debug_show(struct seq_file *s, void *data) +{ + int bin; + seq_printf(s, "time (secs) count\n"); + seq_printf(s, "------------------\n"); + for (bin = 0; bin < 32; bin++) { + if (time_in_suspend_bins[bin] == 0) + continue; + seq_printf(s, "%4d - %4d %4u\n", + bin ? 
1 << (bin - 1) : 0, 1 << bin, + time_in_suspend_bins[bin]); + } + return 0; +} + +static int suspend_time_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, suspend_time_debug_show, NULL); +} + +static const struct file_operations suspend_time_debug_fops = { + .open = suspend_time_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init suspend_time_debug_init(void) +{ + struct dentry *d; + + d = debugfs_create_file("suspend_time", 0755, NULL, NULL, + &suspend_time_debug_fops); + if (!d) { + pr_err("Failed to create suspend_time debug file\n"); + return -ENOMEM; + } + + return 0; +} + +late_initcall(suspend_time_debug_init); +#endif + +static int suspend_time_syscore_suspend(void) +{ + read_persistent_clock(&suspend_time_before); + + return 0; +} + +static void suspend_time_syscore_resume(void) +{ + struct timespec after; + + read_persistent_clock(&after); + + after = timespec_sub(after, suspend_time_before); + + time_in_suspend_bins[fls(after.tv_sec)]++; + + pr_info("Suspended for %lu.%03lu seconds\n", after.tv_sec, + after.tv_nsec / NSEC_PER_MSEC); +} + +static struct syscore_ops suspend_time_syscore_ops = { + .suspend = suspend_time_syscore_suspend, + .resume = suspend_time_syscore_resume, +}; + +static int suspend_time_syscore_init(void) +{ + register_syscore_ops(&suspend_time_syscore_ops); + + return 0; +} + +static void suspend_time_syscore_exit(void) +{ + unregister_syscore_ops(&suspend_time_syscore_ops); +} +module_init(suspend_time_syscore_init); +module_exit(suspend_time_syscore_exit); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index eef311a58a64..11e22c068e8b 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -6,7 +6,7 @@ * * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@ucw.cz> * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl> - * Copyright (C) 2010 Bojan Smojver <bojan@rexursive.com> + * Copyright (C) 2010-2012 Bojan Smojver <bojan@rexursive.com> * * This file is released under the GPLv2. * @@ -282,14 +282,17 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain) return -ENOSPC; if (bio_chain) { - src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | + __GFP_NORETRY); if (src) { copy_page(src, buf); } else { ret = hib_wait_on_bio_chain(bio_chain); /* Free pages */ if (ret) return ret; - src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + src = (void *)__get_free_page(__GFP_WAIT | + __GFP_NOWARN | + __GFP_NORETRY); if (src) { copy_page(src, buf); } else { @@ -367,12 +370,17 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf, clear_page(handle->cur); handle->cur_swap = offset; handle->k = 0; - } - if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { - error = hib_wait_on_bio_chain(bio_chain); - if (error) - goto out; - handle->reqd_free_pages = reqd_free_pages(); + + if (bio_chain && low_free_pages() <= handle->reqd_free_pages) { + error = hib_wait_on_bio_chain(bio_chain); + if (error) + goto out; + /* + * Recalculate the number of required free pages, to + * make sure we never take more than half. + */ + handle->reqd_free_pages = reqd_free_pages(); + } } out: return error; @@ -419,8 +427,9 @@ static int swap_writer_finish(struct swap_map_handle *handle, /* Maximum number of threads for compression/decompression. */ #define LZO_THREADS 3 -/* Maximum number of pages for read buffering. 
*/ -#define LZO_READ_PAGES (MAP_PAGE_ENTRIES * 8) +/* Minimum/maximum number of pages for read buffering. */ +#define LZO_MIN_RD_PAGES 1024 +#define LZO_MAX_RD_PAGES 8192 /** @@ -631,12 +640,6 @@ static int save_image_lzo(struct swap_map_handle *handle, } /* - * Adjust number of free pages after all allocations have been done. - * We don't want to run out of pages when writing. - */ - handle->reqd_free_pages = reqd_free_pages(); - - /* * Start the CRC32 thread. */ init_waitqueue_head(&crc->go); @@ -657,6 +660,12 @@ static int save_image_lzo(struct swap_map_handle *handle, goto out_clean; } + /* + * Adjust the number of required free pages after all allocations have + * been done. We don't want to run out of pages when writing. + */ + handle->reqd_free_pages = reqd_free_pages(); + printk(KERN_INFO "PM: Using %u thread(s) for compression.\n" "PM: Compressing and saving image data (%u pages) ... ", @@ -1067,7 +1076,7 @@ static int load_image_lzo(struct swap_map_handle *handle, unsigned i, thr, run_threads, nr_threads; unsigned ring = 0, pg = 0, ring_size = 0, have = 0, want, need, asked = 0; - unsigned long read_pages; + unsigned long read_pages = 0; unsigned char **page = NULL; struct dec_data *data = NULL; struct crc_data *crc = NULL; @@ -1079,7 +1088,7 @@ static int load_image_lzo(struct swap_map_handle *handle, nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - page = vmalloc(sizeof(*page) * LZO_READ_PAGES); + page = vmalloc(sizeof(*page) * LZO_MAX_RD_PAGES); if (!page) { printk(KERN_ERR "PM: Failed to allocate LZO page\n"); ret = -ENOMEM; @@ -1144,15 +1153,22 @@ static int load_image_lzo(struct swap_map_handle *handle, } /* - * Adjust number of pages for read buffering, in case we are short. + * Set the number of pages for read buffering. + * This is complete guesswork, because we'll only know the real + * picture once prepare_image() is called, which is much later on + * during the image load phase. We'll assume the worst case and + * say that none of the image pages are from high memory. */ - read_pages = (nr_free_pages() - snapshot_get_image_size()) >> 1; - read_pages = clamp_val(read_pages, LZO_CMP_PAGES, LZO_READ_PAGES); + if (low_free_pages() > snapshot_get_image_size()) + read_pages = (low_free_pages() - snapshot_get_image_size()) / 2; + read_pages = clamp_val(read_pages, LZO_MIN_RD_PAGES, LZO_MAX_RD_PAGES); for (i = 0; i < read_pages; i++) { page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? __GFP_WAIT | __GFP_HIGH : - __GFP_WAIT); + __GFP_WAIT | __GFP_NOWARN | + __GFP_NORETRY); + if (!page[i]) { if (i < LZO_CMP_PAGES) { ring_size = i; diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c new file mode 100644 index 000000000000..c8fba3380076 --- /dev/null +++ b/kernel/power/wakelock.c @@ -0,0 +1,259 @@ +/* + * kernel/power/wakelock.c + * + * User space wakeup sources support. + * + * Copyright (C) 2012 Rafael J. Wysocki <rjw@sisk.pl> + * + * This code is based on the analogous interface allowing user space to + * manipulate wakelocks on Android. 
+ */ + +#include <linux/ctype.h> +#include <linux/device.h> +#include <linux/err.h> +#include <linux/hrtimer.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/slab.h> + +static DEFINE_MUTEX(wakelocks_lock); + +struct wakelock { + char *name; + struct rb_node node; + struct wakeup_source ws; +#ifdef CONFIG_PM_WAKELOCKS_GC + struct list_head lru; +#endif +}; + +static struct rb_root wakelocks_tree = RB_ROOT; + +ssize_t pm_show_wakelocks(char *buf, bool show_active) +{ + struct rb_node *node; + struct wakelock *wl; + char *str = buf; + char *end = buf + PAGE_SIZE; + + mutex_lock(&wakelocks_lock); + + for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) { + wl = rb_entry(node, struct wakelock, node); + if (wl->ws.active == show_active) + str += scnprintf(str, end - str, "%s ", wl->name); + } + if (str > buf) + str--; + + str += scnprintf(str, end - str, "\n"); + + mutex_unlock(&wakelocks_lock); + return (str - buf); +} + +#if CONFIG_PM_WAKELOCKS_LIMIT > 0 +static unsigned int number_of_wakelocks; + +static inline bool wakelocks_limit_exceeded(void) +{ + return number_of_wakelocks > CONFIG_PM_WAKELOCKS_LIMIT; +} + +static inline void increment_wakelocks_number(void) +{ + number_of_wakelocks++; +} + +static inline void decrement_wakelocks_number(void) +{ + number_of_wakelocks--; +} +#else /* CONFIG_PM_WAKELOCKS_LIMIT = 0 */ +static inline bool wakelocks_limit_exceeded(void) { return false; } +static inline void increment_wakelocks_number(void) {} +static inline void decrement_wakelocks_number(void) {} +#endif /* CONFIG_PM_WAKELOCKS_LIMIT */ + +#ifdef CONFIG_PM_WAKELOCKS_GC +#define WL_GC_COUNT_MAX 100 +#define WL_GC_TIME_SEC 300 + +static LIST_HEAD(wakelocks_lru_list); +static unsigned int wakelocks_gc_count; + +static inline void wakelocks_lru_add(struct wakelock *wl) +{ + list_add(&wl->lru, &wakelocks_lru_list); +} + +static inline void wakelocks_lru_most_recent(struct wakelock *wl) +{ + list_move(&wl->lru, &wakelocks_lru_list); +} + +static void wakelocks_gc(void) +{ + struct wakelock *wl, *aux; + ktime_t now; + + if (++wakelocks_gc_count <= WL_GC_COUNT_MAX) + return; + + now = ktime_get(); + list_for_each_entry_safe_reverse(wl, aux, &wakelocks_lru_list, lru) { + u64 idle_time_ns; + bool active; + + spin_lock_irq(&wl->ws.lock); + idle_time_ns = ktime_to_ns(ktime_sub(now, wl->ws.last_time)); + active = wl->ws.active; + spin_unlock_irq(&wl->ws.lock); + + if (idle_time_ns < ((u64)WL_GC_TIME_SEC * NSEC_PER_SEC)) + break; + + if (!active) { + wakeup_source_remove(&wl->ws); + rb_erase(&wl->node, &wakelocks_tree); + list_del(&wl->lru); + kfree(wl->name); + kfree(wl); + decrement_wakelocks_number(); + } + } + wakelocks_gc_count = 0; +} +#else /* !CONFIG_PM_WAKELOCKS_GC */ +static inline void wakelocks_lru_add(struct wakelock *wl) {} +static inline void wakelocks_lru_most_recent(struct wakelock *wl) {} +static inline void wakelocks_gc(void) {} +#endif /* !CONFIG_PM_WAKELOCKS_GC */ + +static struct wakelock *wakelock_lookup_add(const char *name, size_t len, + bool add_if_not_found) +{ + struct rb_node **node = &wakelocks_tree.rb_node; + struct rb_node *parent = *node; + struct wakelock *wl; + + while (*node) { + int diff; + + parent = *node; + wl = rb_entry(*node, struct wakelock, node); + diff = strncmp(name, wl->name, len); + if (diff == 0) { + if (wl->name[len]) + diff = -1; + else + return wl; + } + if (diff < 0) + node = &(*node)->rb_left; + else + node = &(*node)->rb_right; + } + if (!add_if_not_found) + return ERR_PTR(-EINVAL); + + if 
(wakelocks_limit_exceeded()) + return ERR_PTR(-ENOSPC); + + /* Not found, we have to add a new one. */ + wl = kzalloc(sizeof(*wl), GFP_KERNEL); + if (!wl) + return ERR_PTR(-ENOMEM); + + wl->name = kstrndup(name, len, GFP_KERNEL); + if (!wl->name) { + kfree(wl); + return ERR_PTR(-ENOMEM); + } + wl->ws.name = wl->name; + wakeup_source_add(&wl->ws); + rb_link_node(&wl->node, parent, node); + rb_insert_color(&wl->node, &wakelocks_tree); + wakelocks_lru_add(wl); + increment_wakelocks_number(); + return wl; +} + +int pm_wake_lock(const char *buf) +{ + const char *str = buf; + struct wakelock *wl; + u64 timeout_ns = 0; + size_t len; + int ret = 0; + + while (*str && !isspace(*str)) + str++; + + len = str - buf; + if (!len) + return -EINVAL; + + if (*str && *str != '\n') { + /* Find out if there's a valid timeout string appended. */ + ret = kstrtou64(skip_spaces(str), 10, &timeout_ns); + if (ret) + return -EINVAL; + } + + mutex_lock(&wakelocks_lock); + + wl = wakelock_lookup_add(buf, len, true); + if (IS_ERR(wl)) { + ret = PTR_ERR(wl); + goto out; + } + if (timeout_ns) { + u64 timeout_ms = timeout_ns + NSEC_PER_MSEC - 1; + + do_div(timeout_ms, NSEC_PER_MSEC); + __pm_wakeup_event(&wl->ws, timeout_ms); + } else { + __pm_stay_awake(&wl->ws); + } + + wakelocks_lru_most_recent(wl); + + out: + mutex_unlock(&wakelocks_lock); + return ret; +} + +int pm_wake_unlock(const char *buf) +{ + struct wakelock *wl; + size_t len; + int ret = 0; + + len = strlen(buf); + if (!len) + return -EINVAL; + + if (buf[len-1] == '\n') + len--; + + if (!len) + return -EINVAL; + + mutex_lock(&wakelocks_lock); + + wl = wakelock_lookup_add(buf, len, false); + if (IS_ERR(wl)) { + ret = PTR_ERR(wl); + goto out; + } + __pm_relax(&wl->ws); + + wakelocks_lru_most_recent(wl); + wakelocks_gc(); + + out: + mutex_unlock(&wakelocks_lock); + return ret; +} diff --git a/kernel/printk.c b/kernel/printk.c index b663c2c95d39..7a8b101b237a 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -56,6 +56,10 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +#ifdef CONFIG_DEBUG_LL +extern void printascii(char *); +#endif + /* printk's without a loglevel use this.. */ #define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL @@ -293,6 +297,53 @@ static inline void boot_delay_msec(void) } #endif +/* + * Return the number of unread characters in the log buffer. + */ +static int log_buf_get_len(void) +{ + return logged_chars; +} + +/* + * Clears the ring-buffer + */ +void log_buf_clear(void) +{ + logged_chars = 0; +} + +/* + * Copy a range of characters from the log buffer. 
+ */ +int log_buf_copy(char *dest, int idx, int len) +{ + int ret, max; + bool took_lock = false; + + if (!oops_in_progress) { + raw_spin_lock_irq(&logbuf_lock); + took_lock = true; + } + + max = log_buf_get_len(); + if (idx < 0 || idx >= max) { + ret = -1; + } else { + if (len > max - idx) + len = max - idx; + ret = len; + idx += (log_end - max); + while (len-- > 0) + dest[len] = LOG_BUF(idx + len); + } + + if (took_lock) + raw_spin_unlock_irq(&logbuf_lock); + + return ret; +} + #ifdef CONFIG_SECURITY_DMESG_RESTRICT int dmesg_restrict = 1; #else @@ -884,6 +935,10 @@ asmlinkage int vprintk(const char *fmt, va_list args) printed_len += vscnprintf(printk_buf + printed_len, sizeof(printk_buf) - printed_len, fmt, args); +#ifdef CONFIG_DEBUG_LL + printascii(printk_buf); +#endif + p = printk_buf; /* Read log level and handle special printk prefix */ @@ -959,7 +1014,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * Try to acquire and then immediately release the * console semaphore. The release will do all the * actual magic (print out buffers, wake up klogd, - * etc). + * etc). * * The console_trylock_for_printk() function * will release 'logbuf_lock' regardless of whether it @@ -1161,7 +1216,6 @@ static int __cpuinit console_cpu_notify(struct notifier_block *self, switch (action) { case CPU_ONLINE: case CPU_DEAD: - case CPU_DYING: case CPU_DOWN_FAILED: case CPU_UP_CANCELED: console_lock(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 817bf7018834..eb15edd08a23 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2161,6 +2161,32 @@ unsigned long this_cpu_load(void) return this->cpu_load[0]; } +unsigned long avg_nr_running(void) +{ + unsigned long i, sum = 0; + unsigned int seqcnt, ave_nr_running; + + for_each_online_cpu(i) { + struct rq *q = cpu_rq(i); + + /* + * Update the average to avoid reading a stale value if there were + * no run-queue changes for a long time. On the other hand if + * the changes are happening right now, just read current value + * directly. + */ + seqcnt = read_seqcount_begin(&q->ave_seqcnt); + ave_nr_running = do_avg_nr_running(q); + if (read_seqcount_retry(&q->ave_seqcnt, seqcnt)) { + read_seqcount_begin(&q->ave_seqcnt); + ave_nr_running = q->ave_nr_running; + } + + sum += ave_nr_running; + } + + return sum; +} /* * Global load-average calculations @@ -7203,13 +7229,24 @@ static inline int preempt_count_equals(int preempt_offset) return (nested == preempt_offset); } +static int __might_sleep_init_called; +int __init __might_sleep_init(void) +{ + __might_sleep_init_called = 1; + return 0; +} +early_initcall(__might_sleep_init); + void __might_sleep(const char *file, int line, int preempt_offset) { static unsigned long prev_jiffy; /* ratelimiting */ rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. 
*/ if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || - system_state != SYSTEM_RUNNING || oops_in_progress) + oops_in_progress) + return; + if (system_state != SYSTEM_RUNNING && + (!__might_sleep_init_called || system_state != SYSTEM_BOOTING)) return; if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; @@ -7762,6 +7799,23 @@ static void cpu_cgroup_destroy(struct cgroup *cgrp) sched_destroy_group(tg); } +static int +cpu_cgroup_allow_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) +{ + const struct cred *cred = current_cred(), *tcred; + struct task_struct *task; + + cgroup_taskset_for_each(task, cgrp, tset) { + tcred = __task_cred(task); + + if ((current != task) && !capable(CAP_SYS_NICE) && + cred->euid != tcred->uid && cred->euid != tcred->suid) + return -EACCES; + } + + return 0; +} + static int cpu_cgroup_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) { @@ -8123,6 +8177,7 @@ struct cgroup_subsys cpu_cgroup_subsys = { .destroy = cpu_cgroup_destroy, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, + .allow_attach = cpu_cgroup_allow_attach, .exit = cpu_cgroup_exit, .populate = cpu_cgroup_populate, .subsys_id = cpu_cgroup_subsys_id, diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 09acaa15161d..06d172eb5cea 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -266,6 +266,9 @@ static void print_cpu(struct seq_file *m, int cpu) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rq->x)) P(nr_running); + SEQ_printf(m, " .%-30s: %d.%03d \n", "ave_nr_running", + rq->ave_nr_running / FIXED_1, + ((rq->ave_nr_running % FIXED_1) * 1000) / FIXED_1); SEQ_printf(m, " .%-30s: %lu\n", "load", rq->load.weight); P(nr_switches); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 44af55e6d5d0..be427c5bc4d7 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -685,6 +685,7 @@ balanced: * runtime - in which case borrowing doesn't make sense. 
*/ rt_rq->rt_runtime = RUNTIME_INF; + rt_rq->rt_throttled = 0; raw_spin_unlock(&rt_rq->rt_runtime_lock); raw_spin_unlock(&rt_b->rt_runtime_lock); } @@ -1983,6 +1984,8 @@ static void watchdog(struct rq *rq, struct task_struct *p) static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) { + struct sched_rt_entity *rt_se = &p->rt; + update_curr_rt(rq); watchdog(rq, p); @@ -2000,12 +2003,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) p->rt.time_slice = RR_TIMESLICE; /* - * Requeue to the end of queue if we are not the only element - * on the queue: + * Requeue to the end of queue if we (and all of our ancestors) are not + * the only element on the queue */ - if (p->rt.run_list.prev != p->rt.run_list.next) { - requeue_task_rt(rq, p, 0); - set_tsk_need_resched(p); + for_each_sched_rt_entity(rt_se) { + if (rt_se->run_list.prev != rt_se->run_list.next) { + requeue_task_rt(rq, p, 0); + set_tsk_need_resched(p); + return; + } } } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 116ced06ecc0..ef5a1ff65196 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -363,6 +363,11 @@ struct rq { #endif int skip_clock_update; + /* time-based average load */ + u64 nr_last_stamp; + unsigned int ave_nr_running; + seqcount_t ave_seqcnt; + /* capture load from *all* tasks on this cpu: */ struct load_weight load; unsigned long nr_load_updates; @@ -914,14 +919,49 @@ extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #endif +/* 27 ~= 134217728ns = 134.2ms + * 26 ~= 67108864ns = 67.1ms + * 25 ~= 33554432ns = 33.5ms + * 24 ~= 16777216ns = 16.8ms + */ +#define NR_AVE_PERIOD_EXP 27 +#define NR_AVE_SCALE(x) ((x) << FSHIFT) +#define NR_AVE_PERIOD (1 << NR_AVE_PERIOD_EXP) +#define NR_AVE_DIV_PERIOD(x) ((x) >> NR_AVE_PERIOD_EXP) + +static inline unsigned int do_avg_nr_running(struct rq *rq) +{ + s64 nr, deltax; + unsigned int ave_nr_running = rq->ave_nr_running; + + deltax = rq->clock_task - rq->nr_last_stamp; + nr = NR_AVE_SCALE(rq->nr_running); + + if (deltax > NR_AVE_PERIOD) + ave_nr_running = nr; + else + ave_nr_running += + NR_AVE_DIV_PERIOD(deltax * (nr - ave_nr_running)); + + return ave_nr_running; +} + static inline void inc_nr_running(struct rq *rq) { + write_seqcount_begin(&rq->ave_seqcnt); + rq->ave_nr_running = do_avg_nr_running(rq); + rq->nr_last_stamp = rq->clock_task; rq->nr_running++; + write_seqcount_end(&rq->ave_seqcnt); } static inline void dec_nr_running(struct rq *rq) { + write_seqcount_begin(&rq->ave_seqcnt); + rq->ave_nr_running = do_avg_nr_running(rq); + rq->nr_last_stamp = rq->clock_task; rq->nr_running--; + write_seqcount_end(&rq->ave_seqcnt); } extern void update_rq_clock(struct rq *rq); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4ab11879aeb4..49f472582722 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -102,6 +102,7 @@ extern char core_pattern[]; extern unsigned int core_pipe_limit; extern int pid_max; extern int min_free_kbytes; +extern int min_free_order_shift; extern int pid_max_min, pid_max_max; extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; @@ -1199,6 +1200,13 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, }, { + .procname = "min_free_order_shift", + .data = &min_free_order_shift, + .maxlen = sizeof(min_free_order_shift), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { .procname = "percpu_pagelist_fraction", .data = &percpu_pagelist_fraction, .maxlen = 
sizeof(percpu_pagelist_fraction), diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 8a538c55fc7b..0c079010527f 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -46,6 +46,8 @@ static struct alarm_base { static ktime_t freezer_delta; static DEFINE_SPINLOCK(freezer_delta_lock); +static struct wakeup_source *ws; + #ifdef CONFIG_RTC_CLASS /* rtc timer and device for setting alarm wakeups at suspend */ static struct rtc_timer rtctimer; @@ -59,7 +61,7 @@ static DEFINE_SPINLOCK(rtcdev_lock); * If one has not already been chosen, it checks to see if a * functional rtc device is available. */ -static struct rtc_device *alarmtimer_get_rtcdev(void) +struct rtc_device *alarmtimer_get_rtcdev(void) { unsigned long flags; struct rtc_device *ret; @@ -115,10 +117,6 @@ static void alarmtimer_rtc_interface_remove(void) class_interface_unregister(&alarmtimer_rtc_interface); } #else -static inline struct rtc_device *alarmtimer_get_rtcdev(void) -{ - return NULL; -} #define rtcdev (NULL) static inline int alarmtimer_rtc_interface_setup(void) { return 0; } static inline void alarmtimer_rtc_interface_remove(void) { } @@ -250,6 +248,7 @@ static int alarmtimer_suspend(struct device *dev) unsigned long flags; struct rtc_device *rtc; int i; + int ret; spin_lock_irqsave(&freezer_delta_lock, flags); min = freezer_delta; @@ -279,8 +278,10 @@ static int alarmtimer_suspend(struct device *dev) if (min.tv64 == 0) return 0; - /* XXX - Should we enforce a minimum sleep time? */ - WARN_ON(min.tv64 < NSEC_PER_SEC); + if (ktime_to_ns(min) < 2 * NSEC_PER_SEC) { + __pm_wakeup_event(ws, 2 * MSEC_PER_SEC); + return -EBUSY; + } /* Setup an rtc timer to fire that far in the future */ rtc_timer_cancel(rtc, &rtctimer); @@ -288,9 +289,11 @@ static int alarmtimer_suspend(struct device *dev) now = rtc_tm_to_ktime(tm); now = ktime_add(now, min); - rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); - - return 0; + /* Set the alarm; if it is already in the past, briefly reject suspend so it can be handled */ + ret = rtc_timer_start(rtc, &rtctimer, now, ktime_set(0, 0)); + if (ret < 0) + __pm_wakeup_event(ws, 1 * MSEC_PER_SEC); + return ret; } #else static int alarmtimer_suspend(struct device *dev) @@ -821,6 +824,7 @@ static int __init alarmtimer_init(void) error = PTR_ERR(pdev); goto out_drv; } + ws = wakeup_source_register("alarmtimer"); return 0; out_drv: diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7c50de83b6fd..f3a4dd993ca0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1323,8 +1323,6 @@ ktime_t ktime_get_monotonic_offset(void) return timespec_to_ktime(wtom); } -EXPORT_SYMBOL_GPL(ktime_get_monotonic_offset); - /** * xtime_update() - advances the timekeeping infrastructure diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index a1d2849f2473..e2a3f7207cca 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -487,6 +487,39 @@ config RING_BUFFER_BENCHMARK If unsure, say N. +config TRACELEVEL + bool "Add capability to prioritize traces" + depends on EVENT_TRACING + help + This option allows subsystem programmers to add priorities to trace + events by calling tracelevel_register. Traces of high priority + will automatically be enabled on kernel boot, and users can change + the trace level via a kernel parameter.
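A minimal sketch of the registration call described above (hypothetical driver code, not part of this patch; it assumes <linux/tracelevel.h> exposes __tracelevel_register() as defined in kernel/trace/tracelevel.c at the end of this series, and that "mydrv_irq" names an existing trace event):

#include <linux/init.h>
#include <linux/tracelevel.h>	/* assumed location of __tracelevel_register() */

#define MYDRV_TRACE_PRIO	2	/* assumed priority, in the range 0..TRACELEVEL_MAX */

static int __init mydrv_trace_init(void)
{
	/*
	 * The event stays enabled whenever its priority is greater than
	 * or equal to the global level, which users adjust through the
	 * tracelevel "level" module parameter.
	 */
	return __tracelevel_register("mydrv_irq", MYDRV_TRACE_PRIO);
}
late_initcall(mydrv_trace_init);

Note that __tracelevel_register() stores the name pointer rather than copying the string, so the name must outlive the registration; a string literal, as here, is safe.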
+ +config TRACEDUMP + bool "Dumping functionality for ftrace" + depends on FUNCTION_TRACER + help + This option adds functionality to dump tracing data in several forms. + Data can be dumped in ascii form or as raw pages from the tracing + ring buffers, along with the saved cmdlines. This is specified by + the module parameter format_ascii. Data will be compressed + using zlib. + +config TRACEDUMP_PANIC + bool "Tracedump to console on panic" + depends on TRACEDUMP + help + With this option, tracedump will automatically dump to the console + on a kernel panic. + +config TRACEDUMP_PROCFS + bool "Tracedump via proc file" + depends on TRACEDUMP + help + With this option, tracedump can be dumped from user space by reading + from /proc/tracedump. + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 5f39a07fe5ea..5eca10525b5b 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -61,5 +61,7 @@ endif ifeq ($(CONFIG_TRACING),y) obj-$(CONFIG_KGDB_KDB) += trace_kdb.o endif +obj-$(CONFIG_TRACELEVEL) += tracelevel.o +obj-$(CONFIG_TRACEDUMP) += tracedump.o libftrace-y := ftrace.o diff --git a/kernel/trace/tracedump.c b/kernel/trace/tracedump.c new file mode 100644 index 000000000000..a83532bc36da --- /dev/null +++ b/kernel/trace/tracedump.c @@ -0,0 +1,682 @@ +/* + * kernel/trace/tracedump.c + * + * Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include <linux/console.h> +#include <linux/cpumask.h> +#include <linux/init.h> +#include <linux/irqflags.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/mutex.h> +#include <linux/notifier.h> +#include <linux/proc_fs.h> +#include <linux/ring_buffer.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/string.h> +#include <linux/threads.h> +#include <linux/tracedump.h> +#include <linux/uaccess.h> +#include <linux/vmalloc.h> +#include <linux/zlib.h> + +#include "trace.h" +#include "trace_output.h" + +#define CPU_MAX (NR_CPUS-1) + +#define TRYM(fn, ...) 
do { \ + int try_error = (fn); \ + if (try_error < 0) { \ + printk(__VA_ARGS__); \ + return try_error; \ + } \ +} while (0) + +#define TRY(fn) TRYM(fn, TAG "Caught error from %s in %s\n", #fn, __func__) + +/* Stolen from printk.c */ +#define for_each_console(con) \ + for (con = console_drivers; con != NULL; con = con->next) + +#define TAG KERN_ERR "tracedump: " + +#define TD_MIN_CONSUME 2000 +#define TD_COMPRESS_CHUNK 0x8000 + +static DEFINE_MUTEX(tracedump_proc_lock); + +static const char MAGIC_NUMBER[9] = "TRACEDUMP"; +static const char CPU_DELIM[7] = "CPU_END"; +#define CMDLINE_DELIM "|" + +/* Type of output */ +static bool current_format; +static bool format_ascii; +module_param(format_ascii, bool, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(format_ascii, "Dump ascii or raw data"); + +/* Max size of output */ +static uint panic_size = 0x80000; +module_param(panic_size, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(panic_size, "Max dump size during kernel panic (bytes)"); + +static uint compress_level = 9; +module_param(compress_level, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(compress_level, "Level of compression to use. [0-9]"); + +static char out_buf[TD_COMPRESS_CHUNK]; +static z_stream stream; +static int compress_done; +static int flush; + +static int old_trace_flags; + +static struct trace_iterator iter; +static struct pager_s { + struct trace_array *tr; + void *spare; + int cpu; + int len; + char __user *ubuf; +} pager; + +static char cmdline_buf[16+TASK_COMM_LEN]; + +static int print_to_console(const char *buf, size_t len) +{ + struct console *con; + + /* Stolen from printk.c */ + for_each_console(con) { + if ((con->flags & CON_ENABLED) && con->write && + (cpu_online(smp_processor_id()) || + (con->flags & CON_ANYTIME))) + con->write(con, buf, len); + } + return 0; +} + +static int print_to_user(const char *buf, size_t len) +{ + int size; + size = copy_to_user(pager.ubuf, buf, len); + if (size > 0) { + printk(TAG "Failed to copy to user %d bytes\n", size); + return -EINVAL; + } + return 0; +} + +static int print(const char *buf, size_t len, int print_to) +{ + if (print_to == TD_PRINT_CONSOLE) + TRY(print_to_console(buf, len)); + else if (print_to == TD_PRINT_USER) + TRY(print_to_user(buf, len)); + return 0; +} + +/* print_magic will print MAGIC_NUMBER using the + * print function selected by print_to. + */ +static inline ssize_t print_magic(int print_to) +{ + print(MAGIC_NUMBER, sizeof(MAGIC_NUMBER), print_to); + return sizeof(MAGIC_NUMBER); +} + +static int iter_init(void) +{ + int cpu; + + /* Make iter point to global ring buffer used in trace. */ + trace_init_global_iter(&iter); + + /* Disable tracing */ + for_each_tracing_cpu(cpu) { + atomic_inc(&iter.tr->data[cpu]->disabled); + } + + /* Save flags */ + old_trace_flags = trace_flags; + + /* Don't look at memory in panic mode. */ + trace_flags &= ~TRACE_ITER_SYM_USEROBJ; + + /* Prepare ring buffer iter */ + for_each_tracing_cpu(cpu) { + iter.buffer_iter[cpu] = + ring_buffer_read_prepare(iter.tr->buffer, cpu); + } + ring_buffer_read_prepare_sync(); + for_each_tracing_cpu(cpu) { + ring_buffer_read_start(iter.buffer_iter[cpu]); + tracing_iter_reset(&iter, cpu); + } + return 0; +} + +/* iter_next gets the next entry in the ring buffer, ordered by time. + * If there are no more entries, returns 0. 
+ */ +static ssize_t iter_next(void) +{ + /* Zero out the iterator's seq */ + memset(&iter.seq, 0, + sizeof(struct trace_iterator) - + offsetof(struct trace_iterator, seq)); + + while (!trace_empty(&iter)) { + if (trace_find_next_entry_inc(&iter) == NULL) { + printk(TAG "trace_find_next_entry failed!\n"); + return -EINVAL; + } + + /* Copy the ring buffer data to iterator's seq */ + print_trace_line(&iter); + if (iter.seq.len != 0) + return iter.seq.len; + } + return 0; +} + +static int iter_deinit(void) +{ + int cpu; + /* Enable tracing */ + for_each_tracing_cpu(cpu) { + ring_buffer_read_finish(iter.buffer_iter[cpu]); + } + for_each_tracing_cpu(cpu) { + atomic_dec(&iter.tr->data[cpu]->disabled); + } + + /* Restore flags */ + trace_flags = old_trace_flags; + return 0; +} + +static int pager_init(void) +{ + int cpu; + + /* Need to do this to get a pointer to global_trace (iter.tr). + Lame, I know. */ + trace_init_global_iter(&iter); + + /* Turn off tracing */ + for_each_tracing_cpu(cpu) { + atomic_inc(&iter.tr->data[cpu]->disabled); + } + + memset(&pager, 0, sizeof(pager)); + pager.tr = iter.tr; + pager.len = TD_COMPRESS_CHUNK; + + return 0; +} + +/* pager_next_cpu moves the pager to the next cpu. + * Returns 0 if pager is done, else 1. + */ +static ssize_t pager_next_cpu(void) +{ + if (pager.cpu <= CPU_MAX) { + pager.cpu += 1; + return 1; + } + + return 0; +} + +/* pager_next gets the next page of data from the ring buffer + * of the current cpu. Returns page size or 0 if no more data. + */ +static ssize_t pager_next(void) +{ + int ret; + + if (pager.cpu > CPU_MAX) + return 0; + + if (!pager.spare) + pager.spare = ring_buffer_alloc_read_page(pager.tr->buffer, pager.cpu); + if (!pager.spare) { + printk(TAG "ring_buffer_alloc_read_page failed!"); + return -ENOMEM; + } + + ret = ring_buffer_read_page(pager.tr->buffer, + &pager.spare, + pager.len, + pager.cpu, 0); + if (ret < 0) + return 0; + + return PAGE_SIZE; +} + +static int pager_deinit(void) +{ + int cpu; + if (pager.spare != NULL) + ring_buffer_free_read_page(pager.tr->buffer, pager.spare); + + for_each_tracing_cpu(cpu) { + atomic_dec(&iter.tr->data[cpu]->disabled); + } + return 0; +} + +/* cmdline_next gets the next saved cmdline from the trace and + * puts it in cmdline_buf. Returns the size of the cmdline, or 0 if empty, + * but it will reset itself on a subsequent call. + */ +static ssize_t cmdline_next(void) +{ + static int pid; + ssize_t size = 0; + + if (pid >= PID_MAX_DEFAULT) + pid = -1; + + while (size == 0 && pid < PID_MAX_DEFAULT) { + pid++; + trace_find_cmdline(pid, cmdline_buf); + if (!strncmp(cmdline_buf, "<...>", 5)) + continue; + + sprintf(&cmdline_buf[strlen(cmdline_buf)], " %d" + CMDLINE_DELIM, pid); + size = strlen(cmdline_buf); + } + return size; +} + +/* consume_events removes the first 'num' entries from the ring buffer. */ +static int consume_events(size_t num) +{ + TRY(iter_init()); + for (; num > 0 && !trace_empty(&iter); num--) { + trace_find_next_entry_inc(&iter); + ring_buffer_consume(iter.tr->buffer, iter.cpu, &iter.ts, + &iter.lost_events); + } + TRY(iter_deinit()); + return 0; +} + +static int data_init(void) +{ + if (current_format) + TRY(iter_init()); + else + TRY(pager_init()); + return 0; +} + +/* data_next will figure out the right 'next' function to + * call and will select the right buffer to pass back + * to compress_next. + * + * iter_next should be used to get data entry-by-entry, ordered + * by time, which is what we need in order to convert it to ascii. 
+ * + * pager_next will return a full page of raw data at a time, one + * CPU at a time. pager_next_cpu must be called to get the next CPU. + * cmdline_next will get the next saved cmdline + */ +static ssize_t data_next(const char **buf) +{ + ssize_t size; + + if (current_format) { + TRY(size = iter_next()); + *buf = iter.seq.buffer; + } else { + TRY(size = pager_next()); + *buf = pager.spare; + if (size == 0) { + if (pager_next_cpu()) { + size = sizeof(CPU_DELIM); + *buf = CPU_DELIM; + } else { + TRY(size = cmdline_next()); + *buf = cmdline_buf; + } + } + } + return size; +} + +static int data_deinit(void) +{ + if (current_format) + TRY(iter_deinit()); + else + TRY(pager_deinit()); + return 0; +} + +static int compress_init(void) +{ + int workspacesize, ret; + + compress_done = 0; + flush = Z_NO_FLUSH; + stream.data_type = current_format ? Z_ASCII : Z_BINARY; + workspacesize = zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL); + stream.workspace = vmalloc(workspacesize); + if (!stream.workspace) { + printk(TAG "Could not allocate " + "enough memory for zlib!\n"); + return -ENOMEM; + } + memset(stream.workspace, 0, workspacesize); + + ret = zlib_deflateInit(&stream, compress_level); + if (ret != Z_OK) { + printk(TAG "%s\n", stream.msg); + return ret; + } + stream.avail_in = 0; + stream.avail_out = 0; + TRY(data_init()); + return 0; +} + +/* compress_next will compress up to min(max_out, TD_COMPRESS_CHUNK) bytes + * of data into the output buffer. It gets the data by calling data_next. + * It will return the most data it possibly can. If it returns 0, then + * there is no more data. + * + * By the way that zlib works, each call to zlib_deflate will possibly + * consume up to avail_in bytes from next_in, and will fill up to + * avail_out bytes in next_out. Once flush == Z_FINISH, it can not take + * any more input. It will output until it is finished, and will return + * Z_STREAM_END. + */ +static ssize_t compress_next(size_t max_out) +{ + ssize_t ret; + max_out = min(max_out, (size_t)TD_COMPRESS_CHUNK); + stream.next_out = out_buf; + stream.avail_out = max_out; + while (stream.avail_out > 0 && !compress_done) { + if (stream.avail_in == 0 && flush != Z_FINISH) { + TRY(stream.avail_in = + data_next((const char **)&stream.next_in)); + flush = (stream.avail_in == 0) ? Z_FINISH : Z_NO_FLUSH; + } + if (stream.next_in != NULL) { + TRYM((ret = zlib_deflate(&stream, flush)), + "zlib: %s\n", stream.msg); + compress_done = (ret == Z_STREAM_END); + } + } + ret = max_out - stream.avail_out; + return ret; +} + +static int compress_deinit(void) +{ + TRY(data_deinit()); + + zlib_deflateEnd(&stream); + vfree(stream.workspace); + + /* TODO: remove */ + printk(TAG "Total in: %ld\n", stream.total_in); + printk(TAG "Total out: %ld\n", stream.total_out); + return stream.total_out; +} + +static int compress_reset(void) +{ + TRY(compress_deinit()); + TRY(compress_init()); + return 0; +} + +/* tracedump_init initializes all tracedump components. + * Call this before tracedump_next + */ +int tracedump_init(void) +{ + TRY(compress_init()); + return 0; +} + +/* tracedump_next will print up to max_out data from the tracing ring + * buffers using the print function selected by print_to. The data is + * compressed using zlib. + * + * The output type of the data is specified by the format_ascii module + * parameter. If format_ascii == 1, human-readable data will be output. + * Otherwise, it will output raw data from the ring buffer in cpu order, + * followed by the saved_cmdlines data. 
+ */ +ssize_t tracedump_next(size_t max_out, int print_to) +{ + ssize_t size; + TRY(size = compress_next(max_out)); + print(out_buf, size, print_to); + return size; +} + +/* tracedump_all will print all data in the tracing ring buffers using + * the print function selected by print_to. The data is compressed using + * zlib, and is surrounded by MAGIC_NUMBER. + * + * The output type of the data is specified by the format_ascii module + * parameter. If format_ascii == 1, human-readable data will be output. + * Otherwise, it will output raw data from the ring buffer in cpu order, + * followed by the saved_cmdlines data. + */ +ssize_t tracedump_all(int print_to) +{ + ssize_t ret, size = 0; + TRY(size += print_magic(print_to)); + + do { + /* Here the size used doesn't really matter, + * since we're dumping everything. */ + TRY(ret = tracedump_next(0xFFFFFFFF, print_to)); + size += ret; + } while (ret > 0); + + TRY(size += print_magic(print_to)); + + return size; +} + +/* tracedump_deinit deinitializes all tracedump components. + * This must be called, even on error. + */ +int tracedump_deinit(void) +{ + TRY(compress_deinit()); + return 0; +} + +/* tracedump_reset reinitializes all tracedump components. */ +int tracedump_reset(void) +{ + TRY(compress_reset()); + return 0; +} + + + +/* tracedump_open opens the tracedump file for reading. */ +static int tracedump_open(struct inode *inode, struct file *file) +{ + int ret; + mutex_lock(&tracedump_proc_lock); + current_format = format_ascii; + ret = tracedump_init(); + if (ret < 0) + goto err; + + ret = nonseekable_open(inode, file); + if (ret < 0) + goto err; + return ret; + +err: + mutex_unlock(&tracedump_proc_lock); + return ret; +} + +/* tracedump_read reads data from tracedump_next and prints + * it to userspace. It will surround the data with MAGIC_NUMBER. + */ +static ssize_t tracedump_read(struct file *file, char __user *buf, + size_t len, loff_t *offset) +{ + static int done; + ssize_t size = 0; + + pager.ubuf = buf; + + if (*offset == 0) { + done = 0; + TRY(size = print_magic(TD_PRINT_USER)); + } else if (!done) { + TRY(size = tracedump_next(len, TD_PRINT_USER)); + if (size == 0) { + TRY(size = print_magic(TD_PRINT_USER)); + done = 1; + } + } + + *offset += size; + + return size; +} + +static int tracedump_release(struct inode *inode, struct file *file) +{ + int ret; + ret = tracedump_deinit(); + mutex_unlock(&tracedump_proc_lock); + return ret; +} + +/* tracedump_dump dumps all tracing data from the tracing ring buffers + * to all consoles. For details about the output format, see + * tracedump_all. + * + * At most max_out bytes are dumped. To accomplish this, + * tracedump_dump calls tracedump_all several times without writing the data, + * each time tossing out old data until it reaches its goal. + * + * Note: dumping raw pages currently does NOT follow the size limit. 
+ */ + +int tracedump_dump(size_t max_out) +{ + ssize_t size; + size_t consume; + + printk(TAG "\n"); + + tracedump_init(); + + if (format_ascii) { + size = tracedump_all(TD_NO_PRINT); + if (size < 0) { + printk(TAG "failed to dump\n"); + goto out; + } + while (size > max_out) { + TRY(tracedump_deinit()); + /* Events take more or less 60 ascii bytes each, + not counting compression */ + consume = TD_MIN_CONSUME + (size - max_out) / + (60 / (compress_level + 1)); + TRY(consume_events(consume)); + TRY(tracedump_init()); + size = tracedump_all(TD_NO_PRINT); + if (size < 0) { + printk(TAG "failed to dump\n"); + goto out; + } + } + + TRY(tracedump_reset()); + } + size = tracedump_all(TD_PRINT_CONSOLE); + if (size < 0) { + printk(TAG "failed to dump\n"); + goto out; + } + +out: + tracedump_deinit(); + printk(KERN_INFO "\n" TAG " end\n"); + return size; +} + +static const struct file_operations tracedump_fops = { + .owner = THIS_MODULE, + .open = tracedump_open, + .read = tracedump_read, + .release = tracedump_release, +}; + +#ifdef CONFIG_TRACEDUMP_PANIC +static int tracedump_panic_handler(struct notifier_block *this, + unsigned long event, void *unused) +{ + tracedump_dump(panic_size); + return 0; +} + +static struct notifier_block tracedump_panic_notifier = { + .notifier_call = tracedump_panic_handler, + .next = NULL, + .priority = 150 /* priority: INT_MAX >= x >= 0 */ +}; +#endif + +static int __init tracedump_initcall(void) +{ +#ifdef CONFIG_TRACEDUMP_PROCFS + struct proc_dir_entry *entry; + + /* Create a procfs file for easy dumping */ + entry = create_proc_entry("tracedump", S_IFREG | S_IRUGO, NULL); + if (!entry) + printk(TAG "failed to create proc entry\n"); + else + entry->proc_fops = &tracedump_fops; +#endif + +#ifdef CONFIG_TRACEDUMP_PANIC + /* Automatically dump to console on a kernel panic */ + atomic_notifier_chain_register(&panic_notifier_list, + &tracedump_panic_notifier); +#endif + return 0; +} + +early_initcall(tracedump_initcall); diff --git a/kernel/trace/tracelevel.c b/kernel/trace/tracelevel.c new file mode 100644 index 000000000000..9f8b8eedbb58 --- /dev/null +++ b/kernel/trace/tracelevel.c @@ -0,0 +1,142 @@ +/* + * kernel/trace/tracelevel.c + * + * Copyright (c) 2011, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include <linux/ftrace_event.h> +#include <linux/list.h> +#include <linux/moduleparam.h> +#include <linux/mutex.h> +#include <linux/tracelevel.h> +#include <linux/vmalloc.h> + +#include "trace.h" + +#define TAG KERN_ERR "tracelevel: " + +struct tracelevel_record { + struct list_head list; + char *name; + int level; +}; + +static LIST_HEAD(tracelevel_list); + +static bool started; +static unsigned int tracelevel_level = TRACELEVEL_DEFAULT; + +static DEFINE_MUTEX(tracelevel_record_lock); + +/* tracelevel_set_event sets a single event if set = 1, or + * clears an event if set = 0. 
*/ +static int tracelevel_set_event(struct tracelevel_record *evt, bool set) +{ + if (trace_set_clr_event(NULL, evt->name, set) < 0) { + printk(TAG "failed to set event %s\n", evt->name); + return -EINVAL; + } + return 0; +} + +/* Registers an event. If possible, it also sets it. + * If not, we'll set it in tracelevel_init. + */ +int __tracelevel_register(char *name, unsigned int level) +{ + struct tracelevel_record *evt = (struct tracelevel_record *) + vmalloc(sizeof(struct tracelevel_record)); + if (!evt) { + printk(TAG "failed to allocate tracelevel_record for %s\n", + name); + return -ENOMEM; + } + + evt->name = name; + evt->level = level; + + mutex_lock(&tracelevel_record_lock); + list_add(&evt->list, &tracelevel_list); + mutex_unlock(&tracelevel_record_lock); + + if (level >= tracelevel_level && started) + tracelevel_set_event(evt, 1); + return 0; +} + +/* tracelevel_set_level sets the global level, clears events + * lower than that level, and enables events greater or equal. + */ +int tracelevel_set_level(int level) +{ + struct tracelevel_record *evt = NULL; + + if (level < 0 || level > TRACELEVEL_MAX) + return -EINVAL; + tracelevel_level = level; + + mutex_lock(&tracelevel_record_lock); + list_for_each_entry(evt, &tracelevel_list, list) { + if (evt->level >= level) + tracelevel_set_event(evt, 1); + else + tracelevel_set_event(evt, 0); + } + mutex_unlock(&tracelevel_record_lock); + return 0; +} + +static int param_set_level(const char *val, const struct kernel_param *kp) +{ + long level; + int ret; + ret = strict_strtol(val, 0, &level); + if (ret < 0) + return ret; + return tracelevel_set_level(level); +} + +static int param_get_level(char *buffer, const struct kernel_param *kp) +{ + return param_get_int(buffer, kp); +} + +static struct kernel_param_ops tracelevel_level_ops = { + .set = param_set_level, + .get = param_get_level +}; + +module_param_cb(level, &tracelevel_level_ops, &tracelevel_level, 0644); + +/* Turn on the tracing that has been registered thus far. */ +static int __init tracelevel_init(void) +{ + int ret; + started = true; + + /* Ring buffer is initialized to 1 page until the user sets a tracer. + * Since we're doing this manually, we need to ask for the expanded buffer. + */ + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + return tracelevel_set_level(tracelevel_level); +} + +/* Tracing mechanism is set up during fs_initcall. */ +fs_initcall_sync(tracelevel_init);
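To round out the wakelock interface from kernel/power/wakelock.c above, a usage sketch from user space (hypothetical, not part of this patch; it assumes the usual /sys/power/wake_lock and /sys/power/wake_unlock files that forward writes to pm_wake_lock() and pm_wake_unlock() are wired up elsewhere in the series). pm_wake_lock() parses a source name followed by an optional timeout in nanoseconds:

/* Hypothetical user-space demo of the wakelock write interface. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_str(const char *path, const char *s)
{
	int fd = open(path, O_WRONLY);
	ssize_t n;

	if (fd < 0)
		return -1;
	n = write(fd, s, strlen(s));
	close(fd);
	return n < 0 ? -1 : 0;
}

int main(void)
{
	/*
	 * Hold "mylock" for at most 5 s; pm_wake_lock() rounds the
	 * nanosecond timeout up to milliseconds for __pm_wakeup_event().
	 */
	if (write_str("/sys/power/wake_lock", "mylock 5000000000"))
		perror("wake_lock");

	/* ... work that must not race with suspend ... */

	/* Release early; unlocking a name that was never locked fails with EINVAL. */
	if (write_str("/sys/power/wake_unlock", "mylock"))
		perror("wake_unlock");
	return 0;
}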