summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorAlex Gonzalez <alex.gonzalez@digi.com>2012-01-19 13:54:23 +0100
committerAlex Gonzalez <alex.gonzalez@digi.com>2012-01-19 13:54:23 +0100
commit802699c91a967767fc94759f7a3e5e82d8269245 (patch)
treec8b714dd25edd333efbbf8bb1eb6c3d379084cc4 /kernel
parentf135e68daa6745fd3dbb285e6161ae2758c4027f (diff)
parent675f7660ffb0e1880011f6b3c4f9ac241491e3cd (diff)
Merge commit 'v2.6.35.14' into del-5.8/main
Conflicts: arch/arm/plat-mxc/include/mach/gpio.h arch/x86/kernel/cpu/mtrr/main.c drivers/mmc/core/core.c drivers/net/smsc911x.c fs/proc/task_mmu.c include/linux/pm_runtime.h mm/memory.c mm/mlock.c Signed-off-by: Alex Gonzalez <alex.gonzalez@digi.com>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/compat.c21
-rw-r--r--kernel/cpuset.c7
-rw-r--r--kernel/cred.c16
-rw-r--r--kernel/exit.c22
-rw-r--r--kernel/fork.c7
-rw-r--r--kernel/futex.c35
-rw-r--r--kernel/gcov/fs.c244
-rw-r--r--kernel/groups.c5
-rw-r--r--kernel/hrtimer.c13
-rw-r--r--kernel/hw_breakpoint.c3
-rw-r--r--kernel/irq/manage.c15
-rw-r--r--kernel/irq/pm.c3
-rw-r--r--kernel/irq/proc.c2
-rw-r--r--kernel/latencytop.c17
-rw-r--r--kernel/lockdep.c2
-rw-r--r--kernel/perf_event.c46
-rw-r--r--kernel/pid.c5
-rw-r--r--kernel/power/hibernate.c22
-rw-r--r--kernel/power/main.c1
-rw-r--r--kernel/power/power.h9
-rw-r--r--kernel/power/snapshot.c109
-rw-r--r--kernel/power/suspend.c5
-rw-r--r--kernel/power/user.c8
-rw-r--r--kernel/printk.c4
-rw-r--r--kernel/ptrace.c2
-rw-r--r--kernel/sched.c299
-rw-r--r--kernel/sched_fair.c137
-rw-r--r--kernel/sched_features.h5
-rw-r--r--kernel/sched_rt.c34
-rw-r--r--kernel/signal.c25
-rw-r--r--kernel/smp.c67
-rw-r--r--kernel/softirq.c64
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/sys.c2
-rw-r--r--kernel/taskstats.c15
-rw-r--r--kernel/time/clocksource.c26
-rw-r--r--kernel/time/tick-broadcast.c22
-rw-r--r--kernel/time/tick-common.c6
-rw-r--r--kernel/time/tick-internal.h3
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/time/timekeeping.c23
-rw-r--r--kernel/timer.c8
-rw-r--r--kernel/trace/ftrace.c83
-rw-r--r--kernel/trace/ring_buffer.c5
-rw-r--r--kernel/trace/trace.c10
-rw-r--r--kernel/trace/trace.h1
-rw-r--r--kernel/trace/trace_events.c113
-rw-r--r--kernel/trace/trace_events_filter.c6
-rw-r--r--kernel/trace/trace_functions_graph.c10
-rw-r--r--kernel/user.c2
-rw-r--r--kernel/user_namespace.c44
51 files changed, 1298 insertions, 339 deletions
diff --git a/kernel/compat.c b/kernel/compat.c
index 5adab05a3172..91b33fbcb427 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -1137,3 +1137,24 @@ compat_sys_sysinfo(struct compat_sysinfo __user *info)
return 0;
}
+
+/*
+ * Allocate user-space memory for the duration of a single system call,
+ * in order to marshall parameters inside a compat thunk.
+ */
+void __user *compat_alloc_user_space(unsigned long len)
+{
+ void __user *ptr;
+
+ /* If len would occupy more than half of the entire compat space... */
+ if (unlikely(len > (((compat_uptr_t)~0) >> 1)))
+ return NULL;
+
+ ptr = arch_compat_alloc_user_space(len);
+
+ if (unlikely(!access_ok(VERIFY_WRITE, ptr, len)))
+ return NULL;
+
+ return ptr;
+}
+EXPORT_SYMBOL_GPL(compat_alloc_user_space);
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 02b9611eadde..8c781d59f2e6 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1576,8 +1576,10 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
return -ENODEV;
trialcs = alloc_trial_cpuset(cs);
- if (!trialcs)
- return -ENOMEM;
+ if (!trialcs) {
+ retval = -ENOMEM;
+ goto out;
+ }
switch (cft->private) {
case FILE_CPULIST:
@@ -1592,6 +1594,7 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
}
free_trial_cpuset(trialcs);
+out:
cgroup_unlock();
return retval;
}
diff --git a/kernel/cred.c b/kernel/cred.c
index 60bc8b1e32e6..a8d12feda053 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -256,13 +256,13 @@ struct cred *cred_alloc_blank(void)
#endif
atomic_set(&new->usage, 1);
+#ifdef CONFIG_DEBUG_CREDENTIALS
+ new->magic = CRED_MAGIC;
+#endif
if (security_cred_alloc_blank(new, GFP_KERNEL) < 0)
goto error;
-#ifdef CONFIG_DEBUG_CREDENTIALS
- new->magic = CRED_MAGIC;
-#endif
return new;
error:
@@ -663,6 +663,8 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
validate_creds(old);
*new = *old;
+ atomic_set(&new->usage, 1);
+ set_cred_subscribers(new, 0);
get_uid(new->user);
get_group_info(new->group_info);
@@ -680,8 +682,6 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
if (security_prepare_creds(new, old, GFP_KERNEL) < 0)
goto error;
- atomic_set(&new->usage, 1);
- set_cred_subscribers(new, 0);
put_cred(old);
validate_creds(new);
return new;
@@ -754,7 +754,11 @@ bool creds_are_invalid(const struct cred *cred)
if (cred->magic != CRED_MAGIC)
return true;
#ifdef CONFIG_SECURITY_SELINUX
- if (selinux_is_enabled()) {
+ /*
+ * cred->security == NULL if security_cred_alloc_blank() or
+ * security_prepare_creds() returned an error.
+ */
+ if (selinux_is_enabled() && cred->security) {
if ((unsigned long) cred->security < PAGE_SIZE)
return true;
if ((*(u32 *)cred->security & 0xffffff00) ==
diff --git a/kernel/exit.c b/kernel/exit.c
index ceffc67b564a..41d44c36816f 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -95,6 +95,14 @@ static void __exit_signal(struct task_struct *tsk)
sig->tty = NULL;
} else {
/*
+ * This can only happen if the caller is de_thread().
+ * FIXME: this is the temporary hack, we should teach
+ * posix-cpu-timers to handle this case correctly.
+ */
+ if (unlikely(has_group_leader_pid(tsk)))
+ posix_cpu_timers_exit_group(tsk);
+
+ /*
* If there is any task waiting for the group exit
* then notify it:
*/
@@ -900,6 +908,15 @@ NORET_TYPE void do_exit(long code)
if (unlikely(!tsk->pid))
panic("Attempted to kill the idle task!");
+ /*
+ * If do_exit is called because this processes oopsed, it's possible
+ * that get_fs() was left as KERNEL_DS, so reset it to USER_DS before
+ * continuing. Amongst other possible reasons, this is to prevent
+ * mm_release()->clear_child_tid() from writing to a user-controlled
+ * kernel address.
+ */
+ set_fs(USER_DS);
+
tracehook_report_exit(&code);
validate_creds_for_do_exit(tsk);
@@ -1383,8 +1400,7 @@ static int wait_task_stopped(struct wait_opts *wo,
if (!unlikely(wo->wo_flags & WNOWAIT))
*p_code = 0;
- /* don't need the RCU readlock here as we're holding a spinlock */
- uid = __task_cred(p)->uid;
+ uid = task_uid(p);
unlock_sig:
spin_unlock_irq(&p->sighand->siglock);
if (!exit_code)
@@ -1457,7 +1473,7 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
}
if (!unlikely(wo->wo_flags & WNOWAIT))
p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
- uid = __task_cred(p)->uid;
+ uid = task_uid(p);
spin_unlock_irq(&p->sighand->siglock);
pid = task_pid_vnr(p);
diff --git a/kernel/fork.c b/kernel/fork.c
index be07d4e18ed5..7ca45cab959c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -319,7 +319,7 @@ out:
#ifdef CONFIG_MMU
static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
{
- struct vm_area_struct *mpnt, *tmp, **pprev;
+ struct vm_area_struct *mpnt, *tmp, *prev, **pprev;
struct rb_node **rb_link, *rb_parent;
int retval;
unsigned long charge;
@@ -347,6 +347,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
if (retval)
goto out;
+ prev = NULL;
for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) {
struct file *file;
@@ -378,7 +379,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
goto fail_nomem_anon_vma_fork;
tmp->vm_flags &= ~VM_LOCKED;
tmp->vm_mm = mm;
- tmp->vm_next = NULL;
+ tmp->vm_next = tmp->vm_prev = NULL;
file = tmp->vm_file;
if (file) {
struct inode *inode = file->f_path.dentry->d_inode;
@@ -411,6 +412,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
*/
*pprev = tmp;
pprev = &tmp->vm_next;
+ tmp->vm_prev = prev;
+ prev = tmp;
__vma_link_rb(mm, tmp, rb_link, rb_parent);
rb_link = &tmp->vm_rb.rb_right;
diff --git a/kernel/futex.c b/kernel/futex.c
index 6a3a5fa1526d..185ee68a6d18 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -306,8 +306,8 @@ static int fault_in_user_writeable(u32 __user *uaddr)
int ret;
down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, (unsigned long)uaddr,
- 1, 1, 0, NULL, NULL);
+ ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
+ FAULT_FLAG_WRITE);
up_read(&mm->mmap_sem);
return ret < 0 ? ret : 0;
@@ -1363,7 +1363,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
{
struct futex_hash_bucket *hb;
- get_futex_key_refs(&q->key);
hb = hash_futex(&q->key);
q->lock_ptr = &hb->lock;
@@ -1375,7 +1374,6 @@ static inline void
queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
{
spin_unlock(&hb->lock);
- drop_futex_key_refs(&q->key);
}
/**
@@ -1480,8 +1478,6 @@ static void unqueue_me_pi(struct futex_q *q)
q->pi_state = NULL;
spin_unlock(q->lock_ptr);
-
- drop_futex_key_refs(&q->key);
}
/*
@@ -1812,7 +1808,10 @@ static int futex_wait(u32 __user *uaddr, int fshared,
}
retry:
- /* Prepare to wait on uaddr. */
+ /*
+ * Prepare to wait on uaddr. On success, holds hb lock and increments
+ * q.key refs.
+ */
ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
if (ret)
goto out;
@@ -1822,24 +1821,23 @@ retry:
/* If we were woken (and unqueued), we succeeded, whatever. */
ret = 0;
+ /* unqueue_me() drops q.key ref */
if (!unqueue_me(&q))
- goto out_put_key;
+ goto out;
ret = -ETIMEDOUT;
if (to && !to->task)
- goto out_put_key;
+ goto out;
/*
* We expect signal_pending(current), but we might be the
* victim of a spurious wakeup as well.
*/
- if (!signal_pending(current)) {
- put_futex_key(fshared, &q.key);
+ if (!signal_pending(current))
goto retry;
- }
ret = -ERESTARTSYS;
if (!abs_time)
- goto out_put_key;
+ goto out;
restart = &current_thread_info()->restart_block;
restart->fn = futex_wait_restart;
@@ -1856,8 +1854,6 @@ retry:
ret = -ERESTART_RESTARTBLOCK;
-out_put_key:
- put_futex_key(fshared, &q.key);
out:
if (to) {
hrtimer_cancel(&to->timer);
@@ -2236,7 +2232,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
q.rt_waiter = &rt_waiter;
q.requeue_pi_key = &key2;
- /* Prepare to wait on uaddr. */
+ /*
+ * Prepare to wait on uaddr. On success, increments q.key (key1) ref
+ * count.
+ */
ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
if (ret)
goto out_key2;
@@ -2254,7 +2253,9 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
* In order for us to be here, we know our q.key == key2, and since
* we took the hb->lock above, we also know that futex_requeue() has
* completed and we no longer have to concern ourselves with a wakeup
- * race with the atomic proxy lock acquition by the requeue code.
+ * race with the atomic proxy lock acquisition by the requeue code. The
+ * futex_requeue dropped our key1 reference and incremented our key2
+ * reference count.
*/
/* Check if the requeue code acquired the second futex for us. */
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index ef3c3f88a7a3..f83972b16564 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -33,10 +33,11 @@
* @children: child nodes
* @all: list head for list of all nodes
* @parent: parent node
- * @info: associated profiling data structure if not a directory
- * @ghost: when an object file containing profiling data is unloaded we keep a
- * copy of the profiling data here to allow collecting coverage data
- * for cleanup code. Such a node is called a "ghost".
+ * @loaded_info: array of pointers to profiling data sets for loaded object
+ * files.
+ * @num_loaded: number of profiling data sets for loaded object files.
+ * @unloaded_info: accumulated copy of profiling data sets for unloaded
+ * object files. Used only when gcov_persist=1.
* @dentry: main debugfs entry, either a directory or data file
* @links: associated symbolic links
* @name: data file basename
@@ -51,10 +52,11 @@ struct gcov_node {
struct list_head children;
struct list_head all;
struct gcov_node *parent;
- struct gcov_info *info;
- struct gcov_info *ghost;
+ struct gcov_info **loaded_info;
+ struct gcov_info *unloaded_info;
struct dentry *dentry;
struct dentry **links;
+ int num_loaded;
char name[0];
};
@@ -136,16 +138,37 @@ static const struct seq_operations gcov_seq_ops = {
};
/*
- * Return the profiling data set for a given node. This can either be the
- * original profiling data structure or a duplicate (also called "ghost")
- * in case the associated object file has been unloaded.
+ * Return a profiling data set associated with the given node. This is
+ * either a data set for a loaded object file or a data set copy in case
+ * all associated object files have been unloaded.
*/
static struct gcov_info *get_node_info(struct gcov_node *node)
{
- if (node->info)
- return node->info;
+ if (node->num_loaded > 0)
+ return node->loaded_info[0];
- return node->ghost;
+ return node->unloaded_info;
+}
+
+/*
+ * Return a newly allocated profiling data set which contains the sum of
+ * all profiling data associated with the given node.
+ */
+static struct gcov_info *get_accumulated_info(struct gcov_node *node)
+{
+ struct gcov_info *info;
+ int i = 0;
+
+ if (node->unloaded_info)
+ info = gcov_info_dup(node->unloaded_info);
+ else
+ info = gcov_info_dup(node->loaded_info[i++]);
+ if (!info)
+ return NULL;
+ for (; i < node->num_loaded; i++)
+ gcov_info_add(info, node->loaded_info[i]);
+
+ return info;
}
/*
@@ -163,9 +186,10 @@ static int gcov_seq_open(struct inode *inode, struct file *file)
mutex_lock(&node_lock);
/*
* Read from a profiling data copy to minimize reference tracking
- * complexity and concurrent access.
+ * complexity and concurrent access and to keep accumulating multiple
+ * profiling data sets associated with one node simple.
*/
- info = gcov_info_dup(get_node_info(node));
+ info = get_accumulated_info(node);
if (!info)
goto out_unlock;
iter = gcov_iter_new(info);
@@ -225,12 +249,25 @@ static struct gcov_node *get_node_by_name(const char *name)
return NULL;
}
+/*
+ * Reset all profiling data associated with the specified node.
+ */
+static void reset_node(struct gcov_node *node)
+{
+ int i;
+
+ if (node->unloaded_info)
+ gcov_info_reset(node->unloaded_info);
+ for (i = 0; i < node->num_loaded; i++)
+ gcov_info_reset(node->loaded_info[i]);
+}
+
static void remove_node(struct gcov_node *node);
/*
* write() implementation for gcov data files. Reset profiling data for the
- * associated file. If the object file has been unloaded (i.e. this is
- * a "ghost" node), remove the debug fs node as well.
+ * corresponding file. If all associated object files have been unloaded,
+ * remove the debug fs node as well.
*/
static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
size_t len, loff_t *pos)
@@ -245,10 +282,10 @@ static ssize_t gcov_seq_write(struct file *file, const char __user *addr,
node = get_node_by_name(info->filename);
if (node) {
/* Reset counts or remove node for unloaded modules. */
- if (node->ghost)
+ if (node->num_loaded == 0)
remove_node(node);
else
- gcov_info_reset(node->info);
+ reset_node(node);
}
/* Reset counts for open file. */
gcov_info_reset(info);
@@ -378,7 +415,10 @@ static void init_node(struct gcov_node *node, struct gcov_info *info,
INIT_LIST_HEAD(&node->list);
INIT_LIST_HEAD(&node->children);
INIT_LIST_HEAD(&node->all);
- node->info = info;
+ if (node->loaded_info) {
+ node->loaded_info[0] = info;
+ node->num_loaded = 1;
+ }
node->parent = parent;
if (name)
strcpy(node->name, name);
@@ -394,9 +434,13 @@ static struct gcov_node *new_node(struct gcov_node *parent,
struct gcov_node *node;
node = kzalloc(sizeof(struct gcov_node) + strlen(name) + 1, GFP_KERNEL);
- if (!node) {
- pr_warning("out of memory\n");
- return NULL;
+ if (!node)
+ goto err_nomem;
+ if (info) {
+ node->loaded_info = kcalloc(1, sizeof(struct gcov_info *),
+ GFP_KERNEL);
+ if (!node->loaded_info)
+ goto err_nomem;
}
init_node(node, info, name, parent);
/* Differentiate between gcov data file nodes and directory nodes. */
@@ -416,6 +460,11 @@ static struct gcov_node *new_node(struct gcov_node *parent,
list_add(&node->all, &all_head);
return node;
+
+err_nomem:
+ kfree(node);
+ pr_warning("out of memory\n");
+ return NULL;
}
/* Remove symbolic links associated with node. */
@@ -441,8 +490,9 @@ static void release_node(struct gcov_node *node)
list_del(&node->all);
debugfs_remove(node->dentry);
remove_links(node);
- if (node->ghost)
- gcov_info_free(node->ghost);
+ kfree(node->loaded_info);
+ if (node->unloaded_info)
+ gcov_info_free(node->unloaded_info);
kfree(node);
}
@@ -477,7 +527,7 @@ static struct gcov_node *get_child_by_name(struct gcov_node *parent,
/*
* write() implementation for reset file. Reset all profiling data to zero
- * and remove ghost nodes.
+ * and remove nodes for which all associated object files are unloaded.
*/
static ssize_t reset_write(struct file *file, const char __user *addr,
size_t len, loff_t *pos)
@@ -487,8 +537,8 @@ static ssize_t reset_write(struct file *file, const char __user *addr,
mutex_lock(&node_lock);
restart:
list_for_each_entry(node, &all_head, all) {
- if (node->info)
- gcov_info_reset(node->info);
+ if (node->num_loaded > 0)
+ reset_node(node);
else if (list_empty(&node->children)) {
remove_node(node);
/* Several nodes may have gone - restart loop. */
@@ -564,37 +614,115 @@ err_remove:
}
/*
- * The profiling data set associated with this node is being unloaded. Store a
- * copy of the profiling data and turn this node into a "ghost".
+ * Associate a profiling data set with an existing node. Needs to be called
+ * with node_lock held.
*/
-static int ghost_node(struct gcov_node *node)
+static void add_info(struct gcov_node *node, struct gcov_info *info)
{
- node->ghost = gcov_info_dup(node->info);
- if (!node->ghost) {
- pr_warning("could not save data for '%s' (out of memory)\n",
- node->info->filename);
- return -ENOMEM;
+ struct gcov_info **loaded_info;
+ int num = node->num_loaded;
+
+ /*
+ * Prepare new array. This is done first to simplify cleanup in
+ * case the new data set is incompatible, the node only contains
+ * unloaded data sets and there's not enough memory for the array.
+ */
+ loaded_info = kcalloc(num + 1, sizeof(struct gcov_info *), GFP_KERNEL);
+ if (!loaded_info) {
+ pr_warning("could not add '%s' (out of memory)\n",
+ info->filename);
+ return;
+ }
+ memcpy(loaded_info, node->loaded_info,
+ num * sizeof(struct gcov_info *));
+ loaded_info[num] = info;
+ /* Check if the new data set is compatible. */
+ if (num == 0) {
+ /*
+ * A module was unloaded, modified and reloaded. The new
+ * data set replaces the copy of the last one.
+ */
+ if (!gcov_info_is_compatible(node->unloaded_info, info)) {
+ pr_warning("discarding saved data for %s "
+ "(incompatible version)\n", info->filename);
+ gcov_info_free(node->unloaded_info);
+ node->unloaded_info = NULL;
+ }
+ } else {
+ /*
+ * Two different versions of the same object file are loaded.
+ * The initial one takes precedence.
+ */
+ if (!gcov_info_is_compatible(node->loaded_info[0], info)) {
+ pr_warning("could not add '%s' (incompatible "
+ "version)\n", info->filename);
+ kfree(loaded_info);
+ return;
+ }
}
- node->info = NULL;
+ /* Overwrite previous array. */
+ kfree(node->loaded_info);
+ node->loaded_info = loaded_info;
+ node->num_loaded = num + 1;
+}
- return 0;
+/*
+ * Return the index of a profiling data set associated with a node.
+ */
+static int get_info_index(struct gcov_node *node, struct gcov_info *info)
+{
+ int i;
+
+ for (i = 0; i < node->num_loaded; i++) {
+ if (node->loaded_info[i] == info)
+ return i;
+ }
+ return -ENOENT;
}
/*
- * Profiling data for this node has been loaded again. Add profiling data
- * from previous instantiation and turn this node into a regular node.
+ * Save the data of a profiling data set which is being unloaded.
*/
-static void revive_node(struct gcov_node *node, struct gcov_info *info)
+static void save_info(struct gcov_node *node, struct gcov_info *info)
{
- if (gcov_info_is_compatible(node->ghost, info))
- gcov_info_add(info, node->ghost);
+ if (node->unloaded_info)
+ gcov_info_add(node->unloaded_info, info);
else {
- pr_warning("discarding saved data for '%s' (version changed)\n",
+ node->unloaded_info = gcov_info_dup(info);
+ if (!node->unloaded_info) {
+ pr_warning("could not save data for '%s' "
+ "(out of memory)\n", info->filename);
+ }
+ }
+}
+
+/*
+ * Disassociate a profiling data set from a node. Needs to be called with
+ * node_lock held.
+ */
+static void remove_info(struct gcov_node *node, struct gcov_info *info)
+{
+ int i;
+
+ i = get_info_index(node, info);
+ if (i < 0) {
+ pr_warning("could not remove '%s' (not found)\n",
info->filename);
+ return;
}
- gcov_info_free(node->ghost);
- node->ghost = NULL;
- node->info = info;
+ if (gcov_persist)
+ save_info(node, info);
+ /* Shrink array. */
+ node->loaded_info[i] = node->loaded_info[node->num_loaded - 1];
+ node->num_loaded--;
+ if (node->num_loaded > 0)
+ return;
+ /* Last loaded data set was removed. */
+ kfree(node->loaded_info);
+ node->loaded_info = NULL;
+ node->num_loaded = 0;
+ if (!node->unloaded_info)
+ remove_node(node);
}
/*
@@ -609,30 +737,18 @@ void gcov_event(enum gcov_action action, struct gcov_info *info)
node = get_node_by_name(info->filename);
switch (action) {
case GCOV_ADD:
- /* Add new node or revive ghost. */
- if (!node) {
+ if (node)
+ add_info(node, info);
+ else
add_node(info);
- break;
- }
- if (gcov_persist)
- revive_node(node, info);
- else {
- pr_warning("could not add '%s' (already exists)\n",
- info->filename);
- }
break;
case GCOV_REMOVE:
- /* Remove node or turn into ghost. */
- if (!node) {
+ if (node)
+ remove_info(node, info);
+ else {
pr_warning("could not remove '%s' (not found)\n",
info->filename);
- break;
}
- if (gcov_persist) {
- if (!ghost_node(node))
- break;
- }
- remove_node(node);
break;
}
mutex_unlock(&node_lock);
diff --git a/kernel/groups.c b/kernel/groups.c
index 53b1916c9492..253dc0f35cf4 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -143,10 +143,9 @@ int groups_search(const struct group_info *group_info, gid_t grp)
right = group_info->ngroups;
while (left < right) {
unsigned int mid = (left+right)/2;
- int cmp = grp - GROUP_AT(group_info, mid);
- if (cmp > 0)
+ if (grp > GROUP_AT(group_info, mid))
left = mid + 1;
- else if (cmp < 0)
+ else if (grp < GROUP_AT(group_info, mid))
right = mid;
else
return 1;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 5c69e996bd0f..21e0c5e468b6 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -936,6 +936,7 @@ static inline int
remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
{
if (hrtimer_is_queued(timer)) {
+ unsigned long state;
int reprogram;
/*
@@ -949,8 +950,13 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
debug_deactivate(timer);
timer_stats_hrtimer_clear_start_info(timer);
reprogram = base->cpu_base == &__get_cpu_var(hrtimer_bases);
- __remove_hrtimer(timer, base, HRTIMER_STATE_INACTIVE,
- reprogram);
+ /*
+ * We must preserve the CALLBACK state flag here,
+ * otherwise we could move the timer base in
+ * switch_hrtimer_base.
+ */
+ state = timer->state & HRTIMER_STATE_CALLBACK;
+ __remove_hrtimer(timer, base, state, reprogram);
return 1;
}
return 0;
@@ -1237,6 +1243,9 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
enqueue_hrtimer(timer, base);
}
+
+ WARN_ON_ONCE(!(timer->state & HRTIMER_STATE_CALLBACK));
+
timer->state &= ~HRTIMER_STATE_CALLBACK;
}
diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c
index 7a56b22e0602..6eb642219713 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/hw_breakpoint.c
@@ -417,7 +417,8 @@ register_user_hw_breakpoint(struct perf_event_attr *attr,
perf_overflow_handler_t triggered,
struct task_struct *tsk)
{
- return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered);
+ return perf_event_create_kernel_counter(attr, -1, task_pid_vnr(tsk),
+ triggered);
}
EXPORT_SYMBOL_GPL(register_user_hw_breakpoint);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e1497481fe8a..a6a8b6af1fe7 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -216,7 +216,7 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
{
if (suspend) {
- if (!desc->action || (desc->action->flags & IRQF_TIMER))
+ if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
return;
desc->status |= IRQ_SUSPENDED;
}
@@ -281,8 +281,17 @@ EXPORT_SYMBOL(disable_irq);
void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
{
- if (resume)
+ if (resume) {
+ if (!(desc->status & IRQ_SUSPENDED)) {
+ if (!desc->action)
+ return;
+ if (!(desc->action->flags & IRQF_FORCE_RESUME))
+ return;
+ /* Pretend that it got disabled ! */
+ desc->depth++;
+ }
desc->status &= ~IRQ_SUSPENDED;
+ }
switch (desc->depth) {
case 0:
@@ -1093,7 +1102,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
if (retval)
kfree(action);
-#ifdef CONFIG_DEBUG_SHIRQ
+#ifdef CONFIG_DEBUG_SHIRQ_FIXME
if (!retval && (irqflags & IRQF_SHARED)) {
/*
* It's a shared IRQ -- the driver ought to be prepared for it
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 0d4005d85b03..d6bfb89cce91 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -53,9 +53,6 @@ void resume_device_irqs(void)
for_each_irq_desc(irq, desc) {
unsigned long flags;
- if (!(desc->status & IRQ_SUSPENDED))
- continue;
-
raw_spin_lock_irqsave(&desc->lock, flags);
__enable_irq(desc, irq, true);
raw_spin_unlock_irqrestore(&desc->lock, flags);
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 09a2ee540bd2..345e0b75fe1e 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -214,7 +214,7 @@ static int irq_spurious_proc_show(struct seq_file *m, void *v)
static int irq_spurious_proc_open(struct inode *inode, struct file *file)
{
- return single_open(file, irq_spurious_proc_show, NULL);
+ return single_open(file, irq_spurious_proc_show, PDE(inode)->data);
}
static const struct file_operations irq_spurious_proc_fops = {
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 877fb306d415..17110a4a4fc2 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
account_global_scheduler_latency(tsk, &lat);
- /*
- * short term hack; if we're > 32 we stop; future we recycle:
- */
- tsk->latency_record_count++;
- if (tsk->latency_record_count >= LT_SAVECOUNT)
- goto out_unlock;
-
- for (i = 0; i < LT_SAVECOUNT; i++) {
+ for (i = 0; i < tsk->latency_record_count; i++) {
struct latency_record *mylat;
int same = 1;
@@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
}
}
+ /*
+ * short term hack; if we're > 32 we stop; future we recycle:
+ */
+ if (tsk->latency_record_count >= LT_SAVECOUNT)
+ goto out_unlock;
+
/* Allocated a new one: */
- i = tsk->latency_record_count;
+ i = tsk->latency_record_count++;
memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
out_unlock:
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 54286798c37b..e6c932b4ddc6 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -3250,7 +3250,7 @@ int lock_is_held(struct lockdep_map *lock)
int ret = 0;
if (unlikely(current->lockdep_recursion))
- return ret;
+ return 1; /* avoid false negative lockdep_assert_held() */
raw_local_irq_save(flags);
check_flags(flags);
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index ff86c558af4c..fa8123e48353 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -58,7 +58,8 @@ static atomic_t nr_task_events __read_mostly;
*/
int sysctl_perf_event_paranoid __read_mostly = 1;
-int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
+/* Minimum for 128 pages + 1 for the user control page */
+int sysctl_perf_event_mlock __read_mostly = 516; /* 'free' kb per user */
/*
* max perf event sample rate
@@ -1609,8 +1610,12 @@ static void rotate_ctx(struct perf_event_context *ctx)
{
raw_spin_lock(&ctx->lock);
- /* Rotate the first entry last of non-pinned groups */
- list_rotate_left(&ctx->flexible_groups);
+ /*
+ * Rotate the first entry last of non-pinned groups. Rotation might be
+ * disabled by the inheritance code.
+ */
+ if (!ctx->rotate_disable)
+ list_rotate_left(&ctx->flexible_groups);
raw_spin_unlock(&ctx->lock);
}
@@ -1757,7 +1762,13 @@ static u64 perf_event_read(struct perf_event *event)
unsigned long flags;
raw_spin_lock_irqsave(&ctx->lock, flags);
- update_context_time(ctx);
+ /*
+ * may read while context is not active
+ * (e.g., thread is blocked), in that case
+ * we cannot update context time
+ */
+ if (ctx->is_active)
+ update_context_time(ctx);
update_event_times(event);
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
@@ -5390,17 +5401,20 @@ __perf_event_exit_task(struct perf_event *child_event,
struct perf_event_context *child_ctx,
struct task_struct *child)
{
- struct perf_event *parent_event;
+ if (child_event->parent) {
+ raw_spin_lock_irq(&child_ctx->lock);
+ perf_group_detach(child_event);
+ raw_spin_unlock_irq(&child_ctx->lock);
+ }
perf_event_remove_from_context(child_event);
- parent_event = child_event->parent;
/*
- * It can happen that parent exits first, and has events
+ * It can happen that the parent exits first, and has events
* that are still around due to the child reference. These
- * events need to be zapped - but otherwise linger.
+ * events need to be zapped.
*/
- if (parent_event) {
+ if (child_event->parent) {
sync_child_event(child_event, child);
free_event(child_event);
}
@@ -5590,6 +5604,7 @@ int perf_event_init_task(struct task_struct *child)
struct perf_event *event;
struct task_struct *parent = current;
int inherited_all = 1;
+ unsigned long flags;
int ret = 0;
child->perf_event_ctxp = NULL;
@@ -5630,6 +5645,15 @@ int perf_event_init_task(struct task_struct *child)
break;
}
+ /*
+ * We can't hold ctx->lock when iterating the ->flexible_group list due
+ * to allocations, but we need to prevent rotation because
+ * rotate_ctx() will change the list from interrupt context.
+ */
+ raw_spin_lock_irqsave(&parent_ctx->lock, flags);
+ parent_ctx->rotate_disable = 1;
+ raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+
list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
ret = inherit_task_group(event, parent, parent_ctx, child,
&inherited_all);
@@ -5637,6 +5661,10 @@ int perf_event_init_task(struct task_struct *child)
break;
}
+ raw_spin_lock_irqsave(&parent_ctx->lock, flags);
+ parent_ctx->rotate_disable = 0;
+ raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+
child_ctx = child->perf_event_ctxp;
if (child_ctx && inherited_all) {
diff --git a/kernel/pid.c b/kernel/pid.c
index e9fd8c132d26..4d0a9fc6e8a0 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -183,11 +183,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
return -1;
}
-int next_pidmap(struct pid_namespace *pid_ns, int last)
+int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
{
int offset;
struct pidmap *map, *end;
+ if (last >= PID_MAX_LIMIT)
+ return -1;
+
offset = (last + 1) & BITS_PER_PAGE_MASK;
map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
end = &pid_ns->pidmap[PIDMAP_ENTRIES];
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index aa9e916da4d5..684463e7e263 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -324,7 +324,6 @@ static int create_image(int platform_mode)
int hibernation_snapshot(int platform_mode)
{
int error;
- gfp_t saved_mask;
error = platform_begin(platform_mode);
if (error)
@@ -336,7 +335,7 @@ int hibernation_snapshot(int platform_mode)
goto Close;
suspend_console();
- saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
+ pm_restrict_gfp_mask();
error = dpm_suspend_start(PMSG_FREEZE);
if (error)
goto Recover_platform;
@@ -345,7 +344,10 @@ int hibernation_snapshot(int platform_mode)
goto Recover_platform;
error = create_image(platform_mode);
- /* Control returns here after successful restore */
+ /*
+ * Control returns here (1) after the image has been created or the
+ * image creation has failed and (2) after a successful restore.
+ */
Resume_devices:
/* We may need to release the preallocated image pages here. */
@@ -354,7 +356,10 @@ int hibernation_snapshot(int platform_mode)
dpm_resume_end(in_suspend ?
(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
- set_gfp_allowed_mask(saved_mask);
+
+ if (error || !in_suspend)
+ pm_restore_gfp_mask();
+
resume_console();
Close:
platform_end(platform_mode);
@@ -449,17 +454,16 @@ static int resume_target_kernel(bool platform_mode)
int hibernation_restore(int platform_mode)
{
int error;
- gfp_t saved_mask;
pm_prepare_console();
suspend_console();
- saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
+ pm_restrict_gfp_mask();
error = dpm_suspend_start(PMSG_QUIESCE);
if (!error) {
error = resume_target_kernel(platform_mode);
dpm_resume_end(PMSG_RECOVER);
}
- set_gfp_allowed_mask(saved_mask);
+ pm_restore_gfp_mask();
resume_console();
pm_restore_console();
return error;
@@ -473,7 +477,6 @@ int hibernation_restore(int platform_mode)
int hibernation_platform_enter(void)
{
int error;
- gfp_t saved_mask;
if (!hibernation_ops)
return -ENOSYS;
@@ -489,7 +492,6 @@ int hibernation_platform_enter(void)
entering_platform_hibernation = true;
suspend_console();
- saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
error = dpm_suspend_start(PMSG_HIBERNATE);
if (error) {
if (hibernation_ops->recover)
@@ -527,7 +529,6 @@ int hibernation_platform_enter(void)
Resume_devices:
entering_platform_hibernation = false;
dpm_resume_end(PMSG_RESTORE);
- set_gfp_allowed_mask(saved_mask);
resume_console();
Close:
@@ -635,6 +636,7 @@ int hibernate(void)
swsusp_free();
if (!error)
power_down();
+ pm_restore_gfp_mask();
} else {
pr_debug("PM: Image restored successfully.\n");
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index bfb684a4d412..a6797d310227 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -302,6 +302,7 @@ static int __init pm_init(void)
int error = pm_start_workqueue();
if (error)
return error;
+ hibernate_image_size_init();
power_kobj = kobject_create_and_add("power", NULL);
if (!power_kobj)
return -ENOMEM;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 006270fe382d..54580cb4eaa7 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -14,6 +14,9 @@ struct swsusp_info {
} __attribute__((aligned(PAGE_SIZE)));
#ifdef CONFIG_HIBERNATION
+/* kernel/power/snapshot.c */
+extern void __init hibernate_image_size_init(void);
+
#ifdef CONFIG_ARCH_HIBERNATION_HEADER
/* Maximum size of architecture specific data in a hibernation header */
#define MAX_ARCH_HEADER_SIZE (sizeof(struct new_utsname) + 4)
@@ -49,7 +52,11 @@ static inline char *check_image_kernel(struct swsusp_info *info)
extern int hibernation_snapshot(int platform_mode);
extern int hibernation_restore(int platform_mode);
extern int hibernation_platform_enter(void);
-#endif
+
+#else /* !CONFIG_HIBERNATION */
+
+static inline void hibernate_image_size_init(void) {}
+#endif /* !CONFIG_HIBERNATION */
extern int pfn_is_nosave(unsigned long);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 25ce010e9f8b..d0c3c4420eec 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -46,7 +46,12 @@ static void swsusp_unset_page_forbidden(struct page *);
* size will not exceed N bytes, but if that is impossible, it will
* try to create the smallest image possible.
*/
-unsigned long image_size = 500 * 1024 * 1024;
+unsigned long image_size;
+
+void __init hibernate_image_size_init(void)
+{
+ image_size = ((totalram_pages * 2) / 5) * PAGE_SIZE;
+}
/* List of PBEs needed for restoring the pages that were allocated before
* the suspend and included in the suspend image, but have also been
@@ -1121,9 +1126,19 @@ static unsigned long preallocate_image_pages(unsigned long nr_pages, gfp_t mask)
return nr_alloc;
}
-static unsigned long preallocate_image_memory(unsigned long nr_pages)
+static unsigned long preallocate_image_memory(unsigned long nr_pages,
+ unsigned long avail_normal)
{
- return preallocate_image_pages(nr_pages, GFP_IMAGE);
+ unsigned long alloc;
+
+ if (avail_normal <= alloc_normal)
+ return 0;
+
+ alloc = avail_normal - alloc_normal;
+ if (nr_pages < alloc)
+ alloc = nr_pages;
+
+ return preallocate_image_pages(alloc, GFP_IMAGE);
}
#ifdef CONFIG_HIGHMEM
@@ -1169,15 +1184,26 @@ static inline unsigned long preallocate_highmem_fraction(unsigned long nr_pages,
*/
static void free_unnecessary_pages(void)
{
- unsigned long save_highmem, to_free_normal, to_free_highmem;
+ unsigned long save, to_free_normal, to_free_highmem;
- to_free_normal = alloc_normal - count_data_pages();
- save_highmem = count_highmem_pages();
- if (alloc_highmem > save_highmem) {
- to_free_highmem = alloc_highmem - save_highmem;
+ save = count_data_pages();
+ if (alloc_normal >= save) {
+ to_free_normal = alloc_normal - save;
+ save = 0;
+ } else {
+ to_free_normal = 0;
+ save -= alloc_normal;
+ }
+ save += count_highmem_pages();
+ if (alloc_highmem >= save) {
+ to_free_highmem = alloc_highmem - save;
} else {
to_free_highmem = 0;
- to_free_normal -= save_highmem - alloc_highmem;
+ save -= alloc_highmem;
+ if (to_free_normal > save)
+ to_free_normal -= save;
+ else
+ to_free_normal = 0;
}
memory_bm_position_reset(&copy_bm);
@@ -1258,7 +1284,7 @@ int hibernate_preallocate_memory(void)
{
struct zone *zone;
unsigned long saveable, size, max_size, count, highmem, pages = 0;
- unsigned long alloc, save_highmem, pages_highmem;
+ unsigned long alloc, save_highmem, pages_highmem, avail_normal;
struct timeval start, stop;
int error;
@@ -1295,26 +1321,38 @@ int hibernate_preallocate_memory(void)
else
count += zone_page_state(zone, NR_FREE_PAGES);
}
+ avail_normal = count;
count += highmem;
count -= totalreserve_pages;
/* Compute the maximum number of saveable pages to leave in memory. */
max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES;
+ /* Compute the desired number of image pages specified by image_size. */
size = DIV_ROUND_UP(image_size, PAGE_SIZE);
if (size > max_size)
size = max_size;
/*
- * If the maximum is not less than the current number of saveable pages
- * in memory, allocate page frames for the image and we're done.
+ * If the desired number of image pages is at least as large as the
+ * current number of saveable pages in memory, allocate page frames for
+ * the image and we're done.
*/
if (size >= saveable) {
pages = preallocate_image_highmem(save_highmem);
- pages += preallocate_image_memory(saveable - pages);
+ pages += preallocate_image_memory(saveable - pages, avail_normal);
goto out;
}
/* Estimate the minimum size of the image. */
pages = minimum_image_size(saveable);
+ /*
+ * To avoid excessive pressure on the normal zone, leave room in it to
+ * accommodate an image of the minimum size (unless it's already too
+ * small, in which case don't preallocate pages from it at all).
+ */
+ if (avail_normal > pages)
+ avail_normal -= pages;
+ else
+ avail_normal = 0;
if (size < pages)
size = min_t(unsigned long, pages, max_size);
@@ -1335,16 +1373,34 @@ int hibernate_preallocate_memory(void)
*/
pages_highmem = preallocate_image_highmem(highmem / 2);
alloc = (count - max_size) - pages_highmem;
- pages = preallocate_image_memory(alloc);
- if (pages < alloc)
- goto err_out;
- size = max_size - size;
- alloc = size;
- size = preallocate_highmem_fraction(size, highmem, count);
- pages_highmem += size;
- alloc -= size;
- pages += preallocate_image_memory(alloc);
- pages += pages_highmem;
+ pages = preallocate_image_memory(alloc, avail_normal);
+ if (pages < alloc) {
+ /* We have exhausted non-highmem pages, try highmem. */
+ alloc -= pages;
+ pages += pages_highmem;
+ pages_highmem = preallocate_image_highmem(alloc);
+ if (pages_highmem < alloc)
+ goto err_out;
+ pages += pages_highmem;
+ /*
+ * size is the desired number of saveable pages to leave in
+ * memory, so try to preallocate (all memory - size) pages.
+ */
+ alloc = (count - pages) - size;
+ pages += preallocate_image_highmem(alloc);
+ } else {
+ /*
+ * There are approximately max_size saveable pages at this point
+ * and we want to reduce this number down to size.
+ */
+ alloc = max_size - size;
+ size = preallocate_highmem_fraction(alloc, highmem, count);
+ pages_highmem += size;
+ alloc -= size;
+ size = preallocate_image_memory(alloc, avail_normal);
+ pages_highmem += preallocate_image_highmem(alloc - size);
+ pages += pages_highmem + size;
+ }
/*
* We only need as many page frames for the image as there are saveable
@@ -1467,11 +1523,8 @@ static int
swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
unsigned int nr_pages, unsigned int nr_highmem)
{
- int error = 0;
-
if (nr_highmem > 0) {
- error = get_highmem_buffer(PG_ANY);
- if (error)
+ if (get_highmem_buffer(PG_ANY))
goto err_out;
if (nr_highmem > alloc_highmem) {
nr_highmem -= alloc_highmem;
@@ -1494,7 +1547,7 @@ swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
err_out:
swsusp_free();
- return error;
+ return -ENOMEM;
}
asmlinkage int swsusp_save(void)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index f37cb7dd4402..b0f28dddc04e 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -196,7 +196,6 @@ static int suspend_enter(suspend_state_t state)
int suspend_devices_and_enter(suspend_state_t state)
{
int error;
- gfp_t saved_mask;
if (!suspend_ops)
return -ENOSYS;
@@ -207,7 +206,7 @@ int suspend_devices_and_enter(suspend_state_t state)
goto Close;
}
suspend_console();
- saved_mask = clear_gfp_allowed_mask(GFP_IOFS);
+ pm_restrict_gfp_mask();
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
@@ -224,7 +223,7 @@ int suspend_devices_and_enter(suspend_state_t state)
suspend_test_start();
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
- set_gfp_allowed_mask(saved_mask);
+ pm_restore_gfp_mask();
resume_console();
Close:
if (suspend_ops->end)
diff --git a/kernel/power/user.c b/kernel/power/user.c
index e819e17877ca..fe4f1ab3f746 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -113,8 +113,10 @@ static int snapshot_open(struct inode *inode, struct file *filp)
if (error)
pm_notifier_call_chain(PM_POST_RESTORE);
}
- if (error)
+ if (error) {
+ free_basic_memory_bitmaps();
atomic_inc(&snapshot_device_available);
+ }
data->frozen = 0;
data->ready = 0;
data->platform_support = 0;
@@ -137,7 +139,7 @@ static int snapshot_release(struct inode *inode, struct file *filp)
free_all_swap_pages(data->swap);
if (data->frozen)
thaw_processes();
- pm_notifier_call_chain(data->mode == O_WRONLY ?
+ pm_notifier_call_chain(data->mode == O_RDONLY ?
PM_POST_HIBERNATION : PM_POST_RESTORE);
atomic_inc(&snapshot_device_available);
@@ -263,6 +265,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
case SNAPSHOT_UNFREEZE:
if (!data->frozen || data->ready)
break;
+ pm_restore_gfp_mask();
thaw_processes();
usermodehelper_enable();
data->frozen = 0;
@@ -275,6 +278,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
error = -EPERM;
break;
}
+ pm_restore_gfp_mask();
error = hibernation_snapshot(data->platform_support);
if (!error)
error = put_user(in_suspend, (int __user *)arg);
diff --git a/kernel/printk.c b/kernel/printk.c
index 444b770c9595..b7b7f90aad6e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1034,13 +1034,15 @@ void printk_tick(void)
int printk_needs_cpu(int cpu)
{
+ if (unlikely(cpu_is_offline(cpu)))
+ printk_tick();
return per_cpu(printk_pending, cpu);
}
void wake_up_klogd(void)
{
if (waitqueue_active(&log_wait))
- __raw_get_cpu_var(printk_pending) = 1;
+ this_cpu_write(printk_pending, 1);
}
/**
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 74a3d693c196..752321eeca8b 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -313,7 +313,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
child->exit_code = data;
dead = __ptrace_detach(current, child);
if (!child->exit_state)
- wake_up_process(child);
+ wake_up_state(child, TASK_TRACED | TASK_STOPPED);
}
write_unlock_irq(&tasklist_lock);
diff --git a/kernel/sched.c b/kernel/sched.c
index 889aef148201..d40d662392cf 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -491,6 +491,7 @@ struct rq {
struct mm_struct *prev_mm;
u64 clock;
+ u64 clock_task;
atomic_t nr_iowait;
@@ -518,6 +519,10 @@ struct rq {
u64 avg_idle;
#endif
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ u64 prev_irq_time;
+#endif
+
/* calc_load related fields */
unsigned long calc_load_update;
long calc_load_active;
@@ -641,10 +646,21 @@ static inline struct task_group *task_group(struct task_struct *p)
#endif /* CONFIG_CGROUP_SCHED */
+static u64 irq_time_cpu(int cpu);
+static void sched_irq_time_avg_update(struct rq *rq, u64 irq_time);
+
inline void update_rq_clock(struct rq *rq)
{
+ int cpu = cpu_of(rq);
+ u64 irq_time;
+
if (!rq->skip_clock_update)
rq->clock = sched_clock_cpu(cpu_of(rq));
+ irq_time = irq_time_cpu(cpu);
+ if (rq->clock - irq_time > rq->clock_task)
+ rq->clock_task = rq->clock - irq_time;
+
+ sched_irq_time_avg_update(rq, irq_time);
}
/*
@@ -1231,16 +1247,6 @@ void wake_up_idle_cpu(int cpu)
smp_send_reschedule(cpu);
}
-int nohz_ratelimit(int cpu)
-{
- struct rq *rq = cpu_rq(cpu);
- u64 diff = rq->clock - rq->nohz_stamp;
-
- rq->nohz_stamp = rq->clock;
-
- return diff < (NSEC_PER_SEC / HZ) >> 1;
-}
-
#endif /* CONFIG_NO_HZ */
static u64 sched_avg_period(void)
@@ -1280,6 +1286,10 @@ static void resched_task(struct task_struct *p)
static void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
{
}
+
+static void sched_avg_update(struct rq *rq)
+{
+}
#endif /* CONFIG_SMP */
#if BITS_PER_LONG == 32
@@ -1825,6 +1835,94 @@ static const struct sched_class rt_sched_class;
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+
+/*
+ * There are no locks covering percpu hardirq/softirq time.
+ * They are only modified in account_system_vtime, on corresponding CPU
+ * with interrupts disabled. So, writes are safe.
+ * They are read and saved off onto struct rq in update_rq_clock().
+ * This may result in other CPU reading this CPU's irq time and can
+ * race with irq/account_system_vtime on this CPU. We would either get old
+ * or new value (or semi updated value on 32 bit) with a side effect of
+ * accounting a slice of irq time to wrong task when irq is in progress
+ * while we read rq->clock. That is a worthy compromise in place of having
+ * locks on each irq in account_system_time.
+ */
+static DEFINE_PER_CPU(u64, cpu_hardirq_time);
+static DEFINE_PER_CPU(u64, cpu_softirq_time);
+
+static DEFINE_PER_CPU(u64, irq_start_time);
+static int sched_clock_irqtime;
+
+void enable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 1;
+}
+
+void disable_sched_clock_irqtime(void)
+{
+ sched_clock_irqtime = 0;
+}
+
+static u64 irq_time_cpu(int cpu)
+{
+ if (!sched_clock_irqtime)
+ return 0;
+
+ return per_cpu(cpu_softirq_time, cpu) + per_cpu(cpu_hardirq_time, cpu);
+}
+
+void account_system_vtime(struct task_struct *curr)
+{
+ unsigned long flags;
+ int cpu;
+ u64 now, delta;
+
+ if (!sched_clock_irqtime)
+ return;
+
+ local_irq_save(flags);
+
+ cpu = smp_processor_id();
+ now = sched_clock_cpu(cpu);
+ delta = now - per_cpu(irq_start_time, cpu);
+ per_cpu(irq_start_time, cpu) = now;
+ /*
+ * We do not account for softirq time from ksoftirqd here.
+ * We want to continue accounting softirq time to ksoftirqd thread
+ * in that case, so as not to confuse scheduler with a special task
+ * that do not consume any time, but still wants to run.
+ */
+ if (hardirq_count())
+ per_cpu(cpu_hardirq_time, cpu) += delta;
+ else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+ per_cpu(cpu_softirq_time, cpu) += delta;
+
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(account_system_vtime);
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time)
+{
+ if (sched_clock_irqtime && sched_feat(NONIRQ_POWER)) {
+ u64 delta_irq = curr_irq_time - rq->prev_irq_time;
+ rq->prev_irq_time = curr_irq_time;
+ sched_rt_avg_update(rq, delta_irq);
+ }
+}
+
+#else
+
+static u64 irq_time_cpu(int cpu)
+{
+ return 0;
+}
+
+static void sched_irq_time_avg_update(struct rq *rq, u64 curr_irq_time) { }
+
+#endif
+
#include "sched_stats.h"
static void inc_nr_running(struct rq *rq)
@@ -1978,6 +2076,9 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
if (p->sched_class != &fair_sched_class)
return 0;
+ if (unlikely(p->policy == SCHED_IDLE))
+ return 0;
+
/*
* Buddy candidates are cache hot:
*/
@@ -2900,6 +3001,15 @@ static long calc_load_fold_active(struct rq *this_rq)
return delta;
}
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
+{
+ load *= exp;
+ load += active * (FIXED_1 - exp);
+ load += 1UL << (FSHIFT - 1);
+ return load >> FSHIFT;
+}
+
#ifdef CONFIG_NO_HZ
/*
* For NO_HZ we delay the active fold to the next LOAD_FREQ update.
@@ -2929,6 +3039,128 @@ static long calc_load_fold_idle(void)
return delta;
}
+
+/**
+ * fixed_power_int - compute: x^n, in O(log n) time
+ *
+ * @x: base of the power
+ * @frac_bits: fractional bits of @x
+ * @n: power to raise @x to.
+ *
+ * By exploiting the relation between the definition of the natural power
+ * function: x^n := x*x*...*x (x multiplied by itself for n times), and
+ * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
+ * (where: n_i \elem {0, 1}, the binary vector representing n),
+ * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
+ * of course trivially computable in O(log_2 n), the length of our binary
+ * vector.
+ */
+static unsigned long
+fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
+{
+ unsigned long result = 1UL << frac_bits;
+
+ if (n) for (;;) {
+ if (n & 1) {
+ result *= x;
+ result += 1UL << (frac_bits - 1);
+ result >>= frac_bits;
+ }
+ n >>= 1;
+ if (!n)
+ break;
+ x *= x;
+ x += 1UL << (frac_bits - 1);
+ x >>= frac_bits;
+ }
+
+ return result;
+}
+
+/*
+ * a1 = a0 * e + a * (1 - e)
+ *
+ * a2 = a1 * e + a * (1 - e)
+ * = (a0 * e + a * (1 - e)) * e + a * (1 - e)
+ * = a0 * e^2 + a * (1 - e) * (1 + e)
+ *
+ * a3 = a2 * e + a * (1 - e)
+ * = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
+ * = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
+ *
+ * ...
+ *
+ * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
+ * = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
+ * = a0 * e^n + a * (1 - e^n)
+ *
+ * [1] application of the geometric series:
+ *
+ * n 1 - x^(n+1)
+ * S_n := \Sum x^i = -------------
+ * i=0 1 - x
+ */
+static unsigned long
+calc_load_n(unsigned long load, unsigned long exp,
+ unsigned long active, unsigned int n)
+{
+
+ return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
+}
+
+/*
+ * NO_HZ can leave us missing all per-cpu ticks calling
+ * calc_load_account_active(), but since an idle CPU folds its delta into
+ * calc_load_tasks_idle per calc_load_account_idle(), all we need to do is fold
+ * in the pending idle delta if our idle period crossed a load cycle boundary.
+ *
+ * Once we've updated the global active value, we need to apply the exponential
+ * weights adjusted to the number of cycles missed.
+ */
+static void calc_global_nohz(unsigned long ticks)
+{
+ long delta, active, n;
+
+ if (time_before(jiffies, calc_load_update))
+ return;
+
+ /*
+ * If we crossed a calc_load_update boundary, make sure to fold
+ * any pending idle changes, the respective CPUs might have
+ * missed the tick driven calc_load_account_active() update
+ * due to NO_HZ.
+ */
+ delta = calc_load_fold_idle();
+ if (delta)
+ atomic_long_add(delta, &calc_load_tasks);
+
+ /*
+ * If we were idle for multiple load cycles, apply them.
+ */
+ if (ticks >= LOAD_FREQ) {
+ n = ticks / LOAD_FREQ;
+
+ active = atomic_long_read(&calc_load_tasks);
+ active = active > 0 ? active * FIXED_1 : 0;
+
+ avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
+ avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
+ avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);
+
+ calc_load_update += n * LOAD_FREQ;
+ }
+
+ /*
+ * Its possible the remainder of the above division also crosses
+ * a LOAD_FREQ period, the regular check in calc_global_load()
+ * which comes after this will take care of that.
+ *
+ * Consider us being 11 ticks before a cycle completion, and us
+ * sleeping for 4*LOAD_FREQ + 22 ticks, then the above code will
+ * age us 4 cycles, and the test in calc_global_load() will
+ * pick up the final one.
+ */
+}
#else
static void calc_load_account_idle(struct rq *this_rq)
{
@@ -2938,6 +3170,10 @@ static inline long calc_load_fold_idle(void)
{
return 0;
}
+
+static void calc_global_nohz(unsigned long ticks)
+{
+}
#endif
/**
@@ -2955,24 +3191,17 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
loads[2] = (avenrun[2] + offset) << shift;
}
-static unsigned long
-calc_load(unsigned long load, unsigned long exp, unsigned long active)
-{
- load *= exp;
- load += active * (FIXED_1 - exp);
- return load >> FSHIFT;
-}
-
/*
* calc_load - update the avenrun load estimates 10 ticks after the
* CPUs have updated calc_load_tasks.
*/
-void calc_global_load(void)
+void calc_global_load(unsigned long ticks)
{
- unsigned long upd = calc_load_update + 10;
long active;
- if (time_before(jiffies, upd))
+ calc_global_nohz(ticks);
+
+ if (time_before(jiffies, calc_load_update + 10))
return;
active = atomic_long_read(&calc_load_tasks);
@@ -3034,6 +3263,8 @@ static void update_cpu_load(struct rq *this_rq)
}
calc_load_account_active(this_rq);
+
+ sched_avg_update(this_rq);
}
#ifdef CONFIG_SMP
@@ -3087,7 +3318,7 @@ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
if (task_current(rq, p)) {
update_rq_clock(rq);
- ns = rq->clock - p->se.exec_start;
+ ns = rq->clock_task - p->se.exec_start;
if ((s64)ns < 0)
ns = 0;
}
@@ -3236,7 +3467,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
tmp = cputime_to_cputime64(cputime);
if (hardirq_count() - hardirq_offset)
cpustat->irq = cputime64_add(cpustat->irq, tmp);
- else if (softirq_count())
+ else if (in_serving_softirq())
cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
else
cpustat->system = cputime64_add(cpustat->system, tmp);
@@ -3697,8 +3928,16 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
/*
* Owner changed, break to re-assess state.
*/
- if (lock->owner != owner)
+ if (lock->owner != owner) {
+ /*
+ * If the lock has switched to a different owner,
+ * we likely have heavy contention. Return 0 to quit
+ * optimistic spinning and not contend further:
+ */
+ if (lock->owner)
+ return 0;
break;
+ }
/*
* Is that owner really running on that cpu?
@@ -5189,7 +5428,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
* The idle tasks have their own, simple scheduling class:
*/
idle->sched_class = &idle_sched_class;
- ftrace_graph_init_task(idle);
+ ftrace_graph_init_idle_task(idle, cpu);
}
/*
@@ -6600,6 +6839,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
if (cpu != group_first_cpu(sd->groups))
return;
+ sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
+
child = sd->child;
sd->groups->cpu_power = 0;
@@ -8093,12 +8334,12 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
- set_task_rq(tsk, task_cpu(tsk));
-
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->moved_group)
- tsk->sched_class->moved_group(tsk, on_rq);
+ if (tsk->sched_class->task_move_group)
+ tsk->sched_class->task_move_group(tsk, on_rq);
+ else
#endif
+ set_task_rq(tsk, task_cpu(tsk));
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a878b5332daa..09a98114ec32 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -519,7 +519,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
static void update_curr(struct cfs_rq *cfs_rq)
{
struct sched_entity *curr = cfs_rq->curr;
- u64 now = rq_of(cfs_rq)->clock;
+ u64 now = rq_of(cfs_rq)->clock_task;
unsigned long delta_exec;
if (unlikely(!curr))
@@ -602,7 +602,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
/*
* We are starting a new run period:
*/
- se->exec_start = rq_of(cfs_rq)->clock;
+ se->exec_start = rq_of(cfs_rq)->clock_task;
}
/**************************************************
@@ -1765,6 +1765,10 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
set_task_cpu(p, this_cpu);
activate_task(this_rq, p, 0);
check_preempt_curr(this_rq, p, 0);
+
+ /* re-arm NEWIDLE balancing when moving tasks */
+ src_rq->avg_idle = this_rq->avg_idle = 2*sysctl_sched_migration_cost;
+ this_rq->idle_stamp = 0;
}
/*
@@ -1799,7 +1803,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
* 2) too many balance attempts have failed.
*/
- tsk_cache_hot = task_hot(p, rq->clock, sd);
+ tsk_cache_hot = task_hot(p, rq->clock_task, sd);
if (!tsk_cache_hot ||
sd->nr_balance_failed > sd->cache_nice_tries) {
#ifdef CONFIG_SCHEDSTATS
@@ -2031,12 +2035,17 @@ struct sd_lb_stats {
unsigned long this_load;
unsigned long this_load_per_task;
unsigned long this_nr_running;
+ unsigned long this_has_capacity;
+ unsigned int this_idle_cpus;
/* Statistics of the busiest group */
+ unsigned int busiest_idle_cpus;
unsigned long max_load;
unsigned long busiest_load_per_task;
unsigned long busiest_nr_running;
unsigned long busiest_group_capacity;
+ unsigned long busiest_has_capacity;
+ unsigned int busiest_group_weight;
int group_imb; /* Is there imbalance in this sd */
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -2058,7 +2067,10 @@ struct sg_lb_stats {
unsigned long sum_nr_running; /* Nr tasks running in the group */
unsigned long sum_weighted_load; /* Weighted load of group's tasks */
unsigned long group_capacity;
+ unsigned long idle_cpus;
+ unsigned long group_weight;
int group_imb; /* Is there an imbalance in the group ? */
+ int group_has_capacity; /* Is there extra capacity in the group? */
};
/**
@@ -2268,11 +2280,14 @@ unsigned long scale_rt_power(int cpu)
struct rq *rq = cpu_rq(cpu);
u64 total, available;
- sched_avg_update(rq);
-
total = sched_avg_period() + (rq->clock - rq->age_stamp);
- available = total - rq->rt_avg;
+ if (unlikely(total < rq->rt_avg)) {
+ /* Ensures that power won't end up being negative */
+ available = 0;
+ } else {
+ available = total - rq->rt_avg;
+ }
if (unlikely((s64)total < SCHED_LOAD_SCALE))
total = SCHED_LOAD_SCALE;
@@ -2354,7 +2369,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
int local_group, const struct cpumask *cpus,
int *balance, struct sg_lb_stats *sgs)
{
- unsigned long load, max_cpu_load, min_cpu_load;
+ unsigned long load, max_cpu_load, min_cpu_load, max_nr_running;
int i;
unsigned int balance_cpu = -1, first_idle_cpu = 0;
unsigned long avg_load_per_task = 0;
@@ -2365,6 +2380,7 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
/* Tally up the load of all CPUs in the group */
max_cpu_load = 0;
min_cpu_load = ~0UL;
+ max_nr_running = 0;
for_each_cpu_and(i, sched_group_cpus(group), cpus) {
struct rq *rq = cpu_rq(i);
@@ -2382,8 +2398,10 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
load = target_load(i, load_idx);
} else {
load = source_load(i, load_idx);
- if (load > max_cpu_load)
+ if (load > max_cpu_load) {
max_cpu_load = load;
+ max_nr_running = rq->nr_running;
+ }
if (min_cpu_load > load)
min_cpu_load = load;
}
@@ -2391,7 +2409,8 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
sgs->group_load += load;
sgs->sum_nr_running += rq->nr_running;
sgs->sum_weighted_load += weighted_cpuload(i);
-
+ if (idle_cpu(i))
+ sgs->idle_cpus++;
}
/*
@@ -2423,11 +2442,14 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
if (sgs->sum_nr_running)
avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
- if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+ if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
sgs->group_imb = 1;
- sgs->group_capacity =
- DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+ sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+ sgs->group_weight = group->group_weight;
+
+ if (sgs->group_capacity > sgs->sum_nr_running)
+ sgs->group_has_capacity = 1;
}
/**
@@ -2474,9 +2496,14 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
/*
* In case the child domain prefers tasks go to siblings
* first, lower the group capacity to one so that we'll try
- * and move all the excess tasks away.
+ * and move all the excess tasks away. We lower the capacity
+ * of a group only if the local group has the capacity to fit
+ * these excess tasks, i.e. nr_running < group_capacity. The
+ * extra check prevents the case where you always pull from the
+ * heaviest group when it is already under-utilized (possible
+ * with a large weight task outweighs the tasks on the system).
*/
- if (prefer_sibling)
+ if (prefer_sibling && !local_group && sds->this_has_capacity)
sgs.group_capacity = min(sgs.group_capacity, 1UL);
if (local_group) {
@@ -2484,14 +2511,19 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
sds->this = group;
sds->this_nr_running = sgs.sum_nr_running;
sds->this_load_per_task = sgs.sum_weighted_load;
+ sds->this_has_capacity = sgs.group_has_capacity;
+ sds->this_idle_cpus = sgs.idle_cpus;
} else if (sgs.avg_load > sds->max_load &&
(sgs.sum_nr_running > sgs.group_capacity ||
sgs.group_imb)) {
sds->max_load = sgs.avg_load;
sds->busiest = group;
sds->busiest_nr_running = sgs.sum_nr_running;
+ sds->busiest_idle_cpus = sgs.idle_cpus;
sds->busiest_group_capacity = sgs.group_capacity;
+ sds->busiest_group_weight = sgs.group_weight;
sds->busiest_load_per_task = sgs.sum_weighted_load;
+ sds->busiest_has_capacity = sgs.group_has_capacity;
sds->group_imb = sgs.group_imb;
}
@@ -2637,6 +2669,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
return fix_small_imbalance(sds, this_cpu, imbalance);
}
+
/******* find_busiest_group() helpers end here *********************/
/**
@@ -2688,6 +2721,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* 4) This group is more busy than the avg busieness at this
* sched_domain.
* 5) The imbalance is within the specified limit.
+ *
+ * Note: when doing newidle balance, if the local group has excess
+ * capacity (i.e. nr_running < group_capacity) and the busiest group
+ * does not have any capacity, we force a load balance to pull tasks
+ * to the local group. In this case, we skip past checks 3, 4 and 5.
*/
if (!(*balance))
goto ret;
@@ -2695,6 +2733,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (!sds.busiest || sds.busiest_nr_running == 0)
goto out_balanced;
+ /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+ if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
+ !sds.busiest_has_capacity)
+ goto force_balance;
+
if (sds.this_load >= sds.max_load)
goto out_balanced;
@@ -2703,9 +2746,28 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
if (sds.this_load >= sds.avg_load)
goto out_balanced;
- if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
- goto out_balanced;
+ /*
+ * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
+ * And to check for busy balance use !idle_cpu instead of
+ * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
+ * even when they are idle.
+ */
+ if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
+ if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+ goto out_balanced;
+ } else {
+ /*
+ * This cpu is idle. If the busiest group load doesn't
+ * have more tasks than the number of available cpu's and
+ * there is no imbalance between this and busiest group
+ * wrt to idle cpu's, it is balanced.
+ */
+ if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
+ sds.busiest_nr_running <= sds.busiest_group_weight)
+ goto out_balanced;
+ }
+force_balance:
/* Looks like there is an imbalance. Compute it */
calculate_imbalance(&sds, this_cpu, imbalance);
return sds.busiest;
@@ -2896,7 +2958,14 @@ redo:
if (!ld_moved) {
schedstat_inc(sd, lb_failed[idle]);
- sd->nr_balance_failed++;
+ /*
+ * Increment the failure counter only on periodic balance.
+ * We do not want newidle balance, which can be very
+ * frequent, pollute the failure counter causing
+ * excessive cache_hot migrations and active balances.
+ */
+ if (idle != CPU_NEWLY_IDLE)
+ sd->nr_balance_failed++;
if (need_active_balance(sd, sd_idle, idle)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
@@ -3017,10 +3086,8 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
interval = msecs_to_jiffies(sd->balance_interval);
if (time_after(next_balance, sd->last_balance + interval))
next_balance = sd->last_balance + interval;
- if (pulled_task) {
- this_rq->idle_stamp = 0;
+ if (pulled_task)
break;
- }
}
raw_spin_lock(&this_rq->lock);
@@ -3542,8 +3609,11 @@ static void task_fork_fair(struct task_struct *p)
raw_spin_lock_irqsave(&rq->lock, flags);
- if (unlikely(task_cpu(p) != this_cpu))
+ if (unlikely(task_cpu(p) != this_cpu)) {
+ rcu_read_lock();
__set_task_cpu(p, this_cpu);
+ rcu_read_unlock();
+ }
update_curr(cfs_rq);
@@ -3615,13 +3685,26 @@ static void set_curr_task_fair(struct rq *rq)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void moved_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int on_rq)
{
- struct cfs_rq *cfs_rq = task_cfs_rq(p);
-
- update_curr(cfs_rq);
+ /*
+ * If the task was not on the rq at the time of this cgroup movement
+ * it must have been asleep, sleeping tasks keep their ->vruntime
+ * absolute on their old rq until wakeup (needed for the fair sleeper
+ * bonus in place_entity()).
+ *
+ * If it was on the rq, we've just 'preempted' it, which does convert
+ * ->vruntime to a relative base.
+ *
+ * Make sure both cases convert their relative position when migrating
+ * to another cgroup's rq. This does somewhat interfere with the
+ * fair sleeper stuff for the first placement, but who cares.
+ */
+ if (!on_rq)
+ p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+ set_task_rq(p, task_cpu(p));
if (!on_rq)
- place_entity(cfs_rq, &p->se, 1);
+ p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
}
#endif
@@ -3673,7 +3756,7 @@ static const struct sched_class fair_sched_class = {
.get_rr_interval = get_rr_interval_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
- .moved_group = moved_group_fair,
+ .task_move_group = task_move_group_fair,
#endif
};
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 83c66e8ad3ee..185f920ec1a2 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -61,3 +61,8 @@ SCHED_FEAT(ASYM_EFF_LOAD, 1)
* release the lock. Decreases scheduling overhead.
*/
SCHED_FEAT(OWNER_SPIN, 1)
+
+/*
+ * Decrement CPU power based on irq activity
+ */
+SCHED_FEAT(NONIRQ_POWER, 1)
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8afb953e31c6..a851cc0796b3 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -609,7 +609,7 @@ static void update_curr_rt(struct rq *rq)
if (!task_has_rt_policy(curr))
return;
- delta_exec = rq->clock - curr->se.exec_start;
+ delta_exec = rq->clock_task - curr->se.exec_start;
if (unlikely((s64)delta_exec < 0))
delta_exec = 0;
@@ -618,7 +618,7 @@ static void update_curr_rt(struct rq *rq)
curr->se.sum_exec_runtime += delta_exec;
account_group_exec_runtime(curr, delta_exec);
- curr->se.exec_start = rq->clock;
+ curr->se.exec_start = rq->clock_task;
cpuacct_charge(curr, delta_exec);
sched_rt_avg_update(rq, delta_exec);
@@ -960,18 +960,19 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
* runqueue. Otherwise simply start this RT task
* on its current runqueue.
*
- * We want to avoid overloading runqueues. Even if
- * the RT task is of higher priority than the current RT task.
- * RT tasks behave differently than other tasks. If
- * one gets preempted, we try to push it off to another queue.
- * So trying to keep a preempting RT task on the same
- * cache hot CPU will force the running RT task to
- * a cold CPU. So we waste all the cache for the lower
- * RT task in hopes of saving some of a RT task
- * that is just being woken and probably will have
- * cold cache anyway.
+ * We want to avoid overloading runqueues. If the woken
+ * task is a higher priority, then it will stay on this CPU
+ * and the lower prio task should be moved to another CPU.
+ * Even though this will probably make the lower prio task
+ * lose its cache, we do not want to bounce a higher task
+ * around just because it gave up its CPU, perhaps for a
+ * lock?
+ *
+ * For equal prio tasks, we just let the scheduler sort it out.
*/
if (unlikely(rt_task(rq->curr)) &&
+ (rq->curr->rt.nr_cpus_allowed < 2 ||
+ rq->curr->prio < p->prio) &&
(p->rt.nr_cpus_allowed > 1)) {
int cpu = find_lowest_rq(p);
@@ -1074,7 +1075,7 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
} while (rt_rq);
p = rt_task_of(rt_se);
- p->se.exec_start = rq->clock;
+ p->se.exec_start = rq->clock_task;
return p;
}
@@ -1491,7 +1492,10 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
if (!task_running(rq, p) &&
!test_tsk_need_resched(rq->curr) &&
has_pushable_tasks(rq) &&
- p->rt.nr_cpus_allowed > 1)
+ p->rt.nr_cpus_allowed > 1 &&
+ rt_task(rq->curr) &&
+ (rq->curr->rt.nr_cpus_allowed < 2 ||
+ rq->curr->prio < p->prio))
push_rt_tasks(rq);
}
@@ -1712,7 +1716,7 @@ static void set_curr_task_rt(struct rq *rq)
{
struct task_struct *p = rq->curr;
- p->se.exec_start = rq->clock;
+ p->se.exec_start = rq->clock_task;
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
diff --git a/kernel/signal.c b/kernel/signal.c
index 906ae5a1779c..fdecae014340 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -637,7 +637,7 @@ static inline bool si_fromuser(const struct siginfo *info)
/*
* Bad permissions for sending the signal
- * - the caller must hold at least the RCU read lock
+ * - the caller must hold the RCU read lock
*/
static int check_kill_permission(int sig, struct siginfo *info,
struct task_struct *t)
@@ -1127,11 +1127,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long
/*
* send signal info to all the members of a group
- * - the caller must hold the RCU read lock at least
*/
int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
{
- int ret = check_kill_permission(sig, info, p);
+ int ret;
+
+ rcu_read_lock();
+ ret = check_kill_permission(sig, info, p);
+ rcu_read_unlock();
if (!ret && sig)
ret = do_send_sig_info(sig, info, p, true);
@@ -2407,9 +2410,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
return -EFAULT;
/* Not even root can pretend to send signals from the kernel.
- Nor can they impersonate a kill(), which adds source info. */
- if (info.si_code >= 0)
+ * Nor can they impersonate a kill()/tgkill(), which adds source info.
+ */
+ if (info.si_code >= 0 || info.si_code == SI_TKILL) {
+ /* We used to allow any < 0 si_code */
+ WARN_ON_ONCE(info.si_code < 0);
return -EPERM;
+ }
info.si_signo = sig;
/* POSIX.1b doesn't mention process groups. */
@@ -2423,9 +2430,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
return -EINVAL;
/* Not even root can pretend to send signals from the kernel.
- Nor can they impersonate a kill(), which adds source info. */
- if (info->si_code >= 0)
+ * Nor can they impersonate a kill()/tgkill(), which adds source info.
+ */
+ if (info->si_code >= 0 || info->si_code == SI_TKILL) {
+ /* We used to allow any < 0 si_code */
+ WARN_ON_ONCE(info->si_code < 0);
return -EPERM;
+ }
info->si_signo = sig;
return do_send_specific(tgid, pid, sig, info);
diff --git a/kernel/smp.c b/kernel/smp.c
index 75c970c715d3..b89ef31396c4 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -194,6 +194,24 @@ void generic_smp_call_function_interrupt(void)
list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
int refs;
+ /*
+ * Since we walk the list without any locks, we might
+ * see an entry that was completed, removed from the
+ * list and is in the process of being reused.
+ *
+ * We must check that the cpu is in the cpumask before
+ * checking the refs, and both must be set before
+ * executing the callback on this cpu.
+ */
+
+ if (!cpumask_test_cpu(cpu, data->cpumask))
+ continue;
+
+ smp_rmb();
+
+ if (atomic_read(&data->refs) == 0)
+ continue;
+
if (!cpumask_test_and_clear_cpu(cpu, data->cpumask))
continue;
@@ -409,7 +427,7 @@ void smp_call_function_many(const struct cpumask *mask,
{
struct call_function_data *data;
unsigned long flags;
- int cpu, next_cpu, this_cpu = smp_processor_id();
+ int refs, cpu, next_cpu, this_cpu = smp_processor_id();
/*
* Can deadlock when called with interrupts disabled.
@@ -420,7 +438,7 @@ void smp_call_function_many(const struct cpumask *mask,
WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
&& !oops_in_progress);
- /* So, what's a CPU they want? Ignoring this one. */
+ /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
cpu = cpumask_first_and(mask, cpu_online_mask);
if (cpu == this_cpu)
cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
@@ -443,11 +461,48 @@ void smp_call_function_many(const struct cpumask *mask,
data = &__get_cpu_var(cfd_data);
csd_lock(&data->csd);
+ /* This BUG_ON verifies our reuse assertions and can be removed */
+ BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
+
+ /*
+ * The global call function queue list add and delete are protected
+ * by a lock, but the list is traversed without any lock, relying
+ * on the rcu list add and delete to allow safe concurrent traversal.
+ * We reuse the call function data without waiting for any grace
+ * period after some other cpu removes it from the global queue.
+ * This means a cpu might find our data block as it is being
+ * filled out.
+ *
+ * We hold off the interrupt handler on the other cpu by
+ * ordering our writes to the cpu mask vs our setting of the
+ * refs counter. We assert only the cpu owning the data block
+ * will set a bit in cpumask, and each bit will only be cleared
+ * by the subject cpu. Each cpu must first find its bit is
+ * set and then check that refs is set indicating the element is
+ * ready to be processed, otherwise it must skip the entry.
+ *
+ * On the previous iteration refs was set to 0 by another cpu.
+ * To avoid the use of transitivity, set the counter to 0 here
+ * so the wmb will pair with the rmb in the interrupt handler.
+ */
+ atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
+
data->csd.func = func;
data->csd.info = info;
+
+ /* Ensure 0 refs is visible before mask. Also orders func and info */
+ smp_wmb();
+
+ /* We rely on the "and" being processed before the store */
cpumask_and(data->cpumask, mask, cpu_online_mask);
cpumask_clear_cpu(this_cpu, data->cpumask);
- atomic_set(&data->refs, cpumask_weight(data->cpumask));
+ refs = cpumask_weight(data->cpumask);
+
+ /* Some callers race with other cpus changing the passed mask */
+ if (unlikely(!refs)) {
+ csd_unlock(&data->csd);
+ return;
+ }
raw_spin_lock_irqsave(&call_function.lock, flags);
/*
@@ -456,6 +511,12 @@ void smp_call_function_many(const struct cpumask *mask,
* will not miss any other list entries:
*/
list_add_rcu(&data->csd.list, &call_function.queue);
+ /*
+ * We rely on the wmb() in list_add_rcu to complete our writes
+ * to the cpumask before this write to refs, which indicates
+ * data is on the list and is ready to be processed.
+ */
+ atomic_set(&data->refs, refs);
raw_spin_unlock_irqrestore(&call_function.lock, flags);
/*
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 07b4f1b1a73a..79ee8f1fc0e7 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -77,11 +77,21 @@ void wakeup_softirqd(void)
}
/*
+ * preempt_count and SOFTIRQ_OFFSET usage:
+ * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving
+ * softirq processing.
+ * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET)
+ * on local_bh_disable or local_bh_enable.
+ * This lets us distinguish between whether we are currently processing
+ * softirq and whether we just have bh disabled.
+ */
+
+/*
* This one is for softirq.c-internal use,
* where hardirqs are disabled legitimately:
*/
#ifdef CONFIG_TRACE_IRQFLAGS
-static void __local_bh_disable(unsigned long ip)
+static void __local_bh_disable(unsigned long ip, unsigned int cnt)
{
unsigned long flags;
@@ -95,32 +105,43 @@ static void __local_bh_disable(unsigned long ip)
* We must manually increment preempt_count here and manually
* call the trace_preempt_off later.
*/
- preempt_count() += SOFTIRQ_OFFSET;
+ preempt_count() += cnt;
/*
* Were softirqs turned off above:
*/
- if (softirq_count() == SOFTIRQ_OFFSET)
+ if (softirq_count() == cnt)
trace_softirqs_off(ip);
raw_local_irq_restore(flags);
- if (preempt_count() == SOFTIRQ_OFFSET)
+ if (preempt_count() == cnt)
trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
}
#else /* !CONFIG_TRACE_IRQFLAGS */
-static inline void __local_bh_disable(unsigned long ip)
+static inline void __local_bh_disable(unsigned long ip, unsigned int cnt)
{
- add_preempt_count(SOFTIRQ_OFFSET);
+ add_preempt_count(cnt);
barrier();
}
#endif /* CONFIG_TRACE_IRQFLAGS */
void local_bh_disable(void)
{
- __local_bh_disable((unsigned long)__builtin_return_address(0));
+ __local_bh_disable((unsigned long)__builtin_return_address(0),
+ SOFTIRQ_DISABLE_OFFSET);
}
EXPORT_SYMBOL(local_bh_disable);
+static void __local_bh_enable(unsigned int cnt)
+{
+ WARN_ON_ONCE(in_irq());
+ WARN_ON_ONCE(!irqs_disabled());
+
+ if (softirq_count() == cnt)
+ trace_softirqs_on((unsigned long)__builtin_return_address(0));
+ sub_preempt_count(cnt);
+}
+
/*
* Special-case - softirqs can safely be enabled in
* cond_resched_softirq(), or by __do_softirq(),
@@ -128,12 +149,7 @@ EXPORT_SYMBOL(local_bh_disable);
*/
void _local_bh_enable(void)
{
- WARN_ON_ONCE(in_irq());
- WARN_ON_ONCE(!irqs_disabled());
-
- if (softirq_count() == SOFTIRQ_OFFSET)
- trace_softirqs_on((unsigned long)__builtin_return_address(0));
- sub_preempt_count(SOFTIRQ_OFFSET);
+ __local_bh_enable(SOFTIRQ_DISABLE_OFFSET);
}
EXPORT_SYMBOL(_local_bh_enable);
@@ -147,13 +163,13 @@ static inline void _local_bh_enable_ip(unsigned long ip)
/*
* Are softirqs going to be turned on now:
*/
- if (softirq_count() == SOFTIRQ_OFFSET)
+ if (softirq_count() == SOFTIRQ_DISABLE_OFFSET)
trace_softirqs_on(ip);
/*
* Keep preemption disabled until we are done with
* softirq processing:
*/
- sub_preempt_count(SOFTIRQ_OFFSET - 1);
+ sub_preempt_count(SOFTIRQ_DISABLE_OFFSET - 1);
if (unlikely(!in_interrupt() && local_softirq_pending()))
do_softirq();
@@ -198,7 +214,8 @@ asmlinkage void __do_softirq(void)
pending = local_softirq_pending();
account_system_vtime(current);
- __local_bh_disable((unsigned long)__builtin_return_address(0));
+ __local_bh_disable((unsigned long)__builtin_return_address(0),
+ SOFTIRQ_OFFSET);
lockdep_softirq_enter();
cpu = smp_processor_id();
@@ -245,7 +262,7 @@ restart:
lockdep_softirq_exit();
account_system_vtime(current);
- _local_bh_enable();
+ __local_bh_enable(SOFTIRQ_OFFSET);
}
#ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -279,10 +296,16 @@ void irq_enter(void)
rcu_irq_enter();
if (idle_cpu(cpu) && !in_interrupt()) {
- __irq_enter();
+ /*
+ * Prevent raise_softirq from needlessly waking up ksoftirqd
+ * here, as softirq will be serviced on return from interrupt.
+ */
+ local_bh_disable();
tick_check_idle(cpu);
- } else
- __irq_enter();
+ _local_bh_enable();
+ }
+
+ __irq_enter();
}
#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -696,6 +719,7 @@ static int run_ksoftirqd(void * __bind_cpu)
{
set_current_state(TASK_INTERRUPTIBLE);
+ current->flags |= PF_KSOFTIRQD;
while (!kthread_should_stop()) {
preempt_disable();
if (!local_softirq_pending()) {
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 70f8d90331e9..44b3163defe3 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -132,8 +132,8 @@ void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg,
cpu_stop_queue_work(&per_cpu(cpu_stopper, cpu), work_buf);
}
+DEFINE_MUTEX(stop_cpus_mutex);
/* static data for stop_cpus */
-static DEFINE_MUTEX(stop_cpus_mutex);
static DEFINE_PER_CPU(struct cpu_stop_work, stop_cpus_work);
int __stop_cpus(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg)
diff --git a/kernel/sys.c b/kernel/sys.c
index e83ddbbaf89d..f4fcc6e870ff 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -931,6 +931,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
pgid = pid;
if (pgid < 0)
return -EINVAL;
+ rcu_read_lock();
/* From this point forward we keep holding onto the tasklist lock
* so that our parent does not change from under us. -DaveM
@@ -984,6 +985,7 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
out:
/* All paths lead to here, thus we are safe. -DaveM */
write_unlock_irq(&tasklist_lock);
+ rcu_read_unlock();
return err;
}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 11281d5792bd..5e21645efde1 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -292,16 +292,18 @@ ret:
static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
{
struct listener_list *listeners;
- struct listener *s, *tmp;
+ struct listener *s, *tmp, *s2;
unsigned int cpu;
if (!cpumask_subset(mask, cpu_possible_mask))
return -EINVAL;
+ s = NULL;
if (isadd == REGISTER) {
for_each_cpu(cpu, mask) {
- s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
- cpu_to_node(cpu));
+ if (!s)
+ s = kmalloc_node(sizeof(struct listener),
+ GFP_KERNEL, cpu_to_node(cpu));
if (!s)
goto cleanup;
s->pid = pid;
@@ -310,9 +312,16 @@ static int add_del_listener(pid_t pid, const struct cpumask *mask, int isadd)
listeners = &per_cpu(listener_array, cpu);
down_write(&listeners->sem);
+ list_for_each_entry_safe(s2, tmp, &listeners->list, list) {
+ if (s2->pid == pid)
+ goto next_cpu;
+ }
list_add(&s->list, &listeners->list);
+ s = NULL;
+next_cpu:
up_write(&listeners->sem);
}
+ kfree(s);
return 0;
}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index f08e99c1d561..4db1e0adc7d7 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -184,7 +184,6 @@ static struct clocksource *watchdog;
static struct timer_list watchdog_timer;
static DECLARE_WORK(watchdog_work, clocksource_watchdog_work);
static DEFINE_SPINLOCK(watchdog_lock);
-static cycle_t watchdog_last;
static int watchdog_running;
static int clocksource_watchdog_kthread(void *data);
@@ -253,11 +252,6 @@ static void clocksource_watchdog(unsigned long data)
if (!watchdog_running)
goto out;
- wdnow = watchdog->read(watchdog);
- wd_nsec = clocksource_cyc2ns((wdnow - watchdog_last) & watchdog->mask,
- watchdog->mult, watchdog->shift);
- watchdog_last = wdnow;
-
list_for_each_entry(cs, &watchdog_list, wd_list) {
/* Clocksource already marked unstable? */
@@ -267,19 +261,28 @@ static void clocksource_watchdog(unsigned long data)
continue;
}
+ local_irq_disable();
csnow = cs->read(cs);
+ wdnow = watchdog->read(watchdog);
+ local_irq_enable();
/* Clocksource initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG)) {
cs->flags |= CLOCK_SOURCE_WATCHDOG;
- cs->wd_last = csnow;
+ cs->wd_last = wdnow;
+ cs->cs_last = csnow;
continue;
}
- /* Check the deviation from the watchdog clocksource. */
- cs_nsec = clocksource_cyc2ns((csnow - cs->wd_last) &
+ wd_nsec = clocksource_cyc2ns((wdnow - cs->wd_last) & watchdog->mask,
+ watchdog->mult, watchdog->shift);
+
+ cs_nsec = clocksource_cyc2ns((csnow - cs->cs_last) &
cs->mask, cs->mult, cs->shift);
- cs->wd_last = csnow;
+ cs->cs_last = csnow;
+ cs->wd_last = wdnow;
+
+ /* Check the deviation from the watchdog clocksource. */
if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) {
clocksource_unstable(cs, cs_nsec - wd_nsec);
continue;
@@ -317,7 +320,6 @@ static inline void clocksource_start_watchdog(void)
return;
init_timer(&watchdog_timer);
watchdog_timer.function = clocksource_watchdog;
- watchdog_last = watchdog->read(watchdog);
watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask));
watchdog_running = 1;
@@ -665,8 +667,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
mutex_lock(&clocksource_mutex);
clocksource_enqueue(cs);
- clocksource_select();
clocksource_enqueue_watchdog(cs);
+ clocksource_select();
mutex_unlock(&clocksource_mutex);
return 0;
}
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index b3bafd5fc66d..0f888cb837cc 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -523,10 +523,11 @@ static void tick_broadcast_init_next_event(struct cpumask *mask,
*/
void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
+ int cpu = smp_processor_id();
+
/* Set it up only once ! */
if (bc->event_handler != tick_handle_oneshot_broadcast) {
int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
- int cpu = smp_processor_id();
bc->event_handler = tick_handle_oneshot_broadcast;
clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -552,6 +553,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
tick_broadcast_set_event(tick_next_period, 1);
} else
bc->next_event.tv64 = KTIME_MAX;
+ } else {
+ /*
+ * The first cpu which switches to oneshot mode sets
+ * the bit for all other cpus which are in the general
+ * (periodic) broadcast mask. So the bit is set and
+ * would prevent the first broadcast enter after this
+ * to program the bc device.
+ */
+ tick_broadcast_clear_oneshot(cpu);
}
}
@@ -600,4 +610,14 @@ int tick_broadcast_oneshot_active(void)
return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
}
+/*
+ * Check whether the broadcast device supports oneshot.
+ */
+bool tick_broadcast_oneshot_available(void)
+{
+ struct clock_event_device *bc = tick_broadcast_device.evtdev;
+
+ return bc ? bc->features & CLOCK_EVT_FEAT_ONESHOT : false;
+}
+
#endif
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index b6b898d2eeef..61e296ba4392 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -51,7 +51,11 @@ int tick_is_oneshot_available(void)
{
struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
- return dev && (dev->features & CLOCK_EVT_FEAT_ONESHOT);
+ if (!dev || !(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return 0;
+ if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
+ return 1;
+ return tick_broadcast_oneshot_available();
}
/*
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index 290eefbc1f60..f65d3a723a64 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -36,6 +36,7 @@ extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
extern int tick_broadcast_oneshot_active(void);
extern void tick_check_oneshot_broadcast(int cpu);
+bool tick_broadcast_oneshot_available(void);
# else /* BROADCAST */
static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
@@ -46,6 +47,7 @@ static inline void tick_broadcast_switch_to_oneshot(void) { }
static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
static inline int tick_broadcast_oneshot_active(void) { return 0; }
static inline void tick_check_oneshot_broadcast(int cpu) { }
+static inline bool tick_broadcast_oneshot_available(void) { return true; }
# endif /* !BROADCAST */
#else /* !ONESHOT */
@@ -76,6 +78,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
return 0;
}
static inline int tick_broadcast_oneshot_active(void) { return 0; }
+static inline bool tick_broadcast_oneshot_available(void) { return false; }
#endif /* !TICK_ONESHOT */
/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 813993b5fb61..f898af608171 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -325,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle)
} while (read_seqretry(&xtime_lock, seq));
if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) ||
- arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) {
+ arch_needs_cpu(cpu)) {
next_jiffies = last_jiffies + 1;
delta_jiffies = 1;
} else {
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 5fa8372e9379..4783717c2f42 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -32,6 +32,8 @@ struct timekeeper {
cycle_t cycle_interval;
/* Number of clock shifted nano seconds in one NTP interval. */
u64 xtime_interval;
+ /* shifted nano seconds left over when rounding cycle_interval */
+ s64 xtime_remainder;
/* Raw nano seconds accumulated per NTP interval. */
u32 raw_interval;
@@ -62,7 +64,7 @@ struct timekeeper timekeeper;
static void timekeeper_setup_internals(struct clocksource *clock)
{
cycle_t interval;
- u64 tmp;
+ u64 tmp, ntpinterval;
timekeeper.clock = clock;
clock->cycle_last = clock->read(clock);
@@ -70,6 +72,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
/* Do the ns -> cycle conversion first, using original mult */
tmp = NTP_INTERVAL_LENGTH;
tmp <<= clock->shift;
+ ntpinterval = tmp;
tmp += clock->mult/2;
do_div(tmp, clock->mult);
if (tmp == 0)
@@ -80,6 +83,7 @@ static void timekeeper_setup_internals(struct clocksource *clock)
/* Go back from cycles -> shifted ns */
timekeeper.xtime_interval = (u64) interval * clock->mult;
+ timekeeper.xtime_remainder = ntpinterval - timekeeper.xtime_interval;
timekeeper.raw_interval =
((u64) interval * clock->mult) >> clock->shift;
@@ -740,6 +744,7 @@ static void timekeeping_adjust(s64 offset)
static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
{
u64 nsecps = (u64)NSEC_PER_SEC << timekeeper.shift;
+ u64 raw_nsecs;
/* If the offset is smaller then a shifted interval, do nothing */
if (offset < timekeeper.cycle_interval<<shift)
@@ -756,16 +761,20 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
second_overflow();
}
- /* Accumulate into raw time */
- raw_time.tv_nsec += timekeeper.raw_interval << shift;;
- while (raw_time.tv_nsec >= NSEC_PER_SEC) {
- raw_time.tv_nsec -= NSEC_PER_SEC;
- raw_time.tv_sec++;
+ /* Accumulate raw time */
+ raw_nsecs = timekeeper.raw_interval << shift;
+ raw_nsecs += raw_time.tv_nsec;
+ if (raw_nsecs >= NSEC_PER_SEC) {
+ u64 raw_secs = raw_nsecs;
+ raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
+ raw_time.tv_sec += raw_secs;
}
+ raw_time.tv_nsec = raw_nsecs;
/* Accumulate error between NTP and clock interval */
timekeeper.ntp_error += tick_length << shift;
- timekeeper.ntp_error -= timekeeper.xtime_interval <<
+ timekeeper.ntp_error -=
+ (timekeeper.xtime_interval + timekeeper.xtime_remainder) <<
(timekeeper.ntp_error_shift + shift);
return offset;
diff --git a/kernel/timer.c b/kernel/timer.c
index ee305c8d4e18..f7dec455d826 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -1237,6 +1237,12 @@ unsigned long get_next_timer_interrupt(unsigned long now)
struct tvec_base *base = __get_cpu_var(tvec_bases);
unsigned long expires;
+ /*
+ * Pretend that there is no timer pending if the cpu is offline.
+ * Possible pending timers will be migrated later to an active cpu.
+ */
+ if (cpu_is_offline(smp_processor_id()))
+ return now + NEXT_TIMER_MAX_DELTA;
spin_lock(&base->lock);
if (time_before_eq(base->next_timer, base->timer_jiffies))
base->next_timer = __next_timer_interrupt(base);
@@ -1302,7 +1308,7 @@ void do_timer(unsigned long ticks)
{
jiffies_64 += ticks;
update_wall_time();
- calc_global_load();
+ calc_global_load(ticks);
}
#ifdef __ARCH_WANT_SYS_ALARM
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6d2cb14f9449..f23b055e0d49 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -381,12 +381,19 @@ static int function_stat_show(struct seq_file *m, void *v)
{
struct ftrace_profile *rec = v;
char str[KSYM_SYMBOL_LEN];
+ int ret = 0;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
- static DEFINE_MUTEX(mutex);
static struct trace_seq s;
unsigned long long avg;
unsigned long long stddev;
#endif
+ mutex_lock(&ftrace_profile_lock);
+
+ /* we raced with function_profile_reset() */
+ if (unlikely(rec->counter == 0)) {
+ ret = -EBUSY;
+ goto out;
+ }
kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
seq_printf(m, " %-30.30s %10lu", str, rec->counter);
@@ -408,7 +415,6 @@ static int function_stat_show(struct seq_file *m, void *v)
do_div(stddev, (rec->counter - 1) * 1000);
}
- mutex_lock(&mutex);
trace_seq_init(&s);
trace_print_graph_duration(rec->time, &s);
trace_seq_puts(&s, " ");
@@ -416,11 +422,12 @@ static int function_stat_show(struct seq_file *m, void *v)
trace_seq_puts(&s, " ");
trace_print_graph_duration(stddev, &s);
trace_print_seq(m, &s);
- mutex_unlock(&mutex);
#endif
seq_putc(m, '\n');
+out:
+ mutex_unlock(&ftrace_profile_lock);
- return 0;
+ return ret;
}
static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
@@ -1503,6 +1510,8 @@ static void *t_start(struct seq_file *m, loff_t *pos)
if (*pos > 0)
return t_hash_start(m, pos);
iter->flags |= FTRACE_ITER_PRINTALL;
+ /* reset in case of seek/pread */
+ iter->flags &= ~FTRACE_ITER_HASH;
return iter;
}
@@ -2368,14 +2377,16 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
ftrace_match_records(parser->buffer, parser->idx, enable);
}
- mutex_lock(&ftrace_lock);
- if (ftrace_start_up && ftrace_enabled)
- ftrace_run_update_code(FTRACE_ENABLE_CALLS);
- mutex_unlock(&ftrace_lock);
-
trace_parser_put(parser);
kfree(iter);
+ if (file->f_mode & FMODE_WRITE) {
+ mutex_lock(&ftrace_lock);
+ if (ftrace_start_up && ftrace_enabled)
+ ftrace_run_update_code(FTRACE_ENABLE_CALLS);
+ mutex_unlock(&ftrace_lock);
+ }
+
mutex_unlock(&ftrace_regex_lock);
return 0;
}
@@ -2410,7 +2421,7 @@ static const struct file_operations ftrace_filter_fops = {
.open = ftrace_filter_open,
.read = seq_read,
.write = ftrace_filter_write,
- .llseek = ftrace_regex_lseek,
+ .llseek = no_llseek,
.release = ftrace_filter_release,
};
@@ -3281,7 +3292,7 @@ static int start_graph_tracing(void)
/* The cpu_boot init_task->ret_stack will never be freed */
for_each_online_cpu(cpu) {
if (!idle_task(cpu)->ret_stack)
- ftrace_graph_init_task(idle_task(cpu));
+ ftrace_graph_init_idle_task(idle_task(cpu), cpu);
}
do {
@@ -3371,6 +3382,49 @@ void unregister_ftrace_graph(void)
mutex_unlock(&ftrace_lock);
}
+static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
+
+static void
+graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
+{
+ atomic_set(&t->tracing_graph_pause, 0);
+ atomic_set(&t->trace_overrun, 0);
+ t->ftrace_timestamp = 0;
+ /* make curr_ret_stack visable before we add the ret_stack */
+ smp_wmb();
+ t->ret_stack = ret_stack;
+}
+
+/*
+ * Allocate a return stack for the idle task. May be the first
+ * time through, or it may be done by CPU hotplug online.
+ */
+void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
+{
+ t->curr_ret_stack = -1;
+ /*
+ * The idle task has no parent, it either has its own
+ * stack or no stack at all.
+ */
+ if (t->ret_stack)
+ WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
+
+ if (ftrace_graph_active) {
+ struct ftrace_ret_stack *ret_stack;
+
+ ret_stack = per_cpu(idle_ret_stack, cpu);
+ if (!ret_stack) {
+ ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+ * sizeof(struct ftrace_ret_stack),
+ GFP_KERNEL);
+ if (!ret_stack)
+ return;
+ per_cpu(idle_ret_stack, cpu) = ret_stack;
+ }
+ graph_init_task(t, ret_stack);
+ }
+}
+
/* Allocate a return stack for newly created task */
void ftrace_graph_init_task(struct task_struct *t)
{
@@ -3386,12 +3440,7 @@ void ftrace_graph_init_task(struct task_struct *t)
GFP_KERNEL);
if (!ret_stack)
return;
- atomic_set(&t->tracing_graph_pause, 0);
- atomic_set(&t->trace_overrun, 0);
- t->ftrace_timestamp = 0;
- /* make curr_ret_stack visable before we add the ret_stack */
- smp_wmb();
- t->ret_stack = ret_stack;
+ graph_init_task(t, ret_stack);
}
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 1da7b6ea8b85..6cd7334ee01e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -405,7 +405,7 @@ static inline int test_time_stamp(u64 delta)
#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
/* Max number of timestamps that can fit on a page */
-#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
+#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
int ring_buffer_print_page_header(struct trace_seq *s)
{
@@ -3868,6 +3868,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
rpos = reader->read;
pos += size;
+ if (rpos >= commit)
+ break;
+
event = rb_reader_event(cpu_buffer);
size = rb_event_length(event);
} while (len > size);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 086d36316805..755d3ee9d453 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2382,11 +2382,19 @@ tracing_write_stub(struct file *filp, const char __user *ubuf,
return count;
}
+static loff_t tracing_seek(struct file *file, loff_t offset, int origin)
+{
+ if (file->f_mode & FMODE_READ)
+ return seq_lseek(file, offset, origin);
+ else
+ return 0;
+}
+
static const struct file_operations tracing_fops = {
.open = tracing_open,
.read = seq_read,
.write = tracing_write_stub,
- .llseek = seq_lseek,
+ .llseek = tracing_seek,
.release = tracing_release,
};
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2cd96399463f..75ada72f55f8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -731,6 +731,7 @@ struct event_subsystem {
struct dentry *entry;
struct event_filter *filter;
int nr_events;
+ int ref_count;
};
struct filter_pred;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 53cffc0b0801..83ff94a475ba 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -182,6 +182,35 @@ static void ftrace_clear_events(void)
mutex_unlock(&event_mutex);
}
+static void __put_system(struct event_subsystem *system)
+{
+ struct event_filter *filter = system->filter;
+
+ WARN_ON_ONCE(system->ref_count == 0);
+ if (--system->ref_count)
+ return;
+
+ if (filter) {
+ kfree(filter->filter_string);
+ kfree(filter);
+ }
+ kfree(system->name);
+ kfree(system);
+}
+
+static void __get_system(struct event_subsystem *system)
+{
+ WARN_ON_ONCE(system->ref_count == 0);
+ system->ref_count++;
+}
+
+static void put_system(struct event_subsystem *system)
+{
+ mutex_lock(&event_mutex);
+ __put_system(system);
+ mutex_unlock(&event_mutex);
+}
+
/*
* __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
*/
@@ -466,7 +495,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
const char set_to_char[4] = { '?', '0', '1', 'X' };
- const char *system = filp->private_data;
+ struct event_subsystem *system = filp->private_data;
struct ftrace_event_call *call;
char buf[2];
int set = 0;
@@ -478,7 +507,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
(!call->class->probe && !call->class->reg))
continue;
- if (system && strcmp(call->class->system, system) != 0)
+ if (system && strcmp(call->class->system, system->name) != 0)
continue;
/*
@@ -508,7 +537,8 @@ static ssize_t
system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- const char *system = filp->private_data;
+ struct event_subsystem *system = filp->private_data;
+ const char *name = NULL;
unsigned long val;
char buf[64];
ssize_t ret;
@@ -532,7 +562,14 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (val != 0 && val != 1)
return -EINVAL;
- ret = __ftrace_set_clr_event(NULL, system, NULL, val);
+ /*
+ * Opening of "enable" adds a ref count to system,
+ * so the name is safe to use.
+ */
+ if (system)
+ name = system->name;
+
+ ret = __ftrace_set_clr_event(NULL, name, NULL, val);
if (ret)
goto out;
@@ -704,6 +741,52 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
return cnt;
}
+static LIST_HEAD(event_subsystems);
+
+static int subsystem_open(struct inode *inode, struct file *filp)
+{
+ struct event_subsystem *system = NULL;
+ int ret;
+
+ if (!inode->i_private)
+ goto skip_search;
+
+ /* Make sure the system still exists */
+ mutex_lock(&event_mutex);
+ list_for_each_entry(system, &event_subsystems, list) {
+ if (system == inode->i_private) {
+ /* Don't open systems with no events */
+ if (!system->nr_events) {
+ system = NULL;
+ break;
+ }
+ __get_system(system);
+ break;
+ }
+ }
+ mutex_unlock(&event_mutex);
+
+ if (system != inode->i_private)
+ return -ENODEV;
+
+ skip_search:
+ ret = tracing_open_generic(inode, filp);
+ if (ret < 0 && system)
+ put_system(system);
+
+ return ret;
+}
+
+static int subsystem_release(struct inode *inode, struct file *file)
+{
+ struct event_subsystem *system = inode->i_private;
+
+ if (system)
+ put_system(system);
+
+ return 0;
+}
+
static ssize_t
subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
@@ -836,15 +919,17 @@ static const struct file_operations ftrace_event_filter_fops = {
};
static const struct file_operations ftrace_subsystem_filter_fops = {
- .open = tracing_open_generic,
+ .open = subsystem_open,
.read = subsystem_filter_read,
.write = subsystem_filter_write,
+ .release = subsystem_release,
};
static const struct file_operations ftrace_system_enable_fops = {
- .open = tracing_open_generic,
+ .open = subsystem_open,
.read = system_enable_read,
.write = system_enable_write,
+ .release = subsystem_release,
};
static const struct file_operations ftrace_show_header_fops = {
@@ -872,8 +957,6 @@ static struct dentry *event_trace_events_dir(void)
return d_events;
}
-static LIST_HEAD(event_subsystems);
-
static struct dentry *
event_subsystem_dir(const char *name, struct dentry *d_events)
{
@@ -883,6 +966,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
/* First see if we did not already create this dir */
list_for_each_entry(system, &event_subsystems, list) {
if (strcmp(system->name, name) == 0) {
+ __get_system(system);
system->nr_events++;
return system->entry;
}
@@ -905,6 +989,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
}
system->nr_events = 1;
+ system->ref_count = 1;
system->name = kstrdup(name, GFP_KERNEL);
if (!system->name) {
debugfs_remove(system->entry);
@@ -932,8 +1017,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
"'%s/filter' entry\n", name);
}
- trace_create_file("enable", 0644, system->entry,
- (void *)system->name,
+ trace_create_file("enable", 0644, system->entry, system,
&ftrace_system_enable_fops);
return system->entry;
@@ -1050,16 +1134,9 @@ static void remove_subsystem_dir(const char *name)
list_for_each_entry(system, &event_subsystems, list) {
if (strcmp(system->name, name) == 0) {
if (!--system->nr_events) {
- struct event_filter *filter = system->filter;
-
debugfs_remove_recursive(system->entry);
list_del(&system->list);
- if (filter) {
- kfree(filter->filter_string);
- kfree(filter);
- }
- kfree(system->name);
- kfree(system);
+ __put_system(system);
}
break;
}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 57bb1bb32999..ecf4a3e1273a 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1340,6 +1340,12 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
if (err)
goto out_unlock;
+ /* Make sure the system still has events */
+ if (!system->nr_events) {
+ err = -ENODEV;
+ goto out_unlock;
+ }
+
if (!strcmp(strstrip(filter_string), "0")) {
filter_free_subsystem_preds(system);
remove_filter_string(system->filter);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 79f4bac99a94..b4c179ae4e45 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -507,7 +507,15 @@ get_return_for_leaf(struct trace_iterator *iter,
* if the output fails.
*/
data->ent = *curr;
- data->ret = *next;
+ /*
+ * If the next event is not a return type, then
+ * we only care about what type it is. Otherwise we can
+ * safely copy the entire event.
+ */
+ if (next->ent.type == TRACE_GRAPH_RET)
+ data->ret = *next;
+ else
+ data->ret.ent.type = next->ent.type;
}
}
diff --git a/kernel/user.c b/kernel/user.c
index 7e72614b736d..5c598ca781df 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -91,6 +91,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
* upon function exit.
*/
static void free_user(struct user_struct *up, unsigned long flags)
+ __releases(&uidhash_lock)
{
uid_hash_remove(up);
spin_unlock_irqrestore(&uidhash_lock, flags);
@@ -157,6 +158,7 @@ struct user_struct *alloc_uid(struct user_namespace *ns, uid_t uid)
spin_lock_irq(&uidhash_lock);
up = uid_hash_find(uid, hashent);
if (up) {
+ put_user_ns(ns);
key_put(new->uid_keyring);
key_put(new->session_keyring);
kmem_cache_free(uid_cachep, new);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index b2d70d38dff4..25915832291a 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -9,6 +9,7 @@
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/user_namespace.h>
+#include <linux/highuid.h>
#include <linux/cred.h>
/*
@@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref)
schedule_work(&ns->destroyer);
}
EXPORT_SYMBOL(free_user_ns);
+
+uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid)
+{
+ struct user_namespace *tmp;
+
+ if (likely(to == cred->user->user_ns))
+ return uid;
+
+
+ /* Is cred->user the creator of the target user_ns
+ * or the creator of one of it's parents?
+ */
+ for ( tmp = to; tmp != &init_user_ns;
+ tmp = tmp->creator->user_ns ) {
+ if (cred->user == tmp->creator) {
+ return (uid_t)0;
+ }
+ }
+
+ /* No useful relationship so no mapping */
+ return overflowuid;
+}
+
+gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid)
+{
+ struct user_namespace *tmp;
+
+ if (likely(to == cred->user->user_ns))
+ return gid;
+
+ /* Is cred->user the creator of the target user_ns
+ * or the creator of one of it's parents?
+ */
+ for ( tmp = to; tmp != &init_user_ns;
+ tmp = tmp->creator->user_ns ) {
+ if (cred->user == tmp->creator) {
+ return (gid_t)0;
+ }
+ }
+
+ /* No useful relationship so no mapping */
+ return overflowgid;
+}