summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2010-11-18 13:22:14 +0100
committerIngo Molnar <mingo@elte.hu>2010-11-18 13:22:26 +0100
commit92fd4d4d67b945c0766416284d4ab236b31542c4 (patch)
tree00b8b5f90748f752ccaba3dddbe271091d93543a /kernel
parentfe7de49f9d4e53f24ec9ef762a503f70b562341c (diff)
parente53beacd23d9cb47590da6a7a7f6d417b941a994 (diff)
Merge commit 'v2.6.37-rc2' into sched/core
Merge reason: Move to a .37-rc base. Signed-off-by: Ingo Molnar <mingo@elte.hu>
Diffstat (limited to 'kernel')
-rw-r--r--kernel/audit.c67
-rw-r--r--kernel/audit.h5
-rw-r--r--kernel/audit_tree.c9
-rw-r--r--kernel/audit_watch.c4
-rw-r--r--kernel/auditfilter.c12
-rw-r--r--kernel/auditsc.c16
-rw-r--r--kernel/cgroup.c145
-rw-r--r--kernel/cgroup_freezer.c72
-rw-r--r--kernel/configs.c1
-rw-r--r--kernel/cpuset.c13
-rw-r--r--kernel/cred.c4
-rw-r--r--kernel/debug/debug_core.c153
-rw-r--r--kernel/debug/debug_core.h1
-rw-r--r--kernel/debug/kdb/kdb_debugger.c3
-rw-r--r--kernel/debug/kdb/kdb_io.c2
-rw-r--r--kernel/debug/kdb/kdb_main.c66
-rw-r--r--kernel/debug/kdb/kdb_private.h48
-rw-r--r--kernel/exit.c13
-rw-r--r--kernel/fork.c17
-rw-r--r--kernel/futex.c2
-rw-r--r--kernel/gcov/fs.c1
-rw-r--r--kernel/irq/irqdesc.c15
-rw-r--r--kernel/irq/manage.c4
-rw-r--r--kernel/jump_label.c77
-rw-r--r--kernel/kexec.c2
-rw-r--r--kernel/kprobes.c34
-rw-r--r--kernel/latencytop.c17
-rw-r--r--kernel/module.c2
-rw-r--r--kernel/ns_cgroup.c8
-rw-r--r--kernel/perf_event.c136
-rw-r--r--kernel/pm_qos_params.c3
-rw-r--r--kernel/power/snapshot.c18
-rw-r--r--kernel/power/swap.c6
-rw-r--r--kernel/printk.c26
-rw-r--r--kernel/profile.c1
-rw-r--r--kernel/ptrace.c36
-rw-r--r--kernel/range.c2
-rw-r--r--kernel/relay.c15
-rw-r--r--kernel/resource.c153
-rw-r--r--kernel/rtmutex-tester.c6
-rw-r--r--kernel/sched.c8
-rw-r--r--kernel/sched_fair.c25
-rw-r--r--kernel/sched_stats.h20
-rw-r--r--kernel/signal.c5
-rw-r--r--kernel/smp.c8
-rw-r--r--kernel/softirq.c18
-rw-r--r--kernel/stop_machine.c6
-rw-r--r--kernel/sysctl.c23
-rw-r--r--kernel/taskstats.c172
-rw-r--r--kernel/trace/blktrace.c20
-rw-r--r--kernel/trace/ftrace.c2
-rw-r--r--kernel/trace/ring_buffer.c336
-rw-r--r--kernel/trace/trace.c8
-rw-r--r--kernel/trace/trace_events.c6
-rw-r--r--kernel/trace/trace_kdb.c1
-rw-r--r--kernel/trace/trace_kprobe.c3
-rw-r--r--kernel/trace/trace_stack.c1
-rw-r--r--kernel/tsacct.c10
-rw-r--r--kernel/user.c1
-rw-r--r--kernel/wait.c6
-rw-r--r--kernel/watchdog.c2
-rw-r--r--kernel/workqueue.c316
62 files changed, 1301 insertions, 911 deletions
diff --git a/kernel/audit.c b/kernel/audit.c
index d96045789b54..77770a034d59 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -467,23 +467,16 @@ static int audit_prepare_user_tty(pid_t pid, uid_t loginuid, u32 sessionid)
struct task_struct *tsk;
int err;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
tsk = find_task_by_vpid(pid);
- err = -ESRCH;
- if (!tsk)
- goto out;
- err = 0;
-
- spin_lock_irq(&tsk->sighand->siglock);
- if (!tsk->signal->audit_tty)
- err = -EPERM;
- spin_unlock_irq(&tsk->sighand->siglock);
- if (err)
- goto out;
-
- tty_audit_push_task(tsk, loginuid, sessionid);
-out:
- read_unlock(&tasklist_lock);
+ if (!tsk) {
+ rcu_read_unlock();
+ return -ESRCH;
+ }
+ get_task_struct(tsk);
+ rcu_read_unlock();
+ err = tty_audit_push_task(tsk, loginuid, sessionid);
+ put_task_struct(tsk);
return err;
}
@@ -506,7 +499,7 @@ int audit_send_list(void *_dest)
}
struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
- int multi, void *payload, int size)
+ int multi, const void *payload, int size)
{
struct sk_buff *skb;
struct nlmsghdr *nlh;
@@ -555,8 +548,8 @@ static int audit_send_reply_thread(void *arg)
* Allocates an skb, builds the netlink message, and sends it to the pid.
* No failure notifications.
*/
-void audit_send_reply(int pid, int seq, int type, int done, int multi,
- void *payload, int size)
+static void audit_send_reply(int pid, int seq, int type, int done, int multi,
+ const void *payload, int size)
{
struct sk_buff *skb;
struct task_struct *tsk;
@@ -880,40 +873,40 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
case AUDIT_TTY_GET: {
struct audit_tty_status s;
struct task_struct *tsk;
+ unsigned long flags;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
tsk = find_task_by_vpid(pid);
- if (!tsk)
- err = -ESRCH;
- else {
- spin_lock_irq(&tsk->sighand->siglock);
+ if (tsk && lock_task_sighand(tsk, &flags)) {
s.enabled = tsk->signal->audit_tty != 0;
- spin_unlock_irq(&tsk->sighand->siglock);
- }
- read_unlock(&tasklist_lock);
- audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_TTY_GET, 0, 0,
- &s, sizeof(s));
+ unlock_task_sighand(tsk, &flags);
+ } else
+ err = -ESRCH;
+ rcu_read_unlock();
+
+ if (!err)
+ audit_send_reply(NETLINK_CB(skb).pid, seq,
+ AUDIT_TTY_GET, 0, 0, &s, sizeof(s));
break;
}
case AUDIT_TTY_SET: {
struct audit_tty_status *s;
struct task_struct *tsk;
+ unsigned long flags;
if (nlh->nlmsg_len < sizeof(struct audit_tty_status))
return -EINVAL;
s = data;
if (s->enabled != 0 && s->enabled != 1)
return -EINVAL;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
tsk = find_task_by_vpid(pid);
- if (!tsk)
- err = -ESRCH;
- else {
- spin_lock_irq(&tsk->sighand->siglock);
+ if (tsk && lock_task_sighand(tsk, &flags)) {
tsk->signal->audit_tty = s->enabled != 0;
- spin_unlock_irq(&tsk->sighand->siglock);
- }
- read_unlock(&tasklist_lock);
+ unlock_task_sighand(tsk, &flags);
+ } else
+ err = -ESRCH;
+ rcu_read_unlock();
break;
}
default:
diff --git a/kernel/audit.h b/kernel/audit.h
index f7206db4e13d..91e7071c4d2c 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -84,10 +84,7 @@ extern int audit_compare_dname_path(const char *dname, const char *path,
int *dirlen);
extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
int done, int multi,
- void *payload, int size);
-extern void audit_send_reply(int pid, int seq, int type,
- int done, int multi,
- void *payload, int size);
+ const void *payload, int size);
extern void audit_panic(const char *message);
struct audit_netlink_list {
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 7f18d3a4527e..37b2bea170c8 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -223,7 +223,7 @@ static void untag_chunk(struct node *p)
{
struct audit_chunk *chunk = find_chunk(p);
struct fsnotify_mark *entry = &chunk->mark;
- struct audit_chunk *new;
+ struct audit_chunk *new = NULL;
struct audit_tree *owner;
int size = chunk->count - 1;
int i, j;
@@ -232,9 +232,14 @@ static void untag_chunk(struct node *p)
spin_unlock(&hash_lock);
+ if (size)
+ new = alloc_chunk(size);
+
spin_lock(&entry->lock);
if (chunk->dead || !entry->i.inode) {
spin_unlock(&entry->lock);
+ if (new)
+ free_chunk(new);
goto out;
}
@@ -255,9 +260,9 @@ static void untag_chunk(struct node *p)
goto out;
}
- new = alloc_chunk(size);
if (!new)
goto Fallback;
+
fsnotify_duplicate_mark(&new->mark, entry);
if (fsnotify_add_mark(&new->mark, new->mark.group, new->mark.i.inode, NULL, 1)) {
free_chunk(new);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index f0c9b2e7542d..d2e3c7866460 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -60,7 +60,7 @@ struct audit_parent {
};
/* fsnotify handle. */
-struct fsnotify_group *audit_watch_group;
+static struct fsnotify_group *audit_watch_group;
/* fsnotify events we care about. */
#define AUDIT_FS_WATCH (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
@@ -123,7 +123,7 @@ void audit_put_watch(struct audit_watch *watch)
}
}
-void audit_remove_watch(struct audit_watch *watch)
+static void audit_remove_watch(struct audit_watch *watch)
{
list_del(&watch->wlist);
audit_put_parent(watch->parent);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index eb7675499fb5..add2819af71b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1252,6 +1252,18 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
case AUDIT_LOGINUID:
result = audit_comparator(cb->loginuid, f->op, f->val);
break;
+ case AUDIT_SUBJ_USER:
+ case AUDIT_SUBJ_ROLE:
+ case AUDIT_SUBJ_TYPE:
+ case AUDIT_SUBJ_SEN:
+ case AUDIT_SUBJ_CLR:
+ if (f->lsm_rule)
+ result = security_audit_rule_match(cb->sid,
+ f->type,
+ f->op,
+ f->lsm_rule,
+ NULL);
+ break;
}
if (!result)
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1b31c130d034..f49a0318c2ed 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -241,6 +241,10 @@ struct audit_context {
pid_t pid;
struct audit_cap_data cap;
} capset;
+ struct {
+ int fd;
+ int flags;
+ } mmap;
};
int fds[2];
@@ -1305,6 +1309,10 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_cap(ab, "cap_pp", &context->capset.cap.permitted);
audit_log_cap(ab, "cap_pe", &context->capset.cap.effective);
break; }
+ case AUDIT_MMAP: {
+ audit_log_format(ab, "fd=%d flags=0x%x", context->mmap.fd,
+ context->mmap.flags);
+ break; }
}
audit_log_end(ab);
}
@@ -2476,6 +2484,14 @@ void __audit_log_capset(pid_t pid,
context->type = AUDIT_CAPSET;
}
+void __audit_mmap_fd(int fd, int flags)
+{
+ struct audit_context *context = current->audit_context;
+ context->mmap.fd = fd;
+ context->mmap.flags = flags;
+ context->type = AUDIT_MMAP;
+}
+
/**
* audit_core_dumps - record information about processes that end abnormally
* @signr: signal value
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 291ba3d04bea..66a416b42c18 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -52,7 +52,6 @@
#include <linux/cgroupstats.h>
#include <linux/hash.h>
#include <linux/namei.h>
-#include <linux/smp_lock.h>
#include <linux/pid_namespace.h>
#include <linux/idr.h>
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
@@ -244,6 +243,11 @@ static int notify_on_release(const struct cgroup *cgrp)
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
}
+static int clone_children(const struct cgroup *cgrp)
+{
+ return test_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+}
+
/*
* for_each_subsys() allows you to iterate on each subsystem attached to
* an active hierarchy
@@ -778,6 +782,7 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
struct inode *inode = new_inode(sb);
if (inode) {
+ inode->i_ino = get_next_ino();
inode->i_mode = mode;
inode->i_uid = current_fsuid();
inode->i_gid = current_fsgid();
@@ -1040,6 +1045,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
seq_puts(seq, ",noprefix");
if (strlen(root->release_agent_path))
seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+ if (clone_children(&root->top_cgroup))
+ seq_puts(seq, ",clone_children");
if (strlen(root->name))
seq_printf(seq, ",name=%s", root->name);
mutex_unlock(&cgroup_mutex);
@@ -1050,6 +1057,7 @@ struct cgroup_sb_opts {
unsigned long subsys_bits;
unsigned long flags;
char *release_agent;
+ bool clone_children;
char *name;
/* User explicitly requested empty subsystem */
bool none;
@@ -1066,7 +1074,8 @@ struct cgroup_sb_opts {
*/
static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
{
- char *token, *o = data ?: "all";
+ char *token, *o = data;
+ bool all_ss = false, one_ss = false;
unsigned long mask = (unsigned long)-1;
int i;
bool module_pin_failed = false;
@@ -1082,22 +1091,27 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
while ((token = strsep(&o, ",")) != NULL) {
if (!*token)
return -EINVAL;
- if (!strcmp(token, "all")) {
- /* Add all non-disabled subsystems */
- opts->subsys_bits = 0;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss == NULL)
- continue;
- if (!ss->disabled)
- opts->subsys_bits |= 1ul << i;
- }
- } else if (!strcmp(token, "none")) {
+ if (!strcmp(token, "none")) {
/* Explicitly have no subsystems */
opts->none = true;
- } else if (!strcmp(token, "noprefix")) {
+ continue;
+ }
+ if (!strcmp(token, "all")) {
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (one_ss)
+ return -EINVAL;
+ all_ss = true;
+ continue;
+ }
+ if (!strcmp(token, "noprefix")) {
set_bit(ROOT_NOPREFIX, &opts->flags);
- } else if (!strncmp(token, "release_agent=", 14)) {
+ continue;
+ }
+ if (!strcmp(token, "clone_children")) {
+ opts->clone_children = true;
+ continue;
+ }
+ if (!strncmp(token, "release_agent=", 14)) {
/* Specifying two release agents is forbidden */
if (opts->release_agent)
return -EINVAL;
@@ -1105,7 +1119,9 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL);
if (!opts->release_agent)
return -ENOMEM;
- } else if (!strncmp(token, "name=", 5)) {
+ continue;
+ }
+ if (!strncmp(token, "name=", 5)) {
const char *name = token + 5;
/* Can't specify an empty name */
if (!strlen(name))
@@ -1127,20 +1143,44 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts)
GFP_KERNEL);
if (!opts->name)
return -ENOMEM;
- } else {
- struct cgroup_subsys *ss;
- for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
- ss = subsys[i];
- if (ss == NULL)
- continue;
- if (!strcmp(token, ss->name)) {
- if (!ss->disabled)
- set_bit(i, &opts->subsys_bits);
- break;
- }
- }
- if (i == CGROUP_SUBSYS_COUNT)
- return -ENOENT;
+
+ continue;
+ }
+
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss == NULL)
+ continue;
+ if (strcmp(token, ss->name))
+ continue;
+ if (ss->disabled)
+ continue;
+
+ /* Mutually exclusive option 'all' + subsystem name */
+ if (all_ss)
+ return -EINVAL;
+ set_bit(i, &opts->subsys_bits);
+ one_ss = true;
+
+ break;
+ }
+ if (i == CGROUP_SUBSYS_COUNT)
+ return -ENOENT;
+ }
+
+ /*
+ * If the 'all' option was specified select all the subsystems,
+ * otherwise 'all, 'none' and a subsystem name options were not
+ * specified, let's default to 'all'
+ */
+ if (all_ss || (!all_ss && !one_ss && !opts->none)) {
+ for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss == NULL)
+ continue;
+ if (ss->disabled)
+ continue;
+ set_bit(i, &opts->subsys_bits);
}
}
@@ -1222,7 +1262,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
struct cgroup *cgrp = &root->top_cgroup;
struct cgroup_sb_opts opts;
- lock_kernel();
mutex_lock(&cgrp->dentry->d_inode->i_mutex);
mutex_lock(&cgroup_mutex);
@@ -1255,7 +1294,6 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
kfree(opts.name);
mutex_unlock(&cgroup_mutex);
mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
- unlock_kernel();
return ret;
}
@@ -1357,6 +1395,8 @@ static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts)
strcpy(root->release_agent_path, opts->release_agent);
if (opts->name)
strcpy(root->name, opts->name);
+ if (opts->clone_children)
+ set_bit(CGRP_CLONE_CHILDREN, &root->top_cgroup.flags);
return root;
}
@@ -1420,9 +1460,9 @@ static int cgroup_get_rootdir(struct super_block *sb)
return 0;
}
-static int cgroup_get_sb(struct file_system_type *fs_type,
+static struct dentry *cgroup_mount(struct file_system_type *fs_type,
int flags, const char *unused_dev_name,
- void *data, struct vfsmount *mnt)
+ void *data)
{
struct cgroup_sb_opts opts;
struct cgroupfs_root *root;
@@ -1556,10 +1596,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
drop_parsed_module_refcounts(opts.subsys_bits);
}
- simple_set_mnt(mnt, sb);
kfree(opts.release_agent);
kfree(opts.name);
- return 0;
+ return dget(sb->s_root);
drop_new_super:
deactivate_locked_super(sb);
@@ -1568,8 +1607,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
out_err:
kfree(opts.release_agent);
kfree(opts.name);
-
- return ret;
+ return ERR_PTR(ret);
}
static void cgroup_kill_sb(struct super_block *sb) {
@@ -1619,7 +1657,7 @@ static void cgroup_kill_sb(struct super_block *sb) {
static struct file_system_type cgroup_fs_type = {
.name = "cgroup",
- .get_sb = cgroup_get_sb,
+ .mount = cgroup_mount,
.kill_sb = cgroup_kill_sb,
};
@@ -1883,6 +1921,8 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
const char *buffer)
{
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+ if (strlen(buffer) >= PATH_MAX)
+ return -EINVAL;
if (!cgroup_lock_live_group(cgrp))
return -ENODEV;
strcpy(cgrp->root->release_agent_path, buffer);
@@ -3176,6 +3216,23 @@ fail:
return ret;
}
+static u64 cgroup_clone_children_read(struct cgroup *cgrp,
+ struct cftype *cft)
+{
+ return clone_children(cgrp);
+}
+
+static int cgroup_clone_children_write(struct cgroup *cgrp,
+ struct cftype *cft,
+ u64 val)
+{
+ if (val)
+ set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+ else
+ clear_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+ return 0;
+}
+
/*
* for the common functions, 'private' gives the type of file
*/
@@ -3206,6 +3263,11 @@ static struct cftype files[] = {
.write_string = cgroup_write_event_control,
.mode = S_IWUGO,
},
+ {
+ .name = "cgroup.clone_children",
+ .read_u64 = cgroup_clone_children_read,
+ .write_u64 = cgroup_clone_children_write,
+ },
};
static struct cftype cft_release_agent = {
@@ -3335,6 +3397,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
if (notify_on_release(parent))
set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
+ if (clone_children(parent))
+ set_bit(CGRP_CLONE_CHILDREN, &cgrp->flags);
+
for_each_subsys(root, ss) {
struct cgroup_subsys_state *css = ss->create(ss, cgrp);
@@ -3349,6 +3414,8 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
goto err_destroy;
}
/* At error, ->destroy() callback has to free assigned ID. */
+ if (clone_children(parent) && ss->post_clone)
+ ss->post_clone(ss, cgrp);
}
cgroup_lock_hierarchy(root);
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index ce71ed53e88f..e7bebb7c6c38 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -48,20 +48,19 @@ static inline struct freezer *task_freezer(struct task_struct *task)
struct freezer, css);
}
-int cgroup_freezing_or_frozen(struct task_struct *task)
+static inline int __cgroup_freezing_or_frozen(struct task_struct *task)
{
- struct freezer *freezer;
- enum freezer_state state;
+ enum freezer_state state = task_freezer(task)->state;
+ return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
+}
+int cgroup_freezing_or_frozen(struct task_struct *task)
+{
+ int result;
task_lock(task);
- freezer = task_freezer(task);
- if (!freezer->css.cgroup->parent)
- state = CGROUP_THAWED; /* root cgroup can't be frozen */
- else
- state = freezer->state;
+ result = __cgroup_freezing_or_frozen(task);
task_unlock(task);
-
- return (state == CGROUP_FREEZING) || (state == CGROUP_FROZEN);
+ return result;
}
/*
@@ -154,13 +153,6 @@ static void freezer_destroy(struct cgroup_subsys *ss,
kfree(cgroup_freezer(cgroup));
}
-/* Task is frozen or will freeze immediately when next it gets woken */
-static bool is_task_frozen_enough(struct task_struct *task)
-{
- return frozen(task) ||
- (task_is_stopped_or_traced(task) && freezing(task));
-}
-
/*
* The call to cgroup_lock() in the freezer.state write method prevents
* a write to that file racing against an attach, and hence the
@@ -174,24 +166,25 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
/*
* Anything frozen can't move or be moved to/from.
- *
- * Since orig_freezer->state == FROZEN means that @task has been
- * frozen, so it's sufficient to check the latter condition.
*/
- if (is_task_frozen_enough(task))
+ freezer = cgroup_freezer(new_cgroup);
+ if (freezer->state != CGROUP_THAWED)
return -EBUSY;
- freezer = cgroup_freezer(new_cgroup);
- if (freezer->state == CGROUP_FROZEN)
+ rcu_read_lock();
+ if (__cgroup_freezing_or_frozen(task)) {
+ rcu_read_unlock();
return -EBUSY;
+ }
+ rcu_read_unlock();
if (threadgroup) {
struct task_struct *c;
rcu_read_lock();
list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- if (is_task_frozen_enough(c)) {
+ if (__cgroup_freezing_or_frozen(c)) {
rcu_read_unlock();
return -EBUSY;
}
@@ -236,31 +229,30 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
/*
* caller must hold freezer->lock
*/
-static void update_freezer_state(struct cgroup *cgroup,
+static void update_if_frozen(struct cgroup *cgroup,
struct freezer *freezer)
{
struct cgroup_iter it;
struct task_struct *task;
unsigned int nfrozen = 0, ntotal = 0;
+ enum freezer_state old_state = freezer->state;
cgroup_iter_start(cgroup, &it);
while ((task = cgroup_iter_next(cgroup, &it))) {
ntotal++;
- if (is_task_frozen_enough(task))
+ if (frozen(task))
nfrozen++;
}
- /*
- * Transition to FROZEN when no new tasks can be added ensures
- * that we never exist in the FROZEN state while there are unfrozen
- * tasks.
- */
- if (nfrozen == ntotal)
- freezer->state = CGROUP_FROZEN;
- else if (nfrozen > 0)
- freezer->state = CGROUP_FREEZING;
- else
- freezer->state = CGROUP_THAWED;
+ if (old_state == CGROUP_THAWED) {
+ BUG_ON(nfrozen > 0);
+ } else if (old_state == CGROUP_FREEZING) {
+ if (nfrozen == ntotal)
+ freezer->state = CGROUP_FROZEN;
+ } else { /* old_state == CGROUP_FROZEN */
+ BUG_ON(nfrozen != ntotal);
+ }
+
cgroup_iter_end(cgroup, &it);
}
@@ -279,7 +271,7 @@ static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
if (state == CGROUP_FREEZING) {
/* We change from FREEZING to FROZEN lazily if the cgroup was
* only partially frozen when we exitted write. */
- update_freezer_state(cgroup, freezer);
+ update_if_frozen(cgroup, freezer);
state = freezer->state;
}
spin_unlock_irq(&freezer->lock);
@@ -301,7 +293,7 @@ static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
while ((task = cgroup_iter_next(cgroup, &it))) {
if (!freeze_task(task, true))
continue;
- if (is_task_frozen_enough(task))
+ if (frozen(task))
continue;
if (!freezing(task) && !freezer_should_skip(task))
num_cant_freeze_now++;
@@ -335,7 +327,7 @@ static int freezer_change_state(struct cgroup *cgroup,
spin_lock_irq(&freezer->lock);
- update_freezer_state(cgroup, freezer);
+ update_if_frozen(cgroup, freezer);
if (goal_state == freezer->state)
goto out;
diff --git a/kernel/configs.c b/kernel/configs.c
index abaee684ecbf..b4066b44a99d 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -66,6 +66,7 @@ ikconfig_read_current(struct file *file, char __user *buf,
static const struct file_operations ikconfig_file_ops = {
.owner = THIS_MODULE,
.read = ikconfig_read_current,
+ .llseek = default_llseek,
};
static int __init ikconfig_init(void)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 51b143e2a07a..4349935c2ad8 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -231,18 +231,17 @@ static DEFINE_SPINLOCK(cpuset_buffer_lock);
* users. If someone tries to mount the "cpuset" filesystem, we
* silently switch it to mount "cgroup" instead
*/
-static int cpuset_get_sb(struct file_system_type *fs_type,
- int flags, const char *unused_dev_name,
- void *data, struct vfsmount *mnt)
+static struct dentry *cpuset_mount(struct file_system_type *fs_type,
+ int flags, const char *unused_dev_name, void *data)
{
struct file_system_type *cgroup_fs = get_fs_type("cgroup");
- int ret = -ENODEV;
+ struct dentry *ret = ERR_PTR(-ENODEV);
if (cgroup_fs) {
char mountopts[] =
"cpuset,noprefix,"
"release_agent=/sbin/cpuset_release_agent";
- ret = cgroup_fs->get_sb(cgroup_fs, flags,
- unused_dev_name, mountopts, mnt);
+ ret = cgroup_fs->mount(cgroup_fs, flags,
+ unused_dev_name, mountopts);
put_filesystem(cgroup_fs);
}
return ret;
@@ -250,7 +249,7 @@ static int cpuset_get_sb(struct file_system_type *fs_type,
static struct file_system_type cpuset_fs_type = {
.name = "cpuset",
- .get_sb = cpuset_get_sb,
+ .mount = cpuset_mount,
};
/*
diff --git a/kernel/cred.c b/kernel/cred.c
index 9a3e22641fe7..6a1aa004e376 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -325,7 +325,7 @@ EXPORT_SYMBOL(prepare_creds);
/*
* Prepare credentials for current to perform an execve()
- * - The caller must hold current->cred_guard_mutex
+ * - The caller must hold ->cred_guard_mutex
*/
struct cred *prepare_exec_creds(void)
{
@@ -384,8 +384,6 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
struct cred *new;
int ret;
- mutex_init(&p->cred_guard_mutex);
-
if (
#ifdef CONFIG_KEYS
!p->cred->thread_keyring &&
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index de407c78178d..cefd4a11f6d9 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -47,6 +47,7 @@
#include <linux/pid.h>
#include <linux/smp.h>
#include <linux/mm.h>
+#include <linux/rcupdate.h>
#include <asm/cacheflush.h>
#include <asm/byteorder.h>
@@ -109,13 +110,15 @@ static struct kgdb_bkpt kgdb_break[KGDB_MAX_BREAKPOINTS] = {
*/
atomic_t kgdb_active = ATOMIC_INIT(-1);
EXPORT_SYMBOL_GPL(kgdb_active);
+static DEFINE_RAW_SPINLOCK(dbg_master_lock);
+static DEFINE_RAW_SPINLOCK(dbg_slave_lock);
/*
* We use NR_CPUs not PERCPU, in case kgdb is used to debug early
* bootup code (which might not have percpu set up yet):
*/
-static atomic_t passive_cpu_wait[NR_CPUS];
-static atomic_t cpu_in_kgdb[NR_CPUS];
+static atomic_t masters_in_kgdb;
+static atomic_t slaves_in_kgdb;
static atomic_t kgdb_break_tasklet_var;
atomic_t kgdb_setting_breakpoint;
@@ -206,18 +209,6 @@ int __weak kgdb_skipexception(int exception, struct pt_regs *regs)
return 0;
}
-/**
- * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb.
- * @regs: Current &struct pt_regs.
- *
- * This function will be called if the particular architecture must
- * disable hardware debugging while it is processing gdb packets or
- * handling exception.
- */
-void __weak kgdb_disable_hw_debug(struct pt_regs *regs)
-{
-}
-
/*
* Some architectures need cache flushes when we set/clear a
* breakpoint:
@@ -457,26 +448,34 @@ static int kgdb_reenter_check(struct kgdb_state *ks)
return 1;
}
-static void dbg_cpu_switch(int cpu, int next_cpu)
+static void dbg_touch_watchdogs(void)
{
- /* Mark the cpu we are switching away from as a slave when it
- * holds the kgdb_active token. This must be done so that the
- * that all the cpus wait in for the debug core will not enter
- * again as the master. */
- if (cpu == atomic_read(&kgdb_active)) {
- kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
- kgdb_info[cpu].exception_state &= ~DCPU_WANT_MASTER;
- }
- kgdb_info[next_cpu].exception_state |= DCPU_NEXT_MASTER;
+ touch_softlockup_watchdog_sync();
+ clocksource_touch_watchdog();
+ rcu_cpu_stall_reset();
}
-static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs)
+static int kgdb_cpu_enter(struct kgdb_state *ks, struct pt_regs *regs,
+ int exception_state)
{
unsigned long flags;
int sstep_tries = 100;
int error;
- int i, cpu;
+ int cpu;
int trace_on = 0;
+ int online_cpus = num_online_cpus();
+
+ kgdb_info[ks->cpu].enter_kgdb++;
+ kgdb_info[ks->cpu].exception_state |= exception_state;
+
+ if (exception_state == DCPU_WANT_MASTER)
+ atomic_inc(&masters_in_kgdb);
+ else
+ atomic_inc(&slaves_in_kgdb);
+
+ if (arch_kgdb_ops.disable_hw_break)
+ arch_kgdb_ops.disable_hw_break(regs);
+
acquirelock:
/*
* Interrupts will be restored by the 'trap return' code, except when
@@ -489,14 +488,15 @@ acquirelock:
kgdb_info[cpu].task = current;
kgdb_info[cpu].ret_state = 0;
kgdb_info[cpu].irq_depth = hardirq_count() >> HARDIRQ_SHIFT;
- /*
- * Make sure the above info reaches the primary CPU before
- * our cpu_in_kgdb[] flag setting does:
- */
- atomic_inc(&cpu_in_kgdb[cpu]);
- if (exception_level == 1)
+ /* Make sure the above info reaches the primary CPU */
+ smp_mb();
+
+ if (exception_level == 1) {
+ if (raw_spin_trylock(&dbg_master_lock))
+ atomic_xchg(&kgdb_active, cpu);
goto cpu_master_loop;
+ }
/*
* CPU will loop if it is a slave or request to become a kgdb
@@ -508,10 +508,12 @@ cpu_loop:
kgdb_info[cpu].exception_state &= ~DCPU_NEXT_MASTER;
goto cpu_master_loop;
} else if (kgdb_info[cpu].exception_state & DCPU_WANT_MASTER) {
- if (atomic_cmpxchg(&kgdb_active, -1, cpu) == cpu)
+ if (raw_spin_trylock(&dbg_master_lock)) {
+ atomic_xchg(&kgdb_active, cpu);
break;
+ }
} else if (kgdb_info[cpu].exception_state & DCPU_IS_SLAVE) {
- if (!atomic_read(&passive_cpu_wait[cpu]))
+ if (!raw_spin_is_locked(&dbg_slave_lock))
goto return_normal;
} else {
return_normal:
@@ -522,9 +524,12 @@ return_normal:
arch_kgdb_ops.correct_hw_break();
if (trace_on)
tracing_on();
- atomic_dec(&cpu_in_kgdb[cpu]);
- touch_softlockup_watchdog_sync();
- clocksource_touch_watchdog();
+ kgdb_info[cpu].exception_state &=
+ ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
+ kgdb_info[cpu].enter_kgdb--;
+ smp_mb__before_atomic_dec();
+ atomic_dec(&slaves_in_kgdb);
+ dbg_touch_watchdogs();
local_irq_restore(flags);
return 0;
}
@@ -541,8 +546,8 @@ return_normal:
(kgdb_info[cpu].task &&
kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) {
atomic_set(&kgdb_active, -1);
- touch_softlockup_watchdog_sync();
- clocksource_touch_watchdog();
+ raw_spin_unlock(&dbg_master_lock);
+ dbg_touch_watchdogs();
local_irq_restore(flags);
goto acquirelock;
@@ -563,16 +568,12 @@ return_normal:
if (dbg_io_ops->pre_exception)
dbg_io_ops->pre_exception();
- kgdb_disable_hw_debug(ks->linux_regs);
-
/*
* Get the passive CPU lock which will hold all the non-primary
* CPU in a spin state while the debugger is active
*/
- if (!kgdb_single_step) {
- for (i = 0; i < NR_CPUS; i++)
- atomic_inc(&passive_cpu_wait[i]);
- }
+ if (!kgdb_single_step)
+ raw_spin_lock(&dbg_slave_lock);
#ifdef CONFIG_SMP
/* Signal the other CPUs to enter kgdb_wait() */
@@ -583,10 +584,9 @@ return_normal:
/*
* Wait for the other CPUs to be notified and be waiting for us:
*/
- for_each_online_cpu(i) {
- while (kgdb_do_roundup && !atomic_read(&cpu_in_kgdb[i]))
- cpu_relax();
- }
+ while (kgdb_do_roundup && (atomic_read(&masters_in_kgdb) +
+ atomic_read(&slaves_in_kgdb)) != online_cpus)
+ cpu_relax();
/*
* At this point the primary processor is completely
@@ -615,7 +615,8 @@ cpu_master_loop:
if (error == DBG_PASS_EVENT) {
dbg_kdb_mode = !dbg_kdb_mode;
} else if (error == DBG_SWITCH_CPU_EVENT) {
- dbg_cpu_switch(cpu, dbg_switch_cpu);
+ kgdb_info[dbg_switch_cpu].exception_state |=
+ DCPU_NEXT_MASTER;
goto cpu_loop;
} else {
kgdb_info[cpu].ret_state = error;
@@ -627,24 +628,11 @@ cpu_master_loop:
if (dbg_io_ops->post_exception)
dbg_io_ops->post_exception();
- atomic_dec(&cpu_in_kgdb[ks->cpu]);
-
if (!kgdb_single_step) {
- for (i = NR_CPUS-1; i >= 0; i--)
- atomic_dec(&passive_cpu_wait[i]);
- /*
- * Wait till all the CPUs have quit from the debugger,
- * but allow a CPU that hit an exception and is
- * waiting to become the master to remain in the debug
- * core.
- */
- for_each_online_cpu(i) {
- while (kgdb_do_roundup &&
- atomic_read(&cpu_in_kgdb[i]) &&
- !(kgdb_info[i].exception_state &
- DCPU_WANT_MASTER))
- cpu_relax();
- }
+ raw_spin_unlock(&dbg_slave_lock);
+ /* Wait till all the CPUs have quit from the debugger. */
+ while (kgdb_do_roundup && atomic_read(&slaves_in_kgdb))
+ cpu_relax();
}
kgdb_restore:
@@ -655,12 +643,20 @@ kgdb_restore:
else
kgdb_sstep_pid = 0;
}
+ if (arch_kgdb_ops.correct_hw_break)
+ arch_kgdb_ops.correct_hw_break();
if (trace_on)
tracing_on();
+
+ kgdb_info[cpu].exception_state &=
+ ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
+ kgdb_info[cpu].enter_kgdb--;
+ smp_mb__before_atomic_dec();
+ atomic_dec(&masters_in_kgdb);
/* Free kgdb_active */
atomic_set(&kgdb_active, -1);
- touch_softlockup_watchdog_sync();
- clocksource_touch_watchdog();
+ raw_spin_unlock(&dbg_master_lock);
+ dbg_touch_watchdogs();
local_irq_restore(flags);
return kgdb_info[cpu].ret_state;
@@ -678,7 +674,6 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
{
struct kgdb_state kgdb_var;
struct kgdb_state *ks = &kgdb_var;
- int ret;
ks->cpu = raw_smp_processor_id();
ks->ex_vector = evector;
@@ -689,11 +684,10 @@ kgdb_handle_exception(int evector, int signo, int ecode, struct pt_regs *regs)
if (kgdb_reenter_check(ks))
return 0; /* Ouch, double exception ! */
- kgdb_info[ks->cpu].exception_state |= DCPU_WANT_MASTER;
- ret = kgdb_cpu_enter(ks, regs);
- kgdb_info[ks->cpu].exception_state &= ~(DCPU_WANT_MASTER |
- DCPU_IS_SLAVE);
- return ret;
+ if (kgdb_info[ks->cpu].enter_kgdb != 0)
+ return 0;
+
+ return kgdb_cpu_enter(ks, regs, DCPU_WANT_MASTER);
}
int kgdb_nmicallback(int cpu, void *regs)
@@ -706,12 +700,9 @@ int kgdb_nmicallback(int cpu, void *regs)
ks->cpu = cpu;
ks->linux_regs = regs;
- if (!atomic_read(&cpu_in_kgdb[cpu]) &&
- atomic_read(&kgdb_active) != -1 &&
- atomic_read(&kgdb_active) != cpu) {
- kgdb_info[cpu].exception_state |= DCPU_IS_SLAVE;
- kgdb_cpu_enter(ks, regs);
- kgdb_info[cpu].exception_state &= ~DCPU_IS_SLAVE;
+ if (kgdb_info[ks->cpu].enter_kgdb == 0 &&
+ raw_spin_is_locked(&dbg_master_lock)) {
+ kgdb_cpu_enter(ks, regs, DCPU_IS_SLAVE);
return 0;
}
#endif
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
index c5d753d80f67..3494c28a7e7a 100644
--- a/kernel/debug/debug_core.h
+++ b/kernel/debug/debug_core.h
@@ -40,6 +40,7 @@ struct debuggerinfo_struct {
int exception_state;
int ret_state;
int irq_depth;
+ int enter_kgdb;
};
extern struct debuggerinfo_struct kgdb_info[];
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index bf6e8270e957..dd0b1b7dd02c 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -86,7 +86,7 @@ int kdb_stub(struct kgdb_state *ks)
}
/* Set initial kdb state variables */
KDB_STATE_CLEAR(KGDB_TRANS);
- kdb_initial_cpu = ks->cpu;
+ kdb_initial_cpu = atomic_read(&kgdb_active);
kdb_current_task = kgdb_info[ks->cpu].task;
kdb_current_regs = kgdb_info[ks->cpu].debuggerinfo;
/* Remove any breakpoints as needed by kdb and clear single step */
@@ -105,7 +105,6 @@ int kdb_stub(struct kgdb_state *ks)
ks->pass_exception = 1;
KDB_FLAG_SET(CATASTROPHIC);
}
- kdb_initial_cpu = ks->cpu;
if (KDB_STATE(SSBPT) && reason == KDB_REASON_SSTEP) {
KDB_STATE_CLEAR(SSBPT);
KDB_STATE_CLEAR(DOING_SS);
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
index c9b7f4f90bba..96fdaac46a80 100644
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -823,4 +823,4 @@ int kdb_printf(const char *fmt, ...)
return r;
}
-
+EXPORT_SYMBOL_GPL(kdb_printf);
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index caf057a3de0e..37755d621924 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -1127,7 +1127,7 @@ static int kdb_local(kdb_reason_t reason, int error, struct pt_regs *regs,
/* special case below */
} else {
kdb_printf("\nEntering kdb (current=0x%p, pid %d) ",
- kdb_current, kdb_current->pid);
+ kdb_current, kdb_current ? kdb_current->pid : 0);
#if defined(CONFIG_SMP)
kdb_printf("on processor %d ", raw_smp_processor_id());
#endif
@@ -1749,13 +1749,13 @@ static int kdb_go(int argc, const char **argv)
int nextarg;
long offset;
+ if (raw_smp_processor_id() != kdb_initial_cpu) {
+ kdb_printf("go must execute on the entry cpu, "
+ "please use \"cpu %d\" and then execute go\n",
+ kdb_initial_cpu);
+ return KDB_BADCPUNUM;
+ }
if (argc == 1) {
- if (raw_smp_processor_id() != kdb_initial_cpu) {
- kdb_printf("go <address> must be issued from the "
- "initial cpu, do cpu %d first\n",
- kdb_initial_cpu);
- return KDB_ARGCOUNT;
- }
nextarg = 1;
diag = kdbgetaddrarg(argc, argv, &nextarg,
&addr, &offset, NULL);
@@ -2603,20 +2603,17 @@ static int kdb_summary(int argc, const char **argv)
*/
static int kdb_per_cpu(int argc, const char **argv)
{
- char buf[256], fmtstr[64];
- kdb_symtab_t symtab;
- cpumask_t suppress = CPU_MASK_NONE;
- int cpu, diag;
- unsigned long addr, val, bytesperword = 0, whichcpu = ~0UL;
+ char fmtstr[64];
+ int cpu, diag, nextarg = 1;
+ unsigned long addr, symaddr, val, bytesperword = 0, whichcpu = ~0UL;
if (argc < 1 || argc > 3)
return KDB_ARGCOUNT;
- snprintf(buf, sizeof(buf), "per_cpu__%s", argv[1]);
- if (!kdbgetsymval(buf, &symtab)) {
- kdb_printf("%s is not a per_cpu variable\n", argv[1]);
- return KDB_BADADDR;
- }
+ diag = kdbgetaddrarg(argc, argv, &nextarg, &symaddr, NULL, NULL);
+ if (diag)
+ return diag;
+
if (argc >= 2) {
diag = kdbgetularg(argv[2], &bytesperword);
if (diag)
@@ -2649,46 +2646,25 @@ static int kdb_per_cpu(int argc, const char **argv)
#define KDB_PCU(cpu) 0
#endif
#endif
-
for_each_online_cpu(cpu) {
+ if (KDB_FLAG(CMD_INTERRUPT))
+ return 0;
+
if (whichcpu != ~0UL && whichcpu != cpu)
continue;
- addr = symtab.sym_start + KDB_PCU(cpu);
+ addr = symaddr + KDB_PCU(cpu);
diag = kdb_getword(&val, addr, bytesperword);
if (diag) {
kdb_printf("%5d " kdb_bfd_vma_fmt0 " - unable to "
"read, diag=%d\n", cpu, addr, diag);
continue;
}
-#ifdef CONFIG_SMP
- if (!val) {
- cpu_set(cpu, suppress);
- continue;
- }
-#endif /* CONFIG_SMP */
kdb_printf("%5d ", cpu);
kdb_md_line(fmtstr, addr,
bytesperword == KDB_WORD_SIZE,
1, bytesperword, 1, 1, 0);
}
- if (cpus_weight(suppress) == 0)
- return 0;
- kdb_printf("Zero suppressed cpu(s):");
- for (cpu = first_cpu(suppress); cpu < num_possible_cpus();
- cpu = next_cpu(cpu, suppress)) {
- kdb_printf(" %d", cpu);
- if (cpu == num_possible_cpus() - 1 ||
- next_cpu(cpu, suppress) != cpu + 1)
- continue;
- while (cpu < num_possible_cpus() &&
- next_cpu(cpu, suppress) == cpu + 1)
- ++cpu;
- kdb_printf("-%d", cpu);
- }
- kdb_printf("\n");
-
#undef KDB_PCU
-
return 0;
}
@@ -2783,6 +2759,8 @@ int kdb_register_repeat(char *cmd,
return 0;
}
+EXPORT_SYMBOL_GPL(kdb_register_repeat);
+
/*
* kdb_register - Compatibility register function for commands that do
@@ -2805,6 +2783,7 @@ int kdb_register(char *cmd,
return kdb_register_repeat(cmd, func, usage, help, minlen,
KDB_REPEAT_NONE);
}
+EXPORT_SYMBOL_GPL(kdb_register);
/*
* kdb_unregister - This function is used to unregister a kernel
@@ -2823,7 +2802,7 @@ int kdb_unregister(char *cmd)
/*
* find the command.
*/
- for (i = 0, kp = kdb_commands; i < kdb_max_commands; i++, kp++) {
+ for_each_kdbcmd(kp, i) {
if (kp->cmd_name && (strcmp(kp->cmd_name, cmd) == 0)) {
kp->cmd_name = NULL;
return 0;
@@ -2833,6 +2812,7 @@ int kdb_unregister(char *cmd)
/* Couldn't find it. */
return 1;
}
+EXPORT_SYMBOL_GPL(kdb_unregister);
/* Initialize the kdb command table. */
static void __init kdb_inittab(void)
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
index be775f7e81e0..35d69ed1dfb5 100644
--- a/kernel/debug/kdb/kdb_private.h
+++ b/kernel/debug/kdb/kdb_private.h
@@ -15,29 +15,6 @@
#include <linux/kgdb.h>
#include "../debug_core.h"
-/* Kernel Debugger Error codes. Must not overlap with command codes. */
-#define KDB_NOTFOUND (-1)
-#define KDB_ARGCOUNT (-2)
-#define KDB_BADWIDTH (-3)
-#define KDB_BADRADIX (-4)
-#define KDB_NOTENV (-5)
-#define KDB_NOENVVALUE (-6)
-#define KDB_NOTIMP (-7)
-#define KDB_ENVFULL (-8)
-#define KDB_ENVBUFFULL (-9)
-#define KDB_TOOMANYBPT (-10)
-#define KDB_TOOMANYDBREGS (-11)
-#define KDB_DUPBPT (-12)
-#define KDB_BPTNOTFOUND (-13)
-#define KDB_BADMODE (-14)
-#define KDB_BADINT (-15)
-#define KDB_INVADDRFMT (-16)
-#define KDB_BADREG (-17)
-#define KDB_BADCPUNUM (-18)
-#define KDB_BADLENGTH (-19)
-#define KDB_NOBP (-20)
-#define KDB_BADADDR (-21)
-
/* Kernel Debugger Command codes. Must not overlap with error codes. */
#define KDB_CMD_GO (-1001)
#define KDB_CMD_CPU (-1002)
@@ -93,17 +70,6 @@
*/
#define KDB_MAXBPT 16
-/* Maximum number of arguments to a function */
-#define KDB_MAXARGS 16
-
-typedef enum {
- KDB_REPEAT_NONE = 0, /* Do not repeat this command */
- KDB_REPEAT_NO_ARGS, /* Repeat the command without arguments */
- KDB_REPEAT_WITH_ARGS, /* Repeat the command including its arguments */
-} kdb_repeat_t;
-
-typedef int (*kdb_func_t)(int, const char **);
-
/* Symbol table format returned by kallsyms. */
typedef struct __ksymtab {
unsigned long value; /* Address of symbol */
@@ -123,11 +89,6 @@ extern int kallsyms_symbol_next(char *prefix_name, int flag);
extern int kallsyms_symbol_complete(char *prefix_name, int max_len);
/* Exported Symbols for kernel loadable modules to use. */
-extern int kdb_register(char *, kdb_func_t, char *, char *, short);
-extern int kdb_register_repeat(char *, kdb_func_t, char *, char *,
- short, kdb_repeat_t);
-extern int kdb_unregister(char *);
-
extern int kdb_getarea_size(void *, unsigned long, size_t);
extern int kdb_putarea_size(unsigned long, void *, size_t);
@@ -144,6 +105,7 @@ extern int kdb_getword(unsigned long *, unsigned long, size_t);
extern int kdb_putword(unsigned long, unsigned long, size_t);
extern int kdbgetularg(const char *, unsigned long *);
+extern int kdbgetu64arg(const char *, u64 *);
extern char *kdbgetenv(const char *);
extern int kdbgetaddrarg(int, const char **, int*, unsigned long *,
long *, char **);
@@ -255,14 +217,6 @@ extern void kdb_ps1(const struct task_struct *p);
extern void kdb_print_nameval(const char *name, unsigned long val);
extern void kdb_send_sig_info(struct task_struct *p, struct siginfo *info);
extern void kdb_meminfo_proc_show(void);
-#ifdef CONFIG_KALLSYMS
-extern const char *kdb_walk_kallsyms(loff_t *pos);
-#else /* ! CONFIG_KALLSYMS */
-static inline const char *kdb_walk_kallsyms(loff_t *pos)
-{
- return NULL;
-}
-#endif /* ! CONFIG_KALLSYMS */
extern char *kdb_getstr(char *, size_t, char *);
/* Defines for kdb_symbol_print */
diff --git a/kernel/exit.c b/kernel/exit.c
index e2bdf37f9fde..21aa7b3001fb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -50,6 +50,7 @@
#include <linux/perf_event.h>
#include <trace/events/sched.h>
#include <linux/hw_breakpoint.h>
+#include <linux/oom.h>
#include <asm/uaccess.h>
#include <asm/unistd.h>
@@ -95,6 +96,14 @@ static void __exit_signal(struct task_struct *tsk)
sig->tty = NULL;
} else {
/*
+ * This can only happen if the caller is de_thread().
+ * FIXME: this is the temporary hack, we should teach
+ * posix-cpu-timers to handle this case correctly.
+ */
+ if (unlikely(has_group_leader_pid(tsk)))
+ posix_cpu_timers_exit_group(tsk);
+
+ /*
* If there is any task waiting for the group exit
* then notify it:
*/
@@ -687,6 +696,8 @@ static void exit_mm(struct task_struct * tsk)
enter_lazy_tlb(mm, current);
/* We don't want this task to be frozen prematurely */
clear_freeze_flag(tsk);
+ if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+ atomic_dec(&mm->oom_disable_count);
task_unlock(tsk);
mm_update_next_owner(mm);
mmput(mm);
@@ -700,6 +711,8 @@ static void exit_mm(struct task_struct * tsk)
* space.
*/
static struct task_struct *find_new_reaper(struct task_struct *father)
+ __releases(&tasklist_lock)
+ __acquires(&tasklist_lock)
{
struct pid_namespace *pid_ns = task_active_pid_ns(father);
struct task_struct *thread;
diff --git a/kernel/fork.c b/kernel/fork.c
index c445f8cc408d..3b159c5991b7 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -65,6 +65,7 @@
#include <linux/perf_event.h>
#include <linux/posix-timers.h>
#include <linux/user-return-notifier.h>
+#include <linux/oom.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -488,6 +489,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
mm->cached_hole_size = ~0UL;
mm_init_aio(mm);
mm_init_owner(mm, p);
+ atomic_set(&mm->oom_disable_count, 0);
if (likely(!mm_alloc_pgd(mm))) {
mm->def_flags = 0;
@@ -741,6 +743,8 @@ good_mm:
/* Initializing for Swap token stuff */
mm->token_priority = 0;
mm->last_interval = 0;
+ if (tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+ atomic_inc(&mm->oom_disable_count);
tsk->mm = mm;
tsk->active_mm = mm;
@@ -904,6 +908,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
+ mutex_init(&sig->cred_guard_mutex);
+
return 0;
}
@@ -1299,8 +1305,13 @@ bad_fork_cleanup_io:
bad_fork_cleanup_namespaces:
exit_task_namespaces(p);
bad_fork_cleanup_mm:
- if (p->mm)
+ if (p->mm) {
+ task_lock(p);
+ if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+ atomic_dec(&p->mm->oom_disable_count);
+ task_unlock(p);
mmput(p->mm);
+ }
bad_fork_cleanup_signal:
if (!(clone_flags & CLONE_THREAD))
free_signal_struct(p->signal);
@@ -1693,6 +1704,10 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
active_mm = current->active_mm;
current->mm = new_mm;
current->active_mm = new_mm;
+ if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
+ atomic_dec(&mm->oom_disable_count);
+ atomic_inc(&new_mm->oom_disable_count);
+ }
activate_mm(active_mm, new_mm);
new_mm = mm;
}
diff --git a/kernel/futex.c b/kernel/futex.c
index a118bf160e0b..6c683b37f2ce 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -169,7 +169,7 @@ static void get_futex_key_refs(union futex_key *key)
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
- atomic_inc(&key->shared.inode->i_count);
+ ihold(key->shared.inode);
break;
case FUT_OFF_MMSHARED:
atomic_inc(&key->private.mm->mm_count);
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
index f83972b16564..9bd0934f6c33 100644
--- a/kernel/gcov/fs.c
+++ b/kernel/gcov/fs.c
@@ -561,6 +561,7 @@ static ssize_t reset_read(struct file *file, char __user *addr, size_t len,
static const struct file_operations gcov_reset_fops = {
.write = reset_write,
.read = reset_read,
+ .llseek = noop_llseek,
};
/*
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9d917ff72675..9988d03797f5 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -393,3 +393,18 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
struct irq_desc *desc = irq_to_desc(irq);
return desc ? desc->kstat_irqs[cpu] : 0;
}
+
+#ifdef CONFIG_GENERIC_HARDIRQS
+unsigned int kstat_irqs(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+ int cpu;
+ int sum = 0;
+
+ if (!desc)
+ return 0;
+ for_each_possible_cpu(cpu)
+ sum += desc->kstat_irqs[cpu];
+ return sum;
+}
+#endif /* CONFIG_GENERIC_HARDIRQS */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 850f030fa0c2..91a5fa25054e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -324,6 +324,10 @@ void enable_irq(unsigned int irq)
if (!desc)
return;
+ if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable,
+ KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
+ return;
+
chip_bus_lock(desc);
raw_spin_lock_irqsave(&desc->lock, flags);
__enable_irq(desc, irq, false);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 7be868bf25c6..3b79bd938330 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -39,6 +39,16 @@ struct jump_label_module_entry {
struct module *mod;
};
+void jump_label_lock(void)
+{
+ mutex_lock(&jump_label_mutex);
+}
+
+void jump_label_unlock(void)
+{
+ mutex_unlock(&jump_label_mutex);
+}
+
static int jump_label_cmp(const void *a, const void *b)
{
const struct jump_entry *jea = a;
@@ -152,7 +162,7 @@ void jump_label_update(unsigned long key, enum jump_label_type type)
struct jump_label_module_entry *e_module;
int count;
- mutex_lock(&jump_label_mutex);
+ jump_label_lock();
entry = get_jump_label_entry((jump_label_t)key);
if (entry) {
count = entry->nr_entries;
@@ -168,13 +178,14 @@ void jump_label_update(unsigned long key, enum jump_label_type type)
count = e_module->nr_entries;
iter = e_module->table;
while (count--) {
- if (kernel_text_address(iter->code))
+ if (iter->key &&
+ kernel_text_address(iter->code))
arch_jump_label_transform(iter, type);
iter++;
}
}
}
- mutex_unlock(&jump_label_mutex);
+ jump_label_unlock();
}
static int addr_conflict(struct jump_entry *entry, void *start, void *end)
@@ -231,6 +242,7 @@ out:
* overlaps with any of the jump label patch addresses. Code
* that wants to modify kernel text should first verify that
* it does not overlap with any of the jump label addresses.
+ * Caller must hold jump_label_mutex.
*
* returns 1 if there is an overlap, 0 otherwise
*/
@@ -241,7 +253,6 @@ int jump_label_text_reserved(void *start, void *end)
struct jump_entry *iter_stop = __start___jump_table;
int conflict = 0;
- mutex_lock(&jump_label_mutex);
iter = iter_start;
while (iter < iter_stop) {
if (addr_conflict(iter, start, end)) {
@@ -256,10 +267,16 @@ int jump_label_text_reserved(void *start, void *end)
conflict = module_conflict(start, end);
#endif
out:
- mutex_unlock(&jump_label_mutex);
return conflict;
}
+/*
+ * Not all archs need this.
+ */
+void __weak arch_jump_label_text_poke_early(jump_label_t addr)
+{
+}
+
static __init int init_jump_label(void)
{
int ret;
@@ -267,7 +284,7 @@ static __init int init_jump_label(void)
struct jump_entry *iter_stop = __stop___jump_table;
struct jump_entry *iter;
- mutex_lock(&jump_label_mutex);
+ jump_label_lock();
ret = build_jump_label_hashtable(__start___jump_table,
__stop___jump_table);
iter = iter_start;
@@ -275,7 +292,7 @@ static __init int init_jump_label(void)
arch_jump_label_text_poke_early(iter->code);
iter++;
}
- mutex_unlock(&jump_label_mutex);
+ jump_label_unlock();
return ret;
}
early_initcall(init_jump_label);
@@ -366,6 +383,39 @@ static void remove_jump_label_module(struct module *mod)
}
}
+static void remove_jump_label_module_init(struct module *mod)
+{
+ struct hlist_head *head;
+ struct hlist_node *node, *node_next, *module_node, *module_node_next;
+ struct jump_label_entry *e;
+ struct jump_label_module_entry *e_module;
+ struct jump_entry *iter;
+ int i, count;
+
+ /* if the module doesn't have jump label entries, just return */
+ if (!mod->num_jump_entries)
+ return;
+
+ for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
+ head = &jump_label_table[i];
+ hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
+ hlist_for_each_entry_safe(e_module, module_node,
+ module_node_next,
+ &(e->modules), hlist) {
+ if (e_module->mod != mod)
+ continue;
+ count = e_module->nr_entries;
+ iter = e_module->table;
+ while (count--) {
+ if (within_module_init(iter->code, mod))
+ iter->key = 0;
+ iter++;
+ }
+ }
+ }
+ }
+}
+
static int
jump_label_module_notify(struct notifier_block *self, unsigned long val,
void *data)
@@ -375,16 +425,21 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
switch (val) {
case MODULE_STATE_COMING:
- mutex_lock(&jump_label_mutex);
+ jump_label_lock();
ret = add_jump_label_module(mod);
if (ret)
remove_jump_label_module(mod);
- mutex_unlock(&jump_label_mutex);
+ jump_label_unlock();
break;
case MODULE_STATE_GOING:
- mutex_lock(&jump_label_mutex);
+ jump_label_lock();
remove_jump_label_module(mod);
- mutex_unlock(&jump_label_mutex);
+ jump_label_unlock();
+ break;
+ case MODULE_STATE_LIVE:
+ jump_label_lock();
+ remove_jump_label_module_init(mod);
+ jump_label_unlock();
break;
}
return ret;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c0613f7d6730..b55045bc7563 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -816,7 +816,7 @@ static int kimage_load_normal_segment(struct kimage *image,
ptr = kmap(page);
/* Start with a clear page */
- memset(ptr, 0, PAGE_SIZE);
+ clear_page(ptr);
ptr += maddr & ~PAGE_MASK;
mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
if (mchunk > mbytes)
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ec4210c6501e..9737a76e106f 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -74,7 +74,8 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
/* NOTE: change this value only with kprobe_mutex held */
static bool kprobes_all_disarmed;
-static DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
+/* This protects kprobe_table and optimizing_list */
+static DEFINE_MUTEX(kprobe_mutex);
static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
static struct {
spinlock_t lock ____cacheline_aligned_in_smp;
@@ -595,6 +596,7 @@ static __kprobes void try_to_optimize_kprobe(struct kprobe *p)
}
#ifdef CONFIG_SYSCTL
+/* This should be called with kprobe_mutex locked */
static void __kprobes optimize_all_kprobes(void)
{
struct hlist_head *head;
@@ -607,17 +609,16 @@ static void __kprobes optimize_all_kprobes(void)
return;
kprobes_allow_optimization = true;
- mutex_lock(&text_mutex);
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
head = &kprobe_table[i];
hlist_for_each_entry_rcu(p, node, head, hlist)
if (!kprobe_disabled(p))
optimize_kprobe(p);
}
- mutex_unlock(&text_mutex);
printk(KERN_INFO "Kprobes globally optimized\n");
}
+/* This should be called with kprobe_mutex locked */
static void __kprobes unoptimize_all_kprobes(void)
{
struct hlist_head *head;
@@ -1144,14 +1145,13 @@ int __kprobes register_kprobe(struct kprobe *p)
if (ret)
return ret;
+ jump_label_lock();
preempt_disable();
if (!kernel_text_address((unsigned long) p->addr) ||
in_kprobes_functions((unsigned long) p->addr) ||
ftrace_text_reserved(p->addr, p->addr) ||
- jump_label_text_reserved(p->addr, p->addr)) {
- preempt_enable();
- return -EINVAL;
- }
+ jump_label_text_reserved(p->addr, p->addr))
+ goto fail_with_jump_label;
/* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
p->flags &= KPROBE_FLAG_DISABLED;
@@ -1165,10 +1165,9 @@ int __kprobes register_kprobe(struct kprobe *p)
* We must hold a refcount of the probed module while updating
* its code to prohibit unexpected unloading.
*/
- if (unlikely(!try_module_get(probed_mod))) {
- preempt_enable();
- return -EINVAL;
- }
+ if (unlikely(!try_module_get(probed_mod)))
+ goto fail_with_jump_label;
+
/*
* If the module freed .init.text, we couldn't insert
* kprobes in there.
@@ -1176,16 +1175,18 @@ int __kprobes register_kprobe(struct kprobe *p)
if (within_module_init((unsigned long)p->addr, probed_mod) &&
probed_mod->state != MODULE_STATE_COMING) {
module_put(probed_mod);
- preempt_enable();
- return -EINVAL;
+ goto fail_with_jump_label;
}
}
preempt_enable();
+ jump_label_unlock();
p->nmissed = 0;
INIT_LIST_HEAD(&p->list);
mutex_lock(&kprobe_mutex);
+ jump_label_lock(); /* needed to call jump_label_text_reserved() */
+
get_online_cpus(); /* For avoiding text_mutex deadlock. */
mutex_lock(&text_mutex);
@@ -1213,12 +1214,18 @@ int __kprobes register_kprobe(struct kprobe *p)
out:
mutex_unlock(&text_mutex);
put_online_cpus();
+ jump_label_unlock();
mutex_unlock(&kprobe_mutex);
if (probed_mod)
module_put(probed_mod);
return ret;
+
+fail_with_jump_label:
+ preempt_enable();
+ jump_label_unlock();
+ return -EINVAL;
}
EXPORT_SYMBOL_GPL(register_kprobe);
@@ -2000,6 +2007,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
static const struct file_operations fops_kp = {
.read = read_enabled_file_bool,
.write = write_enabled_file_bool,
+ .llseek = default_llseek,
};
static int __kprobes debugfs_kprobe_init(void)
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 877fb306d415..17110a4a4fc2 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -194,14 +194,7 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
account_global_scheduler_latency(tsk, &lat);
- /*
- * short term hack; if we're > 32 we stop; future we recycle:
- */
- tsk->latency_record_count++;
- if (tsk->latency_record_count >= LT_SAVECOUNT)
- goto out_unlock;
-
- for (i = 0; i < LT_SAVECOUNT; i++) {
+ for (i = 0; i < tsk->latency_record_count; i++) {
struct latency_record *mylat;
int same = 1;
@@ -227,8 +220,14 @@ __account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
}
}
+ /*
+ * short term hack; if we're > 32 we stop; future we recycle:
+ */
+ if (tsk->latency_record_count >= LT_SAVECOUNT)
+ goto out_unlock;
+
/* Allocated a new one: */
- i = tsk->latency_record_count;
+ i = tsk->latency_record_count++;
memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
out_unlock:
diff --git a/kernel/module.c b/kernel/module.c
index 2df46301a7a4..437a74a7524a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2037,7 +2037,7 @@ static inline void layout_symtab(struct module *mod, struct load_info *info)
{
}
-static void add_kallsyms(struct module *mod, struct load_info *info)
+static void add_kallsyms(struct module *mod, const struct load_info *info)
{
}
#endif /* CONFIG_KALLSYMS */
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 2a5dfec8efe0..2c98ad94ba0e 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -85,6 +85,14 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
return ERR_PTR(-EPERM);
if (!cgroup_is_descendant(cgroup, current))
return ERR_PTR(-EPERM);
+ if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) {
+ printk("ns_cgroup can't be created with parent "
+ "'clone_children' set.\n");
+ return ERR_PTR(-EINVAL);
+ }
+
+ printk_once("ns_cgroup deprecated: consider using the "
+ "'clone_children' flag without the ns_cgroup.\n");
ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
if (!ns_cgroup)
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index f309e8014c78..cb6c0d2af68f 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -417,8 +417,8 @@ event_filter_match(struct perf_event *event)
return event->cpu == -1 || event->cpu == smp_processor_id();
}
-static int
-__event_sched_out(struct perf_event *event,
+static void
+event_sched_out(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
@@ -437,13 +437,14 @@ __event_sched_out(struct perf_event *event,
}
if (event->state != PERF_EVENT_STATE_ACTIVE)
- return 0;
+ return;
event->state = PERF_EVENT_STATE_INACTIVE;
if (event->pending_disable) {
event->pending_disable = 0;
event->state = PERF_EVENT_STATE_OFF;
}
+ event->tstamp_stopped = ctx->time;
event->pmu->del(event, 0);
event->oncpu = -1;
@@ -452,19 +453,6 @@ __event_sched_out(struct perf_event *event,
ctx->nr_active--;
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;
- return 1;
-}
-
-static void
-event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- int ret;
-
- ret = __event_sched_out(event, cpuctx, ctx);
- if (ret)
- event->tstamp_stopped = ctx->time;
}
static void
@@ -664,7 +652,7 @@ retry:
}
static int
-__event_sched_in(struct perf_event *event,
+event_sched_in(struct perf_event *event,
struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx)
{
@@ -684,6 +672,10 @@ __event_sched_in(struct perf_event *event,
return -EAGAIN;
}
+ event->tstamp_running += ctx->time - event->tstamp_stopped;
+
+ event->shadow_ctx_time = ctx->time - ctx->timestamp;
+
if (!is_software_event(event))
cpuctx->active_oncpu++;
ctx->nr_active++;
@@ -694,35 +686,6 @@ __event_sched_in(struct perf_event *event,
return 0;
}
-static inline int
-event_sched_in(struct perf_event *event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- int ret = __event_sched_in(event, cpuctx, ctx);
- if (ret)
- return ret;
- event->tstamp_running += ctx->time - event->tstamp_stopped;
- return 0;
-}
-
-static void
-group_commit_event_sched_in(struct perf_event *group_event,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- struct perf_event *event;
- u64 now = ctx->time;
-
- group_event->tstamp_running += now - group_event->tstamp_stopped;
- /*
- * Schedule in siblings as one group (if any):
- */
- list_for_each_entry(event, &group_event->sibling_list, group_entry) {
- event->tstamp_running += now - event->tstamp_stopped;
- }
-}
-
static int
group_sched_in(struct perf_event *group_event,
struct perf_cpu_context *cpuctx,
@@ -730,19 +693,15 @@ group_sched_in(struct perf_event *group_event,
{
struct perf_event *event, *partial_group = NULL;
struct pmu *pmu = group_event->pmu;
+ u64 now = ctx->time;
+ bool simulate = false;
if (group_event->state == PERF_EVENT_STATE_OFF)
return 0;
pmu->start_txn(pmu);
- /*
- * use __event_sched_in() to delay updating tstamp_running
- * until the transaction is committed. In case of failure
- * we will keep an unmodified tstamp_running which is a
- * requirement to get correct timing information
- */
- if (__event_sched_in(group_event, cpuctx, ctx)) {
+ if (event_sched_in(group_event, cpuctx, ctx)) {
pmu->cancel_txn(pmu);
return -EAGAIN;
}
@@ -751,31 +710,42 @@ group_sched_in(struct perf_event *group_event,
* Schedule in siblings as one group (if any):
*/
list_for_each_entry(event, &group_event->sibling_list, group_entry) {
- if (__event_sched_in(event, cpuctx, ctx)) {
+ if (event_sched_in(event, cpuctx, ctx)) {
partial_group = event;
goto group_error;
}
}
- if (!pmu->commit_txn(pmu)) {
- /* commit tstamp_running */
- group_commit_event_sched_in(group_event, cpuctx, ctx);
+ if (!pmu->commit_txn(pmu))
return 0;
- }
+
group_error:
/*
* Groups can be scheduled in as one unit only, so undo any
* partial group before returning:
+ * The events up to the failed event are scheduled out normally,
+ * tstamp_stopped will be updated.
*
- * use __event_sched_out() to avoid updating tstamp_stopped
- * because the event never actually ran
+ * The failed events and the remaining siblings need to have
+ * their timings updated as if they had gone thru event_sched_in()
+ * and event_sched_out(). This is required to get consistent timings
+ * across the group. This also takes care of the case where the group
+ * could never be scheduled by ensuring tstamp_stopped is set to mark
+ * the time the event was actually stopped, such that time delta
+ * calculation in update_event_times() is correct.
*/
list_for_each_entry(event, &group_event->sibling_list, group_entry) {
if (event == partial_group)
- break;
- __event_sched_out(event, cpuctx, ctx);
+ simulate = true;
+
+ if (simulate) {
+ event->tstamp_running += now - event->tstamp_stopped;
+ event->tstamp_stopped = now;
+ } else {
+ event_sched_out(event, cpuctx, ctx);
+ }
}
- __event_sched_out(group_event, cpuctx, ctx);
+ event_sched_out(group_event, cpuctx, ctx);
pmu->cancel_txn(pmu);
@@ -3428,7 +3398,8 @@ static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
}
static void perf_output_read_one(struct perf_output_handle *handle,
- struct perf_event *event)
+ struct perf_event *event,
+ u64 enabled, u64 running)
{
u64 read_format = event->attr.read_format;
u64 values[4];
@@ -3436,11 +3407,11 @@ static void perf_output_read_one(struct perf_output_handle *handle,
values[n++] = perf_event_count(event);
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
- values[n++] = event->total_time_enabled +
+ values[n++] = enabled +
atomic64_read(&event->child_total_time_enabled);
}
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
- values[n++] = event->total_time_running +
+ values[n++] = running +
atomic64_read(&event->child_total_time_running);
}
if (read_format & PERF_FORMAT_ID)
@@ -3453,7 +3424,8 @@ static void perf_output_read_one(struct perf_output_handle *handle,
* XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
*/
static void perf_output_read_group(struct perf_output_handle *handle,
- struct perf_event *event)
+ struct perf_event *event,
+ u64 enabled, u64 running)
{
struct perf_event *leader = event->group_leader, *sub;
u64 read_format = event->attr.read_format;
@@ -3463,10 +3435,10 @@ static void perf_output_read_group(struct perf_output_handle *handle,
values[n++] = 1 + leader->nr_siblings;
if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
- values[n++] = leader->total_time_enabled;
+ values[n++] = enabled;
if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
- values[n++] = leader->total_time_running;
+ values[n++] = running;
if (leader != event)
leader->pmu->read(leader);
@@ -3491,13 +3463,35 @@ static void perf_output_read_group(struct perf_output_handle *handle,
}
}
+#define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
+ PERF_FORMAT_TOTAL_TIME_RUNNING)
+
static void perf_output_read(struct perf_output_handle *handle,
struct perf_event *event)
{
+ u64 enabled = 0, running = 0, now, ctx_time;
+ u64 read_format = event->attr.read_format;
+
+ /*
+ * compute total_time_enabled, total_time_running
+ * based on snapshot values taken when the event
+ * was last scheduled in.
+ *
+ * we cannot simply called update_context_time()
+ * because of locking issue as we are called in
+ * NMI context
+ */
+ if (read_format & PERF_FORMAT_TOTAL_TIMES) {
+ now = perf_clock();
+ ctx_time = event->shadow_ctx_time + now;
+ enabled = ctx_time - event->tstamp_enabled;
+ running = ctx_time - event->tstamp_running;
+ }
+
if (event->attr.read_format & PERF_FORMAT_GROUP)
- perf_output_read_group(handle, event);
+ perf_output_read_group(handle, event, enabled, running);
else
- perf_output_read_one(handle, event);
+ perf_output_read_one(handle, event, enabled, running);
}
void perf_output_sample(struct perf_output_handle *handle,
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 645e541a45f6..c7a8f453919e 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -110,6 +110,7 @@ static const struct file_operations pm_qos_power_fops = {
.write = pm_qos_power_write,
.open = pm_qos_power_open,
.release = pm_qos_power_release,
+ .llseek = noop_llseek,
};
/* unlocked internal variant */
@@ -398,7 +399,7 @@ static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
} else
return -EINVAL;
- pm_qos_req = (struct pm_qos_request_list *)filp->private_data;
+ pm_qos_req = filp->private_data;
pm_qos_update_request(pm_qos_req, value);
return count;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index ac7eb109f196..0dac75ea4456 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -984,8 +984,8 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
src = kmap_atomic(s_page, KM_USER0);
dst = kmap_atomic(d_page, KM_USER1);
do_copy_page(dst, src);
- kunmap_atomic(src, KM_USER0);
kunmap_atomic(dst, KM_USER1);
+ kunmap_atomic(src, KM_USER0);
} else {
if (PageHighMem(d_page)) {
/* Page pointed to by src may contain some kernel
@@ -993,7 +993,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn)
*/
safe_copy_page(buffer, s_page);
dst = kmap_atomic(d_page, KM_USER0);
- memcpy(dst, buffer, PAGE_SIZE);
+ copy_page(dst, buffer);
kunmap_atomic(dst, KM_USER0);
} else {
safe_copy_page(page_address(d_page), s_page);
@@ -1687,7 +1687,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
memory_bm_position_reset(&orig_bm);
memory_bm_position_reset(&copy_bm);
} else if (handle->cur <= nr_meta_pages) {
- memset(buffer, 0, PAGE_SIZE);
+ clear_page(buffer);
pack_pfns(buffer, &orig_bm);
} else {
struct page *page;
@@ -1701,7 +1701,7 @@ int snapshot_read_next(struct snapshot_handle *handle)
void *kaddr;
kaddr = kmap_atomic(page, KM_USER0);
- memcpy(buffer, kaddr, PAGE_SIZE);
+ copy_page(buffer, kaddr);
kunmap_atomic(kaddr, KM_USER0);
handle->buffer = buffer;
} else {
@@ -1984,7 +1984,7 @@ static void copy_last_highmem_page(void)
void *dst;
dst = kmap_atomic(last_highmem_page, KM_USER0);
- memcpy(dst, buffer, PAGE_SIZE);
+ copy_page(dst, buffer);
kunmap_atomic(dst, KM_USER0);
last_highmem_page = NULL;
}
@@ -2270,11 +2270,11 @@ swap_two_pages_data(struct page *p1, struct page *p2, void *buf)
kaddr1 = kmap_atomic(p1, KM_USER0);
kaddr2 = kmap_atomic(p2, KM_USER1);
- memcpy(buf, kaddr1, PAGE_SIZE);
- memcpy(kaddr1, kaddr2, PAGE_SIZE);
- memcpy(kaddr2, buf, PAGE_SIZE);
- kunmap_atomic(kaddr1, KM_USER0);
+ copy_page(buf, kaddr1);
+ copy_page(kaddr1, kaddr2);
+ copy_page(kaddr2, buf);
kunmap_atomic(kaddr2, KM_USER1);
+ kunmap_atomic(kaddr1, KM_USER0);
}
/**
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 916eaa790399..a0e4a86ccf94 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -251,7 +251,7 @@ static int write_page(void *buf, sector_t offset, struct bio **bio_chain)
if (bio_chain) {
src = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH);
if (src) {
- memcpy(src, buf, PAGE_SIZE);
+ copy_page(src, buf);
} else {
WARN_ON_ONCE(1);
bio_chain = NULL; /* Go synchronous */
@@ -325,7 +325,7 @@ static int swap_write_page(struct swap_map_handle *handle, void *buf,
error = write_page(handle->cur, handle->cur_swap, NULL);
if (error)
goto out;
- memset(handle->cur, 0, PAGE_SIZE);
+ clear_page(handle->cur);
handle->cur_swap = offset;
handle->k = 0;
}
@@ -910,7 +910,7 @@ int swsusp_check(void)
hib_resume_bdev = open_by_devnum(swsusp_resume_device, FMODE_READ);
if (!IS_ERR(hib_resume_bdev)) {
set_blocksize(hib_resume_bdev, PAGE_SIZE);
- memset(swsusp_header, 0, PAGE_SIZE);
+ clear_page(swsusp_header);
error = hib_bio_read_page(swsusp_resume_block,
swsusp_header, NULL);
if (error)
diff --git a/kernel/printk.c b/kernel/printk.c
index 2531017795f6..9a2264fc42ca 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -210,7 +210,7 @@ __setup("log_buf_len=", log_buf_len_setup);
#ifdef CONFIG_BOOT_PRINTK_DELAY
-static unsigned int boot_delay; /* msecs delay after each printk during bootup */
+static int boot_delay; /* msecs delay after each printk during bootup */
static unsigned long long loops_per_msec; /* based on boot_delay */
static int __init boot_delay_setup(char *str)
@@ -261,6 +261,12 @@ static inline void boot_delay_msec(void)
}
#endif
+#ifdef CONFIG_SECURITY_DMESG_RESTRICT
+int dmesg_restrict = 1;
+#else
+int dmesg_restrict;
+#endif
+
int do_syslog(int type, char __user *buf, int len, bool from_file)
{
unsigned i, j, limit, count;
@@ -268,7 +274,20 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
char c;
int error = 0;
- error = security_syslog(type, from_file);
+ /*
+ * If this is from /proc/kmsg we only do the capabilities checks
+ * at open time.
+ */
+ if (type == SYSLOG_ACTION_OPEN || !from_file) {
+ if (dmesg_restrict && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if ((type != SYSLOG_ACTION_READ_ALL &&
+ type != SYSLOG_ACTION_SIZE_BUFFER) &&
+ !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ }
+
+ error = security_syslog(type);
if (error)
return error;
@@ -647,6 +666,7 @@ static inline int can_use_console(unsigned int cpu)
* released but interrupts still disabled.
*/
static int acquire_console_semaphore_for_printk(unsigned int cpu)
+ __releases(&logbuf_lock)
{
int retval = 0;
@@ -1511,7 +1531,7 @@ int kmsg_dump_unregister(struct kmsg_dumper *dumper)
}
EXPORT_SYMBOL_GPL(kmsg_dump_unregister);
-static const char const *kmsg_reasons[] = {
+static const char * const kmsg_reasons[] = {
[KMSG_DUMP_OOPS] = "oops",
[KMSG_DUMP_PANIC] = "panic",
[KMSG_DUMP_KEXEC] = "kexec",
diff --git a/kernel/profile.c b/kernel/profile.c
index b22a899934cc..66f841b7fbd3 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -555,6 +555,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
static const struct file_operations proc_profile_operations = {
.read = read_profile,
.write = write_profile,
+ .llseek = default_llseek,
};
#ifdef CONFIG_SMP
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f34d798ef4a2..99bbaa3e5b0d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -181,7 +181,7 @@ int ptrace_attach(struct task_struct *task)
* under ptrace.
*/
retval = -ERESTARTNOINTR;
- if (mutex_lock_interruptible(&task->cred_guard_mutex))
+ if (mutex_lock_interruptible(&task->signal->cred_guard_mutex))
goto out;
task_lock(task);
@@ -208,7 +208,7 @@ int ptrace_attach(struct task_struct *task)
unlock_tasklist:
write_unlock_irq(&tasklist_lock);
unlock_creds:
- mutex_unlock(&task->cred_guard_mutex);
+ mutex_unlock(&task->signal->cred_guard_mutex);
out:
return retval;
}
@@ -329,6 +329,8 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
* and reacquire the lock.
*/
void exit_ptrace(struct task_struct *tracer)
+ __releases(&tasklist_lock)
+ __acquires(&tasklist_lock)
{
struct task_struct *p, *n;
LIST_HEAD(ptrace_dead);
@@ -402,7 +404,7 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
return copied;
}
-static int ptrace_setoptions(struct task_struct *child, long data)
+static int ptrace_setoptions(struct task_struct *child, unsigned long data)
{
child->ptrace &= ~PT_TRACE_MASK;
@@ -481,7 +483,8 @@ static int ptrace_setsiginfo(struct task_struct *child, const siginfo_t *info)
#define is_sysemu_singlestep(request) 0
#endif
-static int ptrace_resume(struct task_struct *child, long request, long data)
+static int ptrace_resume(struct task_struct *child, long request,
+ unsigned long data)
{
if (!valid_signal(data))
return -EIO;
@@ -558,10 +561,12 @@ static int ptrace_regset(struct task_struct *task, int req, unsigned int type,
#endif
int ptrace_request(struct task_struct *child, long request,
- long addr, long data)
+ unsigned long addr, unsigned long data)
{
int ret = -EIO;
siginfo_t siginfo;
+ void __user *datavp = (void __user *) data;
+ unsigned long __user *datalp = datavp;
switch (request) {
case PTRACE_PEEKTEXT:
@@ -578,19 +583,17 @@ int ptrace_request(struct task_struct *child, long request,
ret = ptrace_setoptions(child, data);
break;
case PTRACE_GETEVENTMSG:
- ret = put_user(child->ptrace_message, (unsigned long __user *) data);
+ ret = put_user(child->ptrace_message, datalp);
break;
case PTRACE_GETSIGINFO:
ret = ptrace_getsiginfo(child, &siginfo);
if (!ret)
- ret = copy_siginfo_to_user((siginfo_t __user *) data,
- &siginfo);
+ ret = copy_siginfo_to_user(datavp, &siginfo);
break;
case PTRACE_SETSIGINFO:
- if (copy_from_user(&siginfo, (siginfo_t __user *) data,
- sizeof siginfo))
+ if (copy_from_user(&siginfo, datavp, sizeof siginfo))
ret = -EFAULT;
else
ret = ptrace_setsiginfo(child, &siginfo);
@@ -621,7 +624,7 @@ int ptrace_request(struct task_struct *child, long request,
}
mmput(mm);
- ret = put_user(tmp, (unsigned long __user *) data);
+ ret = put_user(tmp, datalp);
break;
}
#endif
@@ -650,7 +653,7 @@ int ptrace_request(struct task_struct *child, long request,
case PTRACE_SETREGSET:
{
struct iovec kiov;
- struct iovec __user *uiov = (struct iovec __user *) data;
+ struct iovec __user *uiov = datavp;
if (!access_ok(VERIFY_WRITE, uiov, sizeof(*uiov)))
return -EFAULT;
@@ -691,7 +694,8 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid)
#define arch_ptrace_attach(child) do { } while (0)
#endif
-SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
+SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr,
+ unsigned long, data)
{
struct task_struct *child;
long ret;
@@ -732,7 +736,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
return ret;
}
-int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
+int generic_ptrace_peekdata(struct task_struct *tsk, unsigned long addr,
+ unsigned long data)
{
unsigned long tmp;
int copied;
@@ -743,7 +748,8 @@ int generic_ptrace_peekdata(struct task_struct *tsk, long addr, long data)
return put_user(tmp, (unsigned long __user *)data);
}
-int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
+int generic_ptrace_pokedata(struct task_struct *tsk, unsigned long addr,
+ unsigned long data)
{
int copied;
diff --git a/kernel/range.c b/kernel/range.c
index 471b66acabb5..37fa9b99ad58 100644
--- a/kernel/range.c
+++ b/kernel/range.c
@@ -119,7 +119,7 @@ static int cmp_range(const void *x1, const void *x2)
int clean_sort_range(struct range *range, int az)
{
- int i, j, k = az - 1, nr_range = 0;
+ int i, j, k = az - 1, nr_range = az;
for (i = 0; i < k; i++) {
if (range[i].end)
diff --git a/kernel/relay.c b/kernel/relay.c
index c7cf397fb929..859ea5a9605f 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -70,17 +70,10 @@ static const struct vm_operations_struct relay_file_mmap_ops = {
*/
static struct page **relay_alloc_page_array(unsigned int n_pages)
{
- struct page **array;
- size_t pa_size = n_pages * sizeof(struct page *);
-
- if (pa_size > PAGE_SIZE) {
- array = vmalloc(pa_size);
- if (array)
- memset(array, 0, pa_size);
- } else {
- array = kzalloc(pa_size, GFP_KERNEL);
- }
- return array;
+ const size_t pa_size = n_pages * sizeof(struct page *);
+ if (pa_size > PAGE_SIZE)
+ return vzalloc(pa_size);
+ return kzalloc(pa_size, GFP_KERNEL);
}
/*
diff --git a/kernel/resource.c b/kernel/resource.c
index 7b36976e5dea..9fad33efd0db 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -40,6 +40,23 @@ EXPORT_SYMBOL(iomem_resource);
static DEFINE_RWLOCK(resource_lock);
+/*
+ * By default, we allocate free space bottom-up. The architecture can request
+ * top-down by clearing this flag. The user can override the architecture's
+ * choice with the "resource_alloc_from_bottom" kernel boot option, but that
+ * should only be a debugging tool.
+ */
+int resource_alloc_from_bottom = 1;
+
+static __init int setup_alloc_from_bottom(char *s)
+{
+ printk(KERN_INFO
+ "resource: allocating from bottom-up; please report a bug\n");
+ resource_alloc_from_bottom = 1;
+ return 0;
+}
+early_param("resource_alloc_from_bottom", setup_alloc_from_bottom);
+
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
struct resource *p = v;
@@ -357,8 +374,97 @@ int __weak page_is_ram(unsigned long pfn)
return walk_system_ram_range(pfn, 1, NULL, __is_ram) == 1;
}
+static resource_size_t simple_align_resource(void *data,
+ const struct resource *avail,
+ resource_size_t size,
+ resource_size_t align)
+{
+ return avail->start;
+}
+
+static void resource_clip(struct resource *res, resource_size_t min,
+ resource_size_t max)
+{
+ if (res->start < min)
+ res->start = min;
+ if (res->end > max)
+ res->end = max;
+}
+
+static bool resource_contains(struct resource *res1, struct resource *res2)
+{
+ return res1->start <= res2->start && res1->end >= res2->end;
+}
+
+/*
+ * Find the resource before "child" in the sibling list of "root" children.
+ */
+static struct resource *find_sibling_prev(struct resource *root, struct resource *child)
+{
+ struct resource *this;
+
+ for (this = root->child; this; this = this->sibling)
+ if (this->sibling == child)
+ return this;
+
+ return NULL;
+}
+
+/*
+ * Find empty slot in the resource tree given range and alignment.
+ * This version allocates from the end of the root resource first.
+ */
+static int find_resource_from_top(struct resource *root, struct resource *new,
+ resource_size_t size, resource_size_t min,
+ resource_size_t max, resource_size_t align,
+ resource_size_t (*alignf)(void *,
+ const struct resource *,
+ resource_size_t,
+ resource_size_t),
+ void *alignf_data)
+{
+ struct resource *this;
+ struct resource tmp, avail, alloc;
+
+ tmp.start = root->end;
+ tmp.end = root->end;
+
+ this = find_sibling_prev(root, NULL);
+ for (;;) {
+ if (this) {
+ if (this->end < root->end)
+ tmp.start = this->end + 1;
+ } else
+ tmp.start = root->start;
+
+ resource_clip(&tmp, min, max);
+
+ /* Check for overflow after ALIGN() */
+ avail = *new;
+ avail.start = ALIGN(tmp.start, align);
+ avail.end = tmp.end;
+ if (avail.start >= tmp.start) {
+ alloc.start = alignf(alignf_data, &avail, size, align);
+ alloc.end = alloc.start + size - 1;
+ if (resource_contains(&avail, &alloc)) {
+ new->start = alloc.start;
+ new->end = alloc.end;
+ return 0;
+ }
+ }
+
+ if (!this || this->start == root->start)
+ break;
+
+ tmp.end = this->start - 1;
+ this = find_sibling_prev(root, this);
+ }
+ return -EBUSY;
+}
+
/*
* Find empty slot in the resource tree given range and alignment.
+ * This version allocates from the beginning of the root resource first.
*/
static int find_resource(struct resource *root, struct resource *new,
resource_size_t size, resource_size_t min,
@@ -370,36 +476,43 @@ static int find_resource(struct resource *root, struct resource *new,
void *alignf_data)
{
struct resource *this = root->child;
- struct resource tmp = *new;
+ struct resource tmp = *new, avail, alloc;
tmp.start = root->start;
/*
- * Skip past an allocated resource that starts at 0, since the assignment
- * of this->start - 1 to tmp->end below would cause an underflow.
+ * Skip past an allocated resource that starts at 0, since the
+ * assignment of this->start - 1 to tmp->end below would cause an
+ * underflow.
*/
if (this && this->start == 0) {
tmp.start = this->end + 1;
this = this->sibling;
}
- for(;;) {
+ for (;;) {
if (this)
tmp.end = this->start - 1;
else
tmp.end = root->end;
- if (tmp.start < min)
- tmp.start = min;
- if (tmp.end > max)
- tmp.end = max;
- tmp.start = ALIGN(tmp.start, align);
- if (alignf)
- tmp.start = alignf(alignf_data, &tmp, size, align);
- if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) {
- new->start = tmp.start;
- new->end = tmp.start + size - 1;
- return 0;
+
+ resource_clip(&tmp, min, max);
+
+ /* Check for overflow after ALIGN() */
+ avail = *new;
+ avail.start = ALIGN(tmp.start, align);
+ avail.end = tmp.end;
+ if (avail.start >= tmp.start) {
+ alloc.start = alignf(alignf_data, &avail, size, align);
+ alloc.end = alloc.start + size - 1;
+ if (resource_contains(&avail, &alloc)) {
+ new->start = alloc.start;
+ new->end = alloc.end;
+ return 0;
+ }
}
+
if (!this)
break;
+
tmp.start = this->end + 1;
this = this->sibling;
}
@@ -428,8 +541,14 @@ int allocate_resource(struct resource *root, struct resource *new,
{
int err;
+ if (!alignf)
+ alignf = simple_align_resource;
+
write_lock(&resource_lock);
- err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
+ if (resource_alloc_from_bottom)
+ err = find_resource(root, new, size, min, max, align, alignf, alignf_data);
+ else
+ err = find_resource_from_top(root, new, size, min, max, align, alignf, alignf_data);
if (err >= 0 && __request_resource(root, new))
err = -EBUSY;
write_unlock(&resource_lock);
@@ -453,6 +572,8 @@ static struct resource * __insert_resource(struct resource *parent, struct resou
if (first == parent)
return first;
+ if (WARN_ON(first == new)) /* duplicated insertion */
+ return first;
if ((first->start > new->start) || (first->end < new->end))
break;
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index a56f629b057a..66cb89bc5ef1 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -76,7 +76,9 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
}
if (!lockwakeup && td->bkl == 4) {
+#ifdef CONFIG_LOCK_KERNEL
unlock_kernel();
+#endif
td->bkl = 0;
}
return 0;
@@ -133,14 +135,18 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
if (td->bkl)
return 0;
td->bkl = 1;
+#ifdef CONFIG_LOCK_KERNEL
lock_kernel();
+#endif
td->bkl = 4;
return 0;
case RTTEST_UNLOCKBKL:
if (td->bkl != 4)
break;
+#ifdef CONFIG_LOCK_KERNEL
unlock_kernel();
+#endif
td->bkl = 0;
return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 51944e8c38a8..41f18695b730 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8510,12 +8510,12 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
tsk->sched_class->put_prev_task(rq, tsk);
- set_task_rq(tsk, task_cpu(tsk));
-
#ifdef CONFIG_FAIR_GROUP_SCHED
- if (tsk->sched_class->moved_group)
- tsk->sched_class->moved_group(tsk, on_rq);
+ if (tsk->sched_class->task_move_group)
+ tsk->sched_class->task_move_group(tsk, on_rq);
+ else
#endif
+ set_task_rq(tsk, task_cpu(tsk));
if (unlikely(running))
tsk->sched_class->set_curr_task(rq);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 933f3d1b62ea..f4f6a8326dd0 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -3869,13 +3869,26 @@ static void set_curr_task_fair(struct rq *rq)
}
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void moved_group_fair(struct task_struct *p, int on_rq)
+static void task_move_group_fair(struct task_struct *p, int on_rq)
{
- struct cfs_rq *cfs_rq = task_cfs_rq(p);
-
- update_curr(cfs_rq);
+ /*
+ * If the task was not on the rq at the time of this cgroup movement
+ * it must have been asleep, sleeping tasks keep their ->vruntime
+ * absolute on their old rq until wakeup (needed for the fair sleeper
+ * bonus in place_entity()).
+ *
+ * If it was on the rq, we've just 'preempted' it, which does convert
+ * ->vruntime to a relative base.
+ *
+ * Make sure both cases convert their relative position when migrating
+ * to another cgroup's rq. This does somewhat interfere with the
+ * fair sleeper stuff for the first placement, but who cares.
+ */
+ if (!on_rq)
+ p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
+ set_task_rq(p, task_cpu(p));
if (!on_rq)
- place_entity(cfs_rq, &p->se, 1);
+ p->se.vruntime += cfs_rq_of(&p->se)->min_vruntime;
}
#endif
@@ -3927,7 +3940,7 @@ static const struct sched_class fair_sched_class = {
.get_rr_interval = get_rr_interval_fair,
#ifdef CONFIG_FAIR_GROUP_SCHED
- .moved_group = moved_group_fair,
+ .task_move_group = task_move_group_fair,
#endif
};
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 25c2f962f6fc..48ddf431db0e 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -157,15 +157,7 @@ static inline void sched_info_reset_dequeued(struct task_struct *t)
}
/*
- * Called when a process is dequeued from the active array and given
- * the cpu. We should note that with the exception of interactive
- * tasks, the expired queue will become the active queue after the active
- * queue is empty, without explicitly dequeuing and requeuing tasks in the
- * expired queue. (Interactive tasks may be requeued directly to the
- * active queue, thus delaying tasks in the expired queue from running;
- * see scheduler_tick()).
- *
- * Though we are interested in knowing how long it was from the *first* time a
+ * We are interested in knowing how long it was from the *first* time a
* task was queued to the time that it finally hit a cpu, we call this routine
* from dequeue_task() to account for possible rq->clock skew across cpus. The
* delta taken on each cpu would annul the skew.
@@ -203,16 +195,6 @@ static void sched_info_arrive(struct task_struct *t)
}
/*
- * Called when a process is queued into either the active or expired
- * array. The time is noted and later used to determine how long we
- * had to wait for us to reach the cpu. Since the expired queue will
- * become the active queue after active queue is empty, without dequeuing
- * and requeuing any tasks, we are interested in queuing to either. It
- * is unusual but not impossible for tasks to be dequeued and immediately
- * requeued in the same or another array: this can happen in sched_yield(),
- * set_user_nice(), and even load_balance() as it moves tasks from runqueue
- * to runqueue.
- *
* This function is only called from enqueue_task(), but also only updates
* the timestamp if it is already not set. It's assumed that
* sched_info_dequeued() will clear that stamp when appropriate.
diff --git a/kernel/signal.c b/kernel/signal.c
index 919562c3d6b7..4e3cff10fdce 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1105,7 +1105,8 @@ int zap_other_threads(struct task_struct *p)
return count;
}
-struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags)
+struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
+ unsigned long *flags)
{
struct sighand_struct *sighand;
@@ -1617,6 +1618,8 @@ static int sigkill_pending(struct task_struct *tsk)
* is gone, we keep current->exit_code unless clear_code.
*/
static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
+ __releases(&current->sighand->siglock)
+ __acquires(&current->sighand->siglock)
{
if (arch_ptrace_stop_needed(exit_code, info)) {
/*
diff --git a/kernel/smp.c b/kernel/smp.c
index ed6aacfcb7ef..12ed8b013e2d 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -267,7 +267,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data);
*
* Returns 0 on success, else a negative status code.
*/
-int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
+int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
int wait)
{
struct call_single_data d = {
@@ -336,7 +336,7 @@ EXPORT_SYMBOL(smp_call_function_single);
* 3) any other online cpu in @mask
*/
int smp_call_function_any(const struct cpumask *mask,
- void (*func)(void *info), void *info, int wait)
+ smp_call_func_t func, void *info, int wait)
{
unsigned int cpu;
const struct cpumask *nodemask;
@@ -416,7 +416,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
* must be disabled when calling this function.
*/
void smp_call_function_many(const struct cpumask *mask,
- void (*func)(void *), void *info, bool wait)
+ smp_call_func_t func, void *info, bool wait)
{
struct call_function_data *data;
unsigned long flags;
@@ -500,7 +500,7 @@ EXPORT_SYMBOL(smp_call_function_many);
* You must not call this function with disabled interrupts or from a
* hardware interrupt handler or from a bottom half handler.
*/
-int smp_call_function(void (*func)(void *), void *info, int wait)
+int smp_call_function(smp_call_func_t func, void *info, int wait)
{
preempt_disable();
smp_call_function_many(cpu_online_mask, func, info, wait);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 081869ed3a9f..d4d918a91881 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -67,7 +67,7 @@ char *softirq_to_name[NR_SOFTIRQS] = {
* to the pending events, so lets the scheduler to balance
* the softirq load for us.
*/
-void wakeup_softirqd(void)
+static void wakeup_softirqd(void)
{
/* Interrupts are disabled: no need to stop preemption */
struct task_struct *tsk = __get_cpu_var(ksoftirqd);
@@ -229,18 +229,20 @@ restart:
do {
if (pending & 1) {
+ unsigned int vec_nr = h - softirq_vec;
int prev_count = preempt_count();
- kstat_incr_softirqs_this_cpu(h - softirq_vec);
- trace_softirq_entry(h, softirq_vec);
+ kstat_incr_softirqs_this_cpu(vec_nr);
+
+ trace_softirq_entry(vec_nr);
h->action(h);
- trace_softirq_exit(h, softirq_vec);
+ trace_softirq_exit(vec_nr);
if (unlikely(prev_count != preempt_count())) {
- printk(KERN_ERR "huh, entered softirq %td %s %p"
+ printk(KERN_ERR "huh, entered softirq %u %s %p"
"with preempt_count %08x,"
- " exited with %08x?\n", h - softirq_vec,
- softirq_to_name[h - softirq_vec],
- h->action, prev_count, preempt_count());
+ " exited with %08x?\n", vec_nr,
+ softirq_to_name[vec_nr], h->action,
+ prev_count, preempt_count());
preempt_count() = prev_count;
}
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 090c28812ce1..2df820b03beb 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -262,7 +262,7 @@ repeat:
cpu_stop_fn_t fn = work->fn;
void *arg = work->arg;
struct cpu_stop_done *done = work->done;
- char ksym_buf[KSYM_NAME_LEN];
+ char ksym_buf[KSYM_NAME_LEN] __maybe_unused;
__set_current_state(TASK_RUNNING);
@@ -304,7 +304,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
cpu);
if (IS_ERR(p))
- return NOTIFY_BAD;
+ return notifier_from_errno(PTR_ERR(p));
get_task_struct(p);
kthread_bind(p, cpu);
sched_set_stop_task(cpu, p);
@@ -372,7 +372,7 @@ static int __init cpu_stop_init(void)
/* start one for the boot cpu */
err = cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_UP_PREPARE,
bcpu);
- BUG_ON(err == NOTIFY_BAD);
+ BUG_ON(err != NOTIFY_OK);
cpu_stop_cpu_callback(&cpu_stop_cpu_notifier, CPU_ONLINE, bcpu);
register_cpu_notifier(&cpu_stop_cpu_notifier);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3a45c224770f..b65bf634035e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -161,8 +161,6 @@ extern int no_unaligned_warning;
extern int unaligned_dump_stack;
#endif
-extern struct ratelimit_state printk_ratelimit_state;
-
#ifdef CONFIG_PROC_SYSCTL
static int proc_do_cad_pid(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -706,6 +704,15 @@ static struct ctl_table kern_table[] = {
},
#endif
{
+ .procname = "dmesg_restrict",
+ .data = &dmesg_restrict,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+ {
.procname = "ngroups_max",
.data = &ngroups_max,
.maxlen = sizeof (int),
@@ -1340,28 +1347,28 @@ static struct ctl_table fs_table[] = {
.data = &inodes_stat,
.maxlen = 2*sizeof(int),
.mode = 0444,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_nr_inodes,
},
{
.procname = "inode-state",
.data = &inodes_stat,
.maxlen = 7*sizeof(int),
.mode = 0444,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_nr_inodes,
},
{
.procname = "file-nr",
.data = &files_stat,
- .maxlen = 3*sizeof(int),
+ .maxlen = sizeof(files_stat),
.mode = 0444,
.proc_handler = proc_nr_files,
},
{
.procname = "file-max",
.data = &files_stat.max_files,
- .maxlen = sizeof(int),
+ .maxlen = sizeof(files_stat.max_files),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_doulongvec_minmax,
},
{
.procname = "nr_open",
@@ -1377,7 +1384,7 @@ static struct ctl_table fs_table[] = {
.data = &dentry_stat,
.maxlen = 6*sizeof(int),
.mode = 0444,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_nr_dentry,
},
{
.procname = "overflowuid",
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 11281d5792bd..c8231fb15708 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -175,22 +175,8 @@ static void send_cpu_listeners(struct sk_buff *skb,
up_write(&listeners->sem);
}
-static int fill_pid(pid_t pid, struct task_struct *tsk,
- struct taskstats *stats)
+static void fill_stats(struct task_struct *tsk, struct taskstats *stats)
{
- int rc = 0;
-
- if (!tsk) {
- rcu_read_lock();
- tsk = find_task_by_vpid(pid);
- if (tsk)
- get_task_struct(tsk);
- rcu_read_unlock();
- if (!tsk)
- return -ESRCH;
- } else
- get_task_struct(tsk);
-
memset(stats, 0, sizeof(*stats));
/*
* Each accounting subsystem adds calls to its functions to
@@ -209,17 +195,27 @@ static int fill_pid(pid_t pid, struct task_struct *tsk,
/* fill in extended acct fields */
xacct_add_tsk(stats, tsk);
+}
- /* Define err: label here if needed */
- put_task_struct(tsk);
- return rc;
+static int fill_stats_for_pid(pid_t pid, struct taskstats *stats)
+{
+ struct task_struct *tsk;
+ rcu_read_lock();
+ tsk = find_task_by_vpid(pid);
+ if (tsk)
+ get_task_struct(tsk);
+ rcu_read_unlock();
+ if (!tsk)
+ return -ESRCH;
+ fill_stats(tsk, stats);
+ put_task_struct(tsk);
+ return 0;
}
-static int fill_tgid(pid_t tgid, struct task_struct *first,
- struct taskstats *stats)
+static int fill_stats_for_tgid(pid_t tgid, struct taskstats *stats)
{
- struct task_struct *tsk;
+ struct task_struct *tsk, *first;
unsigned long flags;
int rc = -ESRCH;
@@ -228,8 +224,7 @@ static int fill_tgid(pid_t tgid, struct task_struct *first,
* leaders who are already counted with the dead tasks
*/
rcu_read_lock();
- if (!first)
- first = find_task_by_vpid(tgid);
+ first = find_task_by_vpid(tgid);
if (!first || !lock_task_sighand(first, &flags))
goto out;
@@ -268,7 +263,6 @@ out:
return rc;
}
-
static void fill_tgid_exit(struct task_struct *tsk)
{
unsigned long flags;
@@ -360,6 +354,12 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
struct nlattr *na, *ret;
int aggr;
+ /* If we don't pad, we end up with alignment on a 4 byte boundary.
+ * This causes lots of runtime warnings on systems requiring 8 byte
+ * alignment */
+ u32 pids[2] = { pid, 0 };
+ int pid_size = ALIGN(sizeof(pid), sizeof(long));
+
aggr = (type == TASKSTATS_TYPE_PID)
? TASKSTATS_TYPE_AGGR_PID
: TASKSTATS_TYPE_AGGR_TGID;
@@ -367,7 +367,7 @@ static struct taskstats *mk_reply(struct sk_buff *skb, int type, u32 pid)
na = nla_nest_start(skb, aggr);
if (!na)
goto err;
- if (nla_put(skb, type, sizeof(pid), &pid) < 0)
+ if (nla_put(skb, type, pid_size, pids) < 0)
goto err;
ret = nla_reserve(skb, TASKSTATS_TYPE_STATS, sizeof(struct taskstats));
if (!ret)
@@ -424,39 +424,46 @@ err:
return rc;
}
-static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
+static int cmd_attr_register_cpumask(struct genl_info *info)
{
- int rc;
- struct sk_buff *rep_skb;
- struct taskstats *stats;
- size_t size;
cpumask_var_t mask;
+ int rc;
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
-
rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], mask);
if (rc < 0)
- goto free_return_rc;
- if (rc == 0) {
- rc = add_del_listener(info->snd_pid, mask, REGISTER);
- goto free_return_rc;
- }
+ goto out;
+ rc = add_del_listener(info->snd_pid, mask, REGISTER);
+out:
+ free_cpumask_var(mask);
+ return rc;
+}
+
+static int cmd_attr_deregister_cpumask(struct genl_info *info)
+{
+ cpumask_var_t mask;
+ int rc;
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL))
+ return -ENOMEM;
rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], mask);
if (rc < 0)
- goto free_return_rc;
- if (rc == 0) {
- rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
-free_return_rc:
- free_cpumask_var(mask);
- return rc;
- }
+ goto out;
+ rc = add_del_listener(info->snd_pid, mask, DEREGISTER);
+out:
free_cpumask_var(mask);
+ return rc;
+}
+
+static int cmd_attr_pid(struct genl_info *info)
+{
+ struct taskstats *stats;
+ struct sk_buff *rep_skb;
+ size_t size;
+ u32 pid;
+ int rc;
- /*
- * Size includes space for nested attributes
- */
size = nla_total_size(sizeof(u32)) +
nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
@@ -465,33 +472,64 @@ free_return_rc:
return rc;
rc = -EINVAL;
- if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
- u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
- stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
- if (!stats)
- goto err;
-
- rc = fill_pid(pid, NULL, stats);
- if (rc < 0)
- goto err;
- } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
- u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
- stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
- if (!stats)
- goto err;
-
- rc = fill_tgid(tgid, NULL, stats);
- if (rc < 0)
- goto err;
- } else
+ pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_PID, pid);
+ if (!stats)
+ goto err;
+
+ rc = fill_stats_for_pid(pid, stats);
+ if (rc < 0)
+ goto err;
+ return send_reply(rep_skb, info);
+err:
+ nlmsg_free(rep_skb);
+ return rc;
+}
+
+static int cmd_attr_tgid(struct genl_info *info)
+{
+ struct taskstats *stats;
+ struct sk_buff *rep_skb;
+ size_t size;
+ u32 tgid;
+ int rc;
+
+ size = nla_total_size(sizeof(u32)) +
+ nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+ rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, size);
+ if (rc < 0)
+ return rc;
+
+ rc = -EINVAL;
+ tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
+ stats = mk_reply(rep_skb, TASKSTATS_TYPE_TGID, tgid);
+ if (!stats)
goto err;
+ rc = fill_stats_for_tgid(tgid, stats);
+ if (rc < 0)
+ goto err;
return send_reply(rep_skb, info);
err:
nlmsg_free(rep_skb);
return rc;
}
+static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+ if (info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK])
+ return cmd_attr_register_cpumask(info);
+ else if (info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK])
+ return cmd_attr_deregister_cpumask(info);
+ else if (info->attrs[TASKSTATS_CMD_ATTR_PID])
+ return cmd_attr_pid(info);
+ else if (info->attrs[TASKSTATS_CMD_ATTR_TGID])
+ return cmd_attr_tgid(info);
+ else
+ return -EINVAL;
+}
+
static struct taskstats *taskstats_tgid_alloc(struct task_struct *tsk)
{
struct signal_struct *sig = tsk->signal;
@@ -555,9 +593,7 @@ void taskstats_exit(struct task_struct *tsk, int group_dead)
if (!stats)
goto err;
- rc = fill_pid(-1, tsk, stats);
- if (rc < 0)
- goto err;
+ fill_stats(tsk, stats);
/*
* Doesn't matter if tsk is the leader or the last group member leaving
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 959f8d6c8cc1..7b8ec0281548 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -23,7 +23,6 @@
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/debugfs.h>
-#include <linux/smp_lock.h>
#include <linux/time.h>
#include <linux/uaccess.h>
@@ -169,7 +168,6 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
BLK_TC_ACT(BLK_TC_WRITE) };
-#define BLK_TC_HARDBARRIER BLK_TC_BARRIER
#define BLK_TC_RAHEAD BLK_TC_AHEAD
/* The ilog2() calls fall out because they're constant */
@@ -197,7 +195,6 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
return;
what |= ddir_act[rw & WRITE];
- what |= MASK_TC_BIT(rw, HARDBARRIER);
what |= MASK_TC_BIT(rw, SYNC);
what |= MASK_TC_BIT(rw, RAHEAD);
what |= MASK_TC_BIT(rw, META);
@@ -326,6 +323,7 @@ static const struct file_operations blk_dropped_fops = {
.owner = THIS_MODULE,
.open = blk_dropped_open,
.read = blk_dropped_read,
+ .llseek = default_llseek,
};
static int blk_msg_open(struct inode *inode, struct file *filp)
@@ -365,6 +363,7 @@ static const struct file_operations blk_msg_fops = {
.owner = THIS_MODULE,
.open = blk_msg_open,
.write = blk_msg_write,
+ .llseek = noop_llseek,
};
/*
@@ -639,7 +638,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
if (!q)
return -ENXIO;
- lock_kernel();
mutex_lock(&bdev->bd_mutex);
switch (cmd) {
@@ -667,7 +665,6 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
}
mutex_unlock(&bdev->bd_mutex);
- unlock_kernel();
return ret;
}
@@ -1652,10 +1649,9 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
struct block_device *bdev;
ssize_t ret = -ENXIO;
- lock_kernel();
bdev = bdget(part_devt(p));
if (bdev == NULL)
- goto out_unlock_kernel;
+ goto out;
q = blk_trace_get_queue(bdev);
if (q == NULL)
@@ -1683,8 +1679,7 @@ out_unlock_bdev:
mutex_unlock(&bdev->bd_mutex);
out_bdput:
bdput(bdev);
-out_unlock_kernel:
- unlock_kernel();
+out:
return ret;
}
@@ -1714,11 +1709,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
ret = -ENXIO;
- lock_kernel();
p = dev_to_part(dev);
bdev = bdget(part_devt(p));
if (bdev == NULL)
- goto out_unlock_kernel;
+ goto out;
q = blk_trace_get_queue(bdev);
if (q == NULL)
@@ -1753,8 +1747,6 @@ out_unlock_bdev:
mutex_unlock(&bdev->bd_mutex);
out_bdput:
bdput(bdev);
-out_unlock_kernel:
- unlock_kernel();
out:
return ret ? ret : count;
}
@@ -1813,8 +1805,6 @@ void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
if (rw & REQ_RAHEAD)
rwbs[i++] = 'A';
- if (rw & REQ_HARDBARRIER)
- rwbs[i++] = 'B';
if (rw & REQ_SYNC)
rwbs[i++] = 'S';
if (rw & REQ_META)
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ebd80d50c474..f3dadae83883 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -800,6 +800,7 @@ static const struct file_operations ftrace_profile_fops = {
.open = tracing_open_generic,
.read = ftrace_profile_read,
.write = ftrace_profile_write,
+ .llseek = default_llseek,
};
/* used to initialize the real stat files */
@@ -2669,6 +2670,7 @@ static const struct file_operations ftrace_graph_fops = {
.read = seq_read,
.write = ftrace_graph_write,
.release = ftrace_graph_release,
+ .llseek = seq_lseek,
};
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index c5a632a669e1..9ed509a015d8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -224,6 +224,9 @@ enum {
RB_LEN_TIME_STAMP = 16,
};
+#define skip_time_extend(event) \
+ ((struct ring_buffer_event *)((char *)event + RB_LEN_TIME_EXTEND))
+
static inline int rb_null_event(struct ring_buffer_event *event)
{
return event->type_len == RINGBUF_TYPE_PADDING && !event->time_delta;
@@ -248,8 +251,12 @@ rb_event_data_length(struct ring_buffer_event *event)
return length + RB_EVNT_HDR_SIZE;
}
-/* inline for ring buffer fast paths */
-static unsigned
+/*
+ * Return the length of the given event. Will return
+ * the length of the time extend if the event is a
+ * time extend.
+ */
+static inline unsigned
rb_event_length(struct ring_buffer_event *event)
{
switch (event->type_len) {
@@ -274,13 +281,41 @@ rb_event_length(struct ring_buffer_event *event)
return 0;
}
+/*
+ * Return total length of time extend and data,
+ * or just the event length for all other events.
+ */
+static inline unsigned
+rb_event_ts_length(struct ring_buffer_event *event)
+{
+ unsigned len = 0;
+
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+ /* time extends include the data event after it */
+ len = RB_LEN_TIME_EXTEND;
+ event = skip_time_extend(event);
+ }
+ return len + rb_event_length(event);
+}
+
/**
* ring_buffer_event_length - return the length of the event
* @event: the event to get the length of
+ *
+ * Returns the size of the data load of a data event.
+ * If the event is something other than a data event, it
+ * returns the size of the event itself. With the exception
+ * of a TIME EXTEND, where it still returns the size of the
+ * data load of the data event after it.
*/
unsigned ring_buffer_event_length(struct ring_buffer_event *event)
{
- unsigned length = rb_event_length(event);
+ unsigned length;
+
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ event = skip_time_extend(event);
+
+ length = rb_event_length(event);
if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
return length;
length -= RB_EVNT_HDR_SIZE;
@@ -294,6 +329,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
static void *
rb_event_data(struct ring_buffer_event *event)
{
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ event = skip_time_extend(event);
BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
/* If length is in len field, then array[0] has the data */
if (event->type_len)
@@ -404,9 +441,6 @@ static inline int test_time_stamp(u64 delta)
/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
-/* Max number of timestamps that can fit on a page */
-#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_EXTEND)
-
int ring_buffer_print_page_header(struct trace_seq *s)
{
struct buffer_data_page field;
@@ -1546,6 +1580,25 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
iter->head = 0;
}
+/* Slow path, do not inline */
+static noinline struct ring_buffer_event *
+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
+{
+ event->type_len = RINGBUF_TYPE_TIME_EXTEND;
+
+ /* Not the first event on the page? */
+ if (rb_event_index(event)) {
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+ } else {
+ /* nope, just zero it */
+ event->time_delta = 0;
+ event->array[0] = 0;
+ }
+
+ return skip_time_extend(event);
+}
+
/**
* ring_buffer_update_event - update event type and data
* @event: the even to update
@@ -1558,28 +1611,31 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
* data field.
*/
static void
-rb_update_event(struct ring_buffer_event *event,
- unsigned type, unsigned length)
+rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event, unsigned length,
+ int add_timestamp, u64 delta)
{
- event->type_len = type;
-
- switch (type) {
-
- case RINGBUF_TYPE_PADDING:
- case RINGBUF_TYPE_TIME_EXTEND:
- case RINGBUF_TYPE_TIME_STAMP:
- break;
+ /* Only a commit updates the timestamp */
+ if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
+ delta = 0;
- case 0:
- length -= RB_EVNT_HDR_SIZE;
- if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
- event->array[0] = length;
- else
- event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
- break;
- default:
- BUG();
+ /*
+ * If we need to add a timestamp, then we
+ * add it to the start of the resevered space.
+ */
+ if (unlikely(add_timestamp)) {
+ event = rb_add_time_stamp(event, delta);
+ length -= RB_LEN_TIME_EXTEND;
+ delta = 0;
}
+
+ event->time_delta = delta;
+ length -= RB_EVNT_HDR_SIZE;
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
+ event->type_len = 0;
+ event->array[0] = length;
+ } else
+ event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
}
/*
@@ -1823,10 +1879,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
local_sub(length, &tail_page->write);
}
-static struct ring_buffer_event *
+/*
+ * This is the slow path, force gcc not to inline it.
+ */
+static noinline struct ring_buffer_event *
rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long length, unsigned long tail,
- struct buffer_page *tail_page, u64 *ts)
+ struct buffer_page *tail_page, u64 ts)
{
struct buffer_page *commit_page = cpu_buffer->commit_page;
struct ring_buffer *buffer = cpu_buffer->buffer;
@@ -1909,8 +1968,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
* Nested commits always have zero deltas, so
* just reread the time stamp
*/
- *ts = rb_time_stamp(buffer);
- next_page->page->time_stamp = *ts;
+ ts = rb_time_stamp(buffer);
+ next_page->page->time_stamp = ts;
}
out_again:
@@ -1929,12 +1988,21 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
static struct ring_buffer_event *
__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned type, unsigned long length, u64 *ts)
+ unsigned long length, u64 ts,
+ u64 delta, int add_timestamp)
{
struct buffer_page *tail_page;
struct ring_buffer_event *event;
unsigned long tail, write;
+ /*
+ * If the time delta since the last event is too big to
+ * hold in the time field of the event, then we append a
+ * TIME EXTEND event ahead of the data event.
+ */
+ if (unlikely(add_timestamp))
+ length += RB_LEN_TIME_EXTEND;
+
tail_page = cpu_buffer->tail_page;
write = local_add_return(length, &tail_page->write);
@@ -1943,7 +2011,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
tail = write - length;
/* See if we shot pass the end of this buffer page */
- if (write > BUF_PAGE_SIZE)
+ if (unlikely(write > BUF_PAGE_SIZE))
return rb_move_tail(cpu_buffer, length, tail,
tail_page, ts);
@@ -1951,18 +2019,16 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
event = __rb_page_index(tail_page, tail);
kmemcheck_annotate_bitfield(event, bitfield);
- rb_update_event(event, type, length);
+ rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
- /* The passed in type is zero for DATA */
- if (likely(!type))
- local_inc(&tail_page->entries);
+ local_inc(&tail_page->entries);
/*
* If this is the first commit on the page, then update
* its timestamp.
*/
if (!tail)
- tail_page->page->time_stamp = *ts;
+ tail_page->page->time_stamp = ts;
return event;
}
@@ -1977,7 +2043,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long addr;
new_index = rb_event_index(event);
- old_index = new_index + rb_event_length(event);
+ old_index = new_index + rb_event_ts_length(event);
addr = (unsigned long)event;
addr &= PAGE_MASK;
@@ -2003,76 +2069,13 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
return 0;
}
-static int
-rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
- u64 *ts, u64 *delta)
-{
- struct ring_buffer_event *event;
- int ret;
-
- WARN_ONCE(*delta > (1ULL << 59),
- KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
- (unsigned long long)*delta,
- (unsigned long long)*ts,
- (unsigned long long)cpu_buffer->write_stamp);
-
- /*
- * The delta is too big, we to add a
- * new timestamp.
- */
- event = __rb_reserve_next(cpu_buffer,
- RINGBUF_TYPE_TIME_EXTEND,
- RB_LEN_TIME_EXTEND,
- ts);
- if (!event)
- return -EBUSY;
-
- if (PTR_ERR(event) == -EAGAIN)
- return -EAGAIN;
-
- /* Only a commited time event can update the write stamp */
- if (rb_event_is_commit(cpu_buffer, event)) {
- /*
- * If this is the first on the page, then it was
- * updated with the page itself. Try to discard it
- * and if we can't just make it zero.
- */
- if (rb_event_index(event)) {
- event->time_delta = *delta & TS_MASK;
- event->array[0] = *delta >> TS_SHIFT;
- } else {
- /* try to discard, since we do not need this */
- if (!rb_try_to_discard(cpu_buffer, event)) {
- /* nope, just zero it */
- event->time_delta = 0;
- event->array[0] = 0;
- }
- }
- cpu_buffer->write_stamp = *ts;
- /* let the caller know this was the commit */
- ret = 1;
- } else {
- /* Try to discard the event */
- if (!rb_try_to_discard(cpu_buffer, event)) {
- /* Darn, this is just wasted space */
- event->time_delta = 0;
- event->array[0] = 0;
- }
- ret = 0;
- }
-
- *delta = 0;
-
- return ret;
-}
-
static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
local_inc(&cpu_buffer->committing);
local_inc(&cpu_buffer->commits);
}
-static void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
+static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned long commits;
@@ -2110,9 +2113,10 @@ rb_reserve_next_event(struct ring_buffer *buffer,
unsigned long length)
{
struct ring_buffer_event *event;
- u64 ts, delta = 0;
- int commit = 0;
+ u64 ts, delta;
int nr_loops = 0;
+ int add_timestamp;
+ u64 diff;
rb_start_commit(cpu_buffer);
@@ -2133,6 +2137,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
length = rb_calculate_event_length(length);
again:
+ add_timestamp = 0;
+ delta = 0;
+
/*
* We allow for interrupts to reenter here and do a trace.
* If one does, it will cause this original code to loop
@@ -2146,56 +2153,32 @@ rb_reserve_next_event(struct ring_buffer *buffer,
goto out_fail;
ts = rb_time_stamp(cpu_buffer->buffer);
+ diff = ts - cpu_buffer->write_stamp;
- /*
- * Only the first commit can update the timestamp.
- * Yes there is a race here. If an interrupt comes in
- * just after the conditional and it traces too, then it
- * will also check the deltas. More than one timestamp may
- * also be made. But only the entry that did the actual
- * commit will be something other than zero.
- */
- if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
- rb_page_write(cpu_buffer->tail_page) ==
- rb_commit_index(cpu_buffer))) {
- u64 diff;
-
- diff = ts - cpu_buffer->write_stamp;
-
- /* make sure this diff is calculated here */
- barrier();
-
- /* Did the write stamp get updated already? */
- if (unlikely(ts < cpu_buffer->write_stamp))
- goto get_event;
+ /* make sure this diff is calculated here */
+ barrier();
+ /* Did the write stamp get updated already? */
+ if (likely(ts >= cpu_buffer->write_stamp)) {
delta = diff;
if (unlikely(test_time_stamp(delta))) {
-
- commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
- if (commit == -EBUSY)
- goto out_fail;
-
- if (commit == -EAGAIN)
- goto again;
-
- RB_WARN_ON(cpu_buffer, commit < 0);
+ WARN_ONCE(delta > (1ULL << 59),
+ KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
+ (unsigned long long)delta,
+ (unsigned long long)ts,
+ (unsigned long long)cpu_buffer->write_stamp);
+ add_timestamp = 1;
}
}
- get_event:
- event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
+ event = __rb_reserve_next(cpu_buffer, length, ts,
+ delta, add_timestamp);
if (unlikely(PTR_ERR(event) == -EAGAIN))
goto again;
if (!event)
goto out_fail;
- if (!rb_event_is_commit(cpu_buffer, event))
- delta = 0;
-
- event->time_delta = delta;
-
return event;
out_fail:
@@ -2207,13 +2190,9 @@ rb_reserve_next_event(struct ring_buffer *buffer,
#define TRACE_RECURSIVE_DEPTH 16
-static int trace_recursive_lock(void)
+/* Keep this code out of the fast path cache */
+static noinline void trace_recursive_fail(void)
{
- current->trace_recursion++;
-
- if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
- return 0;
-
/* Disable all tracing before we do anything else */
tracing_off_permanent();
@@ -2225,10 +2204,21 @@ static int trace_recursive_lock(void)
in_nmi());
WARN_ON_ONCE(1);
+}
+
+static inline int trace_recursive_lock(void)
+{
+ current->trace_recursion++;
+
+ if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+ return 0;
+
+ trace_recursive_fail();
+
return -1;
}
-static void trace_recursive_unlock(void)
+static inline void trace_recursive_unlock(void)
{
WARN_ON_ONCE(!current->trace_recursion);
@@ -2308,12 +2298,28 @@ static void
rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event)
{
+ u64 delta;
+
/*
* The event first in the commit queue updates the
* time stamp.
*/
- if (rb_event_is_commit(cpu_buffer, event))
- cpu_buffer->write_stamp += event->time_delta;
+ if (rb_event_is_commit(cpu_buffer, event)) {
+ /*
+ * A commit event that is first on a page
+ * updates the write timestamp with the page stamp
+ */
+ if (!rb_event_index(event))
+ cpu_buffer->write_stamp =
+ cpu_buffer->commit_page->page->time_stamp;
+ else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ cpu_buffer->write_stamp += delta;
+ } else
+ cpu_buffer->write_stamp += event->time_delta;
+ }
}
static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2353,6 +2359,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
static inline void rb_event_discard(struct ring_buffer_event *event)
{
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ event = skip_time_extend(event);
+
/* array[0] holds the actual length for the discarded event */
event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
event->type_len = RINGBUF_TYPE_PADDING;
@@ -3049,12 +3058,12 @@ rb_buffer_peek(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts,
again:
/*
- * We repeat when a timestamp is encountered. It is possible
- * to get multiple timestamps from an interrupt entering just
- * as one timestamp is about to be written, or from discarded
- * commits. The most that we can have is the number on a single page.
+ * We repeat when a time extend is encountered.
+ * Since the time extend is always attached to a data event,
+ * we should never loop more than once.
+ * (We never hit the following condition more than twice).
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
return NULL;
reader = rb_get_reader_page(cpu_buffer);
@@ -3130,14 +3139,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
return NULL;
/*
- * We repeat when a timestamp is encountered.
- * We can get multiple timestamps by nested interrupts or also
- * if filtering is on (discarding commits). Since discarding
- * commits can be frequent we can get a lot of timestamps.
- * But we limit them by not adding timestamps if they begin
- * at the start of a page.
+ * We repeat when a time extend is encountered.
+ * Since the time extend is always attached to a data event,
+ * we should never loop more than once.
+ * (We never hit the following condition more than twice).
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 2))
return NULL;
if (rb_per_cpu_empty(cpu_buffer))
@@ -3835,7 +3842,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
if (len > (commit - read))
len = (commit - read);
- size = rb_event_length(event);
+ /* Always keep the time extend and data together */
+ size = rb_event_ts_length(event);
if (len < size)
goto out_unlock;
@@ -3857,7 +3865,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
break;
event = rb_reader_event(cpu_buffer);
- size = rb_event_length(event);
+ /* Always keep the time extend and data together */
+ size = rb_event_ts_length(event);
} while (len > size);
/* update bpage */
@@ -3974,6 +3983,7 @@ static const struct file_operations rb_simple_fops = {
.open = tracing_open_generic,
.read = rb_simple_read,
.write = rb_simple_write,
+ .llseek = default_llseek,
};
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 001bcd2ccf4a..82d9b8106cd0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3996,13 +3996,9 @@ static void tracing_init_debugfs_percpu(long cpu)
{
struct dentry *d_percpu = tracing_dentry_percpu();
struct dentry *d_cpu;
- /* strlen(cpu) + MAX(log10(cpu)) + '\0' */
- char cpu_dir[7];
+ char cpu_dir[30]; /* 30 characters should be more than enough */
- if (cpu > 999 || cpu < 0)
- return;
-
- sprintf(cpu_dir, "cpu%ld", cpu);
+ snprintf(cpu_dir, 30, "cpu%ld", cpu);
d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
if (!d_cpu) {
pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 398c0e8b332c..0725eeab1937 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -932,6 +932,7 @@ static const struct file_operations ftrace_enable_fops = {
.open = tracing_open_generic,
.read = event_enable_read,
.write = event_enable_write,
+ .llseek = default_llseek,
};
static const struct file_operations ftrace_event_format_fops = {
@@ -944,29 +945,34 @@ static const struct file_operations ftrace_event_format_fops = {
static const struct file_operations ftrace_event_id_fops = {
.open = tracing_open_generic,
.read = event_id_read,
+ .llseek = default_llseek,
};
static const struct file_operations ftrace_event_filter_fops = {
.open = tracing_open_generic,
.read = event_filter_read,
.write = event_filter_write,
+ .llseek = default_llseek,
};
static const struct file_operations ftrace_subsystem_filter_fops = {
.open = tracing_open_generic,
.read = subsystem_filter_read,
.write = subsystem_filter_write,
+ .llseek = default_llseek,
};
static const struct file_operations ftrace_system_enable_fops = {
.open = tracing_open_generic,
.read = system_enable_read,
.write = system_enable_write,
+ .llseek = default_llseek,
};
static const struct file_operations ftrace_show_header_fops = {
.open = tracing_open_generic,
.read = show_header,
+ .llseek = default_llseek,
};
static struct dentry *event_trace_events_dir(void)
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 7b8ecd751d93..3c5c5dfea0b3 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -13,7 +13,6 @@
#include <linux/kdb.h>
#include <linux/ftrace.h>
-#include "../debug/kdb/kdb_private.h"
#include "trace.h"
#include "trace_output.h"
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 544301d29dee..2dec9bcde8b4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -31,7 +31,6 @@
#include <linux/perf_event.h>
#include <linux/stringify.h>
#include <linux/limits.h>
-#include <linux/uaccess.h>
#include <asm/bitsperlong.h>
#include "trace.h"
@@ -648,7 +647,7 @@ static int register_trace_probe(struct trace_probe *tp)
}
ret = register_probe_event(tp);
if (ret) {
- pr_warning("Faild to register probe event(%d)\n", ret);
+ pr_warning("Failed to register probe event(%d)\n", ret);
goto end;
}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index a6b7e0e0f3eb..4c5dead0c239 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -195,6 +195,7 @@ static const struct file_operations stack_max_size_fops = {
.open = tracing_open_generic,
.read = stack_max_size_read,
.write = stack_max_size_write,
+ .llseek = default_llseek,
};
static void *
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 0a67e041edf8..24dc60d9fa1f 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -63,12 +63,10 @@ void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
stats->ac_ppid = pid_alive(tsk) ?
rcu_dereference(tsk->real_parent)->tgid : 0;
rcu_read_unlock();
- stats->ac_utime = cputime_to_msecs(tsk->utime) * USEC_PER_MSEC;
- stats->ac_stime = cputime_to_msecs(tsk->stime) * USEC_PER_MSEC;
- stats->ac_utimescaled =
- cputime_to_msecs(tsk->utimescaled) * USEC_PER_MSEC;
- stats->ac_stimescaled =
- cputime_to_msecs(tsk->stimescaled) * USEC_PER_MSEC;
+ stats->ac_utime = cputime_to_usecs(tsk->utime);
+ stats->ac_stime = cputime_to_usecs(tsk->stime);
+ stats->ac_utimescaled = cputime_to_usecs(tsk->utimescaled);
+ stats->ac_stimescaled = cputime_to_usecs(tsk->stimescaled);
stats->ac_minflt = tsk->min_flt;
stats->ac_majflt = tsk->maj_flt;
diff --git a/kernel/user.c b/kernel/user.c
index 7e72614b736d..2c7d8d5914b1 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -91,6 +91,7 @@ static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
* upon function exit.
*/
static void free_user(struct user_struct *up, unsigned long flags)
+ __releases(&uidhash_lock)
{
uid_hash_remove(up);
spin_unlock_irqrestore(&uidhash_lock, flags);
diff --git a/kernel/wait.c b/kernel/wait.c
index c4bd3d825f35..b0310eb6cc1e 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -92,7 +92,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
-/*
+/**
* finish_wait - clean up after waiting in a queue
* @q: waitqueue waited on
* @wait: wait descriptor
@@ -127,11 +127,11 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
}
EXPORT_SYMBOL(finish_wait);
-/*
+/**
* abort_exclusive_wait - abort exclusive waiting in a queue
* @q: waitqueue waited on
* @wait: wait descriptor
- * @state: runstate of the waiter to be woken
+ * @mode: runstate of the waiter to be woken
* @key: key to identify a wait bit queue or %NULL
*
* Sets current thread back to running state and removes
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 94ca779aa9c2..14b8120d5232 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -43,7 +43,7 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
-static int __initdata no_watchdog;
+static int no_watchdog;
/* boot commands */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f77afd939229..90db1bd1a978 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -42,9 +42,6 @@
#include <linux/lockdep.h>
#include <linux/idr.h>
-#define CREATE_TRACE_POINTS
-#include <trace/events/workqueue.h>
-
#include "workqueue_sched.h"
enum {
@@ -257,6 +254,9 @@ EXPORT_SYMBOL_GPL(system_long_wq);
EXPORT_SYMBOL_GPL(system_nrt_wq);
EXPORT_SYMBOL_GPL(system_unbound_wq);
+#define CREATE_TRACE_POINTS
+#include <trace/events/workqueue.h>
+
#define for_each_busy_worker(worker, i, pos, gcwq) \
for (i = 0; i < BUSY_WORKER_HASH_SIZE; i++) \
hlist_for_each_entry(worker, pos, &gcwq->busy_hash[i], hentry)
@@ -310,21 +310,6 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
(cpu) < WORK_CPU_NONE; \
(cpu) = __next_wq_cpu((cpu), cpu_possible_mask, (wq)))
-#ifdef CONFIG_LOCKDEP
-/**
- * in_workqueue_context() - in context of specified workqueue?
- * @wq: the workqueue of interest
- *
- * Checks lockdep state to see if the current task is executing from
- * within a workqueue item. This function exists only if lockdep is
- * enabled.
- */
-int in_workqueue_context(struct workqueue_struct *wq)
-{
- return lock_is_held(&wq->lockdep_map);
-}
-#endif
-
#ifdef CONFIG_DEBUG_OBJECTS_WORK
static struct debug_obj_descr work_debug_descr;
@@ -604,7 +589,9 @@ static bool keep_working(struct global_cwq *gcwq)
{
atomic_t *nr_running = get_gcwq_nr_running(gcwq->cpu);
- return !list_empty(&gcwq->worklist) && atomic_read(nr_running) <= 1;
+ return !list_empty(&gcwq->worklist) &&
+ (atomic_read(nr_running) <= 1 ||
+ gcwq->flags & GCWQ_HIGHPRI_PENDING);
}
/* Do we need a new worker? Called from manager. */
@@ -997,6 +984,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
/* gcwq determined, get cwq and queue */
cwq = get_cwq(gcwq->cpu, wq);
+ trace_workqueue_queue_work(cpu, cwq, work);
BUG_ON(!list_empty(&work->entry));
@@ -1004,6 +992,7 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
work_flags = work_color_to_flags(cwq->work_color);
if (likely(cwq->nr_active < cwq->max_active)) {
+ trace_workqueue_activate_work(work);
cwq->nr_active++;
worklist = gcwq_determine_ins_pos(gcwq, cwq);
} else {
@@ -1679,6 +1668,7 @@ static void cwq_activate_first_delayed(struct cpu_workqueue_struct *cwq)
struct work_struct, entry);
struct list_head *pos = gcwq_determine_ins_pos(cwq->gcwq, cwq);
+ trace_workqueue_activate_work(work);
move_linked_works(work, pos, NULL);
__clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work));
cwq->nr_active++;
@@ -2074,7 +2064,7 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
* checks and call back into the fixup functions where we
* might deadlock.
*/
- INIT_WORK_ON_STACK(&barr->work, wq_barrier_func);
+ INIT_WORK_ONSTACK(&barr->work, wq_barrier_func);
__set_bit(WORK_STRUCT_PENDING_BIT, work_data_bits(&barr->work));
init_completion(&barr->done);
@@ -2326,27 +2316,17 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(flush_workqueue);
-/**
- * flush_work - block until a work_struct's callback has terminated
- * @work: the work which is to be flushed
- *
- * Returns false if @work has already terminated.
- *
- * It is expected that, prior to calling flush_work(), the caller has
- * arranged for the work to not be requeued, otherwise it doesn't make
- * sense to use this function.
- */
-int flush_work(struct work_struct *work)
+static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr,
+ bool wait_executing)
{
struct worker *worker = NULL;
struct global_cwq *gcwq;
struct cpu_workqueue_struct *cwq;
- struct wq_barrier barr;
might_sleep();
gcwq = get_work_gcwq(work);
if (!gcwq)
- return 0;
+ return false;
spin_lock_irq(&gcwq->lock);
if (!list_empty(&work->entry)) {
@@ -2359,28 +2339,127 @@ int flush_work(struct work_struct *work)
cwq = get_work_cwq(work);
if (unlikely(!cwq || gcwq != cwq->gcwq))
goto already_gone;
- } else {
+ } else if (wait_executing) {
worker = find_worker_executing_work(gcwq, work);
if (!worker)
goto already_gone;
cwq = worker->current_cwq;
- }
+ } else
+ goto already_gone;
- insert_wq_barrier(cwq, &barr, work, worker);
+ insert_wq_barrier(cwq, barr, work, worker);
spin_unlock_irq(&gcwq->lock);
lock_map_acquire(&cwq->wq->lockdep_map);
lock_map_release(&cwq->wq->lockdep_map);
-
- wait_for_completion(&barr.done);
- destroy_work_on_stack(&barr.work);
- return 1;
+ return true;
already_gone:
spin_unlock_irq(&gcwq->lock);
- return 0;
+ return false;
+}
+
+/**
+ * flush_work - wait for a work to finish executing the last queueing instance
+ * @work: the work to flush
+ *
+ * Wait until @work has finished execution. This function considers
+ * only the last queueing instance of @work. If @work has been
+ * enqueued across different CPUs on a non-reentrant workqueue or on
+ * multiple workqueues, @work might still be executing on return on
+ * some of the CPUs from earlier queueing.
+ *
+ * If @work was queued only on a non-reentrant, ordered or unbound
+ * workqueue, @work is guaranteed to be idle on return if it hasn't
+ * been requeued since flush started.
+ *
+ * RETURNS:
+ * %true if flush_work() waited for the work to finish execution,
+ * %false if it was already idle.
+ */
+bool flush_work(struct work_struct *work)
+{
+ struct wq_barrier barr;
+
+ if (start_flush_work(work, &barr, true)) {
+ wait_for_completion(&barr.done);
+ destroy_work_on_stack(&barr.work);
+ return true;
+ } else
+ return false;
}
EXPORT_SYMBOL_GPL(flush_work);
+static bool wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
+{
+ struct wq_barrier barr;
+ struct worker *worker;
+
+ spin_lock_irq(&gcwq->lock);
+
+ worker = find_worker_executing_work(gcwq, work);
+ if (unlikely(worker))
+ insert_wq_barrier(worker->current_cwq, &barr, work, worker);
+
+ spin_unlock_irq(&gcwq->lock);
+
+ if (unlikely(worker)) {
+ wait_for_completion(&barr.done);
+ destroy_work_on_stack(&barr.work);
+ return true;
+ } else
+ return false;
+}
+
+static bool wait_on_work(struct work_struct *work)
+{
+ bool ret = false;
+ int cpu;
+
+ might_sleep();
+
+ lock_map_acquire(&work->lockdep_map);
+ lock_map_release(&work->lockdep_map);
+
+ for_each_gcwq_cpu(cpu)
+ ret |= wait_on_cpu_work(get_gcwq(cpu), work);
+ return ret;
+}
+
+/**
+ * flush_work_sync - wait until a work has finished execution
+ * @work: the work to flush
+ *
+ * Wait until @work has finished execution. On return, it's
+ * guaranteed that all queueing instances of @work which happened
+ * before this function is called are finished. In other words, if
+ * @work hasn't been requeued since this function was called, @work is
+ * guaranteed to be idle on return.
+ *
+ * RETURNS:
+ * %true if flush_work_sync() waited for the work to finish execution,
+ * %false if it was already idle.
+ */
+bool flush_work_sync(struct work_struct *work)
+{
+ struct wq_barrier barr;
+ bool pending, waited;
+
+ /* we'll wait for executions separately, queue barr only if pending */
+ pending = start_flush_work(work, &barr, false);
+
+ /* wait for executions to finish */
+ waited = wait_on_work(work);
+
+ /* wait for the pending one */
+ if (pending) {
+ wait_for_completion(&barr.done);
+ destroy_work_on_stack(&barr.work);
+ }
+
+ return pending || waited;
+}
+EXPORT_SYMBOL_GPL(flush_work_sync);
+
/*
* Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
* so this work can't be re-armed in any way.
@@ -2423,39 +2502,7 @@ static int try_to_grab_pending(struct work_struct *work)
return ret;
}
-static void wait_on_cpu_work(struct global_cwq *gcwq, struct work_struct *work)
-{
- struct wq_barrier barr;
- struct worker *worker;
-
- spin_lock_irq(&gcwq->lock);
-
- worker = find_worker_executing_work(gcwq, work);
- if (unlikely(worker))
- insert_wq_barrier(worker->current_cwq, &barr, work, worker);
-
- spin_unlock_irq(&gcwq->lock);
-
- if (unlikely(worker)) {
- wait_for_completion(&barr.done);
- destroy_work_on_stack(&barr.work);
- }
-}
-
-static void wait_on_work(struct work_struct *work)
-{
- int cpu;
-
- might_sleep();
-
- lock_map_acquire(&work->lockdep_map);
- lock_map_release(&work->lockdep_map);
-
- for_each_gcwq_cpu(cpu)
- wait_on_cpu_work(get_gcwq(cpu), work);
-}
-
-static int __cancel_work_timer(struct work_struct *work,
+static bool __cancel_work_timer(struct work_struct *work,
struct timer_list* timer)
{
int ret;
@@ -2472,42 +2519,81 @@ static int __cancel_work_timer(struct work_struct *work,
}
/**
- * cancel_work_sync - block until a work_struct's callback has terminated
- * @work: the work which is to be flushed
- *
- * Returns true if @work was pending.
+ * cancel_work_sync - cancel a work and wait for it to finish
+ * @work: the work to cancel
*
- * cancel_work_sync() will cancel the work if it is queued. If the work's
- * callback appears to be running, cancel_work_sync() will block until it
- * has completed.
+ * Cancel @work and wait for its execution to finish. This function
+ * can be used even if the work re-queues itself or migrates to
+ * another workqueue. On return from this function, @work is
+ * guaranteed to be not pending or executing on any CPU.
*
- * It is possible to use this function if the work re-queues itself. It can
- * cancel the work even if it migrates to another workqueue, however in that
- * case it only guarantees that work->func() has completed on the last queued
- * workqueue.
+ * cancel_work_sync(&delayed_work->work) must not be used for
+ * delayed_work's. Use cancel_delayed_work_sync() instead.
*
- * cancel_work_sync(&delayed_work->work) should be used only if ->timer is not
- * pending, otherwise it goes into a busy-wait loop until the timer expires.
- *
- * The caller must ensure that workqueue_struct on which this work was last
+ * The caller must ensure that the workqueue on which @work was last
* queued can't be destroyed before this function returns.
+ *
+ * RETURNS:
+ * %true if @work was pending, %false otherwise.
*/
-int cancel_work_sync(struct work_struct *work)
+bool cancel_work_sync(struct work_struct *work)
{
return __cancel_work_timer(work, NULL);
}
EXPORT_SYMBOL_GPL(cancel_work_sync);
/**
- * cancel_delayed_work_sync - reliably kill off a delayed work.
- * @dwork: the delayed work struct
+ * flush_delayed_work - wait for a dwork to finish executing the last queueing
+ * @dwork: the delayed work to flush
+ *
+ * Delayed timer is cancelled and the pending work is queued for
+ * immediate execution. Like flush_work(), this function only
+ * considers the last queueing instance of @dwork.
+ *
+ * RETURNS:
+ * %true if flush_work() waited for the work to finish execution,
+ * %false if it was already idle.
+ */
+bool flush_delayed_work(struct delayed_work *dwork)
+{
+ if (del_timer_sync(&dwork->timer))
+ __queue_work(raw_smp_processor_id(),
+ get_work_cwq(&dwork->work)->wq, &dwork->work);
+ return flush_work(&dwork->work);
+}
+EXPORT_SYMBOL(flush_delayed_work);
+
+/**
+ * flush_delayed_work_sync - wait for a dwork to finish
+ * @dwork: the delayed work to flush
*
- * Returns true if @dwork was pending.
+ * Delayed timer is cancelled and the pending work is queued for
+ * execution immediately. Other than timer handling, its behavior
+ * is identical to flush_work_sync().
*
- * It is possible to use this function if @dwork rearms itself via queue_work()
- * or queue_delayed_work(). See also the comment for cancel_work_sync().
+ * RETURNS:
+ * %true if flush_work_sync() waited for the work to finish execution,
+ * %false if it was already idle.
*/
-int cancel_delayed_work_sync(struct delayed_work *dwork)
+bool flush_delayed_work_sync(struct delayed_work *dwork)
+{
+ if (del_timer_sync(&dwork->timer))
+ __queue_work(raw_smp_processor_id(),
+ get_work_cwq(&dwork->work)->wq, &dwork->work);
+ return flush_work_sync(&dwork->work);
+}
+EXPORT_SYMBOL(flush_delayed_work_sync);
+
+/**
+ * cancel_delayed_work_sync - cancel a delayed work and wait for it to finish
+ * @dwork: the delayed work cancel
+ *
+ * This is cancel_work_sync() for delayed works.
+ *
+ * RETURNS:
+ * %true if @dwork was pending, %false otherwise.
+ */
+bool cancel_delayed_work_sync(struct delayed_work *dwork)
{
return __cancel_work_timer(&dwork->work, &dwork->timer);
}
@@ -2559,23 +2645,6 @@ int schedule_delayed_work(struct delayed_work *dwork,
EXPORT_SYMBOL(schedule_delayed_work);
/**
- * flush_delayed_work - block until a dwork_struct's callback has terminated
- * @dwork: the delayed work which is to be flushed
- *
- * Any timeout is cancelled, and any pending work is run immediately.
- */
-void flush_delayed_work(struct delayed_work *dwork)
-{
- if (del_timer_sync(&dwork->timer)) {
- __queue_work(get_cpu(), get_work_cwq(&dwork->work)->wq,
- &dwork->work);
- put_cpu();
- }
- flush_work(&dwork->work);
-}
-EXPORT_SYMBOL(flush_delayed_work);
-
-/**
* schedule_delayed_work_on - queue work in global workqueue on CPU after delay
* @cpu: cpu to use
* @dwork: job to be done
@@ -2592,13 +2661,15 @@ int schedule_delayed_work_on(int cpu,
EXPORT_SYMBOL(schedule_delayed_work_on);
/**
- * schedule_on_each_cpu - call a function on each online CPU from keventd
+ * schedule_on_each_cpu - execute a function synchronously on each online CPU
* @func: the function to call
*
- * Returns zero on success.
- * Returns -ve errno on failure.
- *
+ * schedule_on_each_cpu() executes @func on each online CPU using the
+ * system workqueue and blocks until all CPUs have completed.
* schedule_on_each_cpu() is very slow.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
*/
int schedule_on_each_cpu(work_func_t func)
{
@@ -2720,7 +2791,9 @@ static int alloc_cwqs(struct workqueue_struct *wq)
}
}
- /* just in case, make sure it's actually aligned */
+ /* just in case, make sure it's actually aligned
+ * - this is affected by PERCPU() alignment in vmlinux.lds.S
+ */
BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
return wq->cpu_wq.v ? 0 : -ENOMEM;
}
@@ -2764,6 +2837,13 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name,
unsigned int cpu;
/*
+ * Workqueues which may be used during memory reclaim should
+ * have a rescuer to guarantee forward progress.
+ */
+ if (flags & WQ_MEM_RECLAIM)
+ flags |= WQ_RESCUER;
+
+ /*
* Unbound workqueues aren't concurrency managed and should be
* dispatched to workers immediately.
*/