summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile101
-rw-r--r--kernel/audit.c2
-rw-r--r--kernel/audit.h18
-rw-r--r--kernel/audit_fsnotify.c216
-rw-r--r--kernel/audit_tree.c2
-rw-r--r--kernel/audit_watch.c56
-rw-r--r--kernel/auditfilter.c83
-rw-r--r--kernel/auditsc.c9
-rw-r--r--kernel/cgroup.c7
-rw-r--r--kernel/fork.c3
-rw-r--r--kernel/futex.c100
-rw-r--r--kernel/jump_label.c158
-rw-r--r--kernel/kthread.c7
-rw-r--r--kernel/locking/Makefile4
-rw-r--r--kernel/locking/percpu-rwsem.c13
-rw-r--r--kernel/locking/qrwlock.c47
-rw-r--r--kernel/locking/qspinlock.c6
-rw-r--r--kernel/locking/qspinlock_paravirt.h102
-rw-r--r--kernel/locking/rtmutex-tester.c420
-rw-r--r--kernel/locking/rtmutex.c2
-rw-r--r--kernel/locking/rtmutex_common.h22
-rw-r--r--kernel/memremap.c190
-rw-r--r--kernel/module_signing.c213
-rw-r--r--kernel/profile.c8
-rw-r--r--kernel/ptrace.c13
-rw-r--r--kernel/resource.c61
-rw-r--r--kernel/sched/core.c6
-rw-r--r--kernel/sched/wait.c7
-rw-r--r--kernel/seccomp.c17
-rw-r--r--kernel/smpboot.c27
-rw-r--r--kernel/sys_ni.c1
-rw-r--r--kernel/system_certificates.S20
-rw-r--r--kernel/system_keyring.c106
-rw-r--r--kernel/task_work.c12
-rw-r--r--kernel/trace/ftrace.c9
-rw-r--r--kernel/trace/ring_buffer.c764
-rw-r--r--kernel/trace/trace.c4
-rw-r--r--kernel/trace/trace_events.c25
-rw-r--r--kernel/trace/trace_events_filter.c54
-rw-r--r--kernel/trace/trace_functions_graph.c4
-rw-r--r--kernel/trace/trace_output.c4
-rw-r--r--kernel/trace/trace_stack.c68
-rw-r--r--kernel/user_namespace.c1
-rw-r--r--kernel/watchdog.c189
44 files changed, 1609 insertions, 1572 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 718fb8afab7a..e0d7587e7684 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -45,7 +45,6 @@ ifneq ($(CONFIG_SMP),y)
obj-y += up.o
endif
obj-$(CONFIG_UID16) += uid16.o
-obj-$(CONFIG_SYSTEM_TRUSTED_KEYRING) += system_keyring.o system_certificates.o
obj-$(CONFIG_MODULES) += module.o
obj-$(CONFIG_MODULE_SIG) += module_signing.o
obj-$(CONFIG_KALLSYMS) += kallsyms.o
@@ -65,7 +64,7 @@ obj-$(CONFIG_SMP) += stop_machine.o
obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
-obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o
+obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
obj-$(CONFIG_KPROBES) += kprobes.o
@@ -100,6 +99,8 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o
obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
obj-$(CONFIG_TORTURE_TEST) += torture.o
+obj-$(CONFIG_HAS_IOMEM) += memremap.o
+
$(obj)/configs.o: $(obj)/config_data.h
# config_data.h contains the same information as ikconfig.h but gzipped.
@@ -112,99 +113,3 @@ $(obj)/config_data.gz: $(KCONFIG_CONFIG) FORCE
targets += config_data.h
$(obj)/config_data.h: $(obj)/config_data.gz FORCE
$(call filechk,ikconfiggz)
-
-###############################################################################
-#
-# Roll all the X.509 certificates that we can find together and pull them into
-# the kernel so that they get loaded into the system trusted keyring during
-# boot.
-#
-# We look in the source root and the build root for all files whose name ends
-# in ".x509". Unfortunately, this will generate duplicate filenames, so we
-# have make canonicalise the pathnames and then sort them to discard the
-# duplicates.
-#
-###############################################################################
-ifeq ($(CONFIG_SYSTEM_TRUSTED_KEYRING),y)
-X509_CERTIFICATES-y := $(wildcard *.x509) $(wildcard $(srctree)/*.x509)
-X509_CERTIFICATES-$(CONFIG_MODULE_SIG) += $(objtree)/signing_key.x509
-X509_CERTIFICATES-raw := $(sort $(foreach CERT,$(X509_CERTIFICATES-y), \
- $(or $(realpath $(CERT)),$(CERT))))
-X509_CERTIFICATES := $(subst $(realpath $(objtree))/,,$(X509_CERTIFICATES-raw))
-
-ifeq ($(X509_CERTIFICATES),)
-$(warning *** No X.509 certificates found ***)
-endif
-
-ifneq ($(wildcard $(obj)/.x509.list),)
-ifneq ($(shell cat $(obj)/.x509.list),$(X509_CERTIFICATES))
-$(warning X.509 certificate list changed to "$(X509_CERTIFICATES)" from "$(shell cat $(obj)/.x509.list)")
-$(shell rm $(obj)/.x509.list)
-endif
-endif
-
-kernel/system_certificates.o: $(obj)/x509_certificate_list
-
-quiet_cmd_x509certs = CERTS $@
- cmd_x509certs = cat $(X509_CERTIFICATES) /dev/null >$@ $(foreach X509,$(X509_CERTIFICATES),; $(kecho) " - Including cert $(X509)")
-
-targets += $(obj)/x509_certificate_list
-$(obj)/x509_certificate_list: $(X509_CERTIFICATES) $(obj)/.x509.list
- $(call if_changed,x509certs)
-
-targets += $(obj)/.x509.list
-$(obj)/.x509.list:
- @echo $(X509_CERTIFICATES) >$@
-endif
-
-clean-files := x509_certificate_list .x509.list
-
-ifeq ($(CONFIG_MODULE_SIG),y)
-###############################################################################
-#
-# If module signing is requested, say by allyesconfig, but a key has not been
-# supplied, then one will need to be generated to make sure the build does not
-# fail and that the kernel may be used afterwards.
-#
-###############################################################################
-ifndef CONFIG_MODULE_SIG_HASH
-$(error Could not determine digest type to use from kernel config)
-endif
-
-signing_key.priv signing_key.x509: x509.genkey
- @echo "###"
- @echo "### Now generating an X.509 key pair to be used for signing modules."
- @echo "###"
- @echo "### If this takes a long time, you might wish to run rngd in the"
- @echo "### background to keep the supply of entropy topped up. It"
- @echo "### needs to be run as root, and uses a hardware random"
- @echo "### number generator if one is available."
- @echo "###"
- openssl req -new -nodes -utf8 -$(CONFIG_MODULE_SIG_HASH) -days 36500 \
- -batch -x509 -config x509.genkey \
- -outform DER -out signing_key.x509 \
- -keyout signing_key.priv 2>&1
- @echo "###"
- @echo "### Key pair generated."
- @echo "###"
-
-x509.genkey:
- @echo Generating X.509 key generation config
- @echo >x509.genkey "[ req ]"
- @echo >>x509.genkey "default_bits = 4096"
- @echo >>x509.genkey "distinguished_name = req_distinguished_name"
- @echo >>x509.genkey "prompt = no"
- @echo >>x509.genkey "string_mask = utf8only"
- @echo >>x509.genkey "x509_extensions = myexts"
- @echo >>x509.genkey
- @echo >>x509.genkey "[ req_distinguished_name ]"
- @echo >>x509.genkey "#O = Unspecified company"
- @echo >>x509.genkey "CN = Build time autogenerated kernel key"
- @echo >>x509.genkey "#emailAddress = unspecified.user@unspecified.company"
- @echo >>x509.genkey
- @echo >>x509.genkey "[ myexts ]"
- @echo >>x509.genkey "basicConstraints=critical,CA:FALSE"
- @echo >>x509.genkey "keyUsage=digitalSignature"
- @echo >>x509.genkey "subjectKeyIdentifier=hash"
- @echo >>x509.genkey "authorityKeyIdentifier=keyid"
-endif
diff --git a/kernel/audit.c b/kernel/audit.c
index f9e6065346db..662c007635fb 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1761,7 +1761,7 @@ void audit_log_name(struct audit_context *context, struct audit_names *n,
} else
audit_log_format(ab, " name=(null)");
- if (n->ino != (unsigned long)-1)
+ if (n->ino != AUDIT_INO_UNSET)
audit_log_format(ab, " inode=%lu"
" dev=%02x:%02x mode=%#ho"
" ouid=%u ogid=%u rdev=%02x:%02x",
diff --git a/kernel/audit.h b/kernel/audit.h
index d641f9bb3ed0..dadf86a0e59e 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -50,6 +50,7 @@ enum audit_state {
/* Rule lists */
struct audit_watch;
+struct audit_fsnotify_mark;
struct audit_tree;
struct audit_chunk;
@@ -252,6 +253,7 @@ struct audit_net {
extern int selinux_audit_rule_update(void);
extern struct mutex audit_filter_mutex;
+extern int audit_del_rule(struct audit_entry *);
extern void audit_free_rule_rcu(struct rcu_head *);
extern struct list_head audit_filter_list[];
@@ -269,6 +271,15 @@ extern int audit_add_watch(struct audit_krule *krule, struct list_head **list);
extern void audit_remove_watch_rule(struct audit_krule *krule);
extern char *audit_watch_path(struct audit_watch *watch);
extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev);
+
+extern struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len);
+extern char *audit_mark_path(struct audit_fsnotify_mark *mark);
+extern void audit_remove_mark(struct audit_fsnotify_mark *audit_mark);
+extern void audit_remove_mark_rule(struct audit_krule *krule);
+extern int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev);
+extern int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old);
+extern int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark);
+
#else
#define audit_put_watch(w) {}
#define audit_get_watch(w) {}
@@ -278,6 +289,13 @@ extern int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev
#define audit_watch_path(w) ""
#define audit_watch_compare(w, i, d) 0
+#define audit_alloc_mark(k, p, l) (ERR_PTR(-EINVAL))
+#define audit_mark_path(m) ""
+#define audit_remove_mark(m)
+#define audit_remove_mark_rule(k)
+#define audit_mark_compare(m, i, d) 0
+#define audit_exe_compare(t, m) (-EINVAL)
+#define audit_dupe_exe(n, o) (-EINVAL)
#endif /* CONFIG_AUDIT_WATCH */
#ifdef CONFIG_AUDIT_TREE
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
new file mode 100644
index 000000000000..27c6046c2c3d
--- /dev/null
+++ b/kernel/audit_fsnotify.c
@@ -0,0 +1,216 @@
+/* audit_fsnotify.c -- tracking inodes
+ *
+ * Copyright 2003-2009,2014-2015 Red Hat, Inc.
+ * Copyright 2005 Hewlett-Packard Development Company, L.P.
+ * Copyright 2005 IBM Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/audit.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/fsnotify_backend.h>
+#include <linux/namei.h>
+#include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/security.h>
+#include "audit.h"
+
+/*
+ * this mark lives on the parent directory of the inode in question.
+ * but dev, ino, and path are about the child
+ */
+struct audit_fsnotify_mark {
+ dev_t dev; /* associated superblock device */
+ unsigned long ino; /* associated inode number */
+ char *path; /* insertion path */
+ struct fsnotify_mark mark; /* fsnotify mark on the inode */
+ struct audit_krule *rule;
+};
+
+/* fsnotify handle. */
+static struct fsnotify_group *audit_fsnotify_group;
+
+/* fsnotify events we care about. */
+#define AUDIT_FS_EVENTS (FS_MOVE | FS_CREATE | FS_DELETE | FS_DELETE_SELF |\
+ FS_MOVE_SELF | FS_EVENT_ON_CHILD)
+
+static void audit_fsnotify_mark_free(struct audit_fsnotify_mark *audit_mark)
+{
+ kfree(audit_mark->path);
+ kfree(audit_mark);
+}
+
+static void audit_fsnotify_free_mark(struct fsnotify_mark *mark)
+{
+ struct audit_fsnotify_mark *audit_mark;
+
+ audit_mark = container_of(mark, struct audit_fsnotify_mark, mark);
+ audit_fsnotify_mark_free(audit_mark);
+}
+
+char *audit_mark_path(struct audit_fsnotify_mark *mark)
+{
+ return mark->path;
+}
+
+/*
+ * Return 1 when the mark has a recorded inode number and that inode number
+ * and device both equal the supplied @ino and @dev; return 0 otherwise.
+ */
+int audit_mark_compare(struct audit_fsnotify_mark *mark, unsigned long ino, dev_t dev)
+{
+	return (mark->ino != AUDIT_INO_UNSET) &&
+	       (mark->ino == ino) &&
+	       (mark->dev == dev);
+}
+
+/*
+ * Record the device and inode number of @inode in @audit_mark, or reset both
+ * fields to their UNSET values when @inode is NULL.
+ */
+static void audit_update_mark(struct audit_fsnotify_mark *audit_mark,
+			     struct inode *inode)
+{
+	if (inode) {
+		audit_mark->dev = inode->i_sb->s_dev;
+		audit_mark->ino = inode->i_ino;
+	} else {
+		audit_mark->dev = AUDIT_DEV_UNSET;
+		audit_mark->ino = AUDIT_INO_UNSET;
+	}
+}
+
+/*
+ * audit_alloc_mark - allocate and register an fsnotify mark for a rule path
+ * @krule:    rule stored in the new mark
+ * @pathname: path to track; must begin with '/' and must not end with '/'
+ * @len:      length of @pathname
+ *
+ * Looks up @pathname with kern_path_locked(), records the dev/ino of the
+ * returned dentry's inode (UNSET values if it has none) and adds the mark to
+ * the inode of the path filled in by kern_path_locked().
+ *
+ * Returns the new mark on success, in which case the mark holds @pathname and
+ * kfree()s it when the mark is freed.  Returns an ERR_PTR() on failure; in
+ * that case @pathname is NOT freed here and remains the caller's to release.
+ */
+struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pathname, int len)
+{
+	struct audit_fsnotify_mark *audit_mark;
+	struct path path;
+	struct dentry *dentry;
+	struct inode *inode;
+	int ret;
+
+	if (pathname[0] != '/' || pathname[len-1] == '/')
+		return ERR_PTR(-EINVAL);
+
+	dentry = kern_path_locked(pathname, &path);
+	if (IS_ERR(dentry))
+		return (void *)dentry; /* returning an error */
+	inode = path.dentry->d_inode;
+	mutex_unlock(&inode->i_mutex);
+
+	audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL);
+	if (unlikely(!audit_mark)) {
+		audit_mark = ERR_PTR(-ENOMEM);
+		goto out;
+	}
+
+	fsnotify_init_mark(&audit_mark->mark, audit_fsnotify_free_mark);
+	audit_mark->mark.mask = AUDIT_FS_EVENTS;
+	audit_mark->path = pathname;
+	audit_update_mark(audit_mark, dentry->d_inode);
+	audit_mark->rule = krule;
+
+	ret = fsnotify_add_mark(&audit_mark->mark, audit_fsnotify_group, inode, NULL, true);
+	if (ret < 0) {
+		/*
+		 * The callers kfree() pathname themselves whenever an error
+		 * is returned, so detach it from the mark first; otherwise
+		 * audit_fsnotify_mark_free() would free it a first time and
+		 * the caller a second time.
+		 */
+		audit_mark->path = NULL;
+		audit_fsnotify_mark_free(audit_mark);
+		audit_mark = ERR_PTR(ret);
+	}
+out:
+	dput(dentry);
+	path_put(&path);
+	return audit_mark;
+}
+
+static void audit_mark_log_rule_change(struct audit_fsnotify_mark *audit_mark, char *op)
+{
+ struct audit_buffer *ab;
+ struct audit_krule *rule = audit_mark->rule;
+
+ if (!audit_enabled)
+ return;
+ ab = audit_log_start(NULL, GFP_NOFS, AUDIT_CONFIG_CHANGE);
+ if (unlikely(!ab))
+ return;
+ audit_log_format(ab, "auid=%u ses=%u op=",
+ from_kuid(&init_user_ns, audit_get_loginuid(current)),
+ audit_get_sessionid(current));
+ audit_log_string(ab, op);
+ audit_log_format(ab, " path=");
+ audit_log_untrustedstring(ab, audit_mark->path);
+ audit_log_key(ab, rule->filterkey);
+ audit_log_format(ab, " list=%d res=1", rule->listnr);
+ audit_log_end(ab);
+}
+
+void audit_remove_mark(struct audit_fsnotify_mark *audit_mark)
+{
+ fsnotify_destroy_mark(&audit_mark->mark, audit_fsnotify_group);
+ fsnotify_put_mark(&audit_mark->mark);
+}
+
+void audit_remove_mark_rule(struct audit_krule *krule)
+{
+ struct audit_fsnotify_mark *mark = krule->exe;
+
+ audit_remove_mark(mark);
+}
+
+static void audit_autoremove_mark_rule(struct audit_fsnotify_mark *audit_mark)
+{
+ struct audit_krule *rule = audit_mark->rule;
+ struct audit_entry *entry = container_of(rule, struct audit_entry, rule);
+
+ audit_mark_log_rule_change(audit_mark, "autoremove_rule");
+ audit_del_rule(entry);
+}
+
+/*
+ * fsnotify event handler for audit_fsnotify_group.
+ *
+ * For create/delete/move events on a name, refresh the dev/ino stored in the
+ * mark from the event's inode, but only when audit_compare_dname_path()
+ * returns 0 for @dname against the mark's path.  For FS_DELETE_SELF,
+ * FS_UNMOUNT and FS_MOVE_SELF, remove the rule that owns the mark.
+ * Always returns 0.
+ */
+static int audit_mark_handle_event(struct fsnotify_group *group,
+				    struct inode *to_tell,
+				    struct fsnotify_mark *inode_mark,
+				    struct fsnotify_mark *vfsmount_mark,
+				    u32 mask, void *data, int data_type,
+				    const unsigned char *dname, u32 cookie)
+{
+	struct audit_fsnotify_mark *audit_mark =
+		container_of(inode_mark, struct audit_fsnotify_mark, mark);
+	struct inode *inode = NULL;
+
+	BUG_ON(group != audit_fsnotify_group);
+
+	/* Extract the inode the event refers to from the event payload. */
+	if (data_type == FSNOTIFY_EVENT_PATH) {
+		inode = ((struct path *)data)->dentry->d_inode;
+	} else if (data_type == FSNOTIFY_EVENT_INODE) {
+		inode = (struct inode *)data;
+	} else {
+		BUG();
+		return 0;
+	}
+
+	if (mask & (FS_CREATE|FS_MOVED_TO|FS_DELETE|FS_MOVED_FROM)) {
+		if (!audit_compare_dname_path(dname, audit_mark->path, AUDIT_NAME_FULL))
+			audit_update_mark(audit_mark, inode);
+	} else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF)) {
+		audit_autoremove_mark_rule(audit_mark);
+	}
+
+	return 0;
+}
+
+static const struct fsnotify_ops audit_mark_fsnotify_ops = {
+ .handle_event = audit_mark_handle_event,
+};
+
+static int __init audit_fsnotify_init(void)
+{
+ audit_fsnotify_group = fsnotify_alloc_group(&audit_mark_fsnotify_ops);
+ if (IS_ERR(audit_fsnotify_group)) {
+ audit_fsnotify_group = NULL;
+ audit_panic("cannot create audit fsnotify group");
+ }
+ return 0;
+}
+device_initcall(audit_fsnotify_init);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index b0f9877273fc..94ecdabda8e6 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -479,6 +479,8 @@ static void kill_rules(struct audit_tree *tree)
if (rule->tree) {
/* not a half-baked one */
audit_tree_log_remove_rule(rule);
+ if (entry->rule.exe)
+ audit_remove_mark(entry->rule.exe);
rule->tree = NULL;
list_del_rcu(&entry->list);
list_del(&entry->rule.list);
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 6e30024d9aac..656c7e93ac0d 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -138,7 +138,7 @@ char *audit_watch_path(struct audit_watch *watch)
int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
{
- return (watch->ino != (unsigned long)-1) &&
+ return (watch->ino != AUDIT_INO_UNSET) &&
(watch->ino == ino) &&
(watch->dev == dev);
}
@@ -179,8 +179,8 @@ static struct audit_watch *audit_init_watch(char *path)
INIT_LIST_HEAD(&watch->rules);
atomic_set(&watch->count, 1);
watch->path = path;
- watch->dev = (dev_t)-1;
- watch->ino = (unsigned long)-1;
+ watch->dev = AUDIT_DEV_UNSET;
+ watch->ino = AUDIT_INO_UNSET;
return watch;
}
@@ -203,7 +203,6 @@ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op)
if (IS_ERR(watch))
return PTR_ERR(watch);
- audit_get_watch(watch);
krule->watch = watch;
return 0;
@@ -313,6 +312,8 @@ static void audit_update_watch(struct audit_parent *parent,
list_replace(&oentry->rule.list,
&nentry->rule.list);
}
+ if (oentry->rule.exe)
+ audit_remove_mark(oentry->rule.exe);
audit_watch_log_rule_change(r, owatch, "updated_rules");
@@ -343,6 +344,8 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
e = container_of(r, struct audit_entry, rule);
audit_watch_log_rule_change(r, w, "remove_rule");
+ if (e->rule.exe)
+ audit_remove_mark(e->rule.exe);
list_del(&r->rlist);
list_del(&r->list);
list_del_rcu(&e->list);
@@ -387,19 +390,20 @@ static void audit_add_to_parent(struct audit_krule *krule,
watch_found = 1;
- /* put krule's and initial refs to temporary watch */
- audit_put_watch(watch);
+ /* put krule's ref to temporary watch */
audit_put_watch(watch);
audit_get_watch(w);
krule->watch = watch = w;
+
+ audit_put_parent(parent);
break;
}
if (!watch_found) {
- audit_get_parent(parent);
watch->parent = parent;
+ audit_get_watch(watch);
list_add(&watch->wlist, &parent->watches);
}
list_add(&krule->rlist, &watch->rules);
@@ -437,9 +441,6 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
audit_add_to_parent(krule, parent);
- /* match get in audit_find_parent or audit_init_parent */
- audit_put_parent(parent);
-
h = audit_hash_ino((u32)watch->ino);
*list = &audit_inode_hash[h];
error:
@@ -496,7 +497,7 @@ static int audit_watch_handle_event(struct fsnotify_group *group,
if (mask & (FS_CREATE|FS_MOVED_TO) && inode)
audit_update_watch(parent, dname, inode->i_sb->s_dev, inode->i_ino, 0);
else if (mask & (FS_DELETE|FS_MOVED_FROM))
- audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
+ audit_update_watch(parent, dname, AUDIT_DEV_UNSET, AUDIT_INO_UNSET, 1);
else if (mask & (FS_DELETE_SELF|FS_UNMOUNT|FS_MOVE_SELF))
audit_remove_parent_watches(parent);
@@ -517,3 +518,36 @@ static int __init audit_watch_init(void)
return 0;
}
device_initcall(audit_watch_init);
+
+/*
+ * Give @new its own mark for the same path that @old's mark was created
+ * with.  The path string is duplicated so the two marks do not share it.
+ *
+ * Returns 0 on success, -ENOMEM if the path cannot be duplicated, or the
+ * error reported by audit_alloc_mark().
+ */
+int audit_dupe_exe(struct audit_krule *new, struct audit_krule *old)
+{
+	struct audit_fsnotify_mark *mark;
+	char *path_copy = kstrdup(audit_mark_path(old->exe), GFP_KERNEL);
+
+	if (!path_copy)
+		return -ENOMEM;
+
+	mark = audit_alloc_mark(new, path_copy, strlen(path_copy));
+	if (!IS_ERR(mark)) {
+		new->exe = mark;
+		return 0;
+	}
+
+	/* no mark was created, so the copy is released here */
+	kfree(path_copy);
+	return PTR_ERR(mark);
+}
+
+/*
+ * audit_exe_compare - match a task's executable against an audit mark
+ * @tsk:  task whose mm->exe_file is inspected
+ * @mark: mark holding the dev/ino to compare against
+ *
+ * Returns the result of audit_mark_compare() on the inode backing the task's
+ * exe_file, or 0 (no match) when the task has no mm or no exe_file, so that
+ * neither pointer is dereferenced while NULL.
+ */
+int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark)
+{
+	struct mm_struct *mm = tsk->mm;
+	struct file *exe_file;
+	unsigned long ino;
+	dev_t dev;
+
+	/* tsk->mm may be NULL (e.g. a kernel thread): nothing to compare */
+	if (!mm)
+		return 0;
+
+	rcu_read_lock();
+	exe_file = rcu_dereference(mm->exe_file);
+	if (!exe_file) {
+		/* an mm without a recorded executable cannot match */
+		rcu_read_unlock();
+		return 0;
+	}
+	ino = exe_file->f_inode->i_ino;
+	dev = exe_file->f_inode->i_sb->s_dev;
+	rcu_read_unlock();
+	return audit_mark_compare(mark, ino, dev);
+}
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 72e1660a79a3..7714d93edb85 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -405,6 +405,12 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f)
if (f->val > AUDIT_MAX_FIELD_COMPARE)
return -EINVAL;
break;
+ case AUDIT_EXE:
+ if (f->op != Audit_equal)
+ return -EINVAL;
+ if (entry->rule.listnr != AUDIT_FILTER_EXIT)
+ return -EINVAL;
+ break;
};
return 0;
}
@@ -419,6 +425,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
size_t remain = datasz - sizeof(struct audit_rule_data);
int i;
char *str;
+ struct audit_fsnotify_mark *audit_mark;
entry = audit_to_entry_common(data);
if (IS_ERR(entry))
@@ -539,6 +546,24 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
entry->rule.buflen += f->val;
entry->rule.filterkey = str;
break;
+ case AUDIT_EXE:
+ if (entry->rule.exe || f->val > PATH_MAX)
+ goto exit_free;
+ str = audit_unpack_string(&bufp, &remain, f->val);
+ if (IS_ERR(str)) {
+ err = PTR_ERR(str);
+ goto exit_free;
+ }
+ entry->rule.buflen += f->val;
+
+ audit_mark = audit_alloc_mark(&entry->rule, str, f->val);
+ if (IS_ERR(audit_mark)) {
+ kfree(str);
+ err = PTR_ERR(audit_mark);
+ goto exit_free;
+ }
+ entry->rule.exe = audit_mark;
+ break;
}
}
@@ -549,10 +574,10 @@ exit_nofree:
return entry;
exit_free:
- if (entry->rule.watch)
- audit_put_watch(entry->rule.watch); /* matches initial get */
if (entry->rule.tree)
audit_put_tree(entry->rule.tree); /* that's the temporary one */
+ if (entry->rule.exe)
+ audit_remove_mark(entry->rule.exe); /* that's the template one */
audit_free_rule(entry);
return ERR_PTR(err);
}
@@ -617,6 +642,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
data->buflen += data->values[i] =
audit_pack_string(&bufp, krule->filterkey);
break;
+ case AUDIT_EXE:
+ data->buflen += data->values[i] =
+ audit_pack_string(&bufp, audit_mark_path(krule->exe));
+ break;
case AUDIT_LOGINUID_SET:
if (krule->pflags & AUDIT_LOGINUID_LEGACY && !f->val) {
data->fields[i] = AUDIT_LOGINUID;
@@ -680,6 +709,12 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
if (strcmp(a->filterkey, b->filterkey))
return 1;
break;
+ case AUDIT_EXE:
+ /* both paths exist based on above type compare */
+ if (strcmp(audit_mark_path(a->exe),
+ audit_mark_path(b->exe)))
+ return 1;
+ break;
case AUDIT_UID:
case AUDIT_EUID:
case AUDIT_SUID:
@@ -801,8 +836,14 @@ struct audit_entry *audit_dupe_rule(struct audit_krule *old)
err = -ENOMEM;
else
new->filterkey = fk;
+ break;
+ case AUDIT_EXE:
+ err = audit_dupe_exe(new, old);
+ break;
}
if (err) {
+ if (new->exe)
+ audit_remove_mark(new->exe);
audit_free_rule(entry);
return ERR_PTR(err);
}
@@ -863,7 +904,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
- int err;
+ int err = 0;
#ifdef CONFIG_AUDITSYSCALL
int dont_count = 0;
@@ -881,7 +922,7 @@ static inline int audit_add_rule(struct audit_entry *entry)
/* normally audit_add_tree_rule() will free it on failure */
if (tree)
audit_put_tree(tree);
- goto error;
+ return err;
}
if (watch) {
@@ -895,14 +936,14 @@ static inline int audit_add_rule(struct audit_entry *entry)
*/
if (tree)
audit_put_tree(tree);
- goto error;
+ return err;
}
}
if (tree) {
err = audit_add_tree_rule(&entry->rule);
if (err) {
mutex_unlock(&audit_filter_mutex);
- goto error;
+ return err;
}
}
@@ -933,19 +974,13 @@ static inline int audit_add_rule(struct audit_entry *entry)
#endif
mutex_unlock(&audit_filter_mutex);
- return 0;
-
-error:
- if (watch)
- audit_put_watch(watch); /* tmp watch, matches initial get */
return err;
}
/* Remove an existing rule from filterlist. */
-static inline int audit_del_rule(struct audit_entry *entry)
+int audit_del_rule(struct audit_entry *entry)
{
struct audit_entry *e;
- struct audit_watch *watch = entry->rule.watch;
struct audit_tree *tree = entry->rule.tree;
struct list_head *list;
int ret = 0;
@@ -961,7 +996,6 @@ static inline int audit_del_rule(struct audit_entry *entry)
mutex_lock(&audit_filter_mutex);
e = audit_find_rule(entry, &list);
if (!e) {
- mutex_unlock(&audit_filter_mutex);
ret = -ENOENT;
goto out;
}
@@ -972,9 +1006,8 @@ static inline int audit_del_rule(struct audit_entry *entry)
if (e->rule.tree)
audit_remove_tree_rule(&e->rule);
- list_del_rcu(&e->list);
- list_del(&e->rule.list);
- call_rcu(&e->rcu, audit_free_rule_rcu);
+ if (e->rule.exe)
+ audit_remove_mark_rule(&e->rule);
#ifdef CONFIG_AUDITSYSCALL
if (!dont_count)
@@ -983,11 +1016,14 @@ static inline int audit_del_rule(struct audit_entry *entry)
if (!audit_match_signal(entry))
audit_signals--;
#endif
- mutex_unlock(&audit_filter_mutex);
+
+ list_del_rcu(&e->list);
+ list_del(&e->rule.list);
+ call_rcu(&e->rcu, audit_free_rule_rcu);
out:
- if (watch)
- audit_put_watch(watch); /* match initial get */
+ mutex_unlock(&audit_filter_mutex);
+
if (tree)
audit_put_tree(tree); /* that's the temporary one */
@@ -1077,8 +1113,11 @@ int audit_rule_change(int type, __u32 portid, int seq, void *data,
WARN_ON(1);
}
- if (err || type == AUDIT_DEL_RULE)
+ if (err || type == AUDIT_DEL_RULE) {
+ if (entry->rule.exe)
+ audit_remove_mark(entry->rule.exe);
audit_free_rule(entry);
+ }
return err;
}
@@ -1370,6 +1409,8 @@ static int update_lsm_rule(struct audit_krule *r)
return 0;
nentry = audit_dupe_rule(r);
+ if (entry->rule.exe)
+ audit_remove_mark(entry->rule.exe);
if (IS_ERR(nentry)) {
/* save the first error encountered for the
* return value */
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e85bdfd15fed..b86cc04959de 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -180,7 +180,7 @@ static int audit_match_filetype(struct audit_context *ctx, int val)
return 0;
list_for_each_entry(n, &ctx->names_list, list) {
- if ((n->ino != -1) &&
+ if ((n->ino != AUDIT_INO_UNSET) &&
((n->mode & S_IFMT) == mode))
return 1;
}
@@ -466,6 +466,9 @@ static int audit_filter_rules(struct task_struct *tsk,
result = audit_comparator(ctx->ppid, f->op, f->val);
}
break;
+ case AUDIT_EXE:
+ result = audit_exe_compare(tsk, rule->exe);
+ break;
case AUDIT_UID:
result = audit_uid_comparator(cred->uid, f->op, f->uid);
break;
@@ -1680,7 +1683,7 @@ static struct audit_names *audit_alloc_name(struct audit_context *context,
aname->should_free = true;
}
- aname->ino = (unsigned long)-1;
+ aname->ino = AUDIT_INO_UNSET;
aname->type = type;
list_add_tail(&aname->list, &context->names_list);
@@ -1922,7 +1925,7 @@ void __audit_inode_child(const struct inode *parent,
if (inode)
audit_copy_inode(found_child, dentry, inode);
else
- found_child->ino = (unsigned long)-1;
+ found_child->ino = AUDIT_INO_UNSET;
}
EXPORT_SYMBOL_GPL(__audit_inode_child);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f3f5cd5e2c0d..2cf0f79f1fc9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1342,7 +1342,7 @@ static int cgroup_show_options(struct seq_file *seq,
if (root != &cgrp_dfl_root)
for_each_subsys(ss, ssid)
if (root->subsys_mask & (1 << ssid))
- seq_printf(seq, ",%s", ss->legacy_name);
+ seq_show_option(seq, ss->legacy_name, NULL);
if (root->flags & CGRP_ROOT_NOPREFIX)
seq_puts(seq, ",noprefix");
if (root->flags & CGRP_ROOT_XATTR)
@@ -1350,13 +1350,14 @@ static int cgroup_show_options(struct seq_file *seq,
spin_lock(&release_agent_path_lock);
if (strlen(root->release_agent_path))
- seq_printf(seq, ",release_agent=%s", root->release_agent_path);
+ seq_show_option(seq, "release_agent",
+ root->release_agent_path);
spin_unlock(&release_agent_path_lock);
if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags))
seq_puts(seq, ",clone_children");
if (strlen(root->name))
- seq_printf(seq, ",name=%s", root->name);
+ seq_show_option(seq, "name", root->name);
return 0;
}
diff --git a/kernel/fork.c b/kernel/fork.c
index 03aa2e6de7a4..7d5f0f118a63 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -454,8 +454,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
tmp->vm_mm = mm;
if (anon_vma_fork(tmp, mpnt))
goto fail_nomem_anon_vma_fork;
- tmp->vm_flags &= ~VM_LOCKED;
+ tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
tmp->vm_next = tmp->vm_prev = NULL;
+ tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
file = tmp->vm_file;
if (file) {
struct inode *inode = file_inode(file);
diff --git a/kernel/futex.c b/kernel/futex.c
index c4a182f5357e..6e443efc65f4 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -64,6 +64,7 @@
#include <linux/hugetlb.h>
#include <linux/freezer.h>
#include <linux/bootmem.h>
+#include <linux/fault-inject.h>
#include <asm/futex.h>
@@ -258,6 +259,66 @@ static unsigned long __read_mostly futex_hashsize;
static struct futex_hash_bucket *futex_queues;
+/*
+ * Fault injections for futexes.
+ */
+#ifdef CONFIG_FAIL_FUTEX
+
+static struct {
+ struct fault_attr attr;
+
+ u32 ignore_private;
+} fail_futex = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_private = 0,
+};
+
+static int __init setup_fail_futex(char *str)
+{
+ return setup_fault_attr(&fail_futex.attr, str);
+}
+__setup("fail_futex=", setup_fail_futex);
+
+/*
+ * Decide whether a fault should be injected for this futex operation.
+ * When fail_futex.ignore_private is set, operations with @fshared false are
+ * never failed; everything else is passed to should_fail().
+ */
+static bool should_fail_futex(bool fshared)
+{
+	if (fshared || !fail_futex.ignore_private)
+		return should_fail(&fail_futex.attr, 1);
+
+	return false;
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_futex_debugfs(void)
+{
+ umode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ struct dentry *dir;
+
+ dir = fault_create_debugfs_attr("fail_futex", NULL,
+ &fail_futex.attr);
+ if (IS_ERR(dir))
+ return PTR_ERR(dir);
+
+ if (!debugfs_create_bool("ignore-private", mode, dir,
+ &fail_futex.ignore_private)) {
+ debugfs_remove_recursive(dir);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+late_initcall(fail_futex_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else
+static inline bool should_fail_futex(bool fshared)
+{
+ return false;
+}
+#endif /* CONFIG_FAIL_FUTEX */
+
static inline void futex_get_mm(union futex_key *key)
{
atomic_inc(&key->private.mm->mm_count);
@@ -413,6 +474,9 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
return -EFAULT;
+ if (unlikely(should_fail_futex(fshared)))
+ return -EFAULT;
+
/*
* PROCESS_PRIVATE futexes are fast.
* As the mm cannot disappear under us and the 'key' only needs
@@ -428,6 +492,10 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
}
again:
+ /* Ignore any VERIFY_READ mapping (futex common case) */
+ if (unlikely(should_fail_futex(fshared)))
+ return -EFAULT;
+
err = get_user_pages_fast(address, 1, 1, &page);
/*
* If write access is not required (eg. FUTEX_WAIT), try
@@ -516,7 +584,7 @@ again:
* A RO anonymous page will never change and thus doesn't make
* sense for futex operations.
*/
- if (ro) {
+ if (unlikely(should_fail_futex(fshared)) || ro) {
err = -EFAULT;
goto out;
}
@@ -974,6 +1042,9 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
{
u32 uninitialized_var(curval);
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
@@ -1015,12 +1086,18 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
if (get_futex_value_locked(&uval, uaddr))
return -EFAULT;
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
/*
* Detect deadlocks.
*/
if ((unlikely((uval & FUTEX_TID_MASK) == vpid)))
return -EDEADLK;
+ if ((unlikely(should_fail_futex(true))))
+ return -EDEADLK;
+
/*
* Lookup existing state first. If it exists, try to attach to
* its pi_state.
@@ -1155,6 +1232,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
*/
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+ if (unlikely(should_fail_futex(true)))
+ ret = -EFAULT;
+
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
ret = -EFAULT;
else if (curval != uval)
@@ -1457,6 +1537,9 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
if (get_futex_value_locked(&curval, pifutex))
return -EFAULT;
+ if (unlikely(should_fail_futex(true)))
+ return -EFAULT;
+
/*
* Find the top_waiter and determine if there are additional waiters.
* If the caller intends to requeue more than 1 waiter to pifutex,
@@ -2268,8 +2351,11 @@ static long futex_wait_restart(struct restart_block *restart)
/*
* Userspace tried a 0 -> TID atomic transition of the futex value
* and failed. The kernel side here does the whole locking operation:
- * if there are waiters then it will block, it does PI, etc. (Due to
- * races the kernel might see a 0 value of the futex too.)
+ * if there are waiters then it will block as a consequence of relying
+ * on rt-mutexes, it does PI, etc. (Due to races the kernel might see
+ * a 0 value of the futex too.).
+ *
+ * Also serves as futex trylock_pi()'ing, and due semantics.
*/
static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
ktime_t *time, int trylock)
@@ -2300,6 +2386,10 @@ retry_private:
ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
if (unlikely(ret)) {
+ /*
+ * Atomic work succeeded and we got the lock,
+ * or failed. Either way, we do _not_ block.
+ */
switch (ret) {
case 1:
/* We got the lock. */
@@ -2530,7 +2620,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
* @uaddr: the futex we initially wait on (non-pi)
* @flags: futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
- * the same type, no requeueing from private to shared, etc.
+ * the same type, no requeueing from private to shared, etc.
* @val: the expected value of uaddr
* @abs_time: absolute timeout
* @bitset: 32 bit wakeup bitset set by userspace, defaults to all
@@ -3005,6 +3095,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
cmd == FUTEX_WAIT_BITSET ||
cmd == FUTEX_WAIT_REQUEUE_PI)) {
+ if (unlikely(should_fail_futex(!(op & FUTEX_PRIVATE_FLAG))))
+ return -EFAULT;
if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
return -EFAULT;
if (!timespec_valid(&ts))
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 52ebaca1b9fc..f7dd15d537f9 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -54,7 +54,7 @@ jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
}
-static void jump_label_update(struct static_key *key, int enable);
+static void jump_label_update(struct static_key *key);
void static_key_slow_inc(struct static_key *key)
{
@@ -63,13 +63,8 @@ void static_key_slow_inc(struct static_key *key)
return;
jump_label_lock();
- if (atomic_read(&key->enabled) == 0) {
- if (!jump_label_get_branch_default(key))
- jump_label_update(key, JUMP_LABEL_ENABLE);
- else
- jump_label_update(key, JUMP_LABEL_DISABLE);
- }
- atomic_inc(&key->enabled);
+ if (atomic_inc_return(&key->enabled) == 1)
+ jump_label_update(key);
jump_label_unlock();
}
EXPORT_SYMBOL_GPL(static_key_slow_inc);
@@ -87,10 +82,7 @@ static void __static_key_slow_dec(struct static_key *key,
atomic_inc(&key->enabled);
schedule_delayed_work(work, rate_limit);
} else {
- if (!jump_label_get_branch_default(key))
- jump_label_update(key, JUMP_LABEL_DISABLE);
- else
- jump_label_update(key, JUMP_LABEL_ENABLE);
+ jump_label_update(key);
}
jump_label_unlock();
}
@@ -149,7 +141,7 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
return 0;
}
-/*
+/*
* Update code which is definitely not currently executing.
* Architectures which need heavyweight synchronization to modify
* running code can override this to make the non-live update case
@@ -158,37 +150,54 @@ static int __jump_label_text_reserved(struct jump_entry *iter_start,
void __weak __init_or_module arch_jump_label_transform_static(struct jump_entry *entry,
enum jump_label_type type)
{
- arch_jump_label_transform(entry, type);
+ arch_jump_label_transform(entry, type);
+}
+
+static inline struct jump_entry *static_key_entries(struct static_key *key)
+{
+ return (struct jump_entry *)((unsigned long)key->entries & ~JUMP_TYPE_MASK);
+}
+
+static inline bool static_key_type(struct static_key *key)
+{
+ return (unsigned long)key->entries & JUMP_TYPE_MASK;
+}
+
+static inline struct static_key *jump_entry_key(struct jump_entry *entry)
+{
+ return (struct static_key *)((unsigned long)entry->key & ~1UL);
+}
+
+static bool jump_entry_branch(struct jump_entry *entry)
+{
+ return (unsigned long)entry->key & 1UL;
+}
+
+static enum jump_label_type jump_label_type(struct jump_entry *entry)
+{
+ struct static_key *key = jump_entry_key(entry);
+ bool enabled = static_key_enabled(key);
+ bool branch = jump_entry_branch(entry);
+
+ /* See the comment in linux/jump_label.h */
+ return enabled ^ branch;
}
static void __jump_label_update(struct static_key *key,
struct jump_entry *entry,
- struct jump_entry *stop, int enable)
+ struct jump_entry *stop)
{
- for (; (entry < stop) &&
- (entry->key == (jump_label_t)(unsigned long)key);
- entry++) {
+ for (; (entry < stop) && (jump_entry_key(entry) == key); entry++) {
/*
* entry->code set to 0 invalidates module init text sections
* kernel_text_address() verifies we are not in core kernel
* init code, see jump_label_invalidate_module_init().
*/
if (entry->code && kernel_text_address(entry->code))
- arch_jump_label_transform(entry, enable);
+ arch_jump_label_transform(entry, jump_label_type(entry));
}
}
-static enum jump_label_type jump_label_type(struct static_key *key)
-{
- bool true_branch = jump_label_get_branch_default(key);
- bool state = static_key_enabled(key);
-
- if ((!true_branch && state) || (true_branch && !state))
- return JUMP_LABEL_ENABLE;
-
- return JUMP_LABEL_DISABLE;
-}
-
void __init jump_label_init(void)
{
struct jump_entry *iter_start = __start___jump_table;
@@ -202,8 +211,11 @@ void __init jump_label_init(void)
for (iter = iter_start; iter < iter_stop; iter++) {
struct static_key *iterk;
- iterk = (struct static_key *)(unsigned long)iter->key;
- arch_jump_label_transform_static(iter, jump_label_type(iterk));
+ /* rewrite NOPs */
+ if (jump_label_type(iter) == JUMP_LABEL_NOP)
+ arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
+
+ iterk = jump_entry_key(iter);
if (iterk == key)
continue;
@@ -222,6 +234,16 @@ void __init jump_label_init(void)
#ifdef CONFIG_MODULES
+static enum jump_label_type jump_label_init_type(struct jump_entry *entry)
+{
+ struct static_key *key = jump_entry_key(entry);
+ bool type = static_key_type(key);
+ bool branch = jump_entry_branch(entry);
+
+ /* See the comment in linux/jump_label.h */
+ return type ^ branch;
+}
+
struct static_key_mod {
struct static_key_mod *next;
struct jump_entry *entries;
@@ -243,17 +265,15 @@ static int __jump_label_mod_text_reserved(void *start, void *end)
start, end);
}
-static void __jump_label_mod_update(struct static_key *key, int enable)
+static void __jump_label_mod_update(struct static_key *key)
{
- struct static_key_mod *mod = key->next;
+ struct static_key_mod *mod;
- while (mod) {
+ for (mod = key->next; mod; mod = mod->next) {
struct module *m = mod->mod;
__jump_label_update(key, mod->entries,
- m->jump_entries + m->num_jump_entries,
- enable);
- mod = mod->next;
+ m->jump_entries + m->num_jump_entries);
}
}
@@ -276,7 +296,9 @@ void jump_label_apply_nops(struct module *mod)
return;
for (iter = iter_start; iter < iter_stop; iter++) {
- arch_jump_label_transform_static(iter, JUMP_LABEL_DISABLE);
+ /* Only write NOPs for arch_branch_static(). */
+ if (jump_label_init_type(iter) == JUMP_LABEL_NOP)
+ arch_jump_label_transform_static(iter, JUMP_LABEL_NOP);
}
}
@@ -297,7 +319,7 @@ static int jump_label_add_module(struct module *mod)
for (iter = iter_start; iter < iter_stop; iter++) {
struct static_key *iterk;
- iterk = (struct static_key *)(unsigned long)iter->key;
+ iterk = jump_entry_key(iter);
if (iterk == key)
continue;
@@ -318,8 +340,9 @@ static int jump_label_add_module(struct module *mod)
jlm->next = key->next;
key->next = jlm;
- if (jump_label_type(key) == JUMP_LABEL_ENABLE)
- __jump_label_update(key, iter, iter_stop, JUMP_LABEL_ENABLE);
+ /* Only update if we've changed from our initial state */
+ if (jump_label_type(iter) != jump_label_init_type(iter))
+ __jump_label_update(key, iter, iter_stop);
}
return 0;
@@ -334,10 +357,10 @@ static void jump_label_del_module(struct module *mod)
struct static_key_mod *jlm, **prev;
for (iter = iter_start; iter < iter_stop; iter++) {
- if (iter->key == (jump_label_t)(unsigned long)key)
+ if (jump_entry_key(iter) == key)
continue;
- key = (struct static_key *)(unsigned long)iter->key;
+ key = jump_entry_key(iter);
if (within_module(iter->key, mod))
continue;
@@ -439,14 +462,14 @@ int jump_label_text_reserved(void *start, void *end)
return ret;
}
-static void jump_label_update(struct static_key *key, int enable)
+static void jump_label_update(struct static_key *key)
{
struct jump_entry *stop = __stop___jump_table;
- struct jump_entry *entry = jump_label_get_entries(key);
+ struct jump_entry *entry = static_key_entries(key);
#ifdef CONFIG_MODULES
struct module *mod;
- __jump_label_mod_update(key, enable);
+ __jump_label_mod_update(key);
preempt_disable();
mod = __module_address((unsigned long)key);
@@ -456,7 +479,44 @@ static void jump_label_update(struct static_key *key, int enable)
#endif
/* if there are no users, entry can be NULL */
if (entry)
- __jump_label_update(key, entry, stop, enable);
+ __jump_label_update(key, entry, stop);
}
-#endif
+#ifdef CONFIG_STATIC_KEYS_SELFTEST
+static DEFINE_STATIC_KEY_TRUE(sk_true);
+static DEFINE_STATIC_KEY_FALSE(sk_false);
+
+static __init int jump_label_test(void)
+{
+ int i;
+
+ for (i = 0; i < 2; i++) {
+ WARN_ON(static_key_enabled(&sk_true.key) != true);
+ WARN_ON(static_key_enabled(&sk_false.key) != false);
+
+ WARN_ON(!static_branch_likely(&sk_true));
+ WARN_ON(!static_branch_unlikely(&sk_true));
+ WARN_ON(static_branch_likely(&sk_false));
+ WARN_ON(static_branch_unlikely(&sk_false));
+
+ static_branch_disable(&sk_true);
+ static_branch_enable(&sk_false);
+
+ WARN_ON(static_key_enabled(&sk_true.key) == true);
+ WARN_ON(static_key_enabled(&sk_false.key) == false);
+
+ WARN_ON(static_branch_likely(&sk_true));
+ WARN_ON(static_branch_unlikely(&sk_true));
+ WARN_ON(!static_branch_likely(&sk_false));
+ WARN_ON(!static_branch_unlikely(&sk_false));
+
+ static_branch_enable(&sk_true);
+ static_branch_disable(&sk_false);
+ }
+
+ return 0;
+}
+late_initcall(jump_label_test);
+#endif /* STATIC_KEYS_SELFTEST */
+
+#endif /* HAVE_JUMP_LABEL */
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 490924cc9e7c..9ff173dca1ae 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -248,15 +248,16 @@ static void create_kthread(struct kthread_create_info *create)
* kthread_create_on_node - create a kthread.
* @threadfn: the function to run until signal_pending(current).
* @data: data ptr for @threadfn.
- * @node: memory node number.
+ * @node: task and thread structures for the thread are allocated on this node
* @namefmt: printf-style name for the thread.
*
* Description: This helper function creates and names a kernel
* thread. The thread will be stopped: use wake_up_process() to start
- * it. See also kthread_run().
+ * it. See also kthread_run(). The new thread has SCHED_NORMAL policy and
+ * is affine to all CPUs.
*
* If thread is going to be bound on a particular cpu, give its node
- * in @node, to get NUMA affinity for kthread stack, or else give -1.
+ * in @node, to get NUMA affinity for kthread stack, or else give NUMA_NO_NODE.
* When woken, the thread will run @threadfn() with @data as its
* argument. @threadfn() can either call do_exit() directly if it is a
* standalone thread for which no one will call kthread_stop(), or
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
index 7dd5c9918e4c..8e96f6cc2a4a 100644
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,5 +1,5 @@
-obj-y += mutex.o semaphore.o rwsem.o
+obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
ifdef CONFIG_FUNCTION_TRACER
CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE)
@@ -20,11 +20,9 @@ obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o
obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
-obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o
-obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o
obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o
obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
index 652a8ee8efe9..f32567254867 100644
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -88,6 +88,19 @@ void percpu_down_read(struct percpu_rw_semaphore *brw)
__up_read(&brw->rw_sem);
}
+int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
+{
+ if (unlikely(!update_fast_ctr(brw, +1))) {
+ if (!__down_read_trylock(&brw->rw_sem))
+ return 0;
+ atomic_inc(&brw->slow_read_ctr);
+ __up_read(&brw->rw_sem);
+ }
+
+ rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_);
+ return 1;
+}
+
void percpu_up_read(struct percpu_rw_semaphore *brw)
{
rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
index 6c5da483966b..f17a3e3b3550 100644
--- a/kernel/locking/qrwlock.c
+++ b/kernel/locking/qrwlock.c
@@ -55,27 +55,29 @@ rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts)
{
while ((cnts & _QW_WMASK) == _QW_LOCKED) {
cpu_relax_lowlatency();
- cnts = smp_load_acquire((u32 *)&lock->cnts);
+ cnts = atomic_read_acquire(&lock->cnts);
}
}
/**
- * queue_read_lock_slowpath - acquire read lock of a queue rwlock
+ * queued_read_lock_slowpath - acquire read lock of a queue rwlock
* @lock: Pointer to queue rwlock structure
+ * @cnts: Current qrwlock lock value
*/
-void queue_read_lock_slowpath(struct qrwlock *lock)
+void queued_read_lock_slowpath(struct qrwlock *lock, u32 cnts)
{
- u32 cnts;
-
/*
* Readers come here when they cannot get the lock without waiting
*/
if (unlikely(in_interrupt())) {
/*
- * Readers in interrupt context will spin until the lock is
- * available without waiting in the queue.
+ * Readers in interrupt context will get the lock immediately
+ * if the writer is just waiting (not holding the lock yet).
+ * The rspin_until_writer_unlock() function returns immediately
+ * in this case. Otherwise, they will spin (with ACQUIRE
+ * semantics) until the lock is available without waiting in
+ * the queue.
*/
- cnts = smp_load_acquire((u32 *)&lock->cnts);
rspin_until_writer_unlock(lock, cnts);
return;
}
@@ -87,16 +89,11 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
arch_spin_lock(&lock->lock);
/*
- * At the head of the wait queue now, wait until the writer state
- * goes to 0 and then try to increment the reader count and get
- * the lock. It is possible that an incoming writer may steal the
- * lock in the interim, so it is necessary to check the writer byte
- * to make sure that the write lock isn't taken.
+ * The ACQUIRE semantics of the following spinning code ensure
+ * that accesses can't leak upwards out of our subsequent critical
+ * section in the case that the lock is currently held for write.
*/
- while (atomic_read(&lock->cnts) & _QW_WMASK)
- cpu_relax_lowlatency();
-
- cnts = atomic_add_return(_QR_BIAS, &lock->cnts) - _QR_BIAS;
+ cnts = atomic_add_return_acquire(_QR_BIAS, &lock->cnts) - _QR_BIAS;
rspin_until_writer_unlock(lock, cnts);
/*
@@ -104,13 +101,13 @@ void queue_read_lock_slowpath(struct qrwlock *lock)
*/
arch_spin_unlock(&lock->lock);
}
-EXPORT_SYMBOL(queue_read_lock_slowpath);
+EXPORT_SYMBOL(queued_read_lock_slowpath);
/**
- * queue_write_lock_slowpath - acquire write lock of a queue rwlock
+ * queued_write_lock_slowpath - acquire write lock of a queue rwlock
* @lock : Pointer to queue rwlock structure
*/
-void queue_write_lock_slowpath(struct qrwlock *lock)
+void queued_write_lock_slowpath(struct qrwlock *lock)
{
u32 cnts;
@@ -119,7 +116,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
/* Try to acquire the lock directly if no reader is present */
if (!atomic_read(&lock->cnts) &&
- (atomic_cmpxchg(&lock->cnts, 0, _QW_LOCKED) == 0))
+ (atomic_cmpxchg_acquire(&lock->cnts, 0, _QW_LOCKED) == 0))
goto unlock;
/*
@@ -130,7 +127,7 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
struct __qrwlock *l = (struct __qrwlock *)lock;
if (!READ_ONCE(l->wmode) &&
- (cmpxchg(&l->wmode, 0, _QW_WAITING) == 0))
+ (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0))
break;
cpu_relax_lowlatency();
@@ -140,8 +137,8 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
for (;;) {
cnts = atomic_read(&lock->cnts);
if ((cnts == _QW_WAITING) &&
- (atomic_cmpxchg(&lock->cnts, _QW_WAITING,
- _QW_LOCKED) == _QW_WAITING))
+ (atomic_cmpxchg_acquire(&lock->cnts, _QW_WAITING,
+ _QW_LOCKED) == _QW_WAITING))
break;
cpu_relax_lowlatency();
@@ -149,4 +146,4 @@ void queue_write_lock_slowpath(struct qrwlock *lock)
unlock:
arch_spin_unlock(&lock->lock);
}
-EXPORT_SYMBOL(queue_write_lock_slowpath);
+EXPORT_SYMBOL(queued_write_lock_slowpath);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 38c49202d532..337c8818541d 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -239,8 +239,8 @@ static __always_inline void set_locked(struct qspinlock *lock)
static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
-static __always_inline void __pv_kick_node(struct mcs_spinlock *node) { }
-
+static __always_inline void __pv_kick_node(struct qspinlock *lock,
+ struct mcs_spinlock *node) { }
static __always_inline void __pv_wait_head(struct qspinlock *lock,
struct mcs_spinlock *node) { }
@@ -440,7 +440,7 @@ queue:
cpu_relax();
arch_mcs_spin_unlock_contended(&next->locked);
- pv_kick_node(next);
+ pv_kick_node(lock, next);
release:
/*
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index df19ae4debd0..c8e6e9a596f5 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -22,9 +22,14 @@
#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
+/*
+ * Queue node uses: vcpu_running & vcpu_halted.
+ * Queue head uses: vcpu_running & vcpu_hashed.
+ */
enum vcpu_state {
vcpu_running = 0,
- vcpu_halted,
+ vcpu_halted, /* Used only in pv_wait_node */
+ vcpu_hashed, /* = pv_hash'ed + vcpu_halted */
};
struct pv_node {
@@ -153,7 +158,8 @@ static void pv_init_node(struct mcs_spinlock *node)
/*
* Wait for node->locked to become true, halt the vcpu after a short spin.
- * pv_kick_node() is used to wake the vcpu again.
+ * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its
+ * behalf.
*/
static void pv_wait_node(struct mcs_spinlock *node)
{
@@ -172,9 +178,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
*
* [S] pn->state = vcpu_halted [S] next->locked = 1
* MB MB
- * [L] pn->locked [RmW] pn->state = vcpu_running
+ * [L] pn->locked [RmW] pn->state = vcpu_hashed
*
- * Matches the xchg() from pv_kick_node().
+ * Matches the cmpxchg() from pv_kick_node().
*/
smp_store_mb(pn->state, vcpu_halted);
@@ -182,9 +188,10 @@ static void pv_wait_node(struct mcs_spinlock *node)
pv_wait(&pn->state, vcpu_halted);
/*
- * Reset the vCPU state to avoid unncessary CPU kicking
+ * If pv_kick_node() changed us to vcpu_hashed, retain that value
+ * so that pv_wait_head() knows to not also try to hash this lock.
*/
- WRITE_ONCE(pn->state, vcpu_running);
+ cmpxchg(&pn->state, vcpu_halted, vcpu_running);
/*
* If the locked flag is still not set after wakeup, it is a
@@ -194,6 +201,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
* MCS lock will be released soon.
*/
}
+
/*
* By now our node->locked should be 1 and our caller will not actually
* spin-wait for it. We do however rely on our caller to do a
@@ -202,24 +210,35 @@ static void pv_wait_node(struct mcs_spinlock *node)
}
/*
- * Called after setting next->locked = 1, used to wake those stuck in
- * pv_wait_node().
+ * Called after setting next->locked = 1 when we're the lock owner.
+ *
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state such
+ * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
*/
-static void pv_kick_node(struct mcs_spinlock *node)
+static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
+ struct __qspinlock *l = (void *)lock;
/*
- * Note that because node->locked is already set, this actual
- * mcs_spinlock entry could be re-used already.
+ * If the vCPU is indeed halted, advance its state to match that of
+ * pv_wait_node(). If OTOH this fails, the vCPU was running and will
+ * observe its next->locked value and advance itself.
*
- * This should be fine however, kicking people for no reason is
- * harmless.
+ * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+ */
+ if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+ return;
+
+ /*
+ * Put the lock into the hash table and set the _Q_SLOW_VAL.
*
- * See the comment in pv_wait_node().
+ * As this is the same vCPU that will check the _Q_SLOW_VAL value and
+ * the hash table later on at unlock time, no atomic instruction is
+ * needed.
*/
- if (xchg(&pn->state, vcpu_running) == vcpu_halted)
- pv_kick(pn->cpu);
+ WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+ (void)pv_hash(lock, pn);
}
/*
@@ -233,6 +252,13 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
struct qspinlock **lp = NULL;
int loop;
+ /*
+ * If pv_kick_node() already advanced our state, we don't need to
+ * insert ourselves into the hash table anymore.
+ */
+ if (READ_ONCE(pn->state) == vcpu_hashed)
+ lp = (struct qspinlock **)1;
+
for (;;) {
for (loop = SPIN_THRESHOLD; loop; loop--) {
if (!READ_ONCE(l->locked))
@@ -240,17 +266,22 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
cpu_relax();
}
- WRITE_ONCE(pn->state, vcpu_halted);
if (!lp) { /* ONCE */
+ WRITE_ONCE(pn->state, vcpu_hashed);
lp = pv_hash(lock, pn);
+
/*
- * lp must be set before setting _Q_SLOW_VAL
+ * We must hash before setting _Q_SLOW_VAL, such that
+ * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
+ * we'll be sure to be able to observe our hash entry.
*
- * [S] lp = lock [RmW] l = l->locked = 0
- * MB MB
- * [S] l->locked = _Q_SLOW_VAL [L] lp
+ * [S] pn->state
+ * [S] <hash> [Rmw] l->locked == _Q_SLOW_VAL
+ * MB RMB
+ * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash>
+ * [L] pn->state
*
- * Matches the cmpxchg() in __pv_queued_spin_unlock().
+ * Matches the smp_rmb() in __pv_queued_spin_unlock().
*/
if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
/*
@@ -287,24 +318,34 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
{
struct __qspinlock *l = (void *)lock;
struct pv_node *node;
- u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ u8 locked;
/*
* We must not unlock if SLOW, because in that case we must first
* unhash. Otherwise it would be possible to have multiple @lock
* entries, which would be BAD.
*/
- if (likely(lockval == _Q_LOCKED_VAL))
+ locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ if (likely(locked == _Q_LOCKED_VAL))
return;
- if (unlikely(lockval != _Q_SLOW_VAL)) {
- if (debug_locks_silent)
- return;
- WARN(1, "pvqspinlock: lock %p has corrupted value 0x%x!\n", lock, atomic_read(&lock->val));
+ if (unlikely(locked != _Q_SLOW_VAL)) {
+ WARN(!debug_locks_silent,
+ "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
+ (unsigned long)lock, atomic_read(&lock->val));
return;
}
/*
+ * A failed cmpxchg doesn't provide any memory-ordering guarantees,
+ * so we need a barrier to order the read of the node data in
+ * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
+ *
+ * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+ */
+ smp_rmb();
+
+ /*
* Since the above failed to release, this must be the SLOW path.
* Therefore start by looking up the blocked node and unhashing it.
*/
@@ -319,8 +360,11 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
/*
* At this point the memory pointed at by lock can be freed/reused,
* however we can still use the pv_node to kick the CPU.
+ * The other vCPU may not really be halted, but kicking an active
+ * vCPU is harmless other than the additional latency in completing
+ * the unlock.
*/
- if (READ_ONCE(node->state) == vcpu_halted)
+ if (READ_ONCE(node->state) == vcpu_hashed)
pv_kick(node->cpu);
}
/*
diff --git a/kernel/locking/rtmutex-tester.c b/kernel/locking/rtmutex-tester.c
deleted file mode 100644
index 1d96dd0d93c1..000000000000
--- a/kernel/locking/rtmutex-tester.c
+++ /dev/null
@@ -1,420 +0,0 @@
-/*
- * RT-Mutex-tester: scriptable tester for rt mutexes
- *
- * started by Thomas Gleixner:
- *
- * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
- *
- */
-#include <linux/device.h>
-#include <linux/kthread.h>
-#include <linux/export.h>
-#include <linux/sched.h>
-#include <linux/sched/rt.h>
-#include <linux/spinlock.h>
-#include <linux/timer.h>
-#include <linux/freezer.h>
-#include <linux/stat.h>
-
-#include "rtmutex.h"
-
-#define MAX_RT_TEST_THREADS 8
-#define MAX_RT_TEST_MUTEXES 8
-
-static spinlock_t rttest_lock;
-static atomic_t rttest_event;
-
-struct test_thread_data {
- int opcode;
- int opdata;
- int mutexes[MAX_RT_TEST_MUTEXES];
- int event;
- struct device dev;
-};
-
-static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
-static struct task_struct *threads[MAX_RT_TEST_THREADS];
-static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
-
-enum test_opcodes {
- RTTEST_NOP = 0,
- RTTEST_SCHEDOT, /* 1 Sched other, data = nice */
- RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */
- RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */
- RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
- RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */
- RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
- RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
- RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
- /* 9, 10 - reserved for BKL commemoration */
- RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
- RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
- RTTEST_RESET = 99, /* 99 Reset all pending operations */
-};
-
-static int handle_op(struct test_thread_data *td, int lockwakeup)
-{
- int i, id, ret = -EINVAL;
-
- switch(td->opcode) {
-
- case RTTEST_NOP:
- return 0;
-
- case RTTEST_LOCKCONT:
- td->mutexes[td->opdata] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- return 0;
-
- case RTTEST_RESET:
- for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
- if (td->mutexes[i] == 4) {
- rt_mutex_unlock(&mutexes[i]);
- td->mutexes[i] = 0;
- }
- }
- return 0;
-
- case RTTEST_RESETEVENT:
- atomic_set(&rttest_event, 0);
- return 0;
-
- default:
- if (lockwakeup)
- return ret;
- }
-
- switch(td->opcode) {
-
- case RTTEST_LOCK:
- case RTTEST_LOCKNOWAIT:
- id = td->opdata;
- if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
- return ret;
-
- td->mutexes[id] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- rt_mutex_lock(&mutexes[id]);
- td->event = atomic_add_return(1, &rttest_event);
- td->mutexes[id] = 4;
- return 0;
-
- case RTTEST_LOCKINT:
- case RTTEST_LOCKINTNOWAIT:
- id = td->opdata;
- if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
- return ret;
-
- td->mutexes[id] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
- td->event = atomic_add_return(1, &rttest_event);
- td->mutexes[id] = ret ? 0 : 4;
- return ret ? -EINTR : 0;
-
- case RTTEST_UNLOCK:
- id = td->opdata;
- if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
- return ret;
-
- td->event = atomic_add_return(1, &rttest_event);
- rt_mutex_unlock(&mutexes[id]);
- td->event = atomic_add_return(1, &rttest_event);
- td->mutexes[id] = 0;
- return 0;
-
- default:
- break;
- }
- return ret;
-}
-
-/*
- * Schedule replacement for rtsem_down(). Only called for threads with
- * PF_MUTEX_TESTER set.
- *
- * This allows us to have finegrained control over the event flow.
- *
- */
-void schedule_rt_mutex_test(struct rt_mutex *mutex)
-{
- int tid, op, dat;
- struct test_thread_data *td;
-
- /* We have to lookup the task */
- for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
- if (threads[tid] == current)
- break;
- }
-
- BUG_ON(tid == MAX_RT_TEST_THREADS);
-
- td = &thread_data[tid];
-
- op = td->opcode;
- dat = td->opdata;
-
- switch (op) {
- case RTTEST_LOCK:
- case RTTEST_LOCKINT:
- case RTTEST_LOCKNOWAIT:
- case RTTEST_LOCKINTNOWAIT:
- if (mutex != &mutexes[dat])
- break;
-
- if (td->mutexes[dat] != 1)
- break;
-
- td->mutexes[dat] = 2;
- td->event = atomic_add_return(1, &rttest_event);
- break;
-
- default:
- break;
- }
-
- schedule();
-
-
- switch (op) {
- case RTTEST_LOCK:
- case RTTEST_LOCKINT:
- if (mutex != &mutexes[dat])
- return;
-
- if (td->mutexes[dat] != 2)
- return;
-
- td->mutexes[dat] = 3;
- td->event = atomic_add_return(1, &rttest_event);
- break;
-
- case RTTEST_LOCKNOWAIT:
- case RTTEST_LOCKINTNOWAIT:
- if (mutex != &mutexes[dat])
- return;
-
- if (td->mutexes[dat] != 2)
- return;
-
- td->mutexes[dat] = 1;
- td->event = atomic_add_return(1, &rttest_event);
- return;
-
- default:
- return;
- }
-
- td->opcode = 0;
-
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (td->opcode > 0) {
- int ret;
-
- set_current_state(TASK_RUNNING);
- ret = handle_op(td, 1);
- set_current_state(TASK_INTERRUPTIBLE);
- if (td->opcode == RTTEST_LOCKCONT)
- break;
- td->opcode = ret;
- }
-
- /* Wait for the next command to be executed */
- schedule();
- }
-
- /* Restore previous command and data */
- td->opcode = op;
- td->opdata = dat;
-}
-
-static int test_func(void *data)
-{
- struct test_thread_data *td = data;
- int ret;
-
- current->flags |= PF_MUTEX_TESTER;
- set_freezable();
- allow_signal(SIGHUP);
-
- for(;;) {
-
- set_current_state(TASK_INTERRUPTIBLE);
-
- if (td->opcode > 0) {
- set_current_state(TASK_RUNNING);
- ret = handle_op(td, 0);
- set_current_state(TASK_INTERRUPTIBLE);
- td->opcode = ret;
- }
-
- /* Wait for the next command to be executed */
- schedule();
- try_to_freeze();
-
- if (signal_pending(current))
- flush_signals(current);
-
- if(kthread_should_stop())
- break;
- }
- return 0;
-}
-
-/**
- * sysfs_test_command - interface for test commands
- * @dev: thread reference
- * @buf: command for actual step
- * @count: length of buffer
- *
- * command syntax:
- *
- * opcode:data
- */
-static ssize_t sysfs_test_command(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct sched_param schedpar;
- struct test_thread_data *td;
- char cmdbuf[32];
- int op, dat, tid, ret;
-
- td = container_of(dev, struct test_thread_data, dev);
- tid = td->dev.id;
-
- /* strings from sysfs write are not 0 terminated! */
- if (count >= sizeof(cmdbuf))
- return -EINVAL;
-
- /* strip of \n: */
- if (buf[count-1] == '\n')
- count--;
- if (count < 1)
- return -EINVAL;
-
- memcpy(cmdbuf, buf, count);
- cmdbuf[count] = 0;
-
- if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
- return -EINVAL;
-
- switch (op) {
- case RTTEST_SCHEDOT:
- schedpar.sched_priority = 0;
- ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
- if (ret)
- return ret;
- set_user_nice(current, 0);
- break;
-
- case RTTEST_SCHEDRT:
- schedpar.sched_priority = dat;
- ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
- if (ret)
- return ret;
- break;
-
- case RTTEST_SIGNAL:
- send_sig(SIGHUP, threads[tid], 0);
- break;
-
- default:
- if (td->opcode > 0)
- return -EBUSY;
- td->opdata = dat;
- td->opcode = op;
- wake_up_process(threads[tid]);
- }
-
- return count;
-}
-
-/**
- * sysfs_test_status - sysfs interface for rt tester
- * @dev: thread to query
- * @buf: char buffer to be filled with thread status info
- */
-static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- struct test_thread_data *td;
- struct task_struct *tsk;
- char *curr = buf;
- int i;
-
- td = container_of(dev, struct test_thread_data, dev);
- tsk = threads[td->dev.id];
-
- spin_lock(&rttest_lock);
-
- curr += sprintf(curr,
- "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
- td->opcode, td->event, tsk->state,
- (MAX_RT_PRIO - 1) - tsk->prio,
- (MAX_RT_PRIO - 1) - tsk->normal_prio,
- tsk->pi_blocked_on);
-
- for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
- curr += sprintf(curr, "%d", td->mutexes[i]);
-
- spin_unlock(&rttest_lock);
-
- curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
- mutexes[td->dev.id].owner);
-
- return curr - buf;
-}
-
-static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
-static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
-
-static struct bus_type rttest_subsys = {
- .name = "rttest",
- .dev_name = "rttest",
-};
-
-static int init_test_thread(int id)
-{
- thread_data[id].dev.bus = &rttest_subsys;
- thread_data[id].dev.id = id;
-
- threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
- if (IS_ERR(threads[id]))
- return PTR_ERR(threads[id]);
-
- return device_register(&thread_data[id].dev);
-}
-
-static int init_rttest(void)
-{
- int ret, i;
-
- spin_lock_init(&rttest_lock);
-
- for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
- rt_mutex_init(&mutexes[i]);
-
- ret = subsys_system_register(&rttest_subsys, NULL);
- if (ret)
- return ret;
-
- for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
- ret = init_test_thread(i);
- if (ret)
- break;
- ret = device_create_file(&thread_data[i].dev, &dev_attr_status);
- if (ret)
- break;
- ret = device_create_file(&thread_data[i].dev, &dev_attr_command);
- if (ret)
- break;
- }
-
- printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
-
- return ret;
-}
-
-device_initcall(init_rttest);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 5674b073473c..7781d801212f 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1120,7 +1120,7 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
debug_rt_mutex_print_deadlock(waiter);
- schedule_rt_mutex(lock);
+ schedule();
raw_spin_lock(&lock->wait_lock);
set_current_state(state);
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
index 7844f8f0e639..4f5f83c7d2d3 100644
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -15,28 +15,6 @@
#include <linux/rtmutex.h>
/*
- * The rtmutex in kernel tester is independent of rtmutex debugging. We
- * call schedule_rt_mutex_test() instead of schedule() for the tasks which
- * belong to the tester. That way we can delay the wakeup path of those
- * threads to provoke lock stealing and testing of complex boosting scenarios.
- */
-#ifdef CONFIG_RT_MUTEX_TESTER
-
-extern void schedule_rt_mutex_test(struct rt_mutex *lock);
-
-#define schedule_rt_mutex(_lock) \
- do { \
- if (!(current->flags & PF_MUTEX_TESTER)) \
- schedule(); \
- else \
- schedule_rt_mutex_test(_lock); \
- } while (0)
-
-#else
-# define schedule_rt_mutex(_lock) schedule()
-#endif
-
-/*
* This is the control structure for tasks blocked on a rt_mutex,
* which is allocated on the kernel stack on of the blocked task.
*
diff --git a/kernel/memremap.c b/kernel/memremap.c
new file mode 100644
index 000000000000..72b0c66628b6
--- /dev/null
+++ b/kernel/memremap.c
@@ -0,0 +1,190 @@
+/*
+ * Copyright(c) 2015 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/device.h>
+#include <linux/types.h>
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/memory_hotplug.h>
+
+#ifndef ioremap_cache
+/* temporary while we convert existing ioremap_cache users to memremap */
+__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size)
+{
+ return ioremap(offset, size);
+}
+#endif
+
+/**
+ * memremap() - remap an iomem_resource as cacheable memory
+ * @offset: iomem resource start address
+ * @size: size of remap
+ * @flags: any combination of MEMREMAP_WB and MEMREMAP_WT
+ *
+ * memremap() is "ioremap" for cases where it is known that the resource
+ * being mapped does not have i/o side effects and the __iomem
+ * annotation is not applicable.
+ *
+ * MEMREMAP_WB - matches the default mapping for "System RAM" on
+ * the architecture. This is usually a read-allocate write-back cache.
+ * Moreover, if MEMREMAP_WB is specified and the requested remap region is RAM
+ * memremap() will bypass establishing a new mapping and instead return
+ * a pointer into the direct map.
+ *
+ * MEMREMAP_WT - establish a mapping whereby writes either bypass the
+ * cache or are written through to memory and never exist in a
+ * cache-dirty state with respect to program visibility. Attempts to
+ * map "System RAM" with this mapping type will fail.
+ */
+void *memremap(resource_size_t offset, size_t size, unsigned long flags)
+{
+ int is_ram = region_intersects(offset, size, "System RAM");
+ void *addr = NULL;
+
+ if (is_ram == REGION_MIXED) {
+ WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n",
+ &offset, (unsigned long) size);
+ return NULL;
+ }
+
+ /* Try all mapping types requested until one returns non-NULL */
+ if (flags & MEMREMAP_WB) {
+ flags &= ~MEMREMAP_WB;
+ /*
+ * MEMREMAP_WB is special in that it can be satisfied
+ * from the direct map. Some archs depend on the
+ * capability of memremap() to autodetect cases where
+ * the requested range is potentially in "System RAM"
+ */
+ if (is_ram == REGION_INTERSECTS)
+ addr = __va(offset);
+ else
+ addr = ioremap_cache(offset, size);
+ }
+
+ /*
+ * If we don't have a mapping yet and more request flags are
+ * pending then we will be attempting to establish a new virtual
+ * address mapping. Enforce that this mapping is not aliasing
+ * "System RAM"
+ */
+ if (!addr && is_ram == REGION_INTERSECTS && flags) {
+ WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n",
+ &offset, (unsigned long) size);
+ return NULL;
+ }
+
+ if (!addr && (flags & MEMREMAP_WT)) {
+ flags &= ~MEMREMAP_WT;
+ addr = ioremap_wt(offset, size);
+ }
+
+ return addr;
+}
+EXPORT_SYMBOL(memremap);
+
+void memunmap(void *addr)
+{
+ if (is_vmalloc_addr(addr))
+ iounmap((void __iomem *) addr);
+}
+EXPORT_SYMBOL(memunmap);
+
+static void devm_memremap_release(struct device *dev, void *res)
+{
+ memunmap(res);
+}
+
+static int devm_memremap_match(struct device *dev, void *res, void *match_data)
+{
+ return *(void **)res == match_data;
+}
+
+void *devm_memremap(struct device *dev, resource_size_t offset,
+ size_t size, unsigned long flags)
+{
+ void **ptr, *addr;
+
+ ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL);
+ if (!ptr)
+ return NULL;
+
+ addr = memremap(offset, size, flags);
+ if (addr) {
+ *ptr = addr;
+ devres_add(dev, ptr);
+ } else
+ devres_free(ptr);
+
+ return addr;
+}
+EXPORT_SYMBOL(devm_memremap);
+
+void devm_memunmap(struct device *dev, void *addr)
+{
+ WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match,
+ addr));
+ memunmap(addr);
+}
+EXPORT_SYMBOL(devm_memunmap);
+
+#ifdef CONFIG_ZONE_DEVICE
+struct page_map {
+ struct resource res;
+};
+
+static void devm_memremap_pages_release(struct device *dev, void *res)
+{
+ struct page_map *page_map = res;
+
+ /* pages are dead and unused, undo the arch mapping */
+ arch_remove_memory(page_map->res.start, resource_size(&page_map->res));
+}
+
+void *devm_memremap_pages(struct device *dev, struct resource *res)
+{
+ int is_ram = region_intersects(res->start, resource_size(res),
+ "System RAM");
+ struct page_map *page_map;
+ int error, nid;
+
+ if (is_ram == REGION_MIXED) {
+ WARN_ONCE(1, "%s attempted on mixed region %pr\n",
+ __func__, res);
+ return ERR_PTR(-ENXIO);
+ }
+
+ if (is_ram == REGION_INTERSECTS)
+ return __va(res->start);
+
+ page_map = devres_alloc(devm_memremap_pages_release,
+ sizeof(*page_map), GFP_KERNEL);
+ if (!page_map)
+ return ERR_PTR(-ENOMEM);
+
+ memcpy(&page_map->res, res, sizeof(*res));
+
+ nid = dev_to_node(dev);
+ if (nid < 0)
+ nid = 0;
+
+ error = arch_add_memory(nid, res->start, resource_size(res), true);
+ if (error) {
+ devres_free(page_map);
+ return ERR_PTR(error);
+ }
+
+ devres_add(dev, page_map);
+ return __va(res->start);
+}
+EXPORT_SYMBOL(devm_memremap_pages);
+#endif /* CONFIG_ZONE_DEVICE */
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
index be5b8fac4bd0..bd62f5cda746 100644
--- a/kernel/module_signing.c
+++ b/kernel/module_signing.c
@@ -10,11 +10,8 @@
*/
#include <linux/kernel.h>
-#include <linux/err.h>
-#include <crypto/public_key.h>
-#include <crypto/hash.h>
-#include <keys/asymmetric-type.h>
#include <keys/system_keyring.h>
+#include <crypto/public_key.h>
#include "module-internal.h"
/*
@@ -28,170 +25,22 @@
* - Information block
*/
struct module_signature {
- u8 algo; /* Public-key crypto algorithm [enum pkey_algo] */
- u8 hash; /* Digest algorithm [enum hash_algo] */
- u8 id_type; /* Key identifier type [enum pkey_id_type] */
- u8 signer_len; /* Length of signer's name */
- u8 key_id_len; /* Length of key identifier */
+ u8 algo; /* Public-key crypto algorithm [0] */
+ u8 hash; /* Digest algorithm [0] */
+ u8 id_type; /* Key identifier type [PKEY_ID_PKCS7] */
+ u8 signer_len; /* Length of signer's name [0] */
+ u8 key_id_len; /* Length of key identifier [0] */
u8 __pad[3];
__be32 sig_len; /* Length of signature data */
};
/*
- * Digest the module contents.
- */
-static struct public_key_signature *mod_make_digest(enum hash_algo hash,
- const void *mod,
- unsigned long modlen)
-{
- struct public_key_signature *pks;
- struct crypto_shash *tfm;
- struct shash_desc *desc;
- size_t digest_size, desc_size;
- int ret;
-
- pr_devel("==>%s()\n", __func__);
-
- /* Allocate the hashing algorithm we're going to need and find out how
- * big the hash operational data will be.
- */
- tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
- if (IS_ERR(tfm))
- return (PTR_ERR(tfm) == -ENOENT) ? ERR_PTR(-ENOPKG) : ERR_CAST(tfm);
-
- desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
- digest_size = crypto_shash_digestsize(tfm);
-
- /* We allocate the hash operational data storage on the end of our
- * context data and the digest output buffer on the end of that.
- */
- ret = -ENOMEM;
- pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
- if (!pks)
- goto error_no_pks;
-
- pks->pkey_hash_algo = hash;
- pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
- pks->digest_size = digest_size;
-
- desc = (void *)pks + sizeof(*pks);
- desc->tfm = tfm;
- desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
-
- ret = crypto_shash_init(desc);
- if (ret < 0)
- goto error;
-
- ret = crypto_shash_finup(desc, mod, modlen, pks->digest);
- if (ret < 0)
- goto error;
-
- crypto_free_shash(tfm);
- pr_devel("<==%s() = ok\n", __func__);
- return pks;
-
-error:
- kfree(pks);
-error_no_pks:
- crypto_free_shash(tfm);
- pr_devel("<==%s() = %d\n", __func__, ret);
- return ERR_PTR(ret);
-}
-
-/*
- * Extract an MPI array from the signature data. This represents the actual
- * signature. Each raw MPI is prefaced by a BE 2-byte value indicating the
- * size of the MPI in bytes.
- *
- * RSA signatures only have one MPI, so currently we only read one.
- */
-static int mod_extract_mpi_array(struct public_key_signature *pks,
- const void *data, size_t len)
-{
- size_t nbytes;
- MPI mpi;
-
- if (len < 3)
- return -EBADMSG;
- nbytes = ((const u8 *)data)[0] << 8 | ((const u8 *)data)[1];
- data += 2;
- len -= 2;
- if (len != nbytes)
- return -EBADMSG;
-
- mpi = mpi_read_raw_data(data, nbytes);
- if (!mpi)
- return -ENOMEM;
- pks->mpi[0] = mpi;
- pks->nr_mpi = 1;
- return 0;
-}
-
-/*
- * Request an asymmetric key.
- */
-static struct key *request_asymmetric_key(const char *signer, size_t signer_len,
- const u8 *key_id, size_t key_id_len)
-{
- key_ref_t key;
- size_t i;
- char *id, *q;
-
- pr_devel("==>%s(,%zu,,%zu)\n", __func__, signer_len, key_id_len);
-
- /* Construct an identifier. */
- id = kmalloc(signer_len + 2 + key_id_len * 2 + 1, GFP_KERNEL);
- if (!id)
- return ERR_PTR(-ENOKEY);
-
- memcpy(id, signer, signer_len);
-
- q = id + signer_len;
- *q++ = ':';
- *q++ = ' ';
- for (i = 0; i < key_id_len; i++) {
- *q++ = hex_asc[*key_id >> 4];
- *q++ = hex_asc[*key_id++ & 0x0f];
- }
-
- *q = 0;
-
- pr_debug("Look up: \"%s\"\n", id);
-
- key = keyring_search(make_key_ref(system_trusted_keyring, 1),
- &key_type_asymmetric, id);
- if (IS_ERR(key))
- pr_warn("Request for unknown module key '%s' err %ld\n",
- id, PTR_ERR(key));
- kfree(id);
-
- if (IS_ERR(key)) {
- switch (PTR_ERR(key)) {
- /* Hide some search errors */
- case -EACCES:
- case -ENOTDIR:
- case -EAGAIN:
- return ERR_PTR(-ENOKEY);
- default:
- return ERR_CAST(key);
- }
- }
-
- pr_devel("<==%s() = 0 [%x]\n", __func__, key_serial(key_ref_to_ptr(key)));
- return key_ref_to_ptr(key);
-}
-
-/*
* Verify the signature on a module.
*/
int mod_verify_sig(const void *mod, unsigned long *_modlen)
{
- struct public_key_signature *pks;
struct module_signature ms;
- struct key *key;
- const void *sig;
size_t modlen = *_modlen, sig_len;
- int ret;
pr_devel("==>%s(,%zu)\n", __func__, modlen);
@@ -205,46 +54,24 @@ int mod_verify_sig(const void *mod, unsigned long *_modlen)
if (sig_len >= modlen)
return -EBADMSG;
modlen -= sig_len;
- if ((size_t)ms.signer_len + ms.key_id_len >= modlen)
- return -EBADMSG;
- modlen -= (size_t)ms.signer_len + ms.key_id_len;
-
*_modlen = modlen;
- sig = mod + modlen;
-
- /* For the moment, only support RSA and X.509 identifiers */
- if (ms.algo != PKEY_ALGO_RSA ||
- ms.id_type != PKEY_ID_X509)
- return -ENOPKG;
- if (ms.hash >= PKEY_HASH__LAST ||
- !hash_algo_name[ms.hash])
+ if (ms.id_type != PKEY_ID_PKCS7) {
+ pr_err("Module is not signed with expected PKCS#7 message\n");
return -ENOPKG;
-
- key = request_asymmetric_key(sig, ms.signer_len,
- sig + ms.signer_len, ms.key_id_len);
- if (IS_ERR(key))
- return PTR_ERR(key);
-
- pks = mod_make_digest(ms.hash, mod, modlen);
- if (IS_ERR(pks)) {
- ret = PTR_ERR(pks);
- goto error_put_key;
}
- ret = mod_extract_mpi_array(pks, sig + ms.signer_len + ms.key_id_len,
- sig_len);
- if (ret < 0)
- goto error_free_pks;
-
- ret = verify_signature(key, pks);
- pr_devel("verify_signature() = %d\n", ret);
+ if (ms.algo != 0 ||
+ ms.hash != 0 ||
+ ms.signer_len != 0 ||
+ ms.key_id_len != 0 ||
+ ms.__pad[0] != 0 ||
+ ms.__pad[1] != 0 ||
+ ms.__pad[2] != 0) {
+ pr_err("PKCS#7 signature info has unexpected non-zero params\n");
+ return -EBADMSG;
+ }
-error_free_pks:
- mpi_free(pks->rsa.s);
- kfree(pks);
-error_put_key:
- key_put(key);
- pr_devel("<==%s() = %d\n", __func__, ret);
- return ret;
+ return system_verify_data(mod, modlen, mod + modlen, sig_len,
+ VERIFYING_MODULE_SIGNATURE);
}
diff --git a/kernel/profile.c b/kernel/profile.c
index a7bcd28d6e9f..99513e1160e5 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -339,7 +339,7 @@ static int profile_cpu_callback(struct notifier_block *info,
node = cpu_to_mem(cpu);
per_cpu(cpu_profile_flip, cpu) = 0;
if (!per_cpu(cpu_profile_hits, cpu)[1]) {
- page = alloc_pages_exact_node(node,
+ page = __alloc_pages_node(node,
GFP_KERNEL | __GFP_ZERO,
0);
if (!page)
@@ -347,7 +347,7 @@ static int profile_cpu_callback(struct notifier_block *info,
per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
}
if (!per_cpu(cpu_profile_hits, cpu)[0]) {
- page = alloc_pages_exact_node(node,
+ page = __alloc_pages_node(node,
GFP_KERNEL | __GFP_ZERO,
0);
if (!page)
@@ -543,14 +543,14 @@ static int create_hash_tables(void)
int node = cpu_to_mem(cpu);
struct page *page;
- page = alloc_pages_exact_node(node,
+ page = __alloc_pages_node(node,
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
0);
if (!page)
goto out_cleanup;
per_cpu(cpu_profile_hits, cpu)[1]
= (struct profile_hit *)page_address(page);
- page = alloc_pages_exact_node(node,
+ page = __alloc_pages_node(node,
GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE,
0);
if (!page)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c8e0e050a36a..787320de68e0 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -556,6 +556,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data)
if (data & ~(unsigned long)PTRACE_O_MASK)
return -EINVAL;
+ if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
+ if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) ||
+ !config_enabled(CONFIG_SECCOMP))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
+ current->ptrace & PT_SUSPEND_SECCOMP)
+ return -EPERM;
+ }
+
/* Avoid intermediate state when all opts are cleared */
flags = child->ptrace;
flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
diff --git a/kernel/resource.c b/kernel/resource.c
index fed052a1bc9f..f150dbbe6f62 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -492,40 +492,51 @@ int __weak page_is_ram(unsigned long pfn)
}
EXPORT_SYMBOL_GPL(page_is_ram);
-/*
- * Search for a resouce entry that fully contains the specified region.
- * If found, return 1 if it is RAM, 0 if not.
- * If not found, or region is not fully contained, return -1
+/**
+ * region_intersects() - determine intersection of region with known resources
+ * @start: region start address
+ * @size: size of region
+ * @name: name of resource (in iomem_resource)
*
- * Used by the ioremap functions to ensure the user is not remapping RAM and is
- * a vast speed up over walking through the resource table page by page.
+ * Check if the specified region partially overlaps or fully eclipses a
+ * resource identified by @name. Return REGION_DISJOINT if the region
+ * does not overlap @name, return REGION_MIXED if the region overlaps
+ * @name and another resource, and return REGION_INTERSECTS if the
+ * region overlaps @name and no other defined resource. Note, that
+ * REGION_INTERSECTS is also returned in the case when the specified
+ * region overlaps RAM and undefined memory holes.
+ *
+ * region_intersects() is used by memory remapping functions to ensure
+ * the user is not remapping RAM and is a vast speed up over walking
+ * through the resource table page by page.
*/
-int region_is_ram(resource_size_t start, unsigned long size)
+int region_intersects(resource_size_t start, size_t size, const char *name)
{
- struct resource *p;
- resource_size_t end = start + size - 1;
unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY;
- const char *name = "System RAM";
- int ret = -1;
+ resource_size_t end = start + size - 1;
+ int type = 0; int other = 0;
+ struct resource *p;
read_lock(&resource_lock);
for (p = iomem_resource.child; p ; p = p->sibling) {
- if (p->end < start)
- continue;
-
- if (p->start <= start && end <= p->end) {
- /* resource fully contains region */
- if ((p->flags != flags) || strcmp(p->name, name))
- ret = 0;
- else
- ret = 1;
- break;
- }
- if (end < p->start)
- break; /* not found */
+ bool is_type = strcmp(p->name, name) == 0 && p->flags == flags;
+
+ if (start >= p->start && start <= p->end)
+ is_type ? type++ : other++;
+ if (end >= p->start && end <= p->end)
+ is_type ? type++ : other++;
+ if (p->start >= start && p->end <= end)
+ is_type ? type++ : other++;
}
read_unlock(&resource_lock);
- return ret;
+
+ if (other == 0)
+ return type ? REGION_INTERSECTS : REGION_DISJOINT;
+
+ if (type)
+ return REGION_MIXED;
+
+ return REGION_DISJOINT;
}
void __weak arch_remove_reservations(struct resource *avail)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d8420c233ff7..3595403921bd 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -164,14 +164,12 @@ struct static_key sched_feat_keys[__SCHED_FEAT_NR] = {
static void sched_feat_disable(int i)
{
- if (static_key_enabled(&sched_feat_keys[i]))
- static_key_slow_dec(&sched_feat_keys[i]);
+ static_key_disable(&sched_feat_keys[i]);
}
static void sched_feat_enable(int i)
{
- if (!static_key_enabled(&sched_feat_keys[i]))
- static_key_slow_inc(&sched_feat_keys[i]);
+ static_key_enable(&sched_feat_keys[i]);
}
#else
static void sched_feat_disable(int i) { };
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 052e02672d12..272d9322bc5d 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -106,9 +106,10 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode, int nr)
}
EXPORT_SYMBOL_GPL(__wake_up_locked);
-void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, int nr,
+ void *key)
{
- __wake_up_common(q, mode, 1, 0, key);
+ __wake_up_common(q, mode, nr, 0, key);
}
EXPORT_SYMBOL_GPL(__wake_up_locked_key);
@@ -283,7 +284,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
if (!list_empty(&wait->task_list))
list_del_init(&wait->task_list);
else if (waitqueue_active(q))
- __wake_up_locked_key(q, mode, key);
+ __wake_up_locked_key(q, mode, 1, key);
spin_unlock_irqrestore(&q->lock, flags);
}
EXPORT_SYMBOL(abort_exclusive_wait);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 245df6b32b81..5bd4779282df 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -175,17 +175,16 @@ static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
*/
static u32 seccomp_run_filters(struct seccomp_data *sd)
{
- struct seccomp_filter *f = ACCESS_ONCE(current->seccomp.filter);
struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;
+ /* Make sure cross-thread synced filter points somewhere sane. */
+ struct seccomp_filter *f =
+ lockless_dereference(current->seccomp.filter);
/* Ensure unexpected behavior doesn't result in failing open. */
if (unlikely(WARN_ON(f == NULL)))
return SECCOMP_RET_KILL;
- /* Make sure cross-thread synced filter points somewhere sane. */
- smp_read_barrier_depends();
-
if (!sd) {
populate_seccomp_data(&sd_local);
sd = &sd_local;
@@ -549,7 +548,11 @@ void secure_computing_strict(int this_syscall)
{
int mode = current->seccomp.mode;
- if (mode == 0)
+ if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+ unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
+ return;
+
+ if (mode == SECCOMP_MODE_DISABLED)
return;
else if (mode == SECCOMP_MODE_STRICT)
__secure_computing_strict(this_syscall);
@@ -650,6 +653,10 @@ u32 seccomp_phase1(struct seccomp_data *sd)
int this_syscall = sd ? sd->nr :
syscall_get_nr(current, task_pt_regs(current));
+ if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+ unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
+ return SECCOMP_PHASE1_OK;
+
switch (mode) {
case SECCOMP_MODE_STRICT:
__secure_computing_strict(this_syscall); /* may call do_exit */
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 7c434c39f02a..a818cbc73e14 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -113,7 +113,8 @@ static int smpboot_thread_fn(void *data)
if (kthread_should_stop()) {
__set_current_state(TASK_RUNNING);
preempt_enable();
- if (ht->cleanup)
+ /* cleanup must mirror setup */
+ if (ht->cleanup && td->status != HP_THREAD_NONE)
ht->cleanup(td->cpu, cpu_online(td->cpu));
kfree(td);
return 0;
@@ -259,15 +260,6 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
{
unsigned int cpu;
- /* Unpark any threads that were voluntarily parked. */
- for_each_cpu_not(cpu, ht->cpumask) {
- if (cpu_online(cpu)) {
- struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
- if (tsk)
- kthread_unpark(tsk);
- }
- }
-
/* We need to destroy also the parked threads of offline cpus */
for_each_possible_cpu(cpu) {
struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
@@ -281,19 +273,22 @@ static void smpboot_destroy_threads(struct smp_hotplug_thread *ht)
}
/**
- * smpboot_register_percpu_thread - Register a per_cpu thread related to hotplug
+ * smpboot_register_percpu_thread_cpumask - Register a per_cpu thread related
+ * to hotplug
* @plug_thread: Hotplug thread descriptor
+ * @cpumask: The cpumask where threads run
*
* Creates and starts the threads on all online cpus.
*/
-int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
+int smpboot_register_percpu_thread_cpumask(struct smp_hotplug_thread *plug_thread,
+ const struct cpumask *cpumask)
{
unsigned int cpu;
int ret = 0;
if (!alloc_cpumask_var(&plug_thread->cpumask, GFP_KERNEL))
return -ENOMEM;
- cpumask_copy(plug_thread->cpumask, cpu_possible_mask);
+ cpumask_copy(plug_thread->cpumask, cpumask);
get_online_cpus();
mutex_lock(&smpboot_threads_lock);
@@ -301,9 +296,11 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
ret = __smpboot_create_thread(plug_thread, cpu);
if (ret) {
smpboot_destroy_threads(plug_thread);
+ free_cpumask_var(plug_thread->cpumask);
goto out;
}
- smpboot_unpark_thread(plug_thread, cpu);
+ if (cpumask_test_cpu(cpu, cpumask))
+ smpboot_unpark_thread(plug_thread, cpu);
}
list_add(&plug_thread->list, &hotplug_threads);
out:
@@ -311,7 +308,7 @@ out:
put_online_cpus();
return ret;
}
-EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
+EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread_cpumask);
/**
* smpboot_unregister_percpu_thread - Unregister a per_cpu thread related to hotplug
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index ca7d84f438f1..03c3875d9958 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -219,6 +219,7 @@ cond_syscall(compat_sys_timerfd_gettime);
cond_syscall(sys_eventfd);
cond_syscall(sys_eventfd2);
cond_syscall(sys_memfd_create);
+cond_syscall(sys_userfaultfd);
/* performance counters: */
cond_syscall(sys_perf_event_open);
diff --git a/kernel/system_certificates.S b/kernel/system_certificates.S
deleted file mode 100644
index 3e9868d47535..000000000000
--- a/kernel/system_certificates.S
+++ /dev/null
@@ -1,20 +0,0 @@
-#include <linux/export.h>
-#include <linux/init.h>
-
- __INITRODATA
-
- .align 8
- .globl VMLINUX_SYMBOL(system_certificate_list)
-VMLINUX_SYMBOL(system_certificate_list):
-__cert_list_start:
- .incbin "kernel/x509_certificate_list"
-__cert_list_end:
-
- .align 8
- .globl VMLINUX_SYMBOL(system_certificate_list_size)
-VMLINUX_SYMBOL(system_certificate_list_size):
-#ifdef CONFIG_64BIT
- .quad __cert_list_end - __cert_list_start
-#else
- .long __cert_list_end - __cert_list_start
-#endif
diff --git a/kernel/system_keyring.c b/kernel/system_keyring.c
deleted file mode 100644
index 875f64e8935b..000000000000
--- a/kernel/system_keyring.c
+++ /dev/null
@@ -1,106 +0,0 @@
-/* System trusted keyring for trusted public keys
- *
- * Copyright (C) 2012 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/export.h>
-#include <linux/kernel.h>
-#include <linux/sched.h>
-#include <linux/cred.h>
-#include <linux/err.h>
-#include <keys/asymmetric-type.h>
-#include <keys/system_keyring.h>
-#include "module-internal.h"
-
-struct key *system_trusted_keyring;
-EXPORT_SYMBOL_GPL(system_trusted_keyring);
-
-extern __initconst const u8 system_certificate_list[];
-extern __initconst const unsigned long system_certificate_list_size;
-
-/*
- * Load the compiled-in keys
- */
-static __init int system_trusted_keyring_init(void)
-{
- pr_notice("Initialise system trusted keyring\n");
-
- system_trusted_keyring =
- keyring_alloc(".system_keyring",
- KUIDT_INIT(0), KGIDT_INIT(0), current_cred(),
- ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
- KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH),
- KEY_ALLOC_NOT_IN_QUOTA, NULL);
- if (IS_ERR(system_trusted_keyring))
- panic("Can't allocate system trusted keyring\n");
-
- set_bit(KEY_FLAG_TRUSTED_ONLY, &system_trusted_keyring->flags);
- return 0;
-}
-
-/*
- * Must be initialised before we try and load the keys into the keyring.
- */
-device_initcall(system_trusted_keyring_init);
-
-/*
- * Load the compiled-in list of X.509 certificates.
- */
-static __init int load_system_certificate_list(void)
-{
- key_ref_t key;
- const u8 *p, *end;
- size_t plen;
-
- pr_notice("Loading compiled-in X.509 certificates\n");
-
- p = system_certificate_list;
- end = p + system_certificate_list_size;
- while (p < end) {
- /* Each cert begins with an ASN.1 SEQUENCE tag and must be more
- * than 256 bytes in size.
- */
- if (end - p < 4)
- goto dodgy_cert;
- if (p[0] != 0x30 &&
- p[1] != 0x82)
- goto dodgy_cert;
- plen = (p[2] << 8) | p[3];
- plen += 4;
- if (plen > end - p)
- goto dodgy_cert;
-
- key = key_create_or_update(make_key_ref(system_trusted_keyring, 1),
- "asymmetric",
- NULL,
- p,
- plen,
- ((KEY_POS_ALL & ~KEY_POS_SETATTR) |
- KEY_USR_VIEW | KEY_USR_READ),
- KEY_ALLOC_NOT_IN_QUOTA |
- KEY_ALLOC_TRUSTED);
- if (IS_ERR(key)) {
- pr_err("Problem loading in-kernel X.509 certificate (%ld)\n",
- PTR_ERR(key));
- } else {
- set_bit(KEY_FLAG_BUILTIN, &key_ref_to_ptr(key)->flags);
- pr_notice("Loaded X.509 cert '%s'\n",
- key_ref_to_ptr(key)->description);
- key_ref_put(key);
- }
- p += plen;
- }
-
- return 0;
-
-dodgy_cert:
- pr_err("Problem parsing in-kernel X.509 certificate list\n");
- return 0;
-}
-late_initcall(load_system_certificate_list);
diff --git a/kernel/task_work.c b/kernel/task_work.c
index 8727032e3a6f..53fa971d000d 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -18,6 +18,8 @@ static struct callback_head work_exited; /* all we need is ->next == NULL */
* This is like the signal handler which runs in kernel mode, but it doesn't
* try to wake up the @task.
*
+ * Note: there is no ordering guarantee on works queued here.
+ *
* RETURNS:
* 0 if succeeds or -ESRCH.
*/
@@ -108,16 +110,6 @@ void task_work_run(void)
raw_spin_unlock_wait(&task->pi_lock);
smp_mb();
- /* Reverse the list to run the works in fifo order */
- head = NULL;
- do {
- next = work->next;
- work->next = head;
- head = work;
- work = next;
- } while (work);
-
- work = head;
do {
next = work->next;
work->func(work);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eb11011b5292..b0623ac785a2 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -630,13 +630,18 @@ static int function_stat_show(struct seq_file *m, void *v)
goto out;
}
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ avg = rec->time;
+ do_div(avg, rec->counter);
+ if (tracing_thresh && (avg < tracing_thresh))
+ goto out;
+#endif
+
kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
seq_printf(m, " %-30.30s %10lu", str, rec->counter);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
seq_puts(m, " ");
- avg = rec->time;
- do_div(avg, rec->counter);
/* Sample standard deviation (s^2) */
if (rec->counter <= 1)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6260717c18e3..fc347f8b1bca 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -400,6 +400,17 @@ struct rb_irq_work {
};
/*
+ * Structure to hold event state and handle nested events.
+ */
+struct rb_event_info {
+ u64 ts;
+ u64 delta;
+ unsigned long length;
+ struct buffer_page *tail_page;
+ int add_timestamp;
+};
+
+/*
* Used for which event context the event is in.
* NMI = 0
* IRQ = 1
@@ -1876,73 +1887,6 @@ rb_event_index(struct ring_buffer_event *event)
return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
}
-static inline int
-rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- unsigned long addr = (unsigned long)event;
- unsigned long index;
-
- index = rb_event_index(event);
- addr &= PAGE_MASK;
-
- return cpu_buffer->commit_page->page == (void *)addr &&
- rb_commit_index(cpu_buffer) == index;
-}
-
-static void
-rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
-{
- unsigned long max_count;
-
- /*
- * We only race with interrupts and NMIs on this CPU.
- * If we own the commit event, then we can commit
- * all others that interrupted us, since the interruptions
- * are in stack format (they finish before they come
- * back to us). This allows us to do a simple loop to
- * assign the commit to the tail.
- */
- again:
- max_count = cpu_buffer->nr_pages * 100;
-
- while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
- if (RB_WARN_ON(cpu_buffer, !(--max_count)))
- return;
- if (RB_WARN_ON(cpu_buffer,
- rb_is_reader_page(cpu_buffer->tail_page)))
- return;
- local_set(&cpu_buffer->commit_page->page->commit,
- rb_page_write(cpu_buffer->commit_page));
- rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
- /* add barrier to keep gcc from optimizing too much */
- barrier();
- }
- while (rb_commit_index(cpu_buffer) !=
- rb_page_write(cpu_buffer->commit_page)) {
-
- local_set(&cpu_buffer->commit_page->page->commit,
- rb_page_write(cpu_buffer->commit_page));
- RB_WARN_ON(cpu_buffer,
- local_read(&cpu_buffer->commit_page->page->commit) &
- ~RB_WRITE_MASK);
- barrier();
- }
-
- /* again, keep gcc from optimizing */
- barrier();
-
- /*
- * If an interrupt came in just after the first while loop
- * and pushed the tail page forward, we will be left with
- * a dangling commit that will never go forward.
- */
- if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
- goto again;
-}
-
static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp;
@@ -1968,64 +1912,6 @@ static void rb_inc_iter(struct ring_buffer_iter *iter)
iter->head = 0;
}
-/* Slow path, do not inline */
-static noinline struct ring_buffer_event *
-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
-{
- event->type_len = RINGBUF_TYPE_TIME_EXTEND;
-
- /* Not the first event on the page? */
- if (rb_event_index(event)) {
- event->time_delta = delta & TS_MASK;
- event->array[0] = delta >> TS_SHIFT;
- } else {
- /* nope, just zero it */
- event->time_delta = 0;
- event->array[0] = 0;
- }
-
- return skip_time_extend(event);
-}
-
-/**
- * rb_update_event - update event type and data
- * @event: the event to update
- * @type: the type of event
- * @length: the size of the event field in the ring buffer
- *
- * Update the type and data fields of the event. The length
- * is the actual size that is written to the ring buffer,
- * and with this, we can determine what to place into the
- * data field.
- */
-static void
-rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event, unsigned length,
- int add_timestamp, u64 delta)
-{
- /* Only a commit updates the timestamp */
- if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
- delta = 0;
-
- /*
- * If we need to add a timestamp, then we
- * add it to the start of the resevered space.
- */
- if (unlikely(add_timestamp)) {
- event = rb_add_time_stamp(event, delta);
- length -= RB_LEN_TIME_EXTEND;
- delta = 0;
- }
-
- event->time_delta = delta;
- length -= RB_EVNT_HDR_SIZE;
- if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
- event->type_len = 0;
- event->array[0] = length;
- } else
- event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
-}
-
/*
* rb_handle_head_page - writer hit the head page
*
@@ -2184,29 +2070,13 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer,
return 0;
}
-static unsigned rb_calculate_event_length(unsigned length)
-{
- struct ring_buffer_event event; /* Used only for sizeof array */
-
- /* zero length can cause confusions */
- if (!length)
- length++;
-
- if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
- length += sizeof(event.array[0]);
-
- length += RB_EVNT_HDR_SIZE;
- length = ALIGN(length, RB_ARCH_ALIGNMENT);
-
- return length;
-}
-
static inline void
rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
- struct buffer_page *tail_page,
- unsigned long tail, unsigned long length)
+ unsigned long tail, struct rb_event_info *info)
{
+ struct buffer_page *tail_page = info->tail_page;
struct ring_buffer_event *event;
+ unsigned long length = info->length;
/*
* Only the event that crossed the page boundary
@@ -2276,13 +2146,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
*/
static noinline struct ring_buffer_event *
rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned long length, unsigned long tail,
- struct buffer_page *tail_page, u64 ts)
+ unsigned long tail, struct rb_event_info *info)
{
+ struct buffer_page *tail_page = info->tail_page;
struct buffer_page *commit_page = cpu_buffer->commit_page;
struct ring_buffer *buffer = cpu_buffer->buffer;
struct buffer_page *next_page;
int ret;
+ u64 ts;
next_page = tail_page;
@@ -2368,74 +2239,120 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
out_again:
- rb_reset_tail(cpu_buffer, tail_page, tail, length);
+ rb_reset_tail(cpu_buffer, tail, info);
/* fail and let the caller try again */
return ERR_PTR(-EAGAIN);
out_reset:
/* reset write */
- rb_reset_tail(cpu_buffer, tail_page, tail, length);
+ rb_reset_tail(cpu_buffer, tail, info);
return NULL;
}
-static struct ring_buffer_event *
-__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
- unsigned long length, u64 ts,
- u64 delta, int add_timestamp)
+/* Slow path, do not inline */
+static noinline struct ring_buffer_event *
+rb_add_time_stamp(struct ring_buffer_event *event, u64 delta)
{
- struct buffer_page *tail_page;
- struct ring_buffer_event *event;
- unsigned long tail, write;
+ event->type_len = RINGBUF_TYPE_TIME_EXTEND;
- /*
- * If the time delta since the last event is too big to
- * hold in the time field of the event, then we append a
- * TIME EXTEND event ahead of the data event.
- */
- if (unlikely(add_timestamp))
- length += RB_LEN_TIME_EXTEND;
+ /* Not the first event on the page? */
+ if (rb_event_index(event)) {
+ event->time_delta = delta & TS_MASK;
+ event->array[0] = delta >> TS_SHIFT;
+ } else {
+ /* nope, just zero it */
+ event->time_delta = 0;
+ event->array[0] = 0;
+ }
- tail_page = cpu_buffer->tail_page;
- write = local_add_return(length, &tail_page->write);
+ return skip_time_extend(event);
+}
- /* set write to only the index of the write */
- write &= RB_WRITE_MASK;
- tail = write - length;
+static inline int rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event);
+
+/**
+ * rb_update_event - update event type and data
+ * @cpu_buffer: the per-CPU buffer, used to check if @event is the commit event
+ * @event: the event to update
+ * @info: reservation info supplying length, delta and add_timestamp
+ *
+ * Update the type and data fields of the event. The length
+ * is the actual size that is written to the ring buffer,
+ * and with this, we can determine what to place into the
+ * data field.
+ */
+static void
+rb_update_event(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event,
+ struct rb_event_info *info)
+{
+ unsigned length = info->length;
+ u64 delta = info->delta;
+
+ /* Only a commit updates the timestamp */
+ if (unlikely(!rb_event_is_commit(cpu_buffer, event)))
+ delta = 0;
/*
- * If this is the first commit on the page, then it has the same
- * timestamp as the page itself.
+ * If we need to add a timestamp, then we
+ * add it to the start of the reserved space.
*/
- if (!tail)
+ if (unlikely(info->add_timestamp)) {
+ event = rb_add_time_stamp(event, delta);
+ length -= RB_LEN_TIME_EXTEND;
delta = 0;
+ }
- /* See if we shot pass the end of this buffer page */
- if (unlikely(write > BUF_PAGE_SIZE))
- return rb_move_tail(cpu_buffer, length, tail,
- tail_page, ts);
+ event->time_delta = delta;
+ length -= RB_EVNT_HDR_SIZE;
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT) {
+ event->type_len = 0;
+ event->array[0] = length;
+ } else
+ event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
+}
- /* We reserved something on the buffer */
+static unsigned rb_calculate_event_length(unsigned length)
+{
+ struct ring_buffer_event event; /* Used only for sizeof array */
- event = __rb_page_index(tail_page, tail);
- kmemcheck_annotate_bitfield(event, bitfield);
- rb_update_event(cpu_buffer, event, length, add_timestamp, delta);
+ /* zero length can cause confusions */
+ if (!length)
+ length++;
- local_inc(&tail_page->entries);
+ if (length > RB_MAX_SMALL_DATA || RB_FORCE_8BYTE_ALIGNMENT)
+ length += sizeof(event.array[0]);
+
+ length += RB_EVNT_HDR_SIZE;
+ length = ALIGN(length, RB_ARCH_ALIGNMENT);
/*
- * If this is the first commit on the page, then update
- * its timestamp.
+ * In case the time delta is larger than the 27 bits for it
+ * in the header, we need to add a timestamp. If another
+ * event comes in when trying to discard this one to increase
+ * the length, then the timestamp will be added in the allocated
+ * space of this event. If length is bigger than the size needed
+ * for the TIME_EXTEND, then padding has to be used. The events
+ * length must be either RB_LEN_TIME_EXTEND, or greater than or equal
+ * to RB_LEN_TIME_EXTEND + 8, as 8 is the minimum size for padding.
+ * As length is a multiple of 4, we only need to worry if it
+ * is 12 (RB_LEN_TIME_EXTEND + 4).
*/
- if (!tail)
- tail_page->page->time_stamp = ts;
+ if (length == RB_LEN_TIME_EXTEND + RB_ALIGNMENT)
+ length += RB_ALIGNMENT;
- /* account for these added bytes */
- local_add(length, &cpu_buffer->entries_bytes);
+ return length;
+}
- return event;
+#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+static inline bool sched_clock_stable(void)
+{
+ return true;
}
+#endif
static inline int
rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
@@ -2483,6 +2400,59 @@ static void rb_start_commit(struct ring_buffer_per_cpu *cpu_buffer)
local_inc(&cpu_buffer->commits);
}
+static void
+rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
+{
+ unsigned long max_count;
+
+ /*
+ * We only race with interrupts and NMIs on this CPU.
+ * If we own the commit event, then we can commit
+ * all others that interrupted us, since the interruptions
+ * are in stack format (they finish before they come
+ * back to us). This allows us to do a simple loop to
+ * assign the commit to the tail.
+ */
+ again:
+ max_count = cpu_buffer->nr_pages * 100;
+
+ while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
+ if (RB_WARN_ON(cpu_buffer, !(--max_count)))
+ return;
+ if (RB_WARN_ON(cpu_buffer,
+ rb_is_reader_page(cpu_buffer->tail_page)))
+ return;
+ local_set(&cpu_buffer->commit_page->page->commit,
+ rb_page_write(cpu_buffer->commit_page));
+ rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
+ cpu_buffer->write_stamp =
+ cpu_buffer->commit_page->page->time_stamp;
+ /* add barrier to keep gcc from optimizing too much */
+ barrier();
+ }
+ while (rb_commit_index(cpu_buffer) !=
+ rb_page_write(cpu_buffer->commit_page)) {
+
+ local_set(&cpu_buffer->commit_page->page->commit,
+ rb_page_write(cpu_buffer->commit_page));
+ RB_WARN_ON(cpu_buffer,
+ local_read(&cpu_buffer->commit_page->page->commit) &
+ ~RB_WRITE_MASK);
+ barrier();
+ }
+
+ /* again, keep gcc from optimizing */
+ barrier();
+
+ /*
+ * If an interrupt came in just after the first while loop
+ * and pushed the tail page forward, we will be left with
+ * a dangling commit that will never go forward.
+ */
+ if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page))
+ goto again;
+}
+
static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
{
unsigned long commits;
@@ -2515,91 +2485,94 @@ static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer)
}
}
-static struct ring_buffer_event *
-rb_reserve_next_event(struct ring_buffer *buffer,
- struct ring_buffer_per_cpu *cpu_buffer,
- unsigned long length)
+static inline void rb_event_discard(struct ring_buffer_event *event)
{
- struct ring_buffer_event *event;
- u64 ts, delta;
- int nr_loops = 0;
- int add_timestamp;
- u64 diff;
+ if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
+ event = skip_time_extend(event);
- rb_start_commit(cpu_buffer);
+ /* array[0] holds the actual length for the discarded event */
+ event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
+ event->type_len = RINGBUF_TYPE_PADDING;
+ /* time delta must be non zero */
+ if (!event->time_delta)
+ event->time_delta = 1;
+}
-#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
- /*
- * Due to the ability to swap a cpu buffer from a buffer
- * it is possible it was swapped before we committed.
- * (committing stops a swap). We check for it here and
- * if it happened, we have to fail the write.
- */
- barrier();
- if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
- local_dec(&cpu_buffer->committing);
- local_dec(&cpu_buffer->commits);
- return NULL;
- }
-#endif
+static inline int
+rb_event_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ unsigned long addr = (unsigned long)event;
+ unsigned long index;
- length = rb_calculate_event_length(length);
- again:
- add_timestamp = 0;
- delta = 0;
+ index = rb_event_index(event);
+ addr &= PAGE_MASK;
+
+ return cpu_buffer->commit_page->page == (void *)addr &&
+ rb_commit_index(cpu_buffer) == index;
+}
+
+static void
+rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ u64 delta;
/*
- * We allow for interrupts to reenter here and do a trace.
- * If one does, it will cause this original code to loop
- * back here. Even with heavy interrupts happening, this
- * should only happen a few times in a row. If this happens
- * 1000 times in a row, there must be either an interrupt
- * storm or we have something buggy.
- * Bail!
+ * The event first in the commit queue updates the
+ * time stamp.
*/
- if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
- goto out_fail;
+ if (rb_event_is_commit(cpu_buffer, event)) {
+ /*
+ * A commit event that is first on a page
+ * updates the write timestamp with the page stamp
+ */
+ if (!rb_event_index(event))
+ cpu_buffer->write_stamp =
+ cpu_buffer->commit_page->page->time_stamp;
+ else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
+ delta = event->array[0];
+ delta <<= TS_SHIFT;
+ delta += event->time_delta;
+ cpu_buffer->write_stamp += delta;
+ } else
+ cpu_buffer->write_stamp += event->time_delta;
+ }
+}
- ts = rb_time_stamp(cpu_buffer->buffer);
- diff = ts - cpu_buffer->write_stamp;
+static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event)
+{
+ local_inc(&cpu_buffer->entries);
+ rb_update_write_stamp(cpu_buffer, event);
+ rb_end_commit(cpu_buffer);
+}
- /* make sure this diff is calculated here */
- barrier();
+static __always_inline void
+rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
+{
+ bool pagebusy;
- /* Did the write stamp get updated already? */
- if (likely(ts >= cpu_buffer->write_stamp)) {
- delta = diff;
- if (unlikely(test_time_stamp(delta))) {
- int local_clock_stable = 1;
-#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
- local_clock_stable = sched_clock_stable();
-#endif
- WARN_ONCE(delta > (1ULL << 59),
- KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
- (unsigned long long)delta,
- (unsigned long long)ts,
- (unsigned long long)cpu_buffer->write_stamp,
- local_clock_stable ? "" :
- "If you just came from a suspend/resume,\n"
- "please switch to the trace global clock:\n"
- " echo global > /sys/kernel/debug/tracing/trace_clock\n");
- add_timestamp = 1;
- }
+ if (buffer->irq_work.waiters_pending) {
+ buffer->irq_work.waiters_pending = false;
+ /* irq_work_queue() supplies it's own memory barriers */
+ irq_work_queue(&buffer->irq_work.work);
}
- event = __rb_reserve_next(cpu_buffer, length, ts,
- delta, add_timestamp);
- if (unlikely(PTR_ERR(event) == -EAGAIN))
- goto again;
-
- if (!event)
- goto out_fail;
+ if (cpu_buffer->irq_work.waiters_pending) {
+ cpu_buffer->irq_work.waiters_pending = false;
+ /* irq_work_queue() supplies it's own memory barriers */
+ irq_work_queue(&cpu_buffer->irq_work.work);
+ }
- return event;
+ pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
- out_fail:
- rb_end_commit(cpu_buffer);
- return NULL;
+ if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
+ cpu_buffer->irq_work.wakeup_full = true;
+ cpu_buffer->irq_work.full_waiters_pending = false;
+ /* irq_work_queue() supplies it's own memory barriers */
+ irq_work_queue(&cpu_buffer->irq_work.work);
+ }
}
/*
@@ -2672,6 +2645,178 @@ trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
}
/**
+ * ring_buffer_unlock_commit - commit a reserved
+ * @buffer: The buffer to commit to
+ * @event: The event pointer to commit.
+ *
+ * This commits the data to the ring buffer, and releases any locks held.
+ *
+ * Must be paired with ring_buffer_lock_reserve.
+ */
+int ring_buffer_unlock_commit(struct ring_buffer *buffer,
+ struct ring_buffer_event *event)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ int cpu = raw_smp_processor_id();
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ rb_commit(cpu_buffer, event);
+
+ rb_wakeups(buffer, cpu_buffer);
+
+ trace_recursive_unlock(cpu_buffer);
+
+ preempt_enable_notrace();
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
+
+static noinline void
+rb_handle_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info)
+{
+ WARN_ONCE(info->delta > (1ULL << 59),
+ KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
+ (unsigned long long)info->delta,
+ (unsigned long long)info->ts,
+ (unsigned long long)cpu_buffer->write_stamp,
+ sched_clock_stable() ? "" :
+ "If you just came from a suspend/resume,\n"
+ "please switch to the trace global clock:\n"
+ " echo global > /sys/kernel/debug/tracing/trace_clock\n");
+ info->add_timestamp = 1;
+}
+
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+ struct rb_event_info *info)
+{
+ struct ring_buffer_event *event;
+ struct buffer_page *tail_page;
+ unsigned long tail, write;
+
+ /*
+ * If the time delta since the last event is too big to
+ * hold in the time field of the event, then we insert a
+ * TIME EXTEND event ahead of the data event.
+ */
+ if (unlikely(info->add_timestamp))
+ info->length += RB_LEN_TIME_EXTEND;
+
+ tail_page = info->tail_page = cpu_buffer->tail_page;
+ write = local_add_return(info->length, &tail_page->write);
+
+ /* set write to only the index of the write */
+ write &= RB_WRITE_MASK;
+ tail = write - info->length;
+
+ /*
+ * If this is the first commit on the page, then it has the same
+ * timestamp as the page itself.
+ */
+ if (!tail)
+ info->delta = 0;
+
+ /* See if we shot past the end of this buffer page */
+ if (unlikely(write > BUF_PAGE_SIZE))
+ return rb_move_tail(cpu_buffer, tail, info);
+
+ /* We reserved something on the buffer */
+
+ event = __rb_page_index(tail_page, tail);
+ kmemcheck_annotate_bitfield(event, bitfield);
+ rb_update_event(cpu_buffer, event, info);
+
+ local_inc(&tail_page->entries);
+
+ /*
+ * If this is the first commit on the page, then update
+ * its timestamp.
+ */
+ if (!tail)
+ tail_page->page->time_stamp = info->ts;
+
+ /* account for these added bytes */
+ local_add(info->length, &cpu_buffer->entries_bytes);
+
+ return event;
+}
+
+static struct ring_buffer_event *
+rb_reserve_next_event(struct ring_buffer *buffer,
+ struct ring_buffer_per_cpu *cpu_buffer,
+ unsigned long length)
+{
+ struct ring_buffer_event *event;
+ struct rb_event_info info;
+ int nr_loops = 0;
+ u64 diff;
+
+ rb_start_commit(cpu_buffer);
+
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
+ /*
+ * Due to the ability to swap a cpu buffer from a buffer
+ * it is possible it was swapped before we committed.
+ * (committing stops a swap). We check for it here and
+ * if it happened, we have to fail the write.
+ */
+ barrier();
+ if (unlikely(ACCESS_ONCE(cpu_buffer->buffer) != buffer)) {
+ local_dec(&cpu_buffer->committing);
+ local_dec(&cpu_buffer->commits);
+ return NULL;
+ }
+#endif
+
+ info.length = rb_calculate_event_length(length);
+ again:
+ info.add_timestamp = 0;
+ info.delta = 0;
+
+ /*
+ * We allow for interrupts to reenter here and do a trace.
+ * If one does, it will cause this original code to loop
+ * back here. Even with heavy interrupts happening, this
+ * should only happen a few times in a row. If this happens
+ * 1000 times in a row, there must be either an interrupt
+ * storm or we have something buggy.
+ * Bail!
+ */
+ if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
+ goto out_fail;
+
+ info.ts = rb_time_stamp(cpu_buffer->buffer);
+ diff = info.ts - cpu_buffer->write_stamp;
+
+ /* make sure this diff is calculated here */
+ barrier();
+
+ /* Did the write stamp get updated already? */
+ if (likely(info.ts >= cpu_buffer->write_stamp)) {
+ info.delta = diff;
+ if (unlikely(test_time_stamp(info.delta)))
+ rb_handle_timestamp(cpu_buffer, &info);
+ }
+
+ event = __rb_reserve_next(cpu_buffer, &info);
+
+ if (unlikely(PTR_ERR(event) == -EAGAIN))
+ goto again;
+
+ if (!event)
+ goto out_fail;
+
+ return event;
+
+ out_fail:
+ rb_end_commit(cpu_buffer);
+ return NULL;
+}
+
+/**
* ring_buffer_lock_reserve - reserve a part of the buffer
* @buffer: the ring buffer to reserve from
* @length: the length of the data to reserve (excluding event header)
@@ -2729,111 +2874,6 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
}
EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
-static void
-rb_update_write_stamp(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- u64 delta;
-
- /*
- * The event first in the commit queue updates the
- * time stamp.
- */
- if (rb_event_is_commit(cpu_buffer, event)) {
- /*
- * A commit event that is first on a page
- * updates the write timestamp with the page stamp
- */
- if (!rb_event_index(event))
- cpu_buffer->write_stamp =
- cpu_buffer->commit_page->page->time_stamp;
- else if (event->type_len == RINGBUF_TYPE_TIME_EXTEND) {
- delta = event->array[0];
- delta <<= TS_SHIFT;
- delta += event->time_delta;
- cpu_buffer->write_stamp += delta;
- } else
- cpu_buffer->write_stamp += event->time_delta;
- }
-}
-
-static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
- struct ring_buffer_event *event)
-{
- local_inc(&cpu_buffer->entries);
- rb_update_write_stamp(cpu_buffer, event);
- rb_end_commit(cpu_buffer);
-}
-
-static __always_inline void
-rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
-{
- bool pagebusy;
-
- if (buffer->irq_work.waiters_pending) {
- buffer->irq_work.waiters_pending = false;
- /* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&buffer->irq_work.work);
- }
-
- if (cpu_buffer->irq_work.waiters_pending) {
- cpu_buffer->irq_work.waiters_pending = false;
- /* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&cpu_buffer->irq_work.work);
- }
-
- pagebusy = cpu_buffer->reader_page == cpu_buffer->commit_page;
-
- if (!pagebusy && cpu_buffer->irq_work.full_waiters_pending) {
- cpu_buffer->irq_work.wakeup_full = true;
- cpu_buffer->irq_work.full_waiters_pending = false;
- /* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&cpu_buffer->irq_work.work);
- }
-}
-
-/**
- * ring_buffer_unlock_commit - commit a reserved
- * @buffer: The buffer to commit to
- * @event: The event pointer to commit.
- *
- * This commits the data to the ring buffer, and releases any locks held.
- *
- * Must be paired with ring_buffer_lock_reserve.
- */
-int ring_buffer_unlock_commit(struct ring_buffer *buffer,
- struct ring_buffer_event *event)
-{
- struct ring_buffer_per_cpu *cpu_buffer;
- int cpu = raw_smp_processor_id();
-
- cpu_buffer = buffer->buffers[cpu];
-
- rb_commit(cpu_buffer, event);
-
- rb_wakeups(buffer, cpu_buffer);
-
- trace_recursive_unlock(cpu_buffer);
-
- preempt_enable_notrace();
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
-
-static inline void rb_event_discard(struct ring_buffer_event *event)
-{
- if (event->type_len == RINGBUF_TYPE_TIME_EXTEND)
- event = skip_time_extend(event);
-
- /* array[0] holds the actual length for the discarded event */
- event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
- event->type_len = RINGBUF_TYPE_PADDING;
- /* time delta must be non zero */
- if (!event->time_delta)
- event->time_delta = 1;
-}
-
/*
* Decrement the entries to the page that an event is on.
* The event does not even need to exist, only the pointer
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index abcbf7ff8743..6e79408674aa 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3035,7 +3035,7 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
if (!iter)
return ERR_PTR(-ENOMEM);
- iter->buffer_iter = kzalloc(sizeof(*iter->buffer_iter) * num_possible_cpus(),
+ iter->buffer_iter = kcalloc(nr_cpu_ids, sizeof(*iter->buffer_iter),
GFP_KERNEL);
if (!iter->buffer_iter)
goto release;
@@ -6990,7 +6990,7 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
trace_init_global_iter(&iter);
for_each_tracing_cpu(cpu) {
- atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled);
+ atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 404a372ad85a..7ca09cdc20c2 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -30,6 +30,7 @@
DEFINE_MUTEX(event_mutex);
LIST_HEAD(ftrace_events);
+static LIST_HEAD(ftrace_generic_fields);
static LIST_HEAD(ftrace_common_fields);
#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
@@ -94,6 +95,10 @@ trace_find_event_field(struct trace_event_call *call, char *name)
struct ftrace_event_field *field;
struct list_head *head;
+ field = __find_event_field(&ftrace_generic_fields, name);
+ if (field)
+ return field;
+
field = __find_event_field(&ftrace_common_fields, name);
if (field)
return field;
@@ -144,6 +149,13 @@ int trace_define_field(struct trace_event_call *call, const char *type,
}
EXPORT_SYMBOL_GPL(trace_define_field);
+#define __generic_field(type, item, filter_type) \
+ ret = __trace_define_field(&ftrace_generic_fields, #type, \
+ #item, 0, 0, is_signed_type(type), \
+ filter_type); \
+ if (ret) \
+ return ret;
+
#define __common_field(type, item) \
ret = __trace_define_field(&ftrace_common_fields, #type, \
"common_" #item, \
@@ -153,6 +165,16 @@ EXPORT_SYMBOL_GPL(trace_define_field);
if (ret) \
return ret;
+static int trace_define_generic_fields(void)
+{
+ int ret;
+
+ __generic_field(int, cpu, FILTER_OTHER);
+ __generic_field(char *, comm, FILTER_PTR_STRING);
+
+ return ret;
+}
+
static int trace_define_common_fields(void)
{
int ret;
@@ -2671,6 +2693,9 @@ static __init int event_trace_init(void)
if (!entry)
pr_warn("Could not create tracefs 'available_events' entry\n");
+ if (trace_define_generic_fields())
+ pr_warn("tracing: Failed to allocated generic fields");
+
if (trace_define_common_fields())
pr_warn("tracing: Failed to allocate common fields");
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index d81d6f302b14..bd1bf184c5c9 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -252,6 +252,50 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event)
return match;
}
+/* Filter predicate for the "cpu" field: tests the running CPU id against pred->val using pred->op. */
+static int filter_pred_cpu(struct filter_pred *pred, void *event)
+{
+ int cpu, cmp;
+ int match = 0;
+
+ cpu = raw_smp_processor_id();
+ cmp = pred->val;
+
+ switch (pred->op) {
+ case OP_EQ:
+ match = cpu == cmp;
+ break;
+ case OP_LT:
+ match = cpu < cmp;
+ break;
+ case OP_LE:
+ match = cpu <= cmp;
+ break;
+ case OP_GT:
+ match = cpu > cmp;
+ break;
+ case OP_GE:
+ match = cpu >= cmp;
+ break;
+ default:
+ break;
+ }
+
+ return !!match == !pred->not;
+}
+
+/* Filter predicate for the "comm" field: matches current->comm against pred->regex, inverted by pred->not. */
+static int filter_pred_comm(struct filter_pred *pred, void *event)
+{
+ int cmp, match;
+
+ cmp = pred->regex.match(current->comm, &pred->regex,
+ pred->regex.field_len);
+ match = cmp ^ pred->not;
+
+ return match;
+}
+
static int filter_pred_none(struct filter_pred *pred, void *event)
{
return 0;
@@ -1002,7 +1046,10 @@ static int init_pred(struct filter_parse_state *ps,
if (is_string_field(field)) {
filter_build_regex(pred);
- if (field->filter_type == FILTER_STATIC_STRING) {
+ if (!strcmp(field->name, "comm")) {
+ fn = filter_pred_comm;
+ pred->regex.field_len = TASK_COMM_LEN;
+ } else if (field->filter_type == FILTER_STATIC_STRING) {
fn = filter_pred_string;
pred->regex.field_len = field->size;
} else if (field->filter_type == FILTER_DYN_STRING)
@@ -1025,7 +1072,10 @@ static int init_pred(struct filter_parse_state *ps,
}
pred->val = val;
- fn = select_comparison_fn(pred->op, field->size,
+ if (!strcmp(field->name, "cpu"))
+ fn = filter_pred_cpu;
+ else
+ fn = select_comparison_fn(pred->op, field->size,
field->is_signed);
if (!fn) {
parse_error(ps, FILT_ERR_INVALID_OP, 0);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8968bf720c12..ca98445782ac 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -715,13 +715,13 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
snprintf(nsecs_str, slen, "%03lu", nsecs_rem);
trace_seq_printf(s, ".%s", nsecs_str);
- len += strlen(nsecs_str);
+ len += strlen(nsecs_str) + 1;
}
trace_seq_puts(s, " us ");
/* Print remaining spaces to fit the row's width */
- for (i = len; i < 7; i++)
+ for (i = len; i < 8; i++)
trace_seq_putc(s, ' ');
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index dfab253727dc..8e481a84aeea 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -496,6 +496,8 @@ static const struct trace_mark {
char sym;
} mark[] = {
MARK(1000000000ULL , '$'), /* 1 sec */
+ MARK(100000000ULL , '@'), /* 100 msec */
+ MARK(10000000ULL , '*'), /* 10 msec */
MARK(1000000ULL , '#'), /* 1000 usecs */
MARK(100000ULL , '!'), /* 100 usecs */
MARK(10000ULL , '+'), /* 10 usecs */
@@ -508,7 +510,7 @@ char trace_find_mark(unsigned long long d)
int size = ARRAY_SIZE(mark);
for (i = 0; i < size; i++) {
- if (d >= mark[i].val)
+ if (d > mark[i].val)
break;
}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 3f34496244e9..b746399ab59c 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -18,12 +18,6 @@
#define STACK_TRACE_ENTRIES 500
-#ifdef CC_USING_FENTRY
-# define fentry 1
-#else
-# define fentry 0
-#endif
-
static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
{ [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
@@ -35,7 +29,7 @@ static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
*/
static struct stack_trace max_stack_trace = {
.max_entries = STACK_TRACE_ENTRIES - 1,
- .entries = &stack_dump_trace[1],
+ .entries = &stack_dump_trace[0],
};
static unsigned long max_stack_size;
@@ -55,7 +49,7 @@ static inline void print_max_stack(void)
pr_emerg(" Depth Size Location (%d entries)\n"
" ----- ---- --------\n",
- max_stack_trace.nr_entries - 1);
+ max_stack_trace.nr_entries);
for (i = 0; i < max_stack_trace.nr_entries; i++) {
if (stack_dump_trace[i] == ULONG_MAX)
@@ -77,7 +71,7 @@ check_stack(unsigned long ip, unsigned long *stack)
unsigned long this_size, flags; unsigned long *p, *top, *start;
static int tracer_frame;
int frame_size = ACCESS_ONCE(tracer_frame);
- int i;
+ int i, x;
this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
this_size = THREAD_SIZE - this_size;
@@ -105,26 +99,20 @@ check_stack(unsigned long ip, unsigned long *stack)
max_stack_size = this_size;
max_stack_trace.nr_entries = 0;
-
- if (using_ftrace_ops_list_func())
- max_stack_trace.skip = 4;
- else
- max_stack_trace.skip = 3;
+ max_stack_trace.skip = 3;
save_stack_trace(&max_stack_trace);
- /*
- * Add the passed in ip from the function tracer.
- * Searching for this on the stack will skip over
- * most of the overhead from the stack tracer itself.
- */
- stack_dump_trace[0] = ip;
- max_stack_trace.nr_entries++;
+ /* Skip over the overhead of the stack tracer itself */
+ for (i = 0; i < max_stack_trace.nr_entries; i++) {
+ if (stack_dump_trace[i] == ip)
+ break;
+ }
/*
* Now find where in the stack these are.
*/
- i = 0;
+ x = 0;
start = stack;
top = (unsigned long *)
(((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
@@ -139,12 +127,15 @@ check_stack(unsigned long ip, unsigned long *stack)
while (i < max_stack_trace.nr_entries) {
int found = 0;
- stack_dump_index[i] = this_size;
+ stack_dump_index[x] = this_size;
p = start;
for (; p < top && i < max_stack_trace.nr_entries; p++) {
+ if (stack_dump_trace[i] == ULONG_MAX)
+ break;
if (*p == stack_dump_trace[i]) {
- this_size = stack_dump_index[i++] =
+ stack_dump_trace[x] = stack_dump_trace[i++];
+ this_size = stack_dump_index[x++] =
(top - p) * sizeof(unsigned long);
found = 1;
/* Start the search from here */
@@ -156,7 +147,7 @@ check_stack(unsigned long ip, unsigned long *stack)
* out what that is, then figure it out
* now.
*/
- if (unlikely(!tracer_frame) && i == 1) {
+ if (unlikely(!tracer_frame)) {
tracer_frame = (p - stack) *
sizeof(unsigned long);
max_stack_size -= tracer_frame;
@@ -168,6 +159,10 @@ check_stack(unsigned long ip, unsigned long *stack)
i++;
}
+ max_stack_trace.nr_entries = x;
+ for (; x < i; x++)
+ stack_dump_trace[x] = ULONG_MAX;
+
if (task_stack_end_corrupted(current)) {
print_max_stack();
BUG();
@@ -192,24 +187,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
if (per_cpu(trace_active, cpu)++ != 0)
goto out;
- /*
- * When fentry is used, the traced function does not get
- * its stack frame set up, and we lose the parent.
- * The ip is pretty useless because the function tracer
- * was called before that function set up its stack frame.
- * In this case, we use the parent ip.
- *
- * By adding the return address of either the parent ip
- * or the current ip we can disregard most of the stack usage
- * caused by the stack tracer itself.
- *
- * The function tracer always reports the address of where the
- * mcount call was, but the stack will hold the return address.
- */
- if (fentry)
- ip = parent_ip;
- else
- ip += MCOUNT_INSN_SIZE;
+ ip += MCOUNT_INSN_SIZE;
check_stack(ip, &stack);
@@ -284,7 +262,7 @@ __next(struct seq_file *m, loff_t *pos)
{
long n = *pos - 1;
- if (n >= max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
+ if (n > max_stack_trace.nr_entries || stack_dump_trace[n] == ULONG_MAX)
return NULL;
m->private = (void *)n;
@@ -354,7 +332,7 @@ static int t_show(struct seq_file *m, void *v)
seq_printf(m, " Depth Size Location"
" (%d entries)\n"
" ----- ---- --------\n",
- max_stack_trace.nr_entries - 1);
+ max_stack_trace.nr_entries);
if (!stack_tracer_enabled && !max_stack_size)
print_disabled(m);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index f65a0a06a8c0..88fefa68c516 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -39,6 +39,7 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
cred->cap_inheritable = CAP_EMPTY_SET;
cred->cap_permitted = CAP_FULL_SET;
cred->cap_effective = CAP_FULL_SET;
+ cred->cap_ambient = CAP_EMPTY_SET;
cred->cap_bset = CAP_FULL_SET;
#ifdef CONFIG_KEYS
key_put(cred->request_key_auth);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a6ffa43f2993..64ed1c37bd1f 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,6 +24,7 @@
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
#include <linux/perf_event.h>
+#include <linux/kthread.h>
/*
* The run state of the lockup detectors is controlled by the content of the
@@ -66,7 +67,26 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
#define for_each_watchdog_cpu(cpu) \
for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
+/*
+ * The 'watchdog_running' variable is set to 1 when the watchdog threads
+ * are registered/started and is set to 0 when the watchdog threads are
+ * unregistered/stopped, so it is an indicator whether the threads exist.
+ */
static int __read_mostly watchdog_running;
+/*
+ * If a subsystem has a need to deactivate the watchdog temporarily, it
+ * can use the suspend/resume interface to achieve this. The content of
+ * the 'watchdog_suspended' variable reflects this state. Existing threads
+ * are parked/unparked by the lockup_detector_{suspend|resume} functions
+ * (see comment blocks pertaining to those functions for further details).
+ *
+ * 'watchdog_suspended' also prevents threads from being registered/started
+ * or unregistered/stopped via parameters in /proc/sys/kernel, so the state
+ * of 'watchdog_running' cannot change while the watchdog is deactivated
+ * temporarily (see related code in 'proc' handlers).
+ */
+static int __read_mostly watchdog_suspended;
+
static u64 __read_mostly sample_period;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
@@ -613,46 +633,9 @@ static void watchdog_nmi_disable(unsigned int cpu)
}
}
-void watchdog_nmi_enable_all(void)
-{
- int cpu;
-
- mutex_lock(&watchdog_proc_mutex);
-
- if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
- goto unlock;
-
- get_online_cpus();
- for_each_watchdog_cpu(cpu)
- watchdog_nmi_enable(cpu);
- put_online_cpus();
-
-unlock:
- mutex_unlock(&watchdog_proc_mutex);
-}
-
-void watchdog_nmi_disable_all(void)
-{
- int cpu;
-
- mutex_lock(&watchdog_proc_mutex);
-
- if (!watchdog_running)
- goto unlock;
-
- get_online_cpus();
- for_each_watchdog_cpu(cpu)
- watchdog_nmi_disable(cpu);
- put_online_cpus();
-
-unlock:
- mutex_unlock(&watchdog_proc_mutex);
-}
#else
static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
static void watchdog_nmi_disable(unsigned int cpu) { return; }
-void watchdog_nmi_enable_all(void) {}
-void watchdog_nmi_disable_all(void) {}
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
static struct smp_hotplug_thread watchdog_threads = {
@@ -666,46 +649,89 @@ static struct smp_hotplug_thread watchdog_threads = {
.unpark = watchdog_enable,
};
-static void restart_watchdog_hrtimer(void *info)
+/*
+ * park all watchdog threads that are specified in 'watchdog_cpumask'
+ */
+static int watchdog_park_threads(void)
{
- struct hrtimer *hrtimer = raw_cpu_ptr(&watchdog_hrtimer);
- int ret;
+ int cpu, ret = 0;
+ get_online_cpus();
+ for_each_watchdog_cpu(cpu) {
+ ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
+ if (ret)
+ break;
+ }
+ if (ret) {
+ for_each_watchdog_cpu(cpu)
+ kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+ }
+ put_online_cpus();
+
+ return ret;
+}
+
+/*
+ * unpark all watchdog threads that are specified in 'watchdog_cpumask'
+ */
+static void watchdog_unpark_threads(void)
+{
+ int cpu;
+
+ get_online_cpus();
+ for_each_watchdog_cpu(cpu)
+ kthread_unpark(per_cpu(softlockup_watchdog, cpu));
+ put_online_cpus();
+}
+
+/*
+ * Suspend the hard and soft lockup detector by parking the watchdog threads.
+ */
+int lockup_detector_suspend(void)
+{
+ int ret = 0;
+
+ mutex_lock(&watchdog_proc_mutex);
/*
- * No need to cancel and restart hrtimer if it is currently executing
- * because it will reprogram itself with the new period now.
- * We should never see it unqueued here because we are running per-cpu
- * with interrupts disabled.
+ * Multiple suspend requests can be active in parallel (counted by
+ * the 'watchdog_suspended' variable). If the watchdog threads are
+ * running, the first caller takes care that they will be parked.
+ * The state of 'watchdog_running' cannot change while a suspend
+ * request is active (see related code in 'proc' handlers).
*/
- ret = hrtimer_try_to_cancel(hrtimer);
- if (ret == 1)
- hrtimer_start(hrtimer, ns_to_ktime(sample_period),
- HRTIMER_MODE_REL_PINNED);
+ if (watchdog_running && !watchdog_suspended)
+ ret = watchdog_park_threads();
+
+ if (ret == 0)
+ watchdog_suspended++;
+
+ mutex_unlock(&watchdog_proc_mutex);
+
+ return ret;
}
-static void update_watchdog(int cpu)
+/*
+ * Resume the hard and soft lockup detector by unparking the watchdog threads.
+ */
+void lockup_detector_resume(void)
{
+ mutex_lock(&watchdog_proc_mutex);
+
+ watchdog_suspended--;
/*
- * Make sure that perf event counter will adopt to a new
- * sampling period. Updating the sampling period directly would
- * be much nicer but we do not have an API for that now so
- * let's use a big hammer.
- * Hrtimer will adopt the new period on the next tick but this
- * might be late already so we have to restart the timer as well.
+ * The watchdog threads are unparked if they were previously running
+ * and if there is no more active suspend request.
*/
- watchdog_nmi_disable(cpu);
- smp_call_function_single(cpu, restart_watchdog_hrtimer, NULL, 1);
- watchdog_nmi_enable(cpu);
+ if (watchdog_running && !watchdog_suspended)
+ watchdog_unpark_threads();
+
+ mutex_unlock(&watchdog_proc_mutex);
}
static void update_watchdog_all_cpus(void)
{
- int cpu;
-
- get_online_cpus();
- for_each_watchdog_cpu(cpu)
- update_watchdog(cpu);
- put_online_cpus();
+ watchdog_park_threads();
+ watchdog_unpark_threads();
}
static int watchdog_enable_all_cpus(void)
@@ -713,15 +739,12 @@ static int watchdog_enable_all_cpus(void)
int err = 0;
if (!watchdog_running) {
- err = smpboot_register_percpu_thread(&watchdog_threads);
+ err = smpboot_register_percpu_thread_cpumask(&watchdog_threads,
+ &watchdog_cpumask);
if (err)
pr_err("Failed to create watchdog threads, disabled\n");
- else {
- if (smpboot_update_cpumask_percpu_thread(
- &watchdog_threads, &watchdog_cpumask))
- pr_err("Failed to set cpumask for watchdog threads\n");
+ else
watchdog_running = 1;
- }
} else {
/*
* Enable/disable the lockup detectors or
@@ -787,6 +810,12 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write,
mutex_lock(&watchdog_proc_mutex);
+ if (watchdog_suspended) {
+ /* no parameter changes allowed while watchdog is suspended */
+ err = -EAGAIN;
+ goto out;
+ }
+
/*
* If the parameter is being read return the state of the corresponding
* bit(s) in 'watchdog_enabled', else update 'watchdog_enabled' and the
@@ -872,6 +901,12 @@ int proc_watchdog_thresh(struct ctl_table *table, int write,
mutex_lock(&watchdog_proc_mutex);
+ if (watchdog_suspended) {
+ /* no parameter changes allowed while watchdog is suspended */
+ err = -EAGAIN;
+ goto out;
+ }
+
old = ACCESS_ONCE(watchdog_thresh);
err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
@@ -903,6 +938,13 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
int err;
mutex_lock(&watchdog_proc_mutex);
+
+ if (watchdog_suspended) {
+ /* no parameter changes allowed while watchdog is suspended */
+ err = -EAGAIN;
+ goto out;
+ }
+
err = proc_do_large_bitmap(table, write, buffer, lenp, ppos);
if (!err && write) {
/* Remove impossible cpus to keep sysctl output cleaner. */
@@ -920,6 +962,7 @@ int proc_watchdog_cpumask(struct ctl_table *table, int write,
pr_err("cpumask update failed\n");
}
}
+out:
mutex_unlock(&watchdog_proc_mutex);
return err;
}
@@ -932,10 +975,8 @@ void __init lockup_detector_init(void)
#ifdef CONFIG_NO_HZ_FULL
if (tick_nohz_full_enabled()) {
- if (!cpumask_empty(tick_nohz_full_mask))
- pr_info("Disabling watchdog on nohz_full cores by default\n");
- cpumask_andnot(&watchdog_cpumask, cpu_possible_mask,
- tick_nohz_full_mask);
+ pr_info("Disabling watchdog on nohz_full cores by default\n");
+ cpumask_copy(&watchdog_cpumask, housekeeping_mask);
} else
cpumask_copy(&watchdog_cpumask, cpu_possible_mask);
#else