summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/acct.c14
-rw-r--r--kernel/async.c8
-rw-r--r--kernel/auditsc.c2
-rw-r--r--kernel/bpf/Makefile6
-rw-r--r--kernel/bpf/core.c127
-rw-r--r--kernel/bpf/syscall.c606
-rw-r--r--kernel/bpf/test_stub.c116
-rw-r--r--kernel/bpf/verifier.c1923
-rw-r--r--kernel/cgroup.c192
-rw-r--r--kernel/configs/tiny.config4
-rw-r--r--kernel/cpuset.c24
-rw-r--r--kernel/crash_dump.c1
-rw-r--r--kernel/events/core.c11
-rw-r--r--kernel/fork.c8
-rw-r--r--kernel/gcov/Kconfig2
-rw-r--r--kernel/irq/Kconfig3
-rw-r--r--kernel/irq/chip.c85
-rw-r--r--kernel/irq/devres.c2
-rw-r--r--kernel/irq/internals.h16
-rw-r--r--kernel/irq/irqdesc.c42
-rw-r--r--kernel/irq/manage.c32
-rw-r--r--kernel/irq/pm.c159
-rw-r--r--kernel/irq_work.c15
-rw-r--r--kernel/kthread.c2
-rw-r--r--kernel/module.c4
-rw-r--r--kernel/params.c17
-rw-r--r--kernel/power/Kconfig4
-rw-r--r--kernel/power/process.c1
-rw-r--r--kernel/power/snapshot.c4
-rw-r--r--kernel/power/suspend.c51
-rw-r--r--kernel/power/suspend_test.c32
-rw-r--r--kernel/printk/printk.c20
-rw-r--r--kernel/reboot.c81
-rw-r--r--kernel/resource.c70
-rw-r--r--kernel/sched/core.c12
-rw-r--r--kernel/sched/fair.c2
-rw-r--r--kernel/sched/wait.c36
-rw-r--r--kernel/seccomp.c7
-rw-r--r--kernel/sys.c489
-rw-r--r--kernel/sys_ni.c6
-rw-r--r--kernel/sysctl.c7
-rw-r--r--kernel/sysctl_binary.c1
-rw-r--r--kernel/time/tick-common.c1
-rw-r--r--kernel/time/tick-internal.h7
-rw-r--r--kernel/time/tick-sched.c60
-rw-r--r--kernel/time/timer.c2
-rw-r--r--kernel/trace/ftrace.c416
-rw-r--r--kernel/trace/ring_buffer.c2
-rw-r--r--kernel/trace/trace_events.c5
-rw-r--r--kernel/trace/trace_selftest.c51
-rw-r--r--kernel/trace/trace_syscalls.c4
-rw-r--r--kernel/watchdog.c18
52 files changed, 4124 insertions, 686 deletions
diff --git a/kernel/acct.c b/kernel/acct.c
index b4c667d22e79..33738ef972f3 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -472,7 +472,6 @@ static void do_acct_process(struct bsd_acct_struct *acct)
acct_t ac;
unsigned long flim;
const struct cred *orig_cred;
- struct pid_namespace *ns = acct->ns;
struct file *file = acct->file;
/*
@@ -500,10 +499,15 @@ static void do_acct_process(struct bsd_acct_struct *acct)
ac.ac_gid16 = ac.ac_gid;
#endif
#if ACCT_VERSION == 3
- ac.ac_pid = task_tgid_nr_ns(current, ns);
- rcu_read_lock();
- ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
- rcu_read_unlock();
+ {
+ struct pid_namespace *ns = acct->ns;
+
+ ac.ac_pid = task_tgid_nr_ns(current, ns);
+ rcu_read_lock();
+ ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
+ ns);
+ rcu_read_unlock();
+ }
#endif
/*
* Get freeze protection. If the fs is frozen, just skip the write
diff --git a/kernel/async.c b/kernel/async.c
index 61f023ce0228..4c3773c0bf63 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -115,7 +115,7 @@ static void async_run_entry_fn(struct work_struct *work)
/* 1) run (and print duration) */
if (initcall_debug && system_state == SYSTEM_BOOTING) {
- printk(KERN_DEBUG "calling %lli_%pF @ %i\n",
+ pr_debug("calling %lli_%pF @ %i\n",
(long long)entry->cookie,
entry->func, task_pid_nr(current));
calltime = ktime_get();
@@ -124,7 +124,7 @@ static void async_run_entry_fn(struct work_struct *work)
if (initcall_debug && system_state == SYSTEM_BOOTING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
- printk(KERN_DEBUG "initcall %lli_%pF returned 0 after %lld usecs\n",
+ pr_debug("initcall %lli_%pF returned 0 after %lld usecs\n",
(long long)entry->cookie,
entry->func,
(long long)ktime_to_ns(delta) >> 10);
@@ -285,7 +285,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
ktime_t uninitialized_var(starttime), delta, endtime;
if (initcall_debug && system_state == SYSTEM_BOOTING) {
- printk(KERN_DEBUG "async_waiting @ %i\n", task_pid_nr(current));
+ pr_debug("async_waiting @ %i\n", task_pid_nr(current));
starttime = ktime_get();
}
@@ -295,7 +295,7 @@ void async_synchronize_cookie_domain(async_cookie_t cookie, struct async_domain
endtime = ktime_get();
delta = ktime_sub(endtime, starttime);
- printk(KERN_DEBUG "async_continuing @ %i after %lli usec\n",
+ pr_debug("async_continuing @ %i after %lli usec\n",
task_pid_nr(current),
(long long)ktime_to_ns(delta) >> 10);
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 21eae3c05ec0..7208c1df248d 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2406,7 +2406,7 @@ int __audit_log_bprm_fcaps(struct linux_binprm *bprm,
* @new: the new credentials
* @old: the old (current) credentials
*
- * Record the aguments userspace sent to sys_capset for later printing by the
+ * Record the arguments userspace sent to sys_capset for later printing by the
* audit system if applicable
*/
void __audit_log_capset(const struct cred *new, const struct cred *old)
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
index 6a71145e2769..45427239f375 100644
--- a/kernel/bpf/Makefile
+++ b/kernel/bpf/Makefile
@@ -1 +1,5 @@
-obj-y := core.o
+obj-y := core.o syscall.o verifier.o
+
+ifdef CONFIG_TEST_BPF
+obj-y += test_stub.o
+endif
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 7f0dbcbb34af..f0c30c59b317 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -20,9 +20,14 @@
* Andi Kleen - Fix a few bad bugs and races.
* Kris Katterjohn - Added many additional checks in bpf_check_classic()
*/
+
#include <linux/filter.h>
#include <linux/skbuff.h>
+#include <linux/vmalloc.h>
+#include <linux/random.h>
+#include <linux/moduleloader.h>
#include <asm/unaligned.h>
+#include <linux/bpf.h>
/* Registers */
#define BPF_R0 regs[BPF_REG_0]
@@ -63,6 +68,105 @@ void *bpf_internal_load_pointer_neg_helper(const struct sk_buff *skb, int k, uns
return NULL;
}
+struct bpf_prog *bpf_prog_alloc(unsigned int size, gfp_t gfp_extra_flags)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
+ gfp_extra_flags;
+ struct bpf_prog_aux *aux;
+ struct bpf_prog *fp;
+
+ size = round_up(size, PAGE_SIZE);
+ fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ if (fp == NULL)
+ return NULL;
+
+ aux = kzalloc(sizeof(*aux), GFP_KERNEL | gfp_extra_flags);
+ if (aux == NULL) {
+ vfree(fp);
+ return NULL;
+ }
+
+ fp->pages = size / PAGE_SIZE;
+ fp->aux = aux;
+
+ return fp;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_alloc);
+
+struct bpf_prog *bpf_prog_realloc(struct bpf_prog *fp_old, unsigned int size,
+ gfp_t gfp_extra_flags)
+{
+ gfp_t gfp_flags = GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO |
+ gfp_extra_flags;
+ struct bpf_prog *fp;
+
+ BUG_ON(fp_old == NULL);
+
+ size = round_up(size, PAGE_SIZE);
+ if (size <= fp_old->pages * PAGE_SIZE)
+ return fp_old;
+
+ fp = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ if (fp != NULL) {
+ memcpy(fp, fp_old, fp_old->pages * PAGE_SIZE);
+ fp->pages = size / PAGE_SIZE;
+
+ /* We keep fp->aux from fp_old around in the new
+ * reallocated structure.
+ */
+ fp_old->aux = NULL;
+ __bpf_prog_free(fp_old);
+ }
+
+ return fp;
+}
+EXPORT_SYMBOL_GPL(bpf_prog_realloc);
+
+void __bpf_prog_free(struct bpf_prog *fp)
+{
+ kfree(fp->aux);
+ vfree(fp);
+}
+EXPORT_SYMBOL_GPL(__bpf_prog_free);
+
+#ifdef CONFIG_BPF_JIT
+struct bpf_binary_header *
+bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr,
+ unsigned int alignment,
+ bpf_jit_fill_hole_t bpf_fill_ill_insns)
+{
+ struct bpf_binary_header *hdr;
+ unsigned int size, hole, start;
+
+ /* Most of BPF filters are really small, but if some of them
+ * fill a page, allow at least 128 extra bytes to insert a
+ * random section of illegal instructions.
+ */
+ size = round_up(proglen + sizeof(*hdr) + 128, PAGE_SIZE);
+ hdr = module_alloc(size);
+ if (hdr == NULL)
+ return NULL;
+
+ /* Fill space with illegal/arch-dep instructions. */
+ bpf_fill_ill_insns(hdr, size);
+
+ hdr->pages = size / PAGE_SIZE;
+ hole = min_t(unsigned int, size - (proglen + sizeof(*hdr)),
+ PAGE_SIZE - sizeof(*hdr));
+ start = (prandom_u32() % hole) & ~(alignment - 1);
+
+ /* Leave a random number of instructions before BPF code. */
+ *image_ptr = &hdr->image[start];
+
+ return hdr;
+}
+
+void bpf_jit_binary_free(struct bpf_binary_header *hdr)
+{
+ module_free(NULL, hdr);
+}
+#endif /* CONFIG_BPF_JIT */
+
/* Base function for offset calculation. Needs to go into .text section,
* therefore keeping it non-static as well; will also be used by JITs
* anyway later on, so do not let the compiler omit it.
@@ -180,6 +284,7 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn)
[BPF_LD | BPF_IND | BPF_W] = &&LD_IND_W,
[BPF_LD | BPF_IND | BPF_H] = &&LD_IND_H,
[BPF_LD | BPF_IND | BPF_B] = &&LD_IND_B,
+ [BPF_LD | BPF_IMM | BPF_DW] = &&LD_IMM_DW,
};
void *ptr;
int off;
@@ -239,6 +344,10 @@ select_insn:
ALU64_MOV_K:
DST = IMM;
CONT;
+ LD_IMM_DW:
+ DST = (u64) (u32) insn[0].imm | ((u64) (u32) insn[1].imm) << 32;
+ insn++;
+ CONT;
ALU64_ARSH_X:
(*(s64 *) &DST) >>= SRC;
CONT;
@@ -523,12 +632,26 @@ void bpf_prog_select_runtime(struct bpf_prog *fp)
/* Probe if internal BPF can be JITed */
bpf_int_jit_compile(fp);
+ /* Lock whole bpf_prog as read-only */
+ bpf_prog_lock_ro(fp);
}
EXPORT_SYMBOL_GPL(bpf_prog_select_runtime);
-/* free internal BPF program */
+static void bpf_prog_free_deferred(struct work_struct *work)
+{
+ struct bpf_prog_aux *aux;
+
+ aux = container_of(work, struct bpf_prog_aux, work);
+ bpf_jit_free(aux->prog);
+}
+
+/* Free internal BPF program */
void bpf_prog_free(struct bpf_prog *fp)
{
- bpf_jit_free(fp);
+ struct bpf_prog_aux *aux = fp->aux;
+
+ INIT_WORK(&aux->work, bpf_prog_free_deferred);
+ aux->prog = fp;
+ schedule_work(&aux->work);
}
EXPORT_SYMBOL_GPL(bpf_prog_free);
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
new file mode 100644
index 000000000000..ba61c8c16032
--- /dev/null
+++ b/kernel/bpf/syscall.c
@@ -0,0 +1,606 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/bpf.h>
+#include <linux/syscalls.h>
+#include <linux/slab.h>
+#include <linux/anon_inodes.h>
+#include <linux/file.h>
+#include <linux/license.h>
+#include <linux/filter.h>
+
+static LIST_HEAD(bpf_map_types);
+
+static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
+{
+ struct bpf_map_type_list *tl;
+ struct bpf_map *map;
+
+ list_for_each_entry(tl, &bpf_map_types, list_node) {
+ if (tl->type == attr->map_type) {
+ map = tl->ops->map_alloc(attr);
+ if (IS_ERR(map))
+ return map;
+ map->ops = tl->ops;
+ map->map_type = attr->map_type;
+ return map;
+ }
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+/* boot time registration of different map implementations */
+void bpf_register_map_type(struct bpf_map_type_list *tl)
+{
+ list_add(&tl->list_node, &bpf_map_types);
+}
+
+/* called from workqueue */
+static void bpf_map_free_deferred(struct work_struct *work)
+{
+ struct bpf_map *map = container_of(work, struct bpf_map, work);
+
+ /* implementation dependent freeing */
+ map->ops->map_free(map);
+}
+
+/* decrement map refcnt and schedule it for freeing via workqueue
+ * (unrelying map implementation ops->map_free() might sleep)
+ */
+void bpf_map_put(struct bpf_map *map)
+{
+ if (atomic_dec_and_test(&map->refcnt)) {
+ INIT_WORK(&map->work, bpf_map_free_deferred);
+ schedule_work(&map->work);
+ }
+}
+
+static int bpf_map_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_map *map = filp->private_data;
+
+ bpf_map_put(map);
+ return 0;
+}
+
+static const struct file_operations bpf_map_fops = {
+ .release = bpf_map_release,
+};
+
+/* helper macro to check that unused fields 'union bpf_attr' are zero */
+#define CHECK_ATTR(CMD) \
+ memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
+ sizeof(attr->CMD##_LAST_FIELD), 0, \
+ sizeof(*attr) - \
+ offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
+ sizeof(attr->CMD##_LAST_FIELD)) != NULL
+
+#define BPF_MAP_CREATE_LAST_FIELD max_entries
+/* called via syscall */
+static int map_create(union bpf_attr *attr)
+{
+ struct bpf_map *map;
+ int err;
+
+ err = CHECK_ATTR(BPF_MAP_CREATE);
+ if (err)
+ return -EINVAL;
+
+ /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
+ map = find_and_alloc_map(attr);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ atomic_set(&map->refcnt, 1);
+
+ err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
+
+ if (err < 0)
+ /* failed to allocate fd */
+ goto free_map;
+
+ return err;
+
+free_map:
+ map->ops->map_free(map);
+ return err;
+}
+
+/* if error is returned, fd is released.
+ * On success caller should complete fd access with matching fdput()
+ */
+struct bpf_map *bpf_map_get(struct fd f)
+{
+ struct bpf_map *map;
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &bpf_map_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ map = f.file->private_data;
+
+ return map;
+}
+
+/* helper to convert user pointers passed inside __aligned_u64 fields */
+static void __user *u64_to_ptr(__u64 val)
+{
+ return (void __user *) (unsigned long) val;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
+
+static int map_lookup_elem(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ void __user *uvalue = u64_to_ptr(attr->value);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *value;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ESRCH;
+ rcu_read_lock();
+ value = map->ops->map_lookup_elem(map, key);
+ if (!value)
+ goto err_unlock;
+
+ err = -EFAULT;
+ if (copy_to_user(uvalue, value, map->value_size) != 0)
+ goto err_unlock;
+
+ err = 0;
+
+err_unlock:
+ rcu_read_unlock();
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value
+
+static int map_update_elem(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ void __user *uvalue = u64_to_ptr(attr->value);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *value;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOMEM;
+ value = kmalloc(map->value_size, GFP_USER);
+ if (!value)
+ goto free_key;
+
+ err = -EFAULT;
+ if (copy_from_user(value, uvalue, map->value_size) != 0)
+ goto free_value;
+
+ /* eBPF program that use maps are running under rcu_read_lock(),
+ * therefore all map accessors rely on this fact, so do the same here
+ */
+ rcu_read_lock();
+ err = map->ops->map_update_elem(map, key, value);
+ rcu_read_unlock();
+
+free_value:
+ kfree(value);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
+
+static int map_delete_elem(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ rcu_read_lock();
+ err = map->ops->map_delete_elem(map, key);
+ rcu_read_unlock();
+
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
+
+static int map_get_next_key(union bpf_attr *attr)
+{
+ void __user *ukey = u64_to_ptr(attr->key);
+ void __user *unext_key = u64_to_ptr(attr->next_key);
+ int ufd = attr->map_fd;
+ struct fd f = fdget(ufd);
+ struct bpf_map *map;
+ void *key, *next_key;
+ int err;
+
+ if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
+ return -EINVAL;
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map))
+ return PTR_ERR(map);
+
+ err = -ENOMEM;
+ key = kmalloc(map->key_size, GFP_USER);
+ if (!key)
+ goto err_put;
+
+ err = -EFAULT;
+ if (copy_from_user(key, ukey, map->key_size) != 0)
+ goto free_key;
+
+ err = -ENOMEM;
+ next_key = kmalloc(map->key_size, GFP_USER);
+ if (!next_key)
+ goto free_key;
+
+ rcu_read_lock();
+ err = map->ops->map_get_next_key(map, key, next_key);
+ rcu_read_unlock();
+ if (err)
+ goto free_next_key;
+
+ err = -EFAULT;
+ if (copy_to_user(unext_key, next_key, map->key_size) != 0)
+ goto free_next_key;
+
+ err = 0;
+
+free_next_key:
+ kfree(next_key);
+free_key:
+ kfree(key);
+err_put:
+ fdput(f);
+ return err;
+}
+
+static LIST_HEAD(bpf_prog_types);
+
+static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
+{
+ struct bpf_prog_type_list *tl;
+
+ list_for_each_entry(tl, &bpf_prog_types, list_node) {
+ if (tl->type == type) {
+ prog->aux->ops = tl->ops;
+ prog->aux->prog_type = type;
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+void bpf_register_prog_type(struct bpf_prog_type_list *tl)
+{
+ list_add(&tl->list_node, &bpf_prog_types);
+}
+
+/* fixup insn->imm field of bpf_call instructions:
+ * if (insn->imm == BPF_FUNC_map_lookup_elem)
+ * insn->imm = bpf_map_lookup_elem - __bpf_call_base;
+ * else if (insn->imm == BPF_FUNC_map_update_elem)
+ * insn->imm = bpf_map_update_elem - __bpf_call_base;
+ * else ...
+ *
+ * this function is called after eBPF program passed verification
+ */
+static void fixup_bpf_calls(struct bpf_prog *prog)
+{
+ const struct bpf_func_proto *fn;
+ int i;
+
+ for (i = 0; i < prog->len; i++) {
+ struct bpf_insn *insn = &prog->insnsi[i];
+
+ if (insn->code == (BPF_JMP | BPF_CALL)) {
+ /* we reach here when program has bpf_call instructions
+ * and it passed bpf_check(), means that
+ * ops->get_func_proto must have been supplied, check it
+ */
+ BUG_ON(!prog->aux->ops->get_func_proto);
+
+ fn = prog->aux->ops->get_func_proto(insn->imm);
+ /* all functions that have prototype and verifier allowed
+ * programs to call them, must be real in-kernel functions
+ */
+ BUG_ON(!fn->func);
+ insn->imm = fn->func - __bpf_call_base;
+ }
+ }
+}
+
+/* drop refcnt on maps used by eBPF program and free auxilary data */
+static void free_used_maps(struct bpf_prog_aux *aux)
+{
+ int i;
+
+ for (i = 0; i < aux->used_map_cnt; i++)
+ bpf_map_put(aux->used_maps[i]);
+
+ kfree(aux->used_maps);
+}
+
+void bpf_prog_put(struct bpf_prog *prog)
+{
+ if (atomic_dec_and_test(&prog->aux->refcnt)) {
+ free_used_maps(prog->aux);
+ bpf_prog_free(prog);
+ }
+}
+
+static int bpf_prog_release(struct inode *inode, struct file *filp)
+{
+ struct bpf_prog *prog = filp->private_data;
+
+ bpf_prog_put(prog);
+ return 0;
+}
+
+static const struct file_operations bpf_prog_fops = {
+ .release = bpf_prog_release,
+};
+
+static struct bpf_prog *get_prog(struct fd f)
+{
+ struct bpf_prog *prog;
+
+ if (!f.file)
+ return ERR_PTR(-EBADF);
+
+ if (f.file->f_op != &bpf_prog_fops) {
+ fdput(f);
+ return ERR_PTR(-EINVAL);
+ }
+
+ prog = f.file->private_data;
+
+ return prog;
+}
+
+/* called by sockets/tracing/seccomp before attaching program to an event
+ * pairs with bpf_prog_put()
+ */
+struct bpf_prog *bpf_prog_get(u32 ufd)
+{
+ struct fd f = fdget(ufd);
+ struct bpf_prog *prog;
+
+ prog = get_prog(f);
+
+ if (IS_ERR(prog))
+ return prog;
+
+ atomic_inc(&prog->aux->refcnt);
+ fdput(f);
+ return prog;
+}
+
+/* last field in 'union bpf_attr' used by this command */
+#define BPF_PROG_LOAD_LAST_FIELD log_buf
+
+static int bpf_prog_load(union bpf_attr *attr)
+{
+ enum bpf_prog_type type = attr->prog_type;
+ struct bpf_prog *prog;
+ int err;
+ char license[128];
+ bool is_gpl;
+
+ if (CHECK_ATTR(BPF_PROG_LOAD))
+ return -EINVAL;
+
+ /* copy eBPF program license from user space */
+ if (strncpy_from_user(license, u64_to_ptr(attr->license),
+ sizeof(license) - 1) < 0)
+ return -EFAULT;
+ license[sizeof(license) - 1] = 0;
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ is_gpl = license_is_gpl_compatible(license);
+
+ if (attr->insn_cnt >= BPF_MAXINSNS)
+ return -EINVAL;
+
+ /* plain bpf_prog allocation */
+ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
+ if (!prog)
+ return -ENOMEM;
+
+ prog->len = attr->insn_cnt;
+
+ err = -EFAULT;
+ if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
+ prog->len * sizeof(struct bpf_insn)) != 0)
+ goto free_prog;
+
+ prog->orig_prog = NULL;
+ prog->jited = false;
+
+ atomic_set(&prog->aux->refcnt, 1);
+ prog->aux->is_gpl_compatible = is_gpl;
+
+ /* find program type: socket_filter vs tracing_filter */
+ err = find_prog_type(type, prog);
+ if (err < 0)
+ goto free_prog;
+
+ /* run eBPF verifier */
+ err = bpf_check(prog, attr);
+
+ if (err < 0)
+ goto free_used_maps;
+
+ /* fixup BPF_CALL->imm field */
+ fixup_bpf_calls(prog);
+
+ /* eBPF program is ready to be JITed */
+ bpf_prog_select_runtime(prog);
+
+ err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
+
+ if (err < 0)
+ /* failed to allocate fd */
+ goto free_used_maps;
+
+ return err;
+
+free_used_maps:
+ free_used_maps(prog->aux);
+free_prog:
+ bpf_prog_free(prog);
+ return err;
+}
+
+SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
+{
+ union bpf_attr attr = {};
+ int err;
+
+ /* the syscall is limited to root temporarily. This restriction will be
+ * lifted when security audit is clean. Note that eBPF+tracing must have
+ * this restriction, since it may pass kernel data to user space
+ */
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (!access_ok(VERIFY_READ, uattr, 1))
+ return -EFAULT;
+
+ if (size > PAGE_SIZE) /* silly large */
+ return -E2BIG;
+
+ /* If we're handed a bigger struct than we know of,
+ * ensure all the unknown bits are 0 - i.e. new
+ * user-space does not rely on any kernel feature
+ * extensions we dont know about yet.
+ */
+ if (size > sizeof(attr)) {
+ unsigned char __user *addr;
+ unsigned char __user *end;
+ unsigned char val;
+
+ addr = (void __user *)uattr + sizeof(attr);
+ end = (void __user *)uattr + size;
+
+ for (; addr < end; addr++) {
+ err = get_user(val, addr);
+ if (err)
+ return err;
+ if (val)
+ return -E2BIG;
+ }
+ size = sizeof(attr);
+ }
+
+ /* copy attributes from user space, may be less than sizeof(bpf_attr) */
+ if (copy_from_user(&attr, uattr, size) != 0)
+ return -EFAULT;
+
+ switch (cmd) {
+ case BPF_MAP_CREATE:
+ err = map_create(&attr);
+ break;
+ case BPF_MAP_LOOKUP_ELEM:
+ err = map_lookup_elem(&attr);
+ break;
+ case BPF_MAP_UPDATE_ELEM:
+ err = map_update_elem(&attr);
+ break;
+ case BPF_MAP_DELETE_ELEM:
+ err = map_delete_elem(&attr);
+ break;
+ case BPF_MAP_GET_NEXT_KEY:
+ err = map_get_next_key(&attr);
+ break;
+ case BPF_PROG_LOAD:
+ err = bpf_prog_load(&attr);
+ break;
+ default:
+ err = -EINVAL;
+ break;
+ }
+
+ return err;
+}
diff --git a/kernel/bpf/test_stub.c b/kernel/bpf/test_stub.c
new file mode 100644
index 000000000000..fcaddff4003e
--- /dev/null
+++ b/kernel/bpf/test_stub.c
@@ -0,0 +1,116 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/err.h>
+#include <linux/bpf.h>
+
+/* test stubs for BPF_MAP_TYPE_UNSPEC and for BPF_PROG_TYPE_UNSPEC
+ * to be used by user space verifier testsuite
+ */
+struct bpf_context {
+ u64 arg1;
+ u64 arg2;
+};
+
+static u64 test_func(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ return 0;
+}
+
+static struct bpf_func_proto test_funcs[] = {
+ [BPF_FUNC_unspec] = {
+ .func = test_func,
+ .gpl_only = true,
+ .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ .arg1_type = ARG_CONST_MAP_PTR,
+ .arg2_type = ARG_PTR_TO_MAP_KEY,
+ },
+};
+
+static const struct bpf_func_proto *test_func_proto(enum bpf_func_id func_id)
+{
+ if (func_id < 0 || func_id >= ARRAY_SIZE(test_funcs))
+ return NULL;
+ return &test_funcs[func_id];
+}
+
+static const struct bpf_context_access {
+ int size;
+ enum bpf_access_type type;
+} test_ctx_access[] = {
+ [offsetof(struct bpf_context, arg1)] = {
+ FIELD_SIZEOF(struct bpf_context, arg1),
+ BPF_READ
+ },
+ [offsetof(struct bpf_context, arg2)] = {
+ FIELD_SIZEOF(struct bpf_context, arg2),
+ BPF_READ
+ },
+};
+
+static bool test_is_valid_access(int off, int size, enum bpf_access_type type)
+{
+ const struct bpf_context_access *access;
+
+ if (off < 0 || off >= ARRAY_SIZE(test_ctx_access))
+ return false;
+
+ access = &test_ctx_access[off];
+ if (access->size == size && (access->type & type))
+ return true;
+
+ return false;
+}
+
+static struct bpf_verifier_ops test_ops = {
+ .get_func_proto = test_func_proto,
+ .is_valid_access = test_is_valid_access,
+};
+
+static struct bpf_prog_type_list tl_prog = {
+ .ops = &test_ops,
+ .type = BPF_PROG_TYPE_UNSPEC,
+};
+
+static struct bpf_map *test_map_alloc(union bpf_attr *attr)
+{
+ struct bpf_map *map;
+
+ map = kzalloc(sizeof(*map), GFP_USER);
+ if (!map)
+ return ERR_PTR(-ENOMEM);
+
+ map->key_size = attr->key_size;
+ map->value_size = attr->value_size;
+ map->max_entries = attr->max_entries;
+ return map;
+}
+
+static void test_map_free(struct bpf_map *map)
+{
+ kfree(map);
+}
+
+static struct bpf_map_ops test_map_ops = {
+ .map_alloc = test_map_alloc,
+ .map_free = test_map_free,
+};
+
+static struct bpf_map_type_list tl_map = {
+ .ops = &test_map_ops,
+ .type = BPF_MAP_TYPE_UNSPEC,
+};
+
+static int __init register_test_ops(void)
+{
+ bpf_register_map_type(&tl_map);
+ bpf_register_prog_type(&tl_prog);
+ return 0;
+}
+late_initcall(register_test_ops);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
new file mode 100644
index 000000000000..801f5f3b9307
--- /dev/null
+++ b/kernel/bpf/verifier.c
@@ -0,0 +1,1923 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ */
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/bpf.h>
+#include <linux/filter.h>
+#include <net/netlink.h>
+#include <linux/file.h>
+#include <linux/vmalloc.h>
+
+/* bpf_check() is a static code analyzer that walks eBPF program
+ * instruction by instruction and updates register/stack state.
+ * All paths of conditional branches are analyzed until 'bpf_exit' insn.
+ *
+ * The first pass is depth-first-search to check that the program is a DAG.
+ * It rejects the following programs:
+ * - larger than BPF_MAXINSNS insns
+ * - if loop is present (detected via back-edge)
+ * - unreachable insns exist (shouldn't be a forest. program = one function)
+ * - out of bounds or malformed jumps
+ * The second pass is all possible path descent from the 1st insn.
+ * Since it's analyzing all pathes through the program, the length of the
+ * analysis is limited to 32k insn, which may be hit even if total number of
+ * insn is less then 4K, but there are too many branches that change stack/regs.
+ * Number of 'branches to be analyzed' is limited to 1k
+ *
+ * On entry to each instruction, each register has a type, and the instruction
+ * changes the types of the registers depending on instruction semantics.
+ * If instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then type of R5 is
+ * copied to R1.
+ *
+ * All registers are 64-bit.
+ * R0 - return register
+ * R1-R5 argument passing registers
+ * R6-R9 callee saved registers
+ * R10 - frame pointer read-only
+ *
+ * At the start of BPF program the register R1 contains a pointer to bpf_context
+ * and has type PTR_TO_CTX.
+ *
+ * Verifier tracks arithmetic operations on pointers in case:
+ * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
+ * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20),
+ * 1st insn copies R10 (which has FRAME_PTR) type into R1
+ * and 2nd arithmetic instruction is pattern matched to recognize
+ * that it wants to construct a pointer to some element within stack.
+ * So after 2nd insn, the register R1 has type PTR_TO_STACK
+ * (and -20 constant is saved for further stack bounds checking).
+ * Meaning that this reg is a pointer to stack plus known immediate constant.
+ *
+ * Most of the time the registers have UNKNOWN_VALUE type, which
+ * means the register has some value, but it's not a valid pointer.
+ * (like pointer plus pointer becomes UNKNOWN_VALUE type)
+ *
+ * When verifier sees load or store instructions the type of base register
+ * can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are three pointer
+ * types recognized by check_mem_access() function.
+ *
+ * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
+ * and the range of [ptr, ptr + map's value_size) is accessible.
+ *
+ * registers used to pass values to function calls are checked against
+ * function argument constraints.
+ *
+ * ARG_PTR_TO_MAP_KEY is one of such argument constraints.
+ * It means that the register type passed to this function must be
+ * PTR_TO_STACK and it will be used inside the function as
+ * 'pointer to map element key'
+ *
+ * For example the argument constraints for bpf_map_lookup_elem():
+ * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
+ * .arg1_type = ARG_CONST_MAP_PTR,
+ * .arg2_type = ARG_PTR_TO_MAP_KEY,
+ *
+ * ret_type says that this function returns 'pointer to map elem value or null'
+ * function expects 1st argument to be a const pointer to 'struct bpf_map' and
+ * 2nd argument should be a pointer to stack, which will be used inside
+ * the helper function as a pointer to map element key.
+ *
+ * On the kernel side the helper function looks like:
+ * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+ * {
+ * struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
+ * void *key = (void *) (unsigned long) r2;
+ * void *value;
+ *
+ * here kernel can access 'key' and 'map' pointers safely, knowing that
+ * [key, key + map->key_size) bytes are valid and were initialized on
+ * the stack of eBPF program.
+ * }
+ *
+ * Corresponding eBPF program may look like:
+ * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), // after this insn R2 type is FRAME_PTR
+ * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK
+ * BPF_LD_MAP_FD(BPF_REG_1, map_fd), // after this insn R1 type is CONST_PTR_TO_MAP
+ * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
+ * here verifier looks at prototype of map_lookup_elem() and sees:
+ * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok,
+ * Now verifier knows that this map has key of R1->map_ptr->key_size bytes
+ *
+ * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far,
+ * Now verifier checks that [R2, R2 + map's key_size) are within stack limits
+ * and were initialized prior to this call.
+ * If it's ok, then verifier allows this BPF_CALL insn and looks at
+ * .ret_type which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
+ * R0->type = PTR_TO_MAP_VALUE_OR_NULL which means bpf_map_lookup_elem() function
+ * returns ether pointer to map value or NULL.
+ *
+ * When type PTR_TO_MAP_VALUE_OR_NULL passes through 'if (reg != 0) goto +off'
+ * insn, the register holding that pointer in the true branch changes state to
+ * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false
+ * branch. See check_cond_jmp_op().
+ *
+ * After the call R0 is set to return type of the function and registers R1-R5
+ * are set to NOT_INIT to indicate that they are no longer readable.
+ */
+
+/* types of values stored in eBPF registers */
+enum bpf_reg_type {
+ NOT_INIT = 0, /* nothing was written into register */
+ UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */
+ PTR_TO_CTX, /* reg points to bpf_context */
+ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */
+ PTR_TO_MAP_VALUE, /* reg points to map element value */
+ PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */
+ FRAME_PTR, /* reg == frame_pointer */
+ PTR_TO_STACK, /* reg == frame_pointer + imm */
+ CONST_IMM, /* constant integer value */
+};
+
+struct reg_state {
+ enum bpf_reg_type type;
+ union {
+ /* valid when type == CONST_IMM | PTR_TO_STACK */
+ int imm;
+
+ /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE |
+ * PTR_TO_MAP_VALUE_OR_NULL
+ */
+ struct bpf_map *map_ptr;
+ };
+};
+
+enum bpf_stack_slot_type {
+ STACK_INVALID, /* nothing was stored in this stack slot */
+ STACK_SPILL, /* 1st byte of register spilled into stack */
+ STACK_SPILL_PART, /* other 7 bytes of register spill */
+ STACK_MISC /* BPF program wrote some data into this slot */
+};
+
+struct bpf_stack_slot {
+ enum bpf_stack_slot_type stype;
+ struct reg_state reg_st;
+};
+
+/* state of the program:
+ * type of all registers and stack info
+ */
+struct verifier_state {
+ struct reg_state regs[MAX_BPF_REG];
+ struct bpf_stack_slot stack[MAX_BPF_STACK];
+};
+
+/* linked list of verifier states used to prune search */
+struct verifier_state_list {
+ struct verifier_state state;
+ struct verifier_state_list *next;
+};
+
+/* verifier_state + insn_idx are pushed to stack when branch is encountered */
+struct verifier_stack_elem {
+ /* verifer state is 'st'
+ * before processing instruction 'insn_idx'
+ * and after processing instruction 'prev_insn_idx'
+ */
+ struct verifier_state st;
+ int insn_idx;
+ int prev_insn_idx;
+ struct verifier_stack_elem *next;
+};
+
+#define MAX_USED_MAPS 64 /* max number of maps accessed by one eBPF program */
+
+/* single container for all structs
+ * one verifier_env per bpf_check() call
+ */
+struct verifier_env {
+ struct bpf_prog *prog; /* eBPF program being verified */
+ struct verifier_stack_elem *head; /* stack of verifier states to be processed */
+ int stack_size; /* number of states to be processed */
+ struct verifier_state cur_state; /* current verifier state */
+ struct verifier_state_list **explored_states; /* search pruning optimization */
+ struct bpf_map *used_maps[MAX_USED_MAPS]; /* array of map's used by eBPF program */
+ u32 used_map_cnt; /* number of used maps */
+};
+
+/* verbose verifier prints what it's seeing
+ * bpf_check() is called under lock, so no race to access these global vars
+ */
+static u32 log_level, log_size, log_len;
+static char *log_buf;
+
+static DEFINE_MUTEX(bpf_verifier_lock);
+
+/* log_level controls verbosity level of eBPF verifier.
+ * verbose() is used to dump the verification trace to the log, so the user
+ * can figure out what's wrong with the program
+ */
+static void verbose(const char *fmt, ...)
+{
+ va_list args;
+
+ if (log_level == 0 || log_len >= log_size - 1)
+ return;
+
+ va_start(args, fmt);
+ log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
+ va_end(args);
+}
+
+/* string representation of 'enum bpf_reg_type' */
+static const char * const reg_type_str[] = {
+ [NOT_INIT] = "?",
+ [UNKNOWN_VALUE] = "inv",
+ [PTR_TO_CTX] = "ctx",
+ [CONST_PTR_TO_MAP] = "map_ptr",
+ [PTR_TO_MAP_VALUE] = "map_value",
+ [PTR_TO_MAP_VALUE_OR_NULL] = "map_value_or_null",
+ [FRAME_PTR] = "fp",
+ [PTR_TO_STACK] = "fp",
+ [CONST_IMM] = "imm",
+};
+
+static void print_verifier_state(struct verifier_env *env)
+{
+ enum bpf_reg_type t;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ t = env->cur_state.regs[i].type;
+ if (t == NOT_INIT)
+ continue;
+ verbose(" R%d=%s", i, reg_type_str[t]);
+ if (t == CONST_IMM || t == PTR_TO_STACK)
+ verbose("%d", env->cur_state.regs[i].imm);
+ else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
+ t == PTR_TO_MAP_VALUE_OR_NULL)
+ verbose("(ks=%d,vs=%d)",
+ env->cur_state.regs[i].map_ptr->key_size,
+ env->cur_state.regs[i].map_ptr->value_size);
+ }
+ for (i = 0; i < MAX_BPF_STACK; i++) {
+ if (env->cur_state.stack[i].stype == STACK_SPILL)
+ verbose(" fp%d=%s", -MAX_BPF_STACK + i,
+ reg_type_str[env->cur_state.stack[i].reg_st.type]);
+ }
+ verbose("\n");
+}
+
+static const char *const bpf_class_string[] = {
+ [BPF_LD] = "ld",
+ [BPF_LDX] = "ldx",
+ [BPF_ST] = "st",
+ [BPF_STX] = "stx",
+ [BPF_ALU] = "alu",
+ [BPF_JMP] = "jmp",
+ [BPF_RET] = "BUG",
+ [BPF_ALU64] = "alu64",
+};
+
+static const char *const bpf_alu_string[] = {
+ [BPF_ADD >> 4] = "+=",
+ [BPF_SUB >> 4] = "-=",
+ [BPF_MUL >> 4] = "*=",
+ [BPF_DIV >> 4] = "/=",
+ [BPF_OR >> 4] = "|=",
+ [BPF_AND >> 4] = "&=",
+ [BPF_LSH >> 4] = "<<=",
+ [BPF_RSH >> 4] = ">>=",
+ [BPF_NEG >> 4] = "neg",
+ [BPF_MOD >> 4] = "%=",
+ [BPF_XOR >> 4] = "^=",
+ [BPF_MOV >> 4] = "=",
+ [BPF_ARSH >> 4] = "s>>=",
+ [BPF_END >> 4] = "endian",
+};
+
+static const char *const bpf_ldst_string[] = {
+ [BPF_W >> 3] = "u32",
+ [BPF_H >> 3] = "u16",
+ [BPF_B >> 3] = "u8",
+ [BPF_DW >> 3] = "u64",
+};
+
+static const char *const bpf_jmp_string[] = {
+ [BPF_JA >> 4] = "jmp",
+ [BPF_JEQ >> 4] = "==",
+ [BPF_JGT >> 4] = ">",
+ [BPF_JGE >> 4] = ">=",
+ [BPF_JSET >> 4] = "&",
+ [BPF_JNE >> 4] = "!=",
+ [BPF_JSGT >> 4] = "s>",
+ [BPF_JSGE >> 4] = "s>=",
+ [BPF_CALL >> 4] = "call",
+ [BPF_EXIT >> 4] = "exit",
+};
+
+static void print_bpf_insn(struct bpf_insn *insn)
+{
+ u8 class = BPF_CLASS(insn->code);
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ if (BPF_SRC(insn->code) == BPF_X)
+ verbose("(%02x) %sr%d %s %sr%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->src_reg);
+ else
+ verbose("(%02x) %sr%d %s %s%d\n",
+ insn->code, class == BPF_ALU ? "(u32) " : "",
+ insn->dst_reg,
+ bpf_alu_string[BPF_OP(insn->code) >> 4],
+ class == BPF_ALU ? "(u32) " : "",
+ insn->imm);
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_MEM)
+ verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->src_reg);
+ else if (BPF_MODE(insn->code) == BPF_XADD)
+ verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg, insn->off,
+ insn->src_reg);
+ else
+ verbose("BUG_%02x\n", insn->code);
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose("BUG_st_%02x\n", insn->code);
+ return;
+ }
+ verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->dst_reg,
+ insn->off, insn->imm);
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM) {
+ verbose("BUG_ldx_%02x\n", insn->code);
+ return;
+ }
+ verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
+ insn->code, insn->dst_reg,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->off);
+ } else if (class == BPF_LD) {
+ if (BPF_MODE(insn->code) == BPF_ABS) {
+ verbose("(%02x) r0 = *(%s *)skb[%d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IND) {
+ verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
+ insn->code,
+ bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
+ insn->src_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM) {
+ verbose("(%02x) r%d = 0x%x\n",
+ insn->code, insn->dst_reg, insn->imm);
+ } else {
+ verbose("BUG_ld_%02x\n", insn->code);
+ return;
+ }
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ verbose("(%02x) call %d\n", insn->code, insn->imm);
+ } else if (insn->code == (BPF_JMP | BPF_JA)) {
+ verbose("(%02x) goto pc%+d\n",
+ insn->code, insn->off);
+ } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
+ verbose("(%02x) exit\n", insn->code);
+ } else if (BPF_SRC(insn->code) == BPF_X) {
+ verbose("(%02x) if r%d %s r%d goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->src_reg, insn->off);
+ } else {
+ verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
+ insn->code, insn->dst_reg,
+ bpf_jmp_string[BPF_OP(insn->code) >> 4],
+ insn->imm, insn->off);
+ }
+ } else {
+ verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
+ }
+}
+
+static int pop_stack(struct verifier_env *env, int *prev_insn_idx)
+{
+ struct verifier_stack_elem *elem;
+ int insn_idx;
+
+ if (env->head == NULL)
+ return -1;
+
+ memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
+ insn_idx = env->head->insn_idx;
+ if (prev_insn_idx)
+ *prev_insn_idx = env->head->prev_insn_idx;
+ elem = env->head->next;
+ kfree(env->head);
+ env->head = elem;
+ env->stack_size--;
+ return insn_idx;
+}
+
+static struct verifier_state *push_stack(struct verifier_env *env, int insn_idx,
+ int prev_insn_idx)
+{
+ struct verifier_stack_elem *elem;
+
+ elem = kmalloc(sizeof(struct verifier_stack_elem), GFP_KERNEL);
+ if (!elem)
+ goto err;
+
+ memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
+ elem->insn_idx = insn_idx;
+ elem->prev_insn_idx = prev_insn_idx;
+ elem->next = env->head;
+ env->head = elem;
+ env->stack_size++;
+ if (env->stack_size > 1024) {
+ verbose("BPF program is too complex\n");
+ goto err;
+ }
+ return &elem->st;
+err:
+ /* pop all elements and return */
+ while (pop_stack(env, NULL) >= 0);
+ return NULL;
+}
+
+#define CALLER_SAVED_REGS 6
+static const int caller_saved[CALLER_SAVED_REGS] = {
+ BPF_REG_0, BPF_REG_1, BPF_REG_2, BPF_REG_3, BPF_REG_4, BPF_REG_5
+};
+
+static void init_reg_state(struct reg_state *regs)
+{
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ regs[i].type = NOT_INIT;
+ regs[i].imm = 0;
+ regs[i].map_ptr = NULL;
+ }
+
+ /* frame pointer */
+ regs[BPF_REG_FP].type = FRAME_PTR;
+
+ /* 1st arg to a function */
+ regs[BPF_REG_1].type = PTR_TO_CTX;
+}
+
+static void mark_reg_unknown_value(struct reg_state *regs, u32 regno)
+{
+ BUG_ON(regno >= MAX_BPF_REG);
+ regs[regno].type = UNKNOWN_VALUE;
+ regs[regno].imm = 0;
+ regs[regno].map_ptr = NULL;
+}
+
+enum reg_arg_type {
+ SRC_OP, /* register is used as source operand */
+ DST_OP, /* register is used as destination operand */
+ DST_OP_NO_MARK /* same as above, check only, don't mark */
+};
+
+static int check_reg_arg(struct reg_state *regs, u32 regno,
+ enum reg_arg_type t)
+{
+ if (regno >= MAX_BPF_REG) {
+ verbose("R%d is invalid\n", regno);
+ return -EINVAL;
+ }
+
+ if (t == SRC_OP) {
+ /* check whether register used as source operand can be read */
+ if (regs[regno].type == NOT_INIT) {
+ verbose("R%d !read_ok\n", regno);
+ return -EACCES;
+ }
+ } else {
+ /* check whether register used as dest operand can be written to */
+ if (regno == BPF_REG_FP) {
+ verbose("frame pointer is read only\n");
+ return -EACCES;
+ }
+ if (t == DST_OP)
+ mark_reg_unknown_value(regs, regno);
+ }
+ return 0;
+}
+
+static int bpf_size_to_bytes(int bpf_size)
+{
+ if (bpf_size == BPF_W)
+ return 4;
+ else if (bpf_size == BPF_H)
+ return 2;
+ else if (bpf_size == BPF_B)
+ return 1;
+ else if (bpf_size == BPF_DW)
+ return 8;
+ else
+ return -EINVAL;
+}
+
+/* check_stack_read/write functions track spill/fill of registers,
+ * stack boundary and alignment are checked in check_mem_access()
+ */
+static int check_stack_write(struct verifier_state *state, int off, int size,
+ int value_regno)
+{
+ struct bpf_stack_slot *slot;
+ int i;
+
+ if (value_regno >= 0 &&
+ (state->regs[value_regno].type == PTR_TO_MAP_VALUE ||
+ state->regs[value_regno].type == PTR_TO_STACK ||
+ state->regs[value_regno].type == PTR_TO_CTX)) {
+
+ /* register containing pointer is being spilled into stack */
+ if (size != 8) {
+ verbose("invalid size of register spill\n");
+ return -EACCES;
+ }
+
+ slot = &state->stack[MAX_BPF_STACK + off];
+ slot->stype = STACK_SPILL;
+ /* save register state */
+ slot->reg_st = state->regs[value_regno];
+ for (i = 1; i < 8; i++) {
+ slot = &state->stack[MAX_BPF_STACK + off + i];
+ slot->stype = STACK_SPILL_PART;
+ slot->reg_st.type = UNKNOWN_VALUE;
+ slot->reg_st.map_ptr = NULL;
+ }
+ } else {
+
+ /* regular write of data into stack */
+ for (i = 0; i < size; i++) {
+ slot = &state->stack[MAX_BPF_STACK + off + i];
+ slot->stype = STACK_MISC;
+ slot->reg_st.type = UNKNOWN_VALUE;
+ slot->reg_st.map_ptr = NULL;
+ }
+ }
+ return 0;
+}
+
+static int check_stack_read(struct verifier_state *state, int off, int size,
+ int value_regno)
+{
+ int i;
+ struct bpf_stack_slot *slot;
+
+ slot = &state->stack[MAX_BPF_STACK + off];
+
+ if (slot->stype == STACK_SPILL) {
+ if (size != 8) {
+ verbose("invalid size of register spill\n");
+ return -EACCES;
+ }
+ for (i = 1; i < 8; i++) {
+ if (state->stack[MAX_BPF_STACK + off + i].stype !=
+ STACK_SPILL_PART) {
+ verbose("corrupted spill memory\n");
+ return -EACCES;
+ }
+ }
+
+ if (value_regno >= 0)
+ /* restore register state from stack */
+ state->regs[value_regno] = slot->reg_st;
+ return 0;
+ } else {
+ for (i = 0; i < size; i++) {
+ if (state->stack[MAX_BPF_STACK + off + i].stype !=
+ STACK_MISC) {
+ verbose("invalid read from stack off %d+%d size %d\n",
+ off, i, size);
+ return -EACCES;
+ }
+ }
+ if (value_regno >= 0)
+ /* have read misc data from the stack */
+ mark_reg_unknown_value(state->regs, value_regno);
+ return 0;
+ }
+}
+
+/* check read/write into map element returned by bpf_map_lookup_elem() */
+static int check_map_access(struct verifier_env *env, u32 regno, int off,
+ int size)
+{
+ struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
+
+ if (off < 0 || off + size > map->value_size) {
+ verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
+ map->value_size, off, size);
+ return -EACCES;
+ }
+ return 0;
+}
+
+/* check access to 'struct bpf_context' fields */
+static int check_ctx_access(struct verifier_env *env, int off, int size,
+ enum bpf_access_type t)
+{
+ if (env->prog->aux->ops->is_valid_access &&
+ env->prog->aux->ops->is_valid_access(off, size, t))
+ return 0;
+
+ verbose("invalid bpf_context access off=%d size=%d\n", off, size);
+ return -EACCES;
+}
+
+/* check whether memory at (regno + off) is accessible for t = (read | write)
+ * if t==write, value_regno is a register which value is stored into memory
+ * if t==read, value_regno is a register which will receive the value from memory
+ * if t==write && value_regno==-1, some unknown value is stored into memory
+ * if t==read && value_regno==-1, don't care what we read from memory
+ */
+static int check_mem_access(struct verifier_env *env, u32 regno, int off,
+ int bpf_size, enum bpf_access_type t,
+ int value_regno)
+{
+ struct verifier_state *state = &env->cur_state;
+ int size, err = 0;
+
+ size = bpf_size_to_bytes(bpf_size);
+ if (size < 0)
+ return size;
+
+ if (off % size != 0) {
+ verbose("misaligned access off %d size %d\n", off, size);
+ return -EACCES;
+ }
+
+ if (state->regs[regno].type == PTR_TO_MAP_VALUE) {
+ err = check_map_access(env, regno, off, size);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown_value(state->regs, value_regno);
+
+ } else if (state->regs[regno].type == PTR_TO_CTX) {
+ err = check_ctx_access(env, off, size, t);
+ if (!err && t == BPF_READ && value_regno >= 0)
+ mark_reg_unknown_value(state->regs, value_regno);
+
+ } else if (state->regs[regno].type == FRAME_PTR) {
+ if (off >= 0 || off < -MAX_BPF_STACK) {
+ verbose("invalid stack off=%d size=%d\n", off, size);
+ return -EACCES;
+ }
+ if (t == BPF_WRITE)
+ err = check_stack_write(state, off, size, value_regno);
+ else
+ err = check_stack_read(state, off, size, value_regno);
+ } else {
+ verbose("R%d invalid mem access '%s'\n",
+ regno, reg_type_str[state->regs[regno].type]);
+ return -EACCES;
+ }
+ return err;
+}
+
+static int check_xadd(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ int err;
+
+ if ((BPF_SIZE(insn->code) != BPF_W && BPF_SIZE(insn->code) != BPF_DW) ||
+ insn->imm != 0) {
+ verbose("BPF_XADD uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check whether atomic_add can read the memory */
+ err = check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ, -1);
+ if (err)
+ return err;
+
+ /* check whether atomic_add can write into the same memory */
+ return check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE, -1);
+}
+
+/* when register 'regno' is passed into function that will read 'access_size'
+ * bytes from that pointer, make sure that it's within stack boundary
+ * and all elements of stack are initialized
+ */
+static int check_stack_boundary(struct verifier_env *env,
+ int regno, int access_size)
+{
+ struct verifier_state *state = &env->cur_state;
+ struct reg_state *regs = state->regs;
+ int off, i;
+
+ if (regs[regno].type != PTR_TO_STACK)
+ return -EACCES;
+
+ off = regs[regno].imm;
+ if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 ||
+ access_size <= 0) {
+ verbose("invalid stack type R%d off=%d access_size=%d\n",
+ regno, off, access_size);
+ return -EACCES;
+ }
+
+ for (i = 0; i < access_size; i++) {
+ if (state->stack[MAX_BPF_STACK + off + i].stype != STACK_MISC) {
+ verbose("invalid indirect read from stack off %d+%d size %d\n",
+ off, i, access_size);
+ return -EACCES;
+ }
+ }
+ return 0;
+}
+
+static int check_func_arg(struct verifier_env *env, u32 regno,
+ enum bpf_arg_type arg_type, struct bpf_map **mapp)
+{
+ struct reg_state *reg = env->cur_state.regs + regno;
+ enum bpf_reg_type expected_type;
+ int err = 0;
+
+ if (arg_type == ARG_ANYTHING)
+ return 0;
+
+ if (reg->type == NOT_INIT) {
+ verbose("R%d !read_ok\n", regno);
+ return -EACCES;
+ }
+
+ if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY ||
+ arg_type == ARG_PTR_TO_MAP_VALUE) {
+ expected_type = PTR_TO_STACK;
+ } else if (arg_type == ARG_CONST_STACK_SIZE) {
+ expected_type = CONST_IMM;
+ } else if (arg_type == ARG_CONST_MAP_PTR) {
+ expected_type = CONST_PTR_TO_MAP;
+ } else {
+ verbose("unsupported arg_type %d\n", arg_type);
+ return -EFAULT;
+ }
+
+ if (reg->type != expected_type) {
+ verbose("R%d type=%s expected=%s\n", regno,
+ reg_type_str[reg->type], reg_type_str[expected_type]);
+ return -EACCES;
+ }
+
+ if (arg_type == ARG_CONST_MAP_PTR) {
+ /* bpf_map_xxx(map_ptr) call: remember that map_ptr */
+ *mapp = reg->map_ptr;
+
+ } else if (arg_type == ARG_PTR_TO_MAP_KEY) {
+ /* bpf_map_xxx(..., map_ptr, ..., key) call:
+ * check that [key, key + map->key_size) are within
+ * stack limits and initialized
+ */
+ if (!*mapp) {
+ /* in function declaration map_ptr must come before
+ * map_key, so that it's verified and known before
+ * we have to check map_key here. Otherwise it means
+ * that kernel subsystem misconfigured verifier
+ */
+ verbose("invalid map_ptr to access map->key\n");
+ return -EACCES;
+ }
+ err = check_stack_boundary(env, regno, (*mapp)->key_size);
+
+ } else if (arg_type == ARG_PTR_TO_MAP_VALUE) {
+ /* bpf_map_xxx(..., map_ptr, ..., value) call:
+ * check [value, value + map->value_size) validity
+ */
+ if (!*mapp) {
+ /* kernel subsystem misconfigured verifier */
+ verbose("invalid map_ptr to access map->value\n");
+ return -EACCES;
+ }
+ err = check_stack_boundary(env, regno, (*mapp)->value_size);
+
+ } else if (arg_type == ARG_CONST_STACK_SIZE) {
+ /* bpf_xxx(..., buf, len) call will access 'len' bytes
+ * from stack pointer 'buf'. Check it
+ * note: regno == len, regno - 1 == buf
+ */
+ if (regno == 0) {
+ /* kernel subsystem misconfigured verifier */
+ verbose("ARG_CONST_STACK_SIZE cannot be first argument\n");
+ return -EACCES;
+ }
+ err = check_stack_boundary(env, regno - 1, reg->imm);
+ }
+
+ return err;
+}
+
+static int check_call(struct verifier_env *env, int func_id)
+{
+ struct verifier_state *state = &env->cur_state;
+ const struct bpf_func_proto *fn = NULL;
+ struct reg_state *regs = state->regs;
+ struct bpf_map *map = NULL;
+ struct reg_state *reg;
+ int i, err;
+
+ /* find function prototype */
+ if (func_id < 0 || func_id >= __BPF_FUNC_MAX_ID) {
+ verbose("invalid func %d\n", func_id);
+ return -EINVAL;
+ }
+
+ if (env->prog->aux->ops->get_func_proto)
+ fn = env->prog->aux->ops->get_func_proto(func_id);
+
+ if (!fn) {
+ verbose("unknown func %d\n", func_id);
+ return -EINVAL;
+ }
+
+ /* eBPF programs must be GPL compatible to use GPL-ed functions */
+ if (!env->prog->aux->is_gpl_compatible && fn->gpl_only) {
+ verbose("cannot call GPL only function from proprietary program\n");
+ return -EINVAL;
+ }
+
+ /* check args */
+ err = check_func_arg(env, BPF_REG_1, fn->arg1_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_4, fn->arg4_type, &map);
+ if (err)
+ return err;
+ err = check_func_arg(env, BPF_REG_5, fn->arg5_type, &map);
+ if (err)
+ return err;
+
+ /* reset caller saved regs */
+ for (i = 0; i < CALLER_SAVED_REGS; i++) {
+ reg = regs + caller_saved[i];
+ reg->type = NOT_INIT;
+ reg->imm = 0;
+ }
+
+ /* update return register */
+ if (fn->ret_type == RET_INTEGER) {
+ regs[BPF_REG_0].type = UNKNOWN_VALUE;
+ } else if (fn->ret_type == RET_VOID) {
+ regs[BPF_REG_0].type = NOT_INIT;
+ } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) {
+ regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL;
+ /* remember map_ptr, so that check_map_access()
+ * can check 'value_size' boundary of memory access
+ * to map element returned from bpf_map_lookup_elem()
+ */
+ if (map == NULL) {
+ verbose("kernel subsystem misconfigured verifier\n");
+ return -EINVAL;
+ }
+ regs[BPF_REG_0].map_ptr = map;
+ } else {
+ verbose("unknown return type %d of func %d\n",
+ fn->ret_type, func_id);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+/* check validity of 32-bit and 64-bit arithmetic operations */
+static int check_alu_op(struct reg_state *regs, struct bpf_insn *insn)
+{
+ u8 opcode = BPF_OP(insn->code);
+ int err;
+
+ if (opcode == BPF_END || opcode == BPF_NEG) {
+ if (opcode == BPF_NEG) {
+ if (BPF_SRC(insn->code) != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->off != 0 || insn->imm != 0) {
+ verbose("BPF_NEG uses reserved fields\n");
+ return -EINVAL;
+ }
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
+ (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) {
+ verbose("BPF_END uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check dest operand */
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ } else if (opcode == BPF_MOV) {
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0 || insn->off != 0) {
+ verbose("BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ verbose("BPF_MOV uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check dest operand */
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (BPF_CLASS(insn->code) == BPF_ALU64) {
+ /* case: R1 = R2
+ * copy register state to dest reg
+ */
+ regs[insn->dst_reg] = regs[insn->src_reg];
+ } else {
+ regs[insn->dst_reg].type = UNKNOWN_VALUE;
+ regs[insn->dst_reg].map_ptr = NULL;
+ }
+ } else {
+ /* case: R = imm
+ * remember the value we stored into this reg
+ */
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+
+ } else if (opcode > BPF_END) {
+ verbose("invalid BPF_ALU opcode %x\n", opcode);
+ return -EINVAL;
+
+ } else { /* all other ALU ops: and, sub, xor, add, ... */
+
+ bool stack_relative = false;
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0 || insn->off != 0) {
+ verbose("BPF_ALU uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ } else {
+ if (insn->src_reg != BPF_REG_0 || insn->off != 0) {
+ verbose("BPF_ALU uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ if ((opcode == BPF_MOD || opcode == BPF_DIV) &&
+ BPF_SRC(insn->code) == BPF_K && insn->imm == 0) {
+ verbose("div by zero\n");
+ return -EINVAL;
+ }
+
+ /* pattern match 'bpf_add Rx, imm' instruction */
+ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 &&
+ regs[insn->dst_reg].type == FRAME_PTR &&
+ BPF_SRC(insn->code) == BPF_K)
+ stack_relative = true;
+
+ /* check dest operand */
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ if (stack_relative) {
+ regs[insn->dst_reg].type = PTR_TO_STACK;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+ }
+
+ return 0;
+}
+
+static int check_cond_jmp_op(struct verifier_env *env,
+ struct bpf_insn *insn, int *insn_idx)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ struct verifier_state *other_branch;
+ u8 opcode = BPF_OP(insn->code);
+ int err;
+
+ if (opcode > BPF_EXIT) {
+ verbose("invalid BPF_JMP opcode %x\n", opcode);
+ return -EINVAL;
+ }
+
+ if (BPF_SRC(insn->code) == BPF_X) {
+ if (insn->imm != 0) {
+ verbose("BPF_JMP uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ } else {
+ if (insn->src_reg != BPF_REG_0) {
+ verbose("BPF_JMP uses reserved fields\n");
+ return -EINVAL;
+ }
+ }
+
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* detect if R == 0 where R was initialized to zero earlier */
+ if (BPF_SRC(insn->code) == BPF_K &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE) &&
+ regs[insn->dst_reg].type == CONST_IMM &&
+ regs[insn->dst_reg].imm == insn->imm) {
+ if (opcode == BPF_JEQ) {
+ /* if (imm == imm) goto pc+off;
+ * only follow the goto, ignore fall-through
+ */
+ *insn_idx += insn->off;
+ return 0;
+ } else {
+ /* if (imm != imm) goto pc+off;
+ * only follow fall-through branch, since
+ * that's where the program will go
+ */
+ return 0;
+ }
+ }
+
+ other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
+ if (!other_branch)
+ return -EFAULT;
+
+ /* detect if R == 0 where R is returned value from bpf_map_lookup_elem() */
+ if (BPF_SRC(insn->code) == BPF_K &&
+ insn->imm == 0 && (opcode == BPF_JEQ ||
+ opcode == BPF_JNE) &&
+ regs[insn->dst_reg].type == PTR_TO_MAP_VALUE_OR_NULL) {
+ if (opcode == BPF_JEQ) {
+ /* next fallthrough insn can access memory via
+ * this register
+ */
+ regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+ /* branch targer cannot access it, since reg == 0 */
+ other_branch->regs[insn->dst_reg].type = CONST_IMM;
+ other_branch->regs[insn->dst_reg].imm = 0;
+ } else {
+ other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = 0;
+ }
+ } else if (BPF_SRC(insn->code) == BPF_K &&
+ (opcode == BPF_JEQ || opcode == BPF_JNE)) {
+
+ if (opcode == BPF_JEQ) {
+ /* detect if (R == imm) goto
+ * and in the target state recognize that R = imm
+ */
+ other_branch->regs[insn->dst_reg].type = CONST_IMM;
+ other_branch->regs[insn->dst_reg].imm = insn->imm;
+ } else {
+ /* detect if (R != imm) goto
+ * and in the fall-through state recognize that R = imm
+ */
+ regs[insn->dst_reg].type = CONST_IMM;
+ regs[insn->dst_reg].imm = insn->imm;
+ }
+ }
+ if (log_level)
+ print_verifier_state(env);
+ return 0;
+}
+
+/* return the map pointer stored inside BPF_LD_IMM64 instruction */
+static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
+{
+ u64 imm64 = ((u64) (u32) insn[0].imm) | ((u64) (u32) insn[1].imm) << 32;
+
+ return (struct bpf_map *) (unsigned long) imm64;
+}
+
+/* verify BPF_LD_IMM64 instruction */
+static int check_ld_imm(struct verifier_env *env, struct bpf_insn *insn)
+{
+ struct reg_state *regs = env->cur_state.regs;
+ int err;
+
+ if (BPF_SIZE(insn->code) != BPF_DW) {
+ verbose("invalid BPF_LD_IMM insn\n");
+ return -EINVAL;
+ }
+ if (insn->off != 0) {
+ verbose("BPF_LD_IMM64 uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP);
+ if (err)
+ return err;
+
+ if (insn->src_reg == 0)
+ /* generic move 64-bit immediate into a register */
+ return 0;
+
+ /* replace_map_fd_with_map_ptr() should have caught bad ld_imm64 */
+ BUG_ON(insn->src_reg != BPF_PSEUDO_MAP_FD);
+
+ regs[insn->dst_reg].type = CONST_PTR_TO_MAP;
+ regs[insn->dst_reg].map_ptr = ld_imm64_to_map_ptr(insn);
+ return 0;
+}
+
+/* non-recursive DFS pseudo code
+ * 1 procedure DFS-iterative(G,v):
+ * 2 label v as discovered
+ * 3 let S be a stack
+ * 4 S.push(v)
+ * 5 while S is not empty
+ * 6 t <- S.pop()
+ * 7 if t is what we're looking for:
+ * 8 return t
+ * 9 for all edges e in G.adjacentEdges(t) do
+ * 10 if edge e is already labelled
+ * 11 continue with the next edge
+ * 12 w <- G.adjacentVertex(t,e)
+ * 13 if vertex w is not discovered and not explored
+ * 14 label e as tree-edge
+ * 15 label w as discovered
+ * 16 S.push(w)
+ * 17 continue at 5
+ * 18 else if vertex w is discovered
+ * 19 label e as back-edge
+ * 20 else
+ * 21 // vertex w is explored
+ * 22 label e as forward- or cross-edge
+ * 23 label t as explored
+ * 24 S.pop()
+ *
+ * convention:
+ * 0x10 - discovered
+ * 0x11 - discovered and fall-through edge labelled
+ * 0x12 - discovered and fall-through and branch edges labelled
+ * 0x20 - explored
+ */
+
+enum {
+ DISCOVERED = 0x10,
+ EXPLORED = 0x20,
+ FALLTHROUGH = 1,
+ BRANCH = 2,
+};
+
+#define STATE_LIST_MARK ((struct verifier_state_list *) -1L)
+
+static int *insn_stack; /* stack of insns to process */
+static int cur_stack; /* current stack index */
+static int *insn_state;
+
+/* t, w, e - match pseudo-code above:
+ * t - index of current instruction
+ * w - next instruction
+ * e - edge
+ */
+static int push_insn(int t, int w, int e, struct verifier_env *env)
+{
+ if (e == FALLTHROUGH && insn_state[t] >= (DISCOVERED | FALLTHROUGH))
+ return 0;
+
+ if (e == BRANCH && insn_state[t] >= (DISCOVERED | BRANCH))
+ return 0;
+
+ if (w < 0 || w >= env->prog->len) {
+ verbose("jump out of range from insn %d to %d\n", t, w);
+ return -EINVAL;
+ }
+
+ if (e == BRANCH)
+ /* mark branch target for state pruning */
+ env->explored_states[w] = STATE_LIST_MARK;
+
+ if (insn_state[w] == 0) {
+ /* tree-edge */
+ insn_state[t] = DISCOVERED | e;
+ insn_state[w] = DISCOVERED;
+ if (cur_stack >= env->prog->len)
+ return -E2BIG;
+ insn_stack[cur_stack++] = w;
+ return 1;
+ } else if ((insn_state[w] & 0xF0) == DISCOVERED) {
+ verbose("back-edge from insn %d to %d\n", t, w);
+ return -EINVAL;
+ } else if (insn_state[w] == EXPLORED) {
+ /* forward- or cross-edge */
+ insn_state[t] = DISCOVERED | e;
+ } else {
+ verbose("insn state internal bug\n");
+ return -EFAULT;
+ }
+ return 0;
+}
+
+/* non-recursive depth-first-search to detect loops in BPF program
+ * loop == back-edge in directed graph
+ */
+static int check_cfg(struct verifier_env *env)
+{
+ struct bpf_insn *insns = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int ret = 0;
+ int i, t;
+
+ insn_state = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ if (!insn_state)
+ return -ENOMEM;
+
+ insn_stack = kcalloc(insn_cnt, sizeof(int), GFP_KERNEL);
+ if (!insn_stack) {
+ kfree(insn_state);
+ return -ENOMEM;
+ }
+
+ insn_state[0] = DISCOVERED; /* mark 1st insn as discovered */
+ insn_stack[0] = 0; /* 0 is the first instruction */
+ cur_stack = 1;
+
+peek_stack:
+ if (cur_stack == 0)
+ goto check_state;
+ t = insn_stack[cur_stack - 1];
+
+ if (BPF_CLASS(insns[t].code) == BPF_JMP) {
+ u8 opcode = BPF_OP(insns[t].code);
+
+ if (opcode == BPF_EXIT) {
+ goto mark_explored;
+ } else if (opcode == BPF_CALL) {
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insns[t].code) != BPF_K) {
+ ret = -EINVAL;
+ goto err_free;
+ }
+ /* unconditional jump with single edge */
+ ret = push_insn(t, t + insns[t].off + 1,
+ FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ /* tell verifier to check for equivalent states
+ * after every call and jump
+ */
+ env->explored_states[t + 1] = STATE_LIST_MARK;
+ } else {
+ /* conditional jump with two edges */
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+
+ ret = push_insn(t, t + insns[t].off + 1, BRANCH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ }
+ } else {
+ /* all other non-branch instructions with single
+ * fall-through edge
+ */
+ ret = push_insn(t, t + 1, FALLTHROUGH, env);
+ if (ret == 1)
+ goto peek_stack;
+ else if (ret < 0)
+ goto err_free;
+ }
+
+mark_explored:
+ insn_state[t] = EXPLORED;
+ if (cur_stack-- <= 0) {
+ verbose("pop stack internal bug\n");
+ ret = -EFAULT;
+ goto err_free;
+ }
+ goto peek_stack;
+
+check_state:
+ for (i = 0; i < insn_cnt; i++) {
+ if (insn_state[i] != EXPLORED) {
+ verbose("unreachable insn %d\n", i);
+ ret = -EINVAL;
+ goto err_free;
+ }
+ }
+ ret = 0; /* cfg looks good */
+
+err_free:
+ kfree(insn_state);
+ kfree(insn_stack);
+ return ret;
+}
+
+/* compare two verifier states
+ *
+ * all states stored in state_list are known to be valid, since
+ * verifier reached 'bpf_exit' instruction through them
+ *
+ * this function is called when verifier exploring different branches of
+ * execution popped from the state stack. If it sees an old state that has
+ * more strict register state and more strict stack state then this execution
+ * branch doesn't need to be explored further, since verifier already
+ * concluded that more strict state leads to valid finish.
+ *
+ * Therefore two states are equivalent if register state is more conservative
+ * and explored stack state is more conservative than the current one.
+ * Example:
+ * explored current
+ * (slot1=INV slot2=MISC) == (slot1=MISC slot2=MISC)
+ * (slot1=MISC slot2=MISC) != (slot1=INV slot2=MISC)
+ *
+ * In other words if current stack state (one being explored) has more
+ * valid slots than old one that already passed validation, it means
+ * the verifier can stop exploring and conclude that current state is valid too
+ *
+ * Similarly with registers. If explored state has register type as invalid
+ * whereas register type in current state is meaningful, it means that
+ * the current state will reach 'bpf_exit' instruction safely
+ */
+static bool states_equal(struct verifier_state *old, struct verifier_state *cur)
+{
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++) {
+ if (memcmp(&old->regs[i], &cur->regs[i],
+ sizeof(old->regs[0])) != 0) {
+ if (old->regs[i].type == NOT_INIT ||
+ old->regs[i].type == UNKNOWN_VALUE)
+ continue;
+ return false;
+ }
+ }
+
+ for (i = 0; i < MAX_BPF_STACK; i++) {
+ if (memcmp(&old->stack[i], &cur->stack[i],
+ sizeof(old->stack[0])) != 0) {
+ if (old->stack[i].stype == STACK_INVALID)
+ continue;
+ return false;
+ }
+ }
+ return true;
+}
+
+static int is_state_visited(struct verifier_env *env, int insn_idx)
+{
+ struct verifier_state_list *new_sl;
+ struct verifier_state_list *sl;
+
+ sl = env->explored_states[insn_idx];
+ if (!sl)
+ /* this 'insn_idx' instruction wasn't marked, so we will not
+ * be doing state search here
+ */
+ return 0;
+
+ while (sl != STATE_LIST_MARK) {
+ if (states_equal(&sl->state, &env->cur_state))
+ /* reached equivalent register/stack state,
+ * prune the search
+ */
+ return 1;
+ sl = sl->next;
+ }
+
+ /* there were no equivalent states, remember current one.
+ * technically the current state is not proven to be safe yet,
+ * but it will either reach bpf_exit (which means it's safe) or
+ * it will be rejected. Since there are no loops, we won't be
+ * seeing this 'insn_idx' instruction again on the way to bpf_exit
+ */
+ new_sl = kmalloc(sizeof(struct verifier_state_list), GFP_USER);
+ if (!new_sl)
+ return -ENOMEM;
+
+ /* add new state to the head of linked list */
+ memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
+ new_sl->next = env->explored_states[insn_idx];
+ env->explored_states[insn_idx] = new_sl;
+ return 0;
+}
+
+static int do_check(struct verifier_env *env)
+{
+ struct verifier_state *state = &env->cur_state;
+ struct bpf_insn *insns = env->prog->insnsi;
+ struct reg_state *regs = state->regs;
+ int insn_cnt = env->prog->len;
+ int insn_idx, prev_insn_idx = 0;
+ int insn_processed = 0;
+ bool do_print_state = false;
+
+ init_reg_state(regs);
+ insn_idx = 0;
+ for (;;) {
+ struct bpf_insn *insn;
+ u8 class;
+ int err;
+
+ if (insn_idx >= insn_cnt) {
+ verbose("invalid insn idx %d insn_cnt %d\n",
+ insn_idx, insn_cnt);
+ return -EFAULT;
+ }
+
+ insn = &insns[insn_idx];
+ class = BPF_CLASS(insn->code);
+
+ if (++insn_processed > 32768) {
+ verbose("BPF program is too large. Proccessed %d insn\n",
+ insn_processed);
+ return -E2BIG;
+ }
+
+ err = is_state_visited(env, insn_idx);
+ if (err < 0)
+ return err;
+ if (err == 1) {
+ /* found equivalent state, can prune the search */
+ if (log_level) {
+ if (do_print_state)
+ verbose("\nfrom %d to %d: safe\n",
+ prev_insn_idx, insn_idx);
+ else
+ verbose("%d: safe\n", insn_idx);
+ }
+ goto process_bpf_exit;
+ }
+
+ if (log_level && do_print_state) {
+ verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
+ print_verifier_state(env);
+ do_print_state = false;
+ }
+
+ if (log_level) {
+ verbose("%d: ", insn_idx);
+ print_bpf_insn(insn);
+ }
+
+ if (class == BPF_ALU || class == BPF_ALU64) {
+ err = check_alu_op(regs, insn);
+ if (err)
+ return err;
+
+ } else if (class == BPF_LDX) {
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->imm != 0) {
+ verbose("BPF_LDX uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+
+ err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK);
+ if (err)
+ return err;
+
+ /* check that memory (src_reg + off) is readable,
+ * the state of dst_reg will be updated by this func
+ */
+ err = check_mem_access(env, insn->src_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_READ,
+ insn->dst_reg);
+ if (err)
+ return err;
+
+ } else if (class == BPF_STX) {
+ if (BPF_MODE(insn->code) == BPF_XADD) {
+ err = check_xadd(env, insn);
+ if (err)
+ return err;
+ insn_idx++;
+ continue;
+ }
+
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->imm != 0) {
+ verbose("BPF_STX uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src1 operand */
+ err = check_reg_arg(regs, insn->src_reg, SRC_OP);
+ if (err)
+ return err;
+ /* check src2 operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check that memory (dst_reg + off) is writeable */
+ err = check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE,
+ insn->src_reg);
+ if (err)
+ return err;
+
+ } else if (class == BPF_ST) {
+ if (BPF_MODE(insn->code) != BPF_MEM ||
+ insn->src_reg != BPF_REG_0) {
+ verbose("BPF_ST uses reserved fields\n");
+ return -EINVAL;
+ }
+ /* check src operand */
+ err = check_reg_arg(regs, insn->dst_reg, SRC_OP);
+ if (err)
+ return err;
+
+ /* check that memory (dst_reg + off) is writeable */
+ err = check_mem_access(env, insn->dst_reg, insn->off,
+ BPF_SIZE(insn->code), BPF_WRITE,
+ -1);
+ if (err)
+ return err;
+
+ } else if (class == BPF_JMP) {
+ u8 opcode = BPF_OP(insn->code);
+
+ if (opcode == BPF_CALL) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->off != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_CALL uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ err = check_call(env, insn->imm);
+ if (err)
+ return err;
+
+ } else if (opcode == BPF_JA) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_JA uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ insn_idx += insn->off + 1;
+ continue;
+
+ } else if (opcode == BPF_EXIT) {
+ if (BPF_SRC(insn->code) != BPF_K ||
+ insn->imm != 0 ||
+ insn->src_reg != BPF_REG_0 ||
+ insn->dst_reg != BPF_REG_0) {
+ verbose("BPF_EXIT uses reserved fields\n");
+ return -EINVAL;
+ }
+
+ /* eBPF calling convetion is such that R0 is used
+ * to return the value from eBPF program.
+ * Make sure that it's readable at this time
+ * of bpf_exit, which means that program wrote
+ * something into it earlier
+ */
+ err = check_reg_arg(regs, BPF_REG_0, SRC_OP);
+ if (err)
+ return err;
+
+process_bpf_exit:
+ insn_idx = pop_stack(env, &prev_insn_idx);
+ if (insn_idx < 0) {
+ break;
+ } else {
+ do_print_state = true;
+ continue;
+ }
+ } else {
+ err = check_cond_jmp_op(env, insn, &insn_idx);
+ if (err)
+ return err;
+ }
+ } else if (class == BPF_LD) {
+ u8 mode = BPF_MODE(insn->code);
+
+ if (mode == BPF_ABS || mode == BPF_IND) {
+ verbose("LD_ABS is not supported yet\n");
+ return -EINVAL;
+ } else if (mode == BPF_IMM) {
+ err = check_ld_imm(env, insn);
+ if (err)
+ return err;
+
+ insn_idx++;
+ } else {
+ verbose("invalid BPF_LD mode\n");
+ return -EINVAL;
+ }
+ } else {
+ verbose("unknown insn class %d\n", class);
+ return -EINVAL;
+ }
+
+ insn_idx++;
+ }
+
+ return 0;
+}
+
+/* look for pseudo eBPF instructions that access map FDs and
+ * replace them with actual map pointers
+ */
+static int replace_map_fd_with_map_ptr(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i, j;
+
+ for (i = 0; i < insn_cnt; i++, insn++) {
+ if (insn[0].code == (BPF_LD | BPF_IMM | BPF_DW)) {
+ struct bpf_map *map;
+ struct fd f;
+
+ if (i == insn_cnt - 1 || insn[1].code != 0 ||
+ insn[1].dst_reg != 0 || insn[1].src_reg != 0 ||
+ insn[1].off != 0) {
+ verbose("invalid bpf_ld_imm64 insn\n");
+ return -EINVAL;
+ }
+
+ if (insn->src_reg == 0)
+ /* valid generic load 64-bit imm */
+ goto next_insn;
+
+ if (insn->src_reg != BPF_PSEUDO_MAP_FD) {
+ verbose("unrecognized bpf_ld_imm64 insn\n");
+ return -EINVAL;
+ }
+
+ f = fdget(insn->imm);
+
+ map = bpf_map_get(f);
+ if (IS_ERR(map)) {
+ verbose("fd %d is not pointing to valid bpf_map\n",
+ insn->imm);
+ fdput(f);
+ return PTR_ERR(map);
+ }
+
+ /* store map pointer inside BPF_LD_IMM64 instruction */
+ insn[0].imm = (u32) (unsigned long) map;
+ insn[1].imm = ((u64) (unsigned long) map) >> 32;
+
+ /* check whether we recorded this map already */
+ for (j = 0; j < env->used_map_cnt; j++)
+ if (env->used_maps[j] == map) {
+ fdput(f);
+ goto next_insn;
+ }
+
+ if (env->used_map_cnt >= MAX_USED_MAPS) {
+ fdput(f);
+ return -E2BIG;
+ }
+
+ /* remember this map */
+ env->used_maps[env->used_map_cnt++] = map;
+
+ /* hold the map. If the program is rejected by verifier,
+ * the map will be released by release_maps() or it
+ * will be used by the valid program until it's unloaded
+ * and all maps are released in free_bpf_prog_info()
+ */
+ atomic_inc(&map->refcnt);
+
+ fdput(f);
+next_insn:
+ insn++;
+ i++;
+ }
+ }
+
+ /* now all pseudo BPF_LD_IMM64 instructions load valid
+ * 'struct bpf_map *' into a register instead of user map_fd.
+ * These pointers will be used later by verifier to validate map access.
+ */
+ return 0;
+}
+
+/* drop refcnt of maps used by the rejected program */
+static void release_maps(struct verifier_env *env)
+{
+ int i;
+
+ for (i = 0; i < env->used_map_cnt; i++)
+ bpf_map_put(env->used_maps[i]);
+}
+
+/* convert pseudo BPF_LD_IMM64 into generic BPF_LD_IMM64 */
+static void convert_pseudo_ld_imm64(struct verifier_env *env)
+{
+ struct bpf_insn *insn = env->prog->insnsi;
+ int insn_cnt = env->prog->len;
+ int i;
+
+ for (i = 0; i < insn_cnt; i++, insn++)
+ if (insn->code == (BPF_LD | BPF_IMM | BPF_DW))
+ insn->src_reg = 0;
+}
+
+static void free_states(struct verifier_env *env)
+{
+ struct verifier_state_list *sl, *sln;
+ int i;
+
+ if (!env->explored_states)
+ return;
+
+ for (i = 0; i < env->prog->len; i++) {
+ sl = env->explored_states[i];
+
+ if (sl)
+ while (sl != STATE_LIST_MARK) {
+ sln = sl->next;
+ kfree(sl);
+ sl = sln;
+ }
+ }
+
+ kfree(env->explored_states);
+}
+
+int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
+{
+ char __user *log_ubuf = NULL;
+ struct verifier_env *env;
+ int ret = -EINVAL;
+
+ if (prog->len <= 0 || prog->len > BPF_MAXINSNS)
+ return -E2BIG;
+
+ /* 'struct verifier_env' can be global, but since it's not small,
+ * allocate/free it every time bpf_check() is called
+ */
+ env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
+ if (!env)
+ return -ENOMEM;
+
+ env->prog = prog;
+
+ /* grab the mutex to protect few globals used by verifier */
+ mutex_lock(&bpf_verifier_lock);
+
+ if (attr->log_level || attr->log_buf || attr->log_size) {
+ /* user requested verbose verifier output
+ * and supplied buffer to store the verification trace
+ */
+ log_level = attr->log_level;
+ log_ubuf = (char __user *) (unsigned long) attr->log_buf;
+ log_size = attr->log_size;
+ log_len = 0;
+
+ ret = -EINVAL;
+ /* log_* values have to be sane */
+ if (log_size < 128 || log_size > UINT_MAX >> 8 ||
+ log_level == 0 || log_ubuf == NULL)
+ goto free_env;
+
+ ret = -ENOMEM;
+ log_buf = vmalloc(log_size);
+ if (!log_buf)
+ goto free_env;
+ } else {
+ log_level = 0;
+ }
+
+ ret = replace_map_fd_with_map_ptr(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ env->explored_states = kcalloc(prog->len,
+ sizeof(struct verifier_state_list *),
+ GFP_USER);
+ ret = -ENOMEM;
+ if (!env->explored_states)
+ goto skip_full_check;
+
+ ret = check_cfg(env);
+ if (ret < 0)
+ goto skip_full_check;
+
+ ret = do_check(env);
+
+skip_full_check:
+ while (pop_stack(env, NULL) >= 0);
+ free_states(env);
+
+ if (log_level && log_len >= log_size - 1) {
+ BUG_ON(log_len >= log_size);
+ /* verifier log exceeded user supplied buffer */
+ ret = -ENOSPC;
+ /* fall through to return what was recorded */
+ }
+
+ /* copy verifier log back to user space including trailing zero */
+ if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
+ ret = -EFAULT;
+ goto free_log_buf;
+ }
+
+ if (ret == 0 && env->used_map_cnt) {
+ /* if program passed verifier, update used_maps in bpf_prog_info */
+ prog->aux->used_maps = kmalloc_array(env->used_map_cnt,
+ sizeof(env->used_maps[0]),
+ GFP_KERNEL);
+
+ if (!prog->aux->used_maps) {
+ ret = -ENOMEM;
+ goto free_log_buf;
+ }
+
+ memcpy(prog->aux->used_maps, env->used_maps,
+ sizeof(env->used_maps[0]) * env->used_map_cnt);
+ prog->aux->used_map_cnt = env->used_map_cnt;
+
+ /* program is valid. Convert pseudo bpf_ld_imm64 into generic
+ * bpf_ld_imm64 instructions
+ */
+ convert_pseudo_ld_imm64(env);
+ }
+
+free_log_buf:
+ if (log_level)
+ vfree(log_buf);
+free_env:
+ if (!prog->aux->used_maps)
+ /* if we didn't copy map pointers into bpf_prog_info, release
+ * them now. Otherwise free_bpf_prog_info() will release them.
+ */
+ release_maps(env);
+ kfree(env);
+ mutex_unlock(&bpf_verifier_lock);
+ return ret;
+}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 940aced4ed00..136eceadeed1 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -185,7 +185,6 @@ static int need_forkexit_callback __read_mostly;
static struct cftype cgroup_dfl_base_files[];
static struct cftype cgroup_legacy_base_files[];
-static void cgroup_put(struct cgroup *cgrp);
static int rebind_subsystems(struct cgroup_root *dst_root,
unsigned int ss_mask);
static int cgroup_destroy_locked(struct cgroup *cgrp);
@@ -195,7 +194,6 @@ static void css_release(struct percpu_ref *ref);
static void kill_css(struct cgroup_subsys_state *css);
static int cgroup_addrm_files(struct cgroup *cgrp, struct cftype cfts[],
bool is_add);
-static void cgroup_pidlist_destroy_all(struct cgroup *cgrp);
/* IDR wrappers which synchronize using cgroup_idr_lock */
static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end,
@@ -331,14 +329,6 @@ bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor)
return false;
}
-static int cgroup_is_releasable(const struct cgroup *cgrp)
-{
- const int bits =
- (1 << CGRP_RELEASABLE) |
- (1 << CGRP_NOTIFY_ON_RELEASE);
- return (cgrp->flags & bits) == bits;
-}
-
static int notify_on_release(const struct cgroup *cgrp)
{
return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
@@ -394,12 +384,7 @@ static int notify_on_release(const struct cgroup *cgrp)
; \
else
-/* the list of cgroups eligible for automatic release. Protected by
- * release_list_lock */
-static LIST_HEAD(release_list);
-static DEFINE_RAW_SPINLOCK(release_list_lock);
static void cgroup_release_agent(struct work_struct *work);
-static DECLARE_WORK(release_agent_work, cgroup_release_agent);
static void check_for_release(struct cgroup *cgrp);
/*
@@ -498,7 +483,7 @@ static unsigned long css_set_hash(struct cgroup_subsys_state *css[])
return key;
}
-static void put_css_set_locked(struct css_set *cset, bool taskexit)
+static void put_css_set_locked(struct css_set *cset)
{
struct cgrp_cset_link *link, *tmp_link;
struct cgroup_subsys *ss;
@@ -524,11 +509,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
/* @cgrp can't go away while we're holding css_set_rwsem */
if (list_empty(&cgrp->cset_links)) {
cgroup_update_populated(cgrp, false);
- if (notify_on_release(cgrp)) {
- if (taskexit)
- set_bit(CGRP_RELEASABLE, &cgrp->flags);
- check_for_release(cgrp);
- }
+ check_for_release(cgrp);
}
kfree(link);
@@ -537,7 +518,7 @@ static void put_css_set_locked(struct css_set *cset, bool taskexit)
kfree_rcu(cset, rcu_head);
}
-static void put_css_set(struct css_set *cset, bool taskexit)
+static void put_css_set(struct css_set *cset)
{
/*
* Ensure that the refcount doesn't hit zero while any readers
@@ -548,7 +529,7 @@ static void put_css_set(struct css_set *cset, bool taskexit)
return;
down_write(&css_set_rwsem);
- put_css_set_locked(cset, taskexit);
+ put_css_set_locked(cset);
up_write(&css_set_rwsem);
}
@@ -969,14 +950,6 @@ static struct cgroup *task_cgroup_from_root(struct task_struct *task,
* knows that the cgroup won't be removed, as cgroup_rmdir()
* needs that mutex.
*
- * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
- * (usually) take cgroup_mutex. These are the two most performance
- * critical pieces of code here. The exception occurs on cgroup_exit(),
- * when a task in a notify_on_release cgroup exits. Then cgroup_mutex
- * is taken, and if the cgroup count is zero, a usermode call made
- * to the release agent with the name of the cgroup (path relative to
- * the root of cgroup file system) as the argument.
- *
* A cgroup can only be deleted if both its 'count' of using tasks
* is zero, and its list of 'children' cgroups is empty. Since all
* tasks in the system use _some_ cgroup, and since there is always at
@@ -1587,7 +1560,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->self.sibling);
INIT_LIST_HEAD(&cgrp->self.children);
INIT_LIST_HEAD(&cgrp->cset_links);
- INIT_LIST_HEAD(&cgrp->release_list);
INIT_LIST_HEAD(&cgrp->pidlists);
mutex_init(&cgrp->pidlist_mutex);
cgrp->self.cgroup = cgrp;
@@ -1597,6 +1569,7 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
INIT_LIST_HEAD(&cgrp->e_csets[ssid]);
init_waitqueue_head(&cgrp->offline_waitq);
+ INIT_WORK(&cgrp->release_agent_work, cgroup_release_agent);
}
static void init_cgroup_root(struct cgroup_root *root,
@@ -1634,7 +1607,8 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned int ss_mask)
goto out;
root_cgrp->id = ret;
- ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release);
+ ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0,
+ GFP_KERNEL);
if (ret)
goto out;
@@ -2052,8 +2026,7 @@ static void cgroup_task_migrate(struct cgroup *old_cgrp,
* task. As trading it for new_cset is protected by cgroup_mutex,
* we're safe to drop it here; it will be freed under RCU.
*/
- set_bit(CGRP_RELEASABLE, &old_cgrp->flags);
- put_css_set_locked(old_cset, false);
+ put_css_set_locked(old_cset);
}
/**
@@ -2074,7 +2047,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets)
cset->mg_src_cgrp = NULL;
cset->mg_dst_cset = NULL;
list_del_init(&cset->mg_preload_node);
- put_css_set_locked(cset, false);
+ put_css_set_locked(cset);
}
up_write(&css_set_rwsem);
}
@@ -2168,8 +2141,8 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
if (src_cset == dst_cset) {
src_cset->mg_src_cgrp = NULL;
list_del_init(&src_cset->mg_preload_node);
- put_css_set(src_cset, false);
- put_css_set(dst_cset, false);
+ put_css_set(src_cset);
+ put_css_set(dst_cset);
continue;
}
@@ -2178,7 +2151,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp,
if (list_empty(&dst_cset->mg_preload_node))
list_add(&dst_cset->mg_preload_node, &csets);
else
- put_css_set(dst_cset, false);
+ put_css_set(dst_cset);
}
list_splice_tail(&csets, preloaded_csets);
@@ -3985,7 +3958,6 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type,
l = cgroup_pidlist_find_create(cgrp, type);
if (!l) {
- mutex_unlock(&cgrp->pidlist_mutex);
pidlist_free(array);
return -ENOMEM;
}
@@ -4174,7 +4146,6 @@ static u64 cgroup_read_notify_on_release(struct cgroup_subsys_state *css,
static int cgroup_write_notify_on_release(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
- clear_bit(CGRP_RELEASABLE, &css->cgroup->flags);
if (val)
set_bit(CGRP_NOTIFY_ON_RELEASE, &css->cgroup->flags);
else
@@ -4352,6 +4323,7 @@ static void css_free_work_fn(struct work_struct *work)
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
cgroup_pidlist_destroy_all(cgrp);
+ cancel_work_sync(&cgrp->release_agent_work);
if (cgroup_parent(cgrp)) {
/*
@@ -4511,7 +4483,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
init_and_link_css(css, ss, cgrp);
- err = percpu_ref_init(&css->refcnt, css_release);
+ err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
if (err)
goto err_free_css;
@@ -4584,7 +4556,7 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name,
goto out_unlock;
}
- ret = percpu_ref_init(&cgrp->self.refcnt, css_release);
+ ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL);
if (ret)
goto out_free_cgrp;
@@ -4814,19 +4786,12 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
for_each_css(css, ssid, cgrp)
kill_css(css);
- /* CSS_ONLINE is clear, remove from ->release_list for the last time */
- raw_spin_lock(&release_list_lock);
- if (!list_empty(&cgrp->release_list))
- list_del_init(&cgrp->release_list);
- raw_spin_unlock(&release_list_lock);
-
/*
* Remove @cgrp directory along with the base files. @cgrp has an
* extra ref on its kn.
*/
kernfs_remove(cgrp->kn);
- set_bit(CGRP_RELEASABLE, &cgroup_parent(cgrp)->flags);
check_for_release(cgroup_parent(cgrp));
/* put the base reference */
@@ -4843,13 +4808,10 @@ static int cgroup_rmdir(struct kernfs_node *kn)
cgrp = cgroup_kn_lock_live(kn);
if (!cgrp)
return 0;
- cgroup_get(cgrp); /* for @kn->priv clearing */
ret = cgroup_destroy_locked(cgrp);
cgroup_kn_unlock(kn);
-
- cgroup_put(cgrp);
return ret;
}
@@ -5053,12 +5015,9 @@ core_initcall(cgroup_wq_init);
* - Print task's cgroup paths into seq_file, one line for each hierarchy
* - Used for /proc/<pid>/cgroup.
*/
-
-/* TODO: Use a proper seq_file iterator */
-int proc_cgroup_show(struct seq_file *m, void *v)
+int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
{
- struct pid *pid;
- struct task_struct *tsk;
char *buf, *path;
int retval;
struct cgroup_root *root;
@@ -5068,14 +5027,6 @@ int proc_cgroup_show(struct seq_file *m, void *v)
if (!buf)
goto out;
- retval = -ESRCH;
- pid = m->private;
- tsk = get_pid_task(pid, PIDTYPE_PID);
- if (!tsk)
- goto out_free;
-
- retval = 0;
-
mutex_lock(&cgroup_mutex);
down_read(&css_set_rwsem);
@@ -5105,11 +5056,10 @@ int proc_cgroup_show(struct seq_file *m, void *v)
seq_putc(m, '\n');
}
+ retval = 0;
out_unlock:
up_read(&css_set_rwsem);
mutex_unlock(&cgroup_mutex);
- put_task_struct(tsk);
-out_free:
kfree(buf);
out:
return retval;
@@ -5180,7 +5130,7 @@ void cgroup_post_fork(struct task_struct *child)
int i;
/*
- * This may race against cgroup_enable_task_cg_links(). As that
+ * This may race against cgroup_enable_task_cg_lists(). As that
* function sets use_task_css_set_links before grabbing
* tasklist_lock and we just went through tasklist_lock to add
* @child, it's guaranteed that either we see the set
@@ -5195,7 +5145,7 @@ void cgroup_post_fork(struct task_struct *child)
* when implementing operations which need to migrate all tasks of
* a cgroup to another.
*
- * Note that if we lose to cgroup_enable_task_cg_links(), @child
+ * Note that if we lose to cgroup_enable_task_cg_lists(), @child
* will remain in init_css_set. This is safe because all tasks are
* in the init_css_set before cg_links is enabled and there's no
* operation which transfers all tasks out of init_css_set.
@@ -5279,30 +5229,14 @@ void cgroup_exit(struct task_struct *tsk)
}
if (put_cset)
- put_css_set(cset, true);
+ put_css_set(cset);
}
static void check_for_release(struct cgroup *cgrp)
{
- if (cgroup_is_releasable(cgrp) && list_empty(&cgrp->cset_links) &&
- !css_has_online_children(&cgrp->self)) {
- /*
- * Control Group is currently removeable. If it's not
- * already queued for a userspace notification, queue
- * it now
- */
- int need_schedule_work = 0;
-
- raw_spin_lock(&release_list_lock);
- if (!cgroup_is_dead(cgrp) &&
- list_empty(&cgrp->release_list)) {
- list_add(&cgrp->release_list, &release_list);
- need_schedule_work = 1;
- }
- raw_spin_unlock(&release_list_lock);
- if (need_schedule_work)
- schedule_work(&release_agent_work);
- }
+ if (notify_on_release(cgrp) && !cgroup_has_tasks(cgrp) &&
+ !css_has_online_children(&cgrp->self) && !cgroup_is_dead(cgrp))
+ schedule_work(&cgrp->release_agent_work);
}
/*
@@ -5330,52 +5264,39 @@ static void check_for_release(struct cgroup *cgrp)
*/
static void cgroup_release_agent(struct work_struct *work)
{
- BUG_ON(work != &release_agent_work);
+ struct cgroup *cgrp =
+ container_of(work, struct cgroup, release_agent_work);
+ char *pathbuf = NULL, *agentbuf = NULL, *path;
+ char *argv[3], *envp[3];
+
mutex_lock(&cgroup_mutex);
- raw_spin_lock(&release_list_lock);
- while (!list_empty(&release_list)) {
- char *argv[3], *envp[3];
- int i;
- char *pathbuf = NULL, *agentbuf = NULL, *path;
- struct cgroup *cgrp = list_entry(release_list.next,
- struct cgroup,
- release_list);
- list_del_init(&cgrp->release_list);
- raw_spin_unlock(&release_list_lock);
- pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
- if (!pathbuf)
- goto continue_free;
- path = cgroup_path(cgrp, pathbuf, PATH_MAX);
- if (!path)
- goto continue_free;
- agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
- if (!agentbuf)
- goto continue_free;
-
- i = 0;
- argv[i++] = agentbuf;
- argv[i++] = path;
- argv[i] = NULL;
-
- i = 0;
- /* minimal command environment */
- envp[i++] = "HOME=/";
- envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
- envp[i] = NULL;
-
- /* Drop the lock while we invoke the usermode helper,
- * since the exec could involve hitting disk and hence
- * be a slow process */
- mutex_unlock(&cgroup_mutex);
- call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
- mutex_lock(&cgroup_mutex);
- continue_free:
- kfree(pathbuf);
- kfree(agentbuf);
- raw_spin_lock(&release_list_lock);
- }
- raw_spin_unlock(&release_list_lock);
+
+ pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+ agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
+ if (!pathbuf || !agentbuf)
+ goto out;
+
+ path = cgroup_path(cgrp, pathbuf, PATH_MAX);
+ if (!path)
+ goto out;
+
+ argv[0] = agentbuf;
+ argv[1] = path;
+ argv[2] = NULL;
+
+ /* minimal command environment */
+ envp[0] = "HOME=/";
+ envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+ envp[2] = NULL;
+
+ mutex_unlock(&cgroup_mutex);
+ call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ goto out_free;
+out:
mutex_unlock(&cgroup_mutex);
+out_free:
+ kfree(agentbuf);
+ kfree(pathbuf);
}
static int __init cgroup_disable(char *str)
@@ -5563,7 +5484,8 @@ static int cgroup_css_links_read(struct seq_file *seq, void *v)
static u64 releasable_read(struct cgroup_subsys_state *css, struct cftype *cft)
{
- return test_bit(CGRP_RELEASABLE, &css->cgroup->flags);
+ return (!cgroup_has_tasks(css->cgroup) &&
+ !css_has_online_children(&css->cgroup->self));
}
static struct cftype debug_files[] = {
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
new file mode 100644
index 000000000000..c2de56ab0fce
--- /dev/null
+++ b/kernel/configs/tiny.config
@@ -0,0 +1,4 @@
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_KERNEL_XZ=y
+CONFIG_OPTIMIZE_INLINING=y
+CONFIG_SLOB=y
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 22874d7cf2c0..1f107c74087b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -365,13 +365,14 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
struct task_struct *tsk)
{
if (is_spread_page(cs))
- tsk->flags |= PF_SPREAD_PAGE;
+ task_set_spread_page(tsk);
else
- tsk->flags &= ~PF_SPREAD_PAGE;
+ task_clear_spread_page(tsk);
+
if (is_spread_slab(cs))
- tsk->flags |= PF_SPREAD_SLAB;
+ task_set_spread_slab(tsk);
else
- tsk->flags &= ~PF_SPREAD_SLAB;
+ task_clear_spread_slab(tsk);
}
/*
@@ -2729,10 +2730,9 @@ void __cpuset_memory_pressure_bump(void)
* and we take cpuset_mutex, keeping cpuset_attach() from changing it
* anyway.
*/
-int proc_cpuset_show(struct seq_file *m, void *unused_v)
+int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
+ struct pid *pid, struct task_struct *tsk)
{
- struct pid *pid;
- struct task_struct *tsk;
char *buf, *p;
struct cgroup_subsys_state *css;
int retval;
@@ -2742,24 +2742,16 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
if (!buf)
goto out;
- retval = -ESRCH;
- pid = m->private;
- tsk = get_pid_task(pid, PIDTYPE_PID);
- if (!tsk)
- goto out_free;
-
retval = -ENAMETOOLONG;
rcu_read_lock();
css = task_css(tsk, cpuset_cgrp_id);
p = cgroup_path(css->cgroup, buf, PATH_MAX);
rcu_read_unlock();
if (!p)
- goto out_put_task;
+ goto out_free;
seq_puts(m, p);
seq_putc(m, '\n');
retval = 0;
-out_put_task:
- put_task_struct(tsk);
out_free:
kfree(buf);
out:
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
index c766ee54c0b1..b64e238b553b 100644
--- a/kernel/crash_dump.c
+++ b/kernel/crash_dump.c
@@ -18,6 +18,7 @@ unsigned long saved_max_pfn;
* it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
*/
unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+EXPORT_SYMBOL_GPL(elfcorehdr_addr);
/*
* stores the size of elf header of crash image
diff --git a/kernel/events/core.c b/kernel/events/core.c
index d640a8b4dcbc..b1c663593f5c 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -392,14 +392,9 @@ perf_cgroup_match(struct perf_event *event)
event->cgrp->css.cgroup);
}
-static inline void perf_put_cgroup(struct perf_event *event)
-{
- css_put(&event->cgrp->css);
-}
-
static inline void perf_detach_cgroup(struct perf_event *event)
{
- perf_put_cgroup(event);
+ css_put(&event->cgrp->css);
event->cgrp = NULL;
}
@@ -7948,8 +7943,10 @@ int perf_event_init_task(struct task_struct *child)
for_each_task_context_nr(ctxn) {
ret = perf_event_init_context(child, ctxn);
- if (ret)
+ if (ret) {
+ perf_event_free_task(child);
return ret;
+ }
}
return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index 0cf9cdb6e491..8c162d102740 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -601,9 +601,8 @@ static void check_mm(struct mm_struct *mm)
printk(KERN_ALERT "BUG: Bad rss-counter state "
"mm:%p idx:%d val:%ld\n", mm, i, x);
}
-
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
- VM_BUG_ON(mm->pmd_huge_pte);
+ VM_BUG_ON_MM(mm->pmd_huge_pte, mm);
#endif
}
@@ -1360,7 +1359,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
goto bad_fork_cleanup_policy;
retval = audit_alloc(p);
if (retval)
- goto bad_fork_cleanup_policy;
+ goto bad_fork_cleanup_perf;
/* copy all the process information */
shm_init_task(p);
retval = copy_semundo(clone_flags, p);
@@ -1566,8 +1565,9 @@ bad_fork_cleanup_semundo:
exit_sem(p);
bad_fork_cleanup_audit:
audit_free(p);
-bad_fork_cleanup_policy:
+bad_fork_cleanup_perf:
perf_event_free_task(p);
+bad_fork_cleanup_policy:
#ifdef CONFIG_NUMA
mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index d04ce8ac4399..cf66c5c8458e 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -35,7 +35,7 @@ config GCOV_KERNEL
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
depends on GCOV_KERNEL
- depends on SUPERH || S390 || X86 || PPC || MICROBLAZE
+ depends on SUPERH || S390 || X86 || PPC || MICROBLAZE || ARM
default n
---help---
This options activates profiling for the entire kernel.
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index d269cecdfbf0..225086b2652e 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -55,6 +55,9 @@ config GENERIC_IRQ_CHIP
config IRQ_DOMAIN
bool
+config HANDLE_DOMAIN_IRQ
+ bool
+
config IRQ_DOMAIN_DEBUG
bool "Expose hardware/virtual IRQ mapping via debugfs"
depends on IRQ_DOMAIN && DEBUG_FS
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6223fab9a9d2..8fb52e9bddc1 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -342,6 +342,31 @@ static bool irq_check_poll(struct irq_desc *desc)
return irq_wait_for_poll(desc);
}
+static bool irq_may_run(struct irq_desc *desc)
+{
+ unsigned int mask = IRQD_IRQ_INPROGRESS | IRQD_WAKEUP_ARMED;
+
+ /*
+ * If the interrupt is not in progress and is not an armed
+ * wakeup interrupt, proceed.
+ */
+ if (!irqd_has_set(&desc->irq_data, mask))
+ return true;
+
+ /*
+ * If the interrupt is an armed wakeup source, mark it pending
+ * and suspended, disable it and notify the pm core about the
+ * event.
+ */
+ if (irq_pm_check_wakeup(desc))
+ return false;
+
+ /*
+ * Handle a potential concurrent poll on a different core.
+ */
+ return irq_check_poll(desc);
+}
+
/**
* handle_simple_irq - Simple and software-decoded IRQs.
* @irq: the interrupt number
@@ -359,9 +384,8 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
{
raw_spin_lock(&desc->lock);
- if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
- if (!irq_check_poll(desc))
- goto out_unlock;
+ if (!irq_may_run(desc))
+ goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
@@ -412,9 +436,8 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
raw_spin_lock(&desc->lock);
mask_ack_irq(desc);
- if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
- if (!irq_check_poll(desc))
- goto out_unlock;
+ if (!irq_may_run(desc))
+ goto out_unlock;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
@@ -485,9 +508,8 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
raw_spin_lock(&desc->lock);
- if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
- if (!irq_check_poll(desc))
- goto out;
+ if (!irq_may_run(desc))
+ goto out;
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
@@ -541,19 +563,23 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
raw_spin_lock(&desc->lock);
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ if (!irq_may_run(desc)) {
+ desc->istate |= IRQS_PENDING;
+ mask_ack_irq(desc);
+ goto out_unlock;
+ }
+
/*
- * If we're currently running this IRQ, or its disabled,
- * we shouldn't process the IRQ. Mark it pending, handle
- * the necessary masking and go out
+ * If its disabled or no action available then mask it and get
+ * out of here.
*/
- if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
- irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
- if (!irq_check_poll(desc)) {
- desc->istate |= IRQS_PENDING;
- mask_ack_irq(desc);
- goto out_unlock;
- }
+ if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
+ desc->istate |= IRQS_PENDING;
+ mask_ack_irq(desc);
+ goto out_unlock;
}
+
kstat_incr_irqs_this_cpu(irq, desc);
/* Start handling the irq */
@@ -602,18 +628,21 @@ void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
raw_spin_lock(&desc->lock);
desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+
+ if (!irq_may_run(desc)) {
+ desc->istate |= IRQS_PENDING;
+ goto out_eoi;
+ }
+
/*
- * If we're currently running this IRQ, or its disabled,
- * we shouldn't process the IRQ. Mark it pending, handle
- * the necessary masking and go out
+ * If its disabled or no action available then mask it and get
+ * out of here.
*/
- if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
- irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
- if (!irq_check_poll(desc)) {
- desc->istate |= IRQS_PENDING;
- goto out_eoi;
- }
+ if (irqd_irq_disabled(&desc->irq_data) || !desc->action) {
+ desc->istate |= IRQS_PENDING;
+ goto out_eoi;
}
+
kstat_incr_irqs_this_cpu(irq, desc);
do {
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 1ef0606797c9..d5d0f7345c54 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -38,7 +38,7 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
*
* Except for the extra @dev argument, this function takes the
* same arguments and performs the same function as
- * request_irq(). IRQs requested with this function will be
+ * request_threaded_irq(). IRQs requested with this function will be
* automatically freed on driver detach.
*
* If an IRQ allocated with this function needs to be freed
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 099ea2e0eb88..4332d766619d 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -63,8 +63,8 @@ enum {
extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
unsigned long flags);
-extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
-extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
+extern void __disable_irq(struct irq_desc *desc, unsigned int irq);
+extern void __enable_irq(struct irq_desc *desc, unsigned int irq);
extern int irq_startup(struct irq_desc *desc, bool resend);
extern void irq_shutdown(struct irq_desc *desc);
@@ -194,3 +194,15 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, struct irq_desc *d
__this_cpu_inc(*desc->kstat_irqs);
__this_cpu_inc(kstat.irqs_sum);
}
+
+#ifdef CONFIG_PM_SLEEP
+bool irq_pm_check_wakeup(struct irq_desc *desc);
+void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action);
+void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action);
+#else
+static inline bool irq_pm_check_wakeup(struct irq_desc *desc) { return false; }
+static inline void
+irq_pm_install_action(struct irq_desc *desc, struct irqaction *action) { }
+static inline void
+irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) { }
+#endif
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 1487a123db5c..a1782f88f0af 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -14,6 +14,7 @@
#include <linux/kernel_stat.h>
#include <linux/radix-tree.h>
#include <linux/bitmap.h>
+#include <linux/irqdomain.h>
#include "internals.h"
@@ -336,6 +337,47 @@ int generic_handle_irq(unsigned int irq)
}
EXPORT_SYMBOL_GPL(generic_handle_irq);
+#ifdef CONFIG_HANDLE_DOMAIN_IRQ
+/**
+ * __handle_domain_irq - Invoke the handler for a HW irq belonging to a domain
+ * @domain: The domain where to perform the lookup
+ * @hwirq: The HW irq number to convert to a logical one
+ * @lookup: Whether to perform the domain lookup or not
+ * @regs: Register file coming from the low-level handling code
+ *
+ * Returns: 0 on success, or -EINVAL if conversion has failed
+ */
+int __handle_domain_irq(struct irq_domain *domain, unsigned int hwirq,
+ bool lookup, struct pt_regs *regs)
+{
+ struct pt_regs *old_regs = set_irq_regs(regs);
+ unsigned int irq = hwirq;
+ int ret = 0;
+
+ irq_enter();
+
+#ifdef CONFIG_IRQ_DOMAIN
+ if (lookup)
+ irq = irq_find_mapping(domain, hwirq);
+#endif
+
+ /*
+ * Some hardware gives randomly wrong interrupts. Rather
+ * than crashing, do something sensible.
+ */
+ if (unlikely(!irq || irq >= nr_irqs)) {
+ ack_bad_irq(irq);
+ ret = -EINVAL;
+ } else {
+ generic_handle_irq(irq);
+ }
+
+ irq_exit();
+ set_irq_regs(old_regs);
+ return ret;
+}
+#endif
+
/* Dynamic interrupt handling */
/**
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 3dc6a61bf06a..0a9104b4608b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -382,14 +382,8 @@ setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
}
#endif
-void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
+void __disable_irq(struct irq_desc *desc, unsigned int irq)
{
- if (suspend) {
- if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
- return;
- desc->istate |= IRQS_SUSPENDED;
- }
-
if (!desc->depth++)
irq_disable(desc);
}
@@ -401,7 +395,7 @@ static int __disable_irq_nosync(unsigned int irq)
if (!desc)
return -EINVAL;
- __disable_irq(desc, irq, false);
+ __disable_irq(desc, irq);
irq_put_desc_busunlock(desc, flags);
return 0;
}
@@ -442,20 +436,8 @@ void disable_irq(unsigned int irq)
}
EXPORT_SYMBOL(disable_irq);
-void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
+void __enable_irq(struct irq_desc *desc, unsigned int irq)
{
- if (resume) {
- if (!(desc->istate & IRQS_SUSPENDED)) {
- if (!desc->action)
- return;
- if (!(desc->action->flags & IRQF_FORCE_RESUME))
- return;
- /* Pretend that it got disabled ! */
- desc->depth++;
- }
- desc->istate &= ~IRQS_SUSPENDED;
- }
-
switch (desc->depth) {
case 0:
err_out:
@@ -497,7 +479,7 @@ void enable_irq(unsigned int irq)
KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
goto out;
- __enable_irq(desc, irq, false);
+ __enable_irq(desc, irq);
out:
irq_put_desc_busunlock(desc, flags);
}
@@ -1218,6 +1200,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
new->irq = irq;
*old_ptr = new;
+ irq_pm_install_action(desc, new);
+
/* Reset broken irq detection when installing new handler */
desc->irq_count = 0;
desc->irqs_unhandled = 0;
@@ -1228,7 +1212,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
*/
if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
desc->istate &= ~IRQS_SPURIOUS_DISABLED;
- __enable_irq(desc, irq, false);
+ __enable_irq(desc, irq);
}
raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -1336,6 +1320,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
/* Found it - now remove it from the list of entries: */
*action_ptr = action->next;
+ irq_pm_remove_action(desc, action);
+
/* If this was the last handler, shut down the IRQ line: */
if (!desc->action) {
irq_shutdown(desc);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index abcd6ca86cb7..3ca532592704 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -9,17 +9,105 @@
#include <linux/irq.h>
#include <linux/module.h>
#include <linux/interrupt.h>
+#include <linux/suspend.h>
#include <linux/syscore_ops.h>
#include "internals.h"
+bool irq_pm_check_wakeup(struct irq_desc *desc)
+{
+ if (irqd_is_wakeup_armed(&desc->irq_data)) {
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ desc->istate |= IRQS_SUSPENDED | IRQS_PENDING;
+ desc->depth++;
+ irq_disable(desc);
+ pm_system_wakeup();
+ return true;
+ }
+ return false;
+}
+
+/*
+ * Called from __setup_irq() with desc->lock held after @action has
+ * been installed in the action chain.
+ */
+void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action)
+{
+ desc->nr_actions++;
+
+ if (action->flags & IRQF_FORCE_RESUME)
+ desc->force_resume_depth++;
+
+ WARN_ON_ONCE(desc->force_resume_depth &&
+ desc->force_resume_depth != desc->nr_actions);
+
+ if (action->flags & IRQF_NO_SUSPEND)
+ desc->no_suspend_depth++;
+
+ WARN_ON_ONCE(desc->no_suspend_depth &&
+ desc->no_suspend_depth != desc->nr_actions);
+}
+
+/*
+ * Called from __free_irq() with desc->lock held after @action has
+ * been removed from the action chain.
+ */
+void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action)
+{
+ desc->nr_actions--;
+
+ if (action->flags & IRQF_FORCE_RESUME)
+ desc->force_resume_depth--;
+
+ if (action->flags & IRQF_NO_SUSPEND)
+ desc->no_suspend_depth--;
+}
+
+static bool suspend_device_irq(struct irq_desc *desc, int irq)
+{
+ if (!desc->action || desc->no_suspend_depth)
+ return false;
+
+ if (irqd_is_wakeup_set(&desc->irq_data)) {
+ irqd_set(&desc->irq_data, IRQD_WAKEUP_ARMED);
+ /*
+ * We return true here to force the caller to issue
+ * synchronize_irq(). We need to make sure that the
+ * IRQD_WAKEUP_ARMED is visible before we return from
+ * suspend_device_irqs().
+ */
+ return true;
+ }
+
+ desc->istate |= IRQS_SUSPENDED;
+ __disable_irq(desc, irq);
+
+ /*
+ * Hardware which has no wakeup source configuration facility
+ * requires that the non wakeup interrupts are masked at the
+ * chip level. The chip implementation indicates that with
+ * IRQCHIP_MASK_ON_SUSPEND.
+ */
+ if (irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
+ mask_irq(desc);
+ return true;
+}
+
/**
* suspend_device_irqs - disable all currently enabled interrupt lines
*
- * During system-wide suspend or hibernation device drivers need to be prevented
- * from receiving interrupts and this function is provided for this purpose.
- * It marks all interrupt lines in use, except for the timer ones, as disabled
- * and sets the IRQS_SUSPENDED flag for each of them.
+ * During system-wide suspend or hibernation device drivers need to be
+ * prevented from receiving interrupts and this function is provided
+ * for this purpose.
+ *
+ * So we disable all interrupts and mark them IRQS_SUSPENDED except
+ * for those which are unused, those which are marked as not
+ * suspendable via an interrupt request with the flag IRQF_NO_SUSPEND
+ * set and those which are marked as active wakeup sources.
+ *
+ * The active wakeup sources are handled by the flow handler entry
+ * code which checks for the IRQD_WAKEUP_ARMED flag, suspends the
+ * interrupt and notifies the pm core about the wakeup.
*/
void suspend_device_irqs(void)
{
@@ -28,18 +116,36 @@ void suspend_device_irqs(void)
for_each_irq_desc(irq, desc) {
unsigned long flags;
+ bool sync;
raw_spin_lock_irqsave(&desc->lock, flags);
- __disable_irq(desc, irq, true);
+ sync = suspend_device_irq(desc, irq);
raw_spin_unlock_irqrestore(&desc->lock, flags);
- }
- for_each_irq_desc(irq, desc)
- if (desc->istate & IRQS_SUSPENDED)
+ if (sync)
synchronize_irq(irq);
+ }
}
EXPORT_SYMBOL_GPL(suspend_device_irqs);
+static void resume_irq(struct irq_desc *desc, int irq)
+{
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_ARMED);
+
+ if (desc->istate & IRQS_SUSPENDED)
+ goto resume;
+
+ /* Force resume the interrupt? */
+ if (!desc->force_resume_depth)
+ return;
+
+ /* Pretend that it got disabled ! */
+ desc->depth++;
+resume:
+ desc->istate &= ~IRQS_SUSPENDED;
+ __enable_irq(desc, irq);
+}
+
static void resume_irqs(bool want_early)
{
struct irq_desc *desc;
@@ -54,7 +160,7 @@ static void resume_irqs(bool want_early)
continue;
raw_spin_lock_irqsave(&desc->lock, flags);
- __enable_irq(desc, irq, true);
+ resume_irq(desc, irq);
raw_spin_unlock_irqrestore(&desc->lock, flags);
}
}
@@ -93,38 +199,3 @@ void resume_device_irqs(void)
resume_irqs(false);
}
EXPORT_SYMBOL_GPL(resume_device_irqs);
-
-/**
- * check_wakeup_irqs - check if any wake-up interrupts are pending
- */
-int check_wakeup_irqs(void)
-{
- struct irq_desc *desc;
- int irq;
-
- for_each_irq_desc(irq, desc) {
- /*
- * Only interrupts which are marked as wakeup source
- * and have not been disabled before the suspend check
- * can abort suspend.
- */
- if (irqd_is_wakeup_set(&desc->irq_data)) {
- if (desc->depth == 1 && desc->istate & IRQS_PENDING)
- return -EBUSY;
- continue;
- }
- /*
- * Check the non wakeup interrupts whether they need
- * to be masked before finally going into suspend
- * state. That's for hardware which has no wakeup
- * source configuration facility. The chip
- * implementation indicates that with
- * IRQCHIP_MASK_ON_SUSPEND.
- */
- if (desc->istate & IRQS_SUSPENDED &&
- irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
- mask_irq(desc);
- }
-
- return 0;
-}
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
index e6bcbe756663..385b85aded19 100644
--- a/kernel/irq_work.c
+++ b/kernel/irq_work.c
@@ -115,8 +115,10 @@ bool irq_work_needs_cpu(void)
raised = &__get_cpu_var(raised_list);
lazy = &__get_cpu_var(lazy_list);
- if (llist_empty(raised) && llist_empty(lazy))
- return false;
+
+ if (llist_empty(raised) || arch_irq_work_has_interrupt())
+ if (llist_empty(lazy))
+ return false;
/* All work should have been flushed before going offline */
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
@@ -171,6 +173,15 @@ void irq_work_run(void)
}
EXPORT_SYMBOL_GPL(irq_work_run);
+void irq_work_tick(void)
+{
+ struct llist_head *raised = &__get_cpu_var(raised_list);
+
+ if (!llist_empty(raised) && !arch_irq_work_has_interrupt())
+ irq_work_run_list(raised);
+ irq_work_run_list(&__get_cpu_var(lazy_list));
+}
+
/*
* Synchronize against the irq_work @entry, ensures the entry is not
* currently in use.
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ef483220e855..10e489c448fe 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -369,7 +369,7 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
{
struct task_struct *p;
- p = kthread_create_on_node(threadfn, data, cpu_to_mem(cpu), namefmt,
+ p = kthread_create_on_node(threadfn, data, cpu_to_node(cpu), namefmt,
cpu);
if (IS_ERR(p))
return p;
diff --git a/kernel/module.c b/kernel/module.c
index 03214bd288e9..65586ffa0c98 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -135,7 +135,7 @@ static int param_set_bool_enable_only(const char *val,
}
static const struct kernel_param_ops param_ops_bool_enable_only = {
- .flags = KERNEL_PARAM_FL_NOARG,
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
.set = param_set_bool_enable_only,
.get = param_get_bool,
};
@@ -3388,7 +3388,7 @@ static inline int is_arm_mapping_symbol(const char *str)
{
if (str[0] == '.' && str[1] == 'L')
return true;
- return str[0] == '$' && strchr("atd", str[1])
+ return str[0] == '$' && strchr("axtd", str[1])
&& (str[2] == '\0' || str[2] == '.');
}
diff --git a/kernel/params.c b/kernel/params.c
index 34f527023794..041b5899d5e2 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -83,6 +83,15 @@ bool parameq(const char *a, const char *b)
return parameqn(a, b, strlen(a)+1);
}
+static void param_check_unsafe(const struct kernel_param *kp)
+{
+ if (kp->flags & KERNEL_PARAM_FL_UNSAFE) {
+ pr_warn("Setting dangerous option %s - tainting kernel\n",
+ kp->name);
+ add_taint(TAINT_USER, LOCKDEP_STILL_OK);
+ }
+}
+
static int parse_one(char *param,
char *val,
const char *doing,
@@ -104,11 +113,12 @@ static int parse_one(char *param,
return 0;
/* No one handled NULL, so do it here. */
if (!val &&
- !(params[i].ops->flags & KERNEL_PARAM_FL_NOARG))
+ !(params[i].ops->flags & KERNEL_PARAM_OPS_FL_NOARG))
return -EINVAL;
pr_debug("handling %s with %p\n", param,
params[i].ops->set);
mutex_lock(&param_lock);
+ param_check_unsafe(&params[i]);
err = params[i].ops->set(val, &params[i]);
mutex_unlock(&param_lock);
return err;
@@ -318,7 +328,7 @@ int param_get_bool(char *buffer, const struct kernel_param *kp)
EXPORT_SYMBOL(param_get_bool);
struct kernel_param_ops param_ops_bool = {
- .flags = KERNEL_PARAM_FL_NOARG,
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
.set = param_set_bool,
.get = param_get_bool,
};
@@ -369,7 +379,7 @@ int param_set_bint(const char *val, const struct kernel_param *kp)
EXPORT_SYMBOL(param_set_bint);
struct kernel_param_ops param_ops_bint = {
- .flags = KERNEL_PARAM_FL_NOARG,
+ .flags = KERNEL_PARAM_OPS_FL_NOARG,
.set = param_set_bint,
.get = param_get_int,
};
@@ -552,6 +562,7 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
return -EPERM;
mutex_lock(&param_lock);
+ param_check_unsafe(attribute->param);
err = attribute->param->ops->set(buf, attribute->param);
mutex_unlock(&param_lock);
if (!err)
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index e4e4121fa327..bbef57f5bdfd 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -302,6 +302,10 @@ config PM_GENERIC_DOMAINS_RUNTIME
def_bool y
depends on PM_RUNTIME && PM_GENERIC_DOMAINS
+config PM_GENERIC_DOMAINS_OF
+ def_bool y
+ depends on PM_GENERIC_DOMAINS && OF
+
config CPU_PM
bool
depends on SUSPEND || CPU_IDLE
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 4ee194eb524b..7b323221b9ee 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -129,6 +129,7 @@ int freeze_processes(void)
if (!pm_freezing)
atomic_inc(&system_freezing_cnt);
+ pm_wakeup_clear();
printk("Freezing user space processes ... ");
pm_freezing = true;
error = try_to_freeze_tasks(true);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index c4b8093c80b3..791a61892bb5 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -1343,6 +1343,9 @@ void swsusp_free(void)
{
unsigned long fb_pfn, fr_pfn;
+ if (!forbidden_pages_map || !free_pages_map)
+ goto out;
+
memory_bm_position_reset(forbidden_pages_map);
memory_bm_position_reset(free_pages_map);
@@ -1370,6 +1373,7 @@ loop:
goto loop;
}
+out:
nr_copy_pages = 0;
nr_meta_pages = 0;
restore_pblist = NULL;
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 18c62195660f..4ca9a33ff620 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -146,17 +146,29 @@ static int platform_suspend_prepare(suspend_state_t state)
static int platform_suspend_prepare_late(suspend_state_t state)
{
+ return state == PM_SUSPEND_FREEZE && freeze_ops->prepare ?
+ freeze_ops->prepare() : 0;
+}
+
+static int platform_suspend_prepare_noirq(suspend_state_t state)
+{
return state != PM_SUSPEND_FREEZE && suspend_ops->prepare_late ?
suspend_ops->prepare_late() : 0;
}
-static void platform_suspend_wake(suspend_state_t state)
+static void platform_resume_noirq(suspend_state_t state)
{
if (state != PM_SUSPEND_FREEZE && suspend_ops->wake)
suspend_ops->wake();
}
-static void platform_suspend_finish(suspend_state_t state)
+static void platform_resume_early(suspend_state_t state)
+{
+ if (state == PM_SUSPEND_FREEZE && freeze_ops->restore)
+ freeze_ops->restore();
+}
+
+static void platform_resume_finish(suspend_state_t state)
{
if (state != PM_SUSPEND_FREEZE && suspend_ops->finish)
suspend_ops->finish();
@@ -172,7 +184,7 @@ static int platform_suspend_begin(suspend_state_t state)
return 0;
}
-static void platform_suspend_end(suspend_state_t state)
+static void platform_resume_end(suspend_state_t state)
{
if (state == PM_SUSPEND_FREEZE && freeze_ops && freeze_ops->end)
freeze_ops->end();
@@ -180,7 +192,7 @@ static void platform_suspend_end(suspend_state_t state)
suspend_ops->end();
}
-static void platform_suspend_recover(suspend_state_t state)
+static void platform_recover(suspend_state_t state)
{
if (state != PM_SUSPEND_FREEZE && suspend_ops->recover)
suspend_ops->recover();
@@ -265,13 +277,22 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
if (error)
goto Platform_finish;
- error = dpm_suspend_end(PMSG_SUSPEND);
+ error = dpm_suspend_late(PMSG_SUSPEND);
if (error) {
- printk(KERN_ERR "PM: Some devices failed to power down\n");
+ printk(KERN_ERR "PM: late suspend of devices failed\n");
goto Platform_finish;
}
error = platform_suspend_prepare_late(state);
if (error)
+ goto Devices_early_resume;
+
+ error = dpm_suspend_noirq(PMSG_SUSPEND);
+ if (error) {
+ printk(KERN_ERR "PM: noirq suspend of devices failed\n");
+ goto Platform_early_resume;
+ }
+ error = platform_suspend_prepare_noirq(state);
+ if (error)
goto Platform_wake;
if (suspend_test(TEST_PLATFORM))
@@ -318,11 +339,17 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
enable_nonboot_cpus();
Platform_wake:
- platform_suspend_wake(state);
- dpm_resume_start(PMSG_RESUME);
+ platform_resume_noirq(state);
+ dpm_resume_noirq(PMSG_RESUME);
+
+ Platform_early_resume:
+ platform_resume_early(state);
+
+ Devices_early_resume:
+ dpm_resume_early(PMSG_RESUME);
Platform_finish:
- platform_suspend_finish(state);
+ platform_resume_finish(state);
return error;
}
@@ -361,14 +388,16 @@ int suspend_devices_and_enter(suspend_state_t state)
suspend_test_start();
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
+ trace_suspend_resume(TPS("resume_console"), state, true);
resume_console();
+ trace_suspend_resume(TPS("resume_console"), state, false);
Close:
- platform_suspend_end(state);
+ platform_resume_end(state);
return error;
Recover_platform:
- platform_suspend_recover(state);
+ platform_recover(state);
goto Resume_devices;
}
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index bd91bc177c93..084452e34a12 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -22,6 +22,8 @@
#define TEST_SUSPEND_SECONDS 10
static unsigned long suspend_test_start_time;
+static u32 test_repeat_count_max = 1;
+static u32 test_repeat_count_current;
void suspend_test_start(void)
{
@@ -74,6 +76,7 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
int status;
/* this may fail if the RTC hasn't been initialized */
+repeat:
status = rtc_read_time(rtc, &alm.time);
if (status < 0) {
printk(err_readtime, dev_name(&rtc->dev), status);
@@ -100,10 +103,21 @@ static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
if (state == PM_SUSPEND_STANDBY) {
printk(info_test, pm_states[state]);
status = pm_suspend(state);
+ if (status < 0)
+ state = PM_SUSPEND_FREEZE;
}
+ if (state == PM_SUSPEND_FREEZE) {
+ printk(info_test, pm_states[state]);
+ status = pm_suspend(state);
+ }
+
if (status < 0)
printk(err_suspend, status);
+ test_repeat_count_current++;
+ if (test_repeat_count_current < test_repeat_count_max)
+ goto repeat;
+
/* Some platforms can't detect that the alarm triggered the
* wakeup, or (accordingly) disable it after it afterwards.
* It's supposed to give oneshot behavior; cope.
@@ -137,16 +151,28 @@ static char warn_bad_state[] __initdata =
static int __init setup_test_suspend(char *value)
{
int i;
+ char *repeat;
+ char *suspend_type;
- /* "=mem" ==> "mem" */
+ /* example : "=mem[,N]" ==> "mem[,N]" */
value++;
+ suspend_type = strsep(&value, ",");
+ if (!suspend_type)
+ return 0;
+
+ repeat = strsep(&value, ",");
+ if (repeat) {
+ if (kstrtou32(repeat, 0, &test_repeat_count_max))
+ return 0;
+ }
+
for (i = 0; pm_labels[i]; i++)
- if (!strcmp(pm_labels[i], value)) {
+ if (!strcmp(pm_labels[i], suspend_type)) {
test_state_label = pm_labels[i];
return 0;
}
- printk(warn_bad_state, value);
+ printk(warn_bad_state, suspend_type);
return 0;
}
__setup("test_suspend", setup_test_suspend);
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 1ce770687ea8..7a6e69441f75 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -519,14 +519,13 @@ struct devkmsg_user {
char buf[8192];
};
-static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
- unsigned long count, loff_t pos)
+static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from)
{
char *buf, *line;
int i;
int level = default_message_loglevel;
int facility = 1; /* LOG_USER */
- size_t len = iov_length(iv, count);
+ size_t len = iocb->ki_nbytes;
ssize_t ret = len;
if (len > LOG_LINE_MAX)
@@ -535,13 +534,10 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
if (buf == NULL)
return -ENOMEM;
- line = buf;
- for (i = 0; i < count; i++) {
- if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) {
- ret = -EFAULT;
- goto out;
- }
- line += iv[i].iov_len;
+ buf[len] = '\0';
+ if (copy_from_iter(buf, len, from) != len) {
+ kfree(buf);
+ return -EFAULT;
}
/*
@@ -567,10 +563,8 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
line = endp;
}
}
- line[len] = '\0';
printk_emit(facility, level, NULL, 0, "%s", line);
-out:
kfree(buf);
return ret;
}
@@ -802,7 +796,7 @@ static int devkmsg_release(struct inode *inode, struct file *file)
const struct file_operations kmsg_fops = {
.open = devkmsg_open,
.read = devkmsg_read,
- .aio_write = devkmsg_writev,
+ .write_iter = devkmsg_write,
.llseek = devkmsg_llseek,
.poll = devkmsg_poll,
.release = devkmsg_release,
diff --git a/kernel/reboot.c b/kernel/reboot.c
index a3a9e240fcdb..5925f5ae8dff 100644
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -104,6 +104,87 @@ int unregister_reboot_notifier(struct notifier_block *nb)
}
EXPORT_SYMBOL(unregister_reboot_notifier);
+/*
+ * Notifier list for kernel code which wants to be called
+ * to restart the system.
+ */
+static ATOMIC_NOTIFIER_HEAD(restart_handler_list);
+
+/**
+ * register_restart_handler - Register function to be called to reset
+ * the system
+ * @nb: Info about handler function to be called
+ * @nb->priority: Handler priority. Handlers should follow the
+ * following guidelines for setting priorities.
+ * 0: Restart handler of last resort,
+ * with limited restart capabilities
+ * 128: Default restart handler; use if no other
+ * restart handler is expected to be available,
+ * and/or if restart functionality is
+ * sufficient to restart the entire system
+ * 255: Highest priority restart handler, will
+ * preempt all other restart handlers
+ *
+ * Registers a function with code to be called to restart the
+ * system.
+ *
+ * Registered functions will be called from machine_restart as last
+ * step of the restart sequence (if the architecture specific
+ * machine_restart function calls do_kernel_restart - see below
+ * for details).
+ * Registered functions are expected to restart the system immediately.
+ * If more than one function is registered, the restart handler priority
+ * selects which function will be called first.
+ *
+ * Restart handlers are expected to be registered from non-architecture
+ * code, typically from drivers. A typical use case would be a system
+ * where restart functionality is provided through a watchdog. Multiple
+ * restart handlers may exist; for example, one restart handler might
+ * restart the entire system, while another only restarts the CPU.
+ * In such cases, the restart handler which only restarts part of the
+ * hardware is expected to register with low priority to ensure that
+ * it only runs if no other means to restart the system is available.
+ *
+ * Currently always returns zero, as atomic_notifier_chain_register()
+ * always returns zero.
+ */
+int register_restart_handler(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&restart_handler_list, nb);
+}
+EXPORT_SYMBOL(register_restart_handler);
+
+/**
+ * unregister_restart_handler - Unregister previously registered
+ * restart handler
+ * @nb: Hook to be unregistered
+ *
+ * Unregisters a previously registered restart handler function.
+ *
+ * Returns zero on success, or %-ENOENT on failure.
+ */
+int unregister_restart_handler(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_unregister(&restart_handler_list, nb);
+}
+EXPORT_SYMBOL(unregister_restart_handler);
+
+/**
+ * do_kernel_restart - Execute kernel restart handler call chain
+ *
+ * Calls functions registered with register_restart_handler.
+ *
+ * Expected to be called from machine_restart as last step of the restart
+ * sequence.
+ *
+ * Restarts the system immediately if a restart handler function has been
+ * registered. Otherwise does nothing.
+ */
+void do_kernel_restart(char *cmd)
+{
+ atomic_notifier_call_chain(&restart_handler_list, reboot_mode, cmd);
+}
+
void migrate_to_reboot_cpu(void)
{
/* The boot cpu is always logical cpu 0 */
diff --git a/kernel/resource.c b/kernel/resource.c
index 60c5a3856ab7..46322019ab7d 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1245,6 +1245,76 @@ int release_mem_region_adjustable(struct resource *parent,
/*
* Managed region resource
*/
+static void devm_resource_release(struct device *dev, void *ptr)
+{
+ struct resource **r = ptr;
+
+ release_resource(*r);
+}
+
+/**
+ * devm_request_resource() - request and reserve an I/O or memory resource
+ * @dev: device for which to request the resource
+ * @root: root of the resource tree from which to request the resource
+ * @new: descriptor of the resource to request
+ *
+ * This is a device-managed version of request_resource(). There is usually
+ * no need to release resources requested by this function explicitly since
+ * that will be taken care of when the device is unbound from its driver.
+ * If for some reason the resource needs to be released explicitly, because
+ * of ordering issues for example, drivers must call devm_release_resource()
+ * rather than the regular release_resource().
+ *
+ * When a conflict is detected between any existing resources and the newly
+ * requested resource, an error message will be printed.
+ *
+ * Returns 0 on success or a negative error code on failure.
+ */
+int devm_request_resource(struct device *dev, struct resource *root,
+ struct resource *new)
+{
+ struct resource *conflict, **ptr;
+
+ ptr = devres_alloc(devm_resource_release, sizeof(*ptr), GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
+
+ *ptr = new;
+
+ conflict = request_resource_conflict(root, new);
+ if (conflict) {
+ dev_err(dev, "resource collision: %pR conflicts with %s %pR\n",
+ new, conflict->name, conflict);
+ devres_free(ptr);
+ return -EBUSY;
+ }
+
+ devres_add(dev, ptr);
+ return 0;
+}
+EXPORT_SYMBOL(devm_request_resource);
+
+static int devm_resource_match(struct device *dev, void *res, void *data)
+{
+ struct resource **ptr = res;
+
+ return *ptr == data;
+}
+
+/**
+ * devm_release_resource() - release a previously requested resource
+ * @dev: device for which to release the resource
+ * @new: descriptor of the resource to release
+ *
+ * Releases a resource previously requested using devm_request_resource().
+ */
+void devm_release_resource(struct device *dev, struct resource *new)
+{
+ WARN_ON(devres_release(dev, devm_resource_release, devm_resource_match,
+ new));
+}
+EXPORT_SYMBOL(devm_release_resource);
+
struct region_devres {
struct resource *parent;
resource_size_t start;
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index ec1a286684a5..59965ec0b7de 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2366,6 +2366,18 @@ unsigned long nr_running(void)
return sum;
}
+/*
+ * Check if only the current task is running on the cpu.
+ */
+bool single_task_running(void)
+{
+ if (cpu_rq(smp_processor_id())->nr_running == 1)
+ return true;
+ else
+ return false;
+}
+EXPORT_SYMBOL(single_task_running);
+
unsigned long long nr_context_switches(void)
{
int i;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index bfa3c86d0d68..82088b29704e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1946,7 +1946,7 @@ void task_numa_work(struct callback_head *work)
vma = mm->mmap;
}
for (; vma; vma = vma->vm_next) {
- if (!vma_migratable(vma) || !vma_policy_mof(p, vma))
+ if (!vma_migratable(vma) || !vma_policy_mof(vma))
continue;
/*
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 15cab1a4f84e..5a62915f47a8 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -343,6 +343,18 @@ int __sched out_of_line_wait_on_bit(void *word, int bit,
}
EXPORT_SYMBOL(out_of_line_wait_on_bit);
+int __sched out_of_line_wait_on_bit_timeout(
+ void *word, int bit, wait_bit_action_f *action,
+ unsigned mode, unsigned long timeout)
+{
+ wait_queue_head_t *wq = bit_waitqueue(word, bit);
+ DEFINE_WAIT_BIT(wait, word, bit);
+
+ wait.key.timeout = jiffies + timeout;
+ return __wait_on_bit(wq, &wait, action, mode);
+}
+EXPORT_SYMBOL_GPL(out_of_line_wait_on_bit_timeout);
+
int __sched
__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
wait_bit_action_f *action, unsigned mode)
@@ -520,3 +532,27 @@ __sched int bit_wait_io(struct wait_bit_key *word)
return 0;
}
EXPORT_SYMBOL(bit_wait_io);
+
+__sched int bit_wait_timeout(struct wait_bit_key *word)
+{
+ unsigned long now = ACCESS_ONCE(jiffies);
+ if (signal_pending_state(current->state, current))
+ return 1;
+ if (time_after_eq(now, word->timeout))
+ return -EAGAIN;
+ schedule_timeout(word->timeout - now);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_timeout);
+
+__sched int bit_wait_io_timeout(struct wait_bit_key *word)
+{
+ unsigned long now = ACCESS_ONCE(jiffies);
+ if (signal_pending_state(current->state, current))
+ return 1;
+ if (time_after_eq(now, word->timeout))
+ return -EAGAIN;
+ io_schedule_timeout(word->timeout - now);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bit_wait_io_timeout);
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 44eb005c6695..84922befea84 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -395,16 +395,15 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
if (!filter)
goto free_prog;
- filter->prog = kzalloc(bpf_prog_size(new_len),
- GFP_KERNEL|__GFP_NOWARN);
+ filter->prog = bpf_prog_alloc(bpf_prog_size(new_len), __GFP_NOWARN);
if (!filter->prog)
goto free_filter;
ret = bpf_convert_filter(fp, fprog->len, filter->prog->insnsi, &new_len);
if (ret)
goto free_filter_prog;
- kfree(fp);
+ kfree(fp);
atomic_set(&filter->usage, 1);
filter->prog->len = new_len;
@@ -413,7 +412,7 @@ static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
return filter;
free_filter_prog:
- kfree(filter->prog);
+ __bpf_prog_free(filter->prog);
free_filter:
kfree(filter);
free_prog:
diff --git a/kernel/sys.c b/kernel/sys.c
index ce8129192a26..dfce4debd138 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -62,28 +62,28 @@
#include <asm/unistd.h>
#ifndef SET_UNALIGN_CTL
-# define SET_UNALIGN_CTL(a,b) (-EINVAL)
+# define SET_UNALIGN_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_UNALIGN_CTL
-# define GET_UNALIGN_CTL(a,b) (-EINVAL)
+# define GET_UNALIGN_CTL(a, b) (-EINVAL)
#endif
#ifndef SET_FPEMU_CTL
-# define SET_FPEMU_CTL(a,b) (-EINVAL)
+# define SET_FPEMU_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_FPEMU_CTL
-# define GET_FPEMU_CTL(a,b) (-EINVAL)
+# define GET_FPEMU_CTL(a, b) (-EINVAL)
#endif
#ifndef SET_FPEXC_CTL
-# define SET_FPEXC_CTL(a,b) (-EINVAL)
+# define SET_FPEXC_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_FPEXC_CTL
-# define GET_FPEXC_CTL(a,b) (-EINVAL)
+# define GET_FPEXC_CTL(a, b) (-EINVAL)
#endif
#ifndef GET_ENDIAN
-# define GET_ENDIAN(a,b) (-EINVAL)
+# define GET_ENDIAN(a, b) (-EINVAL)
#endif
#ifndef SET_ENDIAN
-# define SET_ENDIAN(a,b) (-EINVAL)
+# define SET_ENDIAN(a, b) (-EINVAL)
#endif
#ifndef GET_TSC_CTL
# define GET_TSC_CTL(a) (-EINVAL)
@@ -182,39 +182,40 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
rcu_read_lock();
read_lock(&tasklist_lock);
switch (which) {
- case PRIO_PROCESS:
- if (who)
- p = find_task_by_vpid(who);
- else
- p = current;
- if (p)
- error = set_one_prio(p, niceval, error);
- break;
- case PRIO_PGRP:
- if (who)
- pgrp = find_vpid(who);
- else
- pgrp = task_pgrp(current);
- do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
- error = set_one_prio(p, niceval, error);
- } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
- break;
- case PRIO_USER:
- uid = make_kuid(cred->user_ns, who);
- user = cred->user;
- if (!who)
- uid = cred->uid;
- else if (!uid_eq(uid, cred->uid) &&
- !(user = find_user(uid)))
+ case PRIO_PROCESS:
+ if (who)
+ p = find_task_by_vpid(who);
+ else
+ p = current;
+ if (p)
+ error = set_one_prio(p, niceval, error);
+ break;
+ case PRIO_PGRP:
+ if (who)
+ pgrp = find_vpid(who);
+ else
+ pgrp = task_pgrp(current);
+ do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
+ error = set_one_prio(p, niceval, error);
+ } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ break;
+ case PRIO_USER:
+ uid = make_kuid(cred->user_ns, who);
+ user = cred->user;
+ if (!who)
+ uid = cred->uid;
+ else if (!uid_eq(uid, cred->uid)) {
+ user = find_user(uid);
+ if (!user)
goto out_unlock; /* No processes for this user */
-
- do_each_thread(g, p) {
- if (uid_eq(task_uid(p), uid))
- error = set_one_prio(p, niceval, error);
- } while_each_thread(g, p);
- if (!uid_eq(uid, cred->uid))
- free_uid(user); /* For find_user() */
- break;
+ }
+ do_each_thread(g, p) {
+ if (uid_eq(task_uid(p), uid))
+ error = set_one_prio(p, niceval, error);
+ } while_each_thread(g, p);
+ if (!uid_eq(uid, cred->uid))
+ free_uid(user); /* For find_user() */
+ break;
}
out_unlock:
read_unlock(&tasklist_lock);
@@ -244,47 +245,48 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who)
rcu_read_lock();
read_lock(&tasklist_lock);
switch (which) {
- case PRIO_PROCESS:
- if (who)
- p = find_task_by_vpid(who);
- else
- p = current;
- if (p) {
+ case PRIO_PROCESS:
+ if (who)
+ p = find_task_by_vpid(who);
+ else
+ p = current;
+ if (p) {
+ niceval = nice_to_rlimit(task_nice(p));
+ if (niceval > retval)
+ retval = niceval;
+ }
+ break;
+ case PRIO_PGRP:
+ if (who)
+ pgrp = find_vpid(who);
+ else
+ pgrp = task_pgrp(current);
+ do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
+ niceval = nice_to_rlimit(task_nice(p));
+ if (niceval > retval)
+ retval = niceval;
+ } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ break;
+ case PRIO_USER:
+ uid = make_kuid(cred->user_ns, who);
+ user = cred->user;
+ if (!who)
+ uid = cred->uid;
+ else if (!uid_eq(uid, cred->uid)) {
+ user = find_user(uid);
+ if (!user)
+ goto out_unlock; /* No processes for this user */
+ }
+ do_each_thread(g, p) {
+ if (uid_eq(task_uid(p), uid)) {
niceval = nice_to_rlimit(task_nice(p));
if (niceval > retval)
retval = niceval;
}
- break;
- case PRIO_PGRP:
- if (who)
- pgrp = find_vpid(who);
- else
- pgrp = task_pgrp(current);
- do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
- niceval = nice_to_rlimit(task_nice(p));
- if (niceval > retval)
- retval = niceval;
- } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
- break;
- case PRIO_USER:
- uid = make_kuid(cred->user_ns, who);
- user = cred->user;
- if (!who)
- uid = cred->uid;
- else if (!uid_eq(uid, cred->uid) &&
- !(user = find_user(uid)))
- goto out_unlock; /* No processes for this user */
-
- do_each_thread(g, p) {
- if (uid_eq(task_uid(p), uid)) {
- niceval = nice_to_rlimit(task_nice(p));
- if (niceval > retval)
- retval = niceval;
- }
- } while_each_thread(g, p);
- if (!uid_eq(uid, cred->uid))
- free_uid(user); /* for find_user() */
- break;
+ } while_each_thread(g, p);
+ if (!uid_eq(uid, cred->uid))
+ free_uid(user); /* for find_user() */
+ break;
}
out_unlock:
read_unlock(&tasklist_lock);
@@ -306,7 +308,7 @@ out_unlock:
*
* The general idea is that a program which uses just setregid() will be
* 100% compatible with BSD. A program which uses just setgid() will be
- * 100% compatible with POSIX with saved IDs.
+ * 100% compatible with POSIX with saved IDs.
*
* SMP: There are not races, the GIDs are checked only by filesystem
* operations (as far as semantic preservation is concerned).
@@ -364,7 +366,7 @@ error:
}
/*
- * setgid() is implemented like SysV w/ SAVED_IDS
+ * setgid() is implemented like SysV w/ SAVED_IDS
*
* SMP: Same implicit races as above.
*/
@@ -442,7 +444,7 @@ static int set_user(struct cred *new)
*
* The general idea is that a program which uses just setreuid() will be
* 100% compatible with BSD. A program which uses just setuid() will be
- * 100% compatible with POSIX with saved IDs.
+ * 100% compatible with POSIX with saved IDs.
*/
SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
{
@@ -503,17 +505,17 @@ error:
abort_creds(new);
return retval;
}
-
+
/*
- * setuid() is implemented like SysV with SAVED_IDS
- *
+ * setuid() is implemented like SysV with SAVED_IDS
+ *
* Note that SAVED_ID's is deficient in that a setuid root program
- * like sendmail, for example, cannot set its uid to be a normal
+ * like sendmail, for example, cannot set its uid to be a normal
* user and then switch back, because if you're root, setuid() sets
* the saved uid too. If you don't like this, blame the bright people
* in the POSIX committee and/or USG. Note that the BSD-style setreuid()
* will allow a root program to temporarily drop privileges and be able to
- * regain them by swapping the real and effective uid.
+ * regain them by swapping the real and effective uid.
*/
SYSCALL_DEFINE1(setuid, uid_t, uid)
{
@@ -637,10 +639,12 @@ SYSCALL_DEFINE3(getresuid, uid_t __user *, ruidp, uid_t __user *, euidp, uid_t _
euid = from_kuid_munged(cred->user_ns, cred->euid);
suid = from_kuid_munged(cred->user_ns, cred->suid);
- if (!(retval = put_user(ruid, ruidp)) &&
- !(retval = put_user(euid, euidp)))
- retval = put_user(suid, suidp);
-
+ retval = put_user(ruid, ruidp);
+ if (!retval) {
+ retval = put_user(euid, euidp);
+ if (!retval)
+ return put_user(suid, suidp);
+ }
return retval;
}
@@ -709,9 +713,12 @@ SYSCALL_DEFINE3(getresgid, gid_t __user *, rgidp, gid_t __user *, egidp, gid_t _
egid = from_kgid_munged(cred->user_ns, cred->egid);
sgid = from_kgid_munged(cred->user_ns, cred->sgid);
- if (!(retval = put_user(rgid, rgidp)) &&
- !(retval = put_user(egid, egidp)))
- retval = put_user(sgid, sgidp);
+ retval = put_user(rgid, rgidp);
+ if (!retval) {
+ retval = put_user(egid, egidp);
+ if (!retval)
+ retval = put_user(sgid, sgidp);
+ }
return retval;
}
@@ -1284,7 +1291,6 @@ SYSCALL_DEFINE2(getrlimit, unsigned int, resource, struct rlimit __user *, rlim)
/*
* Back compatibility for getrlimit. Needed for some apps.
*/
-
SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
struct rlimit __user *, rlim)
{
@@ -1299,7 +1305,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource,
x.rlim_cur = 0x7FFFFFFF;
if (x.rlim_max > 0x7FFFFFFF)
x.rlim_max = 0x7FFFFFFF;
- return copy_to_user(rlim, &x, sizeof(x))?-EFAULT:0;
+ return copy_to_user(rlim, &x, sizeof(x)) ? -EFAULT : 0;
}
#endif
@@ -1527,7 +1533,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
cputime_t tgutime, tgstime, utime, stime;
unsigned long maxrss = 0;
- memset((char *) r, 0, sizeof *r);
+ memset((char *)r, 0, sizeof (*r));
utime = stime = 0;
if (who == RUSAGE_THREAD) {
@@ -1541,41 +1547,41 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
return;
switch (who) {
- case RUSAGE_BOTH:
- case RUSAGE_CHILDREN:
- utime = p->signal->cutime;
- stime = p->signal->cstime;
- r->ru_nvcsw = p->signal->cnvcsw;
- r->ru_nivcsw = p->signal->cnivcsw;
- r->ru_minflt = p->signal->cmin_flt;
- r->ru_majflt = p->signal->cmaj_flt;
- r->ru_inblock = p->signal->cinblock;
- r->ru_oublock = p->signal->coublock;
- maxrss = p->signal->cmaxrss;
-
- if (who == RUSAGE_CHILDREN)
- break;
-
- case RUSAGE_SELF:
- thread_group_cputime_adjusted(p, &tgutime, &tgstime);
- utime += tgutime;
- stime += tgstime;
- r->ru_nvcsw += p->signal->nvcsw;
- r->ru_nivcsw += p->signal->nivcsw;
- r->ru_minflt += p->signal->min_flt;
- r->ru_majflt += p->signal->maj_flt;
- r->ru_inblock += p->signal->inblock;
- r->ru_oublock += p->signal->oublock;
- if (maxrss < p->signal->maxrss)
- maxrss = p->signal->maxrss;
- t = p;
- do {
- accumulate_thread_rusage(t, r);
- } while_each_thread(p, t);
+ case RUSAGE_BOTH:
+ case RUSAGE_CHILDREN:
+ utime = p->signal->cutime;
+ stime = p->signal->cstime;
+ r->ru_nvcsw = p->signal->cnvcsw;
+ r->ru_nivcsw = p->signal->cnivcsw;
+ r->ru_minflt = p->signal->cmin_flt;
+ r->ru_majflt = p->signal->cmaj_flt;
+ r->ru_inblock = p->signal->cinblock;
+ r->ru_oublock = p->signal->coublock;
+ maxrss = p->signal->cmaxrss;
+
+ if (who == RUSAGE_CHILDREN)
break;
- default:
- BUG();
+ case RUSAGE_SELF:
+ thread_group_cputime_adjusted(p, &tgutime, &tgstime);
+ utime += tgutime;
+ stime += tgstime;
+ r->ru_nvcsw += p->signal->nvcsw;
+ r->ru_nivcsw += p->signal->nivcsw;
+ r->ru_minflt += p->signal->min_flt;
+ r->ru_majflt += p->signal->maj_flt;
+ r->ru_inblock += p->signal->inblock;
+ r->ru_oublock += p->signal->oublock;
+ if (maxrss < p->signal->maxrss)
+ maxrss = p->signal->maxrss;
+ t = p;
+ do {
+ accumulate_thread_rusage(t, r);
+ } while_each_thread(p, t);
+ break;
+
+ default:
+ BUG();
}
unlock_task_sighand(p, &flags);
@@ -1585,6 +1591,7 @@ out:
if (who != RUSAGE_CHILDREN) {
struct mm_struct *mm = get_task_mm(p);
+
if (mm) {
setmax_mm_hiwater_rss(&maxrss, mm);
mmput(mm);
@@ -1596,6 +1603,7 @@ out:
int getrusage(struct task_struct *p, int who, struct rusage __user *ru)
{
struct rusage r;
+
k_getrusage(p, who, &r);
return copy_to_user(ru, &r, sizeof(r)) ? -EFAULT : 0;
}
@@ -1628,12 +1636,14 @@ SYSCALL_DEFINE1(umask, int, mask)
return mask;
}
-static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
+static int prctl_set_mm_exe_file_locked(struct mm_struct *mm, unsigned int fd)
{
struct fd exe;
struct inode *inode;
int err;
+ VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
+
exe = fdget(fd);
if (!exe.file)
return -EBADF;
@@ -1654,8 +1664,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
if (err)
goto exit;
- down_write(&mm->mmap_sem);
-
/*
* Forbid mm->exe_file change if old file still mapped.
*/
@@ -1667,7 +1675,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
if (vma->vm_file &&
path_equal(&vma->vm_file->f_path,
&mm->exe_file->f_path))
- goto exit_unlock;
+ goto exit;
}
/*
@@ -1678,34 +1686,222 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
*/
err = -EPERM;
if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
- goto exit_unlock;
+ goto exit;
err = 0;
set_mm_exe_file(mm, exe.file); /* this grabs a reference to exe.file */
-exit_unlock:
- up_write(&mm->mmap_sem);
-
exit:
fdput(exe);
return err;
}
+#ifdef CONFIG_CHECKPOINT_RESTORE
+/*
+ * WARNING: we don't require any capability here so be very careful
+ * in what is allowed for modification from userspace.
+ */
+static int validate_prctl_map(struct prctl_mm_map *prctl_map)
+{
+ unsigned long mmap_max_addr = TASK_SIZE;
+ struct mm_struct *mm = current->mm;
+ int error = -EINVAL, i;
+
+ static const unsigned char offsets[] = {
+ offsetof(struct prctl_mm_map, start_code),
+ offsetof(struct prctl_mm_map, end_code),
+ offsetof(struct prctl_mm_map, start_data),
+ offsetof(struct prctl_mm_map, end_data),
+ offsetof(struct prctl_mm_map, start_brk),
+ offsetof(struct prctl_mm_map, brk),
+ offsetof(struct prctl_mm_map, start_stack),
+ offsetof(struct prctl_mm_map, arg_start),
+ offsetof(struct prctl_mm_map, arg_end),
+ offsetof(struct prctl_mm_map, env_start),
+ offsetof(struct prctl_mm_map, env_end),
+ };
+
+ /*
+ * Make sure the members are not somewhere outside
+ * of allowed address space.
+ */
+ for (i = 0; i < ARRAY_SIZE(offsets); i++) {
+ u64 val = *(u64 *)((char *)prctl_map + offsets[i]);
+
+ if ((unsigned long)val >= mmap_max_addr ||
+ (unsigned long)val < mmap_min_addr)
+ goto out;
+ }
+
+ /*
+ * Make sure the pairs are ordered.
+ */
+#define __prctl_check_order(__m1, __op, __m2) \
+ ((unsigned long)prctl_map->__m1 __op \
+ (unsigned long)prctl_map->__m2) ? 0 : -EINVAL
+ error = __prctl_check_order(start_code, <, end_code);
+ error |= __prctl_check_order(start_data, <, end_data);
+ error |= __prctl_check_order(start_brk, <=, brk);
+ error |= __prctl_check_order(arg_start, <=, arg_end);
+ error |= __prctl_check_order(env_start, <=, env_end);
+ if (error)
+ goto out;
+#undef __prctl_check_order
+
+ error = -EINVAL;
+
+ /*
+ * @brk should be after @end_data in traditional maps.
+ */
+ if (prctl_map->start_brk <= prctl_map->end_data ||
+ prctl_map->brk <= prctl_map->end_data)
+ goto out;
+
+ /*
+ * Neither we should allow to override limits if they set.
+ */
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), prctl_map->brk,
+ prctl_map->start_brk, prctl_map->end_data,
+ prctl_map->start_data))
+ goto out;
+
+ /*
+ * Someone is trying to cheat the auxv vector.
+ */
+ if (prctl_map->auxv_size) {
+ if (!prctl_map->auxv || prctl_map->auxv_size > sizeof(mm->saved_auxv))
+ goto out;
+ }
+
+ /*
+ * Finally, make sure the caller has the rights to
+ * change /proc/pid/exe link: only local root should
+ * be allowed to.
+ */
+ if (prctl_map->exe_fd != (u32)-1) {
+ struct user_namespace *ns = current_user_ns();
+ const struct cred *cred = current_cred();
+
+ if (!uid_eq(cred->uid, make_kuid(ns, 0)) ||
+ !gid_eq(cred->gid, make_kgid(ns, 0)))
+ goto out;
+ }
+
+ error = 0;
+out:
+ return error;
+}
+
+static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data_size)
+{
+ struct prctl_mm_map prctl_map = { .exe_fd = (u32)-1, };
+ unsigned long user_auxv[AT_VECTOR_SIZE];
+ struct mm_struct *mm = current->mm;
+ int error;
+
+ BUILD_BUG_ON(sizeof(user_auxv) != sizeof(mm->saved_auxv));
+ BUILD_BUG_ON(sizeof(struct prctl_mm_map) > 256);
+
+ if (opt == PR_SET_MM_MAP_SIZE)
+ return put_user((unsigned int)sizeof(prctl_map),
+ (unsigned int __user *)addr);
+
+ if (data_size != sizeof(prctl_map))
+ return -EINVAL;
+
+ if (copy_from_user(&prctl_map, addr, sizeof(prctl_map)))
+ return -EFAULT;
+
+ error = validate_prctl_map(&prctl_map);
+ if (error)
+ return error;
+
+ if (prctl_map.auxv_size) {
+ memset(user_auxv, 0, sizeof(user_auxv));
+ if (copy_from_user(user_auxv,
+ (const void __user *)prctl_map.auxv,
+ prctl_map.auxv_size))
+ return -EFAULT;
+
+ /* Last entry must be AT_NULL as specification requires */
+ user_auxv[AT_VECTOR_SIZE - 2] = AT_NULL;
+ user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL;
+ }
+
+ down_write(&mm->mmap_sem);
+ if (prctl_map.exe_fd != (u32)-1)
+ error = prctl_set_mm_exe_file_locked(mm, prctl_map.exe_fd);
+ downgrade_write(&mm->mmap_sem);
+ if (error)
+ goto out;
+
+ /*
+ * We don't validate if these members are pointing to
+ * real present VMAs because application may have correspond
+ * VMAs already unmapped and kernel uses these members for statistics
+ * output in procfs mostly, except
+ *
+ * - @start_brk/@brk which are used in do_brk but kernel lookups
+ * for VMAs when updating these memvers so anything wrong written
+ * here cause kernel to swear at userspace program but won't lead
+ * to any problem in kernel itself
+ */
+
+ mm->start_code = prctl_map.start_code;
+ mm->end_code = prctl_map.end_code;
+ mm->start_data = prctl_map.start_data;
+ mm->end_data = prctl_map.end_data;
+ mm->start_brk = prctl_map.start_brk;
+ mm->brk = prctl_map.brk;
+ mm->start_stack = prctl_map.start_stack;
+ mm->arg_start = prctl_map.arg_start;
+ mm->arg_end = prctl_map.arg_end;
+ mm->env_start = prctl_map.env_start;
+ mm->env_end = prctl_map.env_end;
+
+ /*
+ * Note this update of @saved_auxv is lockless thus
+ * if someone reads this member in procfs while we're
+ * updating -- it may get partly updated results. It's
+ * known and acceptable trade off: we leave it as is to
+ * not introduce additional locks here making the kernel
+ * more complex.
+ */
+ if (prctl_map.auxv_size)
+ memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv));
+
+ error = 0;
+out:
+ up_read(&mm->mmap_sem);
+ return error;
+}
+#endif /* CONFIG_CHECKPOINT_RESTORE */
+
static int prctl_set_mm(int opt, unsigned long addr,
unsigned long arg4, unsigned long arg5)
{
- unsigned long rlim = rlimit(RLIMIT_DATA);
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
int error;
- if (arg5 || (arg4 && opt != PR_SET_MM_AUXV))
+ if (arg5 || (arg4 && (opt != PR_SET_MM_AUXV &&
+ opt != PR_SET_MM_MAP &&
+ opt != PR_SET_MM_MAP_SIZE)))
return -EINVAL;
+#ifdef CONFIG_CHECKPOINT_RESTORE
+ if (opt == PR_SET_MM_MAP || opt == PR_SET_MM_MAP_SIZE)
+ return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
+#endif
+
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- if (opt == PR_SET_MM_EXE_FILE)
- return prctl_set_mm_exe_file(mm, (unsigned int)addr);
+ if (opt == PR_SET_MM_EXE_FILE) {
+ down_write(&mm->mmap_sem);
+ error = prctl_set_mm_exe_file_locked(mm, (unsigned int)addr);
+ up_write(&mm->mmap_sem);
+ return error;
+ }
if (addr >= TASK_SIZE || addr < mmap_min_addr)
return -EINVAL;
@@ -1733,9 +1929,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
if (addr <= mm->end_data)
goto out;
- if (rlim < RLIM_INFINITY &&
- (mm->brk - addr) +
- (mm->end_data - mm->start_data) > rlim)
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), mm->brk, addr,
+ mm->end_data, mm->start_data))
goto out;
mm->start_brk = addr;
@@ -1745,9 +1940,8 @@ static int prctl_set_mm(int opt, unsigned long addr,
if (addr <= mm->end_data)
goto out;
- if (rlim < RLIM_INFINITY &&
- (addr - mm->start_brk) +
- (mm->end_data - mm->start_data) > rlim)
+ if (check_data_rlimit(rlimit(RLIMIT_DATA), addr, mm->start_brk,
+ mm->end_data, mm->start_data))
goto out;
mm->brk = addr;
@@ -2023,6 +2217,7 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
{
int err = 0;
int cpu = raw_smp_processor_id();
+
if (cpup)
err |= put_user(cpu, cpup);
if (nodep)
@@ -2135,7 +2330,7 @@ COMPAT_SYSCALL_DEFINE1(sysinfo, struct compat_sysinfo __user *, info)
/* Check to see if any memory value is too large for 32-bit and scale
* down if needed
*/
- if ((s.totalram >> 32) || (s.totalswap >> 32)) {
+ if (upper_32_bits(s.totalram) || upper_32_bits(s.totalswap)) {
int bitcount = 0;
while (s.mem_unit < PAGE_SIZE) {
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 391d4ddb6f4b..02aa4185b17e 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -156,6 +156,9 @@ cond_syscall(sys_process_vm_writev);
cond_syscall(compat_sys_process_vm_readv);
cond_syscall(compat_sys_process_vm_writev);
cond_syscall(sys_uselib);
+cond_syscall(sys_fadvise64);
+cond_syscall(sys_fadvise64_64);
+cond_syscall(sys_madvise);
/* arch-specific weak syscall entries */
cond_syscall(sys_pciconfig_read);
@@ -218,3 +221,6 @@ cond_syscall(sys_kcmp);
/* operate on Secure Computing state */
cond_syscall(sys_seccomp);
+
+/* access BPF programs and maps */
+cond_syscall(sys_bpf);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index ab456664609d..4aada6d9fe74 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1451,13 +1451,6 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
- {
- .procname = "scan_unevictable_pages",
- .data = &scan_unevictable_pages,
- .maxlen = sizeof(scan_unevictable_pages),
- .mode = 0644,
- .proc_handler = scan_unevictable_handler,
- },
#ifdef CONFIG_MEMORY_FAILURE
{
.procname = "memory_failure_early_kill",
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index e4ba9a5a5ccb..9a4f750a2963 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -390,7 +390,6 @@ static const struct bin_table bin_net_ipv4_table[] = {
{ CTL_INT, NET_TCP_MTU_PROBING, "tcp_mtu_probing" },
{ CTL_INT, NET_TCP_BASE_MSS, "tcp_base_mss" },
{ CTL_INT, NET_IPV4_TCP_WORKAROUND_SIGNED_WINDOWS, "tcp_workaround_signed_windows" },
- { CTL_INT, NET_TCP_DMA_COPYBREAK, "tcp_dma_copybreak" },
{ CTL_INT, NET_TCP_SLOW_START_AFTER_IDLE, "tcp_slow_start_after_idle" },
{ CTL_INT, NET_CIPSOV4_CACHE_ENABLE, "cipso_cache_enable" },
{ CTL_INT, NET_CIPSOV4_CACHE_BUCKET_SIZE, "cipso_cache_bucket_size" },
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 0a0608edeb26..052b4b53c3d6 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -400,4 +400,5 @@ void tick_resume(void)
void __init tick_init(void)
{
tick_broadcast_init();
+ tick_nohz_init();
}
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index c19c1d84b6f3..366aeb4f2c66 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -99,6 +99,13 @@ static inline int tick_broadcast_oneshot_active(void) { return 0; }
static inline bool tick_broadcast_oneshot_available(void) { return false; }
#endif /* !TICK_ONESHOT */
+/* NO_HZ_FULL internal */
+#ifdef CONFIG_NO_HZ_FULL
+extern void tick_nohz_init(void);
+# else
+static inline void tick_nohz_init(void) { }
+#endif
+
/*
* Broadcasting support
*/
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index f654a8a298fa..7c1412ea2d29 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -295,22 +295,12 @@ out:
/* Parse the boot-time nohz CPU list from the kernel parameters. */
static int __init tick_nohz_full_setup(char *str)
{
- int cpu;
-
alloc_bootmem_cpumask_var(&tick_nohz_full_mask);
- alloc_bootmem_cpumask_var(&housekeeping_mask);
if (cpulist_parse(str, tick_nohz_full_mask) < 0) {
pr_warning("NOHZ: Incorrect nohz_full cpumask\n");
+ free_bootmem_cpumask_var(tick_nohz_full_mask);
return 1;
}
-
- cpu = smp_processor_id();
- if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
- pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
- cpumask_clear_cpu(cpu, tick_nohz_full_mask);
- }
- cpumask_andnot(housekeeping_mask,
- cpu_possible_mask, tick_nohz_full_mask);
tick_nohz_full_running = true;
return 1;
@@ -349,18 +339,11 @@ static int tick_nohz_init_all(void)
#ifdef CONFIG_NO_HZ_FULL_ALL
if (!alloc_cpumask_var(&tick_nohz_full_mask, GFP_KERNEL)) {
- pr_err("NO_HZ: Can't allocate full dynticks cpumask\n");
- return err;
- }
- if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
- pr_err("NO_HZ: Can't allocate not-full dynticks cpumask\n");
+ WARN(1, "NO_HZ: Can't allocate full dynticks cpumask\n");
return err;
}
err = 0;
cpumask_setall(tick_nohz_full_mask);
- cpumask_clear_cpu(smp_processor_id(), tick_nohz_full_mask);
- cpumask_clear(housekeeping_mask);
- cpumask_set_cpu(smp_processor_id(), housekeeping_mask);
tick_nohz_full_running = true;
#endif
return err;
@@ -375,6 +358,37 @@ void __init tick_nohz_init(void)
return;
}
+ if (!alloc_cpumask_var(&housekeeping_mask, GFP_KERNEL)) {
+ WARN(1, "NO_HZ: Can't allocate not-full dynticks cpumask\n");
+ cpumask_clear(tick_nohz_full_mask);
+ tick_nohz_full_running = false;
+ return;
+ }
+
+ /*
+ * Full dynticks uses irq work to drive the tick rescheduling on safe
+ * locking contexts. But then we need irq work to raise its own
+ * interrupts to avoid circular dependency on the tick
+ */
+ if (!arch_irq_work_has_interrupt()) {
+ pr_warning("NO_HZ: Can't run full dynticks because arch doesn't "
+ "support irq work self-IPIs\n");
+ cpumask_clear(tick_nohz_full_mask);
+ cpumask_copy(housekeeping_mask, cpu_possible_mask);
+ tick_nohz_full_running = false;
+ return;
+ }
+
+ cpu = smp_processor_id();
+
+ if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) {
+ pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu);
+ cpumask_clear_cpu(cpu, tick_nohz_full_mask);
+ }
+
+ cpumask_andnot(housekeeping_mask,
+ cpu_possible_mask, tick_nohz_full_mask);
+
for_each_cpu(cpu, tick_nohz_full_mask)
context_tracking_cpu_set(cpu);
@@ -982,6 +996,10 @@ static void tick_nohz_handler(struct clock_event_device *dev)
tick_sched_do_timer(now);
tick_sched_handle(ts, regs);
+ /* No need to reprogram if we are running tickless */
+ if (unlikely(ts->tick_stopped))
+ return;
+
while (tick_nohz_reprogram(ts, now)) {
now = ktime_get();
tick_do_update_jiffies64(now);
@@ -1109,6 +1127,10 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
if (regs)
tick_sched_handle(ts, regs);
+ /* No need to reprogram if we are in idle or full dynticks mode */
+ if (unlikely(ts->tick_stopped))
+ return HRTIMER_NORESTART;
+
hrtimer_forward(timer, now, tick_period);
return HRTIMER_RESTART;
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index aca5dfe2fa3d..9bbb8344ed3b 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1385,7 +1385,7 @@ void update_process_times(int user_tick)
rcu_check_callbacks(cpu, user_tick);
#ifdef CONFIG_IRQ_WORK
if (in_irq())
- irq_work_run();
+ irq_work_tick();
#endif
scheduler_tick();
run_posix_cpu_timers(p);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5916a8e59e87..fb186b9ddf51 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -113,6 +113,9 @@ ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
static struct ftrace_ops global_ops;
static struct ftrace_ops control_ops;
+static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs);
+
#if ARCH_SUPPORTS_FTRACE_OPS
static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *regs);
@@ -251,18 +254,24 @@ static void update_ftrace_function(void)
ftrace_func_t func;
/*
+ * Prepare the ftrace_ops that the arch callback will use.
+ * If there's only one ftrace_ops registered, the ftrace_ops_list
+ * will point to the ops we want.
+ */
+ set_function_trace_op = ftrace_ops_list;
+
+ /* If there's no ftrace_ops registered, just call the stub function */
+ if (ftrace_ops_list == &ftrace_list_end) {
+ func = ftrace_stub;
+
+ /*
* If we are at the end of the list and this ops is
* recursion safe and not dynamic and the arch supports passing ops,
* then have the mcount trampoline call the function directly.
*/
- if (ftrace_ops_list == &ftrace_list_end ||
- (ftrace_ops_list->next == &ftrace_list_end &&
- !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC) &&
- (ftrace_ops_list->flags & FTRACE_OPS_FL_RECURSION_SAFE) &&
- !FTRACE_FORCE_LIST_FUNC)) {
- /* Set the ftrace_ops that the arch callback uses */
- set_function_trace_op = ftrace_ops_list;
- func = ftrace_ops_list->func;
+ } else if (ftrace_ops_list->next == &ftrace_list_end) {
+ func = ftrace_ops_get_func(ftrace_ops_list);
+
} else {
/* Just use the default ftrace_ops */
set_function_trace_op = &ftrace_list_end;
@@ -1048,6 +1057,12 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid;
static struct ftrace_ops *removed_ops;
+/*
+ * Set when doing a global update, like enabling all recs or disabling them.
+ * It is not set when just updating a single ftrace_ops.
+ */
+static bool update_all_ops;
+
#ifndef CONFIG_FTRACE_MCOUNT_RECORD
# error Dynamic ftrace depends on MCOUNT_RECORD
#endif
@@ -1307,7 +1322,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct ftrace_func_entry *entry;
struct hlist_node *tn;
struct hlist_head *hhd;
- struct ftrace_hash *old_hash;
struct ftrace_hash *new_hash;
int size = src->count;
int bits = 0;
@@ -1352,15 +1366,28 @@ update:
*/
ftrace_hash_rec_disable_modify(ops, enable);
- old_hash = *dst;
rcu_assign_pointer(*dst, new_hash);
- free_ftrace_hash_rcu(old_hash);
ftrace_hash_rec_enable_modify(ops, enable);
return 0;
}
+static bool hash_contains_ip(unsigned long ip,
+ struct ftrace_ops_hash *hash)
+{
+ /*
+ * The function record is a match if it exists in the filter
+ * hash and not in the notrace hash. Note, an emty hash is
+ * considered a match for the filter hash, but an empty
+ * notrace hash is considered not in the notrace hash.
+ */
+ return (ftrace_hash_empty(hash->filter_hash) ||
+ ftrace_lookup_ip(hash->filter_hash, ip)) &&
+ (ftrace_hash_empty(hash->notrace_hash) ||
+ !ftrace_lookup_ip(hash->notrace_hash, ip));
+}
+
/*
* Test the hashes for this ops to see if we want to call
* the ops->func or not.
@@ -1376,8 +1403,7 @@ update:
static int
ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
{
- struct ftrace_hash *filter_hash;
- struct ftrace_hash *notrace_hash;
+ struct ftrace_ops_hash hash;
int ret;
#ifdef CONFIG_DYNAMIC_FTRACE_WITH_REGS
@@ -1390,13 +1416,10 @@ ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip, void *regs)
return 0;
#endif
- filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash);
- notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash);
+ hash.filter_hash = rcu_dereference_raw_notrace(ops->func_hash->filter_hash);
+ hash.notrace_hash = rcu_dereference_raw_notrace(ops->func_hash->notrace_hash);
- if ((ftrace_hash_empty(filter_hash) ||
- ftrace_lookup_ip(filter_hash, ip)) &&
- (ftrace_hash_empty(notrace_hash) ||
- !ftrace_lookup_ip(notrace_hash, ip)))
+ if (hash_contains_ip(ip, &hash))
ret = 1;
else
ret = 0;
@@ -1508,46 +1531,6 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec)
return keep_regs;
}
-static void ftrace_remove_tramp(struct ftrace_ops *ops,
- struct dyn_ftrace *rec)
-{
- /* If TRAMP is not set, no ops should have a trampoline for this */
- if (!(rec->flags & FTRACE_FL_TRAMP))
- return;
-
- rec->flags &= ~FTRACE_FL_TRAMP;
-
- if ((!ftrace_hash_empty(ops->func_hash->filter_hash) &&
- !ftrace_lookup_ip(ops->func_hash->filter_hash, rec->ip)) ||
- ftrace_lookup_ip(ops->func_hash->notrace_hash, rec->ip))
- return;
- /*
- * The tramp_hash entry will be removed at time
- * of update.
- */
- ops->nr_trampolines--;
-}
-
-static void ftrace_clear_tramps(struct dyn_ftrace *rec, struct ftrace_ops *ops)
-{
- struct ftrace_ops *op;
-
- /* If TRAMP is not set, no ops should have a trampoline for this */
- if (!(rec->flags & FTRACE_FL_TRAMP))
- return;
-
- do_for_each_ftrace_op(op, ftrace_ops_list) {
- /*
- * This function is called to clear other tramps
- * not the one that is being updated.
- */
- if (op == ops)
- continue;
- if (op->nr_trampolines)
- ftrace_remove_tramp(op, rec);
- } while_for_each_ftrace_op(op);
-}
-
static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
int filter_hash,
bool inc)
@@ -1636,18 +1619,16 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
* function, and the ops has a trampoline registered
* for it, then we can call it directly.
*/
- if (ftrace_rec_count(rec) == 1 && ops->trampoline) {
+ if (ftrace_rec_count(rec) == 1 && ops->trampoline)
rec->flags |= FTRACE_FL_TRAMP;
- ops->nr_trampolines++;
- } else {
+ else
/*
* If we are adding another function callback
* to this function, and the previous had a
* custom trampoline in use, then we need to go
* back to the default trampoline.
*/
- ftrace_clear_tramps(rec, ops);
- }
+ rec->flags &= ~FTRACE_FL_TRAMP;
/*
* If any ops wants regs saved for this function
@@ -1660,9 +1641,6 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
return;
rec->flags--;
- if (ops->trampoline && !ftrace_rec_count(rec))
- ftrace_remove_tramp(ops, rec);
-
/*
* If the rec had REGS enabled and the ops that is
* being removed had REGS set, then see if there is
@@ -1677,6 +1655,17 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
}
/*
+ * If the rec had TRAMP enabled, then it needs to
+ * be cleared. As TRAMP can only be enabled iff
+ * there is only a single ops attached to it.
+ * In otherwords, always disable it on decrementing.
+ * In the future, we may set it if rec count is
+ * decremented to one, and the ops that is left
+ * has a trampoline.
+ */
+ rec->flags &= ~FTRACE_FL_TRAMP;
+
+ /*
* flags will be cleared in ftrace_check_record()
* if rec count is zero.
*/
@@ -1895,21 +1884,72 @@ int ftrace_test_record(struct dyn_ftrace *rec, int enable)
}
static struct ftrace_ops *
+ftrace_find_tramp_ops_any(struct dyn_ftrace *rec)
+{
+ struct ftrace_ops *op;
+ unsigned long ip = rec->ip;
+
+ do_for_each_ftrace_op(op, ftrace_ops_list) {
+
+ if (!op->trampoline)
+ continue;
+
+ if (hash_contains_ip(ip, op->func_hash))
+ return op;
+ } while_for_each_ftrace_op(op);
+
+ return NULL;
+}
+
+static struct ftrace_ops *
ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec)
{
struct ftrace_ops *op;
+ unsigned long ip = rec->ip;
- /* Removed ops need to be tested first */
- if (removed_ops && removed_ops->tramp_hash) {
- if (ftrace_lookup_ip(removed_ops->tramp_hash, rec->ip))
+ /*
+ * Need to check removed ops first.
+ * If they are being removed, and this rec has a tramp,
+ * and this rec is in the ops list, then it would be the
+ * one with the tramp.
+ */
+ if (removed_ops) {
+ if (hash_contains_ip(ip, &removed_ops->old_hash))
return removed_ops;
}
+ /*
+ * Need to find the current trampoline for a rec.
+ * Now, a trampoline is only attached to a rec if there
+ * was a single 'ops' attached to it. But this can be called
+ * when we are adding another op to the rec or removing the
+ * current one. Thus, if the op is being added, we can
+ * ignore it because it hasn't attached itself to the rec
+ * yet. That means we just need to find the op that has a
+ * trampoline and is not beeing added.
+ */
do_for_each_ftrace_op(op, ftrace_ops_list) {
- if (!op->tramp_hash)
+
+ if (!op->trampoline)
continue;
- if (ftrace_lookup_ip(op->tramp_hash, rec->ip))
+ /*
+ * If the ops is being added, it hasn't gotten to
+ * the point to be removed from this tree yet.
+ */
+ if (op->flags & FTRACE_OPS_FL_ADDING)
+ continue;
+
+ /*
+ * If the ops is not being added and has a trampoline,
+ * then it must be the one that we want!
+ */
+ if (hash_contains_ip(ip, op->func_hash))
+ return op;
+
+ /* If the ops is being modified, it may be in the old hash. */
+ if ((op->flags & FTRACE_OPS_FL_MODIFYING) &&
+ hash_contains_ip(ip, &op->old_hash))
return op;
} while_for_each_ftrace_op(op);
@@ -1921,10 +1961,11 @@ static struct ftrace_ops *
ftrace_find_tramp_ops_new(struct dyn_ftrace *rec)
{
struct ftrace_ops *op;
+ unsigned long ip = rec->ip;
do_for_each_ftrace_op(op, ftrace_ops_list) {
/* pass rec in as regs to have non-NULL val */
- if (ftrace_ops_test(op, rec->ip, rec))
+ if (hash_contains_ip(ip, op->func_hash))
return op;
} while_for_each_ftrace_op(op);
@@ -2231,92 +2272,6 @@ void __weak arch_ftrace_update_code(int command)
ftrace_run_stop_machine(command);
}
-static int ftrace_save_ops_tramp_hash(struct ftrace_ops *ops)
-{
- struct ftrace_page *pg;
- struct dyn_ftrace *rec;
- int size, bits;
- int ret;
-
- size = ops->nr_trampolines;
- bits = 0;
- /*
- * Make the hash size about 1/2 the # found
- */
- for (size /= 2; size; size >>= 1)
- bits++;
-
- ops->tramp_hash = alloc_ftrace_hash(bits);
- /*
- * TODO: a failed allocation is going to screw up
- * the accounting of what needs to be modified
- * and not. For now, we kill ftrace if we fail
- * to allocate here. But there are ways around this,
- * but that will take a little more work.
- */
- if (!ops->tramp_hash)
- return -ENOMEM;
-
- do_for_each_ftrace_rec(pg, rec) {
- if (ftrace_rec_count(rec) == 1 &&
- ftrace_ops_test(ops, rec->ip, rec)) {
-
- /*
- * If another ops adds to a rec, the rec will
- * lose its trampoline and never get it back
- * until all ops are off of it.
- */
- if (!(rec->flags & FTRACE_FL_TRAMP))
- continue;
-
- /* This record had better have a trampoline */
- if (FTRACE_WARN_ON(!(rec->flags & FTRACE_FL_TRAMP_EN)))
- return -1;
-
- ret = add_hash_entry(ops->tramp_hash, rec->ip);
- if (ret < 0)
- return ret;
- }
- } while_for_each_ftrace_rec();
-
- /* The number of recs in the hash must match nr_trampolines */
- if (FTRACE_WARN_ON(ops->tramp_hash->count != ops->nr_trampolines))
- pr_warn("count=%ld trampolines=%d\n",
- ops->tramp_hash->count,
- ops->nr_trampolines);
-
- return 0;
-}
-
-static int ftrace_save_tramp_hashes(void)
-{
- struct ftrace_ops *op;
- int ret;
-
- /*
- * Now that any trampoline is being used, we need to save the
- * hashes for the ops that have them. This allows the mapping
- * back from the record to the ops that has the trampoline to
- * know what code is being replaced. Modifying code must always
- * verify what it is changing.
- */
- do_for_each_ftrace_op(op, ftrace_ops_list) {
-
- /* The tramp_hash is recreated each time. */
- free_ftrace_hash(op->tramp_hash);
- op->tramp_hash = NULL;
-
- if (op->nr_trampolines) {
- ret = ftrace_save_ops_tramp_hash(op);
- if (ret)
- return ret;
- }
-
- } while_for_each_ftrace_op(op);
-
- return 0;
-}
-
static void ftrace_run_update_code(int command)
{
int ret;
@@ -2336,9 +2291,13 @@ static void ftrace_run_update_code(int command)
ret = ftrace_arch_code_modify_post_process();
FTRACE_WARN_ON(ret);
+}
- ret = ftrace_save_tramp_hashes();
- FTRACE_WARN_ON(ret);
+static void ftrace_run_modify_code(struct ftrace_ops *ops, int command)
+{
+ ops->flags |= FTRACE_OPS_FL_MODIFYING;
+ ftrace_run_update_code(command);
+ ops->flags &= ~FTRACE_OPS_FL_MODIFYING;
}
static ftrace_func_t saved_ftrace_func;
@@ -2362,6 +2321,13 @@ static void ftrace_startup_enable(int command)
ftrace_run_update_code(command);
}
+static void ftrace_startup_all(int command)
+{
+ update_all_ops = true;
+ ftrace_startup_enable(command);
+ update_all_ops = false;
+}
+
static int ftrace_startup(struct ftrace_ops *ops, int command)
{
int ret;
@@ -2376,12 +2342,22 @@ static int ftrace_startup(struct ftrace_ops *ops, int command)
ftrace_start_up++;
command |= FTRACE_UPDATE_CALLS;
- ops->flags |= FTRACE_OPS_FL_ENABLED;
+ /*
+ * Note that ftrace probes uses this to start up
+ * and modify functions it will probe. But we still
+ * set the ADDING flag for modification, as probes
+ * do not have trampolines. If they add them in the
+ * future, then the probes will need to distinguish
+ * between adding and updating probes.
+ */
+ ops->flags |= FTRACE_OPS_FL_ENABLED | FTRACE_OPS_FL_ADDING;
ftrace_hash_rec_enable(ops, 1);
ftrace_startup_enable(command);
+ ops->flags &= ~FTRACE_OPS_FL_ADDING;
+
return 0;
}
@@ -2431,11 +2407,35 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
* If the ops uses a trampoline, then it needs to be
* tested first on update.
*/
+ ops->flags |= FTRACE_OPS_FL_REMOVING;
removed_ops = ops;
+ /* The trampoline logic checks the old hashes */
+ ops->old_hash.filter_hash = ops->func_hash->filter_hash;
+ ops->old_hash.notrace_hash = ops->func_hash->notrace_hash;
+
ftrace_run_update_code(command);
+ /*
+ * If there's no more ops registered with ftrace, run a
+ * sanity check to make sure all rec flags are cleared.
+ */
+ if (ftrace_ops_list == &ftrace_list_end) {
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+
+ do_for_each_ftrace_rec(pg, rec) {
+ if (FTRACE_WARN_ON_ONCE(rec->flags))
+ pr_warn(" %pS flags:%lx\n",
+ (void *)rec->ip, rec->flags);
+ } while_for_each_ftrace_rec();
+ }
+
+ ops->old_hash.filter_hash = NULL;
+ ops->old_hash.notrace_hash = NULL;
+
removed_ops = NULL;
+ ops->flags &= ~FTRACE_OPS_FL_REMOVING;
/*
* Dynamic ops may be freed, we must make sure that all
@@ -2960,8 +2960,8 @@ static int t_show(struct seq_file *m, void *v)
if (rec->flags & FTRACE_FL_TRAMP_EN) {
struct ftrace_ops *ops;
- ops = ftrace_find_tramp_ops_curr(rec);
- if (ops && ops->trampoline)
+ ops = ftrace_find_tramp_ops_any(rec);
+ if (ops)
seq_printf(m, "\ttramp: %pS",
(void *)ops->trampoline);
else
@@ -3348,7 +3348,7 @@ static void __enable_ftrace_function_probe(void)
if (ftrace_probe_registered) {
/* still need to update the function call sites */
if (ftrace_enabled)
- ftrace_run_update_code(FTRACE_UPDATE_CALLS);
+ ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS);
return;
}
@@ -3399,6 +3399,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
{
struct ftrace_func_probe *entry;
struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
+ struct ftrace_hash *old_hash = *orig_hash;
struct ftrace_hash *hash;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
@@ -3417,7 +3418,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
mutex_lock(&trace_probe_ops.func_hash->regex_lock);
- hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
+ hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash);
if (!hash) {
count = -ENOMEM;
goto out;
@@ -3476,7 +3477,9 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
} while_for_each_ftrace_rec();
ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
- if (ret < 0)
+ if (!ret)
+ free_ftrace_hash_rcu(old_hash);
+ else
count = ret;
__enable_ftrace_function_probe();
@@ -3503,6 +3506,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
struct ftrace_func_probe *entry;
struct ftrace_func_probe *p;
struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash;
+ struct ftrace_hash *old_hash = *orig_hash;
struct list_head free_list;
struct ftrace_hash *hash;
struct hlist_node *tmp;
@@ -3510,6 +3514,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
int type = MATCH_FULL;
int i, len = 0;
char *search;
+ int ret;
if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
glob = NULL;
@@ -3568,8 +3573,11 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
* Remove after the disable is called. Otherwise, if the last
* probe is removed, a null hash means *all enabled*.
*/
- ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+ ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
synchronize_sched();
+ if (!ret)
+ free_ftrace_hash_rcu(old_hash);
+
list_for_each_entry_safe(entry, p, &free_list, free_list) {
list_del(&entry->free_list);
ftrace_free_entry(entry);
@@ -3759,7 +3767,7 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove)
static void ftrace_ops_update_code(struct ftrace_ops *ops)
{
if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled)
- ftrace_run_update_code(FTRACE_UPDATE_CALLS);
+ ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS);
}
static int
@@ -3767,6 +3775,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
unsigned long ip, int remove, int reset, int enable)
{
struct ftrace_hash **orig_hash;
+ struct ftrace_hash *old_hash;
struct ftrace_hash *hash;
int ret;
@@ -3801,10 +3810,12 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len,
}
mutex_lock(&ftrace_lock);
+ old_hash = *orig_hash;
ret = ftrace_hash_move(ops, enable, orig_hash, hash);
- if (!ret)
+ if (!ret) {
ftrace_ops_update_code(ops);
-
+ free_ftrace_hash_rcu(old_hash);
+ }
mutex_unlock(&ftrace_lock);
out_regex_unlock:
@@ -4013,6 +4024,7 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
struct seq_file *m = (struct seq_file *)file->private_data;
struct ftrace_iterator *iter;
struct ftrace_hash **orig_hash;
+ struct ftrace_hash *old_hash;
struct trace_parser *parser;
int filter_hash;
int ret;
@@ -4042,11 +4054,13 @@ int ftrace_regex_release(struct inode *inode, struct file *file)
orig_hash = &iter->ops->func_hash->notrace_hash;
mutex_lock(&ftrace_lock);
+ old_hash = *orig_hash;
ret = ftrace_hash_move(iter->ops, filter_hash,
orig_hash, iter->hash);
- if (!ret)
+ if (!ret) {
ftrace_ops_update_code(iter->ops);
-
+ free_ftrace_hash_rcu(old_hash);
+ }
mutex_unlock(&ftrace_lock);
}
@@ -4678,6 +4692,7 @@ core_initcall(ftrace_nodyn_init);
static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
static inline void ftrace_startup_enable(int command) { }
+static inline void ftrace_startup_all(int command) { }
/* Keep as macros so we do not need to define the commands */
# define ftrace_startup(ops, command) \
({ \
@@ -4827,6 +4842,56 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
}
#endif
+/*
+ * If there's only one function registered but it does not support
+ * recursion, this function will be called by the mcount trampoline.
+ * This function will handle recursion protection.
+ */
+static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip,
+ struct ftrace_ops *op, struct pt_regs *regs)
+{
+ int bit;
+
+ bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX);
+ if (bit < 0)
+ return;
+
+ op->func(ip, parent_ip, op, regs);
+
+ trace_clear_recursion(bit);
+}
+
+/**
+ * ftrace_ops_get_func - get the function a trampoline should call
+ * @ops: the ops to get the function for
+ *
+ * Normally the mcount trampoline will call the ops->func, but there
+ * are times that it should not. For example, if the ops does not
+ * have its own recursion protection, then it should call the
+ * ftrace_ops_recurs_func() instead.
+ *
+ * Returns the function that the trampoline should call for @ops.
+ */
+ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops)
+{
+ /*
+ * If this is a dynamic ops or we force list func,
+ * then it needs to call the list anyway.
+ */
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC)
+ return ftrace_ops_list_func;
+
+ /*
+ * If the func handles its own recursion, call it directly.
+ * Otherwise call the recursion protected function that
+ * will call the ftrace ops function.
+ */
+ if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE))
+ return ftrace_ops_recurs_func;
+
+ return ops->func;
+}
+
static void clear_ftrace_swapper(void)
{
struct task_struct *p;
@@ -4927,7 +4992,8 @@ static int ftrace_pid_add(int p)
set_ftrace_pid_task(pid);
ftrace_update_pid_func();
- ftrace_startup_enable(0);
+
+ ftrace_startup_all(0);
mutex_unlock(&ftrace_lock);
return 0;
@@ -4956,7 +5022,7 @@ static void ftrace_pid_reset(void)
}
ftrace_update_pid_func();
- ftrace_startup_enable(0);
+ ftrace_startup_all(0);
mutex_unlock(&ftrace_lock);
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b38fb2b9e237..2d75c94ae87d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3359,7 +3359,7 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
iter->head = cpu_buffer->reader_page->read;
iter->cache_reader_page = iter->head_page;
- iter->cache_read = iter->head;
+ iter->cache_read = cpu_buffer->read;
if (iter->head)
iter->read_stamp = cpu_buffer->read_stamp;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ef06ce7e9cf8..0cc51edde3a8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2513,8 +2513,11 @@ static __init int event_test_thread(void *unused)
kfree(test_malloc);
set_current_state(TASK_INTERRUPTIBLE);
- while (!kthread_should_stop())
+ while (!kthread_should_stop()) {
schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
+ __set_current_state(TASK_RUNNING);
return 0;
}
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 5ef60499dc8e..b0f86ea77881 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -382,6 +382,8 @@ static int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
/* check the trace buffer */
ret = trace_test_buffer(&tr->trace_buffer, &count);
+
+ ftrace_enabled = 1;
tracing_start();
/* we should only have one item */
@@ -679,6 +681,8 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
/* check the trace buffer */
ret = trace_test_buffer(&tr->trace_buffer, &count);
+
+ ftrace_enabled = 1;
trace->reset(tr);
tracing_start();
@@ -1025,6 +1029,12 @@ trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
#endif
#ifdef CONFIG_SCHED_TRACER
+
+struct wakeup_test_data {
+ struct completion is_ready;
+ int go;
+};
+
static int trace_wakeup_test_thread(void *data)
{
/* Make this a -deadline thread */
@@ -1034,51 +1044,56 @@ static int trace_wakeup_test_thread(void *data)
.sched_deadline = 10000000ULL,
.sched_period = 10000000ULL
};
- struct completion *x = data;
+ struct wakeup_test_data *x = data;
sched_setattr(current, &attr);
/* Make it know we have a new prio */
- complete(x);
+ complete(&x->is_ready);
/* now go to sleep and let the test wake us up */
set_current_state(TASK_INTERRUPTIBLE);
- schedule();
+ while (!x->go) {
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
+ }
- complete(x);
+ complete(&x->is_ready);
+
+ set_current_state(TASK_INTERRUPTIBLE);
/* we are awake, now wait to disappear */
while (!kthread_should_stop()) {
- /*
- * This will likely be the system top priority
- * task, do short sleeps to let others run.
- */
- msleep(100);
+ schedule();
+ set_current_state(TASK_INTERRUPTIBLE);
}
+ __set_current_state(TASK_RUNNING);
+
return 0;
}
-
int
trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
{
unsigned long save_max = tr->max_latency;
struct task_struct *p;
- struct completion is_ready;
+ struct wakeup_test_data data;
unsigned long count;
int ret;
- init_completion(&is_ready);
+ memset(&data, 0, sizeof(data));
+
+ init_completion(&data.is_ready);
/* create a -deadline thread */
- p = kthread_run(trace_wakeup_test_thread, &is_ready, "ftrace-test");
+ p = kthread_run(trace_wakeup_test_thread, &data, "ftrace-test");
if (IS_ERR(p)) {
printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
return -1;
}
/* make sure the thread is running at -deadline policy */
- wait_for_completion(&is_ready);
+ wait_for_completion(&data.is_ready);
/* start the tracing */
ret = tracer_init(trace, tr);
@@ -1099,18 +1114,20 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
msleep(100);
}
- init_completion(&is_ready);
+ init_completion(&data.is_ready);
+
+ data.go = 1;
+ /* memory barrier is in the wake_up_process() */
wake_up_process(p);
/* Wait for the task to wake up */
- wait_for_completion(&is_ready);
+ wait_for_completion(&data.is_ready);
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
ret = trace_test_buffer(&tr->trace_buffer, NULL);
- printk("ret = %d\n", ret);
if (!ret)
ret = trace_test_buffer(&tr->max_buffer, &count);
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 759d5e004517..4dc8b79c5f75 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -425,7 +425,7 @@ static void unreg_event_syscall_enter(struct ftrace_event_file *file,
return;
mutex_lock(&syscall_trace_lock);
tr->sys_refcount_enter--;
- rcu_assign_pointer(tr->enter_syscall_files[num], NULL);
+ RCU_INIT_POINTER(tr->enter_syscall_files[num], NULL);
if (!tr->sys_refcount_enter)
unregister_trace_sys_enter(ftrace_syscall_enter, tr);
mutex_unlock(&syscall_trace_lock);
@@ -463,7 +463,7 @@ static void unreg_event_syscall_exit(struct ftrace_event_file *file,
return;
mutex_lock(&syscall_trace_lock);
tr->sys_refcount_exit--;
- rcu_assign_pointer(tr->exit_syscall_files[num], NULL);
+ RCU_INIT_POINTER(tr->exit_syscall_files[num], NULL);
if (!tr->sys_refcount_exit)
unregister_trace_sys_exit(ftrace_syscall_exit, tr);
mutex_unlock(&syscall_trace_lock);
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index a8d6914030fe..7b223b212683 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -47,6 +47,7 @@ static DEFINE_PER_CPU(bool, softlockup_touch_sync);
static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
+static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
#ifdef CONFIG_HARDLOCKUP_DETECTOR
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
@@ -333,8 +334,22 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
return HRTIMER_RESTART;
/* only warn once */
- if (__this_cpu_read(soft_watchdog_warn) == true)
+ if (__this_cpu_read(soft_watchdog_warn) == true) {
+ /*
+ * When multiple processes are causing softlockups the
+ * softlockup detector only warns on the first one
+ * because the code relies on a full quiet cycle to
+ * re-arm. The second process prevents the quiet cycle
+ * and never gets reported. Use task pointers to detect
+ * this.
+ */
+ if (__this_cpu_read(softlockup_task_ptr_saved) !=
+ current) {
+ __this_cpu_write(soft_watchdog_warn, false);
+ __touch_watchdog();
+ }
return HRTIMER_RESTART;
+ }
if (softlockup_all_cpu_backtrace) {
/* Prevent multiple soft-lockup reports if one cpu is already
@@ -350,6 +365,7 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
+ __this_cpu_write(softlockup_task_ptr_saved, current);
print_modules();
print_irqtrace_events(current);
if (regs)