summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorPhilippe Schenker <philippe.schenker@toradex.com>2019-05-08 16:08:30 +0200
committerPhilippe Schenker <philippe.schenker@toradex.com>2019-05-08 16:08:30 +0200
commit774f42075a4800fe4106dffca804e3207bc3c2e7 (patch)
tree4bb3e8ca84f7f23ac41966017e81ffe09e98412e /kernel
parent95a0514c2e68a035e2adbdb5a297da310dccb09c (diff)
parent33e2f28596a7059dbe31d837caae924a74aa36d8 (diff)
Merge branch 'linux-4.14.y_for_4.14-2.0.x-imx' into 4.14-2.0.x-imx
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/core.c51
-rw-r--r--kernel/bpf/hashtab.c4
-rw-r--r--kernel/bpf/inode.c32
-rw-r--r--kernel/bpf/map_in_map.c17
-rw-r--r--kernel/bpf/percpu_freelist.c41
-rw-r--r--kernel/bpf/percpu_freelist.h4
-rw-r--r--kernel/bpf/verifier.c804
-rw-r--r--kernel/cgroup/cgroup.c26
-rw-r--r--kernel/cgroup/pids.c4
-rw-r--r--kernel/cpu.c62
-rw-r--r--kernel/debug/debug_core.c4
-rw-r--r--kernel/debug/kdb/kdb_bt.c11
-rw-r--r--kernel/debug/kdb/kdb_debugger.c7
-rw-r--r--kernel/events/core.c32
-rw-r--r--kernel/events/ring_buffer.c3
-rw-r--r--kernel/exit.c3
-rw-r--r--kernel/futex.c114
-rw-r--r--kernel/hung_task.c50
-rw-r--r--kernel/irq/chip.c16
-rw-r--r--kernel/irq/internals.h8
-rw-r--r--kernel/irq/irqdesc.c8
-rw-r--r--kernel/irq/manage.c3
-rw-r--r--kernel/kprobes.c6
-rw-r--r--kernel/locking/rtmutex.c37
-rw-r--r--kernel/locking/rwsem-xadd.c11
-rw-r--r--kernel/module.c6
-rw-r--r--kernel/ptrace.c15
-rw-r--r--kernel/rcu/tree.c20
-rw-r--r--kernel/relay.c4
-rw-r--r--kernel/sched/cpufreq_schedutil.c3
-rw-r--r--kernel/sched/deadline.c3
-rw-r--r--kernel/sched/debug.c4
-rw-r--r--kernel/sched/fair.c36
-rw-r--r--kernel/sched/topology.c2
-rw-r--r--kernel/signal.c61
-rw-r--r--kernel/smp.c2
-rw-r--r--kernel/sysctl.c17
-rw-r--r--kernel/time/alarmtimer.c2
-rw-r--r--kernel/time/timekeeping.c4
-rw-r--r--kernel/trace/ftrace.c6
-rw-r--r--kernel/trace/ring_buffer.c7
-rw-r--r--kernel/trace/trace.c48
-rw-r--r--kernel/trace/trace_events_hist.c5
-rw-r--r--kernel/trace/trace_kdb.c6
-rw-r--r--kernel/trace/trace_uprobe.c11
45 files changed, 1212 insertions, 408 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index d203a5d6b726..e46106c6ac39 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -51,6 +51,7 @@
#define DST regs[insn->dst_reg]
#define SRC regs[insn->src_reg]
#define FP regs[BPF_REG_FP]
+#define AX regs[BPF_REG_AX]
#define ARG1 regs[BPF_REG_ARG1]
#define CTX regs[BPF_REG_CTX]
#define IMM insn->imm
@@ -552,6 +553,26 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from,
BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG);
BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG);
+ /* Constraints on AX register:
+ *
+ * AX register is inaccessible from user space. It is mapped in
+ * all JITs, and used here for constant blinding rewrites. It is
+ * typically "stateless" meaning its contents are only valid within
+ * the executed instruction, but not across several instructions.
+ * There are a few exceptions however which are further detailed
+ * below.
+ *
+ * Constant blinding is only used by JITs, not in the interpreter.
+ * The interpreter uses AX in some occasions as a local temporary
+ * register e.g. in DIV or MOD instructions.
+ *
+ * In restricted circumstances, the verifier can also use the AX
+ * register for rewrites as long as they do not interfere with
+ * the above cases!
+ */
+ if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX)
+ goto out;
+
if (from->imm == 0 &&
(from->code == (BPF_ALU | BPF_MOV | BPF_K) ||
from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) {
@@ -939,22 +960,22 @@ select_insn:
ALU64_MOD_X:
if (unlikely(SRC == 0))
return 0;
- div64_u64_rem(DST, SRC, &tmp);
- DST = tmp;
+ div64_u64_rem(DST, SRC, &AX);
+ DST = AX;
CONT;
ALU_MOD_X:
if (unlikely((u32)SRC == 0))
return 0;
- tmp = (u32) DST;
- DST = do_div(tmp, (u32) SRC);
+ AX = (u32) DST;
+ DST = do_div(AX, (u32) SRC);
CONT;
ALU64_MOD_K:
- div64_u64_rem(DST, IMM, &tmp);
- DST = tmp;
+ div64_u64_rem(DST, IMM, &AX);
+ DST = AX;
CONT;
ALU_MOD_K:
- tmp = (u32) DST;
- DST = do_div(tmp, (u32) IMM);
+ AX = (u32) DST;
+ DST = do_div(AX, (u32) IMM);
CONT;
ALU64_DIV_X:
if (unlikely(SRC == 0))
@@ -964,17 +985,17 @@ select_insn:
ALU_DIV_X:
if (unlikely((u32)SRC == 0))
return 0;
- tmp = (u32) DST;
- do_div(tmp, (u32) SRC);
- DST = (u32) tmp;
+ AX = (u32) DST;
+ do_div(AX, (u32) SRC);
+ DST = (u32) AX;
CONT;
ALU64_DIV_K:
DST = div64_u64(DST, IMM);
CONT;
ALU_DIV_K:
- tmp = (u32) DST;
- do_div(tmp, (u32) IMM);
- DST = (u32) tmp;
+ AX = (u32) DST;
+ do_div(AX, (u32) IMM);
+ DST = (u32) AX;
CONT;
ALU_END_TO_BE:
switch (IMM) {
@@ -1278,7 +1299,7 @@ STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */
static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \
{ \
u64 stack[stack_size / sizeof(u64)]; \
- u64 regs[MAX_BPF_REG]; \
+ u64 regs[MAX_BPF_EXT_REG]; \
\
FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \
ARG1 = (u64) (unsigned long) ctx; \
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index 3d0ecc273cc6..84237f640789 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -655,7 +655,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
}
if (htab_is_prealloc(htab)) {
- pcpu_freelist_push(&htab->freelist, &l->fnode);
+ __pcpu_freelist_push(&htab->freelist, &l->fnode);
} else {
atomic_dec(&htab->count);
l->htab = htab;
@@ -717,7 +717,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
} else {
struct pcpu_freelist_node *l;
- l = pcpu_freelist_pop(&htab->freelist);
+ l = __pcpu_freelist_pop(&htab->freelist);
if (!l)
return ERR_PTR(-E2BIG);
l_new = container_of(l, struct htab_elem, fnode);
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index be1dde967208..ccf9ffd5da78 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -365,19 +365,6 @@ out:
}
EXPORT_SYMBOL_GPL(bpf_obj_get_user);
-static void bpf_evict_inode(struct inode *inode)
-{
- enum bpf_type type;
-
- truncate_inode_pages_final(&inode->i_data);
- clear_inode(inode);
-
- if (S_ISLNK(inode->i_mode))
- kfree(inode->i_link);
- if (!bpf_inode_type(inode, &type))
- bpf_any_put(inode->i_private, type);
-}
-
/*
* Display the mount options in /proc/mounts.
*/
@@ -390,11 +377,28 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
+static void bpf_destroy_inode_deferred(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ enum bpf_type type;
+
+ if (S_ISLNK(inode->i_mode))
+ kfree(inode->i_link);
+ if (!bpf_inode_type(inode, &type))
+ bpf_any_put(inode->i_private, type);
+ free_inode_nonrcu(inode);
+}
+
+static void bpf_destroy_inode(struct inode *inode)
+{
+ call_rcu(&inode->i_rcu, bpf_destroy_inode_deferred);
+}
+
static const struct super_operations bpf_super_ops = {
.statfs = simple_statfs,
.drop_inode = generic_delete_inode,
.show_options = bpf_show_options,
- .evict_inode = bpf_evict_inode,
+ .destroy_inode = bpf_destroy_inode,
};
enum {
diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c
index 1da574612bea..c0c494b7647b 100644
--- a/kernel/bpf/map_in_map.c
+++ b/kernel/bpf/map_in_map.c
@@ -12,6 +12,7 @@
struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
{
struct bpf_map *inner_map, *inner_map_meta;
+ u32 inner_map_meta_size;
struct fd f;
f = fdget(inner_map_ufd);
@@ -34,7 +35,12 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
return ERR_PTR(-EINVAL);
}
- inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER);
+ inner_map_meta_size = sizeof(*inner_map_meta);
+ /* In some cases verifier needs to access beyond just base map. */
+ if (inner_map->ops == &array_map_ops)
+ inner_map_meta_size = sizeof(struct bpf_array);
+
+ inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER);
if (!inner_map_meta) {
fdput(f);
return ERR_PTR(-ENOMEM);
@@ -44,9 +50,16 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd)
inner_map_meta->key_size = inner_map->key_size;
inner_map_meta->value_size = inner_map->value_size;
inner_map_meta->map_flags = inner_map->map_flags;
- inner_map_meta->ops = inner_map->ops;
inner_map_meta->max_entries = inner_map->max_entries;
+ /* Misc members not needed in bpf_map_meta_equal() check. */
+ inner_map_meta->ops = inner_map->ops;
+ if (inner_map->ops == &array_map_ops) {
+ inner_map_meta->unpriv_array = inner_map->unpriv_array;
+ container_of(inner_map_meta, struct bpf_array, map)->index_mask =
+ container_of(inner_map, struct bpf_array, map)->index_mask;
+ }
+
fdput(f);
return inner_map_meta;
}
diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c
index 673fa6fe2d73..0c1b4ba9e90e 100644
--- a/kernel/bpf/percpu_freelist.c
+++ b/kernel/bpf/percpu_freelist.c
@@ -28,8 +28,8 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s)
free_percpu(s->freelist);
}
-static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
- struct pcpu_freelist_node *node)
+static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head,
+ struct pcpu_freelist_node *node)
{
raw_spin_lock(&head->lock);
node->next = head->first;
@@ -37,12 +37,22 @@ static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head,
raw_spin_unlock(&head->lock);
}
-void pcpu_freelist_push(struct pcpu_freelist *s,
+void __pcpu_freelist_push(struct pcpu_freelist *s,
struct pcpu_freelist_node *node)
{
struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist);
- __pcpu_freelist_push(head, node);
+ ___pcpu_freelist_push(head, node);
+}
+
+void pcpu_freelist_push(struct pcpu_freelist *s,
+ struct pcpu_freelist_node *node)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __pcpu_freelist_push(s, node);
+ local_irq_restore(flags);
}
void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
@@ -63,7 +73,7 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
for_each_possible_cpu(cpu) {
again:
head = per_cpu_ptr(s->freelist, cpu);
- __pcpu_freelist_push(head, buf);
+ ___pcpu_freelist_push(head, buf);
i++;
buf += elem_size;
if (i == nr_elems)
@@ -74,14 +84,12 @@ again:
local_irq_restore(flags);
}
-struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s)
{
struct pcpu_freelist_head *head;
struct pcpu_freelist_node *node;
- unsigned long flags;
int orig_cpu, cpu;
- local_irq_save(flags);
orig_cpu = cpu = raw_smp_processor_id();
while (1) {
head = per_cpu_ptr(s->freelist, cpu);
@@ -89,16 +97,25 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
node = head->first;
if (node) {
head->first = node->next;
- raw_spin_unlock_irqrestore(&head->lock, flags);
+ raw_spin_unlock(&head->lock);
return node;
}
raw_spin_unlock(&head->lock);
cpu = cpumask_next(cpu, cpu_possible_mask);
if (cpu >= nr_cpu_ids)
cpu = 0;
- if (cpu == orig_cpu) {
- local_irq_restore(flags);
+ if (cpu == orig_cpu)
return NULL;
- }
}
}
+
+struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s)
+{
+ struct pcpu_freelist_node *ret;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ ret = __pcpu_freelist_pop(s);
+ local_irq_restore(flags);
+ return ret;
+}
diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h
index 3049aae8ea1e..c3960118e617 100644
--- a/kernel/bpf/percpu_freelist.h
+++ b/kernel/bpf/percpu_freelist.h
@@ -22,8 +22,12 @@ struct pcpu_freelist_node {
struct pcpu_freelist_node *next;
};
+/* pcpu_freelist_* do spin_lock_irqsave. */
void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *);
+/* __pcpu_freelist_* do spin_lock only. caller must disable irqs. */
+void __pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *);
+struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *);
void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size,
u32 nr_elems);
int pcpu_freelist_init(struct pcpu_freelist *);
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f6755fd5bae2..a4875ff0bab1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -265,10 +265,11 @@ static void print_verifier_state(struct bpf_verifier_state *state)
verbose(")");
}
}
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] == STACK_SPILL)
- verbose(" fp%d=%s", -MAX_BPF_STACK + i,
- reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]);
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] == STACK_SPILL)
+ verbose(" fp%d=%s",
+ (-i - 1) * BPF_REG_SIZE,
+ reg_type_str[state->stack[i].spilled_ptr.type]);
}
verbose("\n");
}
@@ -434,40 +435,133 @@ static void print_bpf_insn(const struct bpf_verifier_env *env,
}
}
-static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx)
+static int copy_stack_state(struct bpf_verifier_state *dst,
+ const struct bpf_verifier_state *src)
{
- struct bpf_verifier_stack_elem *elem;
- int insn_idx;
+ if (!src->stack)
+ return 0;
+ if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) {
+ /* internal bug, make state invalid to reject the program */
+ memset(dst, 0, sizeof(*dst));
+ return -EFAULT;
+ }
+ memcpy(dst->stack, src->stack,
+ sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE));
+ return 0;
+}
+
+/* do_check() starts with zero-sized stack in struct bpf_verifier_state to
+ * make it consume minimal amount of memory. check_stack_write() access from
+ * the program calls into realloc_verifier_state() to grow the stack size.
+ * Note there is a non-zero 'parent' pointer inside bpf_verifier_state
+ * which this function copies over. It points to previous bpf_verifier_state
+ * which is never reallocated
+ */
+static int realloc_verifier_state(struct bpf_verifier_state *state, int size,
+ bool copy_old)
+{
+ u32 old_size = state->allocated_stack;
+ struct bpf_stack_state *new_stack;
+ int slot = size / BPF_REG_SIZE;
+
+ if (size <= old_size || !size) {
+ if (copy_old)
+ return 0;
+ state->allocated_stack = slot * BPF_REG_SIZE;
+ if (!size && old_size) {
+ kfree(state->stack);
+ state->stack = NULL;
+ }
+ return 0;
+ }
+ new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state),
+ GFP_KERNEL);
+ if (!new_stack)
+ return -ENOMEM;
+ if (copy_old) {
+ if (state->stack)
+ memcpy(new_stack, state->stack,
+ sizeof(*new_stack) * (old_size / BPF_REG_SIZE));
+ memset(new_stack + old_size / BPF_REG_SIZE, 0,
+ sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE);
+ }
+ state->allocated_stack = slot * BPF_REG_SIZE;
+ kfree(state->stack);
+ state->stack = new_stack;
+ return 0;
+}
+
+static void free_verifier_state(struct bpf_verifier_state *state,
+ bool free_self)
+{
+ kfree(state->stack);
+ if (free_self)
+ kfree(state);
+}
+
+/* copy verifier state from src to dst growing dst stack space
+ * when necessary to accommodate larger src stack
+ */
+static int copy_verifier_state(struct bpf_verifier_state *dst,
+ const struct bpf_verifier_state *src)
+{
+ int err;
+
+ err = realloc_verifier_state(dst, src->allocated_stack, false);
+ if (err)
+ return err;
+ memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack));
+ return copy_stack_state(dst, src);
+}
+
+static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx,
+ int *insn_idx)
+{
+ struct bpf_verifier_state *cur = env->cur_state;
+ struct bpf_verifier_stack_elem *elem, *head = env->head;
+ int err;
if (env->head == NULL)
- return -1;
+ return -ENOENT;
- memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state));
- insn_idx = env->head->insn_idx;
+ if (cur) {
+ err = copy_verifier_state(cur, &head->st);
+ if (err)
+ return err;
+ }
+ if (insn_idx)
+ *insn_idx = head->insn_idx;
if (prev_insn_idx)
- *prev_insn_idx = env->head->prev_insn_idx;
- elem = env->head->next;
- kfree(env->head);
+ *prev_insn_idx = head->prev_insn_idx;
+ elem = head->next;
+ free_verifier_state(&head->st, false);
+ kfree(head);
env->head = elem;
env->stack_size--;
- return insn_idx;
+ return 0;
}
static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
- int insn_idx, int prev_insn_idx)
+ int insn_idx, int prev_insn_idx,
+ bool speculative)
{
struct bpf_verifier_stack_elem *elem;
+ struct bpf_verifier_state *cur = env->cur_state;
+ int err;
- elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
+ elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL);
if (!elem)
goto err;
- memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state));
elem->insn_idx = insn_idx;
elem->prev_insn_idx = prev_insn_idx;
elem->next = env->head;
+ elem->st.speculative |= speculative;
env->head = elem;
env->stack_size++;
+ err = copy_verifier_state(&elem->st, cur);
+ if (err)
+ goto err;
if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) {
verbose("BPF program is too complex\n");
goto err;
@@ -475,7 +569,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env,
return &elem->st;
err:
/* pop all elements and return */
- while (pop_stack(env, NULL) >= 0);
+ while (!pop_stack(env, NULL, NULL));
return NULL;
}
@@ -671,7 +765,7 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
enum reg_arg_type t)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = env->cur_state->regs;
if (regno >= MAX_BPF_REG) {
verbose("R%d is invalid\n", regno);
@@ -684,7 +778,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
verbose("R%d !read_ok\n", regno);
return -EACCES;
}
- mark_reg_read(&env->cur_state, regno);
+ mark_reg_read(env->cur_state, regno);
} else {
/* check whether register used as dest operand can be written to */
if (regno == BPF_REG_FP) {
@@ -721,10 +815,21 @@ static int check_stack_write(struct bpf_verifier_env *env,
struct bpf_verifier_state *state, int off,
int size, int value_regno, int insn_idx)
{
- int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
+ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err;
+
+ err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE),
+ true);
+ if (err)
+ return err;
/* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0,
* so it's aligned access and [off, off + size) are within stack limits
*/
+ if (!env->allow_ptr_leaks &&
+ state->stack[spi].slot_type[0] == STACK_SPILL &&
+ size != BPF_REG_SIZE) {
+ verbose("attempt to corrupt spilled pointer on stack\n");
+ return -EACCES;
+ }
if (value_regno >= 0 &&
is_spillable_regtype(state->regs[value_regno].type)) {
@@ -736,11 +841,11 @@ static int check_stack_write(struct bpf_verifier_env *env,
}
/* save register state */
- state->spilled_regs[spi] = state->regs[value_regno];
- state->spilled_regs[spi].live |= REG_LIVE_WRITTEN;
+ state->stack[spi].spilled_ptr = state->regs[value_regno];
+ state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN;
for (i = 0; i < BPF_REG_SIZE; i++) {
- if (state->stack_slot_type[MAX_BPF_STACK + off + i] == STACK_MISC &&
+ if (state->stack[spi].slot_type[i] == STACK_MISC &&
!env->allow_ptr_leaks) {
int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off;
int soff = (-spi - 1) * BPF_REG_SIZE;
@@ -763,14 +868,15 @@ static int check_stack_write(struct bpf_verifier_env *env,
}
*poff = soff;
}
- state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL;
+ state->stack[spi].slot_type[i] = STACK_SPILL;
}
} else {
/* regular write of data into stack */
- state->spilled_regs[spi] = (struct bpf_reg_state) {};
+ state->stack[spi].spilled_ptr = (struct bpf_reg_state) {};
for (i = 0; i < size; i++)
- state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC;
+ state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] =
+ STACK_MISC;
}
return 0;
}
@@ -781,10 +887,10 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo
while (parent) {
/* if read wasn't screened by an earlier write ... */
- if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN)
+ if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN)
break;
/* ... then we depend on parent's value */
- parent->spilled_regs[slot].live |= REG_LIVE_READ;
+ parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ;
state = parent;
parent = state->parent;
}
@@ -793,34 +899,37 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo
static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
int value_regno)
{
- u8 *slot_type;
- int i, spi;
+ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE;
+ u8 *stype;
- slot_type = &state->stack_slot_type[MAX_BPF_STACK + off];
+ if (state->allocated_stack <= slot) {
+ verbose("invalid read from stack off %d+0 size %d\n",
+ off, size);
+ return -EACCES;
+ }
+ stype = state->stack[spi].slot_type;
- if (slot_type[0] == STACK_SPILL) {
+ if (stype[0] == STACK_SPILL) {
if (size != BPF_REG_SIZE) {
verbose("invalid size of register spill\n");
return -EACCES;
}
for (i = 1; i < BPF_REG_SIZE; i++) {
- if (slot_type[i] != STACK_SPILL) {
+ if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) {
verbose("corrupted spill memory\n");
return -EACCES;
}
}
- spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE;
-
if (value_regno >= 0) {
/* restore register state from stack */
- state->regs[value_regno] = state->spilled_regs[spi];
+ state->regs[value_regno] = state->stack[spi].spilled_ptr;
mark_stack_slot_read(state, spi);
}
return 0;
} else {
for (i = 0; i < size; i++) {
- if (slot_type[i] != STACK_MISC) {
+ if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) {
verbose("invalid read from stack off %d+%d size %d\n",
off, i, size);
return -EACCES;
@@ -833,11 +942,37 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size,
}
}
+static int check_stack_access(struct bpf_verifier_env *env,
+ const struct bpf_reg_state *reg,
+ int off, int size)
+{
+ /* Stack accesses must be at a fixed offset, so that we
+ * can determine what type of data were returned. See
+ * check_stack_read().
+ */
+ if (!tnum_is_const(reg->var_off)) {
+ char tn_buf[48];
+
+ tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
+ verbose("variable stack access var_off=%s off=%d size=%d",
+ tn_buf, off, size);
+ return -EACCES;
+ }
+
+ if (off >= 0 || off < -MAX_BPF_STACK) {
+ verbose("invalid stack off=%d size=%d\n", off, size);
+ return -EACCES;
+ }
+
+ return 0;
+}
+
/* check read/write into map element returned by bpf_map_lookup_elem() */
static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
int size)
{
- struct bpf_map *map = env->cur_state.regs[regno].map_ptr;
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_map *map = regs[regno].map_ptr;
if (off < 0 || size <= 0 || off + size > map->value_size) {
verbose("invalid access to map value, value_size=%d off=%d size=%d\n",
@@ -849,9 +984,9 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off,
/* check read/write into a map element with possible variable offset */
static int check_map_access(struct bpf_verifier_env *env, u32 regno,
- int off, int size)
+ int off, int size)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_reg_state *reg = &state->regs[regno];
int err;
@@ -861,13 +996,17 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno,
*/
if (log_level)
print_verifier_state(state);
+
/* The minimum value is only important with signed
* comparisons where we can't assume the floor of a
* value is 0. If we are using signed variables for our
* index'es we need to make sure that whatever we use
* will have a set floor within our range.
*/
- if (reg->smin_value < 0) {
+ if (reg->smin_value < 0 &&
+ (reg->smin_value == S64_MIN ||
+ (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) ||
+ reg->smin_value + off < 0)) {
verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n",
regno);
return -EACCES;
@@ -924,7 +1063,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env,
static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
int off, int size)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = &regs[regno];
if (off < 0 || size <= 0 || (u64)off + size > reg->range) {
@@ -938,7 +1077,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno,
static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off,
int size)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
struct bpf_reg_state *reg = &regs[regno];
int err;
@@ -1008,19 +1147,19 @@ static bool __is_pointer_value(bool allow_ptr_leaks,
static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
{
- return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
+ return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno);
}
static bool is_ctx_reg(struct bpf_verifier_env *env, int regno)
{
- const struct bpf_reg_state *reg = &env->cur_state.regs[regno];
+ const struct bpf_reg_state *reg = cur_regs(env) + regno;
return reg->type == PTR_TO_CTX;
}
static bool is_pkt_reg(struct bpf_verifier_env *env, int regno)
{
- const struct bpf_reg_state *reg = &env->cur_state.regs[regno];
+ const struct bpf_reg_state *reg = cur_regs(env) + regno;
return reg->type == PTR_TO_PACKET;
}
@@ -1145,8 +1284,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
int off, int bpf_size, enum bpf_access_type t,
int value_regno, bool strict_alignment_once)
{
- struct bpf_verifier_state *state = &env->cur_state;
- struct bpf_reg_state *reg = &state->regs[regno];
+ struct bpf_verifier_state *state = env->cur_state;
+ struct bpf_reg_state *regs = cur_regs(env);
+ struct bpf_reg_state *reg = regs + regno;
int size, err = 0;
size = bpf_size_to_bytes(bpf_size);
@@ -1170,7 +1310,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
err = check_map_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(regs, value_regno);
} else if (reg->type == PTR_TO_CTX) {
enum bpf_reg_type reg_type = SCALAR_VALUE;
@@ -1203,49 +1343,29 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
* the offset is zero.
*/
if (reg_type == SCALAR_VALUE)
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(regs, value_regno);
else
- mark_reg_known_zero(state->regs, value_regno);
- state->regs[value_regno].id = 0;
- state->regs[value_regno].off = 0;
- state->regs[value_regno].range = 0;
- state->regs[value_regno].type = reg_type;
+ mark_reg_known_zero(regs, value_regno);
+ regs[value_regno].id = 0;
+ regs[value_regno].off = 0;
+ regs[value_regno].range = 0;
+ regs[value_regno].type = reg_type;
}
} else if (reg->type == PTR_TO_STACK) {
- /* stack accesses must be at a fixed offset, so that we can
- * determine what type of data were returned.
- * See check_stack_read().
- */
- if (!tnum_is_const(reg->var_off)) {
- char tn_buf[48];
-
- tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off);
- verbose("variable stack access var_off=%s off=%d size=%d",
- tn_buf, off, size);
- return -EACCES;
- }
off += reg->var_off.value;
- if (off >= 0 || off < -MAX_BPF_STACK) {
- verbose("invalid stack off=%d size=%d\n", off, size);
- return -EACCES;
- }
+ err = check_stack_access(env, reg, off, size);
+ if (err)
+ return err;
if (env->prog->aux->stack_depth < -off)
env->prog->aux->stack_depth = -off;
- if (t == BPF_WRITE) {
- if (!env->allow_ptr_leaks &&
- state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL &&
- size != BPF_REG_SIZE) {
- verbose("attempt to corrupt spilled pointer on stack\n");
- return -EACCES;
- }
+ if (t == BPF_WRITE)
err = check_stack_write(env, state, off, size,
value_regno, insn_idx);
- } else {
+ else
err = check_stack_read(state, off, size, value_regno);
- }
} else if (reg->type == PTR_TO_PACKET) {
if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) {
verbose("cannot write into packet\n");
@@ -1258,7 +1378,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
}
err = check_packet_access(env, regno, off, size);
if (!err && t == BPF_READ && value_regno >= 0)
- mark_reg_unknown(state->regs, value_regno);
+ mark_reg_unknown(regs, value_regno);
} else {
verbose("R%d invalid mem access '%s'\n",
regno, reg_type_str[reg->type]);
@@ -1266,9 +1386,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn
}
if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ &&
- state->regs[value_regno].type == SCALAR_VALUE) {
+ regs[value_regno].type == SCALAR_VALUE) {
/* b/h/w load zero-extends, mark upper bits as known 0 */
- coerce_reg_to_size(&state->regs[value_regno], size);
+ coerce_reg_to_size(&regs[value_regno], size);
}
return err;
}
@@ -1333,9 +1453,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_reg_state *regs = state->regs;
- int off, i;
+ int off, i, slot, spi;
if (regs[regno].type != PTR_TO_STACK) {
/* Allow zero-byte read from NULL, regardless of pointer type */
@@ -1376,7 +1496,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno,
}
for (i = 0; i < access_size; i++) {
- if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) {
+ slot = -(off + i) - 1;
+ spi = slot / BPF_REG_SIZE;
+ if (state->allocated_stack <= slot ||
+ state->stack[spi].slot_type[slot % BPF_REG_SIZE] !=
+ STACK_MISC) {
verbose("invalid indirect read from stack off %d+%d size %d\n",
off, i, access_size);
return -EACCES;
@@ -1389,7 +1513,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno,
int access_size, bool zero_size_allowed,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
switch (reg->type) {
case PTR_TO_PACKET:
@@ -1406,7 +1530,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno,
enum bpf_arg_type arg_type,
struct bpf_call_arg_meta *meta)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *reg = &regs[regno];
+ struct bpf_reg_state *regs = cur_regs(env), *reg = &regs[regno];
enum bpf_reg_type expected_type, type = reg->type;
int err = 0;
@@ -1678,7 +1802,7 @@ static int check_raw_mode(const struct bpf_func_proto *fn)
*/
static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state = env->cur_state;
struct bpf_reg_state *regs = state->regs, *reg;
int i;
@@ -1687,10 +1811,10 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
regs[i].type == PTR_TO_PACKET_END)
mark_reg_unknown(regs, i);
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] != STACK_SPILL)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- reg = &state->spilled_regs[i / BPF_REG_SIZE];
+ reg = &state->stack[i].spilled_ptr;
if (reg->type != PTR_TO_PACKET &&
reg->type != PTR_TO_PACKET_END)
continue;
@@ -1700,9 +1824,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env)
static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
{
- struct bpf_verifier_state *state = &env->cur_state;
const struct bpf_func_proto *fn = NULL;
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *regs;
struct bpf_call_arg_meta meta;
bool changes_data;
int i, err;
@@ -1776,6 +1899,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx)
return err;
}
+ regs = cur_regs(env);
/* reset caller saved regs */
for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(regs, caller_saved[i]);
@@ -1880,6 +2004,125 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env,
return true;
}
+static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env)
+{
+ return &env->insn_aux_data[env->insn_idx];
+}
+
+static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg,
+ u32 *ptr_limit, u8 opcode, bool off_is_neg)
+{
+ bool mask_to_left = (opcode == BPF_ADD && off_is_neg) ||
+ (opcode == BPF_SUB && !off_is_neg);
+ u32 off;
+
+ switch (ptr_reg->type) {
+ case PTR_TO_STACK:
+ off = ptr_reg->off + ptr_reg->var_off.value;
+ if (mask_to_left)
+ *ptr_limit = MAX_BPF_STACK + off;
+ else
+ *ptr_limit = -off;
+ return 0;
+ case PTR_TO_MAP_VALUE:
+ if (mask_to_left) {
+ *ptr_limit = ptr_reg->umax_value + ptr_reg->off;
+ } else {
+ off = ptr_reg->smin_value + ptr_reg->off;
+ *ptr_limit = ptr_reg->map_ptr->value_size - off;
+ }
+ return 0;
+ default:
+ return -EINVAL;
+ }
+}
+
+static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env,
+ const struct bpf_insn *insn)
+{
+ return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K;
+}
+
+static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux,
+ u32 alu_state, u32 alu_limit)
+{
+ /* If we arrived here from different branches with different
+ * state or limits to sanitize, then this won't work.
+ */
+ if (aux->alu_state &&
+ (aux->alu_state != alu_state ||
+ aux->alu_limit != alu_limit))
+ return -EACCES;
+
+ /* Corresponding fixup done in fixup_bpf_calls(). */
+ aux->alu_state = alu_state;
+ aux->alu_limit = alu_limit;
+ return 0;
+}
+
+static int sanitize_val_alu(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_insn_aux_data *aux = cur_aux(env);
+
+ if (can_skip_alu_sanitation(env, insn))
+ return 0;
+
+ return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0);
+}
+
+static int sanitize_ptr_alu(struct bpf_verifier_env *env,
+ struct bpf_insn *insn,
+ const struct bpf_reg_state *ptr_reg,
+ struct bpf_reg_state *dst_reg,
+ bool off_is_neg)
+{
+ struct bpf_verifier_state *vstate = env->cur_state;
+ struct bpf_insn_aux_data *aux = cur_aux(env);
+ bool ptr_is_dst_reg = ptr_reg == dst_reg;
+ u8 opcode = BPF_OP(insn->code);
+ u32 alu_state, alu_limit;
+ struct bpf_reg_state tmp;
+ bool ret;
+
+ if (can_skip_alu_sanitation(env, insn))
+ return 0;
+
+ /* We already marked aux for masking from non-speculative
+ * paths, thus we got here in the first place. We only care
+ * to explore bad access from here.
+ */
+ if (vstate->speculative)
+ goto do_sim;
+
+ alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0;
+ alu_state |= ptr_is_dst_reg ?
+ BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST;
+
+ if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg))
+ return 0;
+ if (update_alu_sanitation_state(aux, alu_state, alu_limit))
+ return -EACCES;
+do_sim:
+ /* Simulate and find potential out-of-bounds access under
+ * speculative execution from truncation as a result of
+ * masking when off was not within expected range. If off
+ * sits in dst, then we temporarily need to move ptr there
+ * to simulate dst (== 0) +/-= ptr. Needed, for example,
+ * for cases where we use K-based arithmetic in one direction
+ * and truncated reg-based in the other in order to explore
+ * bad access.
+ */
+ if (!ptr_is_dst_reg) {
+ tmp = *dst_reg;
+ *dst_reg = *ptr_reg;
+ }
+ ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true);
+ if (!ptr_is_dst_reg && ret)
+ *dst_reg = tmp;
+ return !ret ? -EFAULT : 0;
+}
+
/* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off.
* Caller should also handle BPF_MOV case separately.
* If we return -EACCES, caller may want to try again treating pointer as a
@@ -1890,14 +2133,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
const struct bpf_reg_state *ptr_reg,
const struct bpf_reg_state *off_reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg;
+ struct bpf_reg_state *regs = cur_regs(env), *dst_reg;
bool known = tnum_is_const(off_reg->var_off);
s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value,
smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value;
u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value,
umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value;
+ u32 dst = insn->dst_reg, src = insn->src_reg;
u8 opcode = BPF_OP(insn->code);
- u32 dst = insn->dst_reg;
+ int ret;
dst_reg = &regs[dst];
@@ -1949,6 +2193,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
switch (opcode) {
case BPF_ADD:
+ ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
+ if (ret < 0) {
+ verbose("R%d tried to add from different maps or paths\n", dst);
+ return ret;
+ }
/* We can take a fixed offset as long as it doesn't overflow
* the s32 'off' field
*/
@@ -1999,6 +2248,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
}
break;
case BPF_SUB:
+ ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0);
+ if (ret < 0) {
+ verbose("R%d tried to sub from different maps or paths\n", dst);
+ return ret;
+ }
if (dst_reg == off_reg) {
/* scalar -= pointer. Creates an unknown scalar */
if (!env->allow_ptr_leaks)
@@ -2071,6 +2325,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
verbose("R%d bitwise operator %s on pointer prohibited\n",
dst, bpf_alu_string[opcode >> 4]);
return -EACCES;
+ case PTR_TO_MAP_VALUE:
+ if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) {
+ verbose("R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n",
+ off_reg == dst_reg ? dst : src);
+ return -EACCES;
+ }
+ /* fall-through */
default:
/* other operators (e.g. MUL,LSH) produce non-pointer results */
if (!env->allow_ptr_leaks)
@@ -2085,6 +2346,25 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env,
__update_reg_bounds(dst_reg);
__reg_deduce_bounds(dst_reg);
__reg_bound_offset(dst_reg);
+
+ /* For unprivileged we require that resulting offset must be in bounds
+ * in order to be able to sanitize access later on.
+ */
+ if (!env->allow_ptr_leaks) {
+ if (dst_reg->type == PTR_TO_MAP_VALUE &&
+ check_map_access(env, dst, dst_reg->off, 1)) {
+ verbose("R%d pointer arithmetic of map value goes out of range, "
+ "prohibited for !root\n", dst);
+ return -EACCES;
+ } else if (dst_reg->type == PTR_TO_STACK &&
+ check_stack_access(env, dst_reg, dst_reg->off +
+ dst_reg->var_off.value, 1)) {
+ verbose("R%d stack pointer arithmetic goes out of range, "
+ "prohibited for !root\n", dst);
+ return -EACCES;
+ }
+ }
+
return 0;
}
@@ -2097,12 +2377,14 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
struct bpf_reg_state *dst_reg,
struct bpf_reg_state src_reg)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
bool src_known, dst_known;
s64 smin_val, smax_val;
u64 umin_val, umax_val;
u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32;
+ u32 dst = insn->dst_reg;
+ int ret;
if (insn_bitness == 32) {
/* Relevant for 32-bit RSH: Information can propagate towards
@@ -2137,6 +2419,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
switch (opcode) {
case BPF_ADD:
+ ret = sanitize_val_alu(env, insn);
+ if (ret < 0) {
+ verbose("R%d tried to add from different pointers or scalars\n", dst);
+ return ret;
+ }
if (signed_add_overflows(dst_reg->smin_value, smin_val) ||
signed_add_overflows(dst_reg->smax_value, smax_val)) {
dst_reg->smin_value = S64_MIN;
@@ -2156,6 +2443,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off);
break;
case BPF_SUB:
+ ret = sanitize_val_alu(env, insn);
+ if (ret < 0) {
+ verbose("R%d tried to sub from different pointers or scalars\n", dst);
+ return ret;
+ }
if (signed_sub_overflows(dst_reg->smin_value, smax_val) ||
signed_sub_overflows(dst_reg->smax_value, smin_val)) {
/* Overflow possible, we know nothing */
@@ -2345,7 +2637,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env,
static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg;
+ struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg;
struct bpf_reg_state *ptr_reg = NULL, off_reg = {0};
u8 opcode = BPF_OP(insn->code);
int rc;
@@ -2419,12 +2711,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* Got here implies adding two SCALAR_VALUEs */
if (WARN_ON_ONCE(ptr_reg)) {
- print_verifier_state(&env->cur_state);
+ print_verifier_state(env->cur_state);
verbose("verifier internal error: unexpected ptr_reg\n");
return -EINVAL;
}
if (WARN_ON(!src_reg)) {
- print_verifier_state(&env->cur_state);
+ print_verifier_state(env->cur_state);
verbose("verifier internal error: no src_reg\n");
return -EINVAL;
}
@@ -2434,7 +2726,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env,
/* check validity of 32-bit and 64-bit arithmetic operations */
static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
u8 opcode = BPF_OP(insn->code);
int err;
@@ -2661,10 +2953,10 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
/* keep the maximum range already checked */
regs[i].range = max(regs[i].range, new_range);
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] != STACK_SPILL)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- reg = &state->spilled_regs[i / BPF_REG_SIZE];
+ reg = &state->stack[i].spilled_ptr;
if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
reg->range = max(reg->range, new_range);
}
@@ -2914,17 +3206,17 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
for (i = 0; i < MAX_BPF_REG; i++)
mark_map_reg(regs, i, id, is_null);
- for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
- if (state->stack_slot_type[i] != STACK_SPILL)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) {
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null);
+ mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null);
}
}
static int check_cond_jmp_op(struct bpf_verifier_env *env,
struct bpf_insn *insn, int *insn_idx)
{
- struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state;
+ struct bpf_verifier_state *other_branch, *this_branch = env->cur_state;
struct bpf_reg_state *regs = this_branch->regs, *dst_reg;
u8 opcode = BPF_OP(insn->code);
int err;
@@ -2984,7 +3276,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
}
}
- other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx);
+ other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx,
+ false);
if (!other_branch)
return -EFAULT;
@@ -3087,7 +3380,7 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn)
/* verify BPF_LD_IMM64 instruction */
static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
int err;
if (BPF_SIZE(insn->code) != BPF_DW) {
@@ -3148,7 +3441,7 @@ static bool may_access_skb(enum bpf_prog_type type)
*/
static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn)
{
- struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *regs = cur_regs(env);
u8 mode = BPF_MODE(insn->code);
int i, err;
@@ -3534,6 +3827,57 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur,
return false;
}
+static bool stacksafe(struct bpf_verifier_state *old,
+ struct bpf_verifier_state *cur,
+ struct idpair *idmap)
+{
+ int i, spi;
+
+ /* if explored stack has more populated slots than current stack
+ * such stacks are not equivalent
+ */
+ if (old->allocated_stack > cur->allocated_stack)
+ return false;
+
+ /* walk slots of the explored stack and ignore any additional
+ * slots in the current stack, since explored(safe) state
+ * didn't use them
+ */
+ for (i = 0; i < old->allocated_stack; i++) {
+ spi = i / BPF_REG_SIZE;
+
+ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID)
+ continue;
+ if (old->stack[spi].slot_type[i % BPF_REG_SIZE] !=
+ cur->stack[spi].slot_type[i % BPF_REG_SIZE])
+ /* Ex: old explored (safe) state has STACK_SPILL in
+ * this stack slot, but current has has STACK_MISC ->
+ * this verifier states are not equivalent,
+ * return false to continue verification of this path
+ */
+ return false;
+ if (i % BPF_REG_SIZE)
+ continue;
+ if (old->stack[spi].slot_type[0] != STACK_SPILL)
+ continue;
+ if (!regsafe(&old->stack[spi].spilled_ptr,
+ &cur->stack[spi].spilled_ptr,
+ idmap))
+ /* when explored and current stack slot are both storing
+ * spilled registers, check that stored pointers types
+ * are the same as well.
+ * Ex: explored safe path could have stored
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
+ * but current path has stored:
+ * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
+ * such verifier states are not equivalent.
+ * return false to continue verification of this path
+ */
+ return false;
+ }
+ return true;
+}
+
/* compare two verifier states
*
* all states stored in state_list are known to be valid, since
@@ -3568,6 +3912,12 @@ static bool states_equal(struct bpf_verifier_env *env,
bool ret = false;
int i;
+ /* Verification state from speculative execution simulation
+ * must never prune a non-speculative execution one.
+ */
+ if (old->speculative && !cur->speculative)
+ return false;
+
idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL);
/* If we failed to allocate the idmap, just say it's not safe */
if (!idmap)
@@ -3578,37 +3928,8 @@ static bool states_equal(struct bpf_verifier_env *env,
goto out_free;
}
- for (i = 0; i < MAX_BPF_STACK; i++) {
- if (old->stack_slot_type[i] == STACK_INVALID)
- continue;
- if (old->stack_slot_type[i] != cur->stack_slot_type[i])
- /* Ex: old explored (safe) state has STACK_SPILL in
- * this stack slot, but current has has STACK_MISC ->
- * this verifier states are not equivalent,
- * return false to continue verification of this path
- */
- goto out_free;
- if (i % BPF_REG_SIZE)
- continue;
- if (old->stack_slot_type[i] != STACK_SPILL)
- continue;
- if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE],
- &cur->spilled_regs[i / BPF_REG_SIZE],
- idmap))
- /* when explored and current stack slot are both storing
- * spilled registers, check that stored pointers types
- * are the same as well.
- * Ex: explored safe path could have stored
- * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8}
- * but current path has stored:
- * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16}
- * such verifier states are not equivalent.
- * return false to continue verification of this path
- */
- goto out_free;
- else
- continue;
- }
+ if (!stacksafe(old, cur, idmap))
+ goto out_free;
ret = true;
out_free:
kfree(idmap);
@@ -3644,17 +3965,19 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state,
}
}
/* ... and stack slots */
- for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) {
- if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+ for (i = 0; i < state->allocated_stack / BPF_REG_SIZE &&
+ i < parent->allocated_stack / BPF_REG_SIZE; i++) {
+ if (parent->stack[i].slot_type[0] != STACK_SPILL)
continue;
- if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL)
+ if (state->stack[i].slot_type[0] != STACK_SPILL)
continue;
- if (parent->spilled_regs[i].live & REG_LIVE_READ)
+ if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ)
continue;
- if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN))
+ if (writes &&
+ (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN))
continue;
- if (state->spilled_regs[i].live & REG_LIVE_READ) {
- parent->spilled_regs[i].live |= REG_LIVE_READ;
+ if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) {
+ parent->stack[i].spilled_ptr.live |= REG_LIVE_READ;
touched = true;
}
}
@@ -3684,7 +4007,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
{
struct bpf_verifier_state_list *new_sl;
struct bpf_verifier_state_list *sl;
- int i;
+ struct bpf_verifier_state *cur = env->cur_state;
+ int i, err;
sl = env->explored_states[insn_idx];
if (!sl)
@@ -3694,7 +4018,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
return 0;
while (sl != STATE_LIST_MARK) {
- if (states_equal(env, &sl->state, &env->cur_state)) {
+ if (states_equal(env, &sl->state, cur)) {
/* reached equivalent register/stack state,
* prune the search.
* Registers read by the continuation are read by us.
@@ -3705,7 +4029,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* they'll be immediately forgotten as we're pruning
* this state and will pop a new one.
*/
- propagate_liveness(&sl->state, &env->cur_state);
+ propagate_liveness(&sl->state, cur);
return 1;
}
sl = sl->next;
@@ -3717,16 +4041,21 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* it will be rejected. Since there are no loops, we won't be
* seeing this 'insn_idx' instruction again on the way to bpf_exit
*/
- new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER);
+ new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL);
if (!new_sl)
return -ENOMEM;
/* add new state to the head of linked list */
- memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state));
+ err = copy_verifier_state(&new_sl->state, cur);
+ if (err) {
+ free_verifier_state(&new_sl->state, false);
+ kfree(new_sl);
+ return err;
+ }
new_sl->next = env->explored_states[insn_idx];
env->explored_states[insn_idx] = new_sl;
/* connect new state to parentage chain */
- env->cur_state.parent = &new_sl->state;
+ cur->parent = &new_sl->state;
/* clear write marks in current state: the writes we did are not writes
* our child did, so they don't screen off its reads from us.
* (There are no read marks in current state, because reads always mark
@@ -3734,10 +4063,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx)
* explored_states can get read marks.)
*/
for (i = 0; i < BPF_REG_FP; i++)
- env->cur_state.regs[i].live = REG_LIVE_NONE;
- for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++)
- if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL)
- env->cur_state.spilled_regs[i].live = REG_LIVE_NONE;
+ cur->regs[i].live = REG_LIVE_NONE;
+ for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++)
+ if (cur->stack[i].slot_type[0] == STACK_SPILL)
+ cur->stack[i].spilled_ptr.live = REG_LIVE_NONE;
return 0;
}
@@ -3752,29 +4081,31 @@ static int ext_analyzer_insn_hook(struct bpf_verifier_env *env,
static int do_check(struct bpf_verifier_env *env)
{
- struct bpf_verifier_state *state = &env->cur_state;
+ struct bpf_verifier_state *state;
struct bpf_insn *insns = env->prog->insnsi;
- struct bpf_reg_state *regs = state->regs;
+ struct bpf_reg_state *regs;
int insn_cnt = env->prog->len;
- int insn_idx, prev_insn_idx = 0;
int insn_processed = 0;
bool do_print_state = false;
- init_reg_state(regs);
+ state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL);
+ if (!state)
+ return -ENOMEM;
+ env->cur_state = state;
+ init_reg_state(state->regs);
state->parent = NULL;
- insn_idx = 0;
for (;;) {
struct bpf_insn *insn;
u8 class;
int err;
- if (insn_idx >= insn_cnt) {
+ if (env->insn_idx >= insn_cnt) {
verbose("invalid insn idx %d insn_cnt %d\n",
- insn_idx, insn_cnt);
+ env->insn_idx, insn_cnt);
return -EFAULT;
}
- insn = &insns[insn_idx];
+ insn = &insns[env->insn_idx];
class = BPF_CLASS(insn->code);
if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) {
@@ -3783,17 +4114,19 @@ static int do_check(struct bpf_verifier_env *env)
return -E2BIG;
}
- err = is_state_visited(env, insn_idx);
+ err = is_state_visited(env, env->insn_idx);
if (err < 0)
return err;
if (err == 1) {
/* found equivalent state, can prune the search */
if (log_level) {
if (do_print_state)
- verbose("\nfrom %d to %d: safe\n",
- prev_insn_idx, insn_idx);
+ verbose("\nfrom %d to %d%s: safe\n",
+ env->prev_insn_idx, env->insn_idx,
+ env->cur_state->speculative ?
+ " (speculative execution)" : "");
else
- verbose("%d: safe\n", insn_idx);
+ verbose("%d: safe\n", env->insn_idx);
}
goto process_bpf_exit;
}
@@ -3803,24 +4136,27 @@ static int do_check(struct bpf_verifier_env *env)
if (log_level > 1 || (log_level && do_print_state)) {
if (log_level > 1)
- verbose("%d:", insn_idx);
+ verbose("%d:", env->insn_idx);
else
- verbose("\nfrom %d to %d:",
- prev_insn_idx, insn_idx);
- print_verifier_state(&env->cur_state);
+ verbose("\nfrom %d to %d%s:",
+ env->prev_insn_idx, env->insn_idx,
+ env->cur_state->speculative ?
+ " (speculative execution)" : "");
+ print_verifier_state(env->cur_state);
do_print_state = false;
}
if (log_level) {
- verbose("%d: ", insn_idx);
+ verbose("%d: ", env->insn_idx);
print_bpf_insn(env, insn);
}
- err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
+ err = ext_analyzer_insn_hook(env, env->insn_idx, env->prev_insn_idx);
if (err)
return err;
- env->insn_aux_data[insn_idx].seen = true;
+ regs = cur_regs(env);
+ env->insn_aux_data[env->insn_idx].seen = true;
if (class == BPF_ALU || class == BPF_ALU64) {
err = check_alu_op(env, insn);
if (err)
@@ -3845,13 +4181,13 @@ static int do_check(struct bpf_verifier_env *env)
/* check that memory (src_reg + off) is readable,
* the state of dst_reg will be updated by this func
*/
- err = check_mem_access(env, insn_idx, insn->src_reg, insn->off,
- BPF_SIZE(insn->code), BPF_READ,
- insn->dst_reg, false);
+ err = check_mem_access(env, env->insn_idx, insn->src_reg,
+ insn->off, BPF_SIZE(insn->code),
+ BPF_READ, insn->dst_reg, false);
if (err)
return err;
- prev_src_type = &env->insn_aux_data[insn_idx].ptr_type;
+ prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type;
if (*prev_src_type == NOT_INIT) {
/* saw a valid insn
@@ -3878,10 +4214,10 @@ static int do_check(struct bpf_verifier_env *env)
enum bpf_reg_type *prev_dst_type, dst_reg_type;
if (BPF_MODE(insn->code) == BPF_XADD) {
- err = check_xadd(env, insn_idx, insn);
+ err = check_xadd(env, env->insn_idx, insn);
if (err)
return err;
- insn_idx++;
+ env->insn_idx++;
continue;
}
@@ -3897,13 +4233,13 @@ static int do_check(struct bpf_verifier_env *env)
dst_reg_type = regs[insn->dst_reg].type;
/* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_WRITE,
- insn->src_reg, false);
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg,
+ insn->off, BPF_SIZE(insn->code),
+ BPF_WRITE, insn->src_reg, false);
if (err)
return err;
- prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type;
+ prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type;
if (*prev_dst_type == NOT_INIT) {
*prev_dst_type = dst_reg_type;
@@ -3932,9 +4268,9 @@ static int do_check(struct bpf_verifier_env *env)
}
/* check that memory (dst_reg + off) is writeable */
- err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off,
- BPF_SIZE(insn->code), BPF_WRITE,
- -1, false);
+ err = check_mem_access(env, env->insn_idx, insn->dst_reg,
+ insn->off, BPF_SIZE(insn->code),
+ BPF_WRITE, -1, false);
if (err)
return err;
@@ -3950,7 +4286,7 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- err = check_call(env, insn->imm, insn_idx);
+ err = check_call(env, insn->imm, env->insn_idx);
if (err)
return err;
@@ -3963,7 +4299,7 @@ static int do_check(struct bpf_verifier_env *env)
return -EINVAL;
}
- insn_idx += insn->off + 1;
+ env->insn_idx += insn->off + 1;
continue;
} else if (opcode == BPF_EXIT) {
@@ -3991,15 +4327,17 @@ static int do_check(struct bpf_verifier_env *env)
}
process_bpf_exit:
- insn_idx = pop_stack(env, &prev_insn_idx);
- if (insn_idx < 0) {
+ err = pop_stack(env, &env->prev_insn_idx, &env->insn_idx);
+ if (err < 0) {
+ if (err != -ENOENT)
+ return err;
break;
} else {
do_print_state = true;
continue;
}
} else {
- err = check_cond_jmp_op(env, insn, &insn_idx);
+ err = check_cond_jmp_op(env, insn, &env->insn_idx);
if (err)
return err;
}
@@ -4016,8 +4354,8 @@ process_bpf_exit:
if (err)
return err;
- insn_idx++;
- env->insn_aux_data[insn_idx].seen = true;
+ env->insn_idx++;
+ env->insn_aux_data[env->insn_idx].seen = true;
} else {
verbose("invalid BPF_LD mode\n");
return -EINVAL;
@@ -4027,7 +4365,7 @@ process_bpf_exit:
return -EINVAL;
}
- insn_idx++;
+ env->insn_idx++;
}
verbose("processed %d insns, stack depth %d\n",
@@ -4402,6 +4740,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
struct bpf_prog *new_prog;
struct bpf_map *map_ptr;
int i, cnt, delta = 0;
+ struct bpf_insn_aux_data *aux;
for (i = 0; i < insn_cnt; i++, insn++) {
if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) ||
@@ -4422,6 +4761,58 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env)
continue;
}
+ if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) ||
+ insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) {
+ const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X;
+ const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X;
+ struct bpf_insn insn_buf[16];
+ struct bpf_insn *patch = &insn_buf[0];
+ bool issrc, isneg;
+ u32 off_reg;
+
+ aux = &env->insn_aux_data[i + delta];
+ if (!aux->alu_state ||
+ aux->alu_state == BPF_ALU_NON_POINTER)
+ continue;
+
+ isneg = aux->alu_state & BPF_ALU_NEG_VALUE;
+ issrc = (aux->alu_state & BPF_ALU_SANITIZE) ==
+ BPF_ALU_SANITIZE_SRC;
+
+ off_reg = issrc ? insn->src_reg : insn->dst_reg;
+ if (isneg)
+ *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+ *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1);
+ *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg);
+ *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg);
+ *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0);
+ *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63);
+ if (issrc) {
+ *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX,
+ off_reg);
+ insn->src_reg = BPF_REG_AX;
+ } else {
+ *patch++ = BPF_ALU64_REG(BPF_AND, off_reg,
+ BPF_REG_AX);
+ }
+ if (isneg)
+ insn->code = insn->code == code_add ?
+ code_sub : code_add;
+ *patch++ = *insn;
+ if (issrc && isneg)
+ *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1);
+ cnt = patch - insn_buf;
+
+ new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt);
+ if (!new_prog)
+ return -ENOMEM;
+
+ delta += cnt - 1;
+ env->prog = prog = new_prog;
+ insn = new_prog->insnsi + i + delta;
+ continue;
+ }
+
if (insn->code != (BPF_JMP | BPF_CALL))
continue;
@@ -4557,6 +4948,7 @@ static void free_states(struct bpf_verifier_env *env)
if (sl)
while (sl != STATE_LIST_MARK) {
sln = sl->next;
+ free_verifier_state(&sl->state, false);
kfree(sl);
sl = sln;
}
@@ -4633,9 +5025,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr)
env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
ret = do_check(env);
+ if (env->cur_state) {
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
+ }
skip_full_check:
- while (pop_stack(env, NULL) >= 0);
+ while (!pop_stack(env, NULL, NULL));
free_states(env);
if (ret == 0)
@@ -4741,9 +5137,13 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops,
env->allow_ptr_leaks = capable(CAP_SYS_ADMIN);
ret = do_check(env);
+ if (env->cur_state) {
+ free_verifier_state(env->cur_state, true);
+ env->cur_state = NULL;
+ }
skip_full_check:
- while (pop_stack(env, NULL) >= 0);
+ while (!pop_stack(env, NULL, NULL));
free_states(env);
mutex_unlock(&bpf_verifier_lock);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 109c32c56de7..694b1cc8d144 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -187,7 +187,7 @@ static u64 css_serial_nr_next = 1;
*/
static u16 have_fork_callback __read_mostly;
static u16 have_exit_callback __read_mostly;
-static u16 have_free_callback __read_mostly;
+static u16 have_release_callback __read_mostly;
static u16 have_canfork_callback __read_mostly;
/* cgroup namespace for init task */
@@ -1692,7 +1692,7 @@ static int parse_cgroup_root_flags(char *data, unsigned int *root_flags)
*root_flags = 0;
- if (!data)
+ if (!data || *data == '\0')
return 0;
while ((token = strsep(&data, ",")) != NULL) {
@@ -1942,7 +1942,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
struct cgroup_namespace *ns)
{
struct dentry *dentry;
- bool new_sb;
+ bool new_sb = false;
dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb);
@@ -1952,6 +1952,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
*/
if (!IS_ERR(dentry) && ns != &init_cgroup_ns) {
struct dentry *nsdentry;
+ struct super_block *sb = dentry->d_sb;
struct cgroup *cgrp;
mutex_lock(&cgroup_mutex);
@@ -1962,12 +1963,14 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags,
spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
- nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb);
+ nsdentry = kernfs_node_dentry(cgrp->kn, sb);
dput(dentry);
+ if (IS_ERR(nsdentry))
+ deactivate_locked_super(sb);
dentry = nsdentry;
}
- if (IS_ERR(dentry) || !new_sb)
+ if (!new_sb)
cgroup_put(&root->cgrp);
return dentry;
@@ -5109,7 +5112,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early)
have_fork_callback |= (bool)ss->fork << ss->id;
have_exit_callback |= (bool)ss->exit << ss->id;
- have_free_callback |= (bool)ss->free << ss->id;
+ have_release_callback |= (bool)ss->release << ss->id;
have_canfork_callback |= (bool)ss->can_fork << ss->id;
/* At system boot, before all subsystems have been
@@ -5543,16 +5546,19 @@ void cgroup_exit(struct task_struct *tsk)
} while_each_subsys_mask();
}
-void cgroup_free(struct task_struct *task)
+void cgroup_release(struct task_struct *task)
{
- struct css_set *cset = task_css_set(task);
struct cgroup_subsys *ss;
int ssid;
- do_each_subsys_mask(ss, ssid, have_free_callback) {
- ss->free(task);
+ do_each_subsys_mask(ss, ssid, have_release_callback) {
+ ss->release(task);
} while_each_subsys_mask();
+}
+void cgroup_free(struct task_struct *task)
+{
+ struct css_set *cset = task_css_set(task);
put_css_set(cset);
}
diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c
index 9829c67ebc0a..c9960baaa14f 100644
--- a/kernel/cgroup/pids.c
+++ b/kernel/cgroup/pids.c
@@ -247,7 +247,7 @@ static void pids_cancel_fork(struct task_struct *task)
pids_uncharge(pids, 1);
}
-static void pids_free(struct task_struct *task)
+static void pids_release(struct task_struct *task)
{
struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id));
@@ -342,7 +342,7 @@ struct cgroup_subsys pids_cgrp_subsys = {
.cancel_attach = pids_cancel_attach,
.can_fork = pids_can_fork,
.cancel_fork = pids_cancel_fork,
- .free = pids_free,
+ .release = pids_release,
.legacy_cftypes = pids_files,
.dfl_cftypes = pids_files,
.threaded = true,
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 5c907d96e3dd..8c350dd81581 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -314,6 +314,15 @@ void cpus_write_unlock(void)
void lockdep_assert_cpus_held(void)
{
+ /*
+ * We can't have hotplug operations before userspace starts running,
+ * and some init codepaths will knowingly not take the hotplug lock.
+ * This is all valid, so mute lockdep until it makes sense to report
+ * unheld locks.
+ */
+ if (system_state < SYSTEM_RUNNING)
+ return;
+
percpu_rwsem_assert_held(&cpu_hotplug_lock);
}
@@ -356,9 +365,6 @@ void __weak arch_smt_update(void) { }
#ifdef CONFIG_HOTPLUG_SMT
enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED;
-EXPORT_SYMBOL_GPL(cpu_smt_control);
-
-static bool cpu_smt_available __read_mostly;
void __init cpu_smt_disable(bool force)
{
@@ -376,25 +382,11 @@ void __init cpu_smt_disable(bool force)
/*
* The decision whether SMT is supported can only be done after the full
- * CPU identification. Called from architecture code before non boot CPUs
- * are brought up.
- */
-void __init cpu_smt_check_topology_early(void)
-{
- if (!topology_smt_supported())
- cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
-}
-
-/*
- * If SMT was disabled by BIOS, detect it here, after the CPUs have been
- * brought online. This ensures the smt/l1tf sysfs entries are consistent
- * with reality. cpu_smt_available is set to true during the bringup of non
- * boot CPUs when a SMT sibling is detected. Note, this may overwrite
- * cpu_smt_control's previous setting.
+ * CPU identification. Called from architecture code.
*/
void __init cpu_smt_check_topology(void)
{
- if (!cpu_smt_available)
+ if (!topology_smt_supported())
cpu_smt_control = CPU_SMT_NOT_SUPPORTED;
}
@@ -407,18 +399,10 @@ early_param("nosmt", smt_cmdline_disable);
static inline bool cpu_smt_allowed(unsigned int cpu)
{
- if (topology_is_primary_thread(cpu))
+ if (cpu_smt_control == CPU_SMT_ENABLED)
return true;
- /*
- * If the CPU is not a 'primary' thread and the booted_once bit is
- * set then the processor has SMT support. Store this information
- * for the late check of SMT support in cpu_smt_check_topology().
- */
- if (per_cpu(cpuhp_state, cpu).booted_once)
- cpu_smt_available = true;
-
- if (cpu_smt_control == CPU_SMT_ENABLED)
+ if (topology_is_primary_thread(cpu))
return true;
/*
@@ -563,6 +547,20 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st)
}
}
+static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st)
+{
+ if (IS_ENABLED(CONFIG_HOTPLUG_CPU))
+ return true;
+ /*
+ * When CPU hotplug is disabled, then taking the CPU down is not
+ * possible because takedown_cpu() and the architecture and
+ * subsystem specific mechanisms are not available. So the CPU
+ * which would be completely unplugged again needs to stay around
+ * in the current state.
+ */
+ return st->state <= CPUHP_BRINGUP_CPU;
+}
+
static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
enum cpuhp_state target)
{
@@ -573,8 +571,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
st->state++;
ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL);
if (ret) {
- st->target = prev_state;
- undo_cpu_up(cpu, st);
+ if (can_rollback_cpu(st)) {
+ st->target = prev_state;
+ undo_cpu_up(cpu, st);
+ }
break;
}
}
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index 65c0f1363788..94aa9ae0007a 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -535,6 +535,8 @@ return_normal:
arch_kgdb_ops.correct_hw_break();
if (trace_on)
tracing_on();
+ kgdb_info[cpu].debuggerinfo = NULL;
+ kgdb_info[cpu].task = NULL;
kgdb_info[cpu].exception_state &=
~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
kgdb_info[cpu].enter_kgdb--;
@@ -667,6 +669,8 @@ kgdb_restore:
if (trace_on)
tracing_on();
+ kgdb_info[cpu].debuggerinfo = NULL;
+ kgdb_info[cpu].task = NULL;
kgdb_info[cpu].exception_state &=
~(DCPU_WANT_MASTER | DCPU_IS_SLAVE);
kgdb_info[cpu].enter_kgdb--;
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
index 7921ae4fca8d..7e2379aa0a1e 100644
--- a/kernel/debug/kdb/kdb_bt.c
+++ b/kernel/debug/kdb/kdb_bt.c
@@ -186,7 +186,16 @@ kdb_bt(int argc, const char **argv)
kdb_printf("btc: cpu status: ");
kdb_parse("cpu\n");
for_each_online_cpu(cpu) {
- sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu));
+ void *kdb_tsk = KDB_TSK(cpu);
+
+ /* If a CPU failed to round up we could be here */
+ if (!kdb_tsk) {
+ kdb_printf("WARNING: no task for cpu %ld\n",
+ cpu);
+ continue;
+ }
+
+ sprintf(buf, "btt 0x%px\n", kdb_tsk);
kdb_parse(buf);
touch_nmi_watchdog();
}
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
index 15e1a7af5dd0..53a0df6e4d92 100644
--- a/kernel/debug/kdb/kdb_debugger.c
+++ b/kernel/debug/kdb/kdb_debugger.c
@@ -118,13 +118,6 @@ int kdb_stub(struct kgdb_state *ks)
kdb_bp_remove();
KDB_STATE_CLEAR(DOING_SS);
KDB_STATE_SET(PAGER);
- /* zero out any offline cpu data */
- for_each_present_cpu(i) {
- if (!cpu_online(i)) {
- kgdb_info[i].debuggerinfo = NULL;
- kgdb_info[i].task = NULL;
- }
- }
if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) {
ks->pass_exception = 1;
KDB_FLAG_SET(CATASTROPHIC);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 991af683ef9e..580616e6fcee 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -436,18 +436,18 @@ int perf_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
-
- if (ret || !write)
- return ret;
-
+ int ret;
+ int perf_cpu = sysctl_perf_cpu_time_max_percent;
/*
* If throttling is disabled don't allow the write:
*/
- if (sysctl_perf_cpu_time_max_percent == 100 ||
- sysctl_perf_cpu_time_max_percent == 0)
+ if (write && (perf_cpu == 100 || perf_cpu == 0))
return -EINVAL;
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ return ret;
+
max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
update_perf_cpu_limits();
@@ -4738,6 +4738,11 @@ static void __perf_event_period(struct perf_event *event,
}
}
+static int perf_event_check_period(struct perf_event *event, u64 value)
+{
+ return event->pmu->check_period(event, value);
+}
+
static int perf_event_period(struct perf_event *event, u64 __user *arg)
{
u64 value;
@@ -4754,6 +4759,9 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
if (event->attr.freq && value > sysctl_perf_event_sample_rate)
return -EINVAL;
+ if (perf_event_check_period(event, value))
+ return -EINVAL;
+
event_function_call(event, __perf_event_period, &value);
return 0;
@@ -6915,6 +6923,7 @@ static void perf_event_mmap_output(struct perf_event *event,
struct perf_output_handle handle;
struct perf_sample_data sample;
int size = mmap_event->event_id.header.size;
+ u32 type = mmap_event->event_id.header.type;
int ret;
if (!perf_event_mmap_match(event, data))
@@ -6958,6 +6967,7 @@ static void perf_event_mmap_output(struct perf_event *event,
perf_output_end(&handle);
out:
mmap_event->event_id.header.size = size;
+ mmap_event->event_id.header.type = type;
}
static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
@@ -8951,6 +8961,11 @@ static int perf_pmu_nop_int(struct pmu *pmu)
return 0;
}
+static int perf_event_nop_int(struct perf_event *event, u64 value)
+{
+ return 0;
+}
+
static DEFINE_PER_CPU(unsigned int, nop_txn_flags);
static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags)
@@ -9251,6 +9266,9 @@ got_cpu_context:
pmu->pmu_disable = perf_pmu_nop_void;
}
+ if (!pmu->check_period)
+ pmu->check_period = perf_event_nop_int;
+
if (!pmu->event_idx)
pmu->event_idx = perf_event_idx_default;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index c573c7339223..489dc6b60053 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -719,6 +719,9 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags)
size = sizeof(struct ring_buffer);
size += nr_pages * sizeof(void *);
+ if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER)
+ goto fail;
+
rb = kzalloc(size, GFP_KERNEL);
if (!rb)
goto fail;
diff --git a/kernel/exit.c b/kernel/exit.c
index 3aa01b74c1e3..95ce231ff5e2 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -218,6 +218,7 @@ repeat:
}
write_unlock_irq(&tasklist_lock);
+ cgroup_release(p);
release_thread(p);
call_rcu(&p->rcu, delayed_put_task_struct);
@@ -306,7 +307,7 @@ void rcuwait_wake_up(struct rcuwait *w)
* MB (A) MB (B)
* [L] cond [L] tsk
*/
- smp_rmb(); /* (B) */
+ smp_mb(); /* (B) */
/*
* Avoid using task_rcu_dereference() magic as long as we are careful,
diff --git a/kernel/futex.c b/kernel/futex.c
index 046cd780d057..f2fa48c6c476 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1166,11 +1166,65 @@ out_error:
return ret;
}
+static int handle_exit_race(u32 __user *uaddr, u32 uval,
+ struct task_struct *tsk)
+{
+ u32 uval2;
+
+ /*
+ * If PF_EXITPIDONE is not yet set, then try again.
+ */
+ if (tsk && !(tsk->flags & PF_EXITPIDONE))
+ return -EAGAIN;
+
+ /*
+ * Reread the user space value to handle the following situation:
+ *
+ * CPU0 CPU1
+ *
+ * sys_exit() sys_futex()
+ * do_exit() futex_lock_pi()
+ * futex_lock_pi_atomic()
+ * exit_signals(tsk) No waiters:
+ * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID
+ * mm_release(tsk) Set waiter bit
+ * exit_robust_list(tsk) { *uaddr = 0x80000PID;
+ * Set owner died attach_to_pi_owner() {
+ * *uaddr = 0xC0000000; tsk = get_task(PID);
+ * } if (!tsk->flags & PF_EXITING) {
+ * ... attach();
+ * tsk->flags |= PF_EXITPIDONE; } else {
+ * if (!(tsk->flags & PF_EXITPIDONE))
+ * return -EAGAIN;
+ * return -ESRCH; <--- FAIL
+ * }
+ *
+ * Returning ESRCH unconditionally is wrong here because the
+ * user space value has been changed by the exiting task.
+ *
+ * The same logic applies to the case where the exiting task is
+ * already gone.
+ */
+ if (get_futex_value_locked(&uval2, uaddr))
+ return -EFAULT;
+
+ /* If the user space value has changed, try again. */
+ if (uval2 != uval)
+ return -EAGAIN;
+
+ /*
+ * The exiting task did not have a robust list, the robust list was
+ * corrupted or the user space value in *uaddr is simply bogus.
+ * Give up and tell user space.
+ */
+ return -ESRCH;
+}
+
/*
* Lookup the task for the TID provided from user space and attach to
* it after doing proper sanity checks.
*/
-static int attach_to_pi_owner(u32 uval, union futex_key *key,
+static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key,
struct futex_pi_state **ps)
{
pid_t pid = uval & FUTEX_TID_MASK;
@@ -1180,12 +1234,15 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
/*
* We are the first waiter - try to look up the real owner and attach
* the new pi_state to it, but bail out when TID = 0 [1]
+ *
+ * The !pid check is paranoid. None of the call sites should end up
+ * with pid == 0, but better safe than sorry. Let the caller retry
*/
if (!pid)
- return -ESRCH;
+ return -EAGAIN;
p = futex_find_get_task(pid);
if (!p)
- return -ESRCH;
+ return handle_exit_race(uaddr, uval, NULL);
if (unlikely(p->flags & PF_KTHREAD)) {
put_task_struct(p);
@@ -1205,7 +1262,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
* set, we know that the task has finished the
* cleanup:
*/
- int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
+ int ret = handle_exit_race(uaddr, uval, p);
raw_spin_unlock_irq(&p->pi_lock);
put_task_struct(p);
@@ -1262,7 +1319,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval,
* We are the first waiter - try to look up the owner based on
* @uval and attach to it.
*/
- return attach_to_pi_owner(uval, key, ps);
+ return attach_to_pi_owner(uaddr, uval, key, ps);
}
static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
@@ -1370,7 +1427,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
* attach to the owner. If that fails, no harm done, we only
* set the FUTEX_WAITERS bit in the user space variable.
*/
- return attach_to_pi_owner(uval, key, ps);
+ return attach_to_pi_owner(uaddr, newval, key, ps);
}
/**
@@ -1405,11 +1462,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
return;
- /*
- * Queue the task for later wakeup for after we've released
- * the hb->lock. wake_q_add() grabs reference to p.
- */
- wake_q_add(wake_q, p);
+ get_task_struct(p);
__unqueue_futex(q);
/*
* The waiting task can free the futex_q as soon as q->lock_ptr = NULL
@@ -1419,6 +1472,13 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
* plist_del in __unqueue_futex().
*/
smp_store_release(&q->lock_ptr, NULL);
+
+ /*
+ * Queue the task for later wakeup for after we've released
+ * the hb->lock. wake_q_add() grabs reference to p.
+ */
+ wake_q_add(wake_q, p);
+ put_task_struct(p);
}
/*
@@ -2811,35 +2871,39 @@ retry_private:
* and BUG when futex_unlock_pi() interleaves with this.
*
* Therefore acquire wait_lock while holding hb->lock, but drop the
- * latter before calling rt_mutex_start_proxy_lock(). This still fully
- * serializes against futex_unlock_pi() as that does the exact same
- * lock handoff sequence.
+ * latter before calling __rt_mutex_start_proxy_lock(). This
+ * interleaves with futex_unlock_pi() -- which does a similar lock
+ * handoff -- such that the latter can observe the futex_q::pi_state
+ * before __rt_mutex_start_proxy_lock() is done.
*/
raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
spin_unlock(q.lock_ptr);
+ /*
+ * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter
+ * such that futex_unlock_pi() is guaranteed to observe the waiter when
+ * it sees the futex_q::pi_state.
+ */
ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
if (ret) {
if (ret == 1)
ret = 0;
-
- spin_lock(q.lock_ptr);
- goto no_block;
+ goto cleanup;
}
-
if (unlikely(to))
hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+cleanup:
spin_lock(q.lock_ptr);
/*
- * If we failed to acquire the lock (signal/timeout), we must
+ * If we failed to acquire the lock (deadlock/signal/timeout), we must
* first acquire the hb->lock before removing the lock from the
- * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
- * wait lists consistent.
+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait
+ * lists consistent.
*
* In particular; it is important that futex_unlock_pi() can not
* observe this inconsistency.
@@ -2963,6 +3027,10 @@ retry:
* there is no point where we hold neither; and therefore
* wake_futex_pi() must observe a state consistent with what we
* observed.
+ *
+ * In particular; this forces __rt_mutex_start_proxy() to
+ * complete such that we're guaranteed to observe the
+ * rt_waiter. Also see the WARN in wake_futex_pi().
*/
raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(&hb->lock);
@@ -3382,6 +3450,10 @@ int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
{
u32 uval, uninitialized_var(nval), mval;
+ /* Futex address must be 32bit aligned */
+ if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0)
+ return -1;
+
retry:
if (get_user(uval, uaddr))
return -1;
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 32b479468e4d..2e4869fa66c9 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -15,6 +15,7 @@
#include <linux/lockdep.h>
#include <linux/export.h>
#include <linux/sysctl.h>
+#include <linux/suspend.h>
#include <linux/utsname.h>
#include <linux/sched/signal.h>
#include <linux/sched/debug.h>
@@ -33,7 +34,7 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
* is disabled during the critical section. It also controls the size of
* the RCU grace period. So it needs to be upper-bound.
*/
-#define HUNG_TASK_BATCHING 1024
+#define HUNG_TASK_LOCK_BREAK (HZ / 10)
/*
* Zero means infinite timeout - no checking done:
@@ -103,8 +104,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
trace_sched_process_hang(t);
- if (!sysctl_hung_task_warnings && !sysctl_hung_task_panic)
- return;
+ if (sysctl_hung_task_panic) {
+ console_verbose();
+ hung_task_show_lock = true;
+ hung_task_call_panic = true;
+ }
/*
* Ok, the task did not get scheduled for more than 2 minutes,
@@ -126,11 +130,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
}
touch_nmi_watchdog();
-
- if (sysctl_hung_task_panic) {
- hung_task_show_lock = true;
- hung_task_call_panic = true;
- }
}
/*
@@ -164,7 +163,7 @@ static bool rcu_lock_break(struct task_struct *g, struct task_struct *t)
static void check_hung_uninterruptible_tasks(unsigned long timeout)
{
int max_count = sysctl_hung_task_check_count;
- int batch_count = HUNG_TASK_BATCHING;
+ unsigned long last_break = jiffies;
struct task_struct *g, *t;
/*
@@ -179,10 +178,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
for_each_process_thread(g, t) {
if (!max_count--)
goto unlock;
- if (!--batch_count) {
- batch_count = HUNG_TASK_BATCHING;
+ if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) {
if (!rcu_lock_break(g, t))
goto unlock;
+ last_break = jiffies;
}
/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
if (t->state == TASK_UNINTERRUPTIBLE)
@@ -234,6 +233,28 @@ void reset_hung_task_detector(void)
}
EXPORT_SYMBOL_GPL(reset_hung_task_detector);
+static bool hung_detector_suspended;
+
+static int hungtask_pm_notify(struct notifier_block *self,
+ unsigned long action, void *hcpu)
+{
+ switch (action) {
+ case PM_SUSPEND_PREPARE:
+ case PM_HIBERNATION_PREPARE:
+ case PM_RESTORE_PREPARE:
+ hung_detector_suspended = true;
+ break;
+ case PM_POST_SUSPEND:
+ case PM_POST_HIBERNATION:
+ case PM_POST_RESTORE:
+ hung_detector_suspended = false;
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_OK;
+}
+
/*
* kthread which checks for tasks stuck in D state
*/
@@ -248,7 +269,8 @@ static int watchdog(void *dummy)
long t = hung_timeout_jiffies(hung_last_checked, timeout);
if (t <= 0) {
- if (!atomic_xchg(&reset_hung_task, 0))
+ if (!atomic_xchg(&reset_hung_task, 0) &&
+ !hung_detector_suspended)
check_hung_uninterruptible_tasks(timeout);
hung_last_checked = jiffies;
continue;
@@ -262,6 +284,10 @@ static int watchdog(void *dummy)
static int __init hung_task_init(void)
{
atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+
+ /* Disable hung task detector on suspend */
+ pm_notifier(hungtask_pm_notify, 0);
+
watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
return 0;
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 5a2ef92c2782..317fc759de76 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -834,7 +834,11 @@ void handle_percpu_irq(struct irq_desc *desc)
{
struct irq_chip *chip = irq_desc_get_chip(desc);
- kstat_incr_irqs_this_cpu(desc);
+ /*
+ * PER CPU interrupts are not serialized. Do not touch
+ * desc->tot_count.
+ */
+ __kstat_incr_irqs_this_cpu(desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
@@ -863,7 +867,11 @@ void handle_percpu_devid_irq(struct irq_desc *desc)
unsigned int irq = irq_desc_get_irq(desc);
irqreturn_t res;
- kstat_incr_irqs_this_cpu(desc);
+ /*
+ * PER CPU interrupts are not serialized. Do not touch
+ * desc->tot_count.
+ */
+ __kstat_incr_irqs_this_cpu(desc);
if (chip->irq_ack)
chip->irq_ack(&desc->irq_data);
@@ -1355,6 +1363,10 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info)
int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on)
{
data = data->parent_data;
+
+ if (data->chip->flags & IRQCHIP_SKIP_SET_WAKE)
+ return 0;
+
if (data->chip->irq_set_wake)
return data->chip->irq_set_wake(data, on);
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 44ed5f8c8759..4ef7f3b820ce 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -240,12 +240,18 @@ static inline void irq_state_set_masked(struct irq_desc *desc)
#undef __irqd_to_state
-static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
+static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc)
{
__this_cpu_inc(*desc->kstat_irqs);
__this_cpu_inc(kstat.irqs_sum);
}
+static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc)
+{
+ __kstat_incr_irqs_this_cpu(desc);
+ desc->tot_count++;
+}
+
static inline int irq_desc_get_node(struct irq_desc *desc)
{
return irq_common_data_get_node(&desc->irq_common_data);
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index e97bbae947f0..aa08d4184608 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -119,6 +119,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node,
desc->depth = 1;
desc->irq_count = 0;
desc->irqs_unhandled = 0;
+ desc->tot_count = 0;
desc->name = NULL;
desc->owner = owner;
for_each_possible_cpu(cpu)
@@ -534,6 +535,7 @@ int __init early_irq_init(void)
alloc_masks(&desc[i], node);
raw_spin_lock_init(&desc[i].lock);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
+ mutex_init(&desc[i].request_mutex);
desc_set_defaults(i, &desc[i], node, NULL, NULL);
}
return arch_early_irq_init();
@@ -895,11 +897,15 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
unsigned int kstat_irqs(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- int cpu;
unsigned int sum = 0;
+ int cpu;
if (!desc || !desc->kstat_irqs)
return 0;
+ if (!irq_settings_is_per_cpu_devid(desc) &&
+ !irq_settings_is_per_cpu(desc))
+ return desc->tot_count;
+
for_each_possible_cpu(cpu)
sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
return sum;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4cd85870f00e..6c877d28838f 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -360,6 +360,9 @@ int irq_setup_affinity(struct irq_desc *desc)
}
cpumask_and(&mask, cpu_online_mask, set);
+ if (cpumask_empty(&mask))
+ cpumask_copy(&mask, cpu_online_mask);
+
if (node != NUMA_NO_NODE) {
const struct cpumask *nodemask = cpumask_of_node(node);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 5cbad4fb9107..ec11bb986a8b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -703,7 +703,6 @@ static void unoptimize_kprobe(struct kprobe *p, bool force)
static int reuse_unused_kprobe(struct kprobe *ap)
{
struct optimized_kprobe *op;
- int ret;
BUG_ON(!kprobe_unused(ap));
/*
@@ -717,9 +716,8 @@ static int reuse_unused_kprobe(struct kprobe *ap)
/* Enable the probe again */
ap->flags &= ~KPROBE_FLAG_DISABLED;
/* Optimize it again (remove from op->list) */
- ret = kprobe_optready(ap);
- if (ret)
- return ret;
+ if (!kprobe_optready(ap))
+ return -EINVAL;
optimize_kprobe(ap);
return 0;
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 4ad35718f123..71c554a9e17f 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -1726,12 +1726,33 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
rt_mutex_set_owner(lock, NULL);
}
+/**
+ * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: does _NOT_ remove the @waiter on failure; must either call
+ * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this.
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for PI-futex support.
+ */
int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task)
{
int ret;
+ lockdep_assert_held(&lock->wait_lock);
+
if (try_to_take_rt_mutex(lock, task, NULL))
return 1;
@@ -1749,9 +1770,6 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
ret = 0;
}
- if (unlikely(ret))
- remove_waiter(lock, waiter);
-
debug_rt_mutex_print_deadlock(waiter);
return ret;
@@ -1763,12 +1781,18 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
* @waiter: the pre-initialized rt_mutex_waiter
* @task: the task to prepare
*
+ * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock
+ * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that.
+ *
+ * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter
+ * on failure.
+ *
* Returns:
* 0 - task blocked on lock
* 1 - acquired the lock for task, caller should wake it up
* <0 - error
*
- * Special API call for FUTEX_REQUEUE_PI support.
+ * Special API call for PI-futex support.
*/
int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
@@ -1778,6 +1802,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
raw_spin_lock_irq(&lock->wait_lock);
ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+ if (unlikely(ret))
+ remove_waiter(lock, waiter);
raw_spin_unlock_irq(&lock->wait_lock);
return ret;
@@ -1845,7 +1871,8 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
* @lock: the rt_mutex we were woken on
* @waiter: the pre-initialized rt_mutex_waiter
*
- * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
+ * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or
+ * rt_mutex_wait_proxy_lock().
*
* Unless we acquired the lock; we're still enqueued on the wait-list and can
* in fact still be granted ownership until we're removed. Therefore we can
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
index a90336779375..c75017326c37 100644
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -198,15 +198,22 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem,
woken++;
tsk = waiter->task;
- wake_q_add(wake_q, tsk);
+ get_task_struct(tsk);
list_del(&waiter->list);
/*
- * Ensure that the last operation is setting the reader
+ * Ensure calling get_task_struct() before setting the reader
* waiter to nil such that rwsem_down_read_failed() cannot
* race with do_exit() by always holding a reference count
* to the task to wakeup.
*/
smp_store_release(&waiter->task, NULL);
+ /*
+ * Ensure issuing the wakeup (either by us or someone else)
+ * after setting the reader waiter to nil.
+ */
+ wake_q_add(wake_q, tsk);
+ /* wake_q_add() already take the task ref */
+ put_task_struct(tsk);
}
adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment;
diff --git a/kernel/module.c b/kernel/module.c
index 2a44c515f0d7..94528b891027 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1201,8 +1201,10 @@ static ssize_t store_uevent(struct module_attribute *mattr,
struct module_kobject *mk,
const char *buffer, size_t count)
{
- kobject_synth_uevent(&mk->kobj, buffer, count);
- return count;
+ int rc;
+
+ rc = kobject_synth_uevent(&mk->kobj, buffer, count);
+ return rc ? rc : count;
}
struct module_attribute module_uevent =
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 84b1367935e4..f1c85b6c39ae 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -29,6 +29,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/cn_proc.h>
#include <linux/compat.h>
+#include <linux/sched/signal.h>
/*
* Access another process' address space via ptrace.
@@ -925,18 +926,26 @@ int ptrace_request(struct task_struct *child, long request,
ret = ptrace_setsiginfo(child, &siginfo);
break;
- case PTRACE_GETSIGMASK:
+ case PTRACE_GETSIGMASK: {
+ sigset_t *mask;
+
if (addr != sizeof(sigset_t)) {
ret = -EINVAL;
break;
}
- if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t)))
+ if (test_tsk_restore_sigmask(child))
+ mask = &child->saved_sigmask;
+ else
+ mask = &child->blocked;
+
+ if (copy_to_user(datavp, mask, sizeof(sigset_t)))
ret = -EFAULT;
else
ret = 0;
break;
+ }
case PTRACE_SETSIGMASK: {
sigset_t new_set;
@@ -962,6 +971,8 @@ int ptrace_request(struct task_struct *child, long request,
child->blocked = new_set;
spin_unlock_irq(&child->sighand->siglock);
+ clear_tsk_restore_sigmask(child);
+
ret = 0;
break;
}
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 710ce1d6b982..fb051fa99b67 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1789,15 +1789,23 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp)
}
/*
- * Awaken the grace-period kthread for the specified flavor of RCU.
- * Don't do a self-awaken, and don't bother awakening when there is
- * nothing for the grace-period kthread to do (as in several CPUs
- * raced to awaken, and we lost), and finally don't try to awaken
- * a kthread that has not yet been created.
+ * Awaken the grace-period kthread. Don't do a self-awaken (unless in
+ * an interrupt or softirq handler), and don't bother awakening when there
+ * is nothing for the grace-period kthread to do (as in several CPUs raced
+ * to awaken, and we lost), and finally don't try to awaken a kthread that
+ * has not yet been created. If all those checks are passed, track some
+ * debug information and awaken.
+ *
+ * So why do the self-wakeup when in an interrupt or softirq handler
+ * in the grace-period kthread's context? Because the kthread might have
+ * been interrupted just as it was going to sleep, and just after the final
+ * pre-sleep check of the awaken condition. In this case, a wakeup really
+ * is required, and is therefore supplied.
*/
static void rcu_gp_kthread_wake(struct rcu_state *rsp)
{
- if (current == rsp->gp_kthread ||
+ if ((current == rsp->gp_kthread &&
+ !in_interrupt() && !in_serving_softirq()) ||
!READ_ONCE(rsp->gp_flags) ||
!rsp->gp_kthread)
return;
diff --git a/kernel/relay.c b/kernel/relay.c
index 1537158c67b3..61d37e6da22d 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -427,6 +427,8 @@ static struct dentry *relay_create_buf_file(struct rchan *chan,
dentry = chan->cb->create_buf_file(tmpname, chan->parent,
S_IRUSR, buf,
&chan->is_global);
+ if (IS_ERR(dentry))
+ dentry = NULL;
kfree(tmpname);
@@ -460,7 +462,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
dentry = chan->cb->create_buf_file(NULL, NULL,
S_IRUSR, buf,
&chan->is_global);
- if (WARN_ON(dentry))
+ if (IS_ERR_OR_NULL(dentry))
goto free_buf;
}
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 6d689c039189..d0d39f714f47 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -605,10 +605,9 @@ fail:
stop_kthread:
sugov_kthread_stop(sg_policy);
-
-free_sg_policy:
mutex_unlock(&global_tunables_lock);
+free_sg_policy:
sugov_policy_free(sg_policy);
disable_fast_switch:
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b2589c7e9439..22770168bff8 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -217,7 +217,6 @@ static void task_non_contending(struct task_struct *p)
if (dl_se->dl_runtime == 0)
return;
- WARN_ON(hrtimer_active(&dl_se->inactive_timer));
WARN_ON(dl_se->dl_non_contending);
zerolag_time = dl_se->deadline -
@@ -234,7 +233,7 @@ static void task_non_contending(struct task_struct *p)
* If the "0-lag time" already passed, decrease the active
* utilization now, instead of starting a timer
*/
- if (zerolag_time < 0) {
+ if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) {
if (dl_task(p))
sub_running_bw(dl_se->dl_bw, dl_rq);
if (!dl_task(p) || p->state == TASK_DEAD) {
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 2f93e4a2d9f6..187c04a34ba1 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -339,6 +339,7 @@ void register_sched_domain_sysctl(void)
{
static struct ctl_table *cpu_entries;
static struct ctl_table **cpu_idx;
+ static bool init_done = false;
char buf[32];
int i;
@@ -368,7 +369,10 @@ void register_sched_domain_sysctl(void)
if (!cpumask_available(sd_sysctl_cpus)) {
if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
return;
+ }
+ if (!init_done) {
+ init_done = true;
/* init to possible to not have holes in @cpu_entries */
cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f33b24080b1c..af7de1f9906c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -2026,6 +2026,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
if (p->last_task_numa_placement) {
delta = runtime - p->last_sum_exec_runtime;
*period = now - p->last_task_numa_placement;
+
+ /* Avoid time going backwards, prevent potential divide error: */
+ if (unlikely((s64)*period < 0))
+ *period = 0;
} else {
delta = p->se.avg.load_sum / p->se.load.weight;
*period = LOAD_AVG_MAX;
@@ -4672,12 +4676,15 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
return HRTIMER_NORESTART;
}
+extern const u64 max_cfs_quota_period;
+
static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
{
struct cfs_bandwidth *cfs_b =
container_of(timer, struct cfs_bandwidth, period_timer);
int overrun;
int idle = 0;
+ int count = 0;
raw_spin_lock(&cfs_b->lock);
for (;;) {
@@ -4685,6 +4692,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (!overrun)
break;
+ if (++count > 3) {
+ u64 new, old = ktime_to_ns(cfs_b->period);
+
+ new = (old * 147) / 128; /* ~115% */
+ new = min(new, max_cfs_quota_period);
+
+ cfs_b->period = ns_to_ktime(new);
+
+ /* since max is 1s, this is limited to 1e9^2, which fits in u64 */
+ cfs_b->quota *= new;
+ cfs_b->quota = div64_u64(cfs_b->quota, old);
+
+ pr_warn_ratelimited(
+ "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n",
+ smp_processor_id(),
+ div_u64(new, NSEC_PER_USEC),
+ div_u64(cfs_b->quota, NSEC_PER_USEC));
+
+ /* reset count so we don't come right back in here */
+ count = 0;
+ }
+
idle = do_sched_cfs_period_timer(cfs_b, overrun);
}
if (idle)
@@ -5651,6 +5680,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
#ifdef CONFIG_SCHED_SMT
DEFINE_STATIC_KEY_FALSE(sched_smt_present);
+EXPORT_SYMBOL_GPL(sched_smt_present);
static inline void set_idle_cores(int cpu, int val)
{
@@ -7017,10 +7047,10 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
if (cfs_rq->last_h_load_update == now)
return;
- cfs_rq->h_load_next = NULL;
+ WRITE_ONCE(cfs_rq->h_load_next, NULL);
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- cfs_rq->h_load_next = se;
+ WRITE_ONCE(cfs_rq->h_load_next, se);
if (cfs_rq->last_h_load_update == now)
break;
}
@@ -7030,7 +7060,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
cfs_rq->last_h_load_update = now;
}
- while ((se = cfs_rq->h_load_next) != NULL) {
+ while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
load = cfs_rq->h_load;
load = div64_ul(load * se->avg.load_avg,
cfs_rq_load_avg(cfs_rq) + 1);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 659e075ef70b..9dcd80ed9d4c 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -499,7 +499,7 @@ static int __init isolated_cpu_setup(char *str)
__setup("isolcpus=", isolated_cpu_setup);
struct s_data {
- struct sched_domain ** __percpu sd;
+ struct sched_domain * __percpu *sd;
struct root_domain *rd;
};
diff --git a/kernel/signal.c b/kernel/signal.c
index 164c36ef0825..619c6160f64f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -672,6 +672,48 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state)
kick_process(t);
}
+static int dequeue_synchronous_signal(siginfo_t *info)
+{
+ struct task_struct *tsk = current;
+ struct sigpending *pending = &tsk->pending;
+ struct sigqueue *q, *sync = NULL;
+
+ /*
+ * Might a synchronous signal be in the queue?
+ */
+ if (!((pending->signal.sig[0] & ~tsk->blocked.sig[0]) & SYNCHRONOUS_MASK))
+ return 0;
+
+ /*
+ * Return the first synchronous signal in the queue.
+ */
+ list_for_each_entry(q, &pending->list, list) {
+ /* Synchronous signals have a postive si_code */
+ if ((q->info.si_code > SI_USER) &&
+ (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) {
+ sync = q;
+ goto next;
+ }
+ }
+ return 0;
+next:
+ /*
+ * Check if there is another siginfo for the same signal.
+ */
+ list_for_each_entry_continue(q, &pending->list, list) {
+ if (q->info.si_signo == sync->info.si_signo)
+ goto still_pending;
+ }
+
+ sigdelset(&pending->signal, sync->info.si_signo);
+ recalc_sigpending();
+still_pending:
+ list_del_init(&sync->list);
+ copy_siginfo(info, &sync->info);
+ __sigqueue_free(sync);
+ return info->si_signo;
+}
+
/*
* Remove signals in mask from the pending set and queue.
* Returns 1 if any signals were found.
@@ -2225,6 +2267,14 @@ relock:
goto relock;
}
+ /* Has this task already been marked for death? */
+ if (signal_group_exit(signal)) {
+ ksig->info.si_signo = signr = SIGKILL;
+ sigdelset(&current->pending.signal, SIGKILL);
+ recalc_sigpending();
+ goto fatal;
+ }
+
for (;;) {
struct k_sigaction *ka;
@@ -2238,7 +2288,15 @@ relock:
goto relock;
}
- signr = dequeue_signal(current, &current->blocked, &ksig->info);
+ /*
+ * Signals generated by the execution of an instruction
+ * need to be delivered before any other pending signals
+ * so that the instruction pointer in the signal stack
+ * frame points to the faulting instruction.
+ */
+ signr = dequeue_synchronous_signal(&ksig->info);
+ if (!signr)
+ signr = dequeue_signal(current, &current->blocked, &ksig->info);
if (!signr)
break; /* will return 0 */
@@ -2320,6 +2378,7 @@ relock:
continue;
}
+ fatal:
spin_unlock_irq(&sighand->siglock);
/*
diff --git a/kernel/smp.c b/kernel/smp.c
index 2d1da290f144..c94dd85c8d41 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -584,8 +584,6 @@ void __init smp_init(void)
num_nodes, (num_nodes > 1 ? "s" : ""),
num_cpus, (num_cpus > 1 ? "s" : ""));
- /* Final decision about SMT support */
- cpu_smt_check_topology();
/* Any cleanup work */
smp_cpus_done(setup_max_cpus);
}
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d330b1ce3b94..f13601a616ad 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -124,7 +124,9 @@ static int zero;
static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
static int __maybe_unused four = 4;
+static unsigned long zero_ul;
static unsigned long one_ul = 1;
+static unsigned long long_max = LONG_MAX;
static int one_hundred = 100;
static int one_thousand = 1000;
#ifdef CONFIG_PRINTK
@@ -1681,6 +1683,8 @@ static struct ctl_table fs_table[] = {
.maxlen = sizeof(files_stat.max_files),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
+ .extra1 = &zero_ul,
+ .extra2 = &long_max,
},
{
.procname = "nr_open",
@@ -2530,7 +2534,16 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp,
{
struct do_proc_dointvec_minmax_conv_param *param = data;
if (write) {
- int val = *negp ? -*lvalp : *lvalp;
+ int val;
+ if (*negp) {
+ if (*lvalp > (unsigned long) INT_MAX + 1)
+ return -EINVAL;
+ val = -*lvalp;
+ } else {
+ if (*lvalp > (unsigned long) INT_MAX)
+ return -EINVAL;
+ val = *lvalp;
+ }
if ((param->min && *param->min > val) ||
(param->max && *param->max < val))
return -EINVAL;
@@ -2708,6 +2721,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int
bool neg;
left -= proc_skip_spaces(&p);
+ if (!left)
+ break;
err = proc_get_long(&p, &left, &val, &neg,
proc_wspace_sep,
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index fa5de5e8de61..fdeb9bc6affb 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -597,7 +597,7 @@ static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now)
{
struct alarm *alarm = &timr->it.alarm.alarmtimer;
- return ktime_sub(now, alarm->node.expires);
+ return ktime_sub(alarm->node.expires, now);
}
/**
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 2cafb49aa65e..1ce7c404d0b0 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -41,7 +41,9 @@
static struct {
seqcount_t seq;
struct timekeeper timekeeper;
-} tk_core ____cacheline_aligned;
+} tk_core ____cacheline_aligned = {
+ .seq = SEQCNT_ZERO(tk_core.seq),
+};
static DEFINE_RAW_SPINLOCK(timekeeper_lock);
static struct timekeeper shadow_timekeeper;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9937d7cf2a64..3e92852c8b23 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -33,6 +33,7 @@
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/rcupdate.h>
+#include <linux/kprobes.h>
#include <trace/events/sched.h>
@@ -6035,7 +6036,7 @@ void ftrace_reset_array_ops(struct trace_array *tr)
tr->ops->func = ftrace_stub;
}
-static inline void
+static nokprobe_inline void
__ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *ignored, struct pt_regs *regs)
{
@@ -6098,11 +6099,13 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip,
{
__ftrace_ops_list_func(ip, parent_ip, NULL, regs);
}
+NOKPROBE_SYMBOL(ftrace_ops_list_func);
#else
static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip)
{
__ftrace_ops_list_func(ip, parent_ip, NULL, NULL);
}
+NOKPROBE_SYMBOL(ftrace_ops_no_ops);
#endif
/*
@@ -6132,6 +6135,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip,
preempt_enable_notrace();
trace_clear_recursion(bit);
}
+NOKPROBE_SYMBOL(ftrace_ops_assist_func);
/**
* ftrace_ops_get_func - get the function a trampoline should call
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a1d5e0949dcf..8123a8b53c54 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -700,7 +700,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
preempt_disable_notrace();
time = rb_time_stamp(buffer);
- preempt_enable_no_resched_notrace();
+ preempt_enable_notrace();
return time;
}
@@ -4010,6 +4010,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
* ring_buffer_read_prepare - Prepare for a non consuming read of the buffer
* @buffer: The ring buffer to read from
* @cpu: The cpu buffer to iterate over
+ * @flags: gfp flags to use for memory allocation
*
* This performs the initial preparations necessary to iterate
* through the buffer. Memory is allocated, buffer recording
@@ -4027,7 +4028,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume);
* This overall must be paired with ring_buffer_read_finish.
*/
struct ring_buffer_iter *
-ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
+ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu, gfp_t flags)
{
struct ring_buffer_per_cpu *cpu_buffer;
struct ring_buffer_iter *iter;
@@ -4035,7 +4036,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu)
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return NULL;
- iter = kmalloc(sizeof(*iter), GFP_KERNEL);
+ iter = kmalloc(sizeof(*iter), flags);
if (!iter)
return NULL;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e9cbb96cd99e..ace0e8f6f2b4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -494,8 +494,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
* not modified.
*/
pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL);
- if (!pid_list)
+ if (!pid_list) {
+ trace_parser_put(&parser);
return -ENOMEM;
+ }
pid_list->pid_max = READ_ONCE(pid_max);
@@ -505,6 +507,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids,
pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3);
if (!pid_list->pids) {
+ trace_parser_put(&parser);
kfree(pid_list);
return -ENOMEM;
}
@@ -3381,6 +3384,8 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file
const char tgid_space[] = " ";
const char space[] = " ";
+ print_event_info(buf, m);
+
seq_printf(m, "# %s _-----=> irqs-off\n",
tgid ? tgid_space : space);
seq_printf(m, "# %s / _----=> need-resched\n",
@@ -3899,7 +3904,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
+ ring_buffer_read_prepare(iter->trace_buffer->buffer,
+ cpu, GFP_KERNEL);
}
ring_buffer_read_prepare_sync();
for_each_tracing_cpu(cpu) {
@@ -3909,7 +3915,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
} else {
cpu = iter->cpu_file;
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
+ ring_buffer_read_prepare(iter->trace_buffer->buffer,
+ cpu, GFP_KERNEL);
ring_buffer_read_prepare_sync();
ring_buffer_read_start(iter->buffer_iter[cpu]);
tracing_iter_reset(iter, cpu);
@@ -5602,7 +5609,6 @@ out:
return ret;
fail:
- kfree(iter->trace);
kfree(iter);
__trace_array_put(tr);
mutex_unlock(&trace_types_lock);
@@ -6713,28 +6719,36 @@ struct buffer_ref {
struct ring_buffer *buffer;
void *page;
int cpu;
- int ref;
+ refcount_t refcount;
};
+static void buffer_ref_release(struct buffer_ref *ref)
+{
+ if (!refcount_dec_and_test(&ref->refcount))
+ return;
+ ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
+ kfree(ref);
+}
+
static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct buffer_ref *ref = (struct buffer_ref *)buf->private;
- if (--ref->ref)
- return;
-
- ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
- kfree(ref);
+ buffer_ref_release(ref);
buf->private = 0;
}
-static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
+static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe,
struct pipe_buffer *buf)
{
struct buffer_ref *ref = (struct buffer_ref *)buf->private;
- ref->ref++;
+ if (refcount_read(&ref->refcount) > INT_MAX/2)
+ return false;
+
+ refcount_inc(&ref->refcount);
+ return true;
}
/* Pipe buffer operations for a buffer. */
@@ -6742,7 +6756,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = {
.can_merge = 0,
.confirm = generic_pipe_buf_confirm,
.release = buffer_pipe_buf_release,
- .steal = generic_pipe_buf_steal,
+ .steal = generic_pipe_buf_nosteal,
.get = buffer_pipe_buf_get,
};
@@ -6755,11 +6769,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
struct buffer_ref *ref =
(struct buffer_ref *)spd->partial[i].private;
- if (--ref->ref)
- return;
-
- ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page);
- kfree(ref);
+ buffer_ref_release(ref);
spd->partial[i].private = 0;
}
@@ -6814,7 +6824,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
break;
}
- ref->ref = 1;
+ refcount_set(&ref->refcount, 1);
ref->buffer = iter->trace_buffer->buffer;
ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
if (IS_ERR(ref->page)) {
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 7eb975a2d0e1..e8c9eba9b1e7 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -872,9 +872,10 @@ static inline void add_to_key(char *compound_key, void *key,
/* ensure NULL-termination */
if (size > key_field->size - 1)
size = key_field->size - 1;
- }
- memcpy(compound_key + key_field->offset, key, size);
+ strncpy(compound_key + key_field->offset, (char *)key, size);
+ } else
+ memcpy(compound_key + key_field->offset, key, size);
}
static void event_hist_trigger(struct event_trigger_data *data, void *rec)
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index d953c163a079..810d78a8d14c 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -51,14 +51,16 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
if (cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter.buffer_iter[cpu] =
- ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu);
+ ring_buffer_read_prepare(iter.trace_buffer->buffer,
+ cpu, GFP_ATOMIC);
ring_buffer_read_start(iter.buffer_iter[cpu]);
tracing_iter_reset(&iter, cpu);
}
} else {
iter.cpu_file = cpu_file;
iter.buffer_iter[cpu_file] =
- ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file);
+ ring_buffer_read_prepare(iter.trace_buffer->buffer,
+ cpu_file, GFP_ATOMIC);
ring_buffer_read_start(iter.buffer_iter[cpu_file]);
tracing_iter_reset(&iter, cpu_file);
}
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index ea0d90a31fc9..fdf2ea4d64ec 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -17,7 +17,7 @@
* Copyright (C) IBM Corporation, 2010-2012
* Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
*/
-#define pr_fmt(fmt) "trace_kprobe: " fmt
+#define pr_fmt(fmt) "trace_uprobe: " fmt
#include <linux/module.h>
#include <linux/uaccess.h>
@@ -153,7 +153,14 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs,
ret = strncpy_from_user(dst, src, maxlen);
if (ret == maxlen)
- dst[--ret] = '\0';
+ dst[ret - 1] = '\0';
+ else if (ret >= 0)
+ /*
+ * Include the terminating null byte. In this case it
+ * was copied by strncpy_from_user but not accounted
+ * for in ret.
+ */
+ ret++;
if (ret < 0) { /* Failed to fetch string */
((u8 *)get_rloc_data(dest))[0] = '\0';