diff options
author | Philippe Schenker <philippe.schenker@toradex.com> | 2019-05-08 16:08:30 +0200 |
---|---|---|
committer | Philippe Schenker <philippe.schenker@toradex.com> | 2019-05-08 16:08:30 +0200 |
commit | 774f42075a4800fe4106dffca804e3207bc3c2e7 (patch) | |
tree | 4bb3e8ca84f7f23ac41966017e81ffe09e98412e /kernel | |
parent | 95a0514c2e68a035e2adbdb5a297da310dccb09c (diff) | |
parent | 33e2f28596a7059dbe31d837caae924a74aa36d8 (diff) |
Merge branch 'linux-4.14.y_for_4.14-2.0.x-imx' into 4.14-2.0.x-imx
Diffstat (limited to 'kernel')
45 files changed, 1212 insertions, 408 deletions
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d203a5d6b726..e46106c6ac39 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -51,6 +51,7 @@ #define DST regs[insn->dst_reg] #define SRC regs[insn->src_reg] #define FP regs[BPF_REG_FP] +#define AX regs[BPF_REG_AX] #define ARG1 regs[BPF_REG_ARG1] #define CTX regs[BPF_REG_CTX] #define IMM insn->imm @@ -552,6 +553,26 @@ static int bpf_jit_blind_insn(const struct bpf_insn *from, BUILD_BUG_ON(BPF_REG_AX + 1 != MAX_BPF_JIT_REG); BUILD_BUG_ON(MAX_BPF_REG + 1 != MAX_BPF_JIT_REG); + /* Constraints on AX register: + * + * AX register is inaccessible from user space. It is mapped in + * all JITs, and used here for constant blinding rewrites. It is + * typically "stateless" meaning its contents are only valid within + * the executed instruction, but not across several instructions. + * There are a few exceptions however which are further detailed + * below. + * + * Constant blinding is only used by JITs, not in the interpreter. + * The interpreter uses AX in some occasions as a local temporary + * register e.g. in DIV or MOD instructions. + * + * In restricted circumstances, the verifier can also use the AX + * register for rewrites as long as they do not interfere with + * the above cases! + */ + if (from->dst_reg == BPF_REG_AX || from->src_reg == BPF_REG_AX) + goto out; + if (from->imm == 0 && (from->code == (BPF_ALU | BPF_MOV | BPF_K) || from->code == (BPF_ALU64 | BPF_MOV | BPF_K))) { @@ -939,22 +960,22 @@ select_insn: ALU64_MOD_X: if (unlikely(SRC == 0)) return 0; - div64_u64_rem(DST, SRC, &tmp); - DST = tmp; + div64_u64_rem(DST, SRC, &AX); + DST = AX; CONT; ALU_MOD_X: if (unlikely((u32)SRC == 0)) return 0; - tmp = (u32) DST; - DST = do_div(tmp, (u32) SRC); + AX = (u32) DST; + DST = do_div(AX, (u32) SRC); CONT; ALU64_MOD_K: - div64_u64_rem(DST, IMM, &tmp); - DST = tmp; + div64_u64_rem(DST, IMM, &AX); + DST = AX; CONT; ALU_MOD_K: - tmp = (u32) DST; - DST = do_div(tmp, (u32) IMM); + AX = (u32) DST; + DST = do_div(AX, (u32) IMM); CONT; ALU64_DIV_X: if (unlikely(SRC == 0)) @@ -964,17 +985,17 @@ select_insn: ALU_DIV_X: if (unlikely((u32)SRC == 0)) return 0; - tmp = (u32) DST; - do_div(tmp, (u32) SRC); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) SRC); + DST = (u32) AX; CONT; ALU64_DIV_K: DST = div64_u64(DST, IMM); CONT; ALU_DIV_K: - tmp = (u32) DST; - do_div(tmp, (u32) IMM); - DST = (u32) tmp; + AX = (u32) DST; + do_div(AX, (u32) IMM); + DST = (u32) AX; CONT; ALU_END_TO_BE: switch (IMM) { @@ -1278,7 +1299,7 @@ STACK_FRAME_NON_STANDARD(___bpf_prog_run); /* jump table */ static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_REG]; \ + u64 regs[MAX_BPF_EXT_REG]; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 3d0ecc273cc6..84237f640789 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -655,7 +655,7 @@ static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) } if (htab_is_prealloc(htab)) { - pcpu_freelist_push(&htab->freelist, &l->fnode); + __pcpu_freelist_push(&htab->freelist, &l->fnode); } else { atomic_dec(&htab->count); l->htab = htab; @@ -717,7 +717,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } else { struct pcpu_freelist_node *l; - l = pcpu_freelist_pop(&htab->freelist); + l = __pcpu_freelist_pop(&htab->freelist); if (!l) return ERR_PTR(-E2BIG); l_new = container_of(l, struct htab_elem, fnode); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index be1dde967208..ccf9ffd5da78 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -365,19 +365,6 @@ out: } EXPORT_SYMBOL_GPL(bpf_obj_get_user); -static void bpf_evict_inode(struct inode *inode) -{ - enum bpf_type type; - - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); - - if (S_ISLNK(inode->i_mode)) - kfree(inode->i_link); - if (!bpf_inode_type(inode, &type)) - bpf_any_put(inode->i_private, type); -} - /* * Display the mount options in /proc/mounts. */ @@ -390,11 +377,28 @@ static int bpf_show_options(struct seq_file *m, struct dentry *root) return 0; } +static void bpf_destroy_inode_deferred(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + enum bpf_type type; + + if (S_ISLNK(inode->i_mode)) + kfree(inode->i_link); + if (!bpf_inode_type(inode, &type)) + bpf_any_put(inode->i_private, type); + free_inode_nonrcu(inode); +} + +static void bpf_destroy_inode(struct inode *inode) +{ + call_rcu(&inode->i_rcu, bpf_destroy_inode_deferred); +} + static const struct super_operations bpf_super_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .show_options = bpf_show_options, - .evict_inode = bpf_evict_inode, + .destroy_inode = bpf_destroy_inode, }; enum { diff --git a/kernel/bpf/map_in_map.c b/kernel/bpf/map_in_map.c index 1da574612bea..c0c494b7647b 100644 --- a/kernel/bpf/map_in_map.c +++ b/kernel/bpf/map_in_map.c @@ -12,6 +12,7 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) { struct bpf_map *inner_map, *inner_map_meta; + u32 inner_map_meta_size; struct fd f; f = fdget(inner_map_ufd); @@ -34,7 +35,12 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) return ERR_PTR(-EINVAL); } - inner_map_meta = kzalloc(sizeof(*inner_map_meta), GFP_USER); + inner_map_meta_size = sizeof(*inner_map_meta); + /* In some cases verifier needs to access beyond just base map. */ + if (inner_map->ops == &array_map_ops) + inner_map_meta_size = sizeof(struct bpf_array); + + inner_map_meta = kzalloc(inner_map_meta_size, GFP_USER); if (!inner_map_meta) { fdput(f); return ERR_PTR(-ENOMEM); @@ -44,9 +50,16 @@ struct bpf_map *bpf_map_meta_alloc(int inner_map_ufd) inner_map_meta->key_size = inner_map->key_size; inner_map_meta->value_size = inner_map->value_size; inner_map_meta->map_flags = inner_map->map_flags; - inner_map_meta->ops = inner_map->ops; inner_map_meta->max_entries = inner_map->max_entries; + /* Misc members not needed in bpf_map_meta_equal() check. */ + inner_map_meta->ops = inner_map->ops; + if (inner_map->ops == &array_map_ops) { + inner_map_meta->unpriv_array = inner_map->unpriv_array; + container_of(inner_map_meta, struct bpf_array, map)->index_mask = + container_of(inner_map, struct bpf_array, map)->index_mask; + } + fdput(f); return inner_map_meta; } diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c index 673fa6fe2d73..0c1b4ba9e90e 100644 --- a/kernel/bpf/percpu_freelist.c +++ b/kernel/bpf/percpu_freelist.c @@ -28,8 +28,8 @@ void pcpu_freelist_destroy(struct pcpu_freelist *s) free_percpu(s->freelist); } -static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head, - struct pcpu_freelist_node *node) +static inline void ___pcpu_freelist_push(struct pcpu_freelist_head *head, + struct pcpu_freelist_node *node) { raw_spin_lock(&head->lock); node->next = head->first; @@ -37,12 +37,22 @@ static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head, raw_spin_unlock(&head->lock); } -void pcpu_freelist_push(struct pcpu_freelist *s, +void __pcpu_freelist_push(struct pcpu_freelist *s, struct pcpu_freelist_node *node) { struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); - __pcpu_freelist_push(head, node); + ___pcpu_freelist_push(head, node); +} + +void pcpu_freelist_push(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + unsigned long flags; + + local_irq_save(flags); + __pcpu_freelist_push(s, node); + local_irq_restore(flags); } void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, @@ -63,7 +73,7 @@ void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, for_each_possible_cpu(cpu) { again: head = per_cpu_ptr(s->freelist, cpu); - __pcpu_freelist_push(head, buf); + ___pcpu_freelist_push(head, buf); i++; buf += elem_size; if (i == nr_elems) @@ -74,14 +84,12 @@ again: local_irq_restore(flags); } -struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) +struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *s) { struct pcpu_freelist_head *head; struct pcpu_freelist_node *node; - unsigned long flags; int orig_cpu, cpu; - local_irq_save(flags); orig_cpu = cpu = raw_smp_processor_id(); while (1) { head = per_cpu_ptr(s->freelist, cpu); @@ -89,16 +97,25 @@ struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) node = head->first; if (node) { head->first = node->next; - raw_spin_unlock_irqrestore(&head->lock, flags); + raw_spin_unlock(&head->lock); return node; } raw_spin_unlock(&head->lock); cpu = cpumask_next(cpu, cpu_possible_mask); if (cpu >= nr_cpu_ids) cpu = 0; - if (cpu == orig_cpu) { - local_irq_restore(flags); + if (cpu == orig_cpu) return NULL; - } } } + +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) +{ + struct pcpu_freelist_node *ret; + unsigned long flags; + + local_irq_save(flags); + ret = __pcpu_freelist_pop(s); + local_irq_restore(flags); + return ret; +} diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h index 3049aae8ea1e..c3960118e617 100644 --- a/kernel/bpf/percpu_freelist.h +++ b/kernel/bpf/percpu_freelist.h @@ -22,8 +22,12 @@ struct pcpu_freelist_node { struct pcpu_freelist_node *next; }; +/* pcpu_freelist_* do spin_lock_irqsave. */ void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *); +/* __pcpu_freelist_* do spin_lock only. caller must disable irqs. */ +void __pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); +struct pcpu_freelist_node *__pcpu_freelist_pop(struct pcpu_freelist *); void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, u32 nr_elems); int pcpu_freelist_init(struct pcpu_freelist *); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index f6755fd5bae2..a4875ff0bab1 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -265,10 +265,11 @@ static void print_verifier_state(struct bpf_verifier_state *state) verbose(")"); } } - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] == STACK_SPILL) - verbose(" fp%d=%s", -MAX_BPF_STACK + i, - reg_type_str[state->spilled_regs[i / BPF_REG_SIZE].type]); + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] == STACK_SPILL) + verbose(" fp%d=%s", + (-i - 1) * BPF_REG_SIZE, + reg_type_str[state->stack[i].spilled_ptr.type]); } verbose("\n"); } @@ -434,40 +435,133 @@ static void print_bpf_insn(const struct bpf_verifier_env *env, } } -static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx) +static int copy_stack_state(struct bpf_verifier_state *dst, + const struct bpf_verifier_state *src) { - struct bpf_verifier_stack_elem *elem; - int insn_idx; + if (!src->stack) + return 0; + if (WARN_ON_ONCE(dst->allocated_stack < src->allocated_stack)) { + /* internal bug, make state invalid to reject the program */ + memset(dst, 0, sizeof(*dst)); + return -EFAULT; + } + memcpy(dst->stack, src->stack, + sizeof(*src->stack) * (src->allocated_stack / BPF_REG_SIZE)); + return 0; +} + +/* do_check() starts with zero-sized stack in struct bpf_verifier_state to + * make it consume minimal amount of memory. check_stack_write() access from + * the program calls into realloc_verifier_state() to grow the stack size. + * Note there is a non-zero 'parent' pointer inside bpf_verifier_state + * which this function copies over. It points to previous bpf_verifier_state + * which is never reallocated + */ +static int realloc_verifier_state(struct bpf_verifier_state *state, int size, + bool copy_old) +{ + u32 old_size = state->allocated_stack; + struct bpf_stack_state *new_stack; + int slot = size / BPF_REG_SIZE; + + if (size <= old_size || !size) { + if (copy_old) + return 0; + state->allocated_stack = slot * BPF_REG_SIZE; + if (!size && old_size) { + kfree(state->stack); + state->stack = NULL; + } + return 0; + } + new_stack = kmalloc_array(slot, sizeof(struct bpf_stack_state), + GFP_KERNEL); + if (!new_stack) + return -ENOMEM; + if (copy_old) { + if (state->stack) + memcpy(new_stack, state->stack, + sizeof(*new_stack) * (old_size / BPF_REG_SIZE)); + memset(new_stack + old_size / BPF_REG_SIZE, 0, + sizeof(*new_stack) * (size - old_size) / BPF_REG_SIZE); + } + state->allocated_stack = slot * BPF_REG_SIZE; + kfree(state->stack); + state->stack = new_stack; + return 0; +} + +static void free_verifier_state(struct bpf_verifier_state *state, + bool free_self) +{ + kfree(state->stack); + if (free_self) + kfree(state); +} + +/* copy verifier state from src to dst growing dst stack space + * when necessary to accommodate larger src stack + */ +static int copy_verifier_state(struct bpf_verifier_state *dst, + const struct bpf_verifier_state *src) +{ + int err; + + err = realloc_verifier_state(dst, src->allocated_stack, false); + if (err) + return err; + memcpy(dst, src, offsetof(struct bpf_verifier_state, allocated_stack)); + return copy_stack_state(dst, src); +} + +static int pop_stack(struct bpf_verifier_env *env, int *prev_insn_idx, + int *insn_idx) +{ + struct bpf_verifier_state *cur = env->cur_state; + struct bpf_verifier_stack_elem *elem, *head = env->head; + int err; if (env->head == NULL) - return -1; + return -ENOENT; - memcpy(&env->cur_state, &env->head->st, sizeof(env->cur_state)); - insn_idx = env->head->insn_idx; + if (cur) { + err = copy_verifier_state(cur, &head->st); + if (err) + return err; + } + if (insn_idx) + *insn_idx = head->insn_idx; if (prev_insn_idx) - *prev_insn_idx = env->head->prev_insn_idx; - elem = env->head->next; - kfree(env->head); + *prev_insn_idx = head->prev_insn_idx; + elem = head->next; + free_verifier_state(&head->st, false); + kfree(head); env->head = elem; env->stack_size--; - return insn_idx; + return 0; } static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, - int insn_idx, int prev_insn_idx) + int insn_idx, int prev_insn_idx, + bool speculative) { struct bpf_verifier_stack_elem *elem; + struct bpf_verifier_state *cur = env->cur_state; + int err; - elem = kmalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); + elem = kzalloc(sizeof(struct bpf_verifier_stack_elem), GFP_KERNEL); if (!elem) goto err; - memcpy(&elem->st, &env->cur_state, sizeof(env->cur_state)); elem->insn_idx = insn_idx; elem->prev_insn_idx = prev_insn_idx; elem->next = env->head; + elem->st.speculative |= speculative; env->head = elem; env->stack_size++; + err = copy_verifier_state(&elem->st, cur); + if (err) + goto err; if (env->stack_size > BPF_COMPLEXITY_LIMIT_STACK) { verbose("BPF program is too complex\n"); goto err; @@ -475,7 +569,7 @@ static struct bpf_verifier_state *push_stack(struct bpf_verifier_env *env, return &elem->st; err: /* pop all elements and return */ - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); return NULL; } @@ -671,7 +765,7 @@ static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno) static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, enum reg_arg_type t) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = env->cur_state->regs; if (regno >= MAX_BPF_REG) { verbose("R%d is invalid\n", regno); @@ -684,7 +778,7 @@ static int check_reg_arg(struct bpf_verifier_env *env, u32 regno, verbose("R%d !read_ok\n", regno); return -EACCES; } - mark_reg_read(&env->cur_state, regno); + mark_reg_read(env->cur_state, regno); } else { /* check whether register used as dest operand can be written to */ if (regno == BPF_REG_FP) { @@ -721,10 +815,21 @@ static int check_stack_write(struct bpf_verifier_env *env, struct bpf_verifier_state *state, int off, int size, int value_regno, int insn_idx) { - int i, spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; + int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; + + err = realloc_verifier_state(state, round_up(slot + 1, BPF_REG_SIZE), + true); + if (err) + return err; /* caller checked that off % size == 0 and -MAX_BPF_STACK <= off < 0, * so it's aligned access and [off, off + size) are within stack limits */ + if (!env->allow_ptr_leaks && + state->stack[spi].slot_type[0] == STACK_SPILL && + size != BPF_REG_SIZE) { + verbose("attempt to corrupt spilled pointer on stack\n"); + return -EACCES; + } if (value_regno >= 0 && is_spillable_regtype(state->regs[value_regno].type)) { @@ -736,11 +841,11 @@ static int check_stack_write(struct bpf_verifier_env *env, } /* save register state */ - state->spilled_regs[spi] = state->regs[value_regno]; - state->spilled_regs[spi].live |= REG_LIVE_WRITTEN; + state->stack[spi].spilled_ptr = state->regs[value_regno]; + state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; for (i = 0; i < BPF_REG_SIZE; i++) { - if (state->stack_slot_type[MAX_BPF_STACK + off + i] == STACK_MISC && + if (state->stack[spi].slot_type[i] == STACK_MISC && !env->allow_ptr_leaks) { int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; int soff = (-spi - 1) * BPF_REG_SIZE; @@ -763,14 +868,15 @@ static int check_stack_write(struct bpf_verifier_env *env, } *poff = soff; } - state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_SPILL; + state->stack[spi].slot_type[i] = STACK_SPILL; } } else { /* regular write of data into stack */ - state->spilled_regs[spi] = (struct bpf_reg_state) {}; + state->stack[spi].spilled_ptr = (struct bpf_reg_state) {}; for (i = 0; i < size; i++) - state->stack_slot_type[MAX_BPF_STACK + off + i] = STACK_MISC; + state->stack[spi].slot_type[(slot - i) % BPF_REG_SIZE] = + STACK_MISC; } return 0; } @@ -781,10 +887,10 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo while (parent) { /* if read wasn't screened by an earlier write ... */ - if (state->spilled_regs[slot].live & REG_LIVE_WRITTEN) + if (state->stack[slot].spilled_ptr.live & REG_LIVE_WRITTEN) break; /* ... then we depend on parent's value */ - parent->spilled_regs[slot].live |= REG_LIVE_READ; + parent->stack[slot].spilled_ptr.live |= REG_LIVE_READ; state = parent; parent = state->parent; } @@ -793,34 +899,37 @@ static void mark_stack_slot_read(const struct bpf_verifier_state *state, int slo static int check_stack_read(struct bpf_verifier_state *state, int off, int size, int value_regno) { - u8 *slot_type; - int i, spi; + int i, slot = -off - 1, spi = slot / BPF_REG_SIZE; + u8 *stype; - slot_type = &state->stack_slot_type[MAX_BPF_STACK + off]; + if (state->allocated_stack <= slot) { + verbose("invalid read from stack off %d+0 size %d\n", + off, size); + return -EACCES; + } + stype = state->stack[spi].slot_type; - if (slot_type[0] == STACK_SPILL) { + if (stype[0] == STACK_SPILL) { if (size != BPF_REG_SIZE) { verbose("invalid size of register spill\n"); return -EACCES; } for (i = 1; i < BPF_REG_SIZE; i++) { - if (slot_type[i] != STACK_SPILL) { + if (stype[(slot - i) % BPF_REG_SIZE] != STACK_SPILL) { verbose("corrupted spill memory\n"); return -EACCES; } } - spi = (MAX_BPF_STACK + off) / BPF_REG_SIZE; - if (value_regno >= 0) { /* restore register state from stack */ - state->regs[value_regno] = state->spilled_regs[spi]; + state->regs[value_regno] = state->stack[spi].spilled_ptr; mark_stack_slot_read(state, spi); } return 0; } else { for (i = 0; i < size; i++) { - if (slot_type[i] != STACK_MISC) { + if (stype[(slot - i) % BPF_REG_SIZE] != STACK_MISC) { verbose("invalid read from stack off %d+%d size %d\n", off, i, size); return -EACCES; @@ -833,11 +942,37 @@ static int check_stack_read(struct bpf_verifier_state *state, int off, int size, } } +static int check_stack_access(struct bpf_verifier_env *env, + const struct bpf_reg_state *reg, + int off, int size) +{ + /* Stack accesses must be at a fixed offset, so that we + * can determine what type of data were returned. See + * check_stack_read(). + */ + if (!tnum_is_const(reg->var_off)) { + char tn_buf[48]; + + tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); + verbose("variable stack access var_off=%s off=%d size=%d", + tn_buf, off, size); + return -EACCES; + } + + if (off >= 0 || off < -MAX_BPF_STACK) { + verbose("invalid stack off=%d size=%d\n", off, size); + return -EACCES; + } + + return 0; +} + /* check read/write into map element returned by bpf_map_lookup_elem() */ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_map *map = env->cur_state.regs[regno].map_ptr; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_map *map = regs[regno].map_ptr; if (off < 0 || size <= 0 || off + size > map->value_size) { verbose("invalid access to map value, value_size=%d off=%d size=%d\n", @@ -849,9 +984,9 @@ static int __check_map_access(struct bpf_verifier_env *env, u32 regno, int off, /* check read/write into a map element with possible variable offset */ static int check_map_access(struct bpf_verifier_env *env, u32 regno, - int off, int size) + int off, int size) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *reg = &state->regs[regno]; int err; @@ -861,13 +996,17 @@ static int check_map_access(struct bpf_verifier_env *env, u32 regno, */ if (log_level) print_verifier_state(state); + /* The minimum value is only important with signed * comparisons where we can't assume the floor of a * value is 0. If we are using signed variables for our * index'es we need to make sure that whatever we use * will have a set floor within our range. */ - if (reg->smin_value < 0) { + if (reg->smin_value < 0 && + (reg->smin_value == S64_MIN || + (off + reg->smin_value != (s64)(s32)(off + reg->smin_value)) || + reg->smin_value + off < 0)) { verbose("R%d min value is negative, either use unsigned index or do a if (index >=0) check.\n", regno); return -EACCES; @@ -924,7 +1063,7 @@ static bool may_access_direct_pkt_data(struct bpf_verifier_env *env, static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; if (off < 0 || size <= 0 || (u64)off + size > reg->range) { @@ -938,7 +1077,7 @@ static int __check_packet_access(struct bpf_verifier_env *env, u32 regno, static int check_packet_access(struct bpf_verifier_env *env, u32 regno, int off, int size) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); struct bpf_reg_state *reg = ®s[regno]; int err; @@ -1008,19 +1147,19 @@ static bool __is_pointer_value(bool allow_ptr_leaks, static bool is_pointer_value(struct bpf_verifier_env *env, int regno) { - return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]); + return __is_pointer_value(env->allow_ptr_leaks, cur_regs(env) + regno); } static bool is_ctx_reg(struct bpf_verifier_env *env, int regno) { - const struct bpf_reg_state *reg = &env->cur_state.regs[regno]; + const struct bpf_reg_state *reg = cur_regs(env) + regno; return reg->type == PTR_TO_CTX; } static bool is_pkt_reg(struct bpf_verifier_env *env, int regno) { - const struct bpf_reg_state *reg = &env->cur_state.regs[regno]; + const struct bpf_reg_state *reg = cur_regs(env) + regno; return reg->type == PTR_TO_PACKET; } @@ -1145,8 +1284,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn int off, int bpf_size, enum bpf_access_type t, int value_regno, bool strict_alignment_once) { - struct bpf_verifier_state *state = &env->cur_state; - struct bpf_reg_state *reg = &state->regs[regno]; + struct bpf_verifier_state *state = env->cur_state; + struct bpf_reg_state *regs = cur_regs(env); + struct bpf_reg_state *reg = regs + regno; int size, err = 0; size = bpf_size_to_bytes(bpf_size); @@ -1170,7 +1310,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn err = check_map_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(regs, value_regno); } else if (reg->type == PTR_TO_CTX) { enum bpf_reg_type reg_type = SCALAR_VALUE; @@ -1203,49 +1343,29 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn * the offset is zero. */ if (reg_type == SCALAR_VALUE) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(regs, value_regno); else - mark_reg_known_zero(state->regs, value_regno); - state->regs[value_regno].id = 0; - state->regs[value_regno].off = 0; - state->regs[value_regno].range = 0; - state->regs[value_regno].type = reg_type; + mark_reg_known_zero(regs, value_regno); + regs[value_regno].id = 0; + regs[value_regno].off = 0; + regs[value_regno].range = 0; + regs[value_regno].type = reg_type; } } else if (reg->type == PTR_TO_STACK) { - /* stack accesses must be at a fixed offset, so that we can - * determine what type of data were returned. - * See check_stack_read(). - */ - if (!tnum_is_const(reg->var_off)) { - char tn_buf[48]; - - tnum_strn(tn_buf, sizeof(tn_buf), reg->var_off); - verbose("variable stack access var_off=%s off=%d size=%d", - tn_buf, off, size); - return -EACCES; - } off += reg->var_off.value; - if (off >= 0 || off < -MAX_BPF_STACK) { - verbose("invalid stack off=%d size=%d\n", off, size); - return -EACCES; - } + err = check_stack_access(env, reg, off, size); + if (err) + return err; if (env->prog->aux->stack_depth < -off) env->prog->aux->stack_depth = -off; - if (t == BPF_WRITE) { - if (!env->allow_ptr_leaks && - state->stack_slot_type[MAX_BPF_STACK + off] == STACK_SPILL && - size != BPF_REG_SIZE) { - verbose("attempt to corrupt spilled pointer on stack\n"); - return -EACCES; - } + if (t == BPF_WRITE) err = check_stack_write(env, state, off, size, value_regno, insn_idx); - } else { + else err = check_stack_read(state, off, size, value_regno); - } } else if (reg->type == PTR_TO_PACKET) { if (t == BPF_WRITE && !may_access_direct_pkt_data(env, NULL, t)) { verbose("cannot write into packet\n"); @@ -1258,7 +1378,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } err = check_packet_access(env, regno, off, size); if (!err && t == BPF_READ && value_regno >= 0) - mark_reg_unknown(state->regs, value_regno); + mark_reg_unknown(regs, value_regno); } else { verbose("R%d invalid mem access '%s'\n", regno, reg_type_str[reg->type]); @@ -1266,9 +1386,9 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn } if (!err && size < BPF_REG_SIZE && value_regno >= 0 && t == BPF_READ && - state->regs[value_regno].type == SCALAR_VALUE) { + regs[value_regno].type == SCALAR_VALUE) { /* b/h/w load zero-extends, mark upper bits as known 0 */ - coerce_reg_to_size(&state->regs[value_regno], size); + coerce_reg_to_size(®s[value_regno], size); } return err; } @@ -1333,9 +1453,9 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs; - int off, i; + int off, i, slot, spi; if (regs[regno].type != PTR_TO_STACK) { /* Allow zero-byte read from NULL, regardless of pointer type */ @@ -1376,7 +1496,11 @@ static int check_stack_boundary(struct bpf_verifier_env *env, int regno, } for (i = 0; i < access_size; i++) { - if (state->stack_slot_type[MAX_BPF_STACK + off + i] != STACK_MISC) { + slot = -(off + i) - 1; + spi = slot / BPF_REG_SIZE; + if (state->allocated_stack <= slot || + state->stack[spi].slot_type[slot % BPF_REG_SIZE] != + STACK_MISC) { verbose("invalid indirect read from stack off %d+%d size %d\n", off, i, access_size); return -EACCES; @@ -1389,7 +1513,7 @@ static int check_helper_mem_access(struct bpf_verifier_env *env, int regno, int access_size, bool zero_size_allowed, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; switch (reg->type) { case PTR_TO_PACKET: @@ -1406,7 +1530,7 @@ static int check_func_arg(struct bpf_verifier_env *env, u32 regno, enum bpf_arg_type arg_type, struct bpf_call_arg_meta *meta) { - struct bpf_reg_state *regs = env->cur_state.regs, *reg = ®s[regno]; + struct bpf_reg_state *regs = cur_regs(env), *reg = ®s[regno]; enum bpf_reg_type expected_type, type = reg->type; int err = 0; @@ -1678,7 +1802,7 @@ static int check_raw_mode(const struct bpf_func_proto *fn) */ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state = env->cur_state; struct bpf_reg_state *regs = state->regs, *reg; int i; @@ -1687,10 +1811,10 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) regs[i].type == PTR_TO_PACKET_END) mark_reg_unknown(regs, i); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - reg = &state->spilled_regs[i / BPF_REG_SIZE]; + reg = &state->stack[i].spilled_ptr; if (reg->type != PTR_TO_PACKET && reg->type != PTR_TO_PACKET_END) continue; @@ -1700,9 +1824,8 @@ static void clear_all_pkt_pointers(struct bpf_verifier_env *env) static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { - struct bpf_verifier_state *state = &env->cur_state; const struct bpf_func_proto *fn = NULL; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs; struct bpf_call_arg_meta meta; bool changes_data; int i, err; @@ -1776,6 +1899,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id, int insn_idx) return err; } + regs = cur_regs(env); /* reset caller saved regs */ for (i = 0; i < CALLER_SAVED_REGS; i++) { mark_reg_not_init(regs, caller_saved[i]); @@ -1880,6 +2004,125 @@ static bool check_reg_sane_offset(struct bpf_verifier_env *env, return true; } +static struct bpf_insn_aux_data *cur_aux(struct bpf_verifier_env *env) +{ + return &env->insn_aux_data[env->insn_idx]; +} + +static int retrieve_ptr_limit(const struct bpf_reg_state *ptr_reg, + u32 *ptr_limit, u8 opcode, bool off_is_neg) +{ + bool mask_to_left = (opcode == BPF_ADD && off_is_neg) || + (opcode == BPF_SUB && !off_is_neg); + u32 off; + + switch (ptr_reg->type) { + case PTR_TO_STACK: + off = ptr_reg->off + ptr_reg->var_off.value; + if (mask_to_left) + *ptr_limit = MAX_BPF_STACK + off; + else + *ptr_limit = -off; + return 0; + case PTR_TO_MAP_VALUE: + if (mask_to_left) { + *ptr_limit = ptr_reg->umax_value + ptr_reg->off; + } else { + off = ptr_reg->smin_value + ptr_reg->off; + *ptr_limit = ptr_reg->map_ptr->value_size - off; + } + return 0; + default: + return -EINVAL; + } +} + +static bool can_skip_alu_sanitation(const struct bpf_verifier_env *env, + const struct bpf_insn *insn) +{ + return env->allow_ptr_leaks || BPF_SRC(insn->code) == BPF_K; +} + +static int update_alu_sanitation_state(struct bpf_insn_aux_data *aux, + u32 alu_state, u32 alu_limit) +{ + /* If we arrived here from different branches with different + * state or limits to sanitize, then this won't work. + */ + if (aux->alu_state && + (aux->alu_state != alu_state || + aux->alu_limit != alu_limit)) + return -EACCES; + + /* Corresponding fixup done in fixup_bpf_calls(). */ + aux->alu_state = alu_state; + aux->alu_limit = alu_limit; + return 0; +} + +static int sanitize_val_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn) +{ + struct bpf_insn_aux_data *aux = cur_aux(env); + + if (can_skip_alu_sanitation(env, insn)) + return 0; + + return update_alu_sanitation_state(aux, BPF_ALU_NON_POINTER, 0); +} + +static int sanitize_ptr_alu(struct bpf_verifier_env *env, + struct bpf_insn *insn, + const struct bpf_reg_state *ptr_reg, + struct bpf_reg_state *dst_reg, + bool off_is_neg) +{ + struct bpf_verifier_state *vstate = env->cur_state; + struct bpf_insn_aux_data *aux = cur_aux(env); + bool ptr_is_dst_reg = ptr_reg == dst_reg; + u8 opcode = BPF_OP(insn->code); + u32 alu_state, alu_limit; + struct bpf_reg_state tmp; + bool ret; + + if (can_skip_alu_sanitation(env, insn)) + return 0; + + /* We already marked aux for masking from non-speculative + * paths, thus we got here in the first place. We only care + * to explore bad access from here. + */ + if (vstate->speculative) + goto do_sim; + + alu_state = off_is_neg ? BPF_ALU_NEG_VALUE : 0; + alu_state |= ptr_is_dst_reg ? + BPF_ALU_SANITIZE_SRC : BPF_ALU_SANITIZE_DST; + + if (retrieve_ptr_limit(ptr_reg, &alu_limit, opcode, off_is_neg)) + return 0; + if (update_alu_sanitation_state(aux, alu_state, alu_limit)) + return -EACCES; +do_sim: + /* Simulate and find potential out-of-bounds access under + * speculative execution from truncation as a result of + * masking when off was not within expected range. If off + * sits in dst, then we temporarily need to move ptr there + * to simulate dst (== 0) +/-= ptr. Needed, for example, + * for cases where we use K-based arithmetic in one direction + * and truncated reg-based in the other in order to explore + * bad access. + */ + if (!ptr_is_dst_reg) { + tmp = *dst_reg; + *dst_reg = *ptr_reg; + } + ret = push_stack(env, env->insn_idx + 1, env->insn_idx, true); + if (!ptr_is_dst_reg && ret) + *dst_reg = tmp; + return !ret ? -EFAULT : 0; +} + /* Handles arithmetic on a pointer and a scalar: computes new min/max and var_off. * Caller should also handle BPF_MOV case separately. * If we return -EACCES, caller may want to try again treating pointer as a @@ -1890,14 +2133,15 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, const struct bpf_reg_state *ptr_reg, const struct bpf_reg_state *off_reg) { - struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg; + struct bpf_reg_state *regs = cur_regs(env), *dst_reg; bool known = tnum_is_const(off_reg->var_off); s64 smin_val = off_reg->smin_value, smax_val = off_reg->smax_value, smin_ptr = ptr_reg->smin_value, smax_ptr = ptr_reg->smax_value; u64 umin_val = off_reg->umin_value, umax_val = off_reg->umax_value, umin_ptr = ptr_reg->umin_value, umax_ptr = ptr_reg->umax_value; + u32 dst = insn->dst_reg, src = insn->src_reg; u8 opcode = BPF_OP(insn->code); - u32 dst = insn->dst_reg; + int ret; dst_reg = ®s[dst]; @@ -1949,6 +2193,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: + ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); + if (ret < 0) { + verbose("R%d tried to add from different maps or paths\n", dst); + return ret; + } /* We can take a fixed offset as long as it doesn't overflow * the s32 'off' field */ @@ -1999,6 +2248,11 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, } break; case BPF_SUB: + ret = sanitize_ptr_alu(env, insn, ptr_reg, dst_reg, smin_val < 0); + if (ret < 0) { + verbose("R%d tried to sub from different maps or paths\n", dst); + return ret; + } if (dst_reg == off_reg) { /* scalar -= pointer. Creates an unknown scalar */ if (!env->allow_ptr_leaks) @@ -2071,6 +2325,13 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, verbose("R%d bitwise operator %s on pointer prohibited\n", dst, bpf_alu_string[opcode >> 4]); return -EACCES; + case PTR_TO_MAP_VALUE: + if (!env->allow_ptr_leaks && !known && (smin_val < 0) != (smax_val < 0)) { + verbose("R%d has unknown scalar with mixed signed bounds, pointer arithmetic with it prohibited for !root\n", + off_reg == dst_reg ? dst : src); + return -EACCES; + } + /* fall-through */ default: /* other operators (e.g. MUL,LSH) produce non-pointer results */ if (!env->allow_ptr_leaks) @@ -2085,6 +2346,25 @@ static int adjust_ptr_min_max_vals(struct bpf_verifier_env *env, __update_reg_bounds(dst_reg); __reg_deduce_bounds(dst_reg); __reg_bound_offset(dst_reg); + + /* For unprivileged we require that resulting offset must be in bounds + * in order to be able to sanitize access later on. + */ + if (!env->allow_ptr_leaks) { + if (dst_reg->type == PTR_TO_MAP_VALUE && + check_map_access(env, dst, dst_reg->off, 1)) { + verbose("R%d pointer arithmetic of map value goes out of range, " + "prohibited for !root\n", dst); + return -EACCES; + } else if (dst_reg->type == PTR_TO_STACK && + check_stack_access(env, dst_reg, dst_reg->off + + dst_reg->var_off.value, 1)) { + verbose("R%d stack pointer arithmetic goes out of range, " + "prohibited for !root\n", dst); + return -EACCES; + } + } + return 0; } @@ -2097,12 +2377,14 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, struct bpf_reg_state *dst_reg, struct bpf_reg_state src_reg) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); bool src_known, dst_known; s64 smin_val, smax_val; u64 umin_val, umax_val; u64 insn_bitness = (BPF_CLASS(insn->code) == BPF_ALU64) ? 64 : 32; + u32 dst = insn->dst_reg; + int ret; if (insn_bitness == 32) { /* Relevant for 32-bit RSH: Information can propagate towards @@ -2137,6 +2419,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, switch (opcode) { case BPF_ADD: + ret = sanitize_val_alu(env, insn); + if (ret < 0) { + verbose("R%d tried to add from different pointers or scalars\n", dst); + return ret; + } if (signed_add_overflows(dst_reg->smin_value, smin_val) || signed_add_overflows(dst_reg->smax_value, smax_val)) { dst_reg->smin_value = S64_MIN; @@ -2156,6 +2443,11 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, dst_reg->var_off = tnum_add(dst_reg->var_off, src_reg.var_off); break; case BPF_SUB: + ret = sanitize_val_alu(env, insn); + if (ret < 0) { + verbose("R%d tried to sub from different pointers or scalars\n", dst); + return ret; + } if (signed_sub_overflows(dst_reg->smin_value, smax_val) || signed_sub_overflows(dst_reg->smax_value, smin_val)) { /* Overflow possible, we know nothing */ @@ -2345,7 +2637,7 @@ static int adjust_scalar_min_max_vals(struct bpf_verifier_env *env, static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs, *dst_reg, *src_reg; + struct bpf_reg_state *regs = cur_regs(env), *dst_reg, *src_reg; struct bpf_reg_state *ptr_reg = NULL, off_reg = {0}; u8 opcode = BPF_OP(insn->code); int rc; @@ -2419,12 +2711,12 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* Got here implies adding two SCALAR_VALUEs */ if (WARN_ON_ONCE(ptr_reg)) { - print_verifier_state(&env->cur_state); + print_verifier_state(env->cur_state); verbose("verifier internal error: unexpected ptr_reg\n"); return -EINVAL; } if (WARN_ON(!src_reg)) { - print_verifier_state(&env->cur_state); + print_verifier_state(env->cur_state); verbose("verifier internal error: no src_reg\n"); return -EINVAL; } @@ -2434,7 +2726,7 @@ static int adjust_reg_min_max_vals(struct bpf_verifier_env *env, /* check validity of 32-bit and 64-bit arithmetic operations */ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 opcode = BPF_OP(insn->code); int err; @@ -2661,10 +2953,10 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state, /* keep the maximum range already checked */ regs[i].range = max(regs[i].range, new_range); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - reg = &state->spilled_regs[i / BPF_REG_SIZE]; + reg = &state->stack[i].spilled_ptr; if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id) reg->range = max(reg->range, new_range); } @@ -2914,17 +3206,17 @@ static void mark_map_regs(struct bpf_verifier_state *state, u32 regno, for (i = 0; i < MAX_BPF_REG; i++) mark_map_reg(regs, i, id, is_null); - for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) { - if (state->stack_slot_type[i] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE; i++) { + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, is_null); + mark_map_reg(&state->stack[i].spilled_ptr, 0, id, is_null); } } static int check_cond_jmp_op(struct bpf_verifier_env *env, struct bpf_insn *insn, int *insn_idx) { - struct bpf_verifier_state *other_branch, *this_branch = &env->cur_state; + struct bpf_verifier_state *other_branch, *this_branch = env->cur_state; struct bpf_reg_state *regs = this_branch->regs, *dst_reg; u8 opcode = BPF_OP(insn->code); int err; @@ -2984,7 +3276,8 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env, } } - other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx); + other_branch = push_stack(env, *insn_idx + insn->off + 1, *insn_idx, + false); if (!other_branch) return -EFAULT; @@ -3087,7 +3380,7 @@ static struct bpf_map *ld_imm64_to_map_ptr(struct bpf_insn *insn) /* verify BPF_LD_IMM64 instruction */ static int check_ld_imm(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); int err; if (BPF_SIZE(insn->code) != BPF_DW) { @@ -3148,7 +3441,7 @@ static bool may_access_skb(enum bpf_prog_type type) */ static int check_ld_abs(struct bpf_verifier_env *env, struct bpf_insn *insn) { - struct bpf_reg_state *regs = env->cur_state.regs; + struct bpf_reg_state *regs = cur_regs(env); u8 mode = BPF_MODE(insn->code); int i, err; @@ -3534,6 +3827,57 @@ static bool regsafe(struct bpf_reg_state *rold, struct bpf_reg_state *rcur, return false; } +static bool stacksafe(struct bpf_verifier_state *old, + struct bpf_verifier_state *cur, + struct idpair *idmap) +{ + int i, spi; + + /* if explored stack has more populated slots than current stack + * such stacks are not equivalent + */ + if (old->allocated_stack > cur->allocated_stack) + return false; + + /* walk slots of the explored stack and ignore any additional + * slots in the current stack, since explored(safe) state + * didn't use them + */ + for (i = 0; i < old->allocated_stack; i++) { + spi = i / BPF_REG_SIZE; + + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] == STACK_INVALID) + continue; + if (old->stack[spi].slot_type[i % BPF_REG_SIZE] != + cur->stack[spi].slot_type[i % BPF_REG_SIZE]) + /* Ex: old explored (safe) state has STACK_SPILL in + * this stack slot, but current has has STACK_MISC -> + * this verifier states are not equivalent, + * return false to continue verification of this path + */ + return false; + if (i % BPF_REG_SIZE) + continue; + if (old->stack[spi].slot_type[0] != STACK_SPILL) + continue; + if (!regsafe(&old->stack[spi].spilled_ptr, + &cur->stack[spi].spilled_ptr, + idmap)) + /* when explored and current stack slot are both storing + * spilled registers, check that stored pointers types + * are the same as well. + * Ex: explored safe path could have stored + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} + * but current path has stored: + * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} + * such verifier states are not equivalent. + * return false to continue verification of this path + */ + return false; + } + return true; +} + /* compare two verifier states * * all states stored in state_list are known to be valid, since @@ -3568,6 +3912,12 @@ static bool states_equal(struct bpf_verifier_env *env, bool ret = false; int i; + /* Verification state from speculative execution simulation + * must never prune a non-speculative execution one. + */ + if (old->speculative && !cur->speculative) + return false; + idmap = kcalloc(ID_MAP_SIZE, sizeof(struct idpair), GFP_KERNEL); /* If we failed to allocate the idmap, just say it's not safe */ if (!idmap) @@ -3578,37 +3928,8 @@ static bool states_equal(struct bpf_verifier_env *env, goto out_free; } - for (i = 0; i < MAX_BPF_STACK; i++) { - if (old->stack_slot_type[i] == STACK_INVALID) - continue; - if (old->stack_slot_type[i] != cur->stack_slot_type[i]) - /* Ex: old explored (safe) state has STACK_SPILL in - * this stack slot, but current has has STACK_MISC -> - * this verifier states are not equivalent, - * return false to continue verification of this path - */ - goto out_free; - if (i % BPF_REG_SIZE) - continue; - if (old->stack_slot_type[i] != STACK_SPILL) - continue; - if (!regsafe(&old->spilled_regs[i / BPF_REG_SIZE], - &cur->spilled_regs[i / BPF_REG_SIZE], - idmap)) - /* when explored and current stack slot are both storing - * spilled registers, check that stored pointers types - * are the same as well. - * Ex: explored safe path could have stored - * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -8} - * but current path has stored: - * (bpf_reg_state) {.type = PTR_TO_STACK, .off = -16} - * such verifier states are not equivalent. - * return false to continue verification of this path - */ - goto out_free; - else - continue; - } + if (!stacksafe(old, cur, idmap)) + goto out_free; ret = true; out_free: kfree(idmap); @@ -3644,17 +3965,19 @@ static bool do_propagate_liveness(const struct bpf_verifier_state *state, } } /* ... and stack slots */ - for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) { - if (parent->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) + for (i = 0; i < state->allocated_stack / BPF_REG_SIZE && + i < parent->allocated_stack / BPF_REG_SIZE; i++) { + if (parent->stack[i].slot_type[0] != STACK_SPILL) continue; - if (state->stack_slot_type[i * BPF_REG_SIZE] != STACK_SPILL) + if (state->stack[i].slot_type[0] != STACK_SPILL) continue; - if (parent->spilled_regs[i].live & REG_LIVE_READ) + if (parent->stack[i].spilled_ptr.live & REG_LIVE_READ) continue; - if (writes && (state->spilled_regs[i].live & REG_LIVE_WRITTEN)) + if (writes && + (state->stack[i].spilled_ptr.live & REG_LIVE_WRITTEN)) continue; - if (state->spilled_regs[i].live & REG_LIVE_READ) { - parent->spilled_regs[i].live |= REG_LIVE_READ; + if (state->stack[i].spilled_ptr.live & REG_LIVE_READ) { + parent->stack[i].spilled_ptr.live |= REG_LIVE_READ; touched = true; } } @@ -3684,7 +4007,8 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) { struct bpf_verifier_state_list *new_sl; struct bpf_verifier_state_list *sl; - int i; + struct bpf_verifier_state *cur = env->cur_state; + int i, err; sl = env->explored_states[insn_idx]; if (!sl) @@ -3694,7 +4018,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) return 0; while (sl != STATE_LIST_MARK) { - if (states_equal(env, &sl->state, &env->cur_state)) { + if (states_equal(env, &sl->state, cur)) { /* reached equivalent register/stack state, * prune the search. * Registers read by the continuation are read by us. @@ -3705,7 +4029,7 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * they'll be immediately forgotten as we're pruning * this state and will pop a new one. */ - propagate_liveness(&sl->state, &env->cur_state); + propagate_liveness(&sl->state, cur); return 1; } sl = sl->next; @@ -3717,16 +4041,21 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * it will be rejected. Since there are no loops, we won't be * seeing this 'insn_idx' instruction again on the way to bpf_exit */ - new_sl = kmalloc(sizeof(struct bpf_verifier_state_list), GFP_USER); + new_sl = kzalloc(sizeof(struct bpf_verifier_state_list), GFP_KERNEL); if (!new_sl) return -ENOMEM; /* add new state to the head of linked list */ - memcpy(&new_sl->state, &env->cur_state, sizeof(env->cur_state)); + err = copy_verifier_state(&new_sl->state, cur); + if (err) { + free_verifier_state(&new_sl->state, false); + kfree(new_sl); + return err; + } new_sl->next = env->explored_states[insn_idx]; env->explored_states[insn_idx] = new_sl; /* connect new state to parentage chain */ - env->cur_state.parent = &new_sl->state; + cur->parent = &new_sl->state; /* clear write marks in current state: the writes we did are not writes * our child did, so they don't screen off its reads from us. * (There are no read marks in current state, because reads always mark @@ -3734,10 +4063,10 @@ static int is_state_visited(struct bpf_verifier_env *env, int insn_idx) * explored_states can get read marks.) */ for (i = 0; i < BPF_REG_FP; i++) - env->cur_state.regs[i].live = REG_LIVE_NONE; - for (i = 0; i < MAX_BPF_STACK / BPF_REG_SIZE; i++) - if (env->cur_state.stack_slot_type[i * BPF_REG_SIZE] == STACK_SPILL) - env->cur_state.spilled_regs[i].live = REG_LIVE_NONE; + cur->regs[i].live = REG_LIVE_NONE; + for (i = 0; i < cur->allocated_stack / BPF_REG_SIZE; i++) + if (cur->stack[i].slot_type[0] == STACK_SPILL) + cur->stack[i].spilled_ptr.live = REG_LIVE_NONE; return 0; } @@ -3752,29 +4081,31 @@ static int ext_analyzer_insn_hook(struct bpf_verifier_env *env, static int do_check(struct bpf_verifier_env *env) { - struct bpf_verifier_state *state = &env->cur_state; + struct bpf_verifier_state *state; struct bpf_insn *insns = env->prog->insnsi; - struct bpf_reg_state *regs = state->regs; + struct bpf_reg_state *regs; int insn_cnt = env->prog->len; - int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; - init_reg_state(regs); + state = kzalloc(sizeof(struct bpf_verifier_state), GFP_KERNEL); + if (!state) + return -ENOMEM; + env->cur_state = state; + init_reg_state(state->regs); state->parent = NULL; - insn_idx = 0; for (;;) { struct bpf_insn *insn; u8 class; int err; - if (insn_idx >= insn_cnt) { + if (env->insn_idx >= insn_cnt) { verbose("invalid insn idx %d insn_cnt %d\n", - insn_idx, insn_cnt); + env->insn_idx, insn_cnt); return -EFAULT; } - insn = &insns[insn_idx]; + insn = &insns[env->insn_idx]; class = BPF_CLASS(insn->code); if (++insn_processed > BPF_COMPLEXITY_LIMIT_INSNS) { @@ -3783,17 +4114,19 @@ static int do_check(struct bpf_verifier_env *env) return -E2BIG; } - err = is_state_visited(env, insn_idx); + err = is_state_visited(env, env->insn_idx); if (err < 0) return err; if (err == 1) { /* found equivalent state, can prune the search */ if (log_level) { if (do_print_state) - verbose("\nfrom %d to %d: safe\n", - prev_insn_idx, insn_idx); + verbose("\nfrom %d to %d%s: safe\n", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); else - verbose("%d: safe\n", insn_idx); + verbose("%d: safe\n", env->insn_idx); } goto process_bpf_exit; } @@ -3803,24 +4136,27 @@ static int do_check(struct bpf_verifier_env *env) if (log_level > 1 || (log_level && do_print_state)) { if (log_level > 1) - verbose("%d:", insn_idx); + verbose("%d:", env->insn_idx); else - verbose("\nfrom %d to %d:", - prev_insn_idx, insn_idx); - print_verifier_state(&env->cur_state); + verbose("\nfrom %d to %d%s:", + env->prev_insn_idx, env->insn_idx, + env->cur_state->speculative ? + " (speculative execution)" : ""); + print_verifier_state(env->cur_state); do_print_state = false; } if (log_level) { - verbose("%d: ", insn_idx); + verbose("%d: ", env->insn_idx); print_bpf_insn(env, insn); } - err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx); + err = ext_analyzer_insn_hook(env, env->insn_idx, env->prev_insn_idx); if (err) return err; - env->insn_aux_data[insn_idx].seen = true; + regs = cur_regs(env); + env->insn_aux_data[env->insn_idx].seen = true; if (class == BPF_ALU || class == BPF_ALU64) { err = check_alu_op(env, insn); if (err) @@ -3845,13 +4181,13 @@ static int do_check(struct bpf_verifier_env *env) /* check that memory (src_reg + off) is readable, * the state of dst_reg will be updated by this func */ - err = check_mem_access(env, insn_idx, insn->src_reg, insn->off, - BPF_SIZE(insn->code), BPF_READ, - insn->dst_reg, false); + err = check_mem_access(env, env->insn_idx, insn->src_reg, + insn->off, BPF_SIZE(insn->code), + BPF_READ, insn->dst_reg, false); if (err) return err; - prev_src_type = &env->insn_aux_data[insn_idx].ptr_type; + prev_src_type = &env->insn_aux_data[env->insn_idx].ptr_type; if (*prev_src_type == NOT_INIT) { /* saw a valid insn @@ -3878,10 +4214,10 @@ static int do_check(struct bpf_verifier_env *env) enum bpf_reg_type *prev_dst_type, dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { - err = check_xadd(env, insn_idx, insn); + err = check_xadd(env, env->insn_idx, insn); if (err) return err; - insn_idx++; + env->insn_idx++; continue; } @@ -3897,13 +4233,13 @@ static int do_check(struct bpf_verifier_env *env) dst_reg_type = regs[insn->dst_reg].type; /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, - insn->src_reg, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, insn->src_reg, false); if (err) return err; - prev_dst_type = &env->insn_aux_data[insn_idx].ptr_type; + prev_dst_type = &env->insn_aux_data[env->insn_idx].ptr_type; if (*prev_dst_type == NOT_INIT) { *prev_dst_type = dst_reg_type; @@ -3932,9 +4268,9 @@ static int do_check(struct bpf_verifier_env *env) } /* check that memory (dst_reg + off) is writeable */ - err = check_mem_access(env, insn_idx, insn->dst_reg, insn->off, - BPF_SIZE(insn->code), BPF_WRITE, - -1, false); + err = check_mem_access(env, env->insn_idx, insn->dst_reg, + insn->off, BPF_SIZE(insn->code), + BPF_WRITE, -1, false); if (err) return err; @@ -3950,7 +4286,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - err = check_call(env, insn->imm, insn_idx); + err = check_call(env, insn->imm, env->insn_idx); if (err) return err; @@ -3963,7 +4299,7 @@ static int do_check(struct bpf_verifier_env *env) return -EINVAL; } - insn_idx += insn->off + 1; + env->insn_idx += insn->off + 1; continue; } else if (opcode == BPF_EXIT) { @@ -3991,15 +4327,17 @@ static int do_check(struct bpf_verifier_env *env) } process_bpf_exit: - insn_idx = pop_stack(env, &prev_insn_idx); - if (insn_idx < 0) { + err = pop_stack(env, &env->prev_insn_idx, &env->insn_idx); + if (err < 0) { + if (err != -ENOENT) + return err; break; } else { do_print_state = true; continue; } } else { - err = check_cond_jmp_op(env, insn, &insn_idx); + err = check_cond_jmp_op(env, insn, &env->insn_idx); if (err) return err; } @@ -4016,8 +4354,8 @@ process_bpf_exit: if (err) return err; - insn_idx++; - env->insn_aux_data[insn_idx].seen = true; + env->insn_idx++; + env->insn_aux_data[env->insn_idx].seen = true; } else { verbose("invalid BPF_LD mode\n"); return -EINVAL; @@ -4027,7 +4365,7 @@ process_bpf_exit: return -EINVAL; } - insn_idx++; + env->insn_idx++; } verbose("processed %d insns, stack depth %d\n", @@ -4402,6 +4740,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) struct bpf_prog *new_prog; struct bpf_map *map_ptr; int i, cnt, delta = 0; + struct bpf_insn_aux_data *aux; for (i = 0; i < insn_cnt; i++, insn++) { if (insn->code == (BPF_ALU | BPF_MOD | BPF_X) || @@ -4422,6 +4761,58 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) continue; } + if (insn->code == (BPF_ALU64 | BPF_ADD | BPF_X) || + insn->code == (BPF_ALU64 | BPF_SUB | BPF_X)) { + const u8 code_add = BPF_ALU64 | BPF_ADD | BPF_X; + const u8 code_sub = BPF_ALU64 | BPF_SUB | BPF_X; + struct bpf_insn insn_buf[16]; + struct bpf_insn *patch = &insn_buf[0]; + bool issrc, isneg; + u32 off_reg; + + aux = &env->insn_aux_data[i + delta]; + if (!aux->alu_state || + aux->alu_state == BPF_ALU_NON_POINTER) + continue; + + isneg = aux->alu_state & BPF_ALU_NEG_VALUE; + issrc = (aux->alu_state & BPF_ALU_SANITIZE) == + BPF_ALU_SANITIZE_SRC; + + off_reg = issrc ? insn->src_reg : insn->dst_reg; + if (isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + *patch++ = BPF_MOV32_IMM(BPF_REG_AX, aux->alu_limit - 1); + *patch++ = BPF_ALU64_REG(BPF_SUB, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_REG(BPF_OR, BPF_REG_AX, off_reg); + *patch++ = BPF_ALU64_IMM(BPF_NEG, BPF_REG_AX, 0); + *patch++ = BPF_ALU64_IMM(BPF_ARSH, BPF_REG_AX, 63); + if (issrc) { + *patch++ = BPF_ALU64_REG(BPF_AND, BPF_REG_AX, + off_reg); + insn->src_reg = BPF_REG_AX; + } else { + *patch++ = BPF_ALU64_REG(BPF_AND, off_reg, + BPF_REG_AX); + } + if (isneg) + insn->code = insn->code == code_add ? + code_sub : code_add; + *patch++ = *insn; + if (issrc && isneg) + *patch++ = BPF_ALU64_IMM(BPF_MUL, off_reg, -1); + cnt = patch - insn_buf; + + new_prog = bpf_patch_insn_data(env, i + delta, insn_buf, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (insn->code != (BPF_JMP | BPF_CALL)) continue; @@ -4557,6 +4948,7 @@ static void free_states(struct bpf_verifier_env *env) if (sl) while (sl != STATE_LIST_MARK) { sln = sl->next; + free_verifier_state(&sl->state, false); kfree(sl); sl = sln; } @@ -4633,9 +5025,13 @@ int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); + if (env->cur_state) { + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + } skip_full_check: - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); free_states(env); if (ret == 0) @@ -4741,9 +5137,13 @@ int bpf_analyzer(struct bpf_prog *prog, const struct bpf_ext_analyzer_ops *ops, env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); ret = do_check(env); + if (env->cur_state) { + free_verifier_state(env->cur_state, true); + env->cur_state = NULL; + } skip_full_check: - while (pop_stack(env, NULL) >= 0); + while (!pop_stack(env, NULL, NULL)); free_states(env); mutex_unlock(&bpf_verifier_lock); diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 109c32c56de7..694b1cc8d144 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -187,7 +187,7 @@ static u64 css_serial_nr_next = 1; */ static u16 have_fork_callback __read_mostly; static u16 have_exit_callback __read_mostly; -static u16 have_free_callback __read_mostly; +static u16 have_release_callback __read_mostly; static u16 have_canfork_callback __read_mostly; /* cgroup namespace for init task */ @@ -1692,7 +1692,7 @@ static int parse_cgroup_root_flags(char *data, unsigned int *root_flags) *root_flags = 0; - if (!data) + if (!data || *data == '\0') return 0; while ((token = strsep(&data, ",")) != NULL) { @@ -1942,7 +1942,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, struct cgroup_namespace *ns) { struct dentry *dentry; - bool new_sb; + bool new_sb = false; dentry = kernfs_mount(fs_type, flags, root->kf_root, magic, &new_sb); @@ -1952,6 +1952,7 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, */ if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { struct dentry *nsdentry; + struct super_block *sb = dentry->d_sb; struct cgroup *cgrp; mutex_lock(&cgroup_mutex); @@ -1962,12 +1963,14 @@ struct dentry *cgroup_do_mount(struct file_system_type *fs_type, int flags, spin_unlock_irq(&css_set_lock); mutex_unlock(&cgroup_mutex); - nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); + nsdentry = kernfs_node_dentry(cgrp->kn, sb); dput(dentry); + if (IS_ERR(nsdentry)) + deactivate_locked_super(sb); dentry = nsdentry; } - if (IS_ERR(dentry) || !new_sb) + if (!new_sb) cgroup_put(&root->cgrp); return dentry; @@ -5109,7 +5112,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) have_fork_callback |= (bool)ss->fork << ss->id; have_exit_callback |= (bool)ss->exit << ss->id; - have_free_callback |= (bool)ss->free << ss->id; + have_release_callback |= (bool)ss->release << ss->id; have_canfork_callback |= (bool)ss->can_fork << ss->id; /* At system boot, before all subsystems have been @@ -5543,16 +5546,19 @@ void cgroup_exit(struct task_struct *tsk) } while_each_subsys_mask(); } -void cgroup_free(struct task_struct *task) +void cgroup_release(struct task_struct *task) { - struct css_set *cset = task_css_set(task); struct cgroup_subsys *ss; int ssid; - do_each_subsys_mask(ss, ssid, have_free_callback) { - ss->free(task); + do_each_subsys_mask(ss, ssid, have_release_callback) { + ss->release(task); } while_each_subsys_mask(); +} +void cgroup_free(struct task_struct *task) +{ + struct css_set *cset = task_css_set(task); put_css_set(cset); } diff --git a/kernel/cgroup/pids.c b/kernel/cgroup/pids.c index 9829c67ebc0a..c9960baaa14f 100644 --- a/kernel/cgroup/pids.c +++ b/kernel/cgroup/pids.c @@ -247,7 +247,7 @@ static void pids_cancel_fork(struct task_struct *task) pids_uncharge(pids, 1); } -static void pids_free(struct task_struct *task) +static void pids_release(struct task_struct *task) { struct pids_cgroup *pids = css_pids(task_css(task, pids_cgrp_id)); @@ -342,7 +342,7 @@ struct cgroup_subsys pids_cgrp_subsys = { .cancel_attach = pids_cancel_attach, .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, - .free = pids_free, + .release = pids_release, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, .threaded = true, diff --git a/kernel/cpu.c b/kernel/cpu.c index 5c907d96e3dd..8c350dd81581 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -314,6 +314,15 @@ void cpus_write_unlock(void) void lockdep_assert_cpus_held(void) { + /* + * We can't have hotplug operations before userspace starts running, + * and some init codepaths will knowingly not take the hotplug lock. + * This is all valid, so mute lockdep until it makes sense to report + * unheld locks. + */ + if (system_state < SYSTEM_RUNNING) + return; + percpu_rwsem_assert_held(&cpu_hotplug_lock); } @@ -356,9 +365,6 @@ void __weak arch_smt_update(void) { } #ifdef CONFIG_HOTPLUG_SMT enum cpuhp_smt_control cpu_smt_control __read_mostly = CPU_SMT_ENABLED; -EXPORT_SYMBOL_GPL(cpu_smt_control); - -static bool cpu_smt_available __read_mostly; void __init cpu_smt_disable(bool force) { @@ -376,25 +382,11 @@ void __init cpu_smt_disable(bool force) /* * The decision whether SMT is supported can only be done after the full - * CPU identification. Called from architecture code before non boot CPUs - * are brought up. - */ -void __init cpu_smt_check_topology_early(void) -{ - if (!topology_smt_supported()) - cpu_smt_control = CPU_SMT_NOT_SUPPORTED; -} - -/* - * If SMT was disabled by BIOS, detect it here, after the CPUs have been - * brought online. This ensures the smt/l1tf sysfs entries are consistent - * with reality. cpu_smt_available is set to true during the bringup of non - * boot CPUs when a SMT sibling is detected. Note, this may overwrite - * cpu_smt_control's previous setting. + * CPU identification. Called from architecture code. */ void __init cpu_smt_check_topology(void) { - if (!cpu_smt_available) + if (!topology_smt_supported()) cpu_smt_control = CPU_SMT_NOT_SUPPORTED; } @@ -407,18 +399,10 @@ early_param("nosmt", smt_cmdline_disable); static inline bool cpu_smt_allowed(unsigned int cpu) { - if (topology_is_primary_thread(cpu)) + if (cpu_smt_control == CPU_SMT_ENABLED) return true; - /* - * If the CPU is not a 'primary' thread and the booted_once bit is - * set then the processor has SMT support. Store this information - * for the late check of SMT support in cpu_smt_check_topology(). - */ - if (per_cpu(cpuhp_state, cpu).booted_once) - cpu_smt_available = true; - - if (cpu_smt_control == CPU_SMT_ENABLED) + if (topology_is_primary_thread(cpu)) return true; /* @@ -563,6 +547,20 @@ static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st) } } +static inline bool can_rollback_cpu(struct cpuhp_cpu_state *st) +{ + if (IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return true; + /* + * When CPU hotplug is disabled, then taking the CPU down is not + * possible because takedown_cpu() and the architecture and + * subsystem specific mechanisms are not available. So the CPU + * which would be completely unplugged again needs to stay around + * in the current state. + */ + return st->state <= CPUHP_BRINGUP_CPU; +} + static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, enum cpuhp_state target) { @@ -573,8 +571,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, st->state++; ret = cpuhp_invoke_callback(cpu, st->state, true, NULL, NULL); if (ret) { - st->target = prev_state; - undo_cpu_up(cpu, st); + if (can_rollback_cpu(st)) { + st->target = prev_state; + undo_cpu_up(cpu, st); + } break; } } diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 65c0f1363788..94aa9ae0007a 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -535,6 +535,8 @@ return_normal: arch_kgdb_ops.correct_hw_break(); if (trace_on) tracing_on(); + kgdb_info[cpu].debuggerinfo = NULL; + kgdb_info[cpu].task = NULL; kgdb_info[cpu].exception_state &= ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); kgdb_info[cpu].enter_kgdb--; @@ -667,6 +669,8 @@ kgdb_restore: if (trace_on) tracing_on(); + kgdb_info[cpu].debuggerinfo = NULL; + kgdb_info[cpu].task = NULL; kgdb_info[cpu].exception_state &= ~(DCPU_WANT_MASTER | DCPU_IS_SLAVE); kgdb_info[cpu].enter_kgdb--; diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index 7921ae4fca8d..7e2379aa0a1e 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -186,7 +186,16 @@ kdb_bt(int argc, const char **argv) kdb_printf("btc: cpu status: "); kdb_parse("cpu\n"); for_each_online_cpu(cpu) { - sprintf(buf, "btt 0x%px\n", KDB_TSK(cpu)); + void *kdb_tsk = KDB_TSK(cpu); + + /* If a CPU failed to round up we could be here */ + if (!kdb_tsk) { + kdb_printf("WARNING: no task for cpu %ld\n", + cpu); + continue; + } + + sprintf(buf, "btt 0x%px\n", kdb_tsk); kdb_parse(buf); touch_nmi_watchdog(); } diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c index 15e1a7af5dd0..53a0df6e4d92 100644 --- a/kernel/debug/kdb/kdb_debugger.c +++ b/kernel/debug/kdb/kdb_debugger.c @@ -118,13 +118,6 @@ int kdb_stub(struct kgdb_state *ks) kdb_bp_remove(); KDB_STATE_CLEAR(DOING_SS); KDB_STATE_SET(PAGER); - /* zero out any offline cpu data */ - for_each_present_cpu(i) { - if (!cpu_online(i)) { - kgdb_info[i].debuggerinfo = NULL; - kgdb_info[i].task = NULL; - } - } if (ks->err_code == DIE_OOPS || reason == KDB_REASON_OOPS) { ks->pass_exception = 1; KDB_FLAG_SET(CATASTROPHIC); diff --git a/kernel/events/core.c b/kernel/events/core.c index 991af683ef9e..580616e6fcee 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -436,18 +436,18 @@ int perf_proc_update_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - - if (ret || !write) - return ret; - + int ret; + int perf_cpu = sysctl_perf_cpu_time_max_percent; /* * If throttling is disabled don't allow the write: */ - if (sysctl_perf_cpu_time_max_percent == 100 || - sysctl_perf_cpu_time_max_percent == 0) + if (write && (perf_cpu == 100 || perf_cpu == 0)) return -EINVAL; + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + if (ret || !write) + return ret; + max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ); perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; update_perf_cpu_limits(); @@ -4738,6 +4738,11 @@ static void __perf_event_period(struct perf_event *event, } } +static int perf_event_check_period(struct perf_event *event, u64 value) +{ + return event->pmu->check_period(event, value); +} + static int perf_event_period(struct perf_event *event, u64 __user *arg) { u64 value; @@ -4754,6 +4759,9 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; + if (perf_event_check_period(event, value)) + return -EINVAL; + event_function_call(event, __perf_event_period, &value); return 0; @@ -6915,6 +6923,7 @@ static void perf_event_mmap_output(struct perf_event *event, struct perf_output_handle handle; struct perf_sample_data sample; int size = mmap_event->event_id.header.size; + u32 type = mmap_event->event_id.header.type; int ret; if (!perf_event_mmap_match(event, data)) @@ -6958,6 +6967,7 @@ static void perf_event_mmap_output(struct perf_event *event, perf_output_end(&handle); out: mmap_event->event_id.header.size = size; + mmap_event->event_id.header.type = type; } static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) @@ -8951,6 +8961,11 @@ static int perf_pmu_nop_int(struct pmu *pmu) return 0; } +static int perf_event_nop_int(struct perf_event *event, u64 value) +{ + return 0; +} + static DEFINE_PER_CPU(unsigned int, nop_txn_flags); static void perf_pmu_start_txn(struct pmu *pmu, unsigned int flags) @@ -9251,6 +9266,9 @@ got_cpu_context: pmu->pmu_disable = perf_pmu_nop_void; } + if (!pmu->check_period) + pmu->check_period = perf_event_nop_int; + if (!pmu->event_idx) pmu->event_idx = perf_event_idx_default; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index c573c7339223..489dc6b60053 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -719,6 +719,9 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) size = sizeof(struct ring_buffer); size += nr_pages * sizeof(void *); + if (order_base_2(size) >= PAGE_SHIFT+MAX_ORDER) + goto fail; + rb = kzalloc(size, GFP_KERNEL); if (!rb) goto fail; diff --git a/kernel/exit.c b/kernel/exit.c index 3aa01b74c1e3..95ce231ff5e2 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -218,6 +218,7 @@ repeat: } write_unlock_irq(&tasklist_lock); + cgroup_release(p); release_thread(p); call_rcu(&p->rcu, delayed_put_task_struct); @@ -306,7 +307,7 @@ void rcuwait_wake_up(struct rcuwait *w) * MB (A) MB (B) * [L] cond [L] tsk */ - smp_rmb(); /* (B) */ + smp_mb(); /* (B) */ /* * Avoid using task_rcu_dereference() magic as long as we are careful, diff --git a/kernel/futex.c b/kernel/futex.c index 046cd780d057..f2fa48c6c476 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1166,11 +1166,65 @@ out_error: return ret; } +static int handle_exit_race(u32 __user *uaddr, u32 uval, + struct task_struct *tsk) +{ + u32 uval2; + + /* + * If PF_EXITPIDONE is not yet set, then try again. + */ + if (tsk && !(tsk->flags & PF_EXITPIDONE)) + return -EAGAIN; + + /* + * Reread the user space value to handle the following situation: + * + * CPU0 CPU1 + * + * sys_exit() sys_futex() + * do_exit() futex_lock_pi() + * futex_lock_pi_atomic() + * exit_signals(tsk) No waiters: + * tsk->flags |= PF_EXITING; *uaddr == 0x00000PID + * mm_release(tsk) Set waiter bit + * exit_robust_list(tsk) { *uaddr = 0x80000PID; + * Set owner died attach_to_pi_owner() { + * *uaddr = 0xC0000000; tsk = get_task(PID); + * } if (!tsk->flags & PF_EXITING) { + * ... attach(); + * tsk->flags |= PF_EXITPIDONE; } else { + * if (!(tsk->flags & PF_EXITPIDONE)) + * return -EAGAIN; + * return -ESRCH; <--- FAIL + * } + * + * Returning ESRCH unconditionally is wrong here because the + * user space value has been changed by the exiting task. + * + * The same logic applies to the case where the exiting task is + * already gone. + */ + if (get_futex_value_locked(&uval2, uaddr)) + return -EFAULT; + + /* If the user space value has changed, try again. */ + if (uval2 != uval) + return -EAGAIN; + + /* + * The exiting task did not have a robust list, the robust list was + * corrupted or the user space value in *uaddr is simply bogus. + * Give up and tell user space. + */ + return -ESRCH; +} + /* * Lookup the task for the TID provided from user space and attach to * it after doing proper sanity checks. */ -static int attach_to_pi_owner(u32 uval, union futex_key *key, +static int attach_to_pi_owner(u32 __user *uaddr, u32 uval, union futex_key *key, struct futex_pi_state **ps) { pid_t pid = uval & FUTEX_TID_MASK; @@ -1180,12 +1234,15 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, /* * We are the first waiter - try to look up the real owner and attach * the new pi_state to it, but bail out when TID = 0 [1] + * + * The !pid check is paranoid. None of the call sites should end up + * with pid == 0, but better safe than sorry. Let the caller retry */ if (!pid) - return -ESRCH; + return -EAGAIN; p = futex_find_get_task(pid); if (!p) - return -ESRCH; + return handle_exit_race(uaddr, uval, NULL); if (unlikely(p->flags & PF_KTHREAD)) { put_task_struct(p); @@ -1205,7 +1262,7 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key, * set, we know that the task has finished the * cleanup: */ - int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN; + int ret = handle_exit_race(uaddr, uval, p); raw_spin_unlock_irq(&p->pi_lock); put_task_struct(p); @@ -1262,7 +1319,7 @@ static int lookup_pi_state(u32 __user *uaddr, u32 uval, * We are the first waiter - try to look up the owner based on * @uval and attach to it. */ - return attach_to_pi_owner(uval, key, ps); + return attach_to_pi_owner(uaddr, uval, key, ps); } static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval) @@ -1370,7 +1427,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, * attach to the owner. If that fails, no harm done, we only * set the FUTEX_WAITERS bit in the user space variable. */ - return attach_to_pi_owner(uval, key, ps); + return attach_to_pi_owner(uaddr, newval, key, ps); } /** @@ -1405,11 +1462,7 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n")) return; - /* - * Queue the task for later wakeup for after we've released - * the hb->lock. wake_q_add() grabs reference to p. - */ - wake_q_add(wake_q, p); + get_task_struct(p); __unqueue_futex(q); /* * The waiting task can free the futex_q as soon as q->lock_ptr = NULL @@ -1419,6 +1472,13 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q) * plist_del in __unqueue_futex(). */ smp_store_release(&q->lock_ptr, NULL); + + /* + * Queue the task for later wakeup for after we've released + * the hb->lock. wake_q_add() grabs reference to p. + */ + wake_q_add(wake_q, p); + put_task_struct(p); } /* @@ -2811,35 +2871,39 @@ retry_private: * and BUG when futex_unlock_pi() interleaves with this. * * Therefore acquire wait_lock while holding hb->lock, but drop the - * latter before calling rt_mutex_start_proxy_lock(). This still fully - * serializes against futex_unlock_pi() as that does the exact same - * lock handoff sequence. + * latter before calling __rt_mutex_start_proxy_lock(). This + * interleaves with futex_unlock_pi() -- which does a similar lock + * handoff -- such that the latter can observe the futex_q::pi_state + * before __rt_mutex_start_proxy_lock() is done. */ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); spin_unlock(q.lock_ptr); + /* + * __rt_mutex_start_proxy_lock() unconditionally enqueues the @rt_waiter + * such that futex_unlock_pi() is guaranteed to observe the waiter when + * it sees the futex_q::pi_state. + */ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); if (ret) { if (ret == 1) ret = 0; - - spin_lock(q.lock_ptr); - goto no_block; + goto cleanup; } - if (unlikely(to)) hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter); +cleanup: spin_lock(q.lock_ptr); /* - * If we failed to acquire the lock (signal/timeout), we must + * If we failed to acquire the lock (deadlock/signal/timeout), we must * first acquire the hb->lock before removing the lock from the - * rt_mutex waitqueue, such that we can keep the hb and rt_mutex - * wait lists consistent. + * rt_mutex waitqueue, such that we can keep the hb and rt_mutex wait + * lists consistent. * * In particular; it is important that futex_unlock_pi() can not * observe this inconsistency. @@ -2963,6 +3027,10 @@ retry: * there is no point where we hold neither; and therefore * wake_futex_pi() must observe a state consistent with what we * observed. + * + * In particular; this forces __rt_mutex_start_proxy() to + * complete such that we're guaranteed to observe the + * rt_waiter. Also see the WARN in wake_futex_pi(). */ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); spin_unlock(&hb->lock); @@ -3382,6 +3450,10 @@ int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) { u32 uval, uninitialized_var(nval), mval; + /* Futex address must be 32bit aligned */ + if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) + return -1; + retry: if (get_user(uval, uaddr)) return -1; diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 32b479468e4d..2e4869fa66c9 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -15,6 +15,7 @@ #include <linux/lockdep.h> #include <linux/export.h> #include <linux/sysctl.h> +#include <linux/suspend.h> #include <linux/utsname.h> #include <linux/sched/signal.h> #include <linux/sched/debug.h> @@ -33,7 +34,7 @@ int __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT; * is disabled during the critical section. It also controls the size of * the RCU grace period. So it needs to be upper-bound. */ -#define HUNG_TASK_BATCHING 1024 +#define HUNG_TASK_LOCK_BREAK (HZ / 10) /* * Zero means infinite timeout - no checking done: @@ -103,8 +104,11 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) trace_sched_process_hang(t); - if (!sysctl_hung_task_warnings && !sysctl_hung_task_panic) - return; + if (sysctl_hung_task_panic) { + console_verbose(); + hung_task_show_lock = true; + hung_task_call_panic = true; + } /* * Ok, the task did not get scheduled for more than 2 minutes, @@ -126,11 +130,6 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) } touch_nmi_watchdog(); - - if (sysctl_hung_task_panic) { - hung_task_show_lock = true; - hung_task_call_panic = true; - } } /* @@ -164,7 +163,7 @@ static bool rcu_lock_break(struct task_struct *g, struct task_struct *t) static void check_hung_uninterruptible_tasks(unsigned long timeout) { int max_count = sysctl_hung_task_check_count; - int batch_count = HUNG_TASK_BATCHING; + unsigned long last_break = jiffies; struct task_struct *g, *t; /* @@ -179,10 +178,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) for_each_process_thread(g, t) { if (!max_count--) goto unlock; - if (!--batch_count) { - batch_count = HUNG_TASK_BATCHING; + if (time_after(jiffies, last_break + HUNG_TASK_LOCK_BREAK)) { if (!rcu_lock_break(g, t)) goto unlock; + last_break = jiffies; } /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */ if (t->state == TASK_UNINTERRUPTIBLE) @@ -234,6 +233,28 @@ void reset_hung_task_detector(void) } EXPORT_SYMBOL_GPL(reset_hung_task_detector); +static bool hung_detector_suspended; + +static int hungtask_pm_notify(struct notifier_block *self, + unsigned long action, void *hcpu) +{ + switch (action) { + case PM_SUSPEND_PREPARE: + case PM_HIBERNATION_PREPARE: + case PM_RESTORE_PREPARE: + hung_detector_suspended = true; + break; + case PM_POST_SUSPEND: + case PM_POST_HIBERNATION: + case PM_POST_RESTORE: + hung_detector_suspended = false; + break; + default: + break; + } + return NOTIFY_OK; +} + /* * kthread which checks for tasks stuck in D state */ @@ -248,7 +269,8 @@ static int watchdog(void *dummy) long t = hung_timeout_jiffies(hung_last_checked, timeout); if (t <= 0) { - if (!atomic_xchg(&reset_hung_task, 0)) + if (!atomic_xchg(&reset_hung_task, 0) && + !hung_detector_suspended) check_hung_uninterruptible_tasks(timeout); hung_last_checked = jiffies; continue; @@ -262,6 +284,10 @@ static int watchdog(void *dummy) static int __init hung_task_init(void) { atomic_notifier_chain_register(&panic_notifier_list, &panic_block); + + /* Disable hung task detector on suspend */ + pm_notifier(hungtask_pm_notify, 0); + watchdog_task = kthread_run(watchdog, NULL, "khungtaskd"); return 0; diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 5a2ef92c2782..317fc759de76 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -834,7 +834,11 @@ void handle_percpu_irq(struct irq_desc *desc) { struct irq_chip *chip = irq_desc_get_chip(desc); - kstat_incr_irqs_this_cpu(desc); + /* + * PER CPU interrupts are not serialized. Do not touch + * desc->tot_count. + */ + __kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); @@ -863,7 +867,11 @@ void handle_percpu_devid_irq(struct irq_desc *desc) unsigned int irq = irq_desc_get_irq(desc); irqreturn_t res; - kstat_incr_irqs_this_cpu(desc); + /* + * PER CPU interrupts are not serialized. Do not touch + * desc->tot_count. + */ + __kstat_incr_irqs_this_cpu(desc); if (chip->irq_ack) chip->irq_ack(&desc->irq_data); @@ -1355,6 +1363,10 @@ int irq_chip_set_vcpu_affinity_parent(struct irq_data *data, void *vcpu_info) int irq_chip_set_wake_parent(struct irq_data *data, unsigned int on) { data = data->parent_data; + + if (data->chip->flags & IRQCHIP_SKIP_SET_WAKE) + return 0; + if (data->chip->irq_set_wake) return data->chip->irq_set_wake(data, on); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 44ed5f8c8759..4ef7f3b820ce 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -240,12 +240,18 @@ static inline void irq_state_set_masked(struct irq_desc *desc) #undef __irqd_to_state -static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) +static inline void __kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __this_cpu_inc(*desc->kstat_irqs); __this_cpu_inc(kstat.irqs_sum); } +static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) +{ + __kstat_incr_irqs_this_cpu(desc); + desc->tot_count++; +} + static inline int irq_desc_get_node(struct irq_desc *desc) { return irq_common_data_get_node(&desc->irq_common_data); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index e97bbae947f0..aa08d4184608 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -119,6 +119,7 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node, desc->depth = 1; desc->irq_count = 0; desc->irqs_unhandled = 0; + desc->tot_count = 0; desc->name = NULL; desc->owner = owner; for_each_possible_cpu(cpu) @@ -534,6 +535,7 @@ int __init early_irq_init(void) alloc_masks(&desc[i], node); raw_spin_lock_init(&desc[i].lock); lockdep_set_class(&desc[i].lock, &irq_desc_lock_class); + mutex_init(&desc[i].request_mutex); desc_set_defaults(i, &desc[i], node, NULL, NULL); } return arch_early_irq_init(); @@ -895,11 +897,15 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) unsigned int kstat_irqs(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); - int cpu; unsigned int sum = 0; + int cpu; if (!desc || !desc->kstat_irqs) return 0; + if (!irq_settings_is_per_cpu_devid(desc) && + !irq_settings_is_per_cpu(desc)) + return desc->tot_count; + for_each_possible_cpu(cpu) sum += *per_cpu_ptr(desc->kstat_irqs, cpu); return sum; diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 4cd85870f00e..6c877d28838f 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -360,6 +360,9 @@ int irq_setup_affinity(struct irq_desc *desc) } cpumask_and(&mask, cpu_online_mask, set); + if (cpumask_empty(&mask)) + cpumask_copy(&mask, cpu_online_mask); + if (node != NUMA_NO_NODE) { const struct cpumask *nodemask = cpumask_of_node(node); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5cbad4fb9107..ec11bb986a8b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -703,7 +703,6 @@ static void unoptimize_kprobe(struct kprobe *p, bool force) static int reuse_unused_kprobe(struct kprobe *ap) { struct optimized_kprobe *op; - int ret; BUG_ON(!kprobe_unused(ap)); /* @@ -717,9 +716,8 @@ static int reuse_unused_kprobe(struct kprobe *ap) /* Enable the probe again */ ap->flags &= ~KPROBE_FLAG_DISABLED; /* Optimize it again (remove from op->list) */ - ret = kprobe_optready(ap); - if (ret) - return ret; + if (!kprobe_optready(ap)) + return -EINVAL; optimize_kprobe(ap); return 0; diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 4ad35718f123..71c554a9e17f 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1726,12 +1726,33 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, rt_mutex_set_owner(lock, NULL); } +/** + * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * + * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock + * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. + * + * NOTE: does _NOT_ remove the @waiter on failure; must either call + * rt_mutex_wait_proxy_lock() or rt_mutex_cleanup_proxy_lock() after this. + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for PI-futex support. + */ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task) { int ret; + lockdep_assert_held(&lock->wait_lock); + if (try_to_take_rt_mutex(lock, task, NULL)) return 1; @@ -1749,9 +1770,6 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, ret = 0; } - if (unlikely(ret)) - remove_waiter(lock, waiter); - debug_rt_mutex_print_deadlock(waiter); return ret; @@ -1763,12 +1781,18 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, * @waiter: the pre-initialized rt_mutex_waiter * @task: the task to prepare * + * Starts the rt_mutex acquire; it enqueues the @waiter and does deadlock + * detection. It does not wait, see rt_mutex_wait_proxy_lock() for that. + * + * NOTE: unlike __rt_mutex_start_proxy_lock this _DOES_ remove the @waiter + * on failure. + * * Returns: * 0 - task blocked on lock * 1 - acquired the lock for task, caller should wake it up * <0 - error * - * Special API call for FUTEX_REQUEUE_PI support. + * Special API call for PI-futex support. */ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, @@ -1778,6 +1802,8 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, raw_spin_lock_irq(&lock->wait_lock); ret = __rt_mutex_start_proxy_lock(lock, waiter, task); + if (unlikely(ret)) + remove_waiter(lock, waiter); raw_spin_unlock_irq(&lock->wait_lock); return ret; @@ -1845,7 +1871,8 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, * @lock: the rt_mutex we were woken on * @waiter: the pre-initialized rt_mutex_waiter * - * Attempt to clean up after a failed rt_mutex_wait_proxy_lock(). + * Attempt to clean up after a failed __rt_mutex_start_proxy_lock() or + * rt_mutex_wait_proxy_lock(). * * Unless we acquired the lock; we're still enqueued on the wait-list and can * in fact still be granted ownership until we're removed. Therefore we can diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index a90336779375..c75017326c37 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -198,15 +198,22 @@ static void __rwsem_mark_wake(struct rw_semaphore *sem, woken++; tsk = waiter->task; - wake_q_add(wake_q, tsk); + get_task_struct(tsk); list_del(&waiter->list); /* - * Ensure that the last operation is setting the reader + * Ensure calling get_task_struct() before setting the reader * waiter to nil such that rwsem_down_read_failed() cannot * race with do_exit() by always holding a reference count * to the task to wakeup. */ smp_store_release(&waiter->task, NULL); + /* + * Ensure issuing the wakeup (either by us or someone else) + * after setting the reader waiter to nil. + */ + wake_q_add(wake_q, tsk); + /* wake_q_add() already take the task ref */ + put_task_struct(tsk); } adjustment = woken * RWSEM_ACTIVE_READ_BIAS - adjustment; diff --git a/kernel/module.c b/kernel/module.c index 2a44c515f0d7..94528b891027 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1201,8 +1201,10 @@ static ssize_t store_uevent(struct module_attribute *mattr, struct module_kobject *mk, const char *buffer, size_t count) { - kobject_synth_uevent(&mk->kobj, buffer, count); - return count; + int rc; + + rc = kobject_synth_uevent(&mk->kobj, buffer, count); + return rc ? rc : count; } struct module_attribute module_uevent = diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 84b1367935e4..f1c85b6c39ae 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -29,6 +29,7 @@ #include <linux/hw_breakpoint.h> #include <linux/cn_proc.h> #include <linux/compat.h> +#include <linux/sched/signal.h> /* * Access another process' address space via ptrace. @@ -925,18 +926,26 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_setsiginfo(child, &siginfo); break; - case PTRACE_GETSIGMASK: + case PTRACE_GETSIGMASK: { + sigset_t *mask; + if (addr != sizeof(sigset_t)) { ret = -EINVAL; break; } - if (copy_to_user(datavp, &child->blocked, sizeof(sigset_t))) + if (test_tsk_restore_sigmask(child)) + mask = &child->saved_sigmask; + else + mask = &child->blocked; + + if (copy_to_user(datavp, mask, sizeof(sigset_t))) ret = -EFAULT; else ret = 0; break; + } case PTRACE_SETSIGMASK: { sigset_t new_set; @@ -962,6 +971,8 @@ int ptrace_request(struct task_struct *child, long request, child->blocked = new_set; spin_unlock_irq(&child->sighand->siglock); + clear_tsk_restore_sigmask(child); + ret = 0; break; } diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 710ce1d6b982..fb051fa99b67 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1789,15 +1789,23 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) } /* - * Awaken the grace-period kthread for the specified flavor of RCU. - * Don't do a self-awaken, and don't bother awakening when there is - * nothing for the grace-period kthread to do (as in several CPUs - * raced to awaken, and we lost), and finally don't try to awaken - * a kthread that has not yet been created. + * Awaken the grace-period kthread. Don't do a self-awaken (unless in + * an interrupt or softirq handler), and don't bother awakening when there + * is nothing for the grace-period kthread to do (as in several CPUs raced + * to awaken, and we lost), and finally don't try to awaken a kthread that + * has not yet been created. If all those checks are passed, track some + * debug information and awaken. + * + * So why do the self-wakeup when in an interrupt or softirq handler + * in the grace-period kthread's context? Because the kthread might have + * been interrupted just as it was going to sleep, and just after the final + * pre-sleep check of the awaken condition. In this case, a wakeup really + * is required, and is therefore supplied. */ static void rcu_gp_kthread_wake(struct rcu_state *rsp) { - if (current == rsp->gp_kthread || + if ((current == rsp->gp_kthread && + !in_interrupt() && !in_serving_softirq()) || !READ_ONCE(rsp->gp_flags) || !rsp->gp_kthread) return; diff --git a/kernel/relay.c b/kernel/relay.c index 1537158c67b3..61d37e6da22d 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -427,6 +427,8 @@ static struct dentry *relay_create_buf_file(struct rchan *chan, dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR, buf, &chan->is_global); + if (IS_ERR(dentry)) + dentry = NULL; kfree(tmpname); @@ -460,7 +462,7 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu) dentry = chan->cb->create_buf_file(NULL, NULL, S_IRUSR, buf, &chan->is_global); - if (WARN_ON(dentry)) + if (IS_ERR_OR_NULL(dentry)) goto free_buf; } diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 6d689c039189..d0d39f714f47 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -605,10 +605,9 @@ fail: stop_kthread: sugov_kthread_stop(sg_policy); - -free_sg_policy: mutex_unlock(&global_tunables_lock); +free_sg_policy: sugov_policy_free(sg_policy); disable_fast_switch: diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b2589c7e9439..22770168bff8 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -217,7 +217,6 @@ static void task_non_contending(struct task_struct *p) if (dl_se->dl_runtime == 0) return; - WARN_ON(hrtimer_active(&dl_se->inactive_timer)); WARN_ON(dl_se->dl_non_contending); zerolag_time = dl_se->deadline - @@ -234,7 +233,7 @@ static void task_non_contending(struct task_struct *p) * If the "0-lag time" already passed, decrease the active * utilization now, instead of starting a timer */ - if (zerolag_time < 0) { + if ((zerolag_time < 0) || hrtimer_active(&dl_se->inactive_timer)) { if (dl_task(p)) sub_running_bw(dl_se->dl_bw, dl_rq); if (!dl_task(p) || p->state == TASK_DEAD) { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 2f93e4a2d9f6..187c04a34ba1 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -339,6 +339,7 @@ void register_sched_domain_sysctl(void) { static struct ctl_table *cpu_entries; static struct ctl_table **cpu_idx; + static bool init_done = false; char buf[32]; int i; @@ -368,7 +369,10 @@ void register_sched_domain_sysctl(void) if (!cpumask_available(sd_sysctl_cpus)) { if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) return; + } + if (!init_done) { + init_done = true; /* init to possible to not have holes in @cpu_entries */ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index f33b24080b1c..af7de1f9906c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2026,6 +2026,10 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period) if (p->last_task_numa_placement) { delta = runtime - p->last_sum_exec_runtime; *period = now - p->last_task_numa_placement; + + /* Avoid time going backwards, prevent potential divide error: */ + if (unlikely((s64)*period < 0)) + *period = 0; } else { delta = p->se.avg.load_sum / p->se.load.weight; *period = LOAD_AVG_MAX; @@ -4672,12 +4676,15 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) return HRTIMER_NORESTART; } +extern const u64 max_cfs_quota_period; + static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, period_timer); int overrun; int idle = 0; + int count = 0; raw_spin_lock(&cfs_b->lock); for (;;) { @@ -4685,6 +4692,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (!overrun) break; + if (++count > 3) { + u64 new, old = ktime_to_ns(cfs_b->period); + + new = (old * 147) / 128; /* ~115% */ + new = min(new, max_cfs_quota_period); + + cfs_b->period = ns_to_ktime(new); + + /* since max is 1s, this is limited to 1e9^2, which fits in u64 */ + cfs_b->quota *= new; + cfs_b->quota = div64_u64(cfs_b->quota, old); + + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(new, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + + /* reset count so we don't come right back in here */ + count = 0; + } + idle = do_sched_cfs_period_timer(cfs_b, overrun); } if (idle) @@ -5651,6 +5680,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) #ifdef CONFIG_SCHED_SMT DEFINE_STATIC_KEY_FALSE(sched_smt_present); +EXPORT_SYMBOL_GPL(sched_smt_present); static inline void set_idle_cores(int cpu, int val) { @@ -7017,10 +7047,10 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) if (cfs_rq->last_h_load_update == now) return; - cfs_rq->h_load_next = NULL; + WRITE_ONCE(cfs_rq->h_load_next, NULL); for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - cfs_rq->h_load_next = se; + WRITE_ONCE(cfs_rq->h_load_next, se); if (cfs_rq->last_h_load_update == now) break; } @@ -7030,7 +7060,7 @@ static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq) cfs_rq->last_h_load_update = now; } - while ((se = cfs_rq->h_load_next) != NULL) { + while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) { load = cfs_rq->h_load; load = div64_ul(load * se->avg.load_avg, cfs_rq_load_avg(cfs_rq) + 1); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 659e075ef70b..9dcd80ed9d4c 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -499,7 +499,7 @@ static int __init isolated_cpu_setup(char *str) __setup("isolcpus=", isolated_cpu_setup); struct s_data { - struct sched_domain ** __percpu sd; + struct sched_domain * __percpu *sd; struct root_domain *rd; }; diff --git a/kernel/signal.c b/kernel/signal.c index 164c36ef0825..619c6160f64f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -672,6 +672,48 @@ void signal_wake_up_state(struct task_struct *t, unsigned int state) kick_process(t); } +static int dequeue_synchronous_signal(siginfo_t *info) +{ + struct task_struct *tsk = current; + struct sigpending *pending = &tsk->pending; + struct sigqueue *q, *sync = NULL; + + /* + * Might a synchronous signal be in the queue? + */ + if (!((pending->signal.sig[0] & ~tsk->blocked.sig[0]) & SYNCHRONOUS_MASK)) + return 0; + + /* + * Return the first synchronous signal in the queue. + */ + list_for_each_entry(q, &pending->list, list) { + /* Synchronous signals have a postive si_code */ + if ((q->info.si_code > SI_USER) && + (sigmask(q->info.si_signo) & SYNCHRONOUS_MASK)) { + sync = q; + goto next; + } + } + return 0; +next: + /* + * Check if there is another siginfo for the same signal. + */ + list_for_each_entry_continue(q, &pending->list, list) { + if (q->info.si_signo == sync->info.si_signo) + goto still_pending; + } + + sigdelset(&pending->signal, sync->info.si_signo); + recalc_sigpending(); +still_pending: + list_del_init(&sync->list); + copy_siginfo(info, &sync->info); + __sigqueue_free(sync); + return info->si_signo; +} + /* * Remove signals in mask from the pending set and queue. * Returns 1 if any signals were found. @@ -2225,6 +2267,14 @@ relock: goto relock; } + /* Has this task already been marked for death? */ + if (signal_group_exit(signal)) { + ksig->info.si_signo = signr = SIGKILL; + sigdelset(¤t->pending.signal, SIGKILL); + recalc_sigpending(); + goto fatal; + } + for (;;) { struct k_sigaction *ka; @@ -2238,7 +2288,15 @@ relock: goto relock; } - signr = dequeue_signal(current, ¤t->blocked, &ksig->info); + /* + * Signals generated by the execution of an instruction + * need to be delivered before any other pending signals + * so that the instruction pointer in the signal stack + * frame points to the faulting instruction. + */ + signr = dequeue_synchronous_signal(&ksig->info); + if (!signr) + signr = dequeue_signal(current, ¤t->blocked, &ksig->info); if (!signr) break; /* will return 0 */ @@ -2320,6 +2378,7 @@ relock: continue; } + fatal: spin_unlock_irq(&sighand->siglock); /* diff --git a/kernel/smp.c b/kernel/smp.c index 2d1da290f144..c94dd85c8d41 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -584,8 +584,6 @@ void __init smp_init(void) num_nodes, (num_nodes > 1 ? "s" : ""), num_cpus, (num_cpus > 1 ? "s" : "")); - /* Final decision about SMT support */ - cpu_smt_check_topology(); /* Any cleanup work */ smp_cpus_done(setup_max_cpus); } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d330b1ce3b94..f13601a616ad 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -124,7 +124,9 @@ static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; static int __maybe_unused four = 4; +static unsigned long zero_ul; static unsigned long one_ul = 1; +static unsigned long long_max = LONG_MAX; static int one_hundred = 100; static int one_thousand = 1000; #ifdef CONFIG_PRINTK @@ -1681,6 +1683,8 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, + .extra1 = &zero_ul, + .extra2 = &long_max, }, { .procname = "nr_open", @@ -2530,7 +2534,16 @@ static int do_proc_dointvec_minmax_conv(bool *negp, unsigned long *lvalp, { struct do_proc_dointvec_minmax_conv_param *param = data; if (write) { - int val = *negp ? -*lvalp : *lvalp; + int val; + if (*negp) { + if (*lvalp > (unsigned long) INT_MAX + 1) + return -EINVAL; + val = -*lvalp; + } else { + if (*lvalp > (unsigned long) INT_MAX) + return -EINVAL; + val = *lvalp; + } if ((param->min && *param->min > val) || (param->max && *param->max < val)) return -EINVAL; @@ -2708,6 +2721,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int bool neg; left -= proc_skip_spaces(&p); + if (!left) + break; err = proc_get_long(&p, &left, &val, &neg, proc_wspace_sep, diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index fa5de5e8de61..fdeb9bc6affb 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -597,7 +597,7 @@ static ktime_t alarm_timer_remaining(struct k_itimer *timr, ktime_t now) { struct alarm *alarm = &timr->it.alarm.alarmtimer; - return ktime_sub(now, alarm->node.expires); + return ktime_sub(alarm->node.expires, now); } /** diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2cafb49aa65e..1ce7c404d0b0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -41,7 +41,9 @@ static struct { seqcount_t seq; struct timekeeper timekeeper; -} tk_core ____cacheline_aligned; +} tk_core ____cacheline_aligned = { + .seq = SEQCNT_ZERO(tk_core.seq), +}; static DEFINE_RAW_SPINLOCK(timekeeper_lock); static struct timekeeper shadow_timekeeper; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9937d7cf2a64..3e92852c8b23 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -33,6 +33,7 @@ #include <linux/list.h> #include <linux/hash.h> #include <linux/rcupdate.h> +#include <linux/kprobes.h> #include <trace/events/sched.h> @@ -6035,7 +6036,7 @@ void ftrace_reset_array_ops(struct trace_array *tr) tr->ops->func = ftrace_stub; } -static inline void +static nokprobe_inline void __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) { @@ -6098,11 +6099,13 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, { __ftrace_ops_list_func(ip, parent_ip, NULL, regs); } +NOKPROBE_SYMBOL(ftrace_ops_list_func); #else static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) { __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); } +NOKPROBE_SYMBOL(ftrace_ops_no_ops); #endif /* @@ -6132,6 +6135,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, preempt_enable_notrace(); trace_clear_recursion(bit); } +NOKPROBE_SYMBOL(ftrace_ops_assist_func); /** * ftrace_ops_get_func - get the function a trampoline should call diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a1d5e0949dcf..8123a8b53c54 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -700,7 +700,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) preempt_disable_notrace(); time = rb_time_stamp(buffer); - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); return time; } @@ -4010,6 +4010,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * ring_buffer_read_prepare - Prepare for a non consuming read of the buffer * @buffer: The ring buffer to read from * @cpu: The cpu buffer to iterate over + * @flags: gfp flags to use for memory allocation * * This performs the initial preparations necessary to iterate * through the buffer. Memory is allocated, buffer recording @@ -4027,7 +4028,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_consume); * This overall must be paired with ring_buffer_read_finish. */ struct ring_buffer_iter * -ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) +ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu, gfp_t flags) { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_iter *iter; @@ -4035,7 +4036,7 @@ ring_buffer_read_prepare(struct ring_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return NULL; - iter = kmalloc(sizeof(*iter), GFP_KERNEL); + iter = kmalloc(sizeof(*iter), flags); if (!iter) return NULL; diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index e9cbb96cd99e..ace0e8f6f2b4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -494,8 +494,10 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, * not modified. */ pid_list = kmalloc(sizeof(*pid_list), GFP_KERNEL); - if (!pid_list) + if (!pid_list) { + trace_parser_put(&parser); return -ENOMEM; + } pid_list->pid_max = READ_ONCE(pid_max); @@ -505,6 +507,7 @@ int trace_pid_write(struct trace_pid_list *filtered_pids, pid_list->pids = vzalloc((pid_list->pid_max + 7) >> 3); if (!pid_list->pids) { + trace_parser_put(&parser); kfree(pid_list); return -ENOMEM; } @@ -3381,6 +3384,8 @@ static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file const char tgid_space[] = " "; const char space[] = " "; + print_event_info(buf, m); + seq_printf(m, "# %s _-----=> irqs-off\n", tgid ? tgid_space : space); seq_printf(m, "# %s / _----=> need-resched\n", @@ -3899,7 +3904,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, + cpu, GFP_KERNEL); } ring_buffer_read_prepare_sync(); for_each_tracing_cpu(cpu) { @@ -3909,7 +3915,8 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, + cpu, GFP_KERNEL); ring_buffer_read_prepare_sync(); ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); @@ -5602,7 +5609,6 @@ out: return ret; fail: - kfree(iter->trace); kfree(iter); __trace_array_put(tr); mutex_unlock(&trace_types_lock); @@ -6713,28 +6719,36 @@ struct buffer_ref { struct ring_buffer *buffer; void *page; int cpu; - int ref; + refcount_t refcount; }; +static void buffer_ref_release(struct buffer_ref *ref) +{ + if (!refcount_dec_and_test(&ref->refcount)) + return; + ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); + kfree(ref); +} + static void buffer_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; - if (--ref->ref) - return; - - ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); - kfree(ref); + buffer_ref_release(ref); buf->private = 0; } -static void buffer_pipe_buf_get(struct pipe_inode_info *pipe, +static bool buffer_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct buffer_ref *ref = (struct buffer_ref *)buf->private; - ref->ref++; + if (refcount_read(&ref->refcount) > INT_MAX/2) + return false; + + refcount_inc(&ref->refcount); + return true; } /* Pipe buffer operations for a buffer. */ @@ -6742,7 +6756,7 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = { .can_merge = 0, .confirm = generic_pipe_buf_confirm, .release = buffer_pipe_buf_release, - .steal = generic_pipe_buf_steal, + .steal = generic_pipe_buf_nosteal, .get = buffer_pipe_buf_get, }; @@ -6755,11 +6769,7 @@ static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i) struct buffer_ref *ref = (struct buffer_ref *)spd->partial[i].private; - if (--ref->ref) - return; - - ring_buffer_free_read_page(ref->buffer, ref->cpu, ref->page); - kfree(ref); + buffer_ref_release(ref); spd->partial[i].private = 0; } @@ -6814,7 +6824,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; } - ref->ref = 1; + refcount_set(&ref->refcount, 1); ref->buffer = iter->trace_buffer->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (IS_ERR(ref->page)) { diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 7eb975a2d0e1..e8c9eba9b1e7 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -872,9 +872,10 @@ static inline void add_to_key(char *compound_key, void *key, /* ensure NULL-termination */ if (size > key_field->size - 1) size = key_field->size - 1; - } - memcpy(compound_key + key_field->offset, key, size); + strncpy(compound_key + key_field->offset, (char *)key, size); + } else + memcpy(compound_key + key_field->offset, key, size); } static void event_hist_trigger(struct event_trigger_data *data, void *rec) diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index d953c163a079..810d78a8d14c 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -51,14 +51,16 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu); + ring_buffer_read_prepare(iter.trace_buffer->buffer, + cpu, GFP_ATOMIC); ring_buffer_read_start(iter.buffer_iter[cpu]); tracing_iter_reset(&iter, cpu); } } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file); + ring_buffer_read_prepare(iter.trace_buffer->buffer, + cpu_file, GFP_ATOMIC); ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index ea0d90a31fc9..fdf2ea4d64ec 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -17,7 +17,7 @@ * Copyright (C) IBM Corporation, 2010-2012 * Author: Srikar Dronamraju <srikar@linux.vnet.ibm.com> */ -#define pr_fmt(fmt) "trace_kprobe: " fmt +#define pr_fmt(fmt) "trace_uprobe: " fmt #include <linux/module.h> #include <linux/uaccess.h> @@ -153,7 +153,14 @@ static void FETCH_FUNC_NAME(memory, string)(struct pt_regs *regs, ret = strncpy_from_user(dst, src, maxlen); if (ret == maxlen) - dst[--ret] = '\0'; + dst[ret - 1] = '\0'; + else if (ret >= 0) + /* + * Include the terminating null byte. In this case it + * was copied by strncpy_from_user but not accounted + * for in ret. + */ + ret++; if (ret < 0) { /* Failed to fetch string */ ((u8 *)get_rloc_data(dest))[0] = '\0'; |