author | Stefan Agner <stefan@agner.ch> | 2016-09-16 13:17:38 -0700
---|---|---
committer | Max Krummenacher <max.krummenacher@toradex.com> | 2017-03-15 18:35:47 +0100
commit | 6819e8267b359913b95ccc822ac267454f39a460 (patch) |
tree | 13ccc08839313536152b2c46fc9d3665c9e31db4 /kernel |
parent | 440ef98ad687b210dca17e5c5141b24738f04922 (diff) |
parent | 3b60b86aec06fbae1142ccc4e55b39b529ae2a25 (diff) |
Merge tag 'v4.1.32' into toradex_vf_4.1-next
Linux 4.1.32
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/auditsc.c | 333
-rw-r--r-- | kernel/bpf/verifier.c | 2
-rw-r--r-- | kernel/cgroup.c | 13
-rw-r--r-- | kernel/events/ring_buffer.c | 10
-rw-r--r-- | kernel/events/uprobes.c | 5
-rw-r--r-- | kernel/exit.c | 29
-rw-r--r-- | kernel/futex.c | 2
-rw-r--r-- | kernel/locking/mutex.c | 9
-rw-r--r-- | kernel/module.c | 13
-rw-r--r-- | kernel/sched/core.c | 6
-rw-r--r-- | kernel/sched/proc.c | 11
-rw-r--r-- | kernel/sysctl.c | 14
-rw-r--r-- | kernel/time/posix-cpu-timers.c | 1
-rw-r--r-- | kernel/time/timekeeping.c | 5
-rw-r--r-- | kernel/time/timekeeping_debug.c | 9
-rw-r--r-- | kernel/trace/ring_buffer.c | 80
-rw-r--r-- | kernel/trace/trace_events.c | 9
-rw-r--r-- | kernel/trace/trace_printk.c | 7
-rw-r--r-- | kernel/workqueue.c | 40
19 files changed, 350 insertions, 248 deletions
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9fb9d1cb83ce..b1943039aab6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -73,6 +73,7 @@
 #include <linux/compat.h>
 #include <linux/ctype.h>
 #include <linux/string.h>
+#include <linux/uaccess.h>
 #include <uapi/linux/limits.h>
 
 #include "audit.h"
@@ -82,7 +83,8 @@
 #define AUDITSC_SUCCESS 1
 #define AUDITSC_FAILURE 2
 
-/* no execve audit message should be longer than this (userspace limits) */
+/* no execve audit message should be longer than this (userspace limits),
+ * see the note near the top of audit_log_execve_info() about this value */
 #define MAX_EXECVE_AUDIT_LEN 7500
 
 /* max length to print of cmdline/proctitle value during audit */
@@ -987,185 +989,178 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
 	return rc;
 }
 
-/*
- * to_send and len_sent accounting are very loose estimates.  We aren't
- * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
- * within about 500 bytes (next page boundary)
- *
- * why snprintf? an int is up to 12 digits long. if we just assumed when
- * logging that a[%d]= was going to be 16 characters long we would be wasting
- * space in every audit message. In one 7500 byte message we can log up to
- * about 1000 min size arguments. That comes down to about 50% waste of space
- * if we didn't do the snprintf to find out how long arg_num_len was.
- */
-static int audit_log_single_execve_arg(struct audit_context *context,
-				       struct audit_buffer **ab,
-				       int arg_num,
-				       size_t *len_sent,
-				       const char __user *p,
-				       char *buf)
+static void audit_log_execve_info(struct audit_context *context,
+				  struct audit_buffer **ab)
 {
-	char arg_num_len_buf[12];
-	const char __user *tmp_p = p;
-	/* how many digits are in arg_num? 5 is the length of ' a=""' */
-	size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 5;
-	size_t len, len_left, to_send;
-	size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
-	unsigned int i, has_cntl = 0, too_long = 0;
-	int ret;
-
-	/* strnlen_user includes the null we don't want to send */
-	len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1;
-
-	/*
-	 * We just created this mm, if we can't find the strings
-	 * we just copied into it something is _very_ wrong. Similar
-	 * for strings that are too long, we should not have created
-	 * any.
-	 */
-	if (unlikely((len == -1) || len > MAX_ARG_STRLEN - 1)) {
-		WARN_ON(1);
-		send_sig(SIGKILL, current, 0);
-		return -1;
+	long len_max;
+	long len_rem;
+	long len_full;
+	long len_buf;
+	long len_abuf;
+	long len_tmp;
+	bool require_data;
+	bool encode;
+	unsigned int iter;
+	unsigned int arg;
+	char *buf_head;
+	char *buf;
+	const char __user *p = (const char __user *)current->mm->arg_start;
+
+	/* NOTE: this buffer needs to be large enough to hold all the non-arg
+	 *       data we put in the audit record for this argument (see the
+	 *       code below) ... at this point in time 96 is plenty */
+	char abuf[96];
+
+	/* NOTE: we set MAX_EXECVE_AUDIT_LEN to a rather arbitrary limit, the
+	 *       current value of 7500 is not as important as the fact that it
+	 *       is less than 8k, a setting of 7500 gives us plenty of wiggle
+	 *       room if we go over a little bit in the logging below */
+	WARN_ON_ONCE(MAX_EXECVE_AUDIT_LEN > 7500);
+	len_max = MAX_EXECVE_AUDIT_LEN;
+
+	/* scratch buffer to hold the userspace args */
+	buf_head = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
+	if (!buf_head) {
+		audit_panic("out of memory for argv string");
+		return;
 	}
+	buf = buf_head;
 
-	/* walk the whole argument looking for non-ascii chars */
+	audit_log_format(*ab, "argc=%d", context->execve.argc);
+
+	len_rem = len_max;
+	len_buf = 0;
+	len_full = 0;
+	require_data = true;
+	encode = false;
+	iter = 0;
+	arg = 0;
 	do {
-		if (len_left > MAX_EXECVE_AUDIT_LEN)
-			to_send = MAX_EXECVE_AUDIT_LEN;
-		else
-			to_send = len_left;
-		ret = copy_from_user(buf, tmp_p, to_send);
-		/*
-		 * There is no reason for this copy to be short. We just
-		 * copied them here, and the mm hasn't been exposed to user-
-		 * space yet.
-		 */
-		if (ret) {
-			WARN_ON(1);
-			send_sig(SIGKILL, current, 0);
-			return -1;
-		}
-		buf[to_send] = '\0';
-		has_cntl = audit_string_contains_control(buf, to_send);
-		if (has_cntl) {
-			/*
-			 * hex messages get logged as 2 bytes, so we can only
-			 * send half as much in each message
-			 */
-			max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2;
-			break;
-		}
-		len_left -= to_send;
-		tmp_p += to_send;
-	} while (len_left > 0);
-
-	len_left = len;
-
-	if (len > max_execve_audit_len)
-		too_long = 1;
-
-	/* rewalk the argument actually logging the message */
-	for (i = 0; len_left > 0; i++) {
-		int room_left;
-
-		if (len_left > max_execve_audit_len)
-			to_send = max_execve_audit_len;
-		else
-			to_send = len_left;
-
-		/* do we have space left to send this argument in this ab? */
-		room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent;
-		if (has_cntl)
-			room_left -= (to_send * 2);
-		else
-			room_left -= to_send;
-		if (room_left < 0) {
-			*len_sent = 0;
-			audit_log_end(*ab);
-			*ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE);
-			if (!*ab)
-				return 0;
-		}
+		/* NOTE: we don't ever want to trust this value for anything
+		 *       serious, but the audit record format insists we
+		 *       provide an argument length for really long arguments,
+		 *       e.g. > MAX_EXECVE_AUDIT_LEN, so we have no choice but
+		 *       to use strncpy_from_user() to obtain this value for
+		 *       recording in the log, although we don't use it
+		 *       anywhere here to avoid a double-fetch problem */
+		if (len_full == 0)
+			len_full = strnlen_user(p, MAX_ARG_STRLEN) - 1;
+
+		/* read more data from userspace */
+		if (require_data) {
+			/* can we make more room in the buffer? */
+			if (buf != buf_head) {
+				memmove(buf_head, buf, len_buf);
+				buf = buf_head;
+			}
+
+			/* fetch as much as we can of the argument */
+			len_tmp = strncpy_from_user(&buf_head[len_buf], p,
+						    len_max - len_buf);
+			if (len_tmp == -EFAULT) {
+				/* unable to copy from userspace */
+				send_sig(SIGKILL, current, 0);
+				goto out;
+			} else if (len_tmp == (len_max - len_buf)) {
+				/* buffer is not large enough */
+				require_data = true;
+				/* NOTE: if we are going to span multiple
+				 *       buffers force the encoding so we stand
+				 *       a chance at a sane len_full value and
+				 *       consistent record encoding */
+				encode = true;
+				len_full = len_full * 2;
+				p += len_tmp;
+			} else {
+				require_data = false;
+				if (!encode)
+					encode = audit_string_contains_control(
+								buf, len_tmp);
+				/* try to use a trusted value for len_full */
+				if (len_full < len_max)
+					len_full = (encode ?
+						    len_tmp * 2 : len_tmp);
+				p += len_tmp + 1;
+			}
+			len_buf += len_tmp;
+			buf_head[len_buf] = '\0';
 
-		/*
-		 * first record needs to say how long the original string was
-		 * so we can be sure nothing was lost.
-		 */
-		if ((i == 0) && (too_long))
-			audit_log_format(*ab, " a%d_len=%zu", arg_num,
-					 has_cntl ? 2*len : len);
-
-		/*
-		 * normally arguments are small enough to fit and we already
-		 * filled buf above when we checked for control characters
-		 * so don't bother with another copy_from_user
-		 */
-		if (len >= max_execve_audit_len)
-			ret = copy_from_user(buf, p, to_send);
-		else
-			ret = 0;
-		if (ret) {
-			WARN_ON(1);
-			send_sig(SIGKILL, current, 0);
-			return -1;
+			/* length of the buffer in the audit record? */
+			len_abuf = (encode ? len_buf * 2 : len_buf + 2);
 		}
-		buf[to_send] = '\0';
-
-		/* actually log it */
-		audit_log_format(*ab, " a%d", arg_num);
-		if (too_long)
-			audit_log_format(*ab, "[%d]", i);
-		audit_log_format(*ab, "=");
-		if (has_cntl)
-			audit_log_n_hex(*ab, buf, to_send);
-		else
-			audit_log_string(*ab, buf);
-
-		p += to_send;
-		len_left -= to_send;
-		*len_sent += arg_num_len;
-		if (has_cntl)
-			*len_sent += to_send * 2;
-		else
-			*len_sent += to_send;
-	}
-	/* include the null we didn't log */
-	return len + 1;
-}
 
-static void audit_log_execve_info(struct audit_context *context,
-				  struct audit_buffer **ab)
-{
-	int i, len;
-	size_t len_sent = 0;
-	const char __user *p;
-	char *buf;
+		/* write as much as we can to the audit log */
+		if (len_buf > 0) {
+			/* NOTE: some magic numbers here - basically if we
+			 *       can't fit a reasonable amount of data into the
+			 *       existing audit buffer, flush it and start with
+			 *       a new buffer */
+			if ((sizeof(abuf) + 8) > len_rem) {
+				len_rem = len_max;
+				audit_log_end(*ab);
+				*ab = audit_log_start(context,
+						      GFP_KERNEL, AUDIT_EXECVE);
+				if (!*ab)
+					goto out;
+			}
 
-	p = (const char __user *)current->mm->arg_start;
+			/* create the non-arg portion of the arg record */
+			len_tmp = 0;
+			if (require_data || (iter > 0) ||
+			    ((len_abuf + sizeof(abuf)) > len_rem)) {
+				if (iter == 0) {
+					len_tmp += snprintf(&abuf[len_tmp],
+							sizeof(abuf) - len_tmp,
+							" a%d_len=%lu",
+							arg, len_full);
+				}
+				len_tmp += snprintf(&abuf[len_tmp],
+						    sizeof(abuf) - len_tmp,
+						    " a%d[%d]=", arg, iter++);
+			} else
+				len_tmp += snprintf(&abuf[len_tmp],
+						    sizeof(abuf) - len_tmp,
+						    " a%d=", arg);
+			WARN_ON(len_tmp >= sizeof(abuf));
+			abuf[sizeof(abuf) - 1] = '\0';
+
+			/* log the arg in the audit record */
+			audit_log_format(*ab, "%s", abuf);
+			len_rem -= len_tmp;
+			len_tmp = len_buf;
+			if (encode) {
+				if (len_abuf > len_rem)
+					len_tmp = len_rem / 2; /* encoding */
+				audit_log_n_hex(*ab, buf, len_tmp);
+				len_rem -= len_tmp * 2;
+				len_abuf -= len_tmp * 2;
+			} else {
+				if (len_abuf > len_rem)
+					len_tmp = len_rem - 2; /* quotes */
+				audit_log_n_string(*ab, buf, len_tmp);
+				len_rem -= len_tmp + 2;
+				/* don't subtract the "2" because we still need
+				 * to add quotes to the remaining string */
+				len_abuf -= len_tmp;
+			}
+			len_buf -= len_tmp;
+			buf += len_tmp;
+		}
 
-	audit_log_format(*ab, "argc=%d", context->execve.argc);
+		/* ready to move to the next argument? */
+		if ((len_buf == 0) && !require_data) {
+			arg++;
+			iter = 0;
+			len_full = 0;
+			require_data = true;
+			encode = false;
+		}
+	} while (arg < context->execve.argc);
 
-	/*
-	 * we need some kernel buffer to hold the userspace args.  Just
-	 * allocate one big one rather than allocating one of the right size
-	 * for every single argument inside audit_log_single_execve_arg()
-	 * should be <8k allocation so should be pretty safe.
-	 */
-	buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
-	if (!buf) {
-		audit_panic("out of memory for argv string");
-		return;
-	}
+	/* NOTE: the caller handles the final audit_log_end() call */
 
-	for (i = 0; i < context->execve.argc; i++) {
-		len = audit_log_single_execve_arg(context, ab, i,
-						  &len_sent, p, buf);
-		if (len <= 0)
-			break;
-		p += len;
-	}
-	kfree(buf);
+out:
+	kfree(buf_head);
 }
 
 static void show_special(struct audit_context *context, int *call_panic)
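The rewritten audit_log_execve_info() above sizes each "aN=..." field with one rule: an argument containing control characters is hex-encoded at two output bytes per input byte, anything else is emitted with two surrounding quotes (len_abuf = encode ? len_buf * 2 : len_buf + 2). A minimal userspace sketch of that accounting, with illustrative names and a simplified stand-in for audit_string_contains_control():

```c
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

/* simplified stand-in for audit_string_contains_control() */
static bool contains_control(const char *s, size_t len)
{
	for (size_t i = 0; i < len; i++)
		if (iscntrl((unsigned char)s[i]) || s[i] == '"')
			return true;
	return false;
}

int main(void)
{
	const char *argv_sample[] = { "ls", "-l", "evil\nname" };

	for (int i = 0; i < 3; i++) {
		size_t len = strlen(argv_sample[i]);
		bool encode = contains_control(argv_sample[i], len);
		/* same formula as len_abuf in the patch above */
		size_t logged = encode ? len * 2 : len + 2;

		printf("a%d: %zu bytes -> %zu record bytes (%s)\n",
		       i, len, logged, encode ? "hex" : "quoted");
	}
	return 0;
}
```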
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 6582410a71c7..efd143dcedf1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -1227,6 +1227,7 @@ static int check_ld_abs(struct verifier_env *env, struct bpf_insn *insn)
 	}
 
 	if (insn->dst_reg != BPF_REG_0 || insn->off != 0 ||
+	    BPF_SIZE(insn->code) == BPF_DW ||
 	    (mode == BPF_ABS && insn->src_reg != BPF_REG_0)) {
 		verbose("BPF_LD_ABS uses reserved fields\n");
 		return -EINVAL;
@@ -1864,7 +1865,6 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env)
 			if (IS_ERR(map)) {
 				verbose("fd %d is not pointing to valid bpf_map\n",
 					insn->imm);
-				fdput(f);
 				return PTR_ERR(map);
 			}
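The added verifier test rejects the one instruction shape check_ld_abs() could previously let through: a BPF_LD_ABS/BPF_LD_IND load with the 8-byte BPF_DW size, for which the interpreter has no handler. A sketch of that shape, assuming only the uapi constants and struct bpf_insn layout from <linux/bpf.h>:

```c
#include <linux/bpf.h>
#include <stdio.h>

int main(void)
{
	/* an LD_ABS load with a double-word size: now rejected with -EINVAL */
	struct bpf_insn insn = {
		.code    = BPF_LD | BPF_ABS | BPF_DW,
		.dst_reg = BPF_REG_0,
		.src_reg = 0,
		.off     = 0,
		.imm     = 0,	/* packet offset */
	};

	printf("opcode 0x%02x, size bits 0x%02x\n",
	       insn.code, BPF_SIZE(insn.code));
	return 0;
}
```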
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 359da3abb004..3abce1e0f910 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4378,14 +4378,15 @@ static void css_free_work_fn(struct work_struct *work)
 
 	if (ss) {
 		/* css free path */
+		struct cgroup_subsys_state *parent = css->parent;
 		int id = css->id;
 
-		if (css->parent)
-			css_put(css->parent);
-
 		ss->css_free(css);
 		cgroup_idr_remove(&ss->css_idr, id);
 		cgroup_put(cgrp);
+
+		if (parent)
+			css_put(parent);
 	} else {
 		/* cgroup free path */
 		atomic_dec(&cgrp->root->nr_cgrps);
@@ -4478,6 +4479,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
 	memset(css, 0, sizeof(*css));
 	css->cgroup = cgrp;
 	css->ss = ss;
+	css->id = -1;
 	INIT_LIST_HEAD(&css->sibling);
 	INIT_LIST_HEAD(&css->children);
 	css->serial_nr = css_serial_nr_next++;
@@ -4563,7 +4565,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss,
 
 	err = cgroup_idr_alloc(&ss->css_idr, NULL, 2, 0, GFP_NOWAIT);
 	if (err < 0)
-		goto err_free_percpu_ref;
+		goto err_free_css;
 	css->id = err;
 
 	if (visible) {
@@ -4595,9 +4597,6 @@ err_list_del:
 	list_del_rcu(&css->sibling);
 	cgroup_clear_dir(css->cgroup, 1 << css->ss->id);
 err_free_id:
-	cgroup_idr_remove(&ss->css_idr, css->id);
-err_free_percpu_ref:
-	percpu_ref_exit(&css->refcnt);
 err_free_css:
 	call_rcu(&css->rcu_head, css_free_rcu_fn);
 	return err;
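The css_free_work_fn() hunk is an ordering fix: css->parent must be read before ss->css_free() releases the css, and the parent's reference dropped only after the child is fully torn down. A generic userspace sketch of that pattern (hypothetical types and names, not the cgroup API):

```c
#include <stdio.h>
#include <stdlib.h>

struct node {
	struct node *parent;
	int id;
	int refcnt;
};

static void put_node(struct node *n)
{
	if (--n->refcnt == 0) {
		printf("freeing node %d\n", n->id);
		free(n);
	}
}

static void free_node(struct node *n)
{
	/* snapshot everything we still need while 'n' is valid */
	struct node *parent = n->parent;
	int id = n->id;

	free(n);				/* ss->css_free() analogue */
	printf("released node %d\n", id);	/* uses the snapshot, not 'n' */

	if (parent)
		put_node(parent);	/* dropped last, like css_put(parent) */
}

int main(void)
{
	struct node *p = calloc(1, sizeof(*p));
	struct node *c = calloc(1, sizeof(*c));

	if (!p || !c)
		return 1;
	p->id = 1;
	p->refcnt = 1;
	c->id = 2;
	c->parent = p;
	free_node(c);
	return 0;
}
```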
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 7f63ad978cb8..dba8894d25cc 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -347,6 +347,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
 		       bool truncated)
 {
 	struct ring_buffer *rb = handle->rb;
+	bool wakeup = truncated;
 	unsigned long aux_head;
 	u64 flags = 0;
 
@@ -375,9 +376,16 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size,
 	aux_head = rb->user_page->aux_head = local_read(&rb->aux_head);
 
 	if (aux_head - local_read(&rb->aux_wakeup) >= rb->aux_watermark) {
-		perf_output_wakeup(handle);
+		wakeup = true;
 		local_add(rb->aux_watermark, &rb->aux_wakeup);
 	}
+
+	if (wakeup) {
+		if (truncated)
+			handle->event->pending_disable = 1;
+		perf_output_wakeup(handle);
+	}
+
 	handle->event = NULL;
 
 	local_set(&rb->aux_nest, 0);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index cb346f26a22d..a89bca964b1f 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -179,8 +179,10 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
 	err = -EAGAIN;
 	ptep = page_check_address(page, mm, addr, &ptl, 0);
-	if (!ptep)
+	if (!ptep) {
+		mem_cgroup_cancel_charge(kpage, memcg);
 		goto unlock;
+	}
 
 	get_page(kpage);
 	page_add_new_anon_rmap(kpage, vma, addr);
@@ -207,7 +209,6 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	err = 0;
  unlock:
-	mem_cgroup_cancel_charge(kpage, memcg);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	unlock_page(page);
 	return err;
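The uprobes hunks above restore the usual charge pairing: a charge is committed exactly once on success (ownership passes to the new page) or cancelled exactly once on failure, never cancelled unconditionally on the shared exit path. A sketch with hypothetical names, not the memcg API:

```c
#include <stdbool.h>
#include <stdio.h>

static int pending;	/* charges taken but not yet committed/cancelled */
static int committed;

static void charge(void)        { pending++; }
static void commit_charge(void) { pending--; committed++; }
static void cancel_charge(void) { pending--; }

static int replace_page(bool lookup_ok)
{
	charge();

	if (!lookup_ok) {
		/* failure path must cancel, as the patch adds for !ptep */
		cancel_charge();
		return -1;		/* -EAGAIN analogue */
	}

	commit_charge();		/* success: the charge now belongs to
					 * the new page and is never cancelled */
	return 0;
}

int main(void)
{
	replace_page(true);
	replace_page(false);
	printf("pending=%d (expect 0), committed=%d (expect 1)\n",
	       pending, committed);
	return 0;
}
```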
diff --git a/kernel/exit.c b/kernel/exit.c
index 22fcc05dec40..819f51ec4f55 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -914,17 +914,28 @@ static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
 		task_pid_type(p, wo->wo_type) == wo->wo_pid;
 }
 
-static int eligible_child(struct wait_opts *wo, struct task_struct *p)
+static int
+eligible_child(struct wait_opts *wo, bool ptrace, struct task_struct *p)
 {
 	if (!eligible_pid(wo, p))
 		return 0;
-	/* Wait for all children (clone and not) if __WALL is set;
-	 * otherwise, wait for clone children *only* if __WCLONE is
-	 * set; otherwise, wait for non-clone children *only*.  (Note:
-	 * A "clone" child here is one that reports to its parent
-	 * using a signal other than SIGCHLD.) */
-	if (((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
-	    && !(wo->wo_flags & __WALL))
+
+	/*
+	 * Wait for all children (clone and not) if __WALL is set or
+	 * if it is traced by us.
+	 */
+	if (ptrace || (wo->wo_flags & __WALL))
+		return 1;
+
+	/*
+	 * Otherwise, wait for clone children *only* if __WCLONE is set;
+	 * otherwise, wait for non-clone children *only*.
+	 *
+	 * Note: a "clone" child here is one that reports to its parent
+	 * using a signal other than SIGCHLD, or a non-leader thread which
+	 * we can only see if it is traced by us.
+	 */
+	if ((p->exit_signal != SIGCHLD) ^ !!(wo->wo_flags & __WCLONE))
 		return 0;
 
 	return 1;
@@ -1297,7 +1308,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
 	if (unlikely(exit_state == EXIT_DEAD))
 		return 0;
 
-	ret = eligible_child(wo, p);
+	ret = eligible_child(wo, ptrace, p);
 	if (!ret)
 		return ret;
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 46b168e19c98..2214b70f1910 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1381,8 +1381,8 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
 	if (likely(&hb1->chain != &hb2->chain)) {
 		plist_del(&q->list, &hb1->chain);
 		hb_waiters_dec(hb1);
-		plist_add(&q->list, &hb2->chain);
 		hb_waiters_inc(hb2);
+		plist_add(&q->list, &hb2->chain);
 		q->lock_ptr = &hb2->lock;
 	}
 	get_futex_key_refs(key2);
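The futex change is a publication-order fix: hb_waiters is the lockless "does this bucket have waiters?" fast path, so the destination bucket's count must be raised before the waiter becomes visible on its list, or a concurrent waker can skip a queued waiter. An illustrative C11 sketch, not the futex implementation:

```c
#include <stdatomic.h>
#include <stdbool.h>

struct bucket {
	atomic_int waiters;	/* hb_waiters analogue */
	/* the plist and its lock would live here */
};

static void requeue(struct bucket *from, struct bucket *to)
{
	atomic_fetch_sub(&from->waiters, 1);	/* hb_waiters_dec(hb1) */
	/* plist_del(&q->list, &from->chain); */

	atomic_fetch_add(&to->waiters, 1);	/* hb_waiters_inc(hb2) runs */
	/* plist_add(&q->list, &to->chain); */	/* ...before plist_add() */
}

/* waker fast path: may skip the bucket lock when no waiters are counted */
static bool maybe_wake(struct bucket *b)
{
	if (atomic_load(&b->waiters) == 0)
		return false;	/* safe only if the count never lags the list */
	/* lock, walk the list, wake matching waiters */
	return true;
}

int main(void)
{
	struct bucket a = { 0 }, b = { 0 };

	atomic_fetch_add(&a.waiters, 1);	/* one waiter queued on 'a' */
	requeue(&a, &b);
	return maybe_wake(&b) ? 0 : 1;	/* must observe the moved waiter */
}
```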
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
index 4cccea6b8934..6f1c3e41b2a6 100644
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -486,9 +486,6 @@ __ww_mutex_lock_check_stamp(struct mutex *lock, struct ww_acquire_ctx *ctx)
 	if (!hold_ctx)
 		return 0;
 
-	if (unlikely(ctx == hold_ctx))
-		return -EALREADY;
-
 	if (ctx->stamp - hold_ctx->stamp <= LONG_MAX &&
 	    (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) {
 #ifdef CONFIG_DEBUG_MUTEXES
@@ -514,6 +511,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 	unsigned long flags;
 	int ret;
 
+	if (use_ww_ctx) {
+		struct ww_mutex *ww = container_of(lock, struct ww_mutex, base);
+		if (unlikely(ww_ctx == READ_ONCE(ww->ctx)))
+			return -EALREADY;
+	}
+
 	preempt_disable();
 	mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
 
diff --git a/kernel/module.c b/kernel/module.c
index be8971d817ed..6920d1080cdd 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2451,13 +2451,18 @@ static inline void kmemleak_load_module(const struct module *mod,
 #endif
 
 #ifdef CONFIG_MODULE_SIG
-static int module_sig_check(struct load_info *info)
+static int module_sig_check(struct load_info *info, int flags)
 {
 	int err = -ENOKEY;
 	const unsigned long markerlen = sizeof(MODULE_SIG_STRING) - 1;
 	const void *mod = info->hdr;
 
-	if (info->len > markerlen &&
+	/*
+	 * Require flags == 0, as a module with version information
+	 * removed is no longer the module that was signed
+	 */
+	if (flags == 0 &&
+	    info->len > markerlen &&
 	    memcmp(mod + info->len - markerlen, MODULE_SIG_STRING,
 		   markerlen) == 0) {
 		/* We truncate the module to discard the signature */
 		info->len -= markerlen;
@@ -2476,7 +2481,7 @@ static int module_sig_check(struct load_info *info)
 	return err;
 }
 #else /* !CONFIG_MODULE_SIG */
-static int module_sig_check(struct load_info *info)
+static int module_sig_check(struct load_info *info, int flags)
 {
 	return 0;
 }
@@ -3277,7 +3282,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
 	long err;
 	char *after_dashes;
 
-	err = module_sig_check(info);
+	err = module_sig_check(info, flags);
 	if (err)
 		goto free_copy;
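The mutex change moves self-deadlock detection up front: re-acquiring a ww_mutex with the acquire context that already holds it now returns -EALREADY before any waiting happens, on the fast and slow paths alike. A deliberately simplified sketch of that check (hypothetical types, no arbitration or locking):

```c
#include <errno.h>
#include <stdio.h>

struct ww_ctx { int id; };

struct ww_lock {
	struct ww_ctx *ctx;	/* context that currently holds the lock */
};

static int ww_lock_acquire(struct ww_lock *lock, struct ww_ctx *ctx)
{
	if (lock->ctx == ctx)
		return -EALREADY;	/* caught up front, as in the patch */
	/* ... normal acquire / wound-wait arbitration would follow ... */
	lock->ctx = ctx;
	return 0;
}

int main(void)
{
	struct ww_lock l = { 0 };
	struct ww_ctx a = { 1 };

	printf("first:  %d\n", ww_lock_acquire(&l, &a));	/* 0 */
	printf("second: %d\n", ww_lock_acquire(&l, &a));	/* -EALREADY */
	return 0;
}
```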
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 3b0f4c09ab92..98e607121d09 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4568,14 +4568,16 @@ void show_state_filter(unsigned long state_filter)
 		/*
 		 * reset the NMI-timeout, listing all files on a slow
 		 * console might take a lot of time:
+		 * Also, reset softlockup watchdogs on all CPUs, because
+		 * another CPU might be blocked waiting for us to process
+		 * an IPI.
 		 */
 		touch_nmi_watchdog();
+		touch_all_softlockup_watchdogs();
 		if (!state_filter || (p->state & state_filter))
 			sched_show_task(p);
 	}
 
-	touch_all_softlockup_watchdogs();
-
 #ifdef CONFIG_SCHED_DEBUG
 	sysrq_sched_debug_show();
 #endif
diff --git a/kernel/sched/proc.c b/kernel/sched/proc.c
index 8ecd552fe4f2..09dfe51f89b5 100644
--- a/kernel/sched/proc.c
+++ b/kernel/sched/proc.c
@@ -97,10 +97,13 @@ long calc_load_fold_active(struct rq *this_rq)
 static unsigned long
 calc_load(unsigned long load, unsigned long exp, unsigned long active)
 {
-	load *= exp;
-	load += active * (FIXED_1 - exp);
-	load += 1UL << (FSHIFT - 1);
-	return load >> FSHIFT;
+	unsigned long newload;
+
+	newload = load * exp + active * (FIXED_1 - exp);
+	if (active >= load)
+		newload += FIXED_1-1;
+
+	return newload / FIXED_1;
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c3eee4c6d6c1..7d4900404c94 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1695,6 +1695,20 @@ static struct ctl_table fs_table[] = {
 		.proc_handler	= &pipe_proc_fn,
 		.extra1		= &pipe_min_size,
 	},
+	{
+		.procname	= "pipe-user-pages-hard",
+		.data		= &pipe_user_pages_hard,
+		.maxlen		= sizeof(pipe_user_pages_hard),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
+	{
+		.procname	= "pipe-user-pages-soft",
+		.data		= &pipe_user_pages_soft,
+		.maxlen		= sizeof(pipe_user_pages_soft),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax,
+	},
 	{ }
 };
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
index 0075da74abf0..57d1acb91c56 100644
--- a/kernel/time/posix-cpu-timers.c
+++ b/kernel/time/posix-cpu-timers.c
@@ -784,6 +784,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 			timer->it.cpu.expires = 0;
 			sample_to_timespec(timer->it_clock, timer->it.cpu.expires,
 					   &itp->it_value);
+			return;
 		} else {
 			cpu_timer_sample_group(timer->it_clock, p, &now);
 			unlock_task_sighand(p, &flags);
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 65dbf8aee751..8fcc801fde15 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -419,7 +419,10 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
 	do {
 		seq = raw_read_seqcount(&tkf->seq);
 		tkr = tkf->base + (seq & 0x01);
-		now = ktime_to_ns(tkr->base) + timekeeping_get_ns(tkr);
+		now = ktime_to_ns(tkr->base);
+
+		now += clocksource_delta(tkr->read(tkr->clock),
+					 tkr->cycle_last, tkr->mask);
 	} while (read_seqcount_retry(&tkf->seq, seq));
 
 	return now;
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index f6bd65236712..107310a6f36f 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -23,7 +23,9 @@
 
 #include "timekeeping_internal.h"
 
-static unsigned int sleep_time_bin[32] = {0};
+#define NUM_BINS 32
+
+static unsigned int sleep_time_bin[NUM_BINS] = {0};
 
 static int tk_debug_show_sleep_time(struct seq_file *s, void *data)
 {
@@ -69,6 +71,9 @@ late_initcall(tk_debug_sleep_time_init);
 
 void tk_debug_account_sleep_time(struct timespec64 *t)
 {
-	sleep_time_bin[fls(t->tv_sec)]++;
+	/* Cap bin index so we don't overflow the array */
+	int bin = min(fls(t->tv_sec), NUM_BINS-1);
+
+	sleep_time_bin[bin]++;
 }
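The calc_load() hunk deserves a worked example. With FSHIFT = 11, FIXED_1 = 2048 and the 1-minute decay factor EXP_1 = 1884 (the values from include/linux/sched.h), the old unconditional half-up rounding keeps an idle machine's average from ever decaying to zero; rounding up only while climbing lets it converge. A standalone demo:

```c
#include <stdio.h>

#define FSHIFT   11
#define FIXED_1  (1UL << FSHIFT)	/* fixed-point 1.0 */
#define EXP_1    1884UL			/* 1-minute decay factor */

static unsigned long calc_load_old(unsigned long load, unsigned long exp,
				   unsigned long active)
{
	load *= exp;
	load += active * (FIXED_1 - exp);
	load += 1UL << (FSHIFT - 1);	/* always rounds half-up */
	return load >> FSHIFT;
}

static unsigned long calc_load_new(unsigned long load, unsigned long exp,
				   unsigned long active)
{
	unsigned long newload = load * exp + active * (FIXED_1 - exp);

	if (active >= load)
		newload += FIXED_1 - 1;	/* round up only when climbing */
	return newload / FIXED_1;
}

int main(void)
{
	unsigned long old_avg = 100, new_avg = 100;	/* ~0.05 fixed point */

	/* decay toward an idle system: active == 0 */
	for (int tick = 0; tick < 60; tick++) {
		old_avg = calc_load_old(old_avg, EXP_1, 0);
		new_avg = calc_load_new(new_avg, EXP_1, 0);
	}
	printf("after decay: old=%lu (stuck nonzero), new=%lu\n",
	       old_avg, new_avg);
	return 0;
}
```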
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0315d43176d8..fb147c7811d2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -461,7 +461,8 @@ struct ring_buffer_per_cpu {
 	raw_spinlock_t			reader_lock;	/* serialize readers */
 	arch_spinlock_t			lock;
 	struct lock_class_key		lock_key;
-	unsigned int			nr_pages;
+	unsigned long			nr_pages;
+	unsigned int			current_context;
 	struct list_head		*pages;
 	struct buffer_page		*head_page;	/* read from head */
 	struct buffer_page		*tail_page;	/* write to tail */
@@ -481,7 +482,7 @@ struct ring_buffer_per_cpu {
 	u64				write_stamp;
 	u64				read_stamp;
 	/* ring buffer pages to update, > 0 to add, < 0 to remove */
-	int				nr_pages_to_update;
+	long				nr_pages_to_update;
 	struct list_head		new_pages; /* new pages to add */
 	struct work_struct		update_pages_work;
 	struct completion		update_done;
@@ -1160,10 +1161,10 @@ static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
 	return 0;
 }
 
-static int __rb_allocate_pages(int nr_pages, struct list_head *pages, int cpu)
+static int __rb_allocate_pages(long nr_pages, struct list_head *pages, int cpu)
 {
-	int i;
 	struct buffer_page *bpage, *tmp;
+	long i;
 
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page;
@@ -1200,7 +1201,7 @@ free_pages:
 }
 
 static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
-			     unsigned nr_pages)
+			     unsigned long nr_pages)
 {
 	LIST_HEAD(pages);
 
@@ -1225,7 +1226,7 @@ static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
 }
 
 static struct ring_buffer_per_cpu *
-rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
+rb_allocate_cpu_buffer(struct ring_buffer *buffer, long nr_pages, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct buffer_page *bpage;
@@ -1325,8 +1326,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
 					struct lock_class_key *key)
 {
 	struct ring_buffer *buffer;
+	long nr_pages;
 	int bsize;
-	int cpu, nr_pages;
+	int cpu;
 
 	/* keep it in its own cache line */
 	buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
@@ -1452,12 +1454,12 @@ static inline unsigned long rb_page_write(struct buffer_page *bpage)
 }
 
 static int
-rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned int nr_pages)
+rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned long nr_pages)
 {
 	struct list_head *tail_page, *to_remove, *next_page;
 	struct buffer_page *to_remove_page, *tmp_iter_page;
 	struct buffer_page *last_page, *first_page;
-	unsigned int nr_removed;
+	unsigned long nr_removed;
 	unsigned long head_bit;
 	int page_entries;
 
@@ -1674,7 +1676,7 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
 			int cpu_id)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned nr_pages;
+	unsigned long nr_pages;
 	int cpu, err = 0;
 
 	/*
@@ -1688,14 +1690,13 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
 	    !cpumask_test_cpu(cpu_id, buffer->cpumask))
 		return size;
 
-	size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
-	size *= BUF_PAGE_SIZE;
+	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
 
 	/* we need a minimum of two pages */
-	if (size < BUF_PAGE_SIZE * 2)
-		size = BUF_PAGE_SIZE * 2;
+	if (nr_pages < 2)
+		nr_pages = 2;
 
-	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+	size = nr_pages * BUF_PAGE_SIZE;
 
 	/*
 	 * Don't succeed if resizing is disabled, as a reader might be
@@ -2675,11 +2676,11 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 * just so happens that it is the same bit corresponding to
 * the current context.
 */
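The int-to-long conversions above guard the page-count math. On 64-bit, a resize request above roughly 8 TB yields a page count larger than INT_MAX, which an int silently truncates, so the old code resized with a wrapped count. A small demo of the wrap (BUF_PAGE_SIZE is really PAGE_SIZE minus the page header; 4080 here is illustrative):

```c
#include <limits.h>
#include <stdio.h>

#define BUF_PAGE_SIZE 4080UL
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long size = 1UL << 43;		/* 8 TiB requested */
	unsigned long pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);

	int old_nr_pages = (int)pages;		/* pre-fix storage type */
	unsigned long new_nr_pages = pages;	/* post-fix */

	printf("pages needed: %lu (INT_MAX is %d)\n", pages, INT_MAX);
	printf("as int:  %d (wrapped: %s)\n", old_nr_pages,
	       (unsigned long)old_nr_pages == pages ? "no" : "yes");
	printf("as long: %lu\n", new_nr_pages);
	return 0;
}
```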
-static DEFINE_PER_CPU(unsigned int, current_context);
-
-static __always_inline int trace_recursive_lock(void)
+static __always_inline int
+trace_recursive_lock(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	unsigned int val = __this_cpu_read(current_context);
+	unsigned int val = cpu_buffer->current_context;
 	int bit;
 
 	if (in_interrupt()) {
@@ -2696,20 +2697,21 @@ static __always_inline int trace_recursive_lock(void)
 		return 1;
 
 	val |= (1 << bit);
-	__this_cpu_write(current_context, val);
+	cpu_buffer->current_context = val;
 
 	return 0;
 }
 
-static __always_inline void trace_recursive_unlock(void)
+static __always_inline void
+trace_recursive_unlock(struct ring_buffer_per_cpu *cpu_buffer)
 {
-	__this_cpu_and(current_context, __this_cpu_read(current_context) - 1);
+	cpu_buffer->current_context &= cpu_buffer->current_context - 1;
 }
 
 #else
 
-#define trace_recursive_lock()		(0)
-#define trace_recursive_unlock()	do { } while (0)
+#define trace_recursive_lock(cpu_buffer)	(0)
+#define trace_recursive_unlock(cpu_buffer)	do { } while (0)
 
 #endif
 
@@ -2741,35 +2743,34 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 	/* If we are tracing schedule, we don't want to recurse */
 	preempt_disable_notrace();
 
-	if (atomic_read(&buffer->record_disabled))
-		goto out_nocheck;
-
-	if (trace_recursive_lock())
-		goto out_nocheck;
+	if (unlikely(atomic_read(&buffer->record_disabled)))
+		goto out;
 
 	cpu = raw_smp_processor_id();
 
-	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+	if (unlikely(!cpumask_test_cpu(cpu, buffer->cpumask)))
 		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
 
-	if (atomic_read(&cpu_buffer->record_disabled))
+	if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
 		goto out;
 
-	if (length > BUF_MAX_DATA_SIZE)
+	if (unlikely(length > BUF_MAX_DATA_SIZE))
+		goto out;
+
+	if (unlikely(trace_recursive_lock(cpu_buffer)))
 		goto out;
 
 	event = rb_reserve_next_event(buffer, cpu_buffer, length);
 	if (!event)
-		goto out;
+		goto out_unlock;
 
 	return event;
 
+ out_unlock:
+	trace_recursive_unlock(cpu_buffer);
  out:
-	trace_recursive_unlock();
-
- out_nocheck:
 	preempt_enable_notrace();
 	return NULL;
 }
@@ -2859,7 +2860,7 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 
 	rb_wakeups(buffer, cpu_buffer);
 
-	trace_recursive_unlock();
+	trace_recursive_unlock(cpu_buffer);
 
 	preempt_enable_notrace();
 
@@ -2970,7 +2971,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer,
  out:
 	rb_end_commit(cpu_buffer);
 
-	trace_recursive_unlock();
+	trace_recursive_unlock(cpu_buffer);
 
 	preempt_enable_notrace();
 
@@ -4647,8 +4648,9 @@ static int rb_cpu_notify(struct notifier_block *self,
 	struct ring_buffer *buffer =
 		container_of(self, struct ring_buffer, cpu_notify);
 	long cpu = (long)hcpu;
-	int cpu_i, nr_pages_same;
-	unsigned int nr_pages;
+	long nr_pages_same;
+	int cpu_i;
+	unsigned long nr_pages;
 
 	switch (action) {
 	case CPU_UP_PREPARE:
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f69ec1295b0b..6459f77e2c72 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1626,8 +1626,13 @@ event_create_dir(struct dentry *parent, struct ftrace_event_file *file)
 	trace_create_file("filter", 0644, file->dir, file,
 			  &ftrace_event_filter_fops);
 
-	trace_create_file("trigger", 0644, file->dir, file,
-			  &event_trigger_fops);
+	/*
+	 * Only event directories that can be enabled should have
+	 * triggers.
+	 */
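The trace_printk fix a few hunks below leans on the kernel's ERR_PTR()/IS_ERR() convention: errno values are encoded into the last (never-mappable) 4095 pointer values, so a single return can carry either an object or an error. A minimal userspace re-implementation for illustration:

```c
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error)
{
	return (void *)error;
}

static inline long PTR_ERR(const void *ptr)
{
	return (long)ptr;
}

static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

static void *lookup(const char *key)
{
	if (!key)
		return ERR_PTR(-EINVAL);	/* as in lookup_format(NULL) */
	return "found";
}

int main(void)
{
	void *res = lookup(NULL);

	if (IS_ERR(res))
		printf("lookup failed: %ld\n", PTR_ERR(res));
	return 0;
}
```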
+	if (!(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
+		trace_create_file("trigger", 0644, file->dir, file,
+				  &event_trigger_fops);
 
 	trace_create_file("format", 0444, file->dir, call,
 			  &ftrace_event_format_fops);
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 6d6c0411cbe8..fc0c74f18288 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -36,6 +36,10 @@ struct trace_bprintk_fmt {
 static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
 {
 	struct trace_bprintk_fmt *pos;
+
+	if (!fmt)
+		return ERR_PTR(-EINVAL);
+
 	list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
 		if (!strcmp(pos->fmt, fmt))
 			return pos;
@@ -57,7 +61,8 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
 	for (iter = start; iter < end; iter++) {
 		struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
 		if (tb_fmt) {
-			*iter = tb_fmt->fmt;
+			if (!IS_ERR(tb_fmt))
+				*iter = tb_fmt->fmt;
 			continue;
 		}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6d631161705c..d0efe9295a0e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -654,6 +654,35 @@ static void set_work_pool_and_clear_pending(struct work_struct *work,
 	 */
 	smp_wmb();
 	set_work_data(work, (unsigned long)pool_id << WORK_OFFQ_POOL_SHIFT, 0);
+	/*
+	 * The following mb guarantees that previous clear of a PENDING bit
+	 * will not be reordered with any speculative LOADS or STORES from
+	 * work->current_func, which is executed afterwards.  This possible
+	 * reordering can lead to a missed execution on attempt to queue
+	 * the same @work.  E.g. consider this case:
+	 *
+	 *   CPU#0                         CPU#1
+	 *   ----------------------------  --------------------------------
+	 *
+	 * 1  STORE event_indicated
+	 * 2  queue_work_on() {
+	 * 3    test_and_set_bit(PENDING)
+	 * 4 }                             set_..._and_clear_pending() {
+	 * 5                                 set_work_data() # clear bit
+	 * 6                                 smp_mb()
+	 * 7                               work->current_func() {
+	 * 8                                  LOAD event_indicated
+	 *                                 }
+	 *
+	 * Without an explicit full barrier speculative LOAD on line 8 can
+	 * be executed before CPU#0 does STORE on line 1.  If that happens,
+	 * CPU#0 observes the PENDING bit is still set and new execution of
+	 * a @work is not queued in the hope that CPU#1 will eventually
+	 * finish the queued @work.  Meanwhile CPU#1 does not see
+	 * event_indicated is set, because speculative LOAD was executed
+	 * before actual STORE.
+	 */
+	smp_mb();
 }
 
 static void clear_work_data(struct work_struct *work)
@@ -4448,6 +4477,17 @@ static void rebind_workers(struct worker_pool *pool)
 						  pool->attrs->cpumask) < 0);
 
 	spin_lock_irq(&pool->lock);
+
+	/*
+	 * XXX: CPU hotplug notifiers are weird and can call DOWN_FAILED
+	 * w/o preceding DOWN_PREPARE.  Work around it.  CPU hotplug is
+	 * being reworked and this can go away in time.
+	 */
+	if (!(pool->flags & POOL_DISASSOCIATED)) {
+		spin_unlock_irq(&pool->lock);
+		return;
+	}
+
 	pool->flags &= ~POOL_DISASSOCIATED;
 
 	for_each_pool_worker(worker, pool) {
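The barrier added in set_work_pool_and_clear_pending() can be restated with C11 atomics. The sketch below mirrors the comment's two-CPU scenario in API shape only (it runs single-threaded): the seq_cst fence plays the role of the new smp_mb() between the PENDING clear and the work function's loads. Names are illustrative, not the workqueue API:

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool pending;
static atomic_bool event_indicated;

/* CPU#0: publish the event, then try to queue the work */
static bool queue_work(void)
{
	atomic_store(&event_indicated, true);		/* line 1 */
	return !atomic_exchange(&pending, true);	/* line 3 */
}

/* CPU#1: clear PENDING, then run the work function */
static void process_one_work(void)
{
	atomic_store(&pending, false);			/* line 5 */
	atomic_thread_fence(memory_order_seq_cst);	/* the added smp_mb() */

	if (atomic_load(&event_indicated))		/* line 8 */
		printf("event handled\n");
}

int main(void)
{
	/* single-threaded walk-through of the ordering, not a stress test */
	process_one_work();
	if (!queue_work())
		printf("already pending, relying on the running work\n");
	return 0;
}
```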