Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 1
-rw-r--r--  kernel/audit_watch.c | 12
-rw-r--r--  kernel/bpf/arraymap.c | 18
-rw-r--r--  kernel/bpf/hashtab.c | 22
-rw-r--r--  kernel/bpf/stackmap.c | 20
-rw-r--r--  kernel/bpf/syscall.c | 26
-rw-r--r--  kernel/bpf/verifier.c | 328
-rw-r--r--  kernel/cgroup.c | 22
-rw-r--r--  kernel/cgroup_pids.c | 2
-rw-r--r--  kernel/cpu.c | 59
-rw-r--r--  kernel/cpuset.c | 22
-rw-r--r--  kernel/events/core.c | 128
-rw-r--r--  kernel/events/uprobes.c | 2
-rw-r--r--  kernel/extable.c | 2
-rw-r--r--  kernel/fork.c | 19
-rw-r--r--  kernel/futex.c | 29
-rw-r--r--  kernel/gcov/base.c | 6
-rw-r--r--  kernel/gcov/gcc_4_7.c | 4
-rw-r--r--  kernel/irq/chip.c | 12
-rw-r--r--  kernel/irq/ipi.c | 4
-rw-r--r--  kernel/irq/irqdesc.c | 24
-rw-r--r--  kernel/irq/manage.c | 4
-rw-r--r--  kernel/kprobes.c | 2
-rw-r--r--  kernel/kthread.c | 3
-rw-r--r--  kernel/locking/lockdep.c | 11
-rw-r--r--  kernel/locking/locktorture.c | 6
-rw-r--r--  kernel/locking/rwsem-spinlock.c | 15
-rw-r--r--  kernel/locking/spinlock_debug.c | 86
-rw-r--r--  kernel/membarrier.c | 4
-rw-r--r--  kernel/memremap.c | 6
-rw-r--r--  kernel/padata.c | 7
-rw-r--r--  kernel/panic.c | 2
-rw-r--r--  kernel/pid.c | 11
-rw-r--r--  kernel/pid_namespace.c | 2
-rw-r--r--  kernel/power/process.c | 5
-rw-r--r--  kernel/printk/printk.c | 2
-rw-r--r--  kernel/ptrace.c | 34
-rw-r--r--  kernel/rcu/tree.c | 14
-rw-r--r--  kernel/rcu/tree_plugin.h | 2
-rw-r--r--  kernel/sched/core.c | 77
-rw-r--r--  kernel/sched/cpufreq_schedutil.c | 28
-rw-r--r--  kernel/sched/deadline.c | 3
-rw-r--r--  kernel/sched/fair.c | 53
-rw-r--r--  kernel/sched/loadavg.c | 4
-rw-r--r--  kernel/sched/rt.c | 238
-rw-r--r--  kernel/sched/sched.h | 47
-rw-r--r--  kernel/seccomp.c | 23
-rw-r--r--  kernel/signal.c | 35
-rw-r--r--  kernel/sysctl.c | 5
-rw-r--r--  kernel/time/alarmtimer.c | 15
-rw-r--r--  kernel/time/hrtimer.c | 20
-rw-r--r--  kernel/time/timekeeping.c | 72
-rw-r--r--  kernel/time/timekeeping_debug.c | 4
-rw-r--r--  kernel/time/timer.c | 54
-rw-r--r--  kernel/trace/bpf_trace.c | 34
-rw-r--r--  kernel/trace/ftrace.c | 59
-rw-r--r--  kernel/trace/ring_buffer.c | 24
-rw-r--r--  kernel/trace/trace.c | 40
-rw-r--r--  kernel/trace/trace.h | 2
-rw-r--r--  kernel/trace/trace_events_filter.c | 4
-rw-r--r--  kernel/trace/trace_kprobe.c | 26
-rw-r--r--  kernel/trace/trace_selftest.c | 2
-rw-r--r--  kernel/trace/tracing_map.c | 11
-rw-r--r--  kernel/ucount.c | 21
-rw-r--r--  kernel/watchdog.c | 278
-rw-r--r--  kernel/watchdog_hld.c | 230
-rw-r--r--  kernel/workqueue.c | 60
-rw-r--r--  kernel/workqueue_internal.h | 3
68 files changed, 1456 insertions(+), 994 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index eb26e12c6c2a..314e7d62f5f0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -84,6 +84,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
obj-$(CONFIG_LOCKUP_DETECTOR) += watchdog.o
+obj-$(CONFIG_HARDLOCKUP_DETECTOR) += watchdog_hld.o
obj-$(CONFIG_SECCOMP) += seccomp.o
obj-$(CONFIG_RELAY) += relay.o
obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index 0d302a87f21b..690e1e3c59f7 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -457,13 +457,15 @@ void audit_remove_watch_rule(struct audit_krule *krule)
list_del(&krule->rlist);
if (list_empty(&watch->rules)) {
+ /*
+ * audit_remove_watch() drops our reference to 'parent' which
+ * can get freed. Grab our own reference to be safe.
+ */
+ audit_get_parent(parent);
audit_remove_watch(watch);
-
- if (list_empty(&parent->watches)) {
- audit_get_parent(parent);
+ if (list_empty(&parent->watches))
fsnotify_destroy_mark(&parent->mark, audit_watch_group);
- audit_put_parent(parent);
- }
+ audit_put_parent(parent);
}
}
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index a2ac051c342f..f3721e150d94 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -11,7 +11,6 @@
*/
#include <linux/bpf.h>
#include <linux/err.h>
-#include <linux/vmalloc.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/filter.h>
@@ -74,14 +73,10 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
if (array_size >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-ENOMEM);
-
/* allocate all map elements and zero-initialize them */
- array = kzalloc(array_size, GFP_USER | __GFP_NOWARN);
- if (!array) {
- array = vzalloc(array_size);
- if (!array)
- return ERR_PTR(-ENOMEM);
- }
+ array = bpf_map_area_alloc(array_size);
+ if (!array)
+ return ERR_PTR(-ENOMEM);
/* copy mandatory map attributes */
array->map.map_type = attr->map_type;
@@ -97,7 +92,7 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
if (array_size >= U32_MAX - PAGE_SIZE ||
elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) {
- kvfree(array);
+ bpf_map_area_free(array);
return ERR_PTR(-ENOMEM);
}
out:
@@ -262,7 +257,7 @@ static void array_map_free(struct bpf_map *map)
if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY)
bpf_array_free_percpu(array);
- kvfree(array);
+ bpf_map_area_free(array);
}
static const struct bpf_map_ops array_ops = {
@@ -319,7 +314,8 @@ static void fd_array_map_free(struct bpf_map *map)
/* make sure it's empty */
for (i = 0; i < array->map.max_entries; i++)
BUG_ON(array->ptrs[i] != NULL);
- kvfree(array);
+
+ bpf_map_area_free(array);
}
static void *fd_array_map_lookup_elem(struct bpf_map *map, void *key)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
index ad1bc67aff1b..ad2f0ed75471 100644
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -13,7 +13,6 @@
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
-#include <linux/vmalloc.h>
#include "percpu_freelist.h"
struct bucket {
@@ -84,14 +83,15 @@ static void htab_free_elems(struct bpf_htab *htab)
free_percpu(pptr);
}
free_elems:
- vfree(htab->elems);
+ bpf_map_area_free(htab->elems);
}
static int prealloc_elems_and_freelist(struct bpf_htab *htab)
{
int err = -ENOMEM, i;
- htab->elems = vzalloc(htab->elem_size * htab->map.max_entries);
+ htab->elems = bpf_map_area_alloc(htab->elem_size *
+ htab->map.max_entries);
if (!htab->elems)
return -ENOMEM;
@@ -227,14 +227,10 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
goto free_htab;
err = -ENOMEM;
- htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
- GFP_USER | __GFP_NOWARN);
-
- if (!htab->buckets) {
- htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
- if (!htab->buckets)
- goto free_htab;
- }
+ htab->buckets = bpf_map_area_alloc(htab->n_buckets *
+ sizeof(struct bucket));
+ if (!htab->buckets)
+ goto free_htab;
for (i = 0; i < htab->n_buckets; i++) {
INIT_HLIST_HEAD(&htab->buckets[i].head);
@@ -258,7 +254,7 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
free_extra_elems:
free_percpu(htab->extra_elems);
free_buckets:
- kvfree(htab->buckets);
+ bpf_map_area_free(htab->buckets);
free_htab:
kfree(htab);
return ERR_PTR(err);
@@ -715,7 +711,7 @@ static void htab_map_free(struct bpf_map *map)
pcpu_freelist_destroy(&htab->freelist);
}
free_percpu(htab->extra_elems);
- kvfree(htab->buckets);
+ bpf_map_area_free(htab->buckets);
kfree(htab);
}
diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c
index 732ae16d12b7..be8519148c25 100644
--- a/kernel/bpf/stackmap.c
+++ b/kernel/bpf/stackmap.c
@@ -7,7 +7,6 @@
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
-#include <linux/vmalloc.h>
#include <linux/stacktrace.h>
#include <linux/perf_event.h>
#include "percpu_freelist.h"
@@ -32,7 +31,7 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
int err;
- smap->elems = vzalloc(elem_size * smap->map.max_entries);
+ smap->elems = bpf_map_area_alloc(elem_size * smap->map.max_entries);
if (!smap->elems)
return -ENOMEM;
@@ -45,7 +44,7 @@ static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
return 0;
free_elems:
- vfree(smap->elems);
+ bpf_map_area_free(smap->elems);
return err;
}
@@ -76,12 +75,9 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
if (cost >= U32_MAX - PAGE_SIZE)
return ERR_PTR(-E2BIG);
- smap = kzalloc(cost, GFP_USER | __GFP_NOWARN);
- if (!smap) {
- smap = vzalloc(cost);
- if (!smap)
- return ERR_PTR(-ENOMEM);
- }
+ smap = bpf_map_area_alloc(cost);
+ if (!smap)
+ return ERR_PTR(-ENOMEM);
err = -E2BIG;
cost += n_buckets * (value_size + sizeof(struct stack_map_bucket));
@@ -112,7 +108,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
put_buffers:
put_callchain_buffers();
free_smap:
- kvfree(smap);
+ bpf_map_area_free(smap);
return ERR_PTR(err);
}
@@ -262,9 +258,9 @@ static void stack_map_free(struct bpf_map *map)
/* wait for bpf programs to complete before freeing stack map */
synchronize_rcu();
- vfree(smap->elems);
+ bpf_map_area_free(smap->elems);
pcpu_freelist_destroy(&smap->freelist);
- kvfree(smap);
+ bpf_map_area_free(smap);
put_callchain_buffers();
}
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index 237f3d6a7ddc..72ea91df71c9 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -12,6 +12,8 @@
#include <linux/bpf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/mmzone.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/license.h>
@@ -48,6 +50,30 @@ void bpf_register_map_type(struct bpf_map_type_list *tl)
list_add(&tl->list_node, &bpf_map_types);
}
+void *bpf_map_area_alloc(size_t size)
+{
+ /* We definitely need __GFP_NORETRY, so OOM killer doesn't
+ * trigger under memory pressure as we really just want to
+ * fail instead.
+ */
+ const gfp_t flags = __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO;
+ void *area;
+
+ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+ area = kmalloc(size, GFP_USER | flags);
+ if (area != NULL)
+ return area;
+ }
+
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | flags,
+ PAGE_KERNEL);
+}
+
+void bpf_map_area_free(void *area)
+{
+ kvfree(area);
+}
+
int bpf_map_precharge_memlock(u32 pages)
{
struct user_struct *user = get_current_user();
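
[Editor's note] The bpf_map_area_alloc()/bpf_map_area_free() helpers added above centralize the kmalloc-or-vmalloc pattern that the arraymap, hashtab and stackmap hunks earlier in this diff remove. A minimal, hypothetical caller (names are illustrative, not part of this patch) would look like:

#include <linux/bpf.h>
#include <linux/err.h>

/* Hypothetical map backend, not part of this patch. Requests up to
 * PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER take the kmalloc fast path inside
 * the helper; larger ones silently fall back to vmalloc, and
 * bpf_map_area_free() handles either kind.
 */
static struct bpf_map *example_map_alloc(union bpf_attr *attr)
{
	u64 size = sizeof(struct bpf_array) +
		   (u64)attr->max_entries * round_up(attr->value_size, 8);
	struct bpf_array *array;

	if (size >= U32_MAX - PAGE_SIZE)
		return ERR_PTR(-ENOMEM);

	array = bpf_map_area_alloc(size);	/* zeroed on success */
	if (!array)
		return ERR_PTR(-ENOMEM);
	return &array->map;
}

static void example_map_free(struct bpf_map *map)
{
	bpf_map_area_free(container_of(map, struct bpf_array, map));
}
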
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 8199821f54cf..372454aa7f37 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -139,7 +139,7 @@ struct bpf_verifier_stack_elem {
struct bpf_verifier_stack_elem *next;
};
-#define BPF_COMPLEXITY_LIMIT_INSNS 65536
+#define BPF_COMPLEXITY_LIMIT_INSNS 98304
#define BPF_COMPLEXITY_LIMIT_STACK 1024
struct bpf_call_arg_meta {
@@ -212,9 +212,10 @@ static void print_verifier_state(struct bpf_verifier_state *state)
else if (t == CONST_PTR_TO_MAP || t == PTR_TO_MAP_VALUE ||
t == PTR_TO_MAP_VALUE_OR_NULL ||
t == PTR_TO_MAP_VALUE_ADJ)
- verbose("(ks=%d,vs=%d)",
+ verbose("(ks=%d,vs=%d,id=%u)",
reg->map_ptr->key_size,
- reg->map_ptr->value_size);
+ reg->map_ptr->value_size,
+ reg->id);
if (reg->min_value != BPF_REGISTER_MIN_RANGE)
verbose(",min_value=%lld",
(long long)reg->min_value);
@@ -278,7 +279,8 @@ static const char *const bpf_jmp_string[16] = {
[BPF_EXIT >> 4] = "exit",
};
-static void print_bpf_insn(struct bpf_insn *insn)
+static void print_bpf_insn(const struct bpf_verifier_env *env,
+ const struct bpf_insn *insn)
{
u8 class = BPF_CLASS(insn->code);
@@ -342,9 +344,19 @@ static void print_bpf_insn(struct bpf_insn *insn)
insn->code,
bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
insn->src_reg, insn->imm);
- } else if (BPF_MODE(insn->code) == BPF_IMM) {
- verbose("(%02x) r%d = 0x%x\n",
- insn->code, insn->dst_reg, insn->imm);
+ } else if (BPF_MODE(insn->code) == BPF_IMM &&
+ BPF_SIZE(insn->code) == BPF_DW) {
+ /* At this point, we already made sure that the second
+ * part of the ldimm64 insn is accessible.
+ */
+ u64 imm = ((u64)(insn + 1)->imm << 32) | (u32)insn->imm;
+ bool map_ptr = insn->src_reg == BPF_PSEUDO_MAP_FD;
+
+ if (map_ptr && !env->allow_ptr_leaks)
+ imm = 0;
+
+ verbose("(%02x) r%d = 0x%llx\n", insn->code,
+ insn->dst_reg, (unsigned long long)imm);
} else {
verbose("BUG_ld_%02x\n", insn->code);
return;
@@ -443,13 +455,19 @@ static void init_reg_state(struct bpf_reg_state *regs)
regs[BPF_REG_1].type = PTR_TO_CTX;
}
-static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
+static void __mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
{
- BUG_ON(regno >= MAX_BPF_REG);
regs[regno].type = UNKNOWN_VALUE;
+ regs[regno].id = 0;
regs[regno].imm = 0;
}
+static void mark_reg_unknown_value(struct bpf_reg_state *regs, u32 regno)
+{
+ BUG_ON(regno >= MAX_BPF_REG);
+ __mark_reg_unknown_value(regs, regno);
+}
+
static void reset_reg_range_values(struct bpf_reg_state *regs, u32 regno)
{
regs[regno].min_value = BPF_REGISTER_MIN_RANGE;
@@ -664,12 +682,13 @@ static int check_ctx_access(struct bpf_verifier_env *env, int off, int size,
return -EACCES;
}
-static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
+static bool __is_pointer_value(bool allow_ptr_leaks,
+ const struct bpf_reg_state *reg)
{
- if (env->allow_ptr_leaks)
+ if (allow_ptr_leaks)
return false;
- switch (env->cur_state.regs[regno].type) {
+ switch (reg->type) {
case UNKNOWN_VALUE:
case CONST_IMM:
return false;
@@ -678,6 +697,11 @@ static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
}
}
+static bool is_pointer_value(struct bpf_verifier_env *env, int regno)
+{
+ return __is_pointer_value(env->allow_ptr_leaks, &env->cur_state.regs[regno]);
+}
+
static int check_ptr_alignment(struct bpf_verifier_env *env,
struct bpf_reg_state *reg, int off, int size)
{
@@ -867,6 +891,11 @@ static int check_xadd(struct bpf_verifier_env *env, struct bpf_insn *insn)
if (err)
return err;
+ if (is_pointer_value(env, insn->src_reg)) {
+ verbose("R%d leaks addr into mem\n", insn->src_reg);
+ return -EACCES;
+ }
+
/* check whether atomic_add can read the memory */
err = check_mem_access(env, insn->dst_reg, insn->off,
BPF_SIZE(insn->code), BPF_READ, -1);
@@ -1252,6 +1281,7 @@ static int check_call(struct bpf_verifier_env *env, int func_id)
return -EINVAL;
}
regs[BPF_REG_0].map_ptr = meta.map_ptr;
+ regs[BPF_REG_0].id = ++env->id_gen;
} else {
verbose("unknown return type %d of func %d\n",
fn->ret_type, func_id);
@@ -1443,6 +1473,65 @@ static int evaluate_reg_alu(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
}
+static int evaluate_reg_imm_alu_unknown(struct bpf_verifier_env *env,
+ struct bpf_insn *insn)
+{
+ struct bpf_reg_state *regs = env->cur_state.regs;
+ struct bpf_reg_state *dst_reg = &regs[insn->dst_reg];
+ struct bpf_reg_state *src_reg = &regs[insn->src_reg];
+ u8 opcode = BPF_OP(insn->code);
+ s64 imm_log2 = __ilog2_u64((long long)dst_reg->imm);
+
+ /* BPF_X code with src_reg->type UNKNOWN_VALUE here. */
+ if (src_reg->imm > 0 && dst_reg->imm) {
+ switch (opcode) {
+ case BPF_ADD:
+ /* dreg += sreg
+ * where both have zero upper bits. Adding them
+ * can only result making one more bit non-zero
+ * in the larger value.
+ * Ex. 0xffff (imm=48) + 1 (imm=63) = 0x10000 (imm=47)
+ * 0xffff (imm=48) + 0xffff = 0x1fffe (imm=47)
+ */
+ dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+ dst_reg->imm--;
+ break;
+ case BPF_AND:
+ /* dreg &= sreg
+ * AND can not extend zero bits only shrink
+ * Ex. 0x00..00ffffff
+ * & 0x0f..ffffffff
+ * ----------------
+ * 0x00..00ffffff
+ */
+ dst_reg->imm = max(src_reg->imm, 63 - imm_log2);
+ break;
+ case BPF_OR:
+ /* dreg |= sreg
+ * OR can only extend zero bits
+ * Ex. 0x00..00ffffff
+ * | 0x0f..ffffffff
+ * ----------------
+ * 0x0f..00ffffff
+ */
+ dst_reg->imm = min(src_reg->imm, 63 - imm_log2);
+ break;
+ case BPF_SUB:
+ case BPF_MUL:
+ case BPF_RSH:
+ case BPF_LSH:
+ /* These may be flushed out later */
+ default:
+ mark_reg_unknown_value(regs, insn->dst_reg);
+ }
+ } else {
+ mark_reg_unknown_value(regs, insn->dst_reg);
+ }
+
+ dst_reg->type = UNKNOWN_VALUE;
+ return 0;
+}
+
static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
struct bpf_insn *insn)
{
@@ -1451,6 +1540,9 @@ static int evaluate_reg_imm_alu(struct bpf_verifier_env *env,
struct bpf_reg_state *src_reg = &regs[insn->src_reg];
u8 opcode = BPF_OP(insn->code);
+ if (BPF_SRC(insn->code) == BPF_X && src_reg->type == UNKNOWN_VALUE)
+ return evaluate_reg_imm_alu_unknown(env, insn);
+
/* dst_reg->type == CONST_IMM here, simulate execution of 'add' insn.
* Don't care about overflow or negative values, just add them
*/
@@ -1506,10 +1598,24 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
}
/* We don't know anything about what was done to this register, mark it
- * as unknown.
+ * as unknown. Also, if both derived bounds came from signed/unsigned
+ * mixed compares and one side is unbounded, we cannot really do anything
+ * with them as boundaries cannot be trusted. Thus, arithmetic of two
+ * regs of such kind will get invalidated bounds on the dst side.
*/
- if (min_val == BPF_REGISTER_MIN_RANGE &&
- max_val == BPF_REGISTER_MAX_RANGE) {
+ if ((min_val == BPF_REGISTER_MIN_RANGE &&
+ max_val == BPF_REGISTER_MAX_RANGE) ||
+ (BPF_SRC(insn->code) == BPF_X &&
+ ((min_val != BPF_REGISTER_MIN_RANGE &&
+ max_val == BPF_REGISTER_MAX_RANGE) ||
+ (min_val == BPF_REGISTER_MIN_RANGE &&
+ max_val != BPF_REGISTER_MAX_RANGE) ||
+ (dst_reg->min_value != BPF_REGISTER_MIN_RANGE &&
+ dst_reg->max_value == BPF_REGISTER_MAX_RANGE) ||
+ (dst_reg->min_value == BPF_REGISTER_MIN_RANGE &&
+ dst_reg->max_value != BPF_REGISTER_MAX_RANGE)) &&
+ regs[insn->dst_reg].value_from_signed !=
+ regs[insn->src_reg].value_from_signed)) {
reset_reg_range_values(regs, insn->dst_reg);
return;
}
@@ -1518,10 +1624,12 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
* do our normal operations to the register, we need to set the values
* to the min/max since they are undefined.
*/
- if (min_val == BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
- if (max_val == BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ if (opcode != BPF_SUB) {
+ if (min_val == BPF_REGISTER_MIN_RANGE)
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
+ if (max_val == BPF_REGISTER_MAX_RANGE)
+ dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ }
switch (opcode) {
case BPF_ADD:
@@ -1531,10 +1639,17 @@ static void adjust_reg_min_max_vals(struct bpf_verifier_env *env,
dst_reg->max_value += max_val;
break;
case BPF_SUB:
+ /* If one of our values was at the end of our ranges, then the
+ * _opposite_ value in the dst_reg goes to the end of our range.
+ */
+ if (min_val == BPF_REGISTER_MIN_RANGE)
+ dst_reg->max_value = BPF_REGISTER_MAX_RANGE;
+ if (max_val == BPF_REGISTER_MAX_RANGE)
+ dst_reg->min_value = BPF_REGISTER_MIN_RANGE;
if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
- dst_reg->min_value -= min_val;
+ dst_reg->min_value -= max_val;
if (dst_reg->max_value != BPF_REGISTER_MAX_RANGE)
- dst_reg->max_value -= max_val;
+ dst_reg->max_value -= min_val;
break;
case BPF_MUL:
if (dst_reg->min_value != BPF_REGISTER_MIN_RANGE)
@@ -1605,7 +1720,8 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
}
} else {
if (insn->src_reg != BPF_REG_0 || insn->off != 0 ||
- (insn->imm != 16 && insn->imm != 32 && insn->imm != 64)) {
+ (insn->imm != 16 && insn->imm != 32 && insn->imm != 64) ||
+ BPF_CLASS(insn->code) == BPF_ALU64) {
verbose("BPF_END uses reserved fields\n");
return -EINVAL;
}
@@ -1668,8 +1784,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
insn->src_reg);
return -EACCES;
}
- regs[insn->dst_reg].type = UNKNOWN_VALUE;
- regs[insn->dst_reg].map_ptr = NULL;
+ mark_reg_unknown_value(regs, insn->dst_reg);
}
} else {
/* case: R = imm
@@ -1742,6 +1857,17 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
return 0;
} else if (opcode == BPF_ADD &&
BPF_CLASS(insn->code) == BPF_ALU64 &&
+ dst_reg->type == PTR_TO_STACK &&
+ ((BPF_SRC(insn->code) == BPF_X &&
+ regs[insn->src_reg].type == CONST_IMM) ||
+ BPF_SRC(insn->code) == BPF_K)) {
+ if (BPF_SRC(insn->code) == BPF_X)
+ dst_reg->imm += regs[insn->src_reg].imm;
+ else
+ dst_reg->imm += insn->imm;
+ return 0;
+ } else if (opcode == BPF_ADD &&
+ BPF_CLASS(insn->code) == BPF_ALU64 &&
(dst_reg->type == PTR_TO_PACKET ||
(BPF_SRC(insn->code) == BPF_X &&
regs[insn->src_reg].type == PTR_TO_PACKET))) {
@@ -1774,6 +1900,7 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn)
* register as unknown.
*/
if (env->allow_ptr_leaks &&
+ BPF_CLASS(insn->code) == BPF_ALU64 && opcode == BPF_ADD &&
(dst_reg->type == PTR_TO_MAP_VALUE ||
dst_reg->type == PTR_TO_MAP_VALUE_ADJ))
dst_reg->type = PTR_TO_MAP_VALUE_ADJ;
@@ -1822,14 +1949,15 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *state,
for (i = 0; i < MAX_BPF_REG; i++)
if (regs[i].type == PTR_TO_PACKET && regs[i].id == dst_reg->id)
- regs[i].range = dst_reg->off;
+ /* keep the maximum range already checked */
+ regs[i].range = max(regs[i].range, dst_reg->off);
for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
if (state->stack_slot_type[i] != STACK_SPILL)
continue;
reg = &state->spilled_regs[i / BPF_REG_SIZE];
if (reg->type == PTR_TO_PACKET && reg->id == dst_reg->id)
- reg->range = dst_reg->off;
+ reg->range = max(reg->range, dst_reg->off);
}
}
@@ -1841,38 +1969,63 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
u8 opcode)
{
+ bool value_from_signed = true;
+ bool is_range = true;
+
switch (opcode) {
case BPF_JEQ:
/* If this is false then we know nothing Jon Snow, but if it is
* true then we know for sure.
*/
true_reg->max_value = true_reg->min_value = val;
+ is_range = false;
break;
case BPF_JNE:
/* If this is true we know nothing Jon Snow, but if it is false
* we know the value for sure;
*/
false_reg->max_value = false_reg->min_value = val;
+ is_range = false;
break;
case BPF_JGT:
- /* Unsigned comparison, the minimum value is 0. */
- false_reg->min_value = 0;
+ value_from_signed = false;
+ /* fallthrough */
case BPF_JSGT:
+ if (true_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(true_reg, 0);
+ if (false_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(false_reg, 0);
+ if (opcode == BPF_JGT) {
+ /* Unsigned comparison, the minimum value is 0. */
+ false_reg->min_value = 0;
+ }
/* If this is false then we know the maximum val is val,
* otherwise we know the min val is val+1.
*/
false_reg->max_value = val;
+ false_reg->value_from_signed = value_from_signed;
true_reg->min_value = val + 1;
+ true_reg->value_from_signed = value_from_signed;
break;
case BPF_JGE:
- /* Unsigned comparison, the minimum value is 0. */
- false_reg->min_value = 0;
+ value_from_signed = false;
+ /* fallthrough */
case BPF_JSGE:
+ if (true_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(true_reg, 0);
+ if (false_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(false_reg, 0);
+ if (opcode == BPF_JGE) {
+ /* Unsigned comparison, the minimum value is 0. */
+ false_reg->min_value = 0;
+ }
/* If this is false then we know the maximum value is val - 1,
* otherwise we know the mimimum value is val.
*/
false_reg->max_value = val - 1;
+ false_reg->value_from_signed = value_from_signed;
true_reg->min_value = val;
+ true_reg->value_from_signed = value_from_signed;
break;
default:
break;
@@ -1880,6 +2033,12 @@ static void reg_set_min_max(struct bpf_reg_state *true_reg,
check_reg_overflow(false_reg);
check_reg_overflow(true_reg);
+ if (is_range) {
+ if (__is_pointer_value(false, false_reg))
+ reset_reg_range_values(false_reg, 0);
+ if (__is_pointer_value(false, true_reg))
+ reset_reg_range_values(true_reg, 0);
+ }
}
/* Same as above, but for the case that dst_reg is a CONST_IMM reg and src_reg
@@ -1889,39 +2048,64 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
struct bpf_reg_state *false_reg, u64 val,
u8 opcode)
{
+ bool value_from_signed = true;
+ bool is_range = true;
+
switch (opcode) {
case BPF_JEQ:
/* If this is false then we know nothing Jon Snow, but if it is
* true then we know for sure.
*/
true_reg->max_value = true_reg->min_value = val;
+ is_range = false;
break;
case BPF_JNE:
/* If this is true we know nothing Jon Snow, but if it is false
* we know the value for sure;
*/
false_reg->max_value = false_reg->min_value = val;
+ is_range = false;
break;
case BPF_JGT:
- /* Unsigned comparison, the minimum value is 0. */
- true_reg->min_value = 0;
+ value_from_signed = false;
+ /* fallthrough */
case BPF_JSGT:
+ if (true_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(true_reg, 0);
+ if (false_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(false_reg, 0);
+ if (opcode == BPF_JGT) {
+ /* Unsigned comparison, the minimum value is 0. */
+ true_reg->min_value = 0;
+ }
/*
* If this is false, then the val is <= the register, if it is
* true the register <= to the val.
*/
false_reg->min_value = val;
+ false_reg->value_from_signed = value_from_signed;
true_reg->max_value = val - 1;
+ true_reg->value_from_signed = value_from_signed;
break;
case BPF_JGE:
- /* Unsigned comparison, the minimum value is 0. */
- true_reg->min_value = 0;
+ value_from_signed = false;
+ /* fallthrough */
case BPF_JSGE:
+ if (true_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(true_reg, 0);
+ if (false_reg->value_from_signed != value_from_signed)
+ reset_reg_range_values(false_reg, 0);
+ if (opcode == BPF_JGE) {
+ /* Unsigned comparison, the minimum value is 0. */
+ true_reg->min_value = 0;
+ }
/* If this is false then constant < register, if it is true then
* the register < constant.
*/
false_reg->min_value = val + 1;
+ false_reg->value_from_signed = value_from_signed;
true_reg->max_value = val;
+ true_reg->value_from_signed = value_from_signed;
break;
default:
break;
@@ -1929,6 +2113,49 @@ static void reg_set_min_max_inv(struct bpf_reg_state *true_reg,
check_reg_overflow(false_reg);
check_reg_overflow(true_reg);
+ if (is_range) {
+ if (__is_pointer_value(false, false_reg))
+ reset_reg_range_values(false_reg, 0);
+ if (__is_pointer_value(false, true_reg))
+ reset_reg_range_values(true_reg, 0);
+ }
+}
+
+static void mark_map_reg(struct bpf_reg_state *regs, u32 regno, u32 id,
+ enum bpf_reg_type type)
+{
+ struct bpf_reg_state *reg = &regs[regno];
+
+ if (reg->type == PTR_TO_MAP_VALUE_OR_NULL && reg->id == id) {
+ reg->type = type;
+ /* We don't need id from this point onwards anymore, thus we
+ * should better reset it, so that state pruning has chances
+ * to take effect.
+ */
+ reg->id = 0;
+ if (type == UNKNOWN_VALUE)
+ __mark_reg_unknown_value(regs, regno);
+ }
+}
+
+/* The logic is similar to find_good_pkt_pointers(), both could eventually
+ * be folded together at some point.
+ */
+static void mark_map_regs(struct bpf_verifier_state *state, u32 regno,
+ enum bpf_reg_type type)
+{
+ struct bpf_reg_state *regs = state->regs;
+ u32 id = regs[regno].id;
+ int i;
+
+ for (i = 0; i < MAX_BPF_REG; i++)
+ mark_map_reg(regs, i, id, type);
+
+ for (i = 0; i < MAX_BPF_STACK; i += BPF_REG_SIZE) {
+ if (state->stack_slot_type[i] != STACK_SPILL)
+ continue;
+ mark_map_reg(state->spilled_regs, i / BPF_REG_SIZE, id, type);
+ }
}
static int check_cond_jmp_op(struct bpf_verifier_env *env,
@@ -2018,18 +2245,13 @@ static int check_cond_jmp_op(struct bpf_verifier_env *env,
if (BPF_SRC(insn->code) == BPF_K &&
insn->imm == 0 && (opcode == BPF_JEQ || opcode == BPF_JNE) &&
dst_reg->type == PTR_TO_MAP_VALUE_OR_NULL) {
- if (opcode == BPF_JEQ) {
- /* next fallthrough insn can access memory via
- * this register
- */
- regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
- /* branch targer cannot access it, since reg == 0 */
- mark_reg_unknown_value(other_branch->regs,
- insn->dst_reg);
- } else {
- other_branch->regs[insn->dst_reg].type = PTR_TO_MAP_VALUE;
- mark_reg_unknown_value(regs, insn->dst_reg);
- }
+ /* Mark all identical map registers in each branch as either
+ * safe or unknown depending R == 0 or R != 0 conditional.
+ */
+ mark_map_regs(this_branch, insn->dst_reg,
+ opcode == BPF_JEQ ? PTR_TO_MAP_VALUE : UNKNOWN_VALUE);
+ mark_map_regs(other_branch, insn->dst_reg,
+ opcode == BPF_JEQ ? UNKNOWN_VALUE : PTR_TO_MAP_VALUE);
} else if (BPF_SRC(insn->code) == BPF_X && opcode == BPF_JGT &&
dst_reg->type == PTR_TO_PACKET &&
regs[insn->src_reg].type == PTR_TO_PACKET_END) {
@@ -2323,6 +2545,7 @@ peek_stack:
env->explored_states[t + 1] = STATE_LIST_MARK;
} else {
/* conditional jump with two edges */
+ env->explored_states[t] = STATE_LIST_MARK;
ret = push_insn(t, t + 1, FALLTHROUGH, env);
if (ret == 1)
goto peek_stack;
@@ -2469,7 +2692,7 @@ static bool states_equal(struct bpf_verifier_env *env,
* we didn't do a variable access into a map then we are a-ok.
*/
if (!varlen_map_access &&
- rold->type == rcur->type && rold->imm == rcur->imm)
+ memcmp(rold, rcur, offsetofend(struct bpf_reg_state, id)) == 0)
continue;
/* If we didn't map access then again we don't care about the
@@ -2481,6 +2704,12 @@ static bool states_equal(struct bpf_verifier_env *env,
rcur->type != NOT_INIT))
continue;
+ /* Don't care about the reg->id in this case. */
+ if (rold->type == PTR_TO_MAP_VALUE_OR_NULL &&
+ rcur->type == PTR_TO_MAP_VALUE_OR_NULL &&
+ rold->map_ptr == rcur->map_ptr)
+ continue;
+
if (rold->type == PTR_TO_PACKET && rcur->type == PTR_TO_PACKET &&
compare_ptrs_to_packet(rold, rcur))
continue;
@@ -2615,6 +2844,9 @@ static int do_check(struct bpf_verifier_env *env)
goto process_bpf_exit;
}
+ if (need_resched())
+ cond_resched();
+
if (log_level && do_print_state) {
verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx);
print_verifier_state(&env->cur_state);
@@ -2623,7 +2855,7 @@ static int do_check(struct bpf_verifier_env *env)
if (log_level) {
verbose("%d: ", insn_idx);
- print_bpf_insn(insn);
+ print_bpf_insn(env, insn);
}
err = ext_analyzer_insn_hook(env, insn_idx, prev_insn_idx);
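
[Editor's note] The verifier changes above give each map_lookup result a fresh reg->id and let the R == 0 / R != 0 branches (mark_map_regs()) mark every register and stack slot carrying that id, not just the one being tested. A hedged illustration of what this accepts, written from the BPF program side; the program, the 'my_map' array map (8-byte values) and the SEC()/helper scaffolding from bpf_helpers.h are assumptions, not part of this patch:

SEC("socket")
int check_once_use_twice(struct __sk_buff *skb)
{
	int key = 0;
	long *p, *q;

	p = bpf_map_lookup_elem(&my_map, &key);	/* result gets a fresh id */
	q = p;					/* the copy carries the same id */

	if (!p)					/* one NULL check ...           */
		return 0;
	return (int)(*p + *q);			/* ... now covers both copies   */
}
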
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 4e2f3de0e40b..4c233437ee1a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2920,11 +2920,12 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
tsk = tsk->group_leader;
/*
- * Workqueue threads may acquire PF_NO_SETAFFINITY and become
- * trapped in a cpuset, or RT worker may be born in a cgroup
- * with no rt_runtime allocated. Just say no.
+ * kthreads may acquire PF_NO_SETAFFINITY during initialization.
+ * If userland migrates such a kthread to a non-root cgroup, it can
+ * become trapped in a cpuset, or RT kthread may be born in a
+ * cgroup with no rt_runtime allocated. Just say no.
*/
- if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
+ if (tsk->no_cgroup_migration || (tsk->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
goto out_unlock_rcu;
}
@@ -3486,11 +3487,11 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of,
cgrp->subtree_control &= ~disable;
ret = cgroup_apply_control(cgrp);
-
cgroup_finalize_control(cgrp, ret);
+ if (ret)
+ goto out_unlock;
kernfs_activate(cgrp->kn);
- ret = 0;
out_unlock:
cgroup_kn_unlock(of->kn);
return ret ?: nbytes;
@@ -5406,6 +5407,11 @@ static void kill_css(struct cgroup_subsys_state *css)
{
lockdep_assert_held(&cgroup_mutex);
+ if (css->flags & CSS_DYING)
+ return;
+
+ css->flags |= CSS_DYING;
+
/*
* This must happen before css is disassociated with its cgroup.
* See seq_css() for details.
@@ -5712,6 +5718,10 @@ int __init cgroup_init(void)
if (ss->bind)
ss->bind(init_css_set.subsys[ssid]);
+
+ mutex_lock(&cgroup_mutex);
+ css_populate_dir(init_css_set.subsys[ssid]);
+ mutex_unlock(&cgroup_mutex);
}
/* init_css_set.subsys[] has been updated, re-hash */
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
index 2bd673783f1a..a57242e0d5a6 100644
--- a/kernel/cgroup_pids.c
+++ b/kernel/cgroup_pids.c
@@ -229,7 +229,7 @@ static int pids_can_fork(struct task_struct *task)
/* Only log the first time events_limit is incremented. */
if (atomic64_inc_return(&pids->events_limit) == 1) {
pr_info("cgroup: fork rejected by pids controller in ");
- pr_cont_cgroup_path(task_cgroup(current, pids_cgrp_id));
+ pr_cont_cgroup_path(css->cgroup);
pr_cont("\n");
}
cgroup_file_notify(&pids->events_file);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 3f7369bac680..3fb3d7209e6f 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -410,11 +410,26 @@ static int notify_online(unsigned int cpu)
return 0;
}
+static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st);
+
static int bringup_wait_for_ap(unsigned int cpu)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu);
+ /* Wait for the CPU to reach CPUHP_AP_ONLINE_IDLE */
wait_for_completion(&st->done);
+ if (WARN_ON_ONCE((!cpu_online(cpu))))
+ return -ECANCELED;
+
+ /* Unpark the stopper thread and the hotplug thread of the target cpu */
+ stop_machine_unpark(cpu);
+ kthread_unpark(st->thread);
+
+ /* Should we go further up ? */
+ if (st->target > CPUHP_AP_ONLINE_IDLE) {
+ __cpuhp_kick_ap_work(st);
+ wait_for_completion(&st->done);
+ }
return st->result;
}
@@ -437,9 +452,7 @@ static int bringup_cpu(unsigned int cpu)
cpu_notify(CPU_UP_CANCELED, cpu);
return ret;
}
- ret = bringup_wait_for_ap(cpu);
- BUG_ON(!cpu_online(cpu));
- return ret;
+ return bringup_wait_for_ap(cpu);
}
/*
@@ -974,31 +987,20 @@ void notify_cpu_starting(unsigned int cpu)
}
/*
- * Called from the idle task. We need to set active here, so we can kick off
- * the stopper thread and unpark the smpboot threads. If the target state is
- * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the
- * cpu further.
+ * Called from the idle task. Wake up the controlling task which brings the
+ * stopper and the hotplug thread of the upcoming CPU up and then delegates
+ * the rest of the online bringup to the hotplug thread.
*/
void cpuhp_online_idle(enum cpuhp_state state)
{
struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state);
- unsigned int cpu = smp_processor_id();
/* Happens for the boot cpu */
if (state != CPUHP_AP_ONLINE_IDLE)
return;
st->state = CPUHP_AP_ONLINE_IDLE;
-
- /* Unpark the stopper thread and the hotplug thread of this cpu */
- stop_machine_unpark(cpu);
- kthread_unpark(st->thread);
-
- /* Should we go further up ? */
- if (st->target > CPUHP_AP_ONLINE_IDLE)
- __cpuhp_kick_ap_work(st);
- else
- complete(&st->done);
+ complete(&st->done);
}
/* Requires cpu_add_remove_lock to be held */
@@ -1441,14 +1443,12 @@ static void cpuhp_store_callbacks(enum cpuhp_state state,
/* (Un)Install the callbacks for further cpu hotplug operations */
struct cpuhp_step *sp;
- mutex_lock(&cpuhp_state_mutex);
sp = cpuhp_get_step(state);
sp->startup.single = startup;
sp->teardown.single = teardown;
sp->name = name;
sp->multi_instance = multi_instance;
INIT_HLIST_HEAD(&sp->list);
- mutex_unlock(&cpuhp_state_mutex);
}
static void *cpuhp_get_teardown_cb(enum cpuhp_state state)
@@ -1518,16 +1518,13 @@ static int cpuhp_reserve_state(enum cpuhp_state state)
{
enum cpuhp_state i;
- mutex_lock(&cpuhp_state_mutex);
for (i = CPUHP_AP_ONLINE_DYN; i <= CPUHP_AP_ONLINE_DYN_END; i++) {
if (cpuhp_ap_states[i].name)
continue;
cpuhp_ap_states[i].name = "Reserved";
- mutex_unlock(&cpuhp_state_mutex);
return i;
}
- mutex_unlock(&cpuhp_state_mutex);
WARN(1, "No more dynamic states available for CPU hotplug\n");
return -ENOSPC;
}
@@ -1544,6 +1541,7 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
return -EINVAL;
get_online_cpus();
+ mutex_lock(&cpuhp_state_mutex);
if (!invoke || !sp->startup.multi)
goto add_node;
@@ -1568,11 +1566,10 @@ int __cpuhp_state_add_instance(enum cpuhp_state state, struct hlist_node *node,
}
add_node:
ret = 0;
- mutex_lock(&cpuhp_state_mutex);
hlist_add_head(node, &sp->list);
- mutex_unlock(&cpuhp_state_mutex);
err:
+ mutex_unlock(&cpuhp_state_mutex);
put_online_cpus();
return ret;
}
@@ -1601,6 +1598,7 @@ int __cpuhp_setup_state(enum cpuhp_state state,
return -EINVAL;
get_online_cpus();
+ mutex_lock(&cpuhp_state_mutex);
/* currently assignments for the ONLINE state are possible */
if (state == CPUHP_AP_ONLINE_DYN) {
@@ -1636,6 +1634,8 @@ int __cpuhp_setup_state(enum cpuhp_state state,
}
}
out:
+ mutex_unlock(&cpuhp_state_mutex);
+
put_online_cpus();
if (!ret && dyn_state)
return state;
@@ -1655,6 +1655,8 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state,
return -EINVAL;
get_online_cpus();
+ mutex_lock(&cpuhp_state_mutex);
+
if (!invoke || !cpuhp_get_teardown_cb(state))
goto remove;
/*
@@ -1671,7 +1673,6 @@ int __cpuhp_state_remove_instance(enum cpuhp_state state,
}
remove:
- mutex_lock(&cpuhp_state_mutex);
hlist_del(node);
mutex_unlock(&cpuhp_state_mutex);
put_online_cpus();
@@ -1696,6 +1697,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
BUG_ON(cpuhp_cb_check(state));
get_online_cpus();
+ mutex_lock(&cpuhp_state_mutex);
if (sp->multi_instance) {
WARN(!hlist_empty(&sp->list),
@@ -1721,6 +1723,7 @@ void __cpuhp_remove_state(enum cpuhp_state state, bool invoke)
}
remove:
cpuhp_store_callbacks(state, NULL, NULL, NULL, false);
+ mutex_unlock(&cpuhp_state_mutex);
put_online_cpus();
}
EXPORT_SYMBOL(__cpuhp_remove_state);
@@ -1764,13 +1767,13 @@ static ssize_t write_cpuhp_target(struct device *dev,
ret = !sp->name || sp->cant_stop ? -EINVAL : 0;
mutex_unlock(&cpuhp_state_mutex);
if (ret)
- return ret;
+ goto out;
if (st->state < target)
ret = do_cpu_up(dev->id, target);
else
ret = do_cpu_down(dev->id, target);
-
+out:
unlock_device_hotplug();
return ret ? ret : count;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 29f815d2ef7e..511b1dd8ff09 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,6 +61,7 @@
#include <linux/cgroup.h>
#include <linux/wait.h>
+DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
/* See "Frequency meter" comments, below. */
@@ -174,9 +175,9 @@ typedef enum {
} cpuset_flagbits_t;
/* convenient tests for these bits */
-static inline bool is_cpuset_online(const struct cpuset *cs)
+static inline bool is_cpuset_online(struct cpuset *cs)
{
- return test_bit(CS_ONLINE, &cs->flags);
+ return test_bit(CS_ONLINE, &cs->flags) && !css_is_dying(&cs->css);
}
static inline int is_cpu_exclusive(const struct cpuset *cs)
@@ -1904,6 +1905,7 @@ static struct cftype files[] = {
{
.name = "memory_pressure",
.read_u64 = cpuset_read_u64,
+ .private = FILE_MEMORY_PRESSURE,
},
{
@@ -2274,6 +2276,13 @@ retry:
mutex_unlock(&cpuset_mutex);
}
+static bool force_rebuild;
+
+void cpuset_force_rebuild(void)
+{
+ force_rebuild = true;
+}
+
/**
* cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset
*
@@ -2348,8 +2357,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
}
/* rebuild sched domains if cpus_allowed has changed */
- if (cpus_updated)
+ if (cpus_updated || force_rebuild) {
+ force_rebuild = false;
rebuild_sched_domains();
+ }
}
void cpuset_update_active_cpus(bool cpu_online)
@@ -2368,6 +2379,11 @@ void cpuset_update_active_cpus(bool cpu_online)
schedule_work(&cpuset_hotplug_work);
}
+void cpuset_wait_for_hotplug(void)
+{
+ flush_work(&cpuset_hotplug_work);
+}
+
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
* Call this routine anytime after node_states[N_MEMORY] changes.
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 4b3323151a2f..36ff2d93f222 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -2272,7 +2272,7 @@ static int __perf_install_in_context(void *info)
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
- bool activate = true;
+ bool reprogram = true;
int ret = 0;
raw_spin_lock(&cpuctx->ctx.lock);
@@ -2280,27 +2280,26 @@ static int __perf_install_in_context(void *info)
raw_spin_lock(&ctx->lock);
task_ctx = ctx;
- /* If we're on the wrong CPU, try again */
- if (task_cpu(ctx->task) != smp_processor_id()) {
- ret = -ESRCH;
- goto unlock;
- }
+ reprogram = (ctx->task == current);
/*
- * If we're on the right CPU, see if the task we target is
- * current, if not we don't have to activate the ctx, a future
- * context switch will do that for us.
+ * If the task is running, it must be running on this CPU,
+ * otherwise we cannot reprogram things.
+ *
+ * If its not running, we don't care, ctx->lock will
+ * serialize against it becoming runnable.
*/
- if (ctx->task != current)
- activate = false;
- else
- WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
+ if (task_curr(ctx->task) && !reprogram) {
+ ret = -ESRCH;
+ goto unlock;
+ }
+ WARN_ON_ONCE(reprogram && cpuctx->task_ctx && cpuctx->task_ctx != ctx);
} else if (task_ctx) {
raw_spin_lock(&task_ctx->lock);
}
- if (activate) {
+ if (reprogram) {
ctx_sched_out(ctx, cpuctx, EVENT_TIME);
add_event_to_ctx(event, ctx);
ctx_resched(cpuctx, task_ctx);
@@ -2351,13 +2350,36 @@ perf_install_in_context(struct perf_event_context *ctx,
/*
* Installing events is tricky because we cannot rely on ctx->is_active
* to be set in case this is the nr_events 0 -> 1 transition.
+ *
+ * Instead we use task_curr(), which tells us if the task is running.
+ * However, since we use task_curr() outside of rq::lock, we can race
+ * against the actual state. This means the result can be wrong.
+ *
+ * If we get a false positive, we retry, this is harmless.
+ *
+ * If we get a false negative, things are complicated. If we are after
+ * perf_event_context_sched_in() ctx::lock will serialize us, and the
+ * value must be correct. If we're before, it doesn't matter since
+ * perf_event_context_sched_in() will program the counter.
+ *
+ * However, this hinges on the remote context switch having observed
+ * our task->perf_event_ctxp[] store, such that it will in fact take
+ * ctx::lock in perf_event_context_sched_in().
+ *
+ * We do this by task_function_call(), if the IPI fails to hit the task
+ * we know any future context switch of task must see the
+ * perf_event_ctpx[] store.
*/
-again:
+
/*
- * Cannot use task_function_call() because we need to run on the task's
- * CPU regardless of whether its current or not.
+ * This smp_mb() orders the task->perf_event_ctxp[] store with the
+ * task_cpu() load, such that if the IPI then does not find the task
+ * running, a future context switch of that task must observe the
+ * store.
*/
- if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+ smp_mb();
+again:
+ if (!task_function_call(task, __perf_install_in_context, event))
return;
raw_spin_lock_irq(&ctx->lock);
@@ -2371,12 +2393,16 @@ again:
raw_spin_unlock_irq(&ctx->lock);
return;
}
- raw_spin_unlock_irq(&ctx->lock);
/*
- * Since !ctx->is_active doesn't mean anything, we must IPI
- * unconditionally.
+ * If the task is not running, ctx->lock will avoid it becoming so,
+ * thus we can safely install the event.
*/
- goto again;
+ if (task_curr(task)) {
+ raw_spin_unlock_irq(&ctx->lock);
+ goto again;
+ }
+ add_event_to_ctx(event, ctx);
+ raw_spin_unlock_irq(&ctx->lock);
}
/*
@@ -7845,6 +7871,7 @@ static int perf_event_set_bpf_prog(struct perf_event *event, u32 prog_fd)
}
}
event->tp_event->prog = prog;
+ event->tp_event->bpf_prog_owner = event;
return 0;
}
@@ -7859,7 +7886,7 @@ static void perf_event_free_bpf_prog(struct perf_event *event)
return;
prog = event->tp_event->prog;
- if (prog) {
+ if (prog && event->tp_event->bpf_prog_owner == event) {
event->tp_event->prog = NULL;
bpf_prog_put(prog);
}
@@ -9760,28 +9787,27 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_context;
/*
- * Do not allow to attach to a group in a different
- * task or CPU context:
+ * Make sure we're both events for the same CPU;
+ * grouping events for different CPUs is broken; since
+ * you can never concurrently schedule them anyhow.
*/
- if (move_group) {
- /*
- * Make sure we're both on the same task, or both
- * per-cpu events.
- */
- if (group_leader->ctx->task != ctx->task)
- goto err_context;
+ if (group_leader->cpu != event->cpu)
+ goto err_context;
- /*
- * Make sure we're both events for the same CPU;
- * grouping events for different CPUs is broken; since
- * you can never concurrently schedule them anyhow.
- */
- if (group_leader->cpu != event->cpu)
- goto err_context;
- } else {
- if (group_leader->ctx != ctx)
- goto err_context;
- }
+ /*
+ * Make sure we're both on the same task, or both
+ * per-CPU events.
+ */
+ if (group_leader->ctx->task != ctx->task)
+ goto err_context;
+
+ /*
+ * Do not allow to attach to a group in a different task
+ * or CPU context. If we're moving SW events, we'll fix
+ * this up later, so allow that.
+ */
+ if (!move_group && group_leader->ctx != ctx)
+ goto err_context;
/*
* Only a group leader can be exclusive or pinned
@@ -10333,6 +10359,17 @@ void perf_event_free_task(struct task_struct *task)
continue;
mutex_lock(&ctx->mutex);
+ raw_spin_lock_irq(&ctx->lock);
+ /*
+ * Destroy the task <-> ctx relation and mark the context dead.
+ *
+ * This is important because even though the task hasn't been
+ * exposed yet the context has been (through child_list).
+ */
+ RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], NULL);
+ WRITE_ONCE(ctx->task, TASK_TOMBSTONE);
+ put_task_struct(task); /* cannot be last */
+ raw_spin_unlock_irq(&ctx->lock);
again:
list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
group_entry)
@@ -10586,7 +10623,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
if (ret)
- break;
+ goto out_unlock;
}
/*
@@ -10602,7 +10639,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
ret = inherit_task_group(event, parent, parent_ctx,
child, ctxn, &inherited_all);
if (ret)
- break;
+ goto out_unlock;
}
raw_spin_lock_irqsave(&parent_ctx->lock, flags);
@@ -10630,6 +10667,7 @@ static int perf_event_init_context(struct task_struct *child, int ctxn)
}
raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
+out_unlock:
mutex_unlock(&parent_ctx->mutex);
perf_unpin_context(parent_ctx);
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index f9ec9add2164..a1de021dccba 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -1254,8 +1254,6 @@ void uprobe_end_dup_mmap(void)
void uprobe_dup_mmap(struct mm_struct *oldmm, struct mm_struct *newmm)
{
- newmm->uprobes_state.xol_area = NULL;
-
if (test_bit(MMF_HAS_UPROBES, &oldmm->flags)) {
set_bit(MMF_HAS_UPROBES, &newmm->flags);
/* unconditionally, dup_mmap() skips VM_DONTCOPY vmas */
diff --git a/kernel/extable.c b/kernel/extable.c
index e820ccee9846..4f06fc34313f 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -66,7 +66,7 @@ static inline int init_kernel_text(unsigned long addr)
return 0;
}
-int core_kernel_text(unsigned long addr)
+int notrace core_kernel_text(unsigned long addr)
{
if (addr >= (unsigned long)_stext &&
addr < (unsigned long)_etext)
diff --git a/kernel/fork.c b/kernel/fork.c
index ba8a01564985..9321b1ad3335 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -521,7 +521,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
set_task_stack_end_magic(tsk);
#ifdef CONFIG_CC_STACKPROTECTOR
- tsk->stack_canary = get_random_int();
+ tsk->stack_canary = get_random_long();
#endif
/*
@@ -745,6 +745,13 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
#endif
}
+static void mm_init_uprobes_state(struct mm_struct *mm)
+{
+#ifdef CONFIG_UPROBES
+ mm->uprobes_state.xol_area = NULL;
+#endif
+}
+
static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
struct user_namespace *user_ns)
{
@@ -766,11 +773,13 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
mm_init_cpumask(mm);
mm_init_aio(mm);
mm_init_owner(mm, p);
+ RCU_INIT_POINTER(mm->exe_file, NULL);
mmu_notifier_mm_init(mm);
clear_tlb_flush_pending(mm);
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
mm->pmd_huge_pte = NULL;
#endif
+ mm_init_uprobes_state(mm);
if (current->mm) {
mm->flags = current->mm->flags & MMF_INIT_MASK;
@@ -1773,11 +1782,13 @@ static __latent_entropy struct task_struct *copy_process(
*/
recalc_sigpending();
if (signal_pending(current)) {
- spin_unlock(&current->sighand->siglock);
- write_unlock_irq(&tasklist_lock);
retval = -ERESTARTNOINTR;
goto bad_fork_cancel_cgroup;
}
+ if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
+ retval = -ENOMEM;
+ goto bad_fork_cancel_cgroup;
+ }
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -1828,6 +1839,8 @@ static __latent_entropy struct task_struct *copy_process(
return p;
bad_fork_cancel_cgroup:
+ spin_unlock(&current->sighand->siglock);
+ write_unlock_irq(&tasklist_lock);
cgroup_cancel_fork(p);
bad_fork_free_pid:
threadgroup_change_end(current);
diff --git a/kernel/futex.c b/kernel/futex.c
index 2c4be467fecd..88bad86180ac 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -668,13 +668,14 @@ again:
* this reference was taken by ihold under the page lock
* pinning the inode in place so i_lock was unnecessary. The
* only way for this check to fail is if the inode was
- * truncated in parallel so warn for now if this happens.
+ * truncated in parallel which is almost certainly an
+ * application bug. In such a case, just retry.
*
* We are not calling into get_futex_key_refs() in file-backed
* cases, therefore a successful atomic_inc return below will
* guarantee that get_futex_key() will still imply smp_mb(); (B).
*/
- if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) {
+ if (!atomic_inc_not_zero(&inode->i_count)) {
rcu_read_unlock();
put_page(page);
@@ -2813,7 +2814,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
{
struct hrtimer_sleeper timeout, *to = NULL;
struct rt_mutex_waiter rt_waiter;
- struct rt_mutex *pi_mutex = NULL;
struct futex_hash_bucket *hb;
union futex_key key2 = FUTEX_KEY_INIT;
struct futex_q q = futex_q_init;
@@ -2897,6 +2897,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
+ rt_mutex_unlock(&q.pi_state->pi_mutex);
/*
* Drop the reference to the pi state which
* the requeue_pi() code acquired for us.
@@ -2905,6 +2907,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
spin_unlock(q.lock_ptr);
}
} else {
+ struct rt_mutex *pi_mutex;
+
/*
* We have been woken up by futex_unlock_pi(), a timeout, or a
* signal. futex_unlock_pi() will not destroy the lock_ptr nor
@@ -2928,18 +2932,19 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (res)
ret = (res < 0) ? res : 0;
+ /*
+ * If fixup_pi_state_owner() faulted and was unable to handle
+ * the fault, unlock the rt_mutex and return the fault to
+ * userspace.
+ */
+ if (ret && rt_mutex_owner(pi_mutex) == current)
+ rt_mutex_unlock(pi_mutex);
+
/* Unqueue and drop the lock. */
unqueue_me_pi(&q);
}
- /*
- * If fixup_pi_state_owner() faulted and was unable to handle the
- * fault, unlock the rt_mutex and return the fault to userspace.
- */
- if (ret == -EFAULT) {
- if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
- rt_mutex_unlock(pi_mutex);
- } else if (ret == -EINTR) {
+ if (ret == -EINTR) {
/*
* We've already been requeued, but cannot restart by calling
* futex_lock_pi() directly. We could restart this syscall, but
@@ -3323,4 +3328,4 @@ static int __init futex_init(void)
return 0;
}
-__initcall(futex_init);
+core_initcall(futex_init);
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
index 2f9df37940a0..c51a49c9be70 100644
--- a/kernel/gcov/base.c
+++ b/kernel/gcov/base.c
@@ -98,6 +98,12 @@ void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters)
}
EXPORT_SYMBOL(__gcov_merge_icall_topn);
+void __gcov_exit(void)
+{
+ /* Unused. */
+}
+EXPORT_SYMBOL(__gcov_exit);
+
/**
* gcov_enable_events - enable event reporting through gcov_event()
*
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
index 6a5c239c7669..46a18e72bce6 100644
--- a/kernel/gcov/gcc_4_7.c
+++ b/kernel/gcov/gcc_4_7.c
@@ -18,7 +18,9 @@
#include <linux/vmalloc.h>
#include "gcov.h"
-#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
+#if (__GNUC__ >= 7)
+#define GCOV_COUNTERS 9
+#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1)
#define GCOV_COUNTERS 10
#elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9
#define GCOV_COUNTERS 9
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index be3c34e4f2ac..f30110e1b8c9 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -877,8 +877,8 @@ irq_set_chained_handler_and_data(unsigned int irq, irq_flow_handler_t handle,
if (!desc)
return;
- __irq_do_set_handler(desc, handle, 1, NULL);
desc->irq_common_data.handler_data = data;
+ __irq_do_set_handler(desc, handle, 1, NULL);
irq_put_desc_busunlock(desc, flags);
}
@@ -895,13 +895,15 @@ EXPORT_SYMBOL_GPL(irq_set_chip_and_handler_name);
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
{
- unsigned long flags;
+ unsigned long flags, trigger, tmp;
struct irq_desc *desc = irq_get_desc_lock(irq, &flags, 0);
if (!desc)
return;
irq_settings_clr_and_set(desc, clr, set);
+ trigger = irqd_get_trigger_type(&desc->irq_data);
+
irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
if (irq_settings_has_no_balance_set(desc))
@@ -913,7 +915,11 @@ void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
if (irq_settings_is_level(desc))
irqd_set(&desc->irq_data, IRQD_LEVEL);
- irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
+ tmp = irq_settings_get_trigger_mask(desc);
+ if (tmp != IRQ_TYPE_NONE)
+ trigger = tmp;
+
+ irqd_set(&desc->irq_data, trigger);
irq_put_desc_unlock(desc, flags);
}
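
[Editor's note] The irq_modify_status() hunk above keeps the already-established trigger type when the caller's 'set' argument carries no IRQ_TYPE_* bits; only an explicit trigger in the new settings overrides it. A hedged driver-side sketch (hypothetical function and IRQ number, not from this patch):

#include <linux/irq.h>

/* 'virq' is assumed to be a valid, already mapped Linux IRQ number. */
static void example_mark_noprobe(unsigned int virq)
{
	/* Trigger recorded earlier, e.g. from a firmware interrupt specifier. */
	irqd_set_trigger_type(irq_get_irq_data(virq), IRQ_TYPE_LEVEL_LOW);

	/* No IRQ_TYPE_* bits in 'set', so with this change the level-low
	 * trigger above is preserved rather than falling back to
	 * IRQ_TYPE_NONE.
	 */
	irq_modify_status(virq, IRQ_NOAUTOEN, IRQ_NOREQUEST | IRQ_NOPROBE);
}
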
diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c
index 1a9abc1c8ea0..259a22aa9934 100644
--- a/kernel/irq/ipi.c
+++ b/kernel/irq/ipi.c
@@ -165,7 +165,7 @@ irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu)
struct irq_data *data = irq_get_irq_data(irq);
struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL;
- if (!data || !ipimask || cpu > nr_cpu_ids)
+ if (!data || !ipimask || cpu >= nr_cpu_ids)
return INVALID_HWIRQ;
if (!cpumask_test_cpu(cpu, ipimask))
@@ -195,7 +195,7 @@ static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data,
if (!chip->ipi_send_single && !chip->ipi_send_mask)
return -EINVAL;
- if (cpu > nr_cpu_ids)
+ if (cpu >= nr_cpu_ids)
return -EINVAL;
if (dest) {
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 00bb0aeea1d0..77977f55dff7 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -405,10 +405,8 @@ static void free_desc(unsigned int irq)
* The sysfs entry must be serialized against a concurrent
* irq_sysfs_init() as well.
*/
- mutex_lock(&sparse_irq_lock);
kobject_del(&desc->kobj);
delete_irq_desc(irq);
- mutex_unlock(&sparse_irq_lock);
/*
* We free the descriptor, masks and stat fields via RCU. That
@@ -446,20 +444,15 @@ static int alloc_descs(unsigned int start, unsigned int cnt, int node,
desc = alloc_desc(start + i, node, flags, mask, owner);
if (!desc)
goto err;
- mutex_lock(&sparse_irq_lock);
irq_insert_desc(start + i, desc);
irq_sysfs_add(start + i, desc);
- mutex_unlock(&sparse_irq_lock);
}
+ bitmap_set(allocated_irqs, start, cnt);
return start;
err:
for (i--; i >= 0; i--)
free_desc(start + i);
-
- mutex_lock(&sparse_irq_lock);
- bitmap_clear(allocated_irqs, start, cnt);
- mutex_unlock(&sparse_irq_lock);
return -ENOMEM;
}
@@ -558,6 +551,7 @@ static inline int alloc_descs(unsigned int start, unsigned int cnt, int node,
desc->owner = owner;
}
+ bitmap_set(allocated_irqs, start, cnt);
return start;
}
@@ -653,10 +647,10 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
if (from >= nr_irqs || (from + cnt) > nr_irqs)
return;
+ mutex_lock(&sparse_irq_lock);
for (i = 0; i < cnt; i++)
free_desc(from + i);
- mutex_lock(&sparse_irq_lock);
bitmap_clear(allocated_irqs, from, cnt);
mutex_unlock(&sparse_irq_lock);
}
@@ -703,19 +697,15 @@ __irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node,
from, cnt, 0);
ret = -EEXIST;
if (irq >=0 && start != irq)
- goto err;
+ goto unlock;
if (start + cnt > nr_irqs) {
ret = irq_expand_nr_irqs(start + cnt);
if (ret)
- goto err;
+ goto unlock;
}
-
- bitmap_set(allocated_irqs, start, cnt);
- mutex_unlock(&sparse_irq_lock);
- return alloc_descs(start, cnt, node, affinity, owner);
-
-err:
+ ret = alloc_descs(start, cnt, node, affinity, owner);
+unlock:
mutex_unlock(&sparse_irq_lock);
return ret;
}
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6b669593e7eb..ea41820ab12e 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1308,8 +1308,10 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
ret = __irq_set_trigger(desc,
new->flags & IRQF_TRIGGER_MASK);
- if (ret)
+ if (ret) {
+ irq_release_resources(desc);
goto out_mask;
+ }
}
desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d63095472ea9..a1a07cf1101f 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -563,7 +563,7 @@ static void kprobe_optimizer(struct work_struct *work)
}
/* Wait for completing optimization and unoptimization */
-static void wait_for_kprobe_optimizer(void)
+void wait_for_kprobe_optimizer(void)
{
mutex_lock(&kprobe_mutex);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index be2cc1f9dd57..c2c911a106cf 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -18,6 +18,7 @@
#include <linux/freezer.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
+#include <linux/cgroup.h>
#include <trace/events/sched.h>
static DEFINE_SPINLOCK(kthread_create_lock);
@@ -205,6 +206,7 @@ static int kthread(void *_create)
ret = -EINTR;
if (!test_bit(KTHREAD_SHOULD_STOP, &self.flags)) {
+ cgroup_kthread_ready();
__kthread_parkme(&self);
ret = threadfn(data);
}
@@ -530,6 +532,7 @@ int kthreadd(void *unused)
set_mems_allowed(node_states[N_MEMORY]);
current->flags |= PF_NOFREEZE;
+ cgroup_init_kthreadd();
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 4d7ffc0a0d00..6599c7f3071d 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -3260,10 +3260,17 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
if (depth) {
hlock = curr->held_locks + depth - 1;
if (hlock->class_idx == class_idx && nest_lock) {
- if (hlock->references)
+ if (hlock->references) {
+ /*
+ * Check: unsigned int references:12, overflow.
+ */
+ if (DEBUG_LOCKS_WARN_ON(hlock->references == (1 << 12)-1))
+ return 0;
+
hlock->references++;
- else
+ } else {
hlock->references = 2;
+ }
return 1;
}
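The new lockdep check guards the 12-bit 'references' bitfield against wrapping before it is incremented. A stand-alone illustration of the same guard on a hypothetical 12-bit counter:

#include <stdio.h>

struct held_lock_like {
	unsigned int references : 12;	/* 12-bit field: maximum value 4095 */
};

/* Returns 0 and refuses the increment if the bitfield would wrap to 0. */
static int get_reference(struct held_lock_like *h)
{
	if (h->references == (1 << 12) - 1) {
		fprintf(stderr, "reference count would overflow\n");
		return 0;
	}
	/* 0 means "held once without reference counting": jump straight to 2. */
	h->references = h->references ? h->references + 1 : 2;
	return 1;
}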
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index f8c5af52a131..d3de04b12f8c 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -780,6 +780,10 @@ static void lock_torture_cleanup(void)
else
lock_torture_print_module_parms(cxt.cur_ops,
"End of test: SUCCESS");
+
+ kfree(cxt.lwsa);
+ kfree(cxt.lrsa);
+
end:
torture_cleanup_end();
}
@@ -924,6 +928,8 @@ static int __init lock_torture_init(void)
GFP_KERNEL);
if (reader_tasks == NULL) {
VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory");
+ kfree(writer_tasks);
+ writer_tasks = NULL;
firsterr = -ENOMEM;
goto unwind;
}
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
index 1591f6b3539f..a608f7a8fbd1 100644
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -216,10 +216,8 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)
*/
if (sem->count == 0)
break;
- if (signal_pending_state(state, current)) {
- ret = -EINTR;
- goto out;
- }
+ if (signal_pending_state(state, current))
+ goto out_nolock;
set_task_state(tsk, state);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
schedule();
@@ -227,12 +225,19 @@ int __sched __down_write_common(struct rw_semaphore *sem, int state)
}
/* got the lock */
sem->count = -1;
-out:
list_del(&waiter.list);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
return ret;
+
+out_nolock:
+ list_del(&waiter.list);
+ if (!list_empty(&sem->wait_list) && sem->count >= 0)
+ __rwsem_do_wake(sem, 0);
+ raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ return -EINTR;
}
void __sched __down_write(struct rw_semaphore *sem)
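The rewritten error path removes the interrupted waiter and wakes the next one before returning -EINTR, instead of leaving it queued. A rough pthread sketch of that shape; the names and the 'interrupted' flag are illustrative, not the kernel API:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

struct wsem {
	pthread_mutex_t lock;
	pthread_cond_t  cond;
	int count;		/* 0 == free, -1 == held for write */
};

/*
 * Returns 0 on success, -EINTR if *interrupted was set while waiting.
 * Whoever sets *interrupted must broadcast wsem->cond so waiters re-check.
 */
int down_write_intr(struct wsem *sem, volatile bool *interrupted)
{
	pthread_mutex_lock(&sem->lock);
	while (sem->count != 0) {
		if (*interrupted) {
			/* Bail out, but give another waiter a chance to grab it. */
			pthread_cond_signal(&sem->cond);
			pthread_mutex_unlock(&sem->lock);
			return -EINTR;
		}
		pthread_cond_wait(&sem->cond, &sem->lock);
	}
	sem->count = -1;	/* got the lock for write */
	pthread_mutex_unlock(&sem->lock);
	return 0;
}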
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
index 0374a596cffa..9aa0fccd5d43 100644
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -103,38 +103,14 @@ static inline void debug_spin_unlock(raw_spinlock_t *lock)
lock->owner_cpu = -1;
}
-static void __spin_lock_debug(raw_spinlock_t *lock)
-{
- u64 i;
- u64 loops = loops_per_jiffy * HZ;
-
- for (i = 0; i < loops; i++) {
- if (arch_spin_trylock(&lock->raw_lock))
- return;
- __delay(1);
- }
- /* lockup suspected: */
- spin_dump(lock, "lockup suspected");
-#ifdef CONFIG_SMP
- trigger_all_cpu_backtrace();
-#endif
-
- /*
- * The trylock above was causing a livelock. Give the lower level arch
- * specific lock code a chance to acquire the lock. We have already
- * printed a warning/backtrace at this point. The non-debug arch
- * specific code might actually succeed in acquiring the lock. If it is
- * not successful, the end-result is the same - there is no forward
- * progress.
- */
- arch_spin_lock(&lock->raw_lock);
-}
-
+/*
+ * We are now relying on the NMI watchdog to detect lockups instead of doing
+ * the detection here with an unfair lock, which can cause problems of its own.
+ */
void do_raw_spin_lock(raw_spinlock_t *lock)
{
debug_spin_lock_before(lock);
- if (unlikely(!arch_spin_trylock(&lock->raw_lock)))
- __spin_lock_debug(lock);
+ arch_spin_lock(&lock->raw_lock);
debug_spin_lock_after(lock);
}
@@ -172,32 +148,6 @@ static void rwlock_bug(rwlock_t *lock, const char *msg)
#define RWLOCK_BUG_ON(cond, lock, msg) if (unlikely(cond)) rwlock_bug(lock, msg)
-#if 0 /* __write_lock_debug() can lock up - maybe this can too? */
-static void __read_lock_debug(rwlock_t *lock)
-{
- u64 i;
- u64 loops = loops_per_jiffy * HZ;
- int print_once = 1;
-
- for (;;) {
- for (i = 0; i < loops; i++) {
- if (arch_read_trylock(&lock->raw_lock))
- return;
- __delay(1);
- }
- /* lockup suspected: */
- if (print_once) {
- print_once = 0;
- printk(KERN_EMERG "BUG: read-lock lockup on CPU#%d, "
- "%s/%d, %p\n",
- raw_smp_processor_id(), current->comm,
- current->pid, lock);
- dump_stack();
- }
- }
-}
-#endif
-
void do_raw_read_lock(rwlock_t *lock)
{
RWLOCK_BUG_ON(lock->magic != RWLOCK_MAGIC, lock, "bad magic");
@@ -247,32 +197,6 @@ static inline void debug_write_unlock(rwlock_t *lock)
lock->owner_cpu = -1;
}
-#if 0 /* This can cause lockups */
-static void __write_lock_debug(rwlock_t *lock)
-{
- u64 i;
- u64 loops = loops_per_jiffy * HZ;
- int print_once = 1;
-
- for (;;) {
- for (i = 0; i < loops; i++) {
- if (arch_write_trylock(&lock->raw_lock))
- return;
- __delay(1);
- }
- /* lockup suspected: */
- if (print_once) {
- print_once = 0;
- printk(KERN_EMERG "BUG: write-lock lockup on CPU#%d, "
- "%s/%d, %p\n",
- raw_smp_processor_id(), current->comm,
- current->pid, lock);
- dump_stack();
- }
- }
-}
-#endif
-
void do_raw_write_lock(rwlock_t *lock)
{
debug_write_lock_before(lock);
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
index 536c727a56e9..9f9284f37f8d 100644
--- a/kernel/membarrier.c
+++ b/kernel/membarrier.c
@@ -16,6 +16,7 @@
#include <linux/syscalls.h>
#include <linux/membarrier.h>
+#include <linux/tick.h>
/*
* Bitmask made from a "or" of all commands within enum membarrier_cmd,
@@ -51,6 +52,9 @@
*/
SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
{
+ /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
+ if (tick_nohz_full_enabled())
+ return -ENOSYS;
if (unlikely(flags))
return -EINVAL;
switch (cmd) {
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 9ecedc28b928..06123234f118 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -246,9 +246,13 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
/* pages are dead and unused, undo the arch mapping */
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(resource_size(res), SECTION_SIZE);
+
+ lock_device_hotplug();
mem_hotplug_begin();
arch_remove_memory(align_start, align_size);
mem_hotplug_done();
+ unlock_device_hotplug();
+
untrack_pfn(NULL, PHYS_PFN(align_start), align_size);
pgmap_radix_release(res);
dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
@@ -360,9 +364,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
if (error)
goto err_pfn_remap;
+ lock_device_hotplug();
mem_hotplug_begin();
error = arch_add_memory(nid, align_start, align_size, true);
mem_hotplug_done();
+ unlock_device_hotplug();
if (error)
goto err_add_memory;
diff --git a/kernel/padata.c b/kernel/padata.c
index 7848f0566403..e4a8f8d9b31a 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -190,19 +190,20 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
reorder = &next_queue->reorder;
+ spin_lock(&reorder->lock);
if (!list_empty(&reorder->list)) {
padata = list_entry(reorder->list.next,
struct padata_priv, list);
- spin_lock(&reorder->lock);
list_del_init(&padata->list);
atomic_dec(&pd->reorder_objects);
- spin_unlock(&reorder->lock);
pd->processed++;
+ spin_unlock(&reorder->lock);
goto out;
}
+ spin_unlock(&reorder->lock);
if (__this_cpu_read(pd->pqueue->cpu_index) == next_queue->cpu_index) {
padata = ERR_PTR(-ENODATA);
@@ -357,7 +358,7 @@ static int padata_setup_cpumasks(struct parallel_data *pd,
cpumask_and(pd->cpumask.pcpu, pcpumask, cpu_online_mask);
if (!alloc_cpumask_var(&pd->cpumask.cbcpu, GFP_KERNEL)) {
- free_cpumask_var(pd->cpumask.cbcpu);
+ free_cpumask_var(pd->cpumask.pcpu);
return -ENOMEM;
}
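The padata change widens reorder->lock so the emptiness test and the list_del_init() happen under one lock acquisition; checking first and locking later left a window for another CPU to dequeue the same entry. The same check-then-act pattern in a minimal form, with a pthread mutex standing in for the spinlock:

#include <pthread.h>
#include <stddef.h>

struct node { struct node *next; };

struct queue {
	pthread_mutex_t lock;
	struct node *head;
};

/*
 * Pop the head entry, or return NULL. The emptiness check and the unlink are
 * done under the same lock acquisition, so no other thread can steal the
 * entry between the check and the removal.
 */
struct node *pop(struct queue *q)
{
	pthread_mutex_lock(&q->lock);
	struct node *n = q->head;
	if (n)
		q->head = n->next;
	pthread_mutex_unlock(&q->lock);
	return n;
}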
diff --git a/kernel/panic.c b/kernel/panic.c
index e6480e20379e..dbec387099b1 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -249,7 +249,7 @@ void panic(const char *fmt, ...)
* Delay timeout seconds before rebooting the machine.
* We can't use the "normal" timers since we just panicked.
*/
- pr_emerg("Rebooting in %d seconds..", panic_timeout);
+ pr_emerg("Rebooting in %d seconds..\n", panic_timeout);
for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
touch_nmi_watchdog();
diff --git a/kernel/pid.c b/kernel/pid.c
index f66162f2359b..693a64385d59 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -526,8 +526,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
if (!ns)
ns = task_active_pid_ns(current);
if (likely(pid_alive(task))) {
- if (type != PIDTYPE_PID)
+ if (type != PIDTYPE_PID) {
+ if (type == __PIDTYPE_TGID)
+ type = PIDTYPE_PID;
task = task->group_leader;
+ }
nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
}
rcu_read_unlock();
@@ -536,12 +539,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
}
EXPORT_SYMBOL(__task_pid_nr_ns);
-pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
- return pid_nr_ns(task_tgid(tsk), ns);
-}
-EXPORT_SYMBOL(task_tgid_nr_ns);
-
struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
{
return ns_of_pid(task_pid(tsk));
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index eef2ce968636..3976dd57db78 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -274,7 +274,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
* if reparented.
*/
for (;;) {
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(TASK_INTERRUPTIBLE);
if (pid_ns->nr_hashed == init_pids)
break;
schedule();
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 2fba066e125f..8ea24ded1dab 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -18,8 +18,9 @@
#include <linux/workqueue.h>
#include <linux/kmod.h>
#include <trace/events/power.h>
+#include <linux/cpuset.h>
-/*
+/*
* Timeout for stopping processes
*/
unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC;
@@ -200,6 +201,8 @@ void thaw_processes(void)
__usermodehelper_set_disable_depth(UMH_FREEZING);
thaw_workqueues();
+ cpuset_wait_for_hotplug();
+
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
/* No other threads should have PF_SUSPEND_TASK set */
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index f7a55e9ff2f7..9c5b231684d0 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -1564,7 +1564,7 @@ static void call_console_drivers(int level,
{
struct console *con;
- trace_console(text, len);
+ trace_console_rcuidle(text, len);
if (!console_drivers)
return;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 49ba7c1ade9d..f39a7be98fc1 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -57,19 +57,25 @@ int ptrace_access_vm(struct task_struct *tsk, unsigned long addr,
}
+void __ptrace_link(struct task_struct *child, struct task_struct *new_parent,
+ const struct cred *ptracer_cred)
+{
+ BUG_ON(!list_empty(&child->ptrace_entry));
+ list_add(&child->ptrace_entry, &new_parent->ptraced);
+ child->parent = new_parent;
+ child->ptracer_cred = get_cred(ptracer_cred);
+}
+
/*
* ptrace a task: make the debugger its new parent and
* move it to the ptrace list.
*
* Must be called with the tasklist lock write-held.
*/
-void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
+static void ptrace_link(struct task_struct *child, struct task_struct *new_parent)
{
- BUG_ON(!list_empty(&child->ptrace_entry));
- list_add(&child->ptrace_entry, &new_parent->ptraced);
- child->parent = new_parent;
rcu_read_lock();
- child->ptracer_cred = get_cred(__task_cred(new_parent));
+ __ptrace_link(child, new_parent, __task_cred(new_parent));
rcu_read_unlock();
}
@@ -181,11 +187,17 @@ static void ptrace_unfreeze_traced(struct task_struct *task)
WARN_ON(!task->ptrace || task->parent != current);
+ /*
+ * PTRACE_LISTEN can allow ptrace_trap_notify to wake us up remotely.
+ * Recheck state under the lock to close this race.
+ */
spin_lock_irq(&task->sighand->siglock);
- if (__fatal_signal_pending(task))
- wake_up_state(task, __TASK_TRACED);
- else
- task->state = TASK_TRACED;
+ if (task->state == __TASK_TRACED) {
+ if (__fatal_signal_pending(task))
+ wake_up_state(task, __TASK_TRACED);
+ else
+ task->state = TASK_TRACED;
+ }
spin_unlock_irq(&task->sighand->siglock);
}
@@ -377,7 +389,7 @@ static int ptrace_attach(struct task_struct *task, long request,
flags |= PT_SEIZED;
task->ptrace = flags;
- __ptrace_link(task, current);
+ ptrace_link(task, current);
/* SEIZE doesn't trap tracee on attach */
if (!seize)
@@ -450,7 +462,7 @@ static int ptrace_traceme(void)
*/
if (!ret && !(current->real_parent->flags & PF_EXITING)) {
current->ptrace = PT_PTRACED;
- __ptrace_link(current, current->real_parent);
+ ptrace_link(current, current->real_parent);
}
}
write_unlock_irq(&tasklist_lock);
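ptrace_unfreeze_traced() now re-tests task->state under siglock because a PTRACE_LISTEN wakeup can race with the caller's earlier check. A generic sketch of the "test again once the lock is held" pattern, with hypothetical state values:

#include <pthread.h>

#define FROZEN  1
#define RUNNING 2

struct tracee {
	pthread_mutex_t lock;
	int state;
};

/*
 * Only unfreeze if the task is still frozen once we hold the lock; a
 * concurrent wakeup may already have moved it to RUNNING, and blindly
 * rewriting the state would clobber that transition.
 */
void unfreeze(struct tracee *t)
{
	pthread_mutex_lock(&t->lock);
	if (t->state == FROZEN)
		t->state = RUNNING;
	pthread_mutex_unlock(&t->lock);
}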
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 10f62c6f48e7..d1a02877a42c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -792,8 +792,13 @@ void rcu_irq_exit(void)
long long oldval;
struct rcu_dynticks *rdtp;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* Page faults can happen in NMI handlers, so check... */
+ if (READ_ONCE(rdtp->dynticks_nmi_nesting))
+ return;
+
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!");
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting--;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
@@ -930,8 +935,13 @@ void rcu_irq_enter(void)
struct rcu_dynticks *rdtp;
long long oldval;
- RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!");
rdtp = this_cpu_ptr(&rcu_dynticks);
+
+ /* Page faults can happen in NMI handlers, so check... */
+ if (READ_ONCE(rdtp->dynticks_nmi_nesting))
+ return;
+
+ RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!");
oldval = rdtp->dynticks_nesting;
rdtp->dynticks_nesting++;
WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 56583e764ebf..e3944c4b072d 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1767,6 +1767,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force)
if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) {
/* Prior smp_mb__after_atomic() orders against prior enqueue. */
WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
+ smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
swake_up(&rdp_leader->nocb_wq);
}
}
@@ -2021,6 +2022,7 @@ wait_again:
* nocb_gp_head, where they await a grace period.
*/
gotcbs = false;
+ smp_mb(); /* wakeup before ->nocb_head reads. */
for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
if (!rdp->nocb_gp_head)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 154fd689fe02..e5066955cc3a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -507,8 +507,7 @@ void resched_cpu(int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- if (!raw_spin_trylock_irqsave(&rq->lock, flags))
- return;
+ raw_spin_lock_irqsave(&rq->lock, flags);
resched_curr(rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -1141,6 +1140,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
int ret = 0;
rq = task_rq_lock(p, &rf);
+ update_rq_clock(rq);
if (p->flags & PF_KTHREAD) {
/*
@@ -5469,7 +5469,7 @@ void idle_task_exit(void)
BUG_ON(cpu_online(smp_processor_id()));
if (mm != &init_mm) {
- switch_mm_irqs_off(mm, &init_mm, current);
+ switch_mm(mm, &init_mm, current);
finish_arch_post_lock_switch();
}
mmdrop(mm);
@@ -5877,6 +5877,12 @@ static int init_rootdomain(struct root_domain *rd)
if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
goto free_dlo_mask;
+#ifdef HAVE_RT_PUSH_IPI
+ rd->rto_cpu = -1;
+ raw_spin_lock_init(&rd->rto_lock);
+ init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
+#endif
+
init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0)
goto free_dlo_mask;
@@ -6102,6 +6108,9 @@ enum s_alloc {
* Build an iteration mask that can exclude certain CPUs from the upwards
* domain traversal.
*
+ * Only CPUs that can arrive at this group should be considered to continue
+ * balancing.
+ *
* Asymmetric node setups can result in situations where the domain tree is of
* unequal depth, make sure to skip domains that already cover the entire
* range.
@@ -6113,18 +6122,31 @@ enum s_alloc {
*/
static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
{
- const struct cpumask *span = sched_domain_span(sd);
+ const struct cpumask *sg_span = sched_group_cpus(sg);
struct sd_data *sdd = sd->private;
struct sched_domain *sibling;
int i;
- for_each_cpu(i, span) {
+ for_each_cpu(i, sg_span) {
sibling = *per_cpu_ptr(sdd->sd, i);
- if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
+
+ /*
+ * Can happen in the asymmetric case, where these siblings are
+ * unused. The mask will not be empty because those CPUs that
+ * do have the top domain _should_ span the domain.
+ */
+ if (!sibling->child)
+ continue;
+
+ /* If we would not end up here, we can't continue from here */
+ if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
continue;
cpumask_set_cpu(i, sched_group_mask(sg));
}
+
+ /* We must not have empty masks here */
+ WARN_ON_ONCE(cpumask_empty(sched_group_mask(sg)));
}
/*
@@ -6148,7 +6170,7 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu)
cpumask_clear(covered);
- for_each_cpu(i, span) {
+ for_each_cpu_wrap(i, span, cpu) {
struct cpumask *sg_span;
if (cpumask_test_cpu(i, covered))
@@ -7276,16 +7298,15 @@ static void cpuset_cpu_active(void)
* operation in the resume sequence, just build a single sched
* domain, ignoring cpusets.
*/
- num_cpus_frozen--;
- if (likely(num_cpus_frozen)) {
- partition_sched_domains(1, NULL, NULL);
+ partition_sched_domains(1, NULL, NULL);
+ if (--num_cpus_frozen)
return;
- }
/*
* This is the last CPU online operation. So fall through and
* restore the original sched domains by considering the
* cpuset configurations.
*/
+ cpuset_force_rebuild();
}
cpuset_update_active_cpus(true);
}
@@ -7422,22 +7443,6 @@ int sched_cpu_dying(unsigned int cpu)
}
#endif
-#ifdef CONFIG_SCHED_SMT
-DEFINE_STATIC_KEY_FALSE(sched_smt_present);
-
-static void sched_init_smt(void)
-{
- /*
- * We've enumerated all CPUs and will assume that if any CPU
- * has SMT siblings, CPU0 will too.
- */
- if (cpumask_weight(cpu_smt_mask(0)) > 1)
- static_branch_enable(&sched_smt_present);
-}
-#else
-static inline void sched_init_smt(void) { }
-#endif
-
void __init sched_init_smp(void)
{
cpumask_var_t non_isolated_cpus;
@@ -7467,9 +7472,6 @@ void __init sched_init_smp(void)
init_sched_rt_class();
init_sched_dl_class();
-
- sched_init_smt();
-
sched_smp_initialized = true;
}
@@ -7964,6 +7966,7 @@ void sched_move_task(struct task_struct *tsk)
struct rq *rq;
rq = task_rq_lock(tsk, &rf);
+ update_rq_clock(rq);
running = task_current(rq, tsk);
queued = task_on_rq_queued(tsk);
@@ -8379,11 +8382,20 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
- sched_online_group(tg, parent);
-
return &tg->css;
}
+/* Expose task group only after completing cgroup initialization */
+static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
+{
+ struct task_group *tg = css_tg(css);
+ struct task_group *parent = css_tg(css->parent);
+
+ if (parent)
+ sched_online_group(tg, parent);
+ return 0;
+}
+
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
@@ -8786,6 +8798,7 @@ static struct cftype cpu_files[] = {
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
+ .css_online = cpu_cgroup_css_online,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.fork = cpu_cgroup_fork,
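The cpu cgroup hunks split creation into css_alloc() (allocate, still private) and css_online() (link to the parent once cgroup core has finished setting the css up). A bare-bones construct-then-publish sketch with made-up names:

#include <stdlib.h>

struct group {
	struct group *parent;
	int online;
};

/* Phase 1: allocate and initialize privately; nothing else can see it yet. */
struct group *group_alloc(void)
{
	return calloc(1, sizeof(struct group));
}

/* Phase 2: publish. Only now is the group linked to its parent and usable. */
int group_online(struct group *g, struct group *parent)
{
	g->parent = parent;
	g->online = 1;
	return 0;
}

Publishing only in the second phase means no other path can observe a half-initialized group, which is the point of moving sched_online_group() out of css_alloc.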
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index c67c751944a9..4704f654e7c3 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -32,6 +32,7 @@ struct sugov_policy {
u64 last_freq_update_time;
s64 freq_update_delay_ns;
unsigned int next_freq;
+ unsigned int cached_raw_freq;
/* The next fields are only needed if fast switch cannot be used. */
struct irq_work irq_work;
@@ -46,7 +47,6 @@ struct sugov_cpu {
struct update_util_data update_util;
struct sugov_policy *sg_policy;
- unsigned int cached_raw_freq;
unsigned long iowait_boost;
unsigned long iowait_boost_max;
u64 last_update;
@@ -140,9 +140,9 @@ static unsigned int get_next_freq(struct sugov_cpu *sg_cpu, unsigned long util,
freq = (freq + (freq >> 2)) * util / max;
- if (freq == sg_cpu->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
+ if (freq == sg_policy->cached_raw_freq && sg_policy->next_freq != UINT_MAX)
return sg_policy->next_freq;
- sg_cpu->cached_raw_freq = freq;
+ sg_policy->cached_raw_freq = freq;
return cpufreq_driver_resolve_freq(policy, freq);
}
@@ -502,25 +502,19 @@ static int sugov_start(struct cpufreq_policy *policy)
sg_policy->next_freq = UINT_MAX;
sg_policy->work_in_progress = false;
sg_policy->need_freq_update = false;
+ sg_policy->cached_raw_freq = 0;
for_each_cpu(cpu, policy->cpus) {
struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+ memset(sg_cpu, 0, sizeof(*sg_cpu));
sg_cpu->sg_policy = sg_policy;
- if (policy_is_shared(policy)) {
- sg_cpu->util = 0;
- sg_cpu->max = 0;
- sg_cpu->flags = SCHED_CPUFREQ_RT;
- sg_cpu->last_update = 0;
- sg_cpu->cached_raw_freq = 0;
- sg_cpu->iowait_boost = 0;
- sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
- cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
- sugov_update_shared);
- } else {
- cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
- sugov_update_single);
- }
+ sg_cpu->flags = SCHED_CPUFREQ_RT;
+ sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
+ cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
+ policy_is_shared(policy) ?
+ sugov_update_shared :
+ sugov_update_single);
}
return 0;
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 37e2449186c4..c95c5122b105 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1729,12 +1729,11 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
#ifdef CONFIG_SMP
if (tsk_nr_cpus_allowed(p) > 1 && rq->dl.overloaded)
queue_push_tasks(rq);
-#else
+#endif
if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
-#endif
}
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index c242944f5cbd..7a68c631d5b5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5310,43 +5310,6 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
-/*
- * Implement a for_each_cpu() variant that starts the scan at a given cpu
- * (@start), and wraps around.
- *
- * This is used to scan for idle CPUs; such that not all CPUs looking for an
- * idle CPU find the same CPU. The down-side is that tasks tend to cycle
- * through the LLC domain.
- *
- * Especially tbench is found sensitive to this.
- */
-
-static int cpumask_next_wrap(int n, const struct cpumask *mask, int start, int *wrapped)
-{
- int next;
-
-again:
- next = find_next_bit(cpumask_bits(mask), nr_cpumask_bits, n+1);
-
- if (*wrapped) {
- if (next >= start)
- return nr_cpumask_bits;
- } else {
- if (next >= nr_cpumask_bits) {
- *wrapped = 1;
- n = -1;
- goto again;
- }
- }
-
- return next;
-}
-
-#define for_each_cpu_wrap(cpu, mask, start, wrap) \
- for ((wrap) = 0, (cpu) = (start)-1; \
- (cpu) = cpumask_next_wrap((cpu), (mask), (start), &(wrap)), \
- (cpu) < nr_cpumask_bits; )
-
#ifdef CONFIG_SCHED_SMT
static inline void set_idle_cores(int cpu, int val)
@@ -5376,7 +5339,7 @@ static inline bool test_idle_cores(int cpu, bool def)
* Since SMT siblings share all cache levels, inspecting this limited remote
* state should be fairly cheap.
*/
-void __update_idle_core(struct rq *rq)
+void update_idle_core(struct rq *rq)
{
int core = cpu_of(rq);
int cpu;
@@ -5406,17 +5369,14 @@ unlock:
static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target)
{
struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
- int core, cpu, wrap;
-
- if (!static_branch_likely(&sched_smt_present))
- return -1;
+ int core, cpu;
if (!test_idle_cores(target, false))
return -1;
cpumask_and(cpus, sched_domain_span(sd), tsk_cpus_allowed(p));
- for_each_cpu_wrap(core, cpus, target, wrap) {
+ for_each_cpu_wrap(core, cpus, target) {
bool idle = true;
for_each_cpu(cpu, cpu_smt_mask(core)) {
@@ -5444,9 +5404,6 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t
{
int cpu;
- if (!static_branch_likely(&sched_smt_present))
- return -1;
-
for_each_cpu(cpu, cpu_smt_mask(target)) {
if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
continue;
@@ -5482,7 +5439,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
u64 avg_cost, avg_idle = this_rq()->avg_idle;
u64 time, cost;
s64 delta;
- int cpu, wrap;
+ int cpu;
this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc));
if (!this_sd)
@@ -5499,7 +5456,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
time = local_clock();
- for_each_cpu_wrap(cpu, sched_domain_span(sd), target, wrap) {
+ for_each_cpu_wrap(cpu, sched_domain_span(sd), target) {
if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
continue;
if (idle_cpu(cpu))
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index a2d6eb71f06b..ec91fcc09bfe 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -201,8 +201,9 @@ void calc_load_exit_idle(void)
struct rq *this_rq = this_rq();
/*
- * If we're still before the sample window, we're done.
+ * If we're still before the pending sample window, we're done.
*/
+ this_rq->calc_load_update = calc_load_update;
if (time_before(jiffies, this_rq->calc_load_update))
return;
@@ -211,7 +212,6 @@ void calc_load_exit_idle(void)
* accounted through the nohz accounting, so skip the entire deal and
* sync up for the next window.
*/
- this_rq->calc_load_update = calc_load_update;
if (time_before(jiffies, this_rq->calc_load_update + 10))
this_rq->calc_load_update += LOAD_FREQ;
}
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 2516b8df6dbb..9c131168d933 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -72,10 +72,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
-#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
-static void push_irq_work_func(struct irq_work *work);
-#endif
-
void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
@@ -95,13 +91,6 @@ void init_rt_rq(struct rt_rq *rt_rq)
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
-
-#ifdef HAVE_RT_PUSH_IPI
- rt_rq->push_flags = 0;
- rt_rq->push_cpu = nr_cpu_ids;
- raw_spin_lock_init(&rt_rq->push_lock);
- init_irq_work(&rt_rq->push_work, push_irq_work_func);
-#endif
#endif /* CONFIG_SMP */
/* We start in dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
@@ -1864,160 +1853,166 @@ static void push_rt_tasks(struct rq *rq)
}
#ifdef HAVE_RT_PUSH_IPI
+
/*
- * The search for the next cpu always starts at rq->cpu and ends
- * when we reach rq->cpu again. It will never return rq->cpu.
- * This returns the next cpu to check, or nr_cpu_ids if the loop
- * is complete.
+ * When a high priority task schedules out from a CPU and a lower priority
+ * task is scheduled in, a check is made to see if there's any RT tasks
+ * on other CPUs that are waiting to run because a higher priority RT task
+ * is currently running on its CPU. In this case, the CPU with multiple RT
+ * tasks queued on it (overloaded) needs to be notified that a CPU has opened
+ * up that may be able to run one of its non-running queued RT tasks.
+ *
+ * All CPUs with overloaded RT tasks need to be notified as there is currently
+ * no way to know which of these CPUs have the highest priority task waiting
+ * to run. Instead of trying to take a spinlock on each of these CPUs,
+ * which has been shown to cause large latency when done on machines with many
+ * CPUs, an IPI is sent to the CPUs to have them push off the overloaded
+ * RT tasks waiting to run.
+ *
+ * Just sending an IPI to each of the CPUs is also an issue, as on large
+ * count CPU machines, this can cause an IPI storm on a CPU, especially
+ * if it's the only CPU with multiple RT tasks queued, and a large number
+ * of CPUs scheduling a lower priority task at the same time.
+ *
+ * Each root domain has its own irq work function that can iterate over
+ * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
+ * tasks must be checked if there's one or many CPUs that are lowering
+ * their priority, there's a single irq work iterator that will try to
+ * push off RT tasks that are waiting to run.
+ *
+ * When a CPU schedules a lower priority task, it will kick off the
+ * irq work iterator that will jump to each CPU with overloaded RT tasks.
+ * As it only takes the first CPU that schedules a lower priority task
+ * to start the process, the rto_start variable is incremented and if
+ * the atomic result is one, then that CPU will try to take the rto_lock.
+ * This prevents high contention on the lock as the process handles all
+ * CPUs scheduling lower priority tasks.
+ *
+ * All CPUs that are scheduling a lower priority task will increment the
+ * rt_loop_next variable. This will make sure that the irq work iterator
+ * checks all RT overloaded CPUs whenever a CPU schedules a new lower
+ * priority task, even if the iterator is in the middle of a scan. Incrementing
+ * the rt_loop_next will cause the iterator to perform another scan.
*
- * rq->rt.push_cpu holds the last cpu returned by this function,
- * or if this is the first instance, it must hold rq->cpu.
*/
static int rto_next_cpu(struct rq *rq)
{
- int prev_cpu = rq->rt.push_cpu;
+ struct root_domain *rd = rq->rd;
+ int next;
int cpu;
- cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
-
/*
- * If the previous cpu is less than the rq's CPU, then it already
- * passed the end of the mask, and has started from the beginning.
- * We end if the next CPU is greater or equal to rq's CPU.
+ * When starting the IPI RT pushing, the rto_cpu is set to -1,
+ * rto_next_cpu() will simply return the first CPU found in
+ * the rto_mask.
+ *
+ * If rto_next_cpu() is called with rto_cpu set to a valid cpu, it
+ * will return the next CPU found in the rto_mask.
+ *
+ * If there are no more CPUs left in the rto_mask, then a check is made
+ * against rto_loop and rto_loop_next. rto_loop is only updated with
+ * the rto_lock held, but any CPU may increment the rto_loop_next
+ * without any locking.
*/
- if (prev_cpu < rq->cpu) {
- if (cpu >= rq->cpu)
- return nr_cpu_ids;
+ for (;;) {
- } else if (cpu >= nr_cpu_ids) {
- /*
- * We passed the end of the mask, start at the beginning.
- * If the result is greater or equal to the rq's CPU, then
- * the loop is finished.
- */
- cpu = cpumask_first(rq->rd->rto_mask);
- if (cpu >= rq->cpu)
- return nr_cpu_ids;
- }
- rq->rt.push_cpu = cpu;
+ /* When rto_cpu is -1 this acts like cpumask_first() */
+ cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
- /* Return cpu to let the caller know if the loop is finished or not */
- return cpu;
-}
+ rd->rto_cpu = cpu;
-static int find_next_push_cpu(struct rq *rq)
-{
- struct rq *next_rq;
- int cpu;
+ if (cpu < nr_cpu_ids)
+ return cpu;
- while (1) {
- cpu = rto_next_cpu(rq);
- if (cpu >= nr_cpu_ids)
- break;
- next_rq = cpu_rq(cpu);
+ rd->rto_cpu = -1;
+
+ /*
+ * ACQUIRE ensures we see the @rto_mask changes
+ * made prior to the @next value observed.
+ *
+ * Matches WMB in rt_set_overload().
+ */
+ next = atomic_read_acquire(&rd->rto_loop_next);
- /* Make sure the next rq can push to this rq */
- if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+ if (rd->rto_loop == next)
break;
+
+ rd->rto_loop = next;
}
- return cpu;
+ return -1;
}
-#define RT_PUSH_IPI_EXECUTING 1
-#define RT_PUSH_IPI_RESTART 2
+static inline bool rto_start_trylock(atomic_t *v)
+{
+ return !atomic_cmpxchg_acquire(v, 0, 1);
+}
-static void tell_cpu_to_push(struct rq *rq)
+static inline void rto_start_unlock(atomic_t *v)
{
- int cpu;
+ atomic_set_release(v, 0);
+}
- if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- raw_spin_lock(&rq->rt.push_lock);
- /* Make sure it's still executing */
- if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
- /*
- * Tell the IPI to restart the loop as things have
- * changed since it started.
- */
- rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
- raw_spin_unlock(&rq->rt.push_lock);
- return;
- }
- raw_spin_unlock(&rq->rt.push_lock);
- }
+static void tell_cpu_to_push(struct rq *rq)
+{
+ int cpu = -1;
- /* When here, there's no IPI going around */
+ /* Keep the loop going if the IPI is currently active */
+ atomic_inc(&rq->rd->rto_loop_next);
- rq->rt.push_cpu = rq->cpu;
- cpu = find_next_push_cpu(rq);
- if (cpu >= nr_cpu_ids)
+ /* Only one CPU can initiate a loop at a time */
+ if (!rto_start_trylock(&rq->rd->rto_loop_start))
return;
- rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
+ raw_spin_lock(&rq->rd->rto_lock);
+
+ /*
+ * The rto_cpu is updated under the lock, if it has a valid cpu
+ * then the IPI is still running and will continue due to the
+ * update to loop_next, and nothing needs to be done here.
+ * Otherwise it is finishing up and an ipi needs to be sent.
+ */
+ if (rq->rd->rto_cpu < 0)
+ cpu = rto_next_cpu(rq);
+
+ raw_spin_unlock(&rq->rd->rto_lock);
+
+ rto_start_unlock(&rq->rd->rto_loop_start);
- irq_work_queue_on(&rq->rt.push_work, cpu);
+ if (cpu >= 0)
+ irq_work_queue_on(&rq->rd->rto_push_work, cpu);
}
/* Called from hardirq context */
-static void try_to_push_tasks(void *arg)
+void rto_push_irq_work_func(struct irq_work *work)
{
- struct rt_rq *rt_rq = arg;
- struct rq *rq, *src_rq;
- int this_cpu;
+ struct rq *rq;
int cpu;
- this_cpu = rt_rq->push_cpu;
-
- /* Paranoid check */
- BUG_ON(this_cpu != smp_processor_id());
+ rq = this_rq();
- rq = cpu_rq(this_cpu);
- src_rq = rq_of_rt_rq(rt_rq);
-
-again:
+ /*
+ * We do not need to grab the lock to check for has_pushable_tasks.
+ * When it gets updated, a check is made if a push is possible.
+ */
if (has_pushable_tasks(rq)) {
raw_spin_lock(&rq->lock);
- push_rt_task(rq);
+ push_rt_tasks(rq);
raw_spin_unlock(&rq->lock);
}
- /* Pass the IPI to the next rt overloaded queue */
- raw_spin_lock(&rt_rq->push_lock);
- /*
- * If the source queue changed since the IPI went out,
- * we need to restart the search from that CPU again.
- */
- if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
- rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
- rt_rq->push_cpu = src_rq->cpu;
- }
+ raw_spin_lock(&rq->rd->rto_lock);
- cpu = find_next_push_cpu(src_rq);
+ /* Pass the IPI to the next rt overloaded queue */
+ cpu = rto_next_cpu(rq);
- if (cpu >= nr_cpu_ids)
- rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
- raw_spin_unlock(&rt_rq->push_lock);
+ raw_spin_unlock(&rq->rd->rto_lock);
- if (cpu >= nr_cpu_ids)
+ if (cpu < 0)
return;
- /*
- * It is possible that a restart caused this CPU to be
- * chosen again. Don't bother with an IPI, just see if we
- * have more to push.
- */
- if (unlikely(cpu == rq->cpu))
- goto again;
-
/* Try the next RT overloaded CPU */
- irq_work_queue_on(&rt_rq->push_work, cpu);
-}
-
-static void push_irq_work_func(struct irq_work *work)
-{
- struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
-
- try_to_push_tasks(rt_rq);
+ irq_work_queue_on(&rq->rd->rto_push_work, cpu);
}
#endif /* HAVE_RT_PUSH_IPI */
@@ -2198,10 +2193,9 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p)
#ifdef CONFIG_SMP
if (tsk_nr_cpus_allowed(p) > 1 && rq->rt.overloaded)
queue_push_tasks(rq);
-#else
+#endif /* CONFIG_SMP */
if (p->prio < rq->curr->prio)
resched_curr(rq);
-#endif /* CONFIG_SMP */
}
}
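The comment block above describes the new loop: the first CPU to lower its priority wins rto_loop_start and becomes the iterator, and everyone else just bumps rto_loop_next so the running iterator rescans. The diff builds the start/stop pair from atomic_cmpxchg_acquire() and atomic_set_release(); below is a user-space sketch of that protocol with C11 atomics, with illustrative names:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int loop_start = 0;	/* 0 = no iterator running */
static atomic_int loop_next  = 0;	/* bumped by every would-be starter */

/* Only the thread that flips 0 -> 1 becomes the iterator. */
static bool start_trylock(void)
{
	int expected = 0;

	return atomic_compare_exchange_strong_explicit(&loop_start, &expected, 1,
						       memory_order_acquire,
						       memory_order_relaxed);
}

static void start_unlock(void)
{
	atomic_store_explicit(&loop_start, 0, memory_order_release);
}

void kick_iterator(void)
{
	/* Everyone records that another rescan is needed... */
	atomic_fetch_add_explicit(&loop_next, 1, memory_order_relaxed);

	/* ...but only one caller actually starts the iteration. */
	if (!start_trylock())
		return;

	/*
	 * Run the scan here; a real iterator would compare a snapshot of
	 * loop_next against its own loop counter and rescan until they match,
	 * which is what the rto_loop/rto_loop_next check above does.
	 */

	start_unlock();
}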
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 055f935d4421..cff985feb6e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -43,6 +43,12 @@ extern void cpu_load_update_active(struct rq *this_rq);
static inline void cpu_load_update_active(struct rq *this_rq) { }
#endif
+#ifdef CONFIG_SCHED_SMT
+extern void update_idle_core(struct rq *rq);
+#else
+static inline void update_idle_core(struct rq *rq) { }
+#endif
+
/*
* Helpers for converting nanosecond timing to jiffy resolution
*/
@@ -457,7 +463,7 @@ static inline int rt_bandwidth_enabled(void)
}
/* RT IPI pull logic requires IRQ_WORK */
-#ifdef CONFIG_IRQ_WORK
+#if defined(CONFIG_IRQ_WORK) && defined(CONFIG_SMP)
# define HAVE_RT_PUSH_IPI
#endif
@@ -479,12 +485,6 @@ struct rt_rq {
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
-#ifdef HAVE_RT_PUSH_IPI
- int push_flags;
- int push_cpu;
- struct irq_work push_work;
- raw_spinlock_t push_lock;
-#endif
#endif /* CONFIG_SMP */
int rt_queued;
@@ -566,6 +566,19 @@ struct root_domain {
struct dl_bw dl_bw;
struct cpudl cpudl;
+#ifdef HAVE_RT_PUSH_IPI
+ /*
+ * For IPI pull requests, loop across the rto_mask.
+ */
+ struct irq_work rto_push_work;
+ raw_spinlock_t rto_lock;
+ /* These are only updated and read within rto_lock */
+ int rto_loop;
+ int rto_cpu;
+ /* These atomics are updated outside of a lock */
+ atomic_t rto_loop_next;
+ atomic_t rto_loop_start;
+#endif
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
@@ -578,6 +591,9 @@ struct root_domain {
extern struct root_domain def_root_domain;
+#ifdef HAVE_RT_PUSH_IPI
+extern void rto_push_irq_work_func(struct irq_work *work);
+#endif
#endif /* CONFIG_SMP */
/*
@@ -731,23 +747,6 @@ static inline int cpu_of(struct rq *rq)
#endif
}
-
-#ifdef CONFIG_SCHED_SMT
-
-extern struct static_key_false sched_smt_present;
-
-extern void __update_idle_core(struct rq *rq);
-
-static inline void update_idle_core(struct rq *rq)
-{
- if (static_branch_unlikely(&sched_smt_present))
- __update_idle_core(rq);
-}
-
-#else
-static inline void update_idle_core(struct rq *rq) { }
-#endif
-
DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 0db7c8a2afe2..af182a6df25b 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -457,14 +457,19 @@ static long seccomp_attach_filter(unsigned int flags,
return 0;
}
+void __get_seccomp_filter(struct seccomp_filter *filter)
+{
+ /* Reference count is bounded by the number of total processes. */
+ atomic_inc(&filter->usage);
+}
+
/* get_seccomp_filter - increments the reference count of the filter on @tsk */
void get_seccomp_filter(struct task_struct *tsk)
{
struct seccomp_filter *orig = tsk->seccomp.filter;
if (!orig)
return;
- /* Reference count is bounded by the number of total processes. */
- atomic_inc(&orig->usage);
+ __get_seccomp_filter(orig);
}
static inline void seccomp_filter_free(struct seccomp_filter *filter)
@@ -475,10 +480,8 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter)
}
}
-/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
-void put_seccomp_filter(struct task_struct *tsk)
+static void __put_seccomp_filter(struct seccomp_filter *orig)
{
- struct seccomp_filter *orig = tsk->seccomp.filter;
/* Clean up single-reference branches iteratively. */
while (orig && atomic_dec_and_test(&orig->usage)) {
struct seccomp_filter *freeme = orig;
@@ -487,6 +490,12 @@ void put_seccomp_filter(struct task_struct *tsk)
}
}
+/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
+void put_seccomp_filter(struct task_struct *tsk)
+{
+ __put_seccomp_filter(tsk->seccomp.filter);
+}
+
/**
* seccomp_send_sigsys - signals the task to allow in-process syscall emulation
* @syscall: syscall number to send to userland
@@ -892,13 +901,13 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
if (!data)
goto out;
- get_seccomp_filter(task);
+ __get_seccomp_filter(filter);
spin_unlock_irq(&task->sighand->siglock);
if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
ret = -EFAULT;
- put_seccomp_filter(task);
+ __put_seccomp_filter(filter);
return ret;
out:
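The seccomp hunks add __get_seccomp_filter()/__put_seccomp_filter() so a caller that already holds a filter pointer can pin exactly that object, rather than re-deriving it from the task. A bare refcount get/put pair in the same shape, using C11 atomics and a prev-chained list:

#include <stdatomic.h>
#include <stdlib.h>

struct filter {
	atomic_int usage;
	struct filter *prev;
};

static void filter_get(struct filter *f)
{
	atomic_fetch_add(&f->usage, 1);
}

/* Drop one reference; free single-reference ancestors iteratively. */
static void filter_put(struct filter *f)
{
	/* fetch_sub returns the old value, so 1 means the count just hit 0. */
	while (f && atomic_fetch_sub(&f->usage, 1) == 1) {
		struct filter *freeme = f;

		f = f->prev;
		free(freeme);
	}
}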
diff --git a/kernel/signal.c b/kernel/signal.c
index 75761acc77cf..e48668c3c972 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -346,7 +346,7 @@ static bool task_participate_group_stop(struct task_struct *task)
* fresh group stop. Read comment in do_signal_stop() for details.
*/
if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) {
- sig->flags = SIGNAL_STOP_STOPPED;
+ signal_set_stop_flags(sig, SIGNAL_STOP_STOPPED);
return true;
}
return false;
@@ -503,7 +503,8 @@ int unhandled_signal(struct task_struct *tsk, int sig)
return !tsk->ptrace;
}
-static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
+static void collect_signal(int sig, struct sigpending *list, siginfo_t *info,
+ bool *resched_timer)
{
struct sigqueue *q, *first = NULL;
@@ -525,6 +526,12 @@ static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
still_pending:
list_del_init(&first->list);
copy_siginfo(info, &first->info);
+
+ *resched_timer =
+ (first->flags & SIGQUEUE_PREALLOC) &&
+ (info->si_code == SI_TIMER) &&
+ (info->si_sys_private);
+
__sigqueue_free(first);
} else {
/*
@@ -541,12 +548,12 @@ still_pending:
}
static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
- siginfo_t *info)
+ siginfo_t *info, bool *resched_timer)
{
int sig = next_signal(pending, mask);
if (sig)
- collect_signal(sig, pending, info);
+ collect_signal(sig, pending, info, resched_timer);
return sig;
}
@@ -558,15 +565,16 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
*/
int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
{
+ bool resched_timer = false;
int signr;
/* We only dequeue private signals from ourselves, we don't let
* signalfd steal them
*/
- signr = __dequeue_signal(&tsk->pending, mask, info);
+ signr = __dequeue_signal(&tsk->pending, mask, info, &resched_timer);
if (!signr) {
signr = __dequeue_signal(&tsk->signal->shared_pending,
- mask, info);
+ mask, info, &resched_timer);
/*
* itimer signal ?
*
@@ -611,7 +619,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
*/
current->jobctl |= JOBCTL_STOP_DEQUEUED;
}
- if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
+ if (resched_timer) {
/*
* Release the siglock to ensure proper locking order
* of timer locks outside of siglocks. Note, we leave
@@ -837,7 +845,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force)
* will take ->siglock, notice SIGNAL_CLD_MASK, and
* notify its parent. See get_signal_to_deliver().
*/
- signal->flags = why | SIGNAL_STOP_CONTINUED;
+ signal_set_stop_flags(signal, why | SIGNAL_STOP_CONTINUED);
signal->group_stop_count = 0;
signal->group_exit_code = 0;
}
@@ -3226,10 +3234,17 @@ int compat_restore_altstack(const compat_stack_t __user *uss)
int __compat_save_altstack(compat_stack_t __user *uss, unsigned long sp)
{
+ int err;
struct task_struct *t = current;
- return __put_user(ptr_to_compat((void __user *)t->sas_ss_sp), &uss->ss_sp) |
- __put_user(sas_ss_flags(sp), &uss->ss_flags) |
+ err = __put_user(ptr_to_compat((void __user *)t->sas_ss_sp),
+ &uss->ss_sp) |
+ __put_user(t->sas_ss_flags, &uss->ss_flags) |
__put_user(t->sas_ss_size, &uss->ss_size);
+ if (err)
+ return err;
+ if (t->sas_ss_flags & SS_AUTODISARM)
+ sas_ss_reset(t);
+ return 0;
}
#endif
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c1095cdc0fe2..24d603d29512 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1189,6 +1189,8 @@ static struct ctl_table kern_table[] = {
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = timer_migration_handler,
+ .extra1 = &zero,
+ .extra2 = &one,
},
#endif
#ifdef CONFIG_BPF_SYSCALL
@@ -2146,9 +2148,12 @@ static int do_proc_douintvec_conv(bool *negp, unsigned long *lvalp,
if (write) {
if (*negp)
return -EINVAL;
+ if (*lvalp > UINT_MAX)
+ return -EINVAL;
*valp = *lvalp;
} else {
unsigned int val = *valp;
+ *negp = false;
*lvalp = (unsigned long)val;
}
return 0;
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
index 12dd190634ab..d67ef56ca9bc 100644
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -354,7 +354,7 @@ void alarm_start_relative(struct alarm *alarm, ktime_t start)
{
struct alarm_base *base = &alarm_bases[alarm->type];
- start = ktime_add(start, base->gettime());
+ start = ktime_add_safe(start, base->gettime());
alarm_start(alarm, start);
}
EXPORT_SYMBOL_GPL(alarm_start_relative);
@@ -440,7 +440,7 @@ u64 alarm_forward(struct alarm *alarm, ktime_t now, ktime_t interval)
overrun++;
}
- alarm->node.expires = ktime_add(alarm->node.expires, interval);
+ alarm->node.expires = ktime_add_safe(alarm->node.expires, interval);
return overrun;
}
EXPORT_SYMBOL_GPL(alarm_forward);
@@ -624,13 +624,22 @@ static int alarm_timer_set(struct k_itimer *timr, int flags,
/* start the timer */
timr->it.alarm.interval = timespec_to_ktime(new_setting->it_interval);
+
+ /*
+ * Rate limit to the tick as a hot fix to prevent DOS. Will be
+ * mopped up later.
+ */
+ if (timr->it.alarm.interval.tv64 &&
+ ktime_to_ns(timr->it.alarm.interval) < TICK_NSEC)
+ timr->it.alarm.interval = ktime_set(0, TICK_NSEC);
+
exp = timespec_to_ktime(new_setting->it_value);
/* Convert (if necessary) to absolute time */
if (flags != TIMER_ABSTIME) {
ktime_t now;
now = alarm_bases[timr->it.alarm.alarmtimer.type].gettime();
- exp = ktime_add(now, exp);
+ exp = ktime_add_safe(now, exp);
}
alarm_start(&timr->it.alarm.alarmtimer, exp);
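Two ideas in the alarmtimer hunks: overflow-prone ktime additions become saturating (ktime_add_safe()), and a periodic interval below a tick is bumped up to TICK_NSEC so it cannot be used to flood the timer code. Both are shown here on plain 64-bit nanoseconds, with an illustrative tick constant:

#include <stdint.h>

#define NSEC_MAX    INT64_MAX
#define TICK_NSEC_  4000000LL	/* ~4 ms tick, illustrative value only */

/* Saturating add for non-negative nanosecond values: clamp instead of wrapping. */
static int64_t ns_add_safe(int64_t a, int64_t b)
{
	if (a > NSEC_MAX - b)
		return NSEC_MAX;
	return a + b;
}

/* Refuse periodic intervals shorter than a tick; 0 still means "one shot". */
static int64_t clamp_interval(int64_t interval_ns)
{
	if (interval_ns && interval_ns < TICK_NSEC_)
		return TICK_NSEC_;
	return interval_ns;
}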
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index bb5ec425dfe0..eeb7f2f5698d 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -94,17 +94,15 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
};
static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
+ /* Make sure we catch unsupported clockids */
+ [0 ... MAX_CLOCKS - 1] = HRTIMER_MAX_CLOCK_BASES,
+
[CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
[CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
[CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
[CLOCK_TAI] = HRTIMER_BASE_TAI,
};
-static inline int hrtimer_clockid_to_base(clockid_t clock_id)
-{
- return hrtimer_clock_to_base_table[clock_id];
-}
-
/*
* Functions and macros which are different for UP/SMP systems are kept in a
* single place
@@ -1112,6 +1110,18 @@ u64 hrtimer_get_next_event(void)
}
#endif
+static inline int hrtimer_clockid_to_base(clockid_t clock_id)
+{
+ if (likely(clock_id < MAX_CLOCKS)) {
+ int base = hrtimer_clock_to_base_table[clock_id];
+
+ if (likely(base != HRTIMER_MAX_CLOCK_BASES))
+ return base;
+ }
+ WARN(1, "Invalid clockid %d. Using MONOTONIC\n", clock_id);
+ return HRTIMER_BASE_MONOTONIC;
+}
+
static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
{
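The clockid table is now pre-filled with a sentinel via a designated-initializer range, and the relocated hrtimer_clockid_to_base() falls back to MONOTONIC with a warning for anything unmapped. A condensed version of that table-plus-fallback pattern; the identifiers are stand-ins, and the range initializer is a GCC/Clang extension:

#include <stdio.h>

enum { BASE_MONO, BASE_REAL, BASE_BOOT, MAX_BASES };
#define MAX_IDS 16

/* Everything defaults to the sentinel; later entries override it. */
static const int id_to_base[MAX_IDS] = {
	[0 ... MAX_IDS - 1] = MAX_BASES,
	[0] = BASE_REAL,
	[1] = BASE_MONO,
	[7] = BASE_BOOT,
};

static int id_to_base_checked(int id)
{
	if (id >= 0 && id < MAX_IDS && id_to_base[id] != MAX_BASES)
		return id_to_base[id];
	fprintf(stderr, "invalid clock id %d, using MONO\n", id);
	return BASE_MONO;
}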
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 46e312e9be38..d831827d7ab0 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -116,6 +116,26 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta)
tk->offs_boot = ktime_add(tk->offs_boot, delta);
}
+/*
+ * tk_clock_read - atomic clocksource read() helper
+ *
+ * This helper is necessary to use in the read paths because, while the
+ * seqlock ensures we don't return a bad value while structures are updated,
+ * it doesn't protect from potential crashes. There is the possibility that
+ * the tkr's clocksource may change between the read reference, and the
+ * clock reference passed to the read function. This can cause crashes if
+ * the wrong clocksource is passed to the wrong read function.
+ * This isn't necessary to use when holding the timekeeper_lock or doing
+ * a read of the fast-timekeeper tkrs (which is protected by its own locking
+ * and update logic).
+ */
+static inline u64 tk_clock_read(struct tk_read_base *tkr)
+{
+ struct clocksource *clock = READ_ONCE(tkr->clock);
+
+ return clock->read(clock);
+}
+
#ifdef CONFIG_DEBUG_TIMEKEEPING
#define WARNING_FREQ (HZ*300) /* 5 minute rate-limiting */
@@ -173,7 +193,7 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
*/
do {
seq = read_seqcount_begin(&tk_core.seq);
- now = tkr->read(tkr->clock);
+ now = tk_clock_read(tkr);
last = tkr->cycle_last;
mask = tkr->mask;
max = tkr->clock->max_cycles;
@@ -207,7 +227,7 @@ static inline cycle_t timekeeping_get_delta(struct tk_read_base *tkr)
cycle_t cycle_now, delta;
/* read clocksource */
- cycle_now = tkr->read(tkr->clock);
+ cycle_now = tk_clock_read(tkr);
/* calculate the delta since the last update_wall_time */
delta = clocksource_delta(cycle_now, tkr->cycle_last, tkr->mask);
@@ -236,12 +256,10 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
++tk->cs_was_changed_seq;
old_clock = tk->tkr_mono.clock;
tk->tkr_mono.clock = clock;
- tk->tkr_mono.read = clock->read;
tk->tkr_mono.mask = clock->mask;
- tk->tkr_mono.cycle_last = tk->tkr_mono.read(clock);
+ tk->tkr_mono.cycle_last = tk_clock_read(&tk->tkr_mono);
tk->tkr_raw.clock = clock;
- tk->tkr_raw.read = clock->read;
tk->tkr_raw.mask = clock->mask;
tk->tkr_raw.cycle_last = tk->tkr_mono.cycle_last;
@@ -260,8 +278,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock)
/* Go back from cycles -> shifted ns */
tk->xtime_interval = (u64) interval * clock->mult;
tk->xtime_remainder = ntpinterval - tk->xtime_interval;
- tk->raw_interval =
- ((u64) interval * clock->mult) >> clock->shift;
+ tk->raw_interval = interval * clock->mult;
/* if changing clocks, convert xtime_nsec shift units */
if (old_clock) {
@@ -405,7 +422,7 @@ static __always_inline u64 __ktime_get_fast_ns(struct tk_fast *tkf)
now += timekeeping_delta_to_ns(tkr,
clocksource_delta(
- tkr->read(tkr->clock),
+ tk_clock_read(tkr),
tkr->cycle_last,
tkr->mask));
} while (read_seqcount_retry(&tkf->seq, seq));
@@ -433,6 +450,10 @@ static cycle_t dummy_clock_read(struct clocksource *cs)
return cycles_at_suspend;
}
+static struct clocksource dummy_clock = {
+ .read = dummy_clock_read,
+};
+
/**
* halt_fast_timekeeper - Prevent fast timekeeper from accessing clocksource.
* @tk: Timekeeper to snapshot.
@@ -449,13 +470,13 @@ static void halt_fast_timekeeper(struct timekeeper *tk)
struct tk_read_base *tkr = &tk->tkr_mono;
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
- cycles_at_suspend = tkr->read(tkr->clock);
- tkr_dummy.read = dummy_clock_read;
+ cycles_at_suspend = tk_clock_read(tkr);
+ tkr_dummy.clock = &dummy_clock;
update_fast_timekeeper(&tkr_dummy, &tk_fast_mono);
tkr = &tk->tkr_raw;
memcpy(&tkr_dummy, tkr, sizeof(tkr_dummy));
- tkr_dummy.read = dummy_clock_read;
+ tkr_dummy.clock = &dummy_clock;
update_fast_timekeeper(&tkr_dummy, &tk_fast_raw);
}
@@ -621,11 +642,10 @@ static void timekeeping_update(struct timekeeper *tk, unsigned int action)
*/
static void timekeeping_forward_now(struct timekeeper *tk)
{
- struct clocksource *clock = tk->tkr_mono.clock;
cycle_t cycle_now, delta;
s64 nsec;
- cycle_now = tk->tkr_mono.read(clock);
+ cycle_now = tk_clock_read(&tk->tkr_mono);
delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
tk->tkr_mono.cycle_last = cycle_now;
tk->tkr_raw.cycle_last = cycle_now;
@@ -901,8 +921,7 @@ void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot)
do {
seq = read_seqcount_begin(&tk_core.seq);
-
- now = tk->tkr_mono.read(tk->tkr_mono.clock);
+ now = tk_clock_read(&tk->tkr_mono);
systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq;
systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq;
base_real = ktime_add(tk->tkr_mono.base,
@@ -1081,7 +1100,7 @@ int get_device_system_crosststamp(int (*get_time_fn)
* Check whether the system counter value provided by the
* device driver is on the current timekeeping interval.
*/
- now = tk->tkr_mono.read(tk->tkr_mono.clock);
+ now = tk_clock_read(&tk->tkr_mono);
interval_start = tk->tkr_mono.cycle_last;
if (!cycle_between(interval_start, cycles, now)) {
clock_was_set_seq = tk->clock_was_set_seq;
@@ -1639,7 +1658,7 @@ void timekeeping_resume(void)
* The less preferred source will only be tried if there is no better
* usable source. The rtc part is handled separately in rtc core code.
*/
- cycle_now = tk->tkr_mono.read(clock);
+ cycle_now = tk_clock_read(&tk->tkr_mono);
if ((clock->flags & CLOCK_SOURCE_SUSPEND_NONSTOP) &&
cycle_now > tk->tkr_mono.cycle_last) {
u64 num, max = ULLONG_MAX;
@@ -2003,7 +2022,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
unsigned int *clock_set)
{
cycle_t interval = tk->cycle_interval << shift;
- u64 raw_nsecs;
+ u64 snsec_per_sec;
/* If the offset is smaller than a shifted interval, do nothing */
if (offset < interval)
@@ -2018,14 +2037,15 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset,
*clock_set |= accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
- raw_nsecs = (u64)tk->raw_interval << shift;
- raw_nsecs += tk->raw_time.tv_nsec;
- if (raw_nsecs >= NSEC_PER_SEC) {
- u64 raw_secs = raw_nsecs;
- raw_nsecs = do_div(raw_secs, NSEC_PER_SEC);
- tk->raw_time.tv_sec += raw_secs;
+ tk->tkr_raw.xtime_nsec += (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift;
+ tk->tkr_raw.xtime_nsec += tk->raw_interval << shift;
+ snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
+ while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) {
+ tk->tkr_raw.xtime_nsec -= snsec_per_sec;
+ tk->raw_time.tv_sec++;
}
- tk->raw_time.tv_nsec = raw_nsecs;
+ tk->raw_time.tv_nsec = tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift;
+ tk->tkr_raw.xtime_nsec -= (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift;
/* Accumulate error between NTP and clock interval */
tk->ntp_error += tk->ntp_tick << shift;
@@ -2057,7 +2077,7 @@ void update_wall_time(void)
#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET
offset = real_tk->cycle_interval;
#else
- offset = clocksource_delta(tk->tkr_mono.read(tk->tkr_mono.clock),
+ offset = clocksource_delta(tk_clock_read(&tk->tkr_mono),
tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
#endif
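tk_clock_read(), added above, loads the clocksource pointer once and hands that same pointer to its read() method, so a concurrent clocksource switch can no longer pair one clock's read routine with another clock's state. The shape of the fix as a sketch, with a C11 atomic load standing in for READ_ONCE():

#include <stdatomic.h>
#include <stdint.h>

struct clock_like {
	uint64_t (*read)(const struct clock_like *c);
	uint64_t state;
};

/* Updated concurrently when the active clock changes. */
static _Atomic(struct clock_like *) active_clock;

static uint64_t clock_read(void)
{
	/* One load: read() and its argument always come from the same clock. */
	struct clock_like *c = atomic_load_explicit(&active_clock,
						    memory_order_acquire);
	return c->read(c);
}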
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
index ca9fb800336b..38bc4d2208e8 100644
--- a/kernel/time/timekeeping_debug.c
+++ b/kernel/time/timekeeping_debug.c
@@ -75,7 +75,7 @@ void tk_debug_account_sleep_time(struct timespec64 *t)
int bin = min(fls(t->tv_sec), NUM_BINS-1);
sleep_time_bin[bin]++;
- pr_info("Suspended for %lld.%03lu seconds\n", (s64)t->tv_sec,
- t->tv_nsec / NSEC_PER_MSEC);
+ printk_deferred(KERN_INFO "Suspended for %lld.%03lu seconds\n",
+ (s64)t->tv_sec, t->tv_nsec / NSEC_PER_MSEC);
}
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
index c611c47de884..7d670362891a 100644
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -201,6 +201,7 @@ struct timer_base {
bool migration_enabled;
bool nohz_active;
bool is_idle;
+ bool must_forward_clk;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
} ____cacheline_aligned;
@@ -239,7 +240,7 @@ int timer_migration_handler(struct ctl_table *table, int write,
int ret;
mutex_lock(&mutex);
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
if (!ret && write)
timers_update_migration(false);
mutex_unlock(&mutex);
@@ -891,13 +892,19 @@ get_target_base(struct timer_base *base, unsigned tflags)
static inline void forward_timer_base(struct timer_base *base)
{
- unsigned long jnow = READ_ONCE(jiffies);
+ unsigned long jnow;
/*
- * We only forward the base when it's idle and we have a delta between
- * base clock and jiffies.
+ * We only forward the base when we are idle or have just come out of
+ * idle (must_forward_clk logic), and have a delta between base clock
+ * and jiffies. In the common case, run_timers will take care of it.
*/
- if (!base->is_idle || (long) (jnow - base->clk) < 2)
+ if (likely(!base->must_forward_clk))
+ return;
+
+ jnow = READ_ONCE(jiffies);
+ base->must_forward_clk = base->is_idle;
+ if ((long)(jnow - base->clk) < 2)
return;
/*
@@ -973,6 +980,11 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
* same array bucket then just return:
*/
if (timer_pending(timer)) {
+ /*
+ * The downside of this optimization is that it can result in
+ * larger granularity than you would get from adding a new
+ * timer with this expiry.
+ */
if (timer->expires == expires)
return 1;
@@ -983,6 +995,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
* dequeue/enqueue dance.
*/
base = lock_timer_base(timer, &flags);
+ forward_timer_base(base);
clk = base->clk;
idx = calc_wheel_index(expires, clk);
@@ -999,6 +1012,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
}
} else {
base = lock_timer_base(timer, &flags);
+ forward_timer_base(base);
}
timer_stats_timer_set_start_info(timer);
@@ -1028,12 +1042,10 @@ __mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
spin_lock(&base->lock);
WRITE_ONCE(timer->flags,
(timer->flags & ~TIMER_BASEMASK) | base->cpu);
+ forward_timer_base(base);
}
}
- /* Try to forward a stale timer base clock */
- forward_timer_base(base);
-
timer->expires = expires;
/*
* If 'idx' was calculated above and the base time did not advance
@@ -1150,6 +1162,7 @@ void add_timer_on(struct timer_list *timer, int cpu)
WRITE_ONCE(timer->flags,
(timer->flags & ~TIMER_BASEMASK) | cpu);
}
+ forward_timer_base(base);
debug_activate(timer, timer->expires);
internal_add_timer(base, timer);
@@ -1536,12 +1549,18 @@ u64 get_next_timer_interrupt(unsigned long basej, u64 basem)
base->is_idle = false;
} else {
if (!is_max_delta)
- expires = basem + (nextevt - basej) * TICK_NSEC;
+ expires = basem + (u64)(nextevt - basej) * TICK_NSEC;
/*
- * If we expect to sleep more than a tick, mark the base idle:
+ * If we expect to sleep more than a tick, mark the base idle.
+ * Also the tick is stopped so any added timer must forward
+ * the base clk itself to keep granularity small. This idle
+ * logic is only maintained for the BASE_STD base; deferrable
+ * timers may still see large granularity skew (by design).
*/
- if ((expires - basem) > TICK_NSEC)
+ if ((expires - basem) > TICK_NSEC) {
+ base->must_forward_clk = true;
base->is_idle = true;
+ }
}
spin_unlock(&base->lock);
@@ -1651,6 +1670,19 @@ static __latent_entropy void run_timer_softirq(struct softirq_action *h)
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
+ /*
+ * must_forward_clk must be cleared before running timers so that any
+ * timer functions that call mod_timer will not try to forward the
+ * base. Idle tracking / clock forwarding logic is only used with
+ * BASE_STD timers.
+ *
+ * The deferrable base does not do idle tracking at all, so we do
+ * not forward it. This can result in very large variations in
+ * granularity for deferrable timers, but they can be deferred for
+ * long periods due to idle.
+ */
+ base->must_forward_clk = false;
+
__run_timers(base);
if (IS_ENABLED(CONFIG_NO_HZ_COMMON) && base->nohz_active)
__run_timers(this_cpu_ptr(&timer_bases[BASE_DEF]));
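The forwarding decision now hinges on must_forward_clk rather than is_idle alone: the flag is set when the tick is stopped, re-armed only while the base stays idle, and cleared by the softirq before timers run. A rough stand-alone sketch of that decision, with plain fields standing in for struct timer_base (illustrative only, not the kernel code):

#include <stdbool.h>
#include <stdio.h>

struct base_sketch {
	unsigned long clk;		/* last wheel time */
	bool is_idle;
	bool must_forward_clk;
};

/* Mirror of the new check: bail early in the common (busy) case,
 * re-arm the flag only while still idle, and skip tiny deltas. */
static void forward_sketch(struct base_sketch *b, unsigned long jnow)
{
	if (!b->must_forward_clk)
		return;

	b->must_forward_clk = b->is_idle;
	if ((long)(jnow - b->clk) < 2)
		return;

	b->clk = jnow;	/* simplified; the real code may only step up to
			 * the next pending timer */
}

int main(void)
{
	struct base_sketch b = { .clk = 100, .is_idle = true,
				 .must_forward_clk = true };

	forward_sketch(&b, 110);
	printf("clk=%lu\n", b.clk);	/* 110: forwarded across the idle gap */
	return 0;
}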
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 5dcb99281259..41805fb3c661 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -203,10 +203,36 @@ BPF_CALL_5(bpf_trace_printk, char *, fmt, u32, fmt_size, u64, arg1,
fmt_cnt++;
}
- return __trace_printk(1/* fake ip will not be printed */, fmt,
- mod[0] == 2 ? arg1 : mod[0] == 1 ? (long) arg1 : (u32) arg1,
- mod[1] == 2 ? arg2 : mod[1] == 1 ? (long) arg2 : (u32) arg2,
- mod[2] == 2 ? arg3 : mod[2] == 1 ? (long) arg3 : (u32) arg3);
+/* Horrid workaround for getting va_list handling working with different
+ * argument type combinations generically for 32 and 64 bit archs.
+ */
+#define __BPF_TP_EMIT() __BPF_ARG3_TP()
+#define __BPF_TP(...) \
+ __trace_printk(1 /* Fake ip will not be printed. */, \
+ fmt, ##__VA_ARGS__)
+
+#define __BPF_ARG1_TP(...) \
+ ((mod[0] == 2 || (mod[0] == 1 && __BITS_PER_LONG == 64)) \
+ ? __BPF_TP(arg1, ##__VA_ARGS__) \
+ : ((mod[0] == 1 || (mod[0] == 0 && __BITS_PER_LONG == 32)) \
+ ? __BPF_TP((long)arg1, ##__VA_ARGS__) \
+ : __BPF_TP((u32)arg1, ##__VA_ARGS__)))
+
+#define __BPF_ARG2_TP(...) \
+ ((mod[1] == 2 || (mod[1] == 1 && __BITS_PER_LONG == 64)) \
+ ? __BPF_ARG1_TP(arg2, ##__VA_ARGS__) \
+ : ((mod[1] == 1 || (mod[1] == 0 && __BITS_PER_LONG == 32)) \
+ ? __BPF_ARG1_TP((long)arg2, ##__VA_ARGS__) \
+ : __BPF_ARG1_TP((u32)arg2, ##__VA_ARGS__)))
+
+#define __BPF_ARG3_TP(...) \
+ ((mod[2] == 2 || (mod[2] == 1 && __BITS_PER_LONG == 64)) \
+ ? __BPF_ARG2_TP(arg3, ##__VA_ARGS__) \
+ : ((mod[2] == 1 || (mod[2] == 0 && __BITS_PER_LONG == 32)) \
+ ? __BPF_ARG2_TP((long)arg3, ##__VA_ARGS__) \
+ : __BPF_ARG2_TP((u32)arg3, ##__VA_ARGS__)))
+
+ return __BPF_TP_EMIT();
}
static const struct bpf_func_proto bpf_trace_printk_proto = {
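The nested __BPF_ARGn_TP macros pick a per-argument cast (u64, long or u32, depending on the format modifier and __BITS_PER_LONG) and thread the already-cast values through __VA_ARGS__, so a single __trace_printk() call receives all three. A much-reduced userspace illustration of the same trick, with two arguments and made-up names (GNU C, as in the kernel):

#include <stdio.h>

#define EMIT(...)	printf("%ld %ld\n", ##__VA_ARGS__)
#define ARG1(...)	(wide1 ? EMIT((long)a1, ##__VA_ARGS__) : EMIT((long)(int)a1, ##__VA_ARGS__))
#define ARG2(...)	(wide2 ? ARG1((long)a2, ##__VA_ARGS__) : ARG1((long)(int)a2, ##__VA_ARGS__))

int main(void)
{
	long a1 = -1, a2 = 70000;
	int wide1 = 0, wide2 = 1;	/* per-argument "format modifier" */

	/* Expands to one printf with both casts already applied; the
	 * innermost macro supplies the first argument. */
	ARG2();
	return 0;
}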
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index da87b3cba5b3..5b8d7189e147 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -876,6 +876,10 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace)
function_profile_call(trace->func, 0, NULL, NULL);
+ /* If function graph is shutting down, ret_stack can be NULL */
+ if (!current->ret_stack)
+ return 0;
+
if (index >= 0 && index < FTRACE_RETFUNC_DEPTH)
current->ret_stack[index].subtime = 0;
@@ -2743,13 +2747,14 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (!command || !ftrace_enabled) {
/*
- * If these are per_cpu ops, they still need their
- * per_cpu field freed. Since, function tracing is
+ * If these are dynamic or per_cpu ops, they still
+ * need their data freed. Since, function tracing is
* not currently active, we can just free them
* without synchronizing all CPUs.
*/
- if (ops->flags & FTRACE_OPS_FL_PER_CPU)
- per_cpu_ops_free(ops);
+ if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU))
+ goto free_ops;
+
return 0;
}
@@ -2804,6 +2809,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command)
if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) {
schedule_on_each_cpu(ftrace_sync);
+ free_ops:
arch_ftrace_trampoline_free(ops);
if (ops->flags & FTRACE_OPS_FL_PER_CPU)
@@ -3590,7 +3596,7 @@ match_records(struct ftrace_hash *hash, char *func, int len, char *mod)
int exclude_mod = 0;
int found = 0;
int ret;
- int clear_filter;
+ int clear_filter = 0;
if (func) {
func_g.type = filter_parse_regex(func, len, &func_g.search,
@@ -3736,23 +3742,24 @@ static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash)
ftrace_probe_registered = 1;
}
-static void __disable_ftrace_function_probe(void)
+static bool __disable_ftrace_function_probe(void)
{
int i;
if (!ftrace_probe_registered)
- return;
+ return false;
for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
struct hlist_head *hhd = &ftrace_func_hash[i];
if (hhd->first)
- return;
+ return false;
}
/* no more funcs left */
ftrace_shutdown(&trace_probe_ops, 0);
ftrace_probe_registered = 0;
+ return true;
}
@@ -3882,6 +3889,7 @@ static void
__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
void *data, int flags)
{
+ struct ftrace_ops_hash old_hash_ops;
struct ftrace_func_entry *rec_entry;
struct ftrace_func_probe *entry;
struct ftrace_func_probe *p;
@@ -3893,6 +3901,7 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
struct hlist_node *tmp;
char str[KSYM_SYMBOL_LEN];
int i, ret;
+ bool disabled;
if (glob && (strcmp(glob, "*") == 0 || !strlen(glob)))
func_g.search = NULL;
@@ -3911,6 +3920,10 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
mutex_lock(&trace_probe_ops.func_hash->regex_lock);
+ old_hash_ops.filter_hash = old_hash;
+ /* Probes only have filters */
+ old_hash_ops.notrace_hash = NULL;
+
hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
if (!hash)
/* Hmm, should report this somehow */
@@ -3948,12 +3961,17 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
}
}
mutex_lock(&ftrace_lock);
- __disable_ftrace_function_probe();
+ disabled = __disable_ftrace_function_probe();
/*
* Remove after the disable is called. Otherwise, if the last
* probe is removed, a null hash means *all enabled*.
*/
ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+
+ /* still need to update the function call sites */
+ if (ftrace_enabled && !disabled)
+ ftrace_run_modify_code(&trace_probe_ops, FTRACE_UPDATE_CALLS,
+ &old_hash_ops);
synchronize_sched();
if (!ret)
free_ftrace_hash_rcu(old_hash);
@@ -4363,9 +4381,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata;
static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer);
-static unsigned long save_global_trampoline;
-static unsigned long save_global_flags;
-
static int __init set_graph_function(char *str)
{
strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE);
@@ -5389,6 +5404,15 @@ static void clear_ftrace_pids(struct trace_array *tr)
trace_free_pid_list(pid_list);
}
+void ftrace_clear_pids(struct trace_array *tr)
+{
+ mutex_lock(&ftrace_lock);
+
+ clear_ftrace_pids(tr);
+
+ mutex_unlock(&ftrace_lock);
+}
+
static void ftrace_pid_reset(struct trace_array *tr)
{
mutex_lock(&ftrace_lock);
@@ -5954,17 +5978,6 @@ void unregister_ftrace_graph(void)
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
-#ifdef CONFIG_DYNAMIC_FTRACE
- /*
- * Function graph does not allocate the trampoline, but
- * other global_ops do. We need to reset the ALLOC_TRAMP flag
- * if one was used.
- */
- global_ops.trampoline = save_global_trampoline;
- if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP)
- global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP;
-#endif
-
out:
mutex_unlock(&ftrace_lock);
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9c143739b8d7..f5c016e8fc88 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -3435,11 +3435,23 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_reset);
int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
{
struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *reader;
+ struct buffer_page *head_page;
+ struct buffer_page *commit_page;
+ unsigned commit;
cpu_buffer = iter->cpu_buffer;
- return iter->head_page == cpu_buffer->commit_page &&
- iter->head == rb_commit_index(cpu_buffer);
+ /* Remember, trace recording is off when iterator is in use */
+ reader = cpu_buffer->reader_page;
+ head_page = cpu_buffer->head_page;
+ commit_page = cpu_buffer->commit_page;
+ commit = rb_page_commit(commit_page);
+
+ return ((iter->head_page == commit_page && iter->head == commit) ||
+ (iter->head_page == reader && commit_page == head_page &&
+ head_page->read == commit &&
+ iter->head == rb_page_commit(cpu_buffer->reader_page)));
}
EXPORT_SYMBOL_GPL(ring_buffer_iter_empty);
@@ -4870,9 +4882,9 @@ static __init int test_ringbuffer(void)
rb_data[cpu].cnt = cpu;
rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
"rbtester/%d", cpu);
- if (WARN_ON(!rb_threads[cpu])) {
+ if (WARN_ON(IS_ERR(rb_threads[cpu]))) {
pr_cont("FAILED\n");
- ret = -1;
+ ret = PTR_ERR(rb_threads[cpu]);
goto out_free;
}
@@ -4882,9 +4894,9 @@ static __init int test_ringbuffer(void)
/* Now create the rb hammer! */
rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer");
- if (WARN_ON(!rb_hammer)) {
+ if (WARN_ON(IS_ERR(rb_hammer))) {
pr_cont("FAILED\n");
- ret = -1;
+ ret = PTR_ERR(rb_hammer);
goto out_free;
}
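kthread_create() and kthread_run() report failure with an ERR_PTR-encoded errno, never NULL, so the old WARN_ON(!ptr) tests could not trigger and -1 was a made-up error code. A small userspace rendering of the ERR_PTR/IS_ERR convention the fix relies on (macros re-declared locally for the sketch):

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO	4095
#define ERR_PTR(e)	((void *)(long)(e))
#define IS_ERR(p)	((unsigned long)(p) >= (unsigned long)-MAX_ERRNO)
#define PTR_ERR(p)	((long)(p))

int main(void)
{
	void *task = ERR_PTR(-ENOMEM);	/* what a failed create returns */

	if (IS_ERR(task))		/* !task would be false here */
		printf("create failed: %ld\n", PTR_ERR(task));
	return 0;
}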
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8696ce6bf2f6..c1e50cc0d7b0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1906,7 +1906,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
#endif
((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) |
((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
- ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
+ ((pc & SOFTIRQ_OFFSET) ? TRACE_FLAG_SOFTIRQ : 0) |
(tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) |
(test_preempt_need_resched() ? TRACE_FLAG_PREEMPT_RESCHED : 0);
}
@@ -2369,11 +2369,17 @@ static char *get_trace_buf(void)
if (!buffer || buffer->nesting >= 4)
return NULL;
- return &buffer->buffer[buffer->nesting++][0];
+ buffer->nesting++;
+
+ /* Interrupts must see nesting incremented before we use the buffer */
+ barrier();
+ return &buffer->buffer[buffer->nesting][0];
}
static void put_trace_buf(void)
{
+ /* Don't let the decrement of nesting leak before this */
+ barrier();
this_cpu_dec(trace_percpu_buffer->nesting);
}
@@ -3563,11 +3569,17 @@ static int tracing_open(struct inode *inode, struct file *file)
/* If this file was open for write, then erase contents */
if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) {
int cpu = tracing_get_cpu(inode);
+ struct trace_buffer *trace_buf = &tr->trace_buffer;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (tr->current_trace->print_max)
+ trace_buf = &tr->max_buffer;
+#endif
if (cpu == RING_BUFFER_ALL_CPUS)
- tracing_reset_online_cpus(&tr->trace_buffer);
+ tracing_reset_online_cpus(trace_buf);
else
- tracing_reset(&tr->trace_buffer, cpu);
+ tracing_reset(trace_buf, cpu);
}
if (file->f_mode & FMODE_READ) {
@@ -5122,7 +5134,7 @@ static int tracing_wait_pipe(struct file *filp)
*
* iter->pos will be 0 if we haven't read anything.
*/
- if (!tracing_is_on() && iter->pos)
+ if (!tracer_tracing_is_on(iter->tr) && iter->pos)
break;
mutex_unlock(&iter->mutex);
@@ -5658,7 +5670,7 @@ static int tracing_set_clock(struct trace_array *tr, const char *clockstr)
tracing_reset_online_cpus(&tr->trace_buffer);
#ifdef CONFIG_TRACER_MAX_TRACE
- if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
+ if (tr->max_buffer.buffer)
ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
tracing_reset_online_cpus(&tr->max_buffer);
#endif
@@ -6481,11 +6493,13 @@ ftrace_trace_snapshot_callback(struct ftrace_hash *hash,
return ret;
out_reg:
- ret = register_ftrace_function_probe(glob, ops, count);
+ ret = alloc_snapshot(&global_trace);
+ if (ret < 0)
+ goto out;
- if (ret >= 0)
- alloc_snapshot(&global_trace);
+ ret = register_ftrace_function_probe(glob, ops, count);
+ out:
return ret < 0 ? ret : 0;
}
@@ -7150,6 +7164,7 @@ static int instance_rmdir(const char *name)
tracing_set_nop(tr);
event_trace_del_tracer(tr);
+ ftrace_clear_pids(tr);
ftrace_destroy_function_files(tr);
tracefs_remove_recursive(tr->dir);
free_trace_buffers(tr);
@@ -7159,6 +7174,7 @@ static int instance_rmdir(const char *name)
}
kfree(tr->topts);
+ free_cpumask_var(tr->tracing_cpumask);
kfree(tr->name);
kfree(tr);
@@ -7241,7 +7257,7 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
ftrace_init_tracefs(tr, d_tracer);
}
-static struct vfsmount *trace_automount(void *ingore)
+static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
{
struct vfsmount *mnt;
struct file_system_type *type;
@@ -7254,7 +7270,7 @@ static struct vfsmount *trace_automount(void *ingore)
type = get_fs_type("tracefs");
if (!type)
return NULL;
- mnt = vfs_kern_mount(type, 0, "tracefs", NULL);
+ mnt = vfs_submount(mntpt, type, "tracefs", NULL);
put_filesystem(type);
if (IS_ERR(mnt))
return NULL;
@@ -7763,4 +7779,4 @@ __init static int clear_boot_tracer(void)
}
fs_initcall(tracer_init_tracefs);
-late_initcall(clear_boot_tracer);
+late_initcall_sync(clear_boot_tracer);
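Among the trace.c changes, get_trace_buf() now increments the per-CPU nesting level before handing out a slot, with a compiler barrier so an interrupt that nests on the same CPU sees the bump and claims the next buffer rather than scribbling over the current one. A plain-C stand-in for that pattern (no per-CPU machinery; barrier() spelled out as an asm clobber):

#include <stdio.h>

#define barrier()	__asm__ __volatile__("" ::: "memory")
#define TRACE_BUF_SIZE	64

static struct {
	char buffer[4][TRACE_BUF_SIZE];
	int nesting;
} buf;

static char *get_buf(void)
{
	if (buf.nesting >= 4)
		return NULL;
	buf.nesting++;
	barrier();		/* claim the slot before touching it */
	return buf.buffer[buf.nesting - 1];
}

static void put_buf(void)
{
	barrier();		/* finish using the slot before releasing it */
	buf.nesting--;
}

int main(void)
{
	char *p = get_buf();

	if (p) {
		snprintf(p, TRACE_BUF_SIZE, "level %d", buf.nesting);
		puts(p);
		put_buf();
	}
	return 0;
}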
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index fd24b1f9ac43..b0d8576c27ae 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -870,6 +870,7 @@ int using_ftrace_ops_list_func(void);
void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d_tracer);
void ftrace_init_tracefs_toplevel(struct trace_array *tr,
struct dentry *d_tracer);
+void ftrace_clear_pids(struct trace_array *tr);
#else
static inline int ftrace_trace_task(struct trace_array *tr)
{
@@ -888,6 +889,7 @@ ftrace_init_global_array_ops(struct trace_array *tr) { }
static inline void ftrace_reset_array_ops(struct trace_array *tr) { }
static inline void ftrace_init_tracefs(struct trace_array *tr, struct dentry *d) { }
static inline void ftrace_init_tracefs_toplevel(struct trace_array *tr, struct dentry *d) { }
+static inline void ftrace_clear_pids(struct trace_array *tr) { }
/* ftace_func_t type is not defined, use macro instead of static inline */
#define ftrace_init_array_ops(tr, func) do { } while (0)
#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 9daa9b3bc6d9..0193f58c45f0 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1926,6 +1926,10 @@ static int create_filter(struct trace_event_call *call,
if (err && set_str)
append_filter_err(ps, filter);
}
+ if (err && !set_str) {
+ free_event_filter(filter);
+ filter = NULL;
+ }
create_filter_finish(ps);
*filterp = filter;
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index eb6c9f1d3a93..5ff45cae4ac4 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -667,30 +667,25 @@ static int create_trace_kprobe(int argc, char **argv)
pr_info("Probe point is not specified.\n");
return -EINVAL;
}
- if (isdigit(argv[1][0])) {
- if (is_return) {
- pr_info("Return probe point must be a symbol.\n");
- return -EINVAL;
- }
- /* an address specified */
- ret = kstrtoul(&argv[1][0], 0, (unsigned long *)&addr);
- if (ret) {
- pr_info("Failed to parse address.\n");
- return ret;
- }
- } else {
+
+ /* Try to parse an address. If that fails, try to read the
+ * input as a symbol. */
+ if (kstrtoul(argv[1], 0, (unsigned long *)&addr)) {
/* a symbol specified */
symbol = argv[1];
/* TODO: support .init module functions */
ret = traceprobe_split_symbol_offset(symbol, &offset);
if (ret) {
- pr_info("Failed to parse symbol.\n");
+ pr_info("Failed to parse either an address or a symbol.\n");
return ret;
}
if (offset && is_return) {
pr_info("Return probe must be used without offset.\n");
return -EINVAL;
}
+ } else if (is_return) {
+ pr_info("Return probe point must be a symbol.\n");
+ return -EINVAL;
}
argc -= 2; argv += 2;
@@ -1484,6 +1479,11 @@ static __init int kprobe_trace_self_tests_init(void)
end:
release_all_trace_kprobes();
+ /*
+ * Wait for the optimizer work to finish. Otherwise it might fiddle
+ * with probes in already freed __init text.
+ */
+ wait_for_kprobe_optimizer();
if (warn)
pr_cont("NG: Some tests are failed. Please check them.\n");
else
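The kprobe parser now tries the whole token as a numeric address first and only falls back to symbol+offset handling when that fails, instead of keying off a leading digit. Roughly the same ordering in a userspace snippet, with strtoul plus a trailing-garbage check standing in for kstrtoul:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const char *tok = "do_sys_open+0x10";	/* sample probe spec */
	char *end;
	unsigned long addr = strtoul(tok, &end, 0);

	if (end != tok && *end == '\0')
		printf("address: %#lx\n", addr);
	else
		printf("symbol spec: %s\n", tok);	/* parse +offset next */
	return 0;
}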
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index b0f86ea77881..ca70d11b8aa7 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -272,7 +272,7 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt)
goto out_free;
if (cnt > 1) {
if (trace_selftest_test_global_cnt == 0)
- goto out;
+ goto out_free;
}
if (trace_selftest_test_dyn_cnt == 0)
goto out_free;
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 0a689bbb78ef..305039b122fa 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -221,16 +221,19 @@ void tracing_map_array_free(struct tracing_map_array *a)
if (!a)
return;
- if (!a->pages) {
- kfree(a);
- return;
- }
+ if (!a->pages)
+ goto free;
for (i = 0; i < a->n_pages; i++) {
if (!a->pages[i])
break;
free_page((unsigned long)a->pages[i]);
}
+
+ kfree(a->pages);
+
+ free:
+ kfree(a);
}
struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,
diff --git a/kernel/ucount.c b/kernel/ucount.c
index 4bbd38ec3788..c761cdba2a2d 100644
--- a/kernel/ucount.c
+++ b/kernel/ucount.c
@@ -139,7 +139,7 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
new->ns = ns;
new->uid = uid;
- atomic_set(&new->count, 0);
+ new->count = 0;
spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent);
@@ -150,8 +150,10 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
ucounts = new;
}
}
- if (!atomic_add_unless(&ucounts->count, 1, INT_MAX))
+ if (ucounts->count == INT_MAX)
ucounts = NULL;
+ else
+ ucounts->count += 1;
spin_unlock_irq(&ucounts_lock);
return ucounts;
}
@@ -160,13 +162,15 @@ static void put_ucounts(struct ucounts *ucounts)
{
unsigned long flags;
- if (atomic_dec_and_test(&ucounts->count)) {
- spin_lock_irqsave(&ucounts_lock, flags);
+ spin_lock_irqsave(&ucounts_lock, flags);
+ ucounts->count -= 1;
+ if (!ucounts->count)
hlist_del_init(&ucounts->node);
- spin_unlock_irqrestore(&ucounts_lock, flags);
+ else
+ ucounts = NULL;
+ spin_unlock_irqrestore(&ucounts_lock, flags);
- kfree(ucounts);
- }
+ kfree(ucounts);
}
static inline bool atomic_inc_below(atomic_t *v, int u)
@@ -227,11 +231,10 @@ static __init int user_namespace_sysctl_init(void)
* properly.
*/
user_header = register_sysctl("user", empty);
+ kmemleak_ignore(user_header);
BUG_ON(!user_header);
BUG_ON(!setup_userns_sysctls(&init_user_ns));
#endif
return 0;
}
subsys_initcall(user_namespace_sysctl_init);
-
-
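With ucounts->count no longer an atomic_t, every read-modify-write happens under ucounts_lock, so the INT_MAX saturation check cannot race with the increment and the hlist removal pairs naturally with the final decrement. A toy userspace version of the same get/put shape, with a pthread mutex standing in for the irq-safe spinlock:

#include <limits.h>
#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t ucounts_lock = PTHREAD_MUTEX_INITIALIZER;

struct ucounts_sketch {
	int count;
	/* hash linkage elided in this sketch */
};

static struct ucounts_sketch *uc_get(struct ucounts_sketch *uc)
{
	struct ucounts_sketch *ret = uc;

	pthread_mutex_lock(&ucounts_lock);
	if (uc->count == INT_MAX)
		ret = NULL;		/* refuse to overflow */
	else
		uc->count += 1;
	pthread_mutex_unlock(&ucounts_lock);
	return ret;
}

static void uc_put(struct ucounts_sketch *uc)
{
	struct ucounts_sketch *to_free = uc;

	pthread_mutex_lock(&ucounts_lock);
	uc->count -= 1;
	if (uc->count)			/* still referenced: keep it */
		to_free = NULL;
	/* else: this is where it would be unhashed, under the lock */
	pthread_mutex_unlock(&ucounts_lock);
	free(to_free);			/* free(NULL) is a no-op */
}

int main(void)
{
	struct ucounts_sketch *uc = calloc(1, sizeof(*uc));

	if (uc_get(uc))
		uc_put(uc);
	else
		free(uc);
	return 0;
}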
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 6d1020c03d41..63177be0159e 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -24,32 +24,14 @@
#include <asm/irq_regs.h>
#include <linux/kvm_para.h>
-#include <linux/perf_event.h>
#include <linux/kthread.h>
-/*
- * The run state of the lockup detectors is controlled by the content of the
- * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit -
- * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector.
- *
- * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled'
- * are variables that are only used as an 'interface' between the parameters
- * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The
- * 'watchdog_thresh' variable is handled differently because its value is not
- * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh'
- * is equal zero.
- */
-#define NMI_WATCHDOG_ENABLED_BIT 0
-#define SOFT_WATCHDOG_ENABLED_BIT 1
-#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT)
-#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT)
-
static DEFINE_MUTEX(watchdog_proc_mutex);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
+#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR)
+unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED;
#else
-static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
+unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED;
#endif
int __read_mostly nmi_watchdog_enabled;
int __read_mostly soft_watchdog_enabled;
@@ -59,9 +41,6 @@ int __read_mostly watchdog_thresh = 10;
#ifdef CONFIG_SMP
int __read_mostly sysctl_softlockup_all_cpu_backtrace;
int __read_mostly sysctl_hardlockup_all_cpu_backtrace;
-#else
-#define sysctl_softlockup_all_cpu_backtrace 0
-#define sysctl_hardlockup_all_cpu_backtrace 0
#endif
static struct cpumask watchdog_cpumask __read_mostly;
unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
@@ -70,6 +49,8 @@ unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask);
#define for_each_watchdog_cpu(cpu) \
for_each_cpu_and((cpu), cpu_online_mask, &watchdog_cpumask)
+atomic_t watchdog_park_in_progress = ATOMIC_INIT(0);
+
/*
* The 'watchdog_running' variable is set to 1 when the watchdog threads
* are registered/started and is set to 0 when the watchdog threads are
@@ -100,50 +81,9 @@ static DEFINE_PER_CPU(bool, soft_watchdog_warn);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts);
static DEFINE_PER_CPU(unsigned long, soft_lockup_hrtimer_cnt);
static DEFINE_PER_CPU(struct task_struct *, softlockup_task_ptr_saved);
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static DEFINE_PER_CPU(bool, hard_watchdog_warn);
-static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
-static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
-#endif
static unsigned long soft_lockup_nmi_warn;
-/* boot commands */
-/*
- * Should we panic when a soft-lockup or hard-lockup occurs:
- */
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-unsigned int __read_mostly hardlockup_panic =
- CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
-static unsigned long hardlockup_allcpu_dumped;
-/*
- * We may not want to enable hard lockup detection by default in all cases,
- * for example when running the kernel as a guest on a hypervisor. In these
- * cases this function can be called to disable hard lockup detection. This
- * function should only be executed once by the boot processor before the
- * kernel command line parameters are parsed, because otherwise it is not
- * possible to override this in hardlockup_panic_setup().
- */
-void hardlockup_detector_disable(void)
-{
- watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
-}
-
-static int __init hardlockup_panic_setup(char *str)
-{
- if (!strncmp(str, "panic", 5))
- hardlockup_panic = 1;
- else if (!strncmp(str, "nopanic", 7))
- hardlockup_panic = 0;
- else if (!strncmp(str, "0", 1))
- watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
- else if (!strncmp(str, "1", 1))
- watchdog_enabled |= NMI_WATCHDOG_ENABLED;
- return 1;
-}
-__setup("nmi_watchdog=", hardlockup_panic_setup);
-#endif
-
unsigned int __read_mostly softlockup_panic =
CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
@@ -264,32 +204,14 @@ void touch_all_softlockup_watchdogs(void)
wq_watchdog_touch(-1);
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-void touch_nmi_watchdog(void)
-{
- /*
- * Using __raw here because some code paths have
- * preemption enabled. If preemption is enabled
- * then interrupts should be enabled too, in which
- * case we shouldn't have to worry about the watchdog
- * going off.
- */
- raw_cpu_write(watchdog_nmi_touch, true);
- touch_softlockup_watchdog();
-}
-EXPORT_SYMBOL(touch_nmi_watchdog);
-
-#endif
-
void touch_softlockup_watchdog_sync(void)
{
__this_cpu_write(softlockup_touch_sync, true);
__this_cpu_write(watchdog_touch_ts, 0);
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
/* watchdog detector functions */
-static bool is_hardlockup(void)
+bool is_hardlockup(void)
{
unsigned long hrint = __this_cpu_read(hrtimer_interrupts);
@@ -299,7 +221,6 @@ static bool is_hardlockup(void)
__this_cpu_write(hrtimer_interrupts_saved, hrint);
return false;
}
-#endif
static int is_softlockup(unsigned long touch_ts)
{
@@ -313,77 +234,22 @@ static int is_softlockup(unsigned long touch_ts)
return 0;
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-
-static struct perf_event_attr wd_hw_attr = {
- .type = PERF_TYPE_HARDWARE,
- .config = PERF_COUNT_HW_CPU_CYCLES,
- .size = sizeof(struct perf_event_attr),
- .pinned = 1,
- .disabled = 1,
-};
-
-/* Callback function for perf event subsystem */
-static void watchdog_overflow_callback(struct perf_event *event,
- struct perf_sample_data *data,
- struct pt_regs *regs)
-{
- /* Ensure the watchdog never gets throttled */
- event->hw.interrupts = 0;
-
- if (__this_cpu_read(watchdog_nmi_touch) == true) {
- __this_cpu_write(watchdog_nmi_touch, false);
- return;
- }
-
- /* check for a hardlockup
- * This is done by making sure our timer interrupt
- * is incrementing. The timer interrupt should have
- * fired multiple times before we overflow'd. If it hasn't
- * then this is a good indication the cpu is stuck
- */
- if (is_hardlockup()) {
- int this_cpu = smp_processor_id();
-
- /* only print hardlockups once */
- if (__this_cpu_read(hard_watchdog_warn) == true)
- return;
-
- pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
- print_modules();
- print_irqtrace_events(current);
- if (regs)
- show_regs(regs);
- else
- dump_stack();
-
- /*
- * Perform all-CPU dump only once to avoid multiple hardlockups
- * generating interleaving traces
- */
- if (sysctl_hardlockup_all_cpu_backtrace &&
- !test_and_set_bit(0, &hardlockup_allcpu_dumped))
- trigger_allbutself_cpu_backtrace();
-
- if (hardlockup_panic)
- nmi_panic(regs, "Hard LOCKUP");
-
- __this_cpu_write(hard_watchdog_warn, true);
- return;
- }
-
- __this_cpu_write(hard_watchdog_warn, false);
- return;
-}
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
-
static void watchdog_interrupt_count(void)
{
__this_cpu_inc(hrtimer_interrupts);
}
-static int watchdog_nmi_enable(unsigned int cpu);
-static void watchdog_nmi_disable(unsigned int cpu);
+/*
+ * These two functions are mostly architecture specific; define
+ * them as weak here so stronger definitions can override them.
+ */
+int __weak watchdog_nmi_enable(unsigned int cpu)
+{
+ return 0;
+}
+void __weak watchdog_nmi_disable(unsigned int cpu)
+{
+}
static int watchdog_enable_all_cpus(void);
static void watchdog_disable_all_cpus(void);
@@ -396,6 +262,9 @@ static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
+ if (atomic_read(&watchdog_park_in_progress) != 0)
+ return HRTIMER_NORESTART;
+
/* kick the hardlockup detector */
watchdog_interrupt_count();
@@ -576,109 +445,6 @@ static void watchdog(unsigned int cpu)
watchdog_nmi_disable(cpu);
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
-/*
- * People like the simple clean cpu node info on boot.
- * Reduce the watchdog noise by only printing messages
- * that are different from what cpu0 displayed.
- */
-static unsigned long cpu0_err;
-
-static int watchdog_nmi_enable(unsigned int cpu)
-{
- struct perf_event_attr *wd_attr;
- struct perf_event *event = per_cpu(watchdog_ev, cpu);
-
- /* nothing to do if the hard lockup detector is disabled */
- if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
- goto out;
-
- /* is it already setup and enabled? */
- if (event && event->state > PERF_EVENT_STATE_OFF)
- goto out;
-
- /* it is setup but not enabled */
- if (event != NULL)
- goto out_enable;
-
- wd_attr = &wd_hw_attr;
- wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
-
- /* Try to register using hardware perf events */
- event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
-
- /* save cpu0 error for future comparision */
- if (cpu == 0 && IS_ERR(event))
- cpu0_err = PTR_ERR(event);
-
- if (!IS_ERR(event)) {
- /* only print for cpu0 or different than cpu0 */
- if (cpu == 0 || cpu0_err)
- pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
- goto out_save;
- }
-
- /*
- * Disable the hard lockup detector if _any_ CPU fails to set up
- * set up the hardware perf event. The watchdog() function checks
- * the NMI_WATCHDOG_ENABLED bit periodically.
- *
- * The barriers are for syncing up watchdog_enabled across all the
- * cpus, as clear_bit() does not use barriers.
- */
- smp_mb__before_atomic();
- clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
- smp_mb__after_atomic();
-
- /* skip displaying the same error again */
- if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
- return PTR_ERR(event);
-
- /* vary the KERN level based on the returned errno */
- if (PTR_ERR(event) == -EOPNOTSUPP)
- pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
- else if (PTR_ERR(event) == -ENOENT)
- pr_warn("disabled (cpu%i): hardware events not enabled\n",
- cpu);
- else
- pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
- cpu, PTR_ERR(event));
-
- pr_info("Shutting down hard lockup detector on all cpus\n");
-
- return PTR_ERR(event);
-
- /* success path */
-out_save:
- per_cpu(watchdog_ev, cpu) = event;
-out_enable:
- perf_event_enable(per_cpu(watchdog_ev, cpu));
-out:
- return 0;
-}
-
-static void watchdog_nmi_disable(unsigned int cpu)
-{
- struct perf_event *event = per_cpu(watchdog_ev, cpu);
-
- if (event) {
- perf_event_disable(event);
- per_cpu(watchdog_ev, cpu) = NULL;
-
- /* should be in cleanup, but blocks oprofile */
- perf_event_release_kernel(event);
- }
- if (cpu == 0) {
- /* watchdog_nmi_enable() expects this to be zero initially. */
- cpu0_err = 0;
- }
-}
-
-#else
-static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
-static void watchdog_nmi_disable(unsigned int cpu) { return; }
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
-
static struct smp_hotplug_thread watchdog_threads = {
.store = &softlockup_watchdog,
.thread_should_run = watchdog_should_run,
@@ -706,12 +472,16 @@ static int watchdog_park_threads(void)
{
int cpu, ret = 0;
+ atomic_set(&watchdog_park_in_progress, 1);
+
for_each_watchdog_cpu(cpu) {
ret = kthread_park(per_cpu(softlockup_watchdog, cpu));
if (ret)
break;
}
+ atomic_set(&watchdog_park_in_progress, 0);
+
return ret;
}
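Splitting the perf-based detector out leaves watchdog.c with __weak no-op versions of watchdog_nmi_enable()/disable(); watchdog_hld.c (or an architecture) supplies strong definitions that replace them at link time. A one-file illustration of the weak-symbol fallback, using a made-up function name:

#include <stdio.h>

/* Weak default: does nothing and reports success, like the stubs above. */
int __attribute__((weak)) wd_nmi_enable(unsigned int cpu)
{
	printf("weak default: no hardlockup detector on cpu %u\n", cpu);
	return 0;
}

/* A strong wd_nmi_enable() in another object file linked into the same
 * binary would silently take over this call. */
int main(void)
{
	return wd_nmi_enable(0);
}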
diff --git a/kernel/watchdog_hld.c b/kernel/watchdog_hld.c
new file mode 100644
index 000000000000..12b8dd640786
--- /dev/null
+++ b/kernel/watchdog_hld.c
@@ -0,0 +1,230 @@
+/*
+ * Detect hard lockups on a system
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Note: Most of this code is borrowed heavily from the original softlockup
+ * detector, so thanks to Ingo for the initial implementation.
+ * Some chunks also taken from the old x86-specific nmi watchdog code, thanks
+ * to those contributors as well.
+ */
+
+#define pr_fmt(fmt) "NMI watchdog: " fmt
+
+#include <linux/nmi.h>
+#include <linux/module.h>
+#include <asm/irq_regs.h>
+#include <linux/perf_event.h>
+
+static DEFINE_PER_CPU(bool, hard_watchdog_warn);
+static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
+static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
+
+/* boot commands */
+/*
+ * Should we panic when a soft-lockup or hard-lockup occurs:
+ */
+unsigned int __read_mostly hardlockup_panic =
+ CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
+static unsigned long hardlockup_allcpu_dumped;
+/*
+ * We may not want to enable hard lockup detection by default in all cases,
+ * for example when running the kernel as a guest on a hypervisor. In these
+ * cases this function can be called to disable hard lockup detection. This
+ * function should only be executed once by the boot processor before the
+ * kernel command line parameters are parsed, because otherwise it is not
+ * possible to override this in hardlockup_panic_setup().
+ */
+void hardlockup_detector_disable(void)
+{
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+}
+
+static int __init hardlockup_panic_setup(char *str)
+{
+ if (!strncmp(str, "panic", 5))
+ hardlockup_panic = 1;
+ else if (!strncmp(str, "nopanic", 7))
+ hardlockup_panic = 0;
+ else if (!strncmp(str, "0", 1))
+ watchdog_enabled &= ~NMI_WATCHDOG_ENABLED;
+ else if (!strncmp(str, "1", 1))
+ watchdog_enabled |= NMI_WATCHDOG_ENABLED;
+ return 1;
+}
+__setup("nmi_watchdog=", hardlockup_panic_setup);
+
+void touch_nmi_watchdog(void)
+{
+ /*
+ * Using __raw here because some code paths have
+ * preemption enabled. If preemption is enabled
+ * then interrupts should be enabled too, in which
+ * case we shouldn't have to worry about the watchdog
+ * going off.
+ */
+ raw_cpu_write(watchdog_nmi_touch, true);
+ touch_softlockup_watchdog();
+}
+EXPORT_SYMBOL(touch_nmi_watchdog);
+
+static struct perf_event_attr wd_hw_attr = {
+ .type = PERF_TYPE_HARDWARE,
+ .config = PERF_COUNT_HW_CPU_CYCLES,
+ .size = sizeof(struct perf_event_attr),
+ .pinned = 1,
+ .disabled = 1,
+};
+
+/* Callback function for perf event subsystem */
+static void watchdog_overflow_callback(struct perf_event *event,
+ struct perf_sample_data *data,
+ struct pt_regs *regs)
+{
+ /* Ensure the watchdog never gets throttled */
+ event->hw.interrupts = 0;
+
+ if (atomic_read(&watchdog_park_in_progress) != 0)
+ return;
+
+ if (__this_cpu_read(watchdog_nmi_touch) == true) {
+ __this_cpu_write(watchdog_nmi_touch, false);
+ return;
+ }
+
+ /* check for a hardlockup
+ * This is done by making sure our timer interrupt
+ * is incrementing. The timer interrupt should have
+ * fired multiple times before we overflow'd. If it hasn't
+ * then this is a good indication the cpu is stuck
+ */
+ if (is_hardlockup()) {
+ int this_cpu = smp_processor_id();
+
+ /* only print hardlockups once */
+ if (__this_cpu_read(hard_watchdog_warn) == true)
+ return;
+
+ pr_emerg("Watchdog detected hard LOCKUP on cpu %d", this_cpu);
+ print_modules();
+ print_irqtrace_events(current);
+ if (regs)
+ show_regs(regs);
+ else
+ dump_stack();
+
+ /*
+ * Perform all-CPU dump only once to avoid multiple hardlockups
+ * generating interleaving traces
+ */
+ if (sysctl_hardlockup_all_cpu_backtrace &&
+ !test_and_set_bit(0, &hardlockup_allcpu_dumped))
+ trigger_allbutself_cpu_backtrace();
+
+ if (hardlockup_panic)
+ nmi_panic(regs, "Hard LOCKUP");
+
+ __this_cpu_write(hard_watchdog_warn, true);
+ return;
+ }
+
+ __this_cpu_write(hard_watchdog_warn, false);
+ return;
+}
+
+/*
+ * People like the simple clean cpu node info on boot.
+ * Reduce the watchdog noise by only printing messages
+ * that are different from what cpu0 displayed.
+ */
+static unsigned long cpu0_err;
+
+int watchdog_nmi_enable(unsigned int cpu)
+{
+ struct perf_event_attr *wd_attr;
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ /* nothing to do if the hard lockup detector is disabled */
+ if (!(watchdog_enabled & NMI_WATCHDOG_ENABLED))
+ goto out;
+
+ /* is it already setup and enabled? */
+ if (event && event->state > PERF_EVENT_STATE_OFF)
+ goto out;
+
+ /* it is setup but not enabled */
+ if (event != NULL)
+ goto out_enable;
+
+ wd_attr = &wd_hw_attr;
+ wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
+
+ /* Try to register using hardware perf events */
+ event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback, NULL);
+
+ /* save cpu0 error for future comparison */
+ if (cpu == 0 && IS_ERR(event))
+ cpu0_err = PTR_ERR(event);
+
+ if (!IS_ERR(event)) {
+ /* only print for cpu0 or different than cpu0 */
+ if (cpu == 0 || cpu0_err)
+ pr_info("enabled on all CPUs, permanently consumes one hw-PMU counter.\n");
+ goto out_save;
+ }
+
+ /*
+ * Disable the hard lockup detector if _any_ CPU fails to set up
+ * the hardware perf event. The watchdog() function checks
+ * the NMI_WATCHDOG_ENABLED bit periodically.
+ *
+ * The barriers are for syncing up watchdog_enabled across all the
+ * cpus, as clear_bit() does not use barriers.
+ */
+ smp_mb__before_atomic();
+ clear_bit(NMI_WATCHDOG_ENABLED_BIT, &watchdog_enabled);
+ smp_mb__after_atomic();
+
+ /* skip displaying the same error again */
+ if (cpu > 0 && (PTR_ERR(event) == cpu0_err))
+ return PTR_ERR(event);
+
+ /* vary the KERN level based on the returned errno */
+ if (PTR_ERR(event) == -EOPNOTSUPP)
+ pr_info("disabled (cpu%i): not supported (no LAPIC?)\n", cpu);
+ else if (PTR_ERR(event) == -ENOENT)
+ pr_warn("disabled (cpu%i): hardware events not enabled\n",
+ cpu);
+ else
+ pr_err("disabled (cpu%i): unable to create perf event: %ld\n",
+ cpu, PTR_ERR(event));
+
+ pr_info("Shutting down hard lockup detector on all cpus\n");
+
+ return PTR_ERR(event);
+
+ /* success path */
+out_save:
+ per_cpu(watchdog_ev, cpu) = event;
+out_enable:
+ perf_event_enable(per_cpu(watchdog_ev, cpu));
+out:
+ return 0;
+}
+
+void watchdog_nmi_disable(unsigned int cpu)
+{
+ struct perf_event *event = per_cpu(watchdog_ev, cpu);
+
+ if (event) {
+ perf_event_disable(event);
+ per_cpu(watchdog_ev, cpu) = NULL;
+
+ /* should be in cleanup, but blocks oprofile */
+ perf_event_release_kernel(event);
+ }
+ if (cpu == 0) {
+ /* watchdog_nmi_enable() expects this to be zero initially. */
+ cpu0_err = 0;
+ }
+}
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 479d840db286..296dcca77f33 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -68,6 +68,7 @@ enum {
* attach_mutex to avoid changing binding state while
* worker_attach_to_pool() is in progress.
*/
+ POOL_MANAGER_ACTIVE = 1 << 0, /* being managed */
POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
/* worker flags */
@@ -165,7 +166,6 @@ struct worker_pool {
/* L: hash of busy workers */
/* see manage_workers() for details on the two manager mutexes */
- struct mutex manager_arb; /* manager arbitration */
struct worker *manager; /* L: purely informational */
struct mutex attach_mutex; /* attach/detach exclusion */
struct list_head workers; /* A: attached workers */
@@ -297,6 +297,7 @@ static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
+static DECLARE_WAIT_QUEUE_HEAD(wq_manager_wait); /* wait for manager to go away */
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
@@ -799,7 +800,7 @@ static bool need_to_create_worker(struct worker_pool *pool)
/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
- bool managing = mutex_is_locked(&pool->manager_arb);
+ bool managing = pool->flags & POOL_MANAGER_ACTIVE;
int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
int nr_busy = pool->nr_workers - nr_idle;
@@ -1979,24 +1980,17 @@ static bool manage_workers(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- /*
- * Anyone who successfully grabs manager_arb wins the arbitration
- * and becomes the manager. mutex_trylock() on pool->manager_arb
- * failure while holding pool->lock reliably indicates that someone
- * else is managing the pool and the worker which failed trylock
- * can proceed to executing work items. This means that anyone
- * grabbing manager_arb is responsible for actually performing
- * manager duties. If manager_arb is grabbed and released without
- * actual management, the pool may stall indefinitely.
- */
- if (!mutex_trylock(&pool->manager_arb))
+ if (pool->flags & POOL_MANAGER_ACTIVE)
return false;
+
+ pool->flags |= POOL_MANAGER_ACTIVE;
pool->manager = worker;
maybe_create_worker(pool);
pool->manager = NULL;
- mutex_unlock(&pool->manager_arb);
+ pool->flags &= ~POOL_MANAGER_ACTIVE;
+ wake_up(&wq_manager_wait);
return true;
}
@@ -3203,7 +3197,6 @@ static int init_worker_pool(struct worker_pool *pool)
setup_timer(&pool->mayday_timer, pool_mayday_timeout,
(unsigned long)pool);
- mutex_init(&pool->manager_arb);
mutex_init(&pool->attach_mutex);
INIT_LIST_HEAD(&pool->workers);
@@ -3273,13 +3266,15 @@ static void put_unbound_pool(struct worker_pool *pool)
hash_del(&pool->hash_node);
/*
- * Become the manager and destroy all workers. Grabbing
- * manager_arb prevents @pool's workers from blocking on
- * attach_mutex.
+ * Become the manager and destroy all workers. This prevents
+ * @pool's workers from blocking on attach_mutex. We're the last
+ * manager and @pool gets freed with the flag set.
*/
- mutex_lock(&pool->manager_arb);
-
spin_lock_irq(&pool->lock);
+ wait_event_lock_irq(wq_manager_wait,
+ !(pool->flags & POOL_MANAGER_ACTIVE), pool->lock);
+ pool->flags |= POOL_MANAGER_ACTIVE;
+
while ((worker = first_idle_worker(pool)))
destroy_worker(worker);
WARN_ON(pool->nr_workers || pool->nr_idle);
@@ -3293,8 +3288,6 @@ static void put_unbound_pool(struct worker_pool *pool)
if (pool->detach_completion)
wait_for_completion(pool->detach_completion);
- mutex_unlock(&pool->manager_arb);
-
/* shut down the timers */
del_timer_sync(&pool->idle_timer);
del_timer_sync(&pool->mayday_timer);
@@ -3730,8 +3723,12 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq,
return -EINVAL;
/* creating multiple pwqs breaks ordering guarantee */
- if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
- return -EINVAL;
+ if (!list_empty(&wq->pwqs)) {
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
+ return -EINVAL;
+
+ wq->flags &= ~__WQ_ORDERED;
+ }
ctx = apply_wqattrs_prepare(wq, attrs);
if (!ctx)
@@ -3915,6 +3912,16 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
struct workqueue_struct *wq;
struct pool_workqueue *pwq;
+ /*
+ * Unbound && max_active == 1 used to imply ordered, which is no
+ * longer the case on NUMA machines due to per-node pools. While
+ * alloc_ordered_workqueue() is the right way to create an ordered
+ * workqueue, keep the previous behavior to avoid subtle breakages
+ * on NUMA.
+ */
+ if ((flags & WQ_UNBOUND) && max_active == 1)
+ flags |= __WQ_ORDERED;
+
/* see the comment above the definition of WQ_POWER_EFFICIENT */
if ((flags & WQ_POWER_EFFICIENT) && wq_power_efficient)
flags |= WQ_UNBOUND;
@@ -4103,13 +4110,14 @@ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
struct pool_workqueue *pwq;
/* disallow meddling with max_active for ordered workqueues */
- if (WARN_ON(wq->flags & __WQ_ORDERED))
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return;
max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
mutex_lock(&wq->mutex);
+ wq->flags &= ~__WQ_ORDERED;
wq->saved_max_active = max_active;
for_each_pwq(pwq, wq)
@@ -5214,7 +5222,7 @@ int workqueue_sysfs_register(struct workqueue_struct *wq)
* attributes breaks ordering guarantee. Disallow exposing ordered
* workqueues.
*/
- if (WARN_ON(wq->flags & __WQ_ORDERED))
+ if (WARN_ON(wq->flags & __WQ_ORDERED_EXPLICIT))
return -EINVAL;
wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
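The manager_arb mutex is gone: becoming the manager is now a POOL_MANAGER_ACTIVE bit flipped under pool->lock, and put_unbound_pool() sleeps on wq_manager_wait until the bit clears before claiming it for teardown. A userspace analogue of that arbitration, using a mutex plus condition variable in place of the spinlock and wait queue (illustrative names only, not the kernel API):

#include <pthread.h>
#include <stdbool.h>

struct pool_sketch {
	pthread_mutex_t lock;
	pthread_cond_t manager_gone;
	bool manager_active;
};

/* Worker path: become manager only if nobody else already is. */
static bool try_manage(struct pool_sketch *p)
{
	bool got = false;

	pthread_mutex_lock(&p->lock);
	if (!p->manager_active) {
		p->manager_active = true;
		got = true;
	}
	pthread_mutex_unlock(&p->lock);

	if (got) {
		/* ... create workers as needed ... */
		pthread_mutex_lock(&p->lock);
		p->manager_active = false;
		pthread_cond_broadcast(&p->manager_gone);
		pthread_mutex_unlock(&p->lock);
	}
	return got;
}

/* Teardown path: wait until no manager, then hold the bit for good. */
static void claim_for_destroy(struct pool_sketch *p)
{
	pthread_mutex_lock(&p->lock);
	while (p->manager_active)
		pthread_cond_wait(&p->manager_gone, &p->lock);
	p->manager_active = true;
	pthread_mutex_unlock(&p->lock);
}

int main(void)
{
	struct pool_sketch p = {
		.lock = PTHREAD_MUTEX_INITIALIZER,
		.manager_gone = PTHREAD_COND_INITIALIZER,
		.manager_active = false,
	};

	try_manage(&p);		/* becomes and stops being manager */
	claim_for_destroy(&p);	/* teardown claims the bit uncontended */
	return 0;
}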
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 8635417c587b..29fa81f0f51a 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -9,6 +9,7 @@
#include <linux/workqueue.h>
#include <linux/kthread.h>
+#include <linux/preempt.h>
struct worker_pool;
@@ -59,7 +60,7 @@ struct worker {
*/
static inline struct worker *current_wq_worker(void)
{
- if (current->flags & PF_WQ_WORKER)
+ if (in_task() && (current->flags & PF_WQ_WORKER))
return kthread_data(current);
return NULL;
}