Diffstat (limited to 'kernel')
-rw-r--r--  kernel/async.c                       |   3
-rw-r--r--  kernel/audit.c                       | 101
-rw-r--r--  kernel/audit.h                       |   4
-rw-r--r--  kernel/auditsc.c                     |  87
-rw-r--r--  kernel/bpf/devmap.c                  |   4
-rw-r--r--  kernel/bpf/syscall.c                 |   3
-rw-r--r--  kernel/bpf/verifier.c                |   2
-rw-r--r--  kernel/cgroup/cgroup-internal.h      |  19
-rw-r--r--  kernel/cgroup/cgroup-v1.c            |  47
-rw-r--r--  kernel/cgroup/cgroup.c               |  98
-rw-r--r--  kernel/cgroup/cpuset.c               |  15
-rw-r--r--  kernel/dma/debug.c                   |   4
-rw-r--r--  kernel/dma/direct.c                  |   3
-rw-r--r--  kernel/dma/swiotlb.c                 |   3
-rw-r--r--  kernel/events/core.c                 |  30
-rw-r--r--  kernel/fork.c                        |   7
-rw-r--r--  kernel/irq/affinity.c                |   5
-rw-r--r--  kernel/irq/internals.h               |   2
-rw-r--r--  kernel/irq/irqdesc.c                 |   2
-rw-r--r--  kernel/irq/manage.c                  |  39
-rw-r--r--  kernel/kprobes.c                     |   3
-rw-r--r--  kernel/module.c                      |  27
-rw-r--r--  kernel/power/hibernate.c             |   8
-rw-r--r--  kernel/power/main.c                  |   5
-rw-r--r--  kernel/power/process.c               |   2
-rw-r--r--  kernel/power/snapshot.c              |  21
-rw-r--r--  kernel/power/suspend.c               |   2
-rw-r--r--  kernel/power/suspend_test.c          |   8
-rw-r--r--  kernel/power/wakelock.c              |  11
-rw-r--r--  kernel/printk/printk.c               |   6
-rw-r--r--  kernel/ptrace.c                      |  47
-rw-r--r--  kernel/rcu/tree.c                    |  17
-rw-r--r--  kernel/rcu/tree_exp.h                |   1
-rw-r--r--  kernel/rcu/tree_plugin.h             |   9
-rw-r--r--  kernel/sched/core.c                  |   5
-rw-r--r--  kernel/sched/cputime.c               |   4
-rw-r--r--  kernel/sched/debug.c                 |  10
-rw-r--r--  kernel/sched/psi.c                   |  66
-rw-r--r--  kernel/sched/rt.c                    |  23
-rw-r--r--  kernel/sched/topology.c              |  99
-rw-r--r--  kernel/sched/wait.c                  |   7
-rw-r--r--  kernel/seccomp.c                     |  10
-rw-r--r--  kernel/smp.c                         |   2
-rw-r--r--  kernel/sysctl.c                      |  37
-rw-r--r--  kernel/time/tick-sched.c             |   2
-rw-r--r--  kernel/time/timekeeping.c            |   3
-rw-r--r--  kernel/trace/ftrace.c                |   6
-rw-r--r--  kernel/trace/trace.c                 |  27
-rw-r--r--  kernel/trace/trace.h                 |  24
-rw-r--r--  kernel/trace/trace_events.c          |   7
-rw-r--r--  kernel/trace/trace_events_hist.c     |  48
-rw-r--r--  kernel/trace/trace_events_trigger.c  |  61
-rw-r--r--  kernel/trace/trace_kprobe.c          |   7
-rw-r--r--  kernel/trace/trace_uprobe.c          |   1
-rw-r--r--  kernel/trace/tracing_map.c           |   3
-rw-r--r--  kernel/tsacct.c                      |   7
-rw-r--r--  kernel/workqueue.c                   |   9
57 files changed, 744 insertions, 369 deletions
diff --git a/kernel/async.c b/kernel/async.c
index 4f9c1d614016..74660f611b97 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -205,9 +205,6 @@ async_cookie_t async_schedule_node_domain(async_func_t func, void *data,
atomic_inc(&entry_count);
spin_unlock_irqrestore(&async_lock, flags);
- /* mark that this task has queued an async job, used by module init */
- current->flags |= PF_USED_ASYNC;
-
/* schedule for execution */
queue_work_node(node, system_unbound_wq, &entry->work);
diff --git a/kernel/audit.c b/kernel/audit.c
index 05ae208ad442..db8141866cea 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -535,20 +535,22 @@ static void kauditd_printk_skb(struct sk_buff *skb)
/**
* kauditd_rehold_skb - Handle an audit record send failure in the hold queue
* @skb: audit record
+ * @error: error code (unused)
*
* Description:
* This should only be used by the kauditd_thread when it fails to flush the
* hold queue.
*/
-static void kauditd_rehold_skb(struct sk_buff *skb)
+static void kauditd_rehold_skb(struct sk_buff *skb, __always_unused int error)
{
- /* put the record back in the queue at the same place */
- skb_queue_head(&audit_hold_queue, skb);
+ /* put the record back in the queue */
+ skb_queue_tail(&audit_hold_queue, skb);
}
/**
* kauditd_hold_skb - Queue an audit record, waiting for auditd
* @skb: audit record
+ * @error: error code
*
* Description:
* Queue the audit record, waiting for an instance of auditd. When this
@@ -558,19 +560,31 @@ static void kauditd_rehold_skb(struct sk_buff *skb)
* and queue it, if we have room. If we want to hold on to the record, but we
* don't have room, record a record lost message.
*/
-static void kauditd_hold_skb(struct sk_buff *skb)
+static void kauditd_hold_skb(struct sk_buff *skb, int error)
{
/* at this point it is uncertain if we will ever send this to auditd so
* try to send the message via printk before we go any further */
kauditd_printk_skb(skb);
/* can we just silently drop the message? */
- if (!audit_default) {
- kfree_skb(skb);
- return;
+ if (!audit_default)
+ goto drop;
+
+ /* the hold queue is only for when the daemon goes away completely,
+ * not -EAGAIN failures; if we are in a -EAGAIN state requeue the
+ * record on the retry queue unless it's full, in which case drop it
+ */
+ if (error == -EAGAIN) {
+ if (!audit_backlog_limit ||
+ skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
+ skb_queue_tail(&audit_retry_queue, skb);
+ return;
+ }
+ audit_log_lost("kauditd retry queue overflow");
+ goto drop;
}
- /* if we have room, queue the message */
+ /* if we have room in the hold queue, queue the message */
if (!audit_backlog_limit ||
skb_queue_len(&audit_hold_queue) < audit_backlog_limit) {
skb_queue_tail(&audit_hold_queue, skb);
@@ -579,24 +593,32 @@ static void kauditd_hold_skb(struct sk_buff *skb)
/* we have no other options - drop the message */
audit_log_lost("kauditd hold queue overflow");
+drop:
kfree_skb(skb);
}
/**
* kauditd_retry_skb - Queue an audit record, attempt to send again to auditd
* @skb: audit record
+ * @error: error code (unused)
*
* Description:
* Not as serious as kauditd_hold_skb() as we still have a connected auditd,
* but for some reason we are having problems sending it audit records so
* queue the given record and attempt to resend.
*/
-static void kauditd_retry_skb(struct sk_buff *skb)
+static void kauditd_retry_skb(struct sk_buff *skb, __always_unused int error)
{
- /* NOTE: because records should only live in the retry queue for a
- * short period of time, before either being sent or moved to the hold
- * queue, we don't currently enforce a limit on this queue */
- skb_queue_tail(&audit_retry_queue, skb);
+ if (!audit_backlog_limit ||
+ skb_queue_len(&audit_retry_queue) < audit_backlog_limit) {
+ skb_queue_tail(&audit_retry_queue, skb);
+ return;
+ }
+
+ /* we have to drop the record, send it via printk as a last effort */
+ kauditd_printk_skb(skb);
+ audit_log_lost("kauditd retry queue overflow");
+ kfree_skb(skb);
}
/**
@@ -634,7 +656,7 @@ static void auditd_reset(const struct auditd_connection *ac)
/* flush the retry queue to the hold queue, but don't touch the main
* queue since we need to process that normally for multicast */
while ((skb = skb_dequeue(&audit_retry_queue)))
- kauditd_hold_skb(skb);
+ kauditd_hold_skb(skb, -ECONNREFUSED);
}
/**
@@ -708,16 +730,18 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
struct sk_buff_head *queue,
unsigned int retry_limit,
void (*skb_hook)(struct sk_buff *skb),
- void (*err_hook)(struct sk_buff *skb))
+ void (*err_hook)(struct sk_buff *skb, int error))
{
int rc = 0;
- struct sk_buff *skb;
- static unsigned int failed = 0;
+ struct sk_buff *skb = NULL;
+ struct sk_buff *skb_tail;
+ unsigned int failed = 0;
/* NOTE: kauditd_thread takes care of all our locking, we just use
* the netlink info passed to us (e.g. sk and portid) */
- while ((skb = skb_dequeue(queue))) {
+ skb_tail = skb_peek_tail(queue);
+ while ((skb != skb_tail) && (skb = skb_dequeue(queue))) {
/* call the skb_hook for each skb we touch */
if (skb_hook)
(*skb_hook)(skb);
@@ -725,36 +749,34 @@ static int kauditd_send_queue(struct sock *sk, u32 portid,
/* can we send to anyone via unicast? */
if (!sk) {
if (err_hook)
- (*err_hook)(skb);
+ (*err_hook)(skb, -ECONNREFUSED);
continue;
}
+retry:
/* grab an extra skb reference in case of error */
skb_get(skb);
rc = netlink_unicast(sk, skb, portid, 0);
if (rc < 0) {
- /* fatal failure for our queue flush attempt? */
+ /* send failed - try a few times unless fatal error */
if (++failed >= retry_limit ||
rc == -ECONNREFUSED || rc == -EPERM) {
- /* yes - error processing for the queue */
sk = NULL;
if (err_hook)
- (*err_hook)(skb);
- if (!skb_hook)
- goto out;
- /* keep processing with the skb_hook */
+ (*err_hook)(skb, rc);
+ if (rc == -EAGAIN)
+ rc = 0;
+ /* continue to drain the queue */
continue;
} else
- /* no - requeue to preserve ordering */
- skb_queue_head(queue, skb);
+ goto retry;
} else {
- /* it worked - drop the extra reference and continue */
+ /* skb sent - drop the extra reference and continue */
consume_skb(skb);
failed = 0;
}
}
-out:
return (rc >= 0 ? 0 : rc);
}
@@ -1530,6 +1552,20 @@ static void audit_receive(struct sk_buff *skb)
nlh = nlmsg_next(nlh, &len);
}
audit_ctl_unlock();
+
+ /* can't block with the ctrl lock, so penalize the sender now */
+ if (audit_backlog_limit &&
+ (skb_queue_len(&audit_queue) > audit_backlog_limit)) {
+ DECLARE_WAITQUEUE(wait, current);
+
+ /* wake kauditd to try and flush the queue */
+ wake_up_interruptible(&kauditd_wait);
+
+ add_wait_queue_exclusive(&audit_backlog_wait, &wait);
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(audit_backlog_wait_time);
+ remove_wait_queue(&audit_backlog_wait, &wait);
+ }
}
/* Run custom bind function on netlink socket group connect or bind requests. */
@@ -1557,7 +1593,8 @@ static int __net_init audit_net_init(struct net *net)
audit_panic("cannot initialize netlink socket in namespace");
return -ENOMEM;
}
- aunet->sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
+ /* limit the timeout in case auditd is blocked/stopped */
+ aunet->sk->sk_sndtimeo = HZ / 10;
return 0;
}
@@ -1773,7 +1810,9 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
* task_tgid_vnr() since auditd_pid is set in audit_receive_msg()
* using a PID anchored in the caller's namespace
* 2. generator holding the audit_cmd_mutex - we don't want to block
- * while holding the mutex */
+ * while holding the mutex, although we do penalize the sender
+ * later in audit_receive() when it is safe to block
+ */
if (!(auditd_test_task(current) || audit_ctl_owner_current())) {
long stime = audit_backlog_wait_time;
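A note on the queue-draining change above: kauditd_send_queue() now takes a
snapshot of the tail with skb_peek_tail() and stops once it has dequeued that
skb, so records the error hook re-queues at the tail during a pass are not
reprocessed until the next flush. A minimal sketch of the pattern, assuming
the caller serializes access to the queue (as kauditd_thread does) and with a
hypothetical process_one() that consumes the skb on success:

    /* drain only the entries that were queued when the pass started */
    static void drain_once(struct sk_buff_head *q)
    {
            struct sk_buff *skb = NULL;
            struct sk_buff *tail = skb_peek_tail(q);        /* snapshot */

            while (skb != tail && (skb = skb_dequeue(q))) {
                    if (process_one(skb) < 0)       /* hypothetical */
                            skb_queue_tail(q, skb); /* retried next pass */
            }
    }

If the requeued entry is the snapshotted tail, the loop condition ends the
pass immediately, which is what prevents a failing record from being spun on
forever within one flush.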
diff --git a/kernel/audit.h b/kernel/audit.h
index ddc22878433d..fed8e93ce169 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -191,6 +191,10 @@ struct audit_context {
struct {
char *name;
} module;
+ struct {
+ struct audit_ntp_data ntp_data;
+ struct timespec64 tk_injoffset;
+ } time;
};
int fds[2];
struct audit_proctitle proctitle;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index d33c5dccde1c..e8e90c0c4936 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1185,6 +1185,53 @@ static void audit_log_fcaps(struct audit_buffer *ab, struct audit_names *name)
from_kuid(&init_user_ns, name->fcap.rootid));
}
+static void audit_log_time(struct audit_context *context, struct audit_buffer **ab)
+{
+ const struct audit_ntp_data *ntp = &context->time.ntp_data;
+ const struct timespec64 *tk = &context->time.tk_injoffset;
+ static const char * const ntp_name[] = {
+ "offset",
+ "freq",
+ "status",
+ "tai",
+ "tick",
+ "adjust",
+ };
+ int type;
+
+ if (context->type == AUDIT_TIME_ADJNTPVAL) {
+ for (type = 0; type < AUDIT_NTP_NVALS; type++) {
+ if (ntp->vals[type].newval != ntp->vals[type].oldval) {
+ if (!*ab) {
+ *ab = audit_log_start(context,
+ GFP_KERNEL,
+ AUDIT_TIME_ADJNTPVAL);
+ if (!*ab)
+ return;
+ }
+ audit_log_format(*ab, "op=%s old=%lli new=%lli",
+ ntp_name[type],
+ ntp->vals[type].oldval,
+ ntp->vals[type].newval);
+ audit_log_end(*ab);
+ *ab = NULL;
+ }
+ }
+ }
+ if (tk->tv_sec != 0 || tk->tv_nsec != 0) {
+ if (!*ab) {
+ *ab = audit_log_start(context, GFP_KERNEL,
+ AUDIT_TIME_INJOFFSET);
+ if (!*ab)
+ return;
+ }
+ audit_log_format(*ab, "sec=%lli nsec=%li",
+ (long long)tk->tv_sec, tk->tv_nsec);
+ audit_log_end(*ab);
+ *ab = NULL;
+ }
+}
+
static void show_special(struct audit_context *context, int *call_panic)
{
struct audit_buffer *ab;
@@ -1290,6 +1337,11 @@ static void show_special(struct audit_context *context, int *call_panic)
audit_log_format(ab, "(null)");
break;
+ case AUDIT_TIME_ADJNTPVAL:
+ case AUDIT_TIME_INJOFFSET:
+ /* this call deviates from the rest, eating the buffer */
+ audit_log_time(context, &ab);
+ break;
}
audit_log_end(ab);
}
@@ -2518,31 +2570,26 @@ void __audit_fanotify(unsigned int response)
void __audit_tk_injoffset(struct timespec64 offset)
{
- audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_INJOFFSET,
- "sec=%lli nsec=%li",
- (long long)offset.tv_sec, offset.tv_nsec);
-}
-
-static void audit_log_ntp_val(const struct audit_ntp_data *ad,
- const char *op, enum audit_ntp_type type)
-{
- const struct audit_ntp_val *val = &ad->vals[type];
-
- if (val->newval == val->oldval)
- return;
+ struct audit_context *context = audit_context();
- audit_log(audit_context(), GFP_KERNEL, AUDIT_TIME_ADJNTPVAL,
- "op=%s old=%lli new=%lli", op, val->oldval, val->newval);
+ /* only set type if not already set by NTP */
+ if (!context->type)
+ context->type = AUDIT_TIME_INJOFFSET;
+ memcpy(&context->time.tk_injoffset, &offset, sizeof(offset));
}
void __audit_ntp_log(const struct audit_ntp_data *ad)
{
- audit_log_ntp_val(ad, "offset", AUDIT_NTP_OFFSET);
- audit_log_ntp_val(ad, "freq", AUDIT_NTP_FREQ);
- audit_log_ntp_val(ad, "status", AUDIT_NTP_STATUS);
- audit_log_ntp_val(ad, "tai", AUDIT_NTP_TAI);
- audit_log_ntp_val(ad, "tick", AUDIT_NTP_TICK);
- audit_log_ntp_val(ad, "adjust", AUDIT_NTP_ADJUST);
+ struct audit_context *context = audit_context();
+ int type;
+
+ for (type = 0; type < AUDIT_NTP_NVALS; type++)
+ if (ad->vals[type].newval != ad->vals[type].oldval) {
+ /* unconditionally set type, overwriting TK */
+ context->type = AUDIT_TIME_ADJNTPVAL;
+ memcpy(&context->time.ntp_data, ad, sizeof(*ad));
+ break;
+ }
}
static void audit_log_task(struct audit_buffer *ab)
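The timekeeping hooks above no longer emit records inline; they stash the
values in the per-task audit_context and show_special() formats them at
syscall exit, keeping record generation out of the timekeeping paths and
grouping the time-change records with the rest of the syscall event. A
sketch of the defer-then-emit shape, using a hypothetical AUDIT_FOO record
type and context field:

    /* syscall-time hook: record only, no allocation, no logging */
    void __audit_foo(u64 val)
    {
            struct audit_context *ctx = audit_context();

            ctx->type = AUDIT_FOO;          /* hypothetical type  */
            ctx->foo_val = val;             /* hypothetical field */
    }

    /* syscall exit, in show_special(): safe to allocate and log */
    case AUDIT_FOO:
            audit_log_format(ab, "foo=%llu", ctx->foo_val);
            break;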
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 6684696fa457..4b2819b0a05a 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -94,7 +94,7 @@ static struct hlist_head *dev_map_create_hash(unsigned int entries,
int i;
struct hlist_head *hash;
- hash = bpf_map_area_alloc(entries * sizeof(*hash), numa_node);
+ hash = bpf_map_area_alloc((u64) entries * sizeof(*hash), numa_node);
if (hash != NULL)
for (i = 0; i < entries; i++)
INIT_HLIST_HEAD(&hash[i]);
@@ -159,7 +159,7 @@ static int dev_map_init_map(struct bpf_dtab *dtab, union bpf_attr *attr)
spin_lock_init(&dtab->index_lock);
} else {
- dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
+ dtab->netdev_map = bpf_map_area_alloc((u64) dtab->map.max_entries *
sizeof(struct bpf_dtab_netdev *),
dtab->map.numa_node);
if (!dtab->netdev_map)
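The (u64) casts above matter on 32-bit builds, where max_entries and size_t
are both 32 bits wide: the multiplication wraps before it is widened to
bpf_map_area_alloc()'s u64 size parameter, and a huge map silently becomes a
tiny allocation. A worked example, assuming a 32-bit build with 4-byte
pointers:

    unsigned int entries = 0x40000000;      /* 2^30, a valid u32 value  */
    size_t elem = 4;                        /* sizeof(*hash) on 32-bit  */

    u64 bad  = entries * elem;              /* 2^32 wraps to 0          */
    u64 good = (u64)entries * elem;         /* 0x100000000 as intended  */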
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
index bf03d04a9e2f..9ebdcdaa5f16 100644
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -39,7 +39,8 @@ static DEFINE_SPINLOCK(prog_idr_lock);
static DEFINE_IDR(map_idr);
static DEFINE_SPINLOCK(map_idr_lock);
-int sysctl_unprivileged_bpf_disabled __read_mostly;
+int sysctl_unprivileged_bpf_disabled __read_mostly =
+ IS_BUILTIN(CONFIG_BPF_UNPRIV_DEFAULT_OFF) ? 2 : 0;
static const struct bpf_map_ops * const bpf_map_types[] = {
#define BPF_PROG_TYPE(_id, _ops)
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 9c5fa5c52903..34262d83dce1 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5372,7 +5372,7 @@ static void find_good_pkt_pointers(struct bpf_verifier_state *vstate,
new_range = dst_reg->off;
if (range_right_open)
- new_range--;
+ new_range++;
/* Examples for register markings:
*
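For context on the one-line fix above: range_right_open is set for strict
comparisons, and a strict bound proves one extra accessible byte, not one
fewer. A worked example:

    /* if (pkt + 8 < pkt_end) implies pkt_end >= pkt + 9, so bytes
     * pkt[0..8] -- nine bytes -- are provably in bounds:
     *
     *      pkt + 8 <  pkt_end  =>  safe range = off + 1 = 9  (new_range++)
     *      pkt + 8 <= pkt_end  =>  safe range = off     = 8
     *
     * the old new_range-- computed off - 1 = 7, two bytes short of the
     * provable range -- hence an off-by-two.
     */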
diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h
index 809e34a3c017..236f290224aa 100644
--- a/kernel/cgroup/cgroup-internal.h
+++ b/kernel/cgroup/cgroup-internal.h
@@ -65,6 +65,25 @@ static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc)
return container_of(kfc, struct cgroup_fs_context, kfc);
}
+struct cgroup_pidlist;
+
+struct cgroup_file_ctx {
+ struct cgroup_namespace *ns;
+
+ struct {
+ void *trigger;
+ } psi;
+
+ struct {
+ bool started;
+ struct css_task_iter iter;
+ } procs;
+
+ struct {
+ struct cgroup_pidlist *pidlist;
+ } procs1;
+};
+
/*
* A cgroup can be associated with multiple css_sets as different tasks may
* belong to different cgroups on different hierarchies. In the other
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c
index 2d0ef613ca07..117d70098cd4 100644
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -398,6 +398,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
* next pid to display, if any
*/
struct kernfs_open_file *of = s->private;
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = seq_css(s)->cgroup;
struct cgroup_pidlist *l;
enum cgroup_filetype type = seq_cft(s)->private;
@@ -407,25 +408,24 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
mutex_lock(&cgrp->pidlist_mutex);
/*
- * !NULL @of->priv indicates that this isn't the first start()
- * after open. If the matching pidlist is around, we can use that.
- * Look for it. Note that @of->priv can't be used directly. It
- * could already have been destroyed.
+ * !NULL @ctx->procs1.pidlist indicates that this isn't the first
+ * start() after open. If the matching pidlist is around, we can use
+ * that. Look for it. Note that @ctx->procs1.pidlist can't be used
+ * directly. It could already have been destroyed.
*/
- if (of->priv)
- of->priv = cgroup_pidlist_find(cgrp, type);
+ if (ctx->procs1.pidlist)
+ ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type);
/*
* Either this is the first start() after open or the matching
* pidlist has been destroyed in between. Create a new one.
*/
- if (!of->priv) {
- ret = pidlist_array_load(cgrp, type,
- (struct cgroup_pidlist **)&of->priv);
+ if (!ctx->procs1.pidlist) {
+ ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist);
if (ret)
return ERR_PTR(ret);
}
- l = of->priv;
+ l = ctx->procs1.pidlist;
if (pid) {
int end = l->length;
@@ -453,7 +453,8 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos)
static void cgroup_pidlist_stop(struct seq_file *s, void *v)
{
struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
if (l)
mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork,
@@ -464,7 +465,8 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v)
static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
- struct cgroup_pidlist *l = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct cgroup_pidlist *l = ctx->procs1.pidlist;
pid_t *p = v;
pid_t *end = l->list + l->length;
/*
@@ -507,10 +509,11 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
goto out_unlock;
/*
- * Even if we're attaching all tasks in the thread group, we only
- * need to check permissions on one of them.
+ * Even if we're attaching all tasks in the thread group, we only need
+ * to check permissions on one of them. Check permissions using the
+ * credentials from file open to protect against inherited fd attacks.
*/
- cred = current_cred();
+ cred = of->file->f_cred;
tcred = get_task_cred(task);
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
@@ -549,6 +552,14 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of,
BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+ /*
+ * Release agent gets called with all capabilities,
+ * require capabilities to set release agent.
+ */
+ if ((of->file->f_cred->user_ns != &init_user_ns) ||
+ !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
cgrp = cgroup_kn_lock_live(of->kn, false);
if (!cgrp)
return -ENODEV;
@@ -961,6 +972,12 @@ int cgroup1_parse_param(struct fs_context *fc, struct fs_parameter *param)
/* Specifying two release agents is forbidden */
if (ctx->release_agent)
return cg_invalf(fc, "cgroup1: release_agent respecified");
+ /*
+ * Release agent gets called with all capabilities,
+ * require capabilities to set release agent.
+ */
+ if ((fc->user_ns != &init_user_ns) || !capable(CAP_SYS_ADMIN))
+ return cg_invalf(fc, "cgroup1: Setting release_agent not allowed");
ctx->release_agent = param->string;
param->string = NULL;
break;
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index 1904ffcee0f1..177d57ce9016 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -3648,6 +3648,7 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v)
static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, enum psi_res res)
{
+ struct cgroup_file_ctx *ctx = of->priv;
struct psi_trigger *new;
struct cgroup *cgrp;
struct psi_group *psi;
@@ -3659,6 +3660,12 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
cgroup_get(cgrp);
cgroup_kn_unlock(of->kn);
+ /* Allow only one trigger per file descriptor */
+ if (ctx->psi.trigger) {
+ cgroup_put(cgrp);
+ return -EBUSY;
+ }
+
psi = cgroup_ino(cgrp) == 1 ? &psi_system : &cgrp->psi;
new = psi_trigger_create(psi, buf, nbytes, res);
if (IS_ERR(new)) {
@@ -3666,8 +3673,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf,
return PTR_ERR(new);
}
- psi_trigger_replace(&of->priv, new);
-
+ smp_store_release(&ctx->psi.trigger, new);
cgroup_put(cgrp);
return nbytes;
@@ -3697,12 +3703,15 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of,
static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of,
poll_table *pt)
{
- return psi_trigger_poll(&of->priv, of->file, pt);
+ struct cgroup_file_ctx *ctx = of->priv;
+ return psi_trigger_poll(&ctx->psi.trigger, of->file, pt);
}
static void cgroup_pressure_release(struct kernfs_open_file *of)
{
- psi_trigger_replace(&of->priv, NULL);
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ psi_trigger_destroy(ctx->psi.trigger);
}
#endif /* CONFIG_PSI */
@@ -3743,24 +3752,43 @@ static ssize_t cgroup_freeze_write(struct kernfs_open_file *of,
static int cgroup_file_open(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
+ struct cgroup_file_ctx *ctx;
+ int ret;
- if (cft->open)
- return cft->open(of);
- return 0;
+ ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
+ if (!ctx)
+ return -ENOMEM;
+
+ ctx->ns = current->nsproxy->cgroup_ns;
+ get_cgroup_ns(ctx->ns);
+ of->priv = ctx;
+
+ if (!cft->open)
+ return 0;
+
+ ret = cft->open(of);
+ if (ret) {
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
+ }
+ return ret;
}
static void cgroup_file_release(struct kernfs_open_file *of)
{
struct cftype *cft = of->kn->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
if (cft->release)
cft->release(of);
+ put_cgroup_ns(ctx->ns);
+ kfree(ctx);
}
static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *cgrp = of->kn->parent->priv;
struct cftype *cft = of->kn->priv;
struct cgroup_subsys_state *css;
@@ -3774,7 +3802,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf,
*/
if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) &&
!(cft->flags & CFTYPE_NS_DELEGATABLE) &&
- ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp)
+ ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp)
return -EPERM;
if (cft->write)
@@ -4682,21 +4710,21 @@ void css_task_iter_end(struct css_task_iter *it)
static void cgroup_procs_release(struct kernfs_open_file *of)
{
- if (of->priv) {
- css_task_iter_end(of->priv);
- kfree(of->priv);
- }
+ struct cgroup_file_ctx *ctx = of->priv;
+
+ if (ctx->procs.started)
+ css_task_iter_end(&ctx->procs.iter);
}
static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos)
{
struct kernfs_open_file *of = s->private;
- struct css_task_iter *it = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
if (pos)
(*pos)++;
- return css_task_iter_next(it);
+ return css_task_iter_next(&ctx->procs.iter);
}
static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
@@ -4704,21 +4732,18 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos,
{
struct kernfs_open_file *of = s->private;
struct cgroup *cgrp = seq_css(s)->cgroup;
- struct css_task_iter *it = of->priv;
+ struct cgroup_file_ctx *ctx = of->priv;
+ struct css_task_iter *it = &ctx->procs.iter;
/*
* When a seq_file is seeked, it's always traversed sequentially
* from position 0, so we can simply keep iterating on !0 *pos.
*/
- if (!it) {
+ if (!ctx->procs.started) {
if (WARN_ON_ONCE((*pos)))
return ERR_PTR(-EINVAL);
-
- it = kzalloc(sizeof(*it), GFP_KERNEL);
- if (!it)
- return ERR_PTR(-ENOMEM);
- of->priv = it;
css_task_iter_start(&cgrp->self, iter_flags, it);
+ ctx->procs.started = true;
} else if (!(*pos)) {
css_task_iter_end(it);
css_task_iter_start(&cgrp->self, iter_flags, it);
@@ -4753,9 +4778,9 @@ static int cgroup_procs_show(struct seq_file *s, void *v)
static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
struct cgroup *dst_cgrp,
- struct super_block *sb)
+ struct super_block *sb,
+ struct cgroup_namespace *ns)
{
- struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
struct cgroup *com_cgrp = src_cgrp;
struct inode *inode;
int ret;
@@ -4791,8 +4816,10 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
+ const struct cred *saved_cred;
ssize_t ret;
dst_cgrp = cgroup_kn_lock_live(of->kn, false);
@@ -4809,8 +4836,16 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
spin_unlock_irq(&css_set_lock);
+ /*
+ * Process and thread migrations follow same delegation rule. Check
+ * permissions using the credentials from file open to protect against
+ * inherited fd attacks.
+ */
+ saved_cred = override_creds(of->file->f_cred);
ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb);
+ of->file->f_path.dentry->d_sb,
+ ctx->ns);
+ revert_creds(saved_cred);
if (ret)
goto out_finish;
@@ -4832,8 +4867,10 @@ static void *cgroup_threads_start(struct seq_file *s, loff_t *pos)
static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
+ struct cgroup_file_ctx *ctx = of->priv;
struct cgroup *src_cgrp, *dst_cgrp;
struct task_struct *task;
+ const struct cred *saved_cred;
ssize_t ret;
buf = strstrip(buf);
@@ -4852,9 +4889,16 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
spin_unlock_irq(&css_set_lock);
- /* thread migrations follow the cgroup.procs delegation rule */
+ /*
+ * Process and thread migrations follow same delegation rule. Check
+ * permissions using the credentials from file open to protect against
+ * inherited fd attacks.
+ */
+ saved_cred = override_creds(of->file->f_cred);
ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
- of->file->f_path.dentry->d_sb);
+ of->file->f_path.dentry->d_sb,
+ ctx->ns);
+ revert_creds(saved_cred);
if (ret)
goto out_finish;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index badfa8f15359..3674798ade1f 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -1473,10 +1473,15 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct cpuset *sibling;
struct cgroup_subsys_state *pos_css;
+ percpu_rwsem_assert_held(&cpuset_rwsem);
+
/*
* Check all its siblings and call update_cpumasks_hier()
* if their use_parent_ecpus flag is set in order for them
* to use the right effective_cpus value.
+ *
+ * The update_cpumasks_hier() function may sleep. So we have to
+ * release the RCU read lock before calling it.
*/
rcu_read_lock();
cpuset_for_each_child(sibling, pos_css, parent) {
@@ -1484,8 +1489,13 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
continue;
if (!sibling->use_parent_ecpus)
continue;
+ if (!css_tryget_online(&sibling->css))
+ continue;
+ rcu_read_unlock();
update_cpumasks_hier(sibling, tmp);
+ rcu_read_lock();
+ css_put(&sibling->css);
}
rcu_read_unlock();
}
@@ -1558,8 +1568,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
* Make sure that subparts_cpus is a subset of cpus_allowed.
*/
if (cs->nr_subparts_cpus) {
- cpumask_andnot(cs->subparts_cpus, cs->subparts_cpus,
- cs->cpus_allowed);
+ cpumask_and(cs->subparts_cpus, cs->subparts_cpus, cs->cpus_allowed);
cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus);
}
spin_unlock_irq(&callback_lock);
@@ -2195,6 +2204,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
+ cpus_read_lock();
percpu_down_write(&cpuset_rwsem);
/* prepare for attach */
@@ -2250,6 +2260,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
wake_up(&cpuset_attach_wq);
percpu_up_write(&cpuset_rwsem);
+ cpus_read_unlock();
}
/* The various types of files and directories in a cpuset file system */
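The update_sibling_cpumasks() change above is the standard recipe for calling
a sleeping function from inside an RCU-protected cgroup iteration: pin the
current position with css_tryget_online(), drop the RCU read lock around the
sleeping call, then re-acquire it and release the reference. Sketched on its
own, with a hypothetical do_sleeping_work():

    rcu_read_lock();
    cpuset_for_each_child(child, pos_css, parent) {
            if (!css_tryget_online(&child->css))
                    continue;               /* child already going away */
            rcu_read_unlock();

            do_sleeping_work(child);        /* may sleep safely here */

            rcu_read_lock();
            css_put(&child->css);           /* the ref kept *child valid */
    }
    rcu_read_unlock();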
diff --git a/kernel/dma/debug.c b/kernel/dma/debug.c
index b28665f4d8c7..4dc3bbfd3e3f 100644
--- a/kernel/dma/debug.c
+++ b/kernel/dma/debug.c
@@ -980,7 +980,7 @@ static __init int dma_debug_cmdline(char *str)
global_disable = true;
}
- return 0;
+ return 1;
}
static __init int dma_debug_entries_cmdline(char *str)
@@ -989,7 +989,7 @@ static __init int dma_debug_entries_cmdline(char *str)
return -EINVAL;
if (!get_option(&str, &nr_prealloc_entries))
nr_prealloc_entries = PREALLOC_DMA_DEBUG_ENTRIES;
- return 0;
+ return 1;
}
__setup("dma_debug=", dma_debug_cmdline);
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 0a093a675b63..f04cfc2e9e01 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -306,7 +306,8 @@ void dma_direct_unmap_page(struct device *dev, dma_addr_t addr,
dma_direct_sync_single_for_cpu(dev, addr, size, dir);
if (unlikely(is_swiotlb_buffer(phys)))
- swiotlb_tbl_unmap_single(dev, phys, size, size, dir, attrs);
+ swiotlb_tbl_unmap_single(dev, phys, size, size, dir,
+ attrs | DMA_ATTR_SKIP_CPU_SYNC);
}
EXPORT_SYMBOL(dma_direct_unmap_page);
diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
index f99b79d7e123..f17b771856d1 100644
--- a/kernel/dma/swiotlb.c
+++ b/kernel/dma/swiotlb.c
@@ -572,7 +572,8 @@ found:
for (i = 0; i < nslots; i++)
io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT);
if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) &&
- (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL))
+ (!(attrs & DMA_ATTR_OVERWRITE) || dir == DMA_TO_DEVICE ||
+ dir == DMA_BIDIRECTIONAL))
swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE);
return tlb_addr;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 1993a741d2dc..52f4a9e46704 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -799,7 +799,7 @@ static DEFINE_PER_CPU(struct list_head, cgrp_cpuctx_list);
*/
static void perf_cgroup_switch(struct task_struct *task, int mode)
{
- struct perf_cpu_context *cpuctx;
+ struct perf_cpu_context *cpuctx, *tmp;
struct list_head *list;
unsigned long flags;
@@ -810,7 +810,7 @@ static void perf_cgroup_switch(struct task_struct *task, int mode)
local_irq_save(flags);
list = this_cpu_ptr(&cgrp_cpuctx_list);
- list_for_each_entry(cpuctx, list, cgrp_cpuctx_entry) {
+ list_for_each_entry_safe(cpuctx, tmp, list, cgrp_cpuctx_entry) {
WARN_ON_ONCE(cpuctx->ctx.nr_cgroups == 0);
perf_ctx_lock(cpuctx, cpuctx->task_ctx);
@@ -6045,18 +6045,25 @@ static void perf_pending_event(struct irq_work *entry)
* Later on, we might change it to a list if there is
* another virtualization implementation supporting the callbacks.
*/
-struct perf_guest_info_callbacks *perf_guest_cbs;
+struct perf_guest_info_callbacks __rcu *perf_guest_cbs;
int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
- perf_guest_cbs = cbs;
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs)))
+ return -EBUSY;
+
+ rcu_assign_pointer(perf_guest_cbs, cbs);
return 0;
}
EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
{
- perf_guest_cbs = NULL;
+ if (WARN_ON_ONCE(rcu_access_pointer(perf_guest_cbs) != cbs))
+ return -EINVAL;
+
+ rcu_assign_pointer(perf_guest_cbs, NULL);
+ synchronize_rcu();
return 0;
}
EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
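With perf_guest_cbs now an __rcu pointer, readers are expected to take a
single rcu_dereference() snapshot inside an RCU read-side critical section;
the synchronize_rcu() in the unregister path then guarantees the callbacks
cannot be freed out from under an in-flight reader. A sketch of the matching
reader side, assuming this era's is_in_guest() callback member and a
hypothetical caller:

    static bool in_guest_now(void)
    {
            struct perf_guest_info_callbacks *cbs;
            bool ret = false;

            rcu_read_lock();
            cbs = rcu_dereference(perf_guest_cbs);
            if (cbs)
                    ret = cbs->is_in_guest();
            rcu_read_unlock();
            return ret;
    }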
@@ -6536,7 +6543,6 @@ void perf_output_sample(struct perf_output_handle *handle,
static u64 perf_virt_to_phys(u64 virt)
{
u64 phys_addr = 0;
- struct page *p = NULL;
if (!virt)
return 0;
@@ -6555,14 +6561,15 @@ static u64 perf_virt_to_phys(u64 virt)
* If failed, leave phys_addr as 0.
*/
if (current->mm != NULL) {
+ struct page *p;
+
pagefault_disable();
- if (__get_user_pages_fast(virt, 1, 0, &p) == 1)
+ if (__get_user_pages_fast(virt, 1, 0, &p) == 1) {
phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+ put_page(p);
+ }
pagefault_enable();
}
-
- if (p)
- put_page(p);
}
return phys_addr;
@@ -9483,8 +9490,11 @@ perf_event_parse_addr_filter(struct perf_event *event, char *fstr,
}
/* ready to consume more filters */
+ kfree(filename);
+ filename = NULL;
state = IF_STATE_ACTION;
filter = NULL;
+ kernel = 0;
}
}
diff --git a/kernel/fork.c b/kernel/fork.c
index cf2cebd214b9..5b4a19682207 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -2182,10 +2182,6 @@ static __latent_entropy struct task_struct *copy_process(
goto bad_fork_cancel_cgroup;
}
- /* past the last point of failure */
- if (pidfile)
- fd_install(pidfd, pidfile);
-
init_task_pid_links(p);
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -2234,6 +2230,9 @@ static __latent_entropy struct task_struct *copy_process(
syscall_tracepoint_update(p);
write_unlock_irq(&tasklist_lock);
+ if (pidfile)
+ fd_install(pidfd, pidfile);
+
proc_fork_connector(p);
cgroup_post_fork(p);
cgroup_threadgroup_change_end(current);
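Moving fd_install() past the last failure point matters because installing
the fd publishes the file to userspace (and to other threads through the
shared file table) immediately; after that, an error path can no longer
unwind it with put_unused_fd()/fput(). The general shape, sketched with a
hypothetical demo_fops:

    int fd = get_unused_fd_flags(O_CLOEXEC);
    struct file *file;

    if (fd < 0)
            return fd;
    file = anon_inode_getfile("[demo]", &demo_fops, NULL, O_RDWR);
    if (IS_ERR(file)) {
            put_unused_fd(fd);
            return PTR_ERR(file);
    }
    /* ... every remaining failure check goes before this line ... */
    fd_install(fd, file);   /* point of no return: fd is now live */
    return fd;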
diff --git a/kernel/irq/affinity.c b/kernel/irq/affinity.c
index 4d89ad4fae3b..5fb78addff51 100644
--- a/kernel/irq/affinity.c
+++ b/kernel/irq/affinity.c
@@ -269,8 +269,9 @@ static int __irq_build_affinity_masks(unsigned int startvec,
*/
if (numvecs <= nodes) {
for_each_node_mask(n, nodemsk) {
- cpumask_or(&masks[curvec].mask, &masks[curvec].mask,
- node_to_cpumask[n]);
+ /* Ensure that only CPUs which are in both masks are set */
+ cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
+ cpumask_or(&masks[curvec].mask, &masks[curvec].mask, nmsk);
if (++curvec == last_affv)
curvec = firstvec;
}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index c9d8eb7f5c02..ba4d742c1c65 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -29,12 +29,14 @@ extern struct irqaction chained_action;
* IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
* IRQTF_AFFINITY - irq thread is requested to adjust affinity
* IRQTF_FORCED_THREAD - irq action is force threaded
+ * IRQTF_READY - signals that irq thread is ready
*/
enum {
IRQTF_RUNTHREAD,
IRQTF_WARNED,
IRQTF_AFFINITY,
IRQTF_FORCED_THREAD,
+ IRQTF_READY,
};
/*
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 9be995fc3c5a..172b5e6bc4c2 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -405,6 +405,7 @@ static struct irq_desc *alloc_desc(int irq, int node, unsigned int flags,
lockdep_set_class(&desc->lock, &irq_desc_lock_class);
mutex_init(&desc->request_mutex);
init_rcu_head(&desc->rcu);
+ init_waitqueue_head(&desc->wait_for_threads);
desc_set_defaults(irq, desc, node, affinity, owner);
irqd_set(&desc->irq_data, flags);
@@ -573,6 +574,7 @@ int __init early_irq_init(void)
raw_spin_lock_init(&desc[i].lock);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
mutex_init(&desc[i].request_mutex);
+ init_waitqueue_head(&desc[i].wait_for_threads);
desc_set_defaults(i, &desc[i], node, NULL, NULL);
}
return arch_early_irq_init();
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 918fe0593386..79214f983624 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1103,6 +1103,31 @@ static void irq_wake_secondary(struct irq_desc *desc, struct irqaction *action)
}
/*
+ * Internal function to notify that an interrupt thread is ready.
+ */
+static void irq_thread_set_ready(struct irq_desc *desc,
+ struct irqaction *action)
+{
+ set_bit(IRQTF_READY, &action->thread_flags);
+ wake_up(&desc->wait_for_threads);
+}
+
+/*
+ * Internal function to wake up an interrupt thread and wait until it is
+ * ready.
+ */
+static void wake_up_and_wait_for_irq_thread_ready(struct irq_desc *desc,
+ struct irqaction *action)
+{
+ if (!action || !action->thread)
+ return;
+
+ wake_up_process(action->thread);
+ wait_event(desc->wait_for_threads,
+ test_bit(IRQTF_READY, &action->thread_flags));
+}
+
+/*
* Interrupt handler thread
*/
static int irq_thread(void *data)
@@ -1113,6 +1138,8 @@ static int irq_thread(void *data)
irqreturn_t (*handler_fn)(struct irq_desc *desc,
struct irqaction *action);
+ irq_thread_set_ready(desc, action);
+
if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD,
&action->thread_flags))
handler_fn = irq_forced_thread_fn;
@@ -1541,8 +1568,6 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
}
if (!shared) {
- init_waitqueue_head(&desc->wait_for_threads);
-
/* Setup the type (level, edge polarity) if configured: */
if (new->flags & IRQF_TRIGGER_MASK) {
ret = __irq_set_trigger(desc,
@@ -1632,14 +1657,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
irq_setup_timings(desc, new);
- /*
- * Strictly no need to wake it up, but hung_task complains
- * when no hard interrupt wakes the thread up.
- */
- if (new->thread)
- wake_up_process(new->thread);
- if (new->secondary)
- wake_up_process(new->secondary->thread);
+ wake_up_and_wait_for_irq_thread_ready(desc, new);
+ wake_up_and_wait_for_irq_thread_ready(desc, new->secondary);
register_irq_proc(irq, desc);
new->dir = NULL;
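The wake-and-wait handshake above ensures request_irq() does not return until
the handler thread has actually reached irq_thread() and set IRQTF_READY,
closing the window where an early interrupt could depend on a thread that has
not finished initializing; the matching init_waitqueue_head() moves in
irqdesc.c keep the waitqueue valid for the descriptor's whole lifetime. For a
single waiter the same handshake is often written with a completion; a
sketch, assuming one thread per completion (the irq code instead shares one
waitqueue across actions):

    static DECLARE_COMPLETION(thread_ready);        /* hypothetical */

    static int my_thread(void *data)
    {
            complete(&thread_ready);        /* fully initialized */
            /* ... main loop ... */
            return 0;
    }

    /* setup side */
    wake_up_process(task);
    wait_for_completion(&thread_ready);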
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1668439b269d..c93340bae3ac 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -2003,6 +2003,9 @@ int register_kretprobe(struct kretprobe *rp)
}
}
+ if (rp->data_size > KRETPROBE_MAX_DATA_SIZE)
+ return -E2BIG;
+
rp->kp.pre_handler = pre_handler_kretprobe;
rp->kp.post_handler = NULL;
rp->kp.fault_handler = NULL;
diff --git a/kernel/module.c b/kernel/module.c
index 59d487b8d8da..7c724356aca3 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -3377,7 +3377,7 @@ static int find_module_sections(struct module *mod, struct load_info *info)
#endif
#ifdef CONFIG_FTRACE_MCOUNT_RECORD
/* sechdrs[0].sh_size is always zero */
- mod->ftrace_callsites = section_objs(info, "__mcount_loc",
+ mod->ftrace_callsites = section_objs(info, FTRACE_CALLSITE_SECTION,
sizeof(*mod->ftrace_callsites),
&mod->num_ftrace_callsites);
#endif
@@ -3711,12 +3711,6 @@ static noinline int do_init_module(struct module *mod)
}
freeinit->module_init = mod->init_layout.base;
- /*
- * We want to find out whether @mod uses async during init. Clear
- * PF_USED_ASYNC. async_schedule*() will set it.
- */
- current->flags &= ~PF_USED_ASYNC;
-
do_mod_ctors(mod);
/* Start the module */
if (mod->init != NULL)
@@ -3742,22 +3736,13 @@ static noinline int do_init_module(struct module *mod)
/*
* We need to finish all async code before the module init sequence
- * is done. This has potential to deadlock. For example, a newly
- * detected block device can trigger request_module() of the
- * default iosched from async probing task. Once userland helper
- * reaches here, async_synchronize_full() will wait on the async
- * task waiting on request_module() and deadlock.
- *
- * This deadlock is avoided by perfomring async_synchronize_full()
- * iff module init queued any async jobs. This isn't a full
- * solution as it will deadlock the same if module loading from
- * async jobs nests more than once; however, due to the various
- * constraints, this hack seems to be the best option for now.
- * Please refer to the following thread for details.
+ * is done. This has potential to deadlock if synchronous module
+ * loading is requested from async (which is not allowed!).
*
- * http://thread.gmane.org/gmane.linux.kernel/1420814
+ * See commit 0fdff3ec6d87 ("async, kmod: warn on synchronous
+ * request_module() from async workers") for more details.
*/
- if (!mod->async_probe_requested && (current->flags & PF_USED_ASYNC))
+ if (!mod->async_probe_requested)
async_synchronize_full();
ftrace_free_mem(mod, mod->init_layout.base, mod->init_layout.base +
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 69c4cd472def..406b4cbbec5e 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -676,7 +676,7 @@ static int load_image_and_restore(void)
goto Unlock;
error = swsusp_read(&flags);
- swsusp_close(FMODE_READ);
+ swsusp_close(FMODE_READ | FMODE_EXCL);
if (!error)
hibernation_restore(flags & SF_PLATFORM_MODE);
@@ -871,7 +871,7 @@ static int software_resume(void)
/* The snapshot device should not be opened while we're running */
if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
error = -EBUSY;
- swsusp_close(FMODE_READ);
+ swsusp_close(FMODE_READ | FMODE_EXCL);
goto Unlock;
}
@@ -907,7 +907,7 @@ static int software_resume(void)
pm_pr_dbg("Hibernation image not present or could not be loaded.\n");
return error;
Close_Finish:
- swsusp_close(FMODE_READ);
+ swsusp_close(FMODE_READ | FMODE_EXCL);
goto Finish;
}
@@ -1216,7 +1216,7 @@ static int __init resumedelay_setup(char *str)
int rc = kstrtouint(str, 0, &resume_delay);
if (rc)
- return rc;
+ pr_warn("resumedelay: bad option string '%s'\n", str);
return 1;
}
diff --git a/kernel/power/main.c b/kernel/power/main.c
index e26de7af520b..718884857830 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -472,7 +472,10 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA;
+ if (!pm_wakeup_irq())
+ return -ENODATA;
+
+ return sprintf(buf, "%u\n", pm_wakeup_irq());
}
power_attr_ro(pm_wakeup_irq);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 4b6a54da7e65..e9f4def4d291 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -134,7 +134,7 @@ int freeze_processes(void)
if (!pm_freezing)
atomic_inc(&system_freezing_cnt);
- pm_wakeup_clear(true);
+ pm_wakeup_clear(0);
pr_info("Freezing user space processes ... ");
pm_freezing = true;
error = try_to_freeze_tasks(true);
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index d65f2d5ab694..46455aa7951e 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -945,8 +945,7 @@ static void memory_bm_recycle(struct memory_bitmap *bm)
* Register a range of page frames the contents of which should not be saved
* during hibernation (to be used in the early initialization code).
*/
-void __init __register_nosave_region(unsigned long start_pfn,
- unsigned long end_pfn, int use_kmalloc)
+void __init register_nosave_region(unsigned long start_pfn, unsigned long end_pfn)
{
struct nosave_region *region;
@@ -962,18 +961,12 @@ void __init __register_nosave_region(unsigned long start_pfn,
goto Report;
}
}
- if (use_kmalloc) {
- /* During init, this shouldn't fail */
- region = kmalloc(sizeof(struct nosave_region), GFP_KERNEL);
- BUG_ON(!region);
- } else {
- /* This allocation cannot fail */
- region = memblock_alloc(sizeof(struct nosave_region),
- SMP_CACHE_BYTES);
- if (!region)
- panic("%s: Failed to allocate %zu bytes\n", __func__,
- sizeof(struct nosave_region));
- }
+ /* This allocation cannot fail */
+ region = memblock_alloc(sizeof(struct nosave_region),
+ SMP_CACHE_BYTES);
+ if (!region)
+ panic("%s: Failed to allocate %zu bytes\n", __func__,
+ sizeof(struct nosave_region));
region->start_pfn = start_pfn;
region->end_pfn = end_pfn;
list_add_tail(&region->list, &nosave_regions);
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index 27f149f5d4a9..5dea2778a3db 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -138,8 +138,6 @@ static void s2idle_loop(void)
break;
}
- pm_wakeup_clear(false);
-
s2idle_enter();
}
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 60564b58de07..bfd2a96c695c 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -157,22 +157,22 @@ static int __init setup_test_suspend(char *value)
value++;
suspend_type = strsep(&value, ",");
if (!suspend_type)
- return 0;
+ return 1;
repeat = strsep(&value, ",");
if (repeat) {
if (kstrtou32(repeat, 0, &test_repeat_count_max))
- return 0;
+ return 1;
}
for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
if (!strcmp(pm_labels[i], suspend_type)) {
test_state_label = pm_labels[i];
- return 0;
+ return 1;
}
printk(warn_bad_state, suspend_type);
- return 0;
+ return 1;
}
__setup("test_suspend", setup_test_suspend);
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
index 105df4dfc783..52571dcad768 100644
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -39,23 +39,20 @@ ssize_t pm_show_wakelocks(char *buf, bool show_active)
{
struct rb_node *node;
struct wakelock *wl;
- char *str = buf;
- char *end = buf + PAGE_SIZE;
+ int len = 0;
mutex_lock(&wakelocks_lock);
for (node = rb_first(&wakelocks_tree); node; node = rb_next(node)) {
wl = rb_entry(node, struct wakelock, node);
if (wl->ws->active == show_active)
- str += scnprintf(str, end - str, "%s ", wl->name);
+ len += sysfs_emit_at(buf, len, "%s ", wl->name);
}
- if (str > buf)
- str--;
- str += scnprintf(str, end - str, "\n");
+ len += sysfs_emit_at(buf, len, "\n");
mutex_unlock(&wakelocks_lock);
- return (str - buf);
+ return len;
}
#if CONFIG_PM_WAKELOCKS_LIMIT > 0
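The wakelock conversion above swaps manual scnprintf() pointer arithmetic for
sysfs_emit_at(), which verifies that buf is a page-aligned sysfs buffer,
bounds the write to PAGE_SIZE, and returns the number of bytes added. The
typical show() shape, with hypothetical names:

    static ssize_t items_show(struct kobject *kobj,
                              struct kobj_attribute *attr, char *buf)
    {
            int len = 0;
            int i;

            for (i = 0; i < nr_items; i++)  /* hypothetical */
                    len += sysfs_emit_at(buf, len, "%s ", item_name(i));
            len += sysfs_emit_at(buf, len, "\n");
            return len;
    }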
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
index 23e26a203a9e..bb2198b40756 100644
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -146,8 +146,10 @@ static int __control_devkmsg(char *str)
static int __init control_devkmsg(char *str)
{
- if (__control_devkmsg(str) < 0)
+ if (__control_devkmsg(str) < 0) {
+ pr_warn("printk.devkmsg: bad option string '%s'\n", str);
return 1;
+ }
/*
* Set sysctl string accordingly:
@@ -166,7 +168,7 @@ static int __init control_devkmsg(char *str)
*/
devkmsg_log |= DEVKMSG_LOG_MASK_LOCK;
- return 0;
+ return 1;
}
__setup("printk.devkmsg=", control_devkmsg);
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index eb4d04cb3aaf..d99f73f83bf5 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -370,6 +370,26 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
return !err;
}
+static int check_ptrace_options(unsigned long data)
+{
+ if (data & ~(unsigned long)PTRACE_O_MASK)
+ return -EINVAL;
+
+ if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
+ if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
+ !IS_ENABLED(CONFIG_SECCOMP))
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
+ current->ptrace & PT_SUSPEND_SECCOMP)
+ return -EPERM;
+ }
+ return 0;
+}
+
static int ptrace_attach(struct task_struct *task, long request,
unsigned long addr,
unsigned long flags)
@@ -381,8 +401,16 @@ static int ptrace_attach(struct task_struct *task, long request,
if (seize) {
if (addr != 0)
goto out;
+ /*
+ * This duplicates the check in check_ptrace_options() because
+ * ptrace_attach() and ptrace_setoptions() have historically
+ * used different error codes for unknown ptrace options.
+ */
if (flags & ~(unsigned long)PTRACE_O_MASK)
goto out;
+ retval = check_ptrace_options(flags);
+ if (retval)
+ return retval;
flags = PT_PTRACED | PT_SEIZED | (flags << PT_OPT_FLAG_SHIFT);
} else {
flags = PT_PTRACED;
@@ -655,22 +683,11 @@ int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long ds
static int ptrace_setoptions(struct task_struct *child, unsigned long data)
{
unsigned flags;
+ int ret;
- if (data & ~(unsigned long)PTRACE_O_MASK)
- return -EINVAL;
-
- if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
- if (!IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) ||
- !IS_ENABLED(CONFIG_SECCOMP))
- return -EINVAL;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
- current->ptrace & PT_SUSPEND_SECCOMP)
- return -EPERM;
- }
+ ret = check_ptrace_options(data);
+ if (ret)
+ return ret;
/* Avoid intermediate state when all opts are cleared */
flags = child->ptrace;
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 11ae2747701b..5797cf2909b0 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1358,10 +1358,11 @@ static void __maybe_unused rcu_advance_cbs_nowake(struct rcu_node *rnp,
struct rcu_data *rdp)
{
rcu_lockdep_assert_cblist_protected(rdp);
- if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) ||
- !raw_spin_trylock_rcu_node(rnp))
+ if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || !raw_spin_trylock_rcu_node(rnp))
return;
- WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
+ // The grace period cannot end while we hold the rcu_node lock.
+ if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))
+ WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
raw_spin_unlock_rcu_node(rnp);
}
@@ -1602,7 +1603,7 @@ static void rcu_gp_fqs(bool first_time)
struct rcu_node *rnp = rcu_get_root();
WRITE_ONCE(rcu_state.gp_activity, jiffies);
- rcu_state.n_force_qs++;
+ WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);
if (first_time) {
/* Collect dyntick-idle snapshots. */
force_qs_rnp(dyntick_save_progress_counter);
@@ -2207,7 +2208,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
/* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
if (count == 0 && rdp->qlen_last_fqs_check != 0) {
rdp->qlen_last_fqs_check = 0;
- rdp->n_force_qs_snap = rcu_state.n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
} else if (count < rdp->qlen_last_fqs_check - qhimark)
rdp->qlen_last_fqs_check = count;
@@ -2535,10 +2536,10 @@ static void __call_rcu_core(struct rcu_data *rdp, struct rcu_head *head,
} else {
/* Give the grace period a kick. */
rdp->blimit = DEFAULT_MAX_RCU_BLIMIT;
- if (rcu_state.n_force_qs == rdp->n_force_qs_snap &&
+ if (READ_ONCE(rcu_state.n_force_qs) == rdp->n_force_qs_snap &&
rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
rcu_force_quiescent_state();
- rdp->n_force_qs_snap = rcu_state.n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
}
}
@@ -3029,7 +3030,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
/* Set up local state, ensuring consistent view of global state. */
raw_spin_lock_irqsave_rcu_node(rnp, flags);
rdp->qlen_last_fqs_check = 0;
- rdp->n_force_qs_snap = rcu_state.n_force_qs;
+ rdp->n_force_qs_snap = READ_ONCE(rcu_state.n_force_qs);
rdp->blimit = blimit;
if (rcu_segcblist_empty(&rdp->cblist) && /* No early-boot CBs? */
!rcu_segcblist_is_offloaded(&rdp->cblist))
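The n_force_qs changes above are the usual treatment for a counter with a
single updater (the grace-period kthread) but lockless readers: mark every
access with WRITE_ONCE()/READ_ONCE() so the compiler can neither tear nor
refetch the value and KCSAN knows the race is intentional. In isolation:

    /* writer (single updater, the grace-period kthread): */
    WRITE_ONCE(rcu_state.n_force_qs, rcu_state.n_force_qs + 1);

    /* lockless reader: take one snapshot and reuse it consistently */
    unsigned long snap = READ_ONCE(rcu_state.n_force_qs);

    if (snap == rdp->n_force_qs_snap)
            rcu_force_quiescent_state();
    rdp->n_force_qs_snap = snap;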
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 4c4d7683a4e5..173e3ce60790 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -382,6 +382,7 @@ retry_ipi:
continue;
}
if (get_cpu() == cpu) {
+ mask_ofl_test |= mask;
put_cpu();
continue;
}
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index a71a4a272515..2c127d438fe0 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -523,16 +523,17 @@ rcu_preempt_deferred_qs_irqrestore(struct task_struct *t, unsigned long flags)
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
}
- /* Unboost if we were boosted. */
- if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
- rt_mutex_futex_unlock(&rnp->boost_mtx);
-
/*
* If this was the last task on the expedited lists,
* then we need to report up the rcu_node hierarchy.
*/
if (!empty_exp && empty_exp_now)
rcu_report_exp_rnp(rnp, true);
+
+ /* Unboost if we were boosted. */
+ if (IS_ENABLED(CONFIG_RCU_BOOST) && drop_boost_mutex)
+ rt_mutex_futex_unlock(&rnp->boost_mtx);
+
} else {
local_irq_restore(flags);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5dc43d37e6a2..5befdecefe94 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1337,7 +1337,7 @@ static void __init init_uclamp_rq(struct rq *rq)
};
}
- rq->uclamp_flags = 0;
+ rq->uclamp_flags = UCLAMP_FLAG_IDLE;
}
static void __init init_uclamp(void)
@@ -2482,6 +2482,9 @@ out:
bool cpus_share_cache(int this_cpu, int that_cpu)
{
+ if (this_cpu == that_cpu)
+ return true;
+
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
#endif /* CONFIG_SMP */
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 46ed4e1383e2..66188567778d 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -147,10 +147,10 @@ void account_guest_time(struct task_struct *p, u64 cputime)
/* Add guest time to cpustat. */
if (task_nice(p) > 0) {
- cpustat[CPUTIME_NICE] += cputime;
+ task_group_account_field(p, CPUTIME_NICE, cputime);
cpustat[CPUTIME_GUEST_NICE] += cputime;
} else {
- cpustat[CPUTIME_USER] += cputime;
+ task_group_account_field(p, CPUTIME_USER, cputime);
cpustat[CPUTIME_GUEST] += cputime;
}
}
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index faada713cfae..d5f7fc7099bc 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -847,25 +847,15 @@ void print_numa_stats(struct seq_file *m, int node, unsigned long tsf,
static void sched_show_numa(struct task_struct *p, struct seq_file *m)
{
#ifdef CONFIG_NUMA_BALANCING
- struct mempolicy *pol;
-
if (p->mm)
P(mm->numa_scan_seq);
- task_lock(p);
- pol = p->mempolicy;
- if (pol && !(pol->flags & MPOL_F_MORON))
- pol = NULL;
- mpol_get(pol);
- task_unlock(p);
-
P(numa_pages_migrated);
P(numa_preferred_nid);
P(total_numa_faults);
SEQ_printf(m, "current_node=%d, numa_group_id=%d\n",
task_node(p), task_numa_group_id(p));
show_numa_stats(p, m);
- mpol_put(pol);
#endif
}
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index 9154e745f097..9dd83eb74a9d 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1046,7 +1046,6 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
t->event = 0;
t->last_event_time = 0;
init_waitqueue_head(&t->event_wait);
- kref_init(&t->refcount);
mutex_lock(&group->trigger_lock);
@@ -1079,15 +1078,19 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
return t;
}
-static void psi_trigger_destroy(struct kref *ref)
+void psi_trigger_destroy(struct psi_trigger *t)
{
- struct psi_trigger *t = container_of(ref, struct psi_trigger, refcount);
- struct psi_group *group = t->group;
+ struct psi_group *group;
struct kthread_worker *kworker_to_destroy = NULL;
- if (static_branch_likely(&psi_disabled))
+ /*
+ * We do not check psi_disabled since it might have been disabled after
+ * the trigger got created.
+ */
+ if (!t)
return;
+ group = t->group;
/*
* Wakeup waiters to stop polling. Can happen if cgroup is deleted
* from under a polling process.
@@ -1122,9 +1125,9 @@ static void psi_trigger_destroy(struct kref *ref)
mutex_unlock(&group->trigger_lock);
/*
- * Wait for both *trigger_ptr from psi_trigger_replace and
- * poll_kworker RCUs to complete their read-side critical sections
- * before destroying the trigger and optionally the poll_kworker
+ * Wait for psi_schedule_poll_work RCU to complete its read-side
+ * critical section before destroying the trigger and optionally the
+ * poll_task.
*/
synchronize_rcu();
/*
@@ -1146,18 +1149,6 @@ static void psi_trigger_destroy(struct kref *ref)
kfree(t);
}
-void psi_trigger_replace(void **trigger_ptr, struct psi_trigger *new)
-{
- struct psi_trigger *old = *trigger_ptr;
-
- if (static_branch_likely(&psi_disabled))
- return;
-
- rcu_assign_pointer(*trigger_ptr, new);
- if (old)
- kref_put(&old->refcount, psi_trigger_destroy);
-}
-
__poll_t psi_trigger_poll(void **trigger_ptr,
struct file *file, poll_table *wait)
{
@@ -1167,24 +1158,15 @@ __poll_t psi_trigger_poll(void **trigger_ptr,
if (static_branch_likely(&psi_disabled))
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- rcu_read_lock();
-
- t = rcu_dereference(*(void __rcu __force **)trigger_ptr);
- if (!t) {
- rcu_read_unlock();
+ t = smp_load_acquire(trigger_ptr);
+ if (!t)
return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI;
- }
- kref_get(&t->refcount);
-
- rcu_read_unlock();
poll_wait(file, &t->event_wait, wait);
if (cmpxchg(&t->event, 1, 0) == 1)
ret |= EPOLLPRI;
- kref_put(&t->refcount, psi_trigger_destroy);
-
return ret;
}
@@ -1208,14 +1190,24 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf,
buf[buf_size - 1] = '\0';
- new = psi_trigger_create(&psi_system, buf, nbytes, res);
- if (IS_ERR(new))
- return PTR_ERR(new);
-
seq = file->private_data;
+
/* Take seq->lock to protect seq->private from concurrent writes */
mutex_lock(&seq->lock);
- psi_trigger_replace(&seq->private, new);
+
+ /* Allow only one trigger per file descriptor */
+ if (seq->private) {
+ mutex_unlock(&seq->lock);
+ return -EBUSY;
+ }
+
+ new = psi_trigger_create(&psi_system, buf, nbytes, res);
+ if (IS_ERR(new)) {
+ mutex_unlock(&seq->lock);
+ return PTR_ERR(new);
+ }
+
+ smp_store_release(&seq->private, new);
mutex_unlock(&seq->lock);
return nbytes;
@@ -1250,7 +1242,7 @@ static int psi_fop_release(struct inode *inode, struct file *file)
{
struct seq_file *seq = file->private_data;
- psi_trigger_replace(&seq->private, NULL);
+ psi_trigger_destroy(seq->private);
return single_release(inode, file);
}
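
The psi rework above drops kref+RCU lifetime management for a simpler contract: at most one trigger per file descriptor, published with smp_store_release() and read with smp_load_acquire(), and freed only from the release path. A minimal userspace sketch of that publish/consume pairing, using C11 atomics and illustrative names (not the kernel code):

#include <stdatomic.h>
#include <stdlib.h>

struct trigger { int threshold; };

static _Atomic(struct trigger *) slot;   /* stands in for seq->private */

int publish_trigger(int threshold)
{
	struct trigger *expected = NULL;
	struct trigger *t = malloc(sizeof(*t));

	if (!t)
		return -1;
	t->threshold = threshold;            /* fully initialize first */
	/* Release store: readers that see the pointer see the init too.
	 * The CAS also enforces "one trigger per slot" (-EBUSY analogue). */
	if (!atomic_compare_exchange_strong_explicit(&slot, &expected, t,
						     memory_order_release,
						     memory_order_relaxed)) {
		free(t);
		return -1;
	}
	return 0;
}

int read_threshold(void)
{
	/* Acquire load pairs with the release store above. */
	struct trigger *t = atomic_load_explicit(&slot, memory_order_acquire);

	return t ? t->threshold : -1;
}

The kernel version additionally serializes writers with seq->lock; the CAS here plays that role in a lock-free way.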
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 2dffb8762e16..28c82dee13ea 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -52,11 +52,8 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
rt_b->rt_period_timer.function = sched_rt_period_timer;
}
-static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
{
- if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
- return;
-
raw_spin_lock(&rt_b->rt_runtime_lock);
if (!rt_b->rt_period_active) {
rt_b->rt_period_active = 1;
@@ -75,6 +72,14 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
+static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
+{
+ if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
+ return;
+
+ do_start_rt_bandwidth(rt_b);
+}
+
void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
@@ -983,13 +988,17 @@ static void update_curr_rt(struct rq *rq)
for_each_sched_rt_entity(rt_se) {
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ int exceeded;
if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
raw_spin_lock(&rt_rq->rt_runtime_lock);
rt_rq->rt_time += delta_exec;
- if (sched_rt_runtime_exceeded(rt_rq))
+ exceeded = sched_rt_runtime_exceeded(rt_rq);
+ if (exceeded)
resched_curr(rq);
raw_spin_unlock(&rt_rq->rt_runtime_lock);
+ if (exceeded)
+ do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
}
}
}
@@ -2659,8 +2668,12 @@ static int sched_rt_global_validate(void)
static void sched_rt_do_global(void)
{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
def_rt_bandwidth.rt_runtime = global_rt_runtime();
def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
+ raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
}
int sched_rt_handler(struct ctl_table *table, int write,
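
In the hunk above, update_curr_rt() records the sched_rt_runtime_exceeded() verdict while rt_runtime_lock is held but defers do_start_rt_bandwidth() until after the unlock, presumably to avoid nesting the per-rq and bandwidth runtime locks. A hedged sketch of that decide-under-the-lock, act-after-it shape, with hypothetical names:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t budget_lock = PTHREAD_MUTEX_INITIALIZER;
static long budget_used, budget_limit = 100;

static void restart_period_timer(void)
{
	pthread_mutex_lock(&budget_lock);   /* would deadlock if still held */
	/* ... re-arm the replenishment timer ... */
	pthread_mutex_unlock(&budget_lock);
}

void charge(long delta)
{
	bool exceeded;

	pthread_mutex_lock(&budget_lock);
	budget_used += delta;
	exceeded = budget_used > budget_limit;  /* decide under the lock */
	pthread_mutex_unlock(&budget_lock);

	if (exceeded)                           /* act after dropping it */
		restart_period_timer();
}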
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index ffaa97a8d405..e5ebaffc4fef 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1552,66 +1552,58 @@ static void init_numa_topology_type(void)
}
}
+
+#define NR_DISTANCE_VALUES (1 << DISTANCE_BITS)
+
void sched_init_numa(void)
{
- int next_distance, curr_distance = node_distance(0, 0);
struct sched_domain_topology_level *tl;
- int level = 0;
- int i, j, k;
-
- sched_domains_numa_distance = kzalloc(sizeof(int) * (nr_node_ids + 1), GFP_KERNEL);
- if (!sched_domains_numa_distance)
- return;
-
- /* Includes NUMA identity node at level 0. */
- sched_domains_numa_distance[level++] = curr_distance;
- sched_domains_numa_levels = level;
+ unsigned long *distance_map;
+ int nr_levels = 0;
+ int i, j;
/*
* O(nr_nodes^2) deduplicating selection sort -- in order to find the
* unique distances in the node_distance() table.
- *
- * Assumes node_distance(0,j) includes all distances in
- * node_distance(i,j) in order to avoid cubic time.
*/
- next_distance = curr_distance;
+ distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL);
+ if (!distance_map)
+ return;
+
+ bitmap_zero(distance_map, NR_DISTANCE_VALUES);
for (i = 0; i < nr_node_ids; i++) {
for (j = 0; j < nr_node_ids; j++) {
- for (k = 0; k < nr_node_ids; k++) {
- int distance = node_distance(i, k);
-
- if (distance > curr_distance &&
- (distance < next_distance ||
- next_distance == curr_distance))
- next_distance = distance;
-
- /*
- * While not a strong assumption it would be nice to know
- * about cases where if node A is connected to B, B is not
- * equally connected to A.
- */
- if (sched_debug() && node_distance(k, i) != distance)
- sched_numa_warn("Node-distance not symmetric");
+ int distance = node_distance(i, j);
- if (sched_debug() && i && !find_numa_distance(distance))
- sched_numa_warn("Node-0 not representative");
+ if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) {
+ sched_numa_warn("Invalid distance value range");
+ return;
}
- if (next_distance != curr_distance) {
- sched_domains_numa_distance[level++] = next_distance;
- sched_domains_numa_levels = level;
- curr_distance = next_distance;
- } else break;
+
+ bitmap_set(distance_map, distance, 1);
}
+ }
+ /*
+ * We can now figure out how many unique distance values there are and
+ * allocate memory accordingly.
+ */
+ nr_levels = bitmap_weight(distance_map, NR_DISTANCE_VALUES);
- /*
- * In case of sched_debug() we verify the above assumption.
- */
- if (!sched_debug())
- break;
+ sched_domains_numa_distance = kcalloc(nr_levels, sizeof(int), GFP_KERNEL);
+ if (!sched_domains_numa_distance) {
+ bitmap_free(distance_map);
+ return;
+ }
+
+ for (i = 0, j = 0; i < nr_levels; i++, j++) {
+ j = find_next_bit(distance_map, NR_DISTANCE_VALUES, j);
+ sched_domains_numa_distance[i] = j;
}
+ bitmap_free(distance_map);
+
/*
- * 'level' contains the number of unique distances
+ * 'nr_levels' contains the number of unique distances
*
* The sched_domains_numa_distance[] array includes the actual distance
* numbers.
@@ -1620,15 +1612,15 @@ void sched_init_numa(void)
/*
* Here, we should temporarily reset sched_domains_numa_levels to 0.
* If it fails to allocate memory for array sched_domains_numa_masks[][],
- * the array will contain less then 'level' members. This could be
+	 * the array will contain less than 'nr_levels' members. This could be
* dangerous when we use it to iterate array sched_domains_numa_masks[][]
* in other functions.
*
- * We reset it to 'level' at the end of this function.
+ * We reset it to 'nr_levels' at the end of this function.
*/
sched_domains_numa_levels = 0;
- sched_domains_numa_masks = kzalloc(sizeof(void *) * level, GFP_KERNEL);
+ sched_domains_numa_masks = kzalloc(sizeof(void *) * nr_levels, GFP_KERNEL);
if (!sched_domains_numa_masks)
return;
@@ -1636,7 +1628,7 @@ void sched_init_numa(void)
* Now for each level, construct a mask per node which contains all
* CPUs of nodes that are that many hops away from us.
*/
- for (i = 0; i < level; i++) {
+ for (i = 0; i < nr_levels; i++) {
sched_domains_numa_masks[i] =
kzalloc(nr_node_ids * sizeof(void *), GFP_KERNEL);
if (!sched_domains_numa_masks[i])
@@ -1644,12 +1636,17 @@ void sched_init_numa(void)
for (j = 0; j < nr_node_ids; j++) {
struct cpumask *mask = kzalloc(cpumask_size(), GFP_KERNEL);
+ int k;
+
if (!mask)
return;
sched_domains_numa_masks[i][j] = mask;
for_each_node(k) {
+ if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
+ sched_numa_warn("Node-distance not symmetric");
+
if (node_distance(j, k) > sched_domains_numa_distance[i])
continue;
@@ -1661,7 +1658,7 @@ void sched_init_numa(void)
/* Compute default topology size */
for (i = 0; sched_domain_topology[i].mask; i++);
- tl = kzalloc((i + level + 1) *
+ tl = kzalloc((i + nr_levels + 1) *
sizeof(struct sched_domain_topology_level), GFP_KERNEL);
if (!tl)
return;
@@ -1684,7 +1681,7 @@ void sched_init_numa(void)
/*
* .. and append 'j' levels of NUMA goodness.
*/
- for (j = 1; j < level; i++, j++) {
+ for (j = 1; j < nr_levels; i++, j++) {
tl[i] = (struct sched_domain_topology_level){
.mask = sd_numa_mask,
.sd_flags = cpu_numa_flags,
@@ -1696,8 +1693,8 @@ void sched_init_numa(void)
sched_domain_topology = tl;
- sched_domains_numa_levels = level;
- sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+ sched_domains_numa_levels = nr_levels;
+ sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
init_numa_topology_type();
}
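
The rewritten sched_init_numa() replaces the node-0-centric selection sort with a bitmap of observed distances: each distance sets one bit, bitmap_weight() gives the number of unique levels, and walking the set bits yields them already sorted. A small self-contained C sketch of the same deduplication technique (a byte array stands in for the packed bitmap):

#include <stdio.h>

#define NR_VALUES 256   /* distances fit in 8 bits, as in the patch */

int main(void)
{
	unsigned char seen[NR_VALUES] = { 0 };
	int table[] = { 10, 20, 10, 30, 20, 10 };  /* node_distance() stand-in */
	int i, nr_levels = 0;

	for (i = 0; i < (int)(sizeof(table) / sizeof(table[0])); i++)
		seen[table[i]] = 1;                /* bitmap_set() analogue */

	for (i = 0; i < NR_VALUES; i++)            /* find_next_bit() walk */
		if (seen[i])
			printf("level %d: distance %d\n", nr_levels++, i);

	return 0;
}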
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 84bd05117dc2..7d668b31dbc6 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -206,6 +206,13 @@ void __wake_up_sync(struct wait_queue_head *wq_head, unsigned int mode, int nr_e
}
EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
+void __wake_up_pollfree(struct wait_queue_head *wq_head)
+{
+ __wake_up(wq_head, TASK_NORMAL, 0, poll_to_key(EPOLLHUP | POLLFREE));
+ /* POLLFREE must have cleared the queue. */
+ WARN_ON_ONCE(waitqueue_active(wq_head));
+}
+
/*
* Note: we use "set_current_state()" _after_ the wait-queue add,
* because we need a memory barrier there on SMP, so that any
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index 1d62fa2b6b91..b2e1981663b8 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -28,6 +28,9 @@
#include <linux/syscalls.h>
#include <linux/sysctl.h>
+/* Not exposed in headers: strictly internal use only. */
+#define SECCOMP_MODE_DEAD (SECCOMP_MODE_FILTER + 1)
+
#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
#include <asm/syscall.h>
#endif
@@ -706,6 +709,7 @@ static void __secure_computing_strict(int this_syscall)
#ifdef SECCOMP_DEBUG
dump_stack();
#endif
+ current->seccomp.mode = SECCOMP_MODE_DEAD;
seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
do_exit(SIGKILL);
}
@@ -892,6 +896,7 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
case SECCOMP_RET_KILL_THREAD:
case SECCOMP_RET_KILL_PROCESS:
default:
+ current->seccomp.mode = SECCOMP_MODE_DEAD;
seccomp_log(this_syscall, SIGSYS, action, true);
/* Dump core only if this is the last remaining thread. */
if (action == SECCOMP_RET_KILL_PROCESS ||
@@ -944,6 +949,11 @@ int __secure_computing(const struct seccomp_data *sd)
return 0;
case SECCOMP_MODE_FILTER:
return __seccomp_filter(this_syscall, sd, false);
+ /* Surviving SECCOMP_RET_KILL_* must be proactively impossible. */
+ case SECCOMP_MODE_DEAD:
+ WARN_ON_ONCE(1);
+ do_exit(SIGKILL);
+ return -1;
default:
BUG();
}
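
SECCOMP_MODE_DEAD acts as a sentinel state: it is set immediately before delivering the fatal signal, so a task that somehow survives and re-enters __secure_computing() trips a warning and is killed again. A generic sketch of the dead-state pattern (an illustrative state machine, not the seccomp code):

#include <assert.h>
#include <stdlib.h>

enum mode { MODE_DISABLED, MODE_STRICT, MODE_FILTER, MODE_DEAD };

int enter_syscall(enum mode *m)
{
	switch (*m) {
	case MODE_DISABLED:
		return 0;                 /* nothing to enforce */
	case MODE_STRICT:
	case MODE_FILTER:
		/* ... policy check; on a kill verdict: */
		*m = MODE_DEAD;           /* mark dead before terminating */
		exit(1);
	case MODE_DEAD:
		/* Like WARN_ON_ONCE() + do_exit(): loud, then fatal even
		 * with assertions compiled out. */
		assert(0);
		exit(1);
	}
	abort();                          /* unknown mode: hard failure, like BUG() */
}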
diff --git a/kernel/smp.c b/kernel/smp.c
index 3a390932f8b2..be65b76cb803 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -222,7 +222,7 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
/* There shouldn't be any pending callbacks on an offline CPU. */
if (unlikely(warn_cpu_offline && !cpu_online(smp_processor_id()) &&
- !warned && !llist_empty(head))) {
+ !warned && entry != NULL)) {
warned = true;
WARN(1, "IPI on offline CPU %d\n", smp_processor_id());
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index eae6a078619f..0457d36540e3 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -250,6 +250,36 @@ static int sysrq_sysctl_handler(struct ctl_table *table, int write,
#endif
+#ifdef CONFIG_BPF_SYSCALL
+
+void __weak unpriv_ebpf_notify(int new_state)
+{
+}
+
+static int bpf_unpriv_handler(struct ctl_table *table, int write,
+ void *buffer, size_t *lenp, loff_t *ppos)
+{
+ int ret, unpriv_enable = *(int *)table->data;
+ bool locked_state = unpriv_enable == 1;
+ struct ctl_table tmp = *table;
+
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ tmp.data = &unpriv_enable;
+ ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret) {
+ if (locked_state && unpriv_enable != 1)
+ return -EPERM;
+ *(int *)table->data = unpriv_enable;
+ }
+
+ unpriv_ebpf_notify(unpriv_enable);
+
+ return ret;
+}
+#endif
+
static struct ctl_table kern_table[];
static struct ctl_table vm_table[];
static struct ctl_table fs_table[];
@@ -1255,10 +1285,9 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_unprivileged_bpf_disabled,
.maxlen = sizeof(sysctl_unprivileged_bpf_disabled),
.mode = 0644,
- /* only handle a transition from default "0" to "1" */
- .proc_handler = proc_dointvec_minmax,
- .extra1 = SYSCTL_ONE,
- .extra2 = SYSCTL_ONE,
+ .proc_handler = bpf_unpriv_handler,
+ .extra1 = SYSCTL_ZERO,
+ .extra2 = &two,
},
{
.procname = "bpf_stats_enabled",
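
bpf_unpriv_handler() above parses into a local copy via proc_dointvec_minmax() and only commits the result when the transition is legal: once the value is 1 ("disabled and locked"), it may never change again. A userspace sketch of the same validate-then-commit shape, with a hypothetical setter in place of the sysctl machinery:

static int unpriv_disabled;   /* 0 = enabled, 1 = locked off, 2 = off */

int set_unpriv_disabled(int new_val)
{
	int cur = unpriv_disabled;

	if (new_val < 0 || new_val > 2)
		return -1;                /* mirrors the extra1/extra2 bounds */
	if (cur == 1 && new_val != 1)
		return -1;                /* the locked state is one-way */
	unpriv_disabled = new_val;        /* commit only after validation */
	return 0;
}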
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 4419486d7413..5eb04bb59802 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -131,7 +131,7 @@ static void tick_sched_do_timer(struct tick_sched *ts, ktime_t now)
*/
if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE)) {
#ifdef CONFIG_NO_HZ_FULL
- WARN_ON(tick_nohz_full_running);
+ WARN_ON_ONCE(tick_nohz_full_running);
#endif
tick_do_timer_cpu = cpu;
}
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index 4fc2af4367a7..36ed8bad3909 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -1236,8 +1236,7 @@ int do_settimeofday64(const struct timespec64 *ts)
timekeeping_forward_now(tk);
xt = tk_xtime(tk);
- ts_delta.tv_sec = ts->tv_sec - xt.tv_sec;
- ts_delta.tv_nsec = ts->tv_nsec - xt.tv_nsec;
+ ts_delta = timespec64_sub(*ts, xt);
if (timespec64_compare(&tk->wall_to_monotonic, &ts_delta) > 0) {
ret = -EINVAL;
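
The open-coded subtraction above could hand timespec64_compare() an unnormalized value (a negative tv_nsec); timespec64_sub() renormalizes by borrowing from tv_sec. A self-contained sketch of that normalization, assuming both inputs are themselves normalized:

#include <time.h>

#define NSEC_PER_SEC 1000000000L

struct timespec ts_sub(struct timespec a, struct timespec b)
{
	struct timespec d = {
		.tv_sec  = a.tv_sec - b.tv_sec,
		.tv_nsec = a.tv_nsec - b.tv_nsec,
	};

	if (d.tv_nsec < 0) {          /* borrow one second */
		d.tv_sec -= 1;
		d.tv_nsec += NSEC_PER_SEC;
	}
	return d;
}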
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index efc338969b98..7719d444bda1 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2520,14 +2520,14 @@ struct dyn_ftrace *ftrace_rec_iter_record(struct ftrace_rec_iter *iter)
}
static int
-ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
+ftrace_nop_initialize(struct module *mod, struct dyn_ftrace *rec)
{
int ret;
if (unlikely(ftrace_disabled))
return 0;
- ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
+ ret = ftrace_init_nop(mod, rec);
if (ret) {
ftrace_bug_type = FTRACE_BUG_INIT;
ftrace_bug(ret, rec);
@@ -2969,7 +2969,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
* to the NOP instructions.
*/
if (!__is_defined(CC_USING_NOP_MCOUNT) &&
- !ftrace_code_disable(mod, p))
+ !ftrace_nop_initialize(mod, p))
break;
update_cnt++;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5240ba9a82db..56619766e910 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -219,7 +219,7 @@ static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
static int __init set_trace_boot_options(char *str)
{
strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
- return 0;
+ return 1;
}
__setup("trace_options=", set_trace_boot_options);
@@ -230,12 +230,16 @@ static int __init set_trace_boot_clock(char *str)
{
strlcpy(trace_boot_clock_buf, str, MAX_TRACER_SIZE);
trace_boot_clock = trace_boot_clock_buf;
- return 0;
+ return 1;
}
__setup("trace_clock=", set_trace_boot_clock);
static int __init set_tracepoint_printk(char *str)
{
+ /* Ignore the "tp_printk_stop_on_boot" param */
+ if (*str == '_')
+ return 0;
+
if ((strcmp(str, "=0") != 0 && strcmp(str, "=off") != 0))
tracepoint_printk = 1;
return 1;
@@ -1301,10 +1305,12 @@ static int __init set_buf_size(char *str)
if (!str)
return 0;
buf_size = memparse(str, &str);
- /* nr_entries can not be zero */
- if (buf_size == 0)
- return 0;
- trace_buf_size = buf_size;
+ /*
+	 * nr_entries cannot be zero and the startup
+ * tests require some buffer space. Therefore
+ * ensure we have at least 4096 bytes of buffer.
+ */
+ trace_buf_size = max(4096UL, buf_size);
return 1;
}
__setup("trace_buf_size=", set_buf_size);
@@ -3007,7 +3013,7 @@ struct trace_buffer_struct {
char buffer[4][TRACE_BUF_SIZE];
};
-static struct trace_buffer_struct *trace_percpu_buffer;
+static struct trace_buffer_struct __percpu *trace_percpu_buffer;
/*
 * This allows for lockless recording. If we're nested too deeply, then
@@ -3017,7 +3023,7 @@ static char *get_trace_buf(void)
{
struct trace_buffer_struct *buffer = this_cpu_ptr(trace_percpu_buffer);
- if (!buffer || buffer->nesting >= 4)
+ if (!trace_percpu_buffer || buffer->nesting >= 4)
return NULL;
buffer->nesting++;
@@ -3036,7 +3042,7 @@ static void put_trace_buf(void)
static int alloc_percpu_trace_buffer(void)
{
- struct trace_buffer_struct *buffers;
+ struct trace_buffer_struct __percpu *buffers;
buffers = alloc_percpu(struct trace_buffer_struct);
if (WARN(!buffers, "Could not allocate percpu trace_printk buffer"))
@@ -6994,7 +7000,8 @@ static struct tracing_log_err *get_tracing_log_err(struct trace_array *tr)
err = kzalloc(sizeof(*err), GFP_KERNEL);
if (!err)
err = ERR_PTR(-ENOMEM);
- tr->n_err_log_entries++;
+ else
+ tr->n_err_log_entries++;
return err;
}
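
The `return 1` changes above matter because __setup() handlers signal with their return value: 1 means the parameter was consumed, while 0 tells the kernel the parameter is unrecognized, and unknown `name=value` strings then leak into init's environment. A kernel-style sketch of the fixed shape, with a hypothetical parameter name (assumes the usual linux/init.h context):

#include <linux/init.h>

static int __init set_example_param(char *str)
{
	if (!str)
		return 0;       /* malformed: let the kernel keep it */
	/* ... parse and record str ... */
	return 1;               /* consumed: not passed to init's environment */
}
__setup("example_param=", set_example_param);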
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 35e9a01b5480..1d514a1a3155 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1423,14 +1423,26 @@ __event_trigger_test_discard(struct trace_event_file *file,
if (eflags & EVENT_FILE_FL_TRIGGER_COND)
*tt = event_triggers_call(file, entry, event);
- if (test_bit(EVENT_FILE_FL_SOFT_DISABLED_BIT, &file->flags) ||
- (unlikely(file->flags & EVENT_FILE_FL_FILTERED) &&
- !filter_match_preds(file->filter, entry))) {
- __trace_event_discard_commit(buffer, event);
- return true;
- }
+ if (likely(!(file->flags & (EVENT_FILE_FL_SOFT_DISABLED |
+ EVENT_FILE_FL_FILTERED |
+ EVENT_FILE_FL_PID_FILTER))))
+ return false;
+
+ if (file->flags & EVENT_FILE_FL_SOFT_DISABLED)
+ goto discard;
+
+ if (file->flags & EVENT_FILE_FL_FILTERED &&
+ !filter_match_preds(file->filter, entry))
+ goto discard;
+
+ if ((file->flags & EVENT_FILE_FL_PID_FILTER) &&
+ trace_event_ignore_this_pid(file))
+ goto discard;
return false;
+ discard:
+ __trace_event_discard_commit(buffer, event);
+ return true;
}
/**
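
The rewritten discard test front-loads a single combined-mask check so the common case, no soft-disable and no filters, costs one predictable branch; only then are the individual bits examined. A generic sketch of that fast-path-first flag test:

#include <stdbool.h>

#define FL_SOFT_DISABLED (1u << 0)
#define FL_FILTERED      (1u << 1)
#define FL_PID_FILTER    (1u << 2)

bool should_discard(unsigned int flags, bool filter_ok, bool pid_ignored)
{
	/* Fast path: none of the slow-path bits set. */
	if (!(flags & (FL_SOFT_DISABLED | FL_FILTERED | FL_PID_FILTER)))
		return false;

	if (flags & FL_SOFT_DISABLED)
		return true;
	if ((flags & FL_FILTERED) && !filter_ok)
		return true;
	if ((flags & FL_PID_FILTER) && pid_ignored)
		return true;
	return false;
}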
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index e31ee325dad1..4acc77e049e5 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -2247,12 +2247,19 @@ static struct trace_event_file *
trace_create_new_event(struct trace_event_call *call,
struct trace_array *tr)
{
+ struct trace_pid_list *pid_list;
struct trace_event_file *file;
file = kmem_cache_alloc(file_cachep, GFP_TRACE);
if (!file)
return NULL;
+ pid_list = rcu_dereference_protected(tr->filtered_pids,
+ lockdep_is_held(&event_mutex));
+
+ if (pid_list)
+ file->flags |= EVENT_FILE_FL_PID_FILTER;
+
file->event_call = call;
file->tr = tr;
atomic_set(&file->sm_ref, 0);
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index f63766366e23..413da11260f8 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -149,6 +149,8 @@ struct hist_field {
*/
unsigned int var_ref_idx;
bool read_once;
+
+ unsigned int var_str_idx;
};
static u64 hist_field_none(struct hist_field *field,
@@ -351,6 +353,7 @@ struct hist_trigger_data {
unsigned int n_keys;
unsigned int n_fields;
unsigned int n_vars;
+ unsigned int n_var_str;
unsigned int key_size;
struct tracing_map_sort_key sort_keys[TRACING_MAP_SORT_KEYS_MAX];
unsigned int n_sort_keys;
@@ -2305,7 +2308,12 @@ static int hist_trigger_elt_data_alloc(struct tracing_map_elt *elt)
}
}
- n_str = hist_data->n_field_var_str + hist_data->n_save_var_str;
+ n_str = hist_data->n_field_var_str + hist_data->n_save_var_str +
+ hist_data->n_var_str;
+ if (n_str > SYNTH_FIELDS_MAX) {
+ hist_elt_data_free(elt_data);
+ return -EINVAL;
+ }
size = STR_VAR_LEN_MAX;
@@ -2582,9 +2590,10 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data,
if (!hist_field->type)
goto free;
- if (field->filter_type == FILTER_STATIC_STRING)
+ if (field->filter_type == FILTER_STATIC_STRING) {
hist_field->fn = hist_field_string;
- else if (field->filter_type == FILTER_DYN_STRING)
+ hist_field->size = field->size;
+ } else if (field->filter_type == FILTER_DYN_STRING)
hist_field->fn = hist_field_dynstring;
else
hist_field->fn = hist_field_pstring;
@@ -2882,9 +2891,9 @@ parse_field(struct hist_trigger_data *hist_data, struct trace_event_file *file,
/*
* For backward compatibility, if field_name
* was "cpu", then we treat this the same as
- * common_cpu.
+ * common_cpu. This also works for "CPU".
*/
- if (strcmp(field_name, "cpu") == 0) {
+ if (field && field->filter_type == FILTER_CPU) {
*flags |= HIST_FIELD_FL_CPU;
} else {
hist_err(tr, HIST_ERR_FIELD_NOT_FOUND,
@@ -3522,7 +3531,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt,
char *str = elt_data->field_var_str[j++];
char *val_str = (char *)(uintptr_t)var_val;
- strscpy(str, val_str, STR_VAR_LEN_MAX);
+ strscpy(str, val_str, val->size);
var_val = (u64)(uintptr_t)str;
}
tracing_map_set_var(elt, var_idx, var_val);
@@ -4389,6 +4398,7 @@ static int trace_action_create(struct hist_trigger_data *hist_data,
var_ref_idx = find_var_ref_idx(hist_data, var_ref);
if (WARN_ON(var_ref_idx < 0)) {
+ kfree(p);
ret = var_ref_idx;
goto err;
}
@@ -4599,6 +4609,7 @@ static int create_var_field(struct hist_trigger_data *hist_data,
{
struct trace_array *tr = hist_data->event_file->tr;
unsigned long flags = 0;
+ int ret;
if (WARN_ON(val_idx >= TRACING_MAP_VALS_MAX + TRACING_MAP_VARS_MAX))
return -EINVAL;
@@ -4613,7 +4624,12 @@ static int create_var_field(struct hist_trigger_data *hist_data,
if (WARN_ON(hist_data->n_vars > TRACING_MAP_VARS_MAX))
return -EINVAL;
- return __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
+ ret = __create_val_field(hist_data, val_idx, file, var_name, expr_str, flags);
+
+ if (hist_data->fields[val_idx]->flags & HIST_FIELD_FL_STRING)
+ hist_data->fields[val_idx]->var_str_idx = hist_data->n_var_str++;
+
+ return ret;
}
static int create_val_fields(struct hist_trigger_data *hist_data,
@@ -5231,7 +5247,7 @@ static int create_tracing_map_fields(struct hist_trigger_data *hist_data)
if (hist_field->flags & HIST_FIELD_FL_STACKTRACE)
cmp_fn = tracing_map_cmp_none;
- else if (!field)
+ else if (!field || hist_field->flags & HIST_FIELD_FL_CPU)
cmp_fn = tracing_map_cmp_num(hist_field->size,
hist_field->is_signed);
else if (is_string_field(field))
@@ -5333,6 +5349,22 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data,
hist_val = hist_field->fn(hist_field, elt, rbe, rec);
if (hist_field->flags & HIST_FIELD_FL_VAR) {
var_idx = hist_field->var.idx;
+
+ if (hist_field->flags & HIST_FIELD_FL_STRING) {
+ unsigned int str_start, var_str_idx, idx;
+ char *str, *val_str;
+
+ str_start = hist_data->n_field_var_str +
+ hist_data->n_save_var_str;
+ var_str_idx = hist_field->var_str_idx;
+ idx = str_start + var_str_idx;
+
+ str = elt_data->field_var_str[idx];
+ val_str = (char *)(uintptr_t)hist_val;
+ strscpy(str, val_str, hist_field->size);
+
+ hist_val = (u64)(uintptr_t)str;
+ }
tracing_map_set_var(elt, var_idx, hist_val);
continue;
}
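
The histogram changes copy string variables into fixed per-element slots using the field's own size rather than the slot maximum, relying on strscpy()'s truncate-and-terminate semantics. A portable lookalike of those semantics, for illustration (the kernel version returns -E2BIG on truncation):

#include <string.h>

/* Copies at most size-1 bytes, always NUL-terminates,
 * returns -1 on truncation and the copied length otherwise. */
long bounded_copy(char *dst, const char *src, size_t size)
{
	size_t len;

	if (!size)
		return -1;
	len = strnlen(src, size);
	if (len == size) {            /* no NUL within size bytes: truncate */
		len = size - 1;
		memcpy(dst, src, len);
		dst[len] = '\0';
		return -1;
	}
	memcpy(dst, src, len + 1);    /* fits, including the NUL */
	return (long)len;
}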
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
index e913d41a4194..82580f7ffad9 100644
--- a/kernel/trace/trace_events_trigger.c
+++ b/kernel/trace/trace_events_trigger.c
@@ -940,6 +940,16 @@ static void
traceon_trigger(struct event_trigger_data *data, void *rec,
struct ring_buffer_event *event)
{
+ struct trace_event_file *file = data->private_data;
+
+ if (file) {
+ if (tracer_tracing_is_on(file->tr))
+ return;
+
+ tracer_tracing_on(file->tr);
+ return;
+ }
+
if (tracing_is_on())
return;
@@ -950,8 +960,15 @@ static void
traceon_count_trigger(struct event_trigger_data *data, void *rec,
struct ring_buffer_event *event)
{
- if (tracing_is_on())
- return;
+ struct trace_event_file *file = data->private_data;
+
+ if (file) {
+ if (tracer_tracing_is_on(file->tr))
+ return;
+ } else {
+ if (tracing_is_on())
+ return;
+ }
if (!data->count)
return;
@@ -959,13 +976,26 @@ traceon_count_trigger(struct event_trigger_data *data, void *rec,
if (data->count != -1)
(data->count)--;
- tracing_on();
+ if (file)
+ tracer_tracing_on(file->tr);
+ else
+ tracing_on();
}
static void
traceoff_trigger(struct event_trigger_data *data, void *rec,
struct ring_buffer_event *event)
{
+ struct trace_event_file *file = data->private_data;
+
+ if (file) {
+ if (!tracer_tracing_is_on(file->tr))
+ return;
+
+ tracer_tracing_off(file->tr);
+ return;
+ }
+
if (!tracing_is_on())
return;
@@ -976,8 +1006,15 @@ static void
traceoff_count_trigger(struct event_trigger_data *data, void *rec,
struct ring_buffer_event *event)
{
- if (!tracing_is_on())
- return;
+ struct trace_event_file *file = data->private_data;
+
+ if (file) {
+ if (!tracer_tracing_is_on(file->tr))
+ return;
+ } else {
+ if (!tracing_is_on())
+ return;
+ }
if (!data->count)
return;
@@ -985,7 +1022,10 @@ traceoff_count_trigger(struct event_trigger_data *data, void *rec,
if (data->count != -1)
(data->count)--;
- tracing_off();
+ if (file)
+ tracer_tracing_off(file->tr);
+ else
+ tracing_off();
}
static int
@@ -1179,7 +1219,14 @@ static void
stacktrace_trigger(struct event_trigger_data *data, void *rec,
struct ring_buffer_event *event)
{
- trace_dump_stack(STACK_SKIP);
+ struct trace_event_file *file = data->private_data;
+ unsigned long flags;
+
+ if (file) {
+ local_save_flags(flags);
+ __trace_stack(file->tr, flags, STACK_SKIP, preempt_count());
+ } else
+ trace_dump_stack(STACK_SKIP);
}
static void
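
Each trigger above now prefers the trace instance reachable through its trace_event_file (file->tr) and only falls back to the global buffer when no file is attached. A compact sketch of that prefer-instance-else-global dispatch, with hypothetical types:

struct instance { int on; };

static struct instance global_instance = { .on = 1 };

static struct instance *pick(struct instance *file_inst)
{
	/* Per-file instance if one is attached, global otherwise. */
	return file_inst ? file_inst : &global_instance;
}

void trace_off_trigger(struct instance *file_inst)
{
	struct instance *inst = pick(file_inst);

	if (!inst->on)      /* already off: nothing to do */
		return;
	inst->on = 0;
}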
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 5de084dab4fa..a422cf6a0358 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -430,7 +430,7 @@ static int disable_trace_kprobe(struct trace_event_call *call,
*/
trace_probe_remove_file(tp, file);
- return 0;
+ return 1;
}
#if defined(CONFIG_DYNAMIC_FTRACE) && \
@@ -999,15 +999,18 @@ static int probes_profile_seq_show(struct seq_file *m, void *v)
{
struct dyn_event *ev = v;
struct trace_kprobe *tk;
+ unsigned long nmissed;
if (!is_trace_kprobe(ev))
return 0;
tk = to_trace_kprobe(ev);
+ nmissed = trace_kprobe_is_return(tk) ?
+ tk->rp.kp.nmissed + tk->rp.nmissed : tk->rp.kp.nmissed;
seq_printf(m, " %-44s %15lu %15lu\n",
trace_probe_name(&tk->tp),
trace_kprobe_nhit(tk),
- tk->rp.kp.nmissed);
+ nmissed);
return 0;
}
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
index b515db036bec..efb51a23a14f 100644
--- a/kernel/trace/trace_uprobe.c
+++ b/kernel/trace/trace_uprobe.c
@@ -1299,6 +1299,7 @@ static int uprobe_perf_open(struct trace_event_call *call,
return 0;
list_for_each_entry(pos, trace_probe_probe_list(tp), list) {
+ tu = container_of(pos, struct trace_uprobe, tp);
err = uprobe_apply(tu->inode, tu->offset, &tu->consumer, true);
if (err) {
uprobe_perf_close(call, event);
diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
index 10657b8dc2c2..83c2a0598c64 100644
--- a/kernel/trace/tracing_map.c
+++ b/kernel/trace/tracing_map.c
@@ -15,6 +15,7 @@
#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/sort.h>
+#include <linux/kmemleak.h>
#include "tracing_map.h"
#include "trace.h"
@@ -307,6 +308,7 @@ void tracing_map_array_free(struct tracing_map_array *a)
for (i = 0; i < a->n_pages; i++) {
if (!a->pages[i])
break;
+ kmemleak_free(a->pages[i]);
free_page((unsigned long)a->pages[i]);
}
@@ -342,6 +344,7 @@ struct tracing_map_array *tracing_map_array_alloc(unsigned int n_elts,
a->pages[i] = (void *)get_zeroed_page(GFP_KERNEL);
if (!a->pages[i])
goto free;
+ kmemleak_alloc(a->pages[i], PAGE_SIZE, 1, GFP_KERNEL);
}
out:
return a;
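
kmemleak does not track page-allocator memory, so the patch registers each get_zeroed_page() result by hand and unregisters it before free_page(); otherwise objects referenced only from these pages would be reported as leaks. A kernel-style sketch of the alloc/free pairing (helper names are illustrative):

#include <linux/gfp.h>
#include <linux/kmemleak.h>

void *alloc_tracked_page(void)
{
	void *page = (void *)get_zeroed_page(GFP_KERNEL);

	if (page)   /* min_count=1: report if it becomes unreferenced */
		kmemleak_alloc(page, PAGE_SIZE, 1, GFP_KERNEL);
	return page;
}

void free_tracked_page(void *page)
{
	if (!page)
		return;
	kmemleak_free(page);    /* unregister before the actual free */
	free_page((unsigned long)page);
}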
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 7be3e7530841..33a4093306f9 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -35,11 +35,10 @@ void bacct_add_tsk(struct user_namespace *user_ns,
/* Convert to seconds for btime */
do_div(delta, USEC_PER_SEC);
stats->ac_btime = get_seconds() - delta;
- if (thread_group_leader(tsk)) {
+ if (tsk->flags & PF_EXITING)
stats->ac_exitcode = tsk->exit_code;
- if (tsk->flags & PF_FORKNOEXEC)
- stats->ac_flag |= AFORK;
- }
+ if (thread_group_leader(tsk) && (tsk->flags & PF_FORKNOEXEC))
+ stats->ac_flag |= AFORK;
if (tsk->flags & PF_SUPERPRIV)
stats->ac_flag |= ASU;
if (tsk->flags & PF_DUMPCORE)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 77e6964ae1a9..e90f37e22202 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -855,8 +855,17 @@ void wq_worker_running(struct task_struct *task)
if (!worker->sleeping)
return;
+
+ /*
+ * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check
+ * and the nr_running increment below, we may ruin the nr_running reset
+ * and leave with an unexpected pool->nr_running == 1 on the newly unbound
+	 * pool. Protect against this race.
+ */
+ preempt_disable();
if (!(worker->flags & WORKER_NOT_RUNNING))
atomic_inc(&worker->pool->nr_running);
+ preempt_enable();
worker->sleeping = 0;
}
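
The workqueue fix closes a preemption window: if the task is preempted between reading worker->flags and the atomic_inc(), unbind_workers() can reset pool->nr_running in between, and the increment then lands on the freshly unbound pool. Disabling preemption makes the check-and-increment atomic with respect to that race. A kernel-style sketch of the shape (names are illustrative):

#include <linux/preempt.h>
#include <linux/atomic.h>

static atomic_t nr_running = ATOMIC_INIT(0);

void mark_running(unsigned int flags, unsigned int not_running_mask)
{
	preempt_disable();                 /* no unbind between check and inc */
	if (!(flags & not_running_mask))
		atomic_inc(&nr_running);
	preempt_enable();
}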