summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/init_task.h9
-rw-r--r--include/linux/rcupdate.h36
-rw-r--r--include/linux/sched.h23
-rw-r--r--init/Kconfig10
-rw-r--r--kernel/rcu/tiny.c2
-rw-r--r--kernel/rcu/tree.c2
-rw-r--r--kernel/rcu/update.c171
7 files changed, 242 insertions, 11 deletions
diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 2bb4c4f3531a..dffd9258ee60 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -117,6 +117,14 @@ extern struct group_info init_groups;
#else
#define INIT_TASK_RCU_PREEMPT(tsk)
#endif
+#ifdef CONFIG_TASKS_RCU
+#define INIT_TASK_RCU_TASKS(tsk) \
+ .rcu_tasks_holdout = false, \
+ .rcu_tasks_holdout_list = \
+ LIST_HEAD_INIT(tsk.rcu_tasks_holdout_list),
+#else
+#define INIT_TASK_RCU_TASKS(tsk)
+#endif
extern struct cred init_cred;
@@ -224,6 +232,7 @@ extern struct task_group root_task_group;
INIT_FTRACE_GRAPH \
INIT_TRACE_RECURSION \
INIT_TASK_RCU_PREEMPT(tsk) \
+ INIT_TASK_RCU_TASKS(tsk) \
INIT_CPUSET_SEQ(tsk) \
INIT_RT_MUTEXES(tsk) \
INIT_VTIME(tsk) \
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index d231aa17b1d7..3432063f4c87 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -197,6 +197,26 @@ void call_rcu_sched(struct rcu_head *head,
void synchronize_sched(void);
+/**
+ * call_rcu_tasks() - Queue an RCU for invocation task-based grace period
+ * @head: structure to be used for queueing the RCU updates.
+ * @func: actual callback function to be invoked after the grace period
+ *
+ * The callback function will be invoked some time after a full grace
+ * period elapses, in other words after all currently executing RCU
+ * read-side critical sections have completed. call_rcu_tasks() assumes
+ * that the read-side critical sections end at a voluntary context
+ * switch (not a preemption!), entry into idle, or transition to usermode
+ * execution. As such, there are no read-side primitives analogous to
+ * rcu_read_lock() and rcu_read_unlock() because this primitive is intended
+ * to determine that all tasks have passed through a safe state, not so
+ * much for data-strcuture synchronization.
+ *
+ * See the description of call_rcu() for more detailed information on
+ * memory ordering guarantees.
+ */
+void call_rcu_tasks(struct rcu_head *head, void (*func)(struct rcu_head *head));
+
#ifdef CONFIG_PREEMPT_RCU
void __rcu_read_lock(void);
@@ -294,6 +314,22 @@ static inline void rcu_user_hooks_switch(struct task_struct *prev,
rcu_irq_exit(); \
} while (0)
+/*
+ * Note a voluntary context switch for RCU-tasks benefit. This is a
+ * macro rather than an inline function to avoid #include hell.
+ */
+#ifdef CONFIG_TASKS_RCU
+#define rcu_note_voluntary_context_switch(t) \
+ do { \
+ preempt_disable(); /* Exclude synchronize_sched(); */ \
+ if (ACCESS_ONCE((t)->rcu_tasks_holdout)) \
+ ACCESS_ONCE((t)->rcu_tasks_holdout) = false; \
+ preempt_enable(); \
+ } while (0)
+#else /* #ifdef CONFIG_TASKS_RCU */
+#define rcu_note_voluntary_context_switch(t) do { } while (0)
+#endif /* #else #ifdef CONFIG_TASKS_RCU */
+
#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP)
bool __rcu_is_watching(void);
#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) || defined(CONFIG_SMP) */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5c2c885ee52b..eaacac4ae77d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1270,6 +1270,11 @@ struct task_struct {
#ifdef CONFIG_TREE_PREEMPT_RCU
struct rcu_node *rcu_blocked_node;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_TASKS_RCU
+ unsigned long rcu_tasks_nvcsw;
+ bool rcu_tasks_holdout;
+ struct list_head rcu_tasks_holdout_list;
+#endif /* #ifdef CONFIG_TASKS_RCU */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
@@ -2000,28 +2005,24 @@ extern void task_clear_jobctl_pending(struct task_struct *task,
unsigned int mask);
#ifdef CONFIG_PREEMPT_RCU
-
#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
static inline void rcu_copy_process(struct task_struct *p)
{
+#ifdef CONFIG_PREEMPT_RCU
p->rcu_read_lock_nesting = 0;
p->rcu_read_unlock_special = 0;
-#ifdef CONFIG_TREE_PREEMPT_RCU
p->rcu_blocked_node = NULL;
-#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
INIT_LIST_HEAD(&p->rcu_node_entry);
+#endif /* #ifdef CONFIG_PREEMPT_RCU */
+#ifdef CONFIG_TASKS_RCU
+ p->rcu_tasks_holdout = false;
+ INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
+#endif /* #ifdef CONFIG_TASKS_RCU */
}
-#else
-
-static inline void rcu_copy_process(struct task_struct *p)
-{
-}
-
-#endif
-
static inline void tsk_restore_flags(struct task_struct *task,
unsigned long orig_flags, unsigned long flags)
{
diff --git a/init/Kconfig b/init/Kconfig
index e84c6423a2e5..c4539c4e177f 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -507,6 +507,16 @@ config PREEMPT_RCU
This option enables preemptible-RCU code that is common between
TREE_PREEMPT_RCU and, in the old days, TINY_PREEMPT_RCU.
+config TASKS_RCU
+ bool "Task_based RCU implementation using voluntary context switch"
+ default n
+ help
+ This option enables a task-based RCU implementation that uses
+ only voluntary context switch (not preemption!), idle, and
+ user-mode execution as quiescent states.
+
+ If unsure, say N.
+
config RCU_STALL_COMMON
def_bool ( TREE_RCU || TREE_PREEMPT_RCU || RCU_TRACE )
help
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index d9efcc13008c..717f00854fc0 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -254,6 +254,8 @@ void rcu_check_callbacks(int cpu, int user)
rcu_sched_qs(cpu);
else if (!in_softirq())
rcu_bh_qs(cpu);
+ if (user)
+ rcu_note_voluntary_context_switch(current);
}
/*
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 1b70cb6fbe3c..8ad91d1e317d 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2410,6 +2410,8 @@ void rcu_check_callbacks(int cpu, int user)
rcu_preempt_check_callbacks(cpu);
if (rcu_pending(cpu))
invoke_rcu_core();
+ if (user)
+ rcu_note_voluntary_context_switch(current);
trace_rcu_utilization(TPS("End scheduler-tick"));
}
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 4056d7992a6c..19b3dacb0753 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -47,6 +47,7 @@
#include <linux/hardirq.h>
#include <linux/delay.h>
#include <linux/module.h>
+#include <linux/kthread.h>
#define CREATE_TRACE_POINTS
@@ -347,3 +348,173 @@ static int __init check_cpu_stall_init(void)
early_initcall(check_cpu_stall_init);
#endif /* #ifdef CONFIG_RCU_STALL_COMMON */
+
+#ifdef CONFIG_TASKS_RCU
+
+/*
+ * Simple variant of RCU whose quiescent states are voluntary context switch,
+ * user-space execution, and idle. As such, grace periods can take one good
+ * long time. There are no read-side primitives similar to rcu_read_lock()
+ * and rcu_read_unlock() because this implementation is intended to get
+ * the system into a safe state for some of the manipulations involved in
+ * tracing and the like. Finally, this implementation does not support
+ * high call_rcu_tasks() rates from multiple CPUs. If this is required,
+ * per-CPU callback lists will be needed.
+ */
+
+/* Global list of callbacks and associated lock. */
+static struct rcu_head *rcu_tasks_cbs_head;
+static struct rcu_head **rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
+static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
+
+/* Post an RCU-tasks callback. */
+void call_rcu_tasks(struct rcu_head *rhp, void (*func)(struct rcu_head *rhp))
+{
+ unsigned long flags;
+
+ rhp->next = NULL;
+ rhp->func = func;
+ raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
+ *rcu_tasks_cbs_tail = rhp;
+ rcu_tasks_cbs_tail = &rhp->next;
+ raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
+}
+EXPORT_SYMBOL_GPL(call_rcu_tasks);
+
+/* See if the current task has stopped holding out, remove from list if so. */
+static void check_holdout_task(struct task_struct *t)
+{
+ if (!ACCESS_ONCE(t->rcu_tasks_holdout) ||
+ t->rcu_tasks_nvcsw != ACCESS_ONCE(t->nvcsw) ||
+ !ACCESS_ONCE(t->on_rq)) {
+ ACCESS_ONCE(t->rcu_tasks_holdout) = false;
+ list_del_rcu(&t->rcu_tasks_holdout_list);
+ put_task_struct(t);
+ }
+}
+
+/* RCU-tasks kthread that detects grace periods and invokes callbacks. */
+static int __noreturn rcu_tasks_kthread(void *arg)
+{
+ unsigned long flags;
+ struct task_struct *g, *t;
+ struct rcu_head *list;
+ struct rcu_head *next;
+ LIST_HEAD(rcu_tasks_holdouts);
+
+ /* FIXME: Add housekeeping affinity. */
+
+ /*
+ * Each pass through the following loop makes one check for
+ * newly arrived callbacks, and, if there are some, waits for
+ * one RCU-tasks grace period and then invokes the callbacks.
+ * This loop is terminated by the system going down. ;-)
+ */
+ for (;;) {
+
+ /* Pick up any new callbacks. */
+ raw_spin_lock_irqsave(&rcu_tasks_cbs_lock, flags);
+ list = rcu_tasks_cbs_head;
+ rcu_tasks_cbs_head = NULL;
+ rcu_tasks_cbs_tail = &rcu_tasks_cbs_head;
+ raw_spin_unlock_irqrestore(&rcu_tasks_cbs_lock, flags);
+
+ /* If there were none, wait a bit and start over. */
+ if (!list) {
+ schedule_timeout_interruptible(HZ);
+ WARN_ON(signal_pending(current));
+ continue;
+ }
+
+ /*
+ * Wait for all pre-existing t->on_rq and t->nvcsw
+ * transitions to complete. Invoking synchronize_sched()
+ * suffices because all these transitions occur with
+ * interrupts disabled. Without this synchronize_sched(),
+ * a read-side critical section that started before the
+ * grace period might be incorrectly seen as having started
+ * after the grace period.
+ *
+ * This synchronize_sched() also dispenses with the
+ * need for a memory barrier on the first store to
+ * ->rcu_tasks_holdout, as it forces the store to happen
+ * after the beginning of the grace period.
+ */
+ synchronize_sched();
+
+ /*
+ * There were callbacks, so we need to wait for an
+ * RCU-tasks grace period. Start off by scanning
+ * the task list for tasks that are not already
+ * voluntarily blocked. Mark these tasks and make
+ * a list of them in rcu_tasks_holdouts.
+ */
+ rcu_read_lock();
+ for_each_process_thread(g, t) {
+ if (t != current && ACCESS_ONCE(t->on_rq) &&
+ !is_idle_task(t)) {
+ get_task_struct(t);
+ t->rcu_tasks_nvcsw = ACCESS_ONCE(t->nvcsw);
+ ACCESS_ONCE(t->rcu_tasks_holdout) = true;
+ list_add(&t->rcu_tasks_holdout_list,
+ &rcu_tasks_holdouts);
+ }
+ }
+ rcu_read_unlock();
+
+ /*
+ * Each pass through the following loop scans the list
+ * of holdout tasks, removing any that are no longer
+ * holdouts. When the list is empty, we are done.
+ */
+ while (!list_empty(&rcu_tasks_holdouts)) {
+ schedule_timeout_interruptible(HZ);
+ WARN_ON(signal_pending(current));
+ rcu_read_lock();
+ list_for_each_entry_rcu(t, &rcu_tasks_holdouts,
+ rcu_tasks_holdout_list)
+ check_holdout_task(t);
+ rcu_read_unlock();
+ }
+
+ /*
+ * Because ->on_rq and ->nvcsw are not guaranteed
+ * to have a full memory barriers prior to them in the
+ * schedule() path, memory reordering on other CPUs could
+ * cause their RCU-tasks read-side critical sections to
+ * extend past the end of the grace period. However,
+ * because these ->nvcsw updates are carried out with
+ * interrupts disabled, we can use synchronize_sched()
+ * to force the needed ordering on all such CPUs.
+ *
+ * This synchronize_sched() also confines all
+ * ->rcu_tasks_holdout accesses to be within the grace
+ * period, avoiding the need for memory barriers for
+ * ->rcu_tasks_holdout accesses.
+ */
+ synchronize_sched();
+
+ /* Invoke the callbacks. */
+ while (list) {
+ next = list->next;
+ local_bh_disable();
+ list->func(list);
+ local_bh_enable();
+ list = next;
+ cond_resched();
+ }
+ }
+}
+
+/* Spawn rcu_tasks_kthread() at boot time. */
+static int __init rcu_spawn_tasks_kthread(void)
+{
+ struct task_struct __maybe_unused *t;
+
+ t = kthread_run(rcu_tasks_kthread, NULL, "rcu_tasks_kthread");
+ BUG_ON(IS_ERR(t));
+ return 0;
+}
+early_initcall(rcu_spawn_tasks_kthread);
+
+#endif /* #ifdef CONFIG_TASKS_RCU */