Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--  kernel/sched/fair.c | 131
1 file changed, 82 insertions(+), 49 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8a0e6bdba50d..dddaf61378f6 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3824,7 +3824,11 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
return;
}
- rq->misfit_task_load = task_h_load(p);
+ /*
+ * Make sure that misfit_task_load will not be zero even if
+ * task_h_load() returns 0.
+ */
+ rq->misfit_task_load = max_t(unsigned long, task_h_load(p), 1);
}
#else /* CONFIG_SMP */
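The hunk above guards against task_h_load() legitimately computing 0 for a task low in a deep cgroup hierarchy: callers treat misfit_task_load == 0 as "no misfit task pending", so a zero load would silently clear the flag. Below is a minimal userspace sketch of the clamping pattern; demo_task_h_load, struct demo_rq and max_ul (standing in for the kernel's max_t) are illustrative names, not kernel APIs.

#include <stdio.h>

struct demo_rq {
    unsigned long misfit_task_load; /* 0 means "no misfit task" */
};

static unsigned long demo_task_h_load(void)
{
    return 0; /* hierarchical load can round down to zero */
}

static unsigned long max_ul(unsigned long a, unsigned long b)
{
    return a > b ? a : b; /* stand-in for the kernel's max_t() */
}

int main(void)
{
    struct demo_rq rq = { 0 };

    /* Without the floor, a zero load would clear the misfit flag. */
    rq.misfit_task_load = max_ul(demo_task_h_load(), 1);
    printf("misfit_task_load = %lu\n", rq.misfit_task_load);
    return 0;
}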
@@ -4379,16 +4383,16 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
}
/* returns 0 on failure to allocate runtime */
-static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+static int __assign_cfs_rq_runtime(struct cfs_bandwidth *cfs_b,
+ struct cfs_rq *cfs_rq, u64 target_runtime)
{
- struct task_group *tg = cfs_rq->tg;
- struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
- u64 amount = 0, min_amount;
+ u64 min_amount, amount = 0;
+
+ lockdep_assert_held(&cfs_b->lock);
/* note: this is a positive sum as runtime_remaining <= 0 */
- min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
+ min_amount = target_runtime - cfs_rq->runtime_remaining;
- raw_spin_lock(&cfs_b->lock);
if (cfs_b->quota == RUNTIME_INF)
amount = min_amount;
else {
@@ -4400,13 +4404,25 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
cfs_b->idle = 0;
}
}
- raw_spin_unlock(&cfs_b->lock);
cfs_rq->runtime_remaining += amount;
return cfs_rq->runtime_remaining > 0;
}
+/* returns 0 on failure to allocate runtime */
+static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
+{
+ struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
+ int ret;
+
+ raw_spin_lock(&cfs_b->lock);
+ ret = __assign_cfs_rq_runtime(cfs_b, cfs_rq, sched_cfs_bandwidth_slice());
+ raw_spin_unlock(&cfs_b->lock);
+
+ return ret;
+}
+
static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
{
/* dock delta_exec before expiring quota (as it could span periods) */
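The two hunks above split assign_cfs_rq_runtime() into a lock-taking wrapper and a __assign_cfs_rq_runtime() core that asserts, rather than takes, cfs_b->lock, so callers already holding the lock (such as the reworked throttle path below) can reuse the allocation logic. A rough userspace sketch of that wrapper-plus-assertion pattern, assuming a pthread mutex in place of raw_spin_lock and a hand-rolled flag plus assert() in place of lockdep_assert_held(); all names are illustrative.

#include <assert.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

static pthread_mutex_t bw_lock = PTHREAD_MUTEX_INITIALIZER;
static int bw_lock_held;            /* poor man's lockdep */

static int64_t quota_remaining = 5;
static int64_t rq_runtime;

/* Caller must hold bw_lock; new in-lock callers can reuse this. */
static int __assign_runtime(int64_t target)
{
    assert(bw_lock_held);           /* ~ lockdep_assert_held() */
    int64_t amount = target < quota_remaining ? target : quota_remaining;
    quota_remaining -= amount;
    rq_runtime += amount;
    return rq_runtime > 0;          /* 0 on failure to allocate */
}

/* Original entry point keeps its locking behaviour. */
static int assign_runtime(int64_t target)
{
    pthread_mutex_lock(&bw_lock);
    bw_lock_held = 1;
    int ret = __assign_runtime(target);
    bw_lock_held = 0;
    pthread_mutex_unlock(&bw_lock);
    return ret;
}

int main(void)
{
    printf("got runtime: %d\n", assign_runtime(3));
    return 0;
}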
@@ -4495,13 +4511,33 @@ static int tg_throttle_down(struct task_group *tg, void *data)
return 0;
}
-static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
+static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
long task_delta, idle_task_delta, dequeue = 1;
- bool empty;
+
+ raw_spin_lock(&cfs_b->lock);
+ /* This will start the period timer if necessary */
+ if (__assign_cfs_rq_runtime(cfs_b, cfs_rq, 1)) {
+ /*
+ * We have raced with bandwidth becoming available, and if we
+ * actually throttled the timer might not unthrottle us for an
+ * entire period. We additionally needed to make sure that any
+ * subsequent check_cfs_rq_runtime calls agree not to throttle
+ * us, as we may commit to do cfs put_prev+pick_next, so we ask
+ * for 1ns of runtime rather than just check cfs_b.
+ */
+ dequeue = 0;
+ } else {
+ list_add_tail_rcu(&cfs_rq->throttled_list,
+ &cfs_b->throttled_cfs_rq);
+ }
+ raw_spin_unlock(&cfs_b->lock);
+
+ if (!dequeue)
+ return false; /* Throttle no longer required. */
se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
@@ -4530,29 +4566,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
if (!se)
sub_nr_running(rq, task_delta);
- cfs_rq->throttled = 1;
- cfs_rq->throttled_clock = rq_clock(rq);
- raw_spin_lock(&cfs_b->lock);
- empty = list_empty(&cfs_b->throttled_cfs_rq);
-
- /*
- * Add to the _head_ of the list, so that an already-started
- * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
- * not running, add to the tail so that later runqueues don't get starved.
- */
- if (cfs_b->distribute_running)
- list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
- else
- list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
-
/*
- * If we're the first throttled task, make sure the bandwidth
- * timer is running.
+ * Note: distribution will already see us throttled via the
+ * throttled-list. rq->lock protects completion.
*/
- if (empty)
- start_cfs_bandwidth(cfs_b);
-
- raw_spin_unlock(&cfs_b->lock);
+ cfs_rq->throttled = 1;
+ cfs_rq->throttled_clock = rq_clock(rq);
+ return true;
}
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
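The throttle_cfs_rq() rework above closes a race by asking for runtime instead of merely checking for it: under cfs_b->lock it requests 1 ns via __assign_cfs_rq_runtime(), and success means bandwidth arrived concurrently so the throttle is abandoned, while failure enqueues the cfs_rq on the throttled list before the lock drops so a running distribution pass cannot miss it. A simplified single-file sketch of that allocate-or-enqueue-under-one-lock shape; every name here is illustrative.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t bw_lock = PTHREAD_MUTEX_INITIALIZER;
static long long quota_ns;          /* refilled elsewhere */
static bool on_throttled_list;

/* Returns true if at least 'want' ns could be allocated. */
static bool take_runtime_locked(long long want)
{
    if (quota_ns >= want) {
        quota_ns -= want;
        return true;
    }
    return false;
}

/* Returns true if the rq was actually throttled. */
static bool throttle(void)
{
    bool dequeue;

    pthread_mutex_lock(&bw_lock);
    if (take_runtime_locked(1)) {
        /* Bandwidth became available; throttle no longer required. */
        dequeue = false;
    } else {
        /* Joining the list under the lock: distribution sees us. */
        on_throttled_list = true;
        dequeue = true;
    }
    pthread_mutex_unlock(&bw_lock);
    return dequeue;
}

int main(void)
{
    printf("throttled: %d\n", throttle());    /* quota empty: 1 */
    quota_ns = 100;
    printf("throttled: %d\n", throttle());    /* quota raced in: 0 */
    return 0;
}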
@@ -4911,8 +4931,7 @@ static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
if (cfs_rq_throttled(cfs_rq))
return true;
- throttle_cfs_rq(cfs_rq);
- return true;
+ return throttle_cfs_rq(cfs_rq);
}
static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
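And the matching caller change: check_cfs_rq_runtime() used to report "throttled" unconditionally after calling throttle_cfs_rq(); now it forwards the new boolean, so the raced "throttle no longer required" outcome reaches callers that would otherwise redo put_prev/pick_next for nothing. A tiny before/after sketch with illustrative names.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in: returns whether throttling really happened. */
static bool demo_throttle(bool raced) { return !raced; }

static bool demo_check_runtime(bool raced)
{
    /* was: demo_throttle(raced); return true; */
    return demo_throttle(raced);
}

int main(void)
{
    printf("%d %d\n", demo_check_runtime(false), demo_check_runtime(true));
    return 0;
}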
@@ -4942,6 +4961,8 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
if (!overrun)
break;
+ idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
+
if (++count > 3) {
u64 new, old = ktime_to_ns(cfs_b->period);
@@ -4971,8 +4992,6 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
/* reset count so we don't come right back in here */
count = 0;
}
-
- idle = do_sched_cfs_period_timer(cfs_b, overrun, flags);
}
if (idle)
cfs_b->period_active = 0;
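This pair of hunks moves the do_sched_cfs_period_timer() call ahead of the flood-detection block, so each observed overrun is serviced even on the iteration where the timer is deemed to be storming and the period gets stretched. A loose userspace sketch of the resulting loop shape; the integers and service_period() helper merely stand in for the hrtimer bookkeeping.

#include <stdbool.h>
#include <stdio.h>

/* Illustrative stand-in for do_sched_cfs_period_timer(). */
static bool service_period(int overrun)
{
    printf("serviced overrun of %d period(s)\n", overrun);
    return false; /* pretend the group is never idle */
}

int main(void)
{
    long long period_ns = 1000;
    bool idle = false;
    int count = 0;
    int overruns[] = { 1, 1, 2, 1, 0 };

    for (int i = 0; overruns[i]; i++) {
        /* Service the elapsed periods first (the code motion). */
        idle = service_period(overruns[i]);

        if (++count > 3) {
            /* Timer is storming: stretch the period ... */
            period_ns *= 2;
            /* ... and reset so we don't come right back in here. */
            count = 0;
        }
    }
    printf("idle=%d period=%lld\n", idle, period_ns);
    return 0;
}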
@@ -5917,7 +5936,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
/*
* Scan the local SMT mask for idle CPUs.
*/
-static int select_idle_smt(struct task_struct *p, int target)
+static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
int cpu, si_cpu = -1;
@@ -5925,7 +5944,8 @@ static int select_idle_smt(struct task_struct *p, int target)
return -1;
for_each_cpu(cpu, cpu_smt_mask(target)) {
- if (!cpumask_test_cpu(cpu, p->cpus_ptr))
+ if (!cpumask_test_cpu(cpu, p->cpus_ptr) ||
+ !cpumask_test_cpu(cpu, sched_domain_span(sd)))
continue;
if (available_idle_cpu(cpu))
return cpu;
@@ -5943,7 +5963,7 @@ static inline int select_idle_core(struct task_struct *p, struct sched_domain *s
return -1;
}
-static inline int select_idle_smt(struct task_struct *p, int target)
+static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target)
{
return -1;
}
@@ -6053,7 +6073,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
if ((unsigned)i < nr_cpumask_bits)
return i;
- i = select_idle_smt(p, target);
+ i = select_idle_smt(p, sd, target);
if ((unsigned)i < nr_cpumask_bits)
return i;
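The select_idle_smt() hunks pass the scanning sched_domain down and reject sibling CPUs that fall outside its span, so the SMT scan can no longer pick a CPU the surrounding domain walk never intended to consider. A compact sketch of the double mask test, using plain bitmasks in place of struct cpumask; every name is illustrative.

#include <stdio.h>

#define NR_CPUS 8

static int pick_idle_smt(unsigned smt_mask, unsigned allowed,
                         unsigned domain_span, unsigned idle_mask)
{
    for (int cpu = 0; cpu < NR_CPUS; cpu++) {
        unsigned bit = 1u << cpu;

        if (!(smt_mask & bit))
            continue;
        /* Skip CPUs outside the affinity mask or the domain span. */
        if (!(allowed & bit) || !(domain_span & bit))
            continue;
        if (idle_mask & bit)
            return cpu;
    }
    return -1;
}

int main(void)
{
    /* CPU 3 is idle and an SMT sibling, but outside the span. */
    printf("%d\n", pick_idle_smt(0x0c, 0xff, 0x03, 0x08)); /* -1 */
    printf("%d\n", pick_idle_smt(0x03, 0xff, 0x03, 0x02)); /*  1 */
    return 0;
}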
@@ -7407,7 +7427,15 @@ static int detach_tasks(struct lb_env *env)
if (!can_migrate_task(p, env))
goto next;
- load = task_h_load(p);
+ /*
+ * Depending on the number of CPUs and tasks and the
+ * cgroup hierarchy, task_h_load() can return a zero
+ * value. Make sure that env->imbalance decreases;
+ * otherwise detach_tasks() will stop only after
+ * detaching up to loop_max tasks.
+ */
+ load = max_t(unsigned long, task_h_load(p), 1);
+
if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
goto next;
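Same zero-load hazard as in update_misfit_status(), but here the consequence is wasted work: detach_tasks() only stops early once env->imbalance is consumed, so tasks whose task_h_load() is 0 would keep the loop spinning for the full loop_max budget. A small sketch of the progress argument, with made-up loads.

#include <stdio.h>

int main(void)
{
    long imbalance = 10;
    long task_loads[] = { 0, 4, 0, 7 };
    int detached = 0;

    for (int i = 0; i < 4 && imbalance > 0; i++) {
        /* Floor each contribution at 1, as the patch does, so the
         * imbalance is guaranteed to shrink on every iteration. */
        long load = task_loads[i] ? task_loads[i] : 1;

        imbalance -= load;
        detached++;
    }
    printf("detached %d tasks, imbalance now %ld\n", detached, imbalance);
    return 0;
}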
@@ -9373,7 +9401,12 @@ static void kick_ilb(unsigned int flags)
{
int ilb_cpu;
- nohz.next_balance++;
+ /*
+ * Increase nohz.next_balance only if a full ilb is triggered,
+ * not when we only update stats.
+ */
+ if (flags & NOHZ_BALANCE_KICK)
+ nohz.next_balance = jiffies+1;
ilb_cpu = find_new_ilb();
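The kick_ilb() hunk stops blindly bumping nohz.next_balance: only a kick that actually requests a full idle load balance (NOHZ_BALANCE_KICK) pushes the shared deadline out to jiffies+1, while pure statistics kicks leave it alone. A trivial sketch of the gating, with illustrative flag values and counters standing in for the kernel's kick flags and jiffies.

#include <stdio.h>

#define DEMO_STATS_KICK   0x1
#define DEMO_BALANCE_KICK 0x2

static unsigned long jiffies_now = 1000;
static unsigned long next_balance = 900;

static void demo_kick_ilb(unsigned int flags)
{
    /* Only a full balance kick may push the deadline out. */
    if (flags & DEMO_BALANCE_KICK)
        next_balance = jiffies_now + 1;
}

int main(void)
{
    demo_kick_ilb(DEMO_STATS_KICK);       /* stats only: untouched */
    printf("next_balance = %lu\n", next_balance);
    demo_kick_ilb(DEMO_BALANCE_KICK);     /* full ilb: pushed out */
    printf("next_balance = %lu\n", next_balance);
    return 0;
}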
@@ -9691,6 +9724,14 @@ static bool _nohz_idle_balance(struct rq *this_rq, unsigned int flags,
}
}
+ /*
+ * next_balance will be updated only when there is a need.
+ * When the CPU is attached to a null domain, for example, it will
+ * not be updated.
+ */
+ if (likely(update_next_balance))
+ nohz.next_balance = next_balance;
+
/* Newly idle CPU doesn't need an update */
if (idle != CPU_NEWLY_IDLE) {
update_blocked_averages(this_cpu);
@@ -9711,14 +9752,6 @@ abort:
if (has_blocked_load)
WRITE_ONCE(nohz.has_blocked, 1);
- /*
- * next_balance will be updated only when there is a need.
- * When the CPU is attached to a null domain, for example, it will
- * not be updated.
- */
- if (likely(update_next_balance))
- nohz.next_balance = next_balance;
-
return ret;
}
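Finally, the nohz.next_balance publication moves from after the abort label to the success path, before the newly-idle early return, so a scan that bailed out (and whose local next_balance may be stale) no longer writes it back. A bare-bones sketch of the control-flow change; all names are illustrative.

#include <stdbool.h>
#include <stdio.h>

static unsigned long shared_next_balance = 500;

static bool demo_nohz_idle_balance(bool abort_early,
                                   unsigned long next_balance,
                                   bool update_next_balance)
{
    if (abort_early)
        goto abort;               /* skips the shared update */

    /* Publish only when the scan completed and found a deadline. */
    if (update_next_balance)
        shared_next_balance = next_balance;

    return true;
abort:
    return false;
}

int main(void)
{
    demo_nohz_idle_balance(true, 100, true);   /* aborted: no update */
    printf("%lu\n", shared_next_balance);      /* still 500 */
    demo_nohz_idle_balance(false, 600, true);  /* completed: updated */
    printf("%lu\n", shared_next_balance);      /* 600 */
    return 0;
}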