Merge tag 'sched-core-2022-05-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:

 - Updates to scheduler metrics:
     - PELT fixes & enhancements
     - PSI fixes & enhancements
     - Refactor cpu_util_without()

 - Updates to instrumentation/debugging:
     - Remove sched_trace_*() helper functions - can be done via debug
       info
     - Fix double update_rq_clock() warnings

 - Introduce & use "preemption model accessors" to simplify some of the
   Kconfig complexity.

 - Make softirq handling RT-safe.

 - Misc smaller fixes & cleanups.

* tag 'sched-core-2022-05-23' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  topology: Remove unused cpu_cluster_mask()
  sched: Reverse sched_class layout
  sched/deadline: Remove superfluous rq clock update in push_dl_task()
  sched/core: Avoid obvious double update_rq_clock warning
  smp: Make softirq handling RT safe in flush_smp_call_function_queue()
  smp: Rename flush_smp_call_function_from_idle()
  sched: Fix missing prototype warnings
  sched/fair: Remove cfs_rq_tg_path()
  sched/fair: Remove sched_trace_*() helper functions
  sched/fair: Refactor cpu_util_without()
  sched/fair: Revise comment about lb decision matrix
  sched/psi: report zeroes for CPU full at the system level
  sched/fair: Delete useless condition in tg_unthrottle_up()
  sched/fair: Fix cfs_rq_clock_pelt() for throttled cfs_rq
  sched/fair: Move calculate of avg_load to a better location
  mailmap: Update my email address to @redhat.com
  MAINTAINERS: Add myself as scheduler topology reviewer
  psi: Fix trigger being fired unexpectedly at initial
  ftrace: Use preemption model accessors for trace header printout
  kcsan: Use preemption model accessors
This commit is contained in:
Linus Torvalds
2022-05-24 11:11:13 -07:00
23 changed files with 219 additions and 337 deletions

View File

@@ -1380,13 +1380,14 @@ static const void *nthreads_gen_params(const void *prev, char *desc)
else
nthreads *= 2;
if (!IS_ENABLED(CONFIG_PREEMPT) || !IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
if (!preempt_model_preemptible() ||
!IS_ENABLED(CONFIG_KCSAN_INTERRUPT_WATCHER)) {
/*
* Without any preemption, keep 2 CPUs free for other tasks, one
* of which is the main test case function checking for
* completion or failure.
*/
const long min_unused_cpus = IS_ENABLED(CONFIG_PREEMPT_NONE) ? 2 : 0;
const long min_unused_cpus = preempt_model_none() ? 2 : 0;
const long min_required_cpus = 2 + min_unused_cpus;
if (num_online_cpus() < min_required_cpus) {

View File

@@ -15,6 +15,7 @@
/* Headers: */
#include <linux/sched/clock.h>
#include <linux/sched/cputime.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/posix-timers.h>
#include <linux/sched/rt.h>
@@ -31,6 +32,7 @@
#include <uapi/linux/sched/types.h>
#include "sched.h"
#include "smp.h"
#include "autogroup.h"
#include "stats.h"

View File

@@ -14,6 +14,7 @@
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/nohz.h>
#include <linux/sched/mm.h>
#include <linux/sched/rseq_api.h>
#include <linux/sched/task_stack.h>

View File

@@ -26,7 +26,10 @@
#include <linux/topology.h>
#include <linux/sched/clock.h>
#include <linux/sched/cond_resched.h>
#include <linux/sched/cputime.h>
#include <linux/sched/debug.h>
#include <linux/sched/hotplug.h>
#include <linux/sched/init.h>
#include <linux/sched/isolation.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/mm.h>
@@ -610,10 +613,10 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
swap(rq1, rq2);
raw_spin_rq_lock(rq1);
if (__rq_lockp(rq1) == __rq_lockp(rq2))
return;
if (__rq_lockp(rq1) != __rq_lockp(rq2))
raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
double_rq_clock_clear_update(rq1, rq2);
}
#endif
@@ -2190,7 +2193,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
if (p->sched_class == rq->curr->sched_class)
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
else if (p->sched_class > rq->curr->sched_class)
else if (sched_class_above(p->sched_class, rq->curr->sched_class))
resched_curr(rq);
/*
@@ -2408,7 +2411,7 @@ static int migration_cpu_stop(void *data)
* __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
flush_smp_call_function_from_idle();
flush_smp_call_function_queue();
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
@@ -5689,7 +5692,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* higher scheduling class, because otherwise those lose the
* opportunity to pull in more work from other CPUs.
*/
if (likely(prev->sched_class <= &fair_sched_class &&
if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = pick_next_task_fair(rq, prev, rf);
@@ -9469,11 +9472,11 @@ void __init sched_init(void)
int i;
/* Make sure the linker didn't screw up */
BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
&fair_sched_class + 1 != &rt_sched_class ||
&rt_sched_class + 1 != &dl_sched_class);
BUG_ON(&idle_sched_class != &fair_sched_class + 1 ||
&fair_sched_class != &rt_sched_class + 1 ||
&rt_sched_class != &dl_sched_class + 1);
#ifdef CONFIG_SMP
BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
BUG_ON(&dl_sched_class != &stop_sched_class + 1);
#endif
wait_bit_init();

View File

@@ -1220,8 +1220,6 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
return (dl_se->runtime <= 0);
}
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
/*
* This function implements the GRUB accounting rule:
* according to the GRUB reclaiming algorithm, the runtime is
@@ -1832,6 +1830,7 @@ out:
static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
{
struct rq_flags rf;
struct rq *rq;
if (READ_ONCE(p->__state) != TASK_WAKING)
@@ -1843,7 +1842,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
* from try_to_wake_up(). Hence, p->pi_lock is locked, but
* rq->lock is not... So, lock it
*/
raw_spin_rq_lock(rq);
rq_lock(rq, &rf);
if (p->dl.dl_non_contending) {
update_rq_clock(rq);
sub_running_bw(&p->dl, &rq->dl);
@@ -1859,7 +1858,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
put_task_struct(p);
}
sub_rq_bw(&p->dl, &rq->dl);
raw_spin_rq_unlock(rq);
rq_unlock(rq, &rf);
}
static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
@@ -2319,13 +2318,7 @@ retry:
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, later_rq->cpu);
/*
* Update the later_rq clock here, because the clock is used
* by the cpufreq_update_util() inside __add_running_bw().
*/
update_rq_clock(later_rq);
activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
activate_task(later_rq, next_task, 0);
ret = 1;
resched_curr(later_rq);

View File

@@ -36,6 +36,7 @@
#include <linux/sched/cond_resched.h>
#include <linux/sched/cputime.h>
#include <linux/sched/isolation.h>
#include <linux/sched/nohz.h>
#include <linux/cpuidle.h>
#include <linux/interrupt.h>
@@ -313,19 +314,6 @@ const struct sched_class fair_sched_class;
#define for_each_sched_entity(se) \
for (; se; se = se->parent)
/*
* Fill @path (buffer size @len) with a printable name for the task group
* that owns @cfs_rq. Nothing is written when @path is NULL.
*
* - autogroup task groups are named through autogroup_path();
* - otherwise, if the group has a cgroup attached, cgroup_path() is used;
* - in every other case (including a NULL @cfs_rq) the literal "(null)"
*   is copied.
*/
static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
{
/* No destination buffer: nothing to do. */
if (!path)
return;
if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
autogroup_path(cfs_rq->tg, path, len);
else if (cfs_rq && cfs_rq->tg->css.cgroup)
cgroup_path(cfs_rq->tg->css.cgroup, path, len);
else
strlcpy(path, "(null)", len);
}
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
@@ -493,12 +481,6 @@ static int se_is_idle(struct sched_entity *se)
#define for_each_sched_entity(se) \
for (; se; se = NULL)
/*
* Variant that ignores @cfs_rq entirely: when @path is non-NULL it always
* receives the literal "(null)" (buffer size @len); a NULL @path is left
* alone.
*/
static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
{
if (path)
strlcpy(path, "(null)", len);
}
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
return true;
@@ -4846,11 +4828,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttle_count--;
if (!cfs_rq->throttle_count) {
cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
cfs_rq->throttled_clock_task;
cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
cfs_rq->throttled_clock_pelt;
/* Add cfs_rq with load or one or more already running entities to the list */
if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
if (!cfs_rq_is_decayed(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
}
@@ -4864,7 +4846,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
/* group is entering throttled state, stop time */
if (!cfs_rq->throttle_count) {
cfs_rq->throttled_clock_task = rq_clock_task(rq);
cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
list_del_leaf_cfs_rq(cfs_rq);
}
cfs_rq->throttle_count++;
@@ -5308,7 +5290,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
pcfs_rq = tg->parent->cfs_rq[cpu];
cfs_rq->throttle_count = pcfs_rq->throttle_count;
cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
@@ -6543,6 +6525,68 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
return target;
}
/*
* Predicts what cpu_util(@cpu) would return if @p was removed from @cpu
* (@dst_cpu = -1) or migrated to @dst_cpu.
*
* The returned value is clamped to capacity_orig_of(@cpu).
*/
static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
{
struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
/*
* If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
* contribution. If @p migrates from another CPU to @cpu add its
* contribution. In all the other cases @cpu is not impacted by the
* migration so its util_avg is already correct.
*/
if (task_cpu(p) == cpu && dst_cpu != cpu)
lsub_positive(&util, task_util(p));
else if (task_cpu(p) != cpu && dst_cpu == cpu)
util += task_util(p);
if (sched_feat(UTIL_EST)) {
unsigned long util_est;
util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
/*
* During wake-up @p isn't enqueued yet and doesn't contribute
* to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
* If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
* has been enqueued.
*
* During exec (@dst_cpu = -1) @p is enqueued and does
* contribute to cpu_rq(cpu)->cfs.avg.util_est.enqueued.
* Remove it to "simulate" cpu_util without @p's contribution.
*
* Despite the task_on_rq_queued(@p) check there is still a
* small window for a possible race when an exec
* select_task_rq_fair() races with LB's detach_task().
*
* detach_task()
* deactivate_task()
* p->on_rq = TASK_ON_RQ_MIGRATING;
* -------------------------------- A
* dequeue_task() \
* dequeue_task_fair() + Race Time
* util_est_dequeue() /
* -------------------------------- B
*
* The additional check "current == p" is required to further
* reduce the race window.
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
else if (unlikely(task_on_rq_queued(p) || current == p))
lsub_positive(&util_est, _task_util_est(p));
/* Use whichever of the two signals is larger. */
util = max(util, util_est);
}
/* Never report more than the CPU's original capacity. */
return min(util, capacity_orig_of(cpu));
}
/*
* cpu_util_without: compute cpu utilization without any contributions from *p
* @cpu: the CPU which utilization is requested
@@ -6558,116 +6602,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
*/
static unsigned long cpu_util_without(int cpu, struct task_struct *p)
{
struct cfs_rq *cfs_rq;
unsigned int util;
/* Task has no contribution or is new */
if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
return cpu_util_cfs(cpu);
cfs_rq = &cpu_rq(cpu)->cfs;
util = READ_ONCE(cfs_rq->avg.util_avg);
/* Discount task's util from CPU's util */
lsub_positive(&util, task_util(p));
/*
* Covered cases:
*
* a) if *p is the only task sleeping on this CPU, then:
* cpu_util (== task_util) > util_est (== 0)
* and thus we return:
* cpu_util_without = (cpu_util - task_util) = 0
*
* b) if other tasks are SLEEPING on this CPU, which is now exiting
* IDLE, then:
* cpu_util >= task_util
* cpu_util > util_est (== 0)
* and thus we discount *p's blocked utilization to return:
* cpu_util_without = (cpu_util - task_util) >= 0
*
* c) if other tasks are RUNNABLE on that CPU and
* util_est > cpu_util
* then we use util_est since it returns a more restrictive
* estimation of the spare capacity on that CPU, by just
* considering the expected utilization of tasks already
* runnable on that CPU.
*
* Cases a) and b) are covered by the above code, while case c) is
* covered by the following code when estimated utilization is
* enabled.
*/
if (sched_feat(UTIL_EST)) {
unsigned int estimated =
READ_ONCE(cfs_rq->avg.util_est.enqueued);
/*
* Despite the following checks we still have a small window
* for a possible race, when an execl's select_task_rq_fair()
* races with LB's detach_task():
*
* detach_task()
* p->on_rq = TASK_ON_RQ_MIGRATING;
* ---------------------------------- A
* deactivate_task() \
* dequeue_task() + RaceTime
* util_est_dequeue() /
* ---------------------------------- B
*
* The additional check on "current == p" it's required to
* properly fix the execl regression and it helps in further
* reducing the chances for the above race.
*/
if (unlikely(task_on_rq_queued(p) || current == p))
lsub_positive(&estimated, _task_util_est(p));
util = max(util, estimated);
}
/*
* Utilization (estimated) can exceed the CPU capacity, thus let's
* clamp to the maximum CPU capacity to ensure consistency with
* cpu_util.
*/
return min_t(unsigned long, util, capacity_orig_of(cpu));
}
/*
* Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
* to @dst_cpu.
*/
static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
{
struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
/*
* If @p migrates from @cpu to another, remove its contribution. Or,
* if @p migrates from another CPU to @cpu, add its contribution. In
* the other cases, @cpu is not impacted by the migration, so the
* util_avg should already be correct.
*/
if (task_cpu(p) == cpu && dst_cpu != cpu)
lsub_positive(&util, task_util(p));
else if (task_cpu(p) != cpu && dst_cpu == cpu)
util += task_util(p);
if (sched_feat(UTIL_EST)) {
util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
/*
* During wake-up, the task isn't enqueued yet and doesn't
* appear in the cfs_rq->avg.util_est.enqueued of any rq,
* so just add it (if needed) to "simulate" what will be
* cpu_util after the task has been enqueued.
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
util = max(util, util_est);
}
return min(util, capacity_orig_of(cpu));
return cpu_util_next(cpu, p, -1);
}
/*
@@ -9460,8 +9399,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
local->group_capacity;
sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
sds->total_capacity;
/*
* If the local group is more loaded than the selected
* busiest group don't try to pull any tasks.
@@ -9470,6 +9407,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
env->imbalance = 0;
return;
}
sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
sds->total_capacity;
}
/*
@@ -9495,7 +9435,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* busiest \ local has_spare fully_busy misfit asym imbalanced overloaded
* has_spare nr_idle balanced N/A N/A balanced balanced
* fully_busy nr_idle nr_idle N/A N/A balanced balanced
* misfit_task force N/A N/A N/A force force
* misfit_task force N/A N/A N/A N/A N/A
* asym_packing force force N/A N/A force force
* imbalanced force force N/A N/A force force
* overloaded force force N/A N/A force avg_load
@@ -11881,101 +11821,3 @@ __init void init_sched_fair_class(void)
#endif /* SMP */
}
/*
* Helper functions to facilitate extracting info from tracepoints.
*/
/*
* Return a pointer to @cfs_rq's embedded sched_avg, or NULL when @cfs_rq is
* NULL or the kernel is built without CONFIG_SMP.
*/
const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
{
#ifdef CONFIG_SMP
return cfs_rq ? &cfs_rq->avg : NULL;
#else
return NULL;
#endif
}
EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
/*
* Write a printable path for @cfs_rq into @str (buffer size @len) and
* return @str.
*
* Returns NULL only when both @cfs_rq and @str are NULL. With a NULL
* @cfs_rq and a valid @str, "(null)" is copied here and control then still
* falls through to cfs_rq_tg_path(), which is called in every remaining
* case.
*/
char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
{
if (!cfs_rq) {
if (str)
strlcpy(str, "(null)", len);
else
return NULL;
}
/* Reached for a NULL cfs_rq as well (no early return above when str is set). */
cfs_rq_tg_path(cfs_rq, str, len);
return str;
}
EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
/*
* Return the CPU number of the runqueue that @cfs_rq belongs to
* (cpu_of(rq_of(cfs_rq))), or -1 when @cfs_rq is NULL.
*/
int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
{
return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
}
EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
/*
* Return a pointer to @rq's avg_rt member, or NULL when @rq is NULL or the
* kernel is built without CONFIG_SMP.
*/
const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
{
#ifdef CONFIG_SMP
return rq ? &rq->avg_rt : NULL;
#else
return NULL;
#endif
}
EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
/*
* Return a pointer to @rq's avg_dl member, or NULL when @rq is NULL or the
* kernel is built without CONFIG_SMP.
*/
const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
{
#ifdef CONFIG_SMP
return rq ? &rq->avg_dl : NULL;
#else
return NULL;
#endif
}
EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
/*
* Return a pointer to @rq's avg_irq member. NULL is returned when @rq is
* NULL, or unless both CONFIG_SMP and CONFIG_HAVE_SCHED_AVG_IRQ are defined.
*/
const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
{
#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
return rq ? &rq->avg_irq : NULL;
#else
return NULL;
#endif
}
EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
/* Return cpu_of(@rq), or -1 when @rq is NULL. */
int sched_trace_rq_cpu(struct rq *rq)
{
return rq ? cpu_of(rq) : -1;
}
EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
/*
* Return the capacity value for @rq, or -1 when @rq is NULL.
*
* The middle operand of the ternary is selected by the preprocessor:
* rq->cpu_capacity with CONFIG_SMP, the constant SCHED_CAPACITY_SCALE
* otherwise.
*/
int sched_trace_rq_cpu_capacity(struct rq *rq)
{
return rq ?
#ifdef CONFIG_SMP
rq->cpu_capacity
#else
SCHED_CAPACITY_SCALE
#endif
: -1;
}
EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
/*
* Return @rd's span cpumask, or NULL when @rd is NULL or the kernel is
* built without CONFIG_SMP.
*/
const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
{
#ifdef CONFIG_SMP
return rd ? rd->span : NULL;
#else
return NULL;
#endif
}
EXPORT_SYMBOL_GPL(sched_trace_rd_span);
/* Return @rq's nr_running counter, or -1 when @rq is NULL. */
int sched_trace_rq_nr_running(struct rq *rq)
{
return rq ? rq->nr_running : -1;
}
EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);

View File

@@ -327,7 +327,7 @@ static void do_idle(void)
* RCU relies on this call to be done outside of an RCU read-side
* critical section.
*/
flush_smp_call_function_from_idle();
flush_smp_call_function_queue();
schedule_idle();
if (unlikely(klp_patch_pending(current)))

View File

@@ -145,9 +145,9 @@ static inline u64 rq_clock_pelt(struct rq *rq)
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
if (unlikely(cfs_rq->throttle_count))
return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
}
#else
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)

View File

@@ -1060,14 +1060,17 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
mutex_unlock(&group->avgs_lock);
for (full = 0; full < 2; full++) {
unsigned long avg[3];
u64 total;
unsigned long avg[3] = { 0, };
u64 total = 0;
int w;
for (w = 0; w < 3; w++)
avg[w] = group->avg[res * 2 + full][w];
total = div_u64(group->total[PSI_AVGS][res * 2 + full],
NSEC_PER_USEC);
/* CPU FULL is undefined at the system level */
if (!(group == &psi_system && res == PSI_CPU && full)) {
for (w = 0; w < 3; w++)
avg[w] = group->avg[res * 2 + full][w];
total = div_u64(group->total[PSI_AVGS][res * 2 + full],
NSEC_PER_USEC);
}
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
full ? "full" : "some",
@@ -1117,7 +1120,8 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
t->state = state;
t->threshold = threshold_us * NSEC_PER_USEC;
t->win.size = window_us * NSEC_PER_USEC;
window_reset(&t->win, 0, 0, 0);
window_reset(&t->win, sched_clock(),
group->total[PSI_POLL][t->state], 0);
t->event = 0;
t->last_event_time = 0;

View File

@@ -871,6 +871,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
struct rq *rq = rq_of_rt_rq(rt_rq);
struct rq_flags rf;
int skip;
/*
@@ -885,7 +886,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (skip)
continue;
raw_spin_rq_lock(rq);
rq_lock(rq, &rf);
update_rq_clock(rq);
if (rt_rq->rt_time) {
@@ -923,7 +924,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (enqueue)
sched_rt_rq_enqueue(rt_rq);
raw_spin_rq_unlock(rq);
rq_unlock(rq, &rf);
}
if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))

View File

@@ -603,8 +603,8 @@ struct cfs_rq {
s64 runtime_remaining;
u64 throttled_clock;
u64 throttled_clock_task;
u64 throttled_clock_task_time;
u64 throttled_clock_pelt;
u64 throttled_clock_pelt_time;
int throttled;
int throttle_count;
struct list_head throttled_list;
@@ -1827,12 +1827,7 @@ static inline void dirty_sched_domain_sysctl(int cpu)
#endif
extern int sched_update_scaling(void);
extern void flush_smp_call_function_from_idle(void);
#else /* !CONFIG_SMP: */
static inline void flush_smp_call_function_from_idle(void) { }
#endif
#endif /* CONFIG_SMP */
#include "stats.h"
@@ -2182,6 +2177,8 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
*
* include/asm-generic/vmlinux.lds.h
*
* *CAREFUL* they are laid out in *REVERSE* order!!!
*
* Also enforce alignment on the instance, not the type, to guarantee layout.
*/
#define DEFINE_SCHED_CLASS(name) \
@@ -2190,17 +2187,16 @@ const struct sched_class name##_sched_class \
__section("__" #name "_sched_class")
/* Defined in include/asm-generic/vmlinux.lds.h */
extern struct sched_class __begin_sched_classes[];
extern struct sched_class __end_sched_classes[];
#define sched_class_highest (__end_sched_classes - 1)
#define sched_class_lowest (__begin_sched_classes - 1)
extern struct sched_class __sched_class_highest[];
extern struct sched_class __sched_class_lowest[];
#define for_class_range(class, _from, _to) \
for (class = (_from); class != (_to); class--)
for (class = (_from); class < (_to); class++)
#define for_each_class(class) \
for_class_range(class, sched_class_highest, sched_class_lowest)
for_class_range(class, __sched_class_highest, __sched_class_lowest)
#define sched_class_above(_a, _b) ((_a) < (_b))
extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
@@ -2309,6 +2305,7 @@ extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
@@ -2478,6 +2475,24 @@ unsigned long arch_scale_freq_capacity(int cpu)
}
#endif
#ifdef CONFIG_SCHED_DEBUG
/*
* In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to
* acquire rq lock instead of rq_lock(). So at the end of these two functions
* we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of
* rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning.
*/
static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
{
/*
* Keep only the two skip flags; every other bit of clock_update_flags
* (RQCF_UPDATED among them) is dropped by the mask.
*/
rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
/* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */
#ifdef CONFIG_SMP
rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
#endif
}
#else
static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {}
#endif
#ifdef CONFIG_SMP
@@ -2543,14 +2558,15 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
if (__rq_lockp(this_rq) == __rq_lockp(busiest))
return 0;
if (likely(raw_spin_rq_trylock(busiest)))
if (__rq_lockp(this_rq) == __rq_lockp(busiest) ||
likely(raw_spin_rq_trylock(busiest))) {
double_rq_clock_clear_update(this_rq, busiest);
return 0;
}
if (rq_order_less(this_rq, busiest)) {
raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
double_rq_clock_clear_update(this_rq, busiest);
return 0;
}
@@ -2644,6 +2660,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
BUG_ON(rq1 != rq2);
raw_spin_rq_lock(rq1);
__acquire(rq2->lock); /* Fake it out ;) */
double_rq_clock_clear_update(rq1, rq2);
}
/*

View File

@@ -7,3 +7,9 @@
extern void sched_ttwu_pending(void *arg);
extern void send_call_function_single_ipi(int cpu);
#ifdef CONFIG_SMP
extern void flush_smp_call_function_queue(void);
#else
static inline void flush_smp_call_function_queue(void) { }
#endif

View File

@@ -96,7 +96,7 @@ static DEFINE_PER_CPU_ALIGNED(struct call_function_data, cfd_data);
static DEFINE_PER_CPU_SHARED_ALIGNED(struct llist_head, call_single_queue);
static void flush_smp_call_function_queue(bool warn_cpu_offline);
static void __flush_smp_call_function_queue(bool warn_cpu_offline);
int smpcfd_prepare_cpu(unsigned int cpu)
{
@@ -141,7 +141,7 @@ int smpcfd_dying_cpu(unsigned int cpu)
* ensure that the outgoing CPU doesn't go offline with work
* still pending.
*/
flush_smp_call_function_queue(false);
__flush_smp_call_function_queue(false);
irq_work_run();
return 0;
}
@@ -544,11 +544,11 @@ void generic_smp_call_function_single_interrupt(void)
{
cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->gotipi, CFD_SEQ_NOCPU,
smp_processor_id(), CFD_SEQ_GOTIPI);
flush_smp_call_function_queue(true);
__flush_smp_call_function_queue(true);
}
/**
* flush_smp_call_function_queue - Flush pending smp-call-function callbacks
* __flush_smp_call_function_queue - Flush pending smp-call-function callbacks
*
* @warn_cpu_offline: If set to 'true', warn if callbacks were queued on an
* offline CPU. Skip this check if set to 'false'.
@@ -561,7 +561,7 @@ void generic_smp_call_function_single_interrupt(void)
* Loop through the call_single_queue and run all the queued callbacks.
* Must be called with interrupts disabled.
*/
static void flush_smp_call_function_queue(bool warn_cpu_offline)
static void __flush_smp_call_function_queue(bool warn_cpu_offline)
{
call_single_data_t *csd, *csd_next;
struct llist_node *entry, *prev;
@@ -684,8 +684,22 @@ static void flush_smp_call_function_queue(bool warn_cpu_offline)
smp_processor_id(), CFD_SEQ_HDLEND);
}
void flush_smp_call_function_from_idle(void)
/**
* flush_smp_call_function_queue - Flush pending smp-call-function callbacks
* from task context (idle, migration thread)
*
* When TIF_POLLING_NRFLAG is supported and a CPU is in idle and has it
* set, then remote CPUs can avoid sending IPIs and wake the idle CPU by
* setting TIF_NEED_RESCHED. The idle task on the woken up CPU has to
* handle queued SMP function calls before scheduling.
*
* The migration thread has to ensure that an eventually pending wakeup has
* been handled before it migrates a task.
*/
void flush_smp_call_function_queue(void)
{
unsigned int was_pending;
unsigned long flags;
if (llist_empty(this_cpu_ptr(&call_single_queue)))
@@ -694,9 +708,11 @@ void flush_smp_call_function_from_idle(void)
cfd_seq_store(this_cpu_ptr(&cfd_seq_local)->idle, CFD_SEQ_NOCPU,
smp_processor_id(), CFD_SEQ_IDLE);
local_irq_save(flags);
flush_smp_call_function_queue(true);
/* Get the already pending soft interrupts for RT enabled kernels */
was_pending = local_softirq_pending();
__flush_smp_call_function_queue(true);
if (local_softirq_pending())
do_softirq();
do_softirq_post_smp_call_flush(was_pending);
local_irq_restore(flags);
}

View File

@@ -294,6 +294,19 @@ static inline void invoke_softirq(void)
wakeup_softirqd();
}
/*
* flush_smp_call_function_queue() can raise a soft interrupt in a function
* call. On RT kernels this is undesired and the only known functionality
* in the block layer which does this is disabled on RT. If soft interrupts
* get raised which haven't been raised before the flush, warn so it can be
* investigated.
*/
void do_softirq_post_smp_call_flush(unsigned int was_pending)
{
/*
* @was_pending is the softirq pending mask the caller sampled before
* flushing. If the current mask differs, warn and call
* invoke_softirq() -- this relies on WARN_ON_ONCE() evaluating to its
* condition (NOTE(review): confirm against the macro definition).
*/
if (WARN_ON_ONCE(was_pending != local_softirq_pending()))
invoke_softirq();
}
#else /* CONFIG_PREEMPT_RT */
/*

View File

@@ -535,8 +535,6 @@ void stop_machine_park(int cpu)
kthread_park(stopper->thread);
}
extern void sched_set_stop_task(int cpu, struct task_struct *stop);
static void cpu_stop_create(unsigned int cpu)
{
sched_set_stop_task(cpu, per_cpu(cpu_stopper.thread, cpu));

View File

@@ -4289,17 +4289,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
entries,
total,
buf->cpu,
#if defined(CONFIG_PREEMPT_NONE)
"server",
#elif defined(CONFIG_PREEMPT_VOLUNTARY)
"desktop",
#elif defined(CONFIG_PREEMPT)
"preempt",
#elif defined(CONFIG_PREEMPT_RT)
"preempt_rt",
#else
preempt_model_none() ? "server" :
preempt_model_voluntary() ? "desktop" :
preempt_model_full() ? "preempt" :
preempt_model_rt() ? "preempt_rt" :
"unknown",
#endif
/* These are reserved for later use */
0, 0, 0, 0);
#ifdef CONFIG_SMP