sched_ext: Enable the ops breather and eject BPF scheduler on softlockup

On 2 x Intel Sapphire Rapids machines with 224 logical CPUs, a poorly
behaving BPF scheduler can live-lock the system by making multiple CPUs bang
on the same DSQ to the point where soft-lockup detection triggers before
SCX's own watchdog can take action. It also seems possible that the machine
can be live-locked enough to prevent scx_ops_helper, which is an RT task,
from running in a timely manner.

Implement scx_softlockup(), which is called when three quarters of the
soft-lockup threshold has passed. The function immediately enables the ops
breather and triggers an ops error to initiate ejection of the BPF
scheduler.

Combined, the previous patch and this one enable the kernel to reliably
recover the system from live-lock conditions that can be triggered by a
poorly behaving BPF scheduler on dual-socket Intel systems.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Douglas Anderson <dianders@chromium.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
Tejun Heo 2024-11-05 11:49:04 -10:00
parent 62dcbab8b0
commit e32c260195
4 changed files with 57 additions and 0 deletions

View File

@ -205,11 +205,13 @@ struct sched_ext_entity {
void sched_ext_free(struct task_struct *p); void sched_ext_free(struct task_struct *p);
void print_scx_info(const char *log_lvl, struct task_struct *p); void print_scx_info(const char *log_lvl, struct task_struct *p);
void scx_softlockup(u32 dur_s);
#else /* !CONFIG_SCHED_CLASS_EXT */ #else /* !CONFIG_SCHED_CLASS_EXT */
static inline void sched_ext_free(struct task_struct *p) {} static inline void sched_ext_free(struct task_struct *p) {}
static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {}
static inline void scx_softlockup(u32 dur_s) {}
#endif /* CONFIG_SCHED_CLASS_EXT */ #endif /* CONFIG_SCHED_CLASS_EXT */
#endif /* _LINUX_SCHED_EXT_H */ #endif /* _LINUX_SCHED_EXT_H */

View File

@ -867,6 +867,7 @@ static DEFINE_MUTEX(scx_ops_enable_mutex);
DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled);
DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem);
static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED);
static unsigned long scx_in_softlockup;
static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0); static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0);
static int scx_ops_bypass_depth; static int scx_ops_bypass_depth;
static bool scx_ops_init_task_enabled; static bool scx_ops_init_task_enabled;
@ -4614,6 +4615,49 @@ bool task_should_scx(struct task_struct *p)
return p->policy == SCHED_EXT; return p->policy == SCHED_EXT;
} }
/**
 * scx_softlockup - sched_ext softlockup handler
 * @dur_s: how long the CPU has been stuck, in seconds, as supplied by the
 *	   caller; only used for the log and error messages
 *
 * On some multi-socket setups (e.g. 2x Intel 8480c), the BPF scheduler can
 * live-lock the system by making many CPUs target the same DSQ to the point
 * where soft-lockup detection triggers. The soft-lockup watchdog calls this
 * function shortly before its own triggering point so that sched_ext can try
 * to unjam the system by turning on the breather and aborting the BPF
 * scheduler.
 */
void scx_softlockup(u32 dur_s)
{
	int state = scx_ops_enable_state();

	/* nothing to eject unless a BPF scheduler is being or has been enabled */
	if (state != SCX_OPS_ENABLING && state != SCX_OPS_ENABLED)
		return;

	/*
	 * Only the first caller proceeds. The bit stays set until
	 * scx_clear_softlockup() runs at the end of scx_ops_bypass().
	 */
	if (test_and_set_bit(0, &scx_in_softlockup))
		return;

	printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n",
			smp_processor_id(), dur_s, scx_ops.name);

	/*
	 * Some CPUs may be trapped in the dispatch paths. Turn the breather on
	 * right away; otherwise, we might not even be able to get to
	 * scx_ops_bypass(). The reference taken here is dropped by
	 * scx_clear_softlockup().
	 */
	atomic_inc(&scx_ops_breather_depth);

	scx_ops_error("soft lockup - CPU#%d stuck for %us",
		      smp_processor_id(), dur_s);
}
/*
 * Undo the effect of scx_softlockup(): if the in-softlockup bit is set, clear
 * it and drop the breather depth reference that was taken when it was set.
 * Does nothing when the bit is already clear.
 */
static void scx_clear_softlockup(void)
{
	if (!test_and_clear_bit(0, &scx_in_softlockup))
		return;

	atomic_dec(&scx_ops_breather_depth);
}
/** /**
* scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress
* *
@ -4724,6 +4768,7 @@ static void scx_ops_bypass(bool bypass)
atomic_dec(&scx_ops_breather_depth); atomic_dec(&scx_ops_breather_depth);
unlock: unlock:
raw_spin_unlock_irqrestore(&bypass_lock, flags); raw_spin_unlock_irqrestore(&bypass_lock, flags);
scx_clear_softlockup();
} }
static void free_exit_info(struct scx_exit_info *ei) static void free_exit_info(struct scx_exit_info *ei)

View File

@ -644,6 +644,14 @@ static int is_softlockup(unsigned long touch_ts,
need_counting_irqs()) need_counting_irqs())
start_counting_irqs(); start_counting_irqs();
/*
* A poorly behaving BPF scheduler can live-lock the system into
* soft lockups. Tell sched_ext to try ejecting the BPF
* scheduler when close to a soft lockup.
*/
if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4))
scx_softlockup(now - touch_ts);
/* Warn about unreasonable delays. */ /* Warn about unreasonable delays. */
if (time_after(now, period_ts + get_softlockup_thresh())) if (time_after(now, period_ts + get_softlockup_thresh()))
return now - touch_ts; return now - touch_ts;

View File

@ -35,6 +35,8 @@ print(f'enabled : {read_static_key("__scx_ops_enabled")}')
print(f'switching_all : {read_int("scx_switching_all")}') print(f'switching_all : {read_int("scx_switching_all")}')
print(f'switched_all : {read_static_key("__scx_switched_all")}') print(f'switched_all : {read_static_key("__scx_switched_all")}')
print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})') print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})')
print(f'in_softlockup : {prog["scx_in_softlockup"].value_()}')
print(f'breather_depth: {read_atomic("scx_ops_breather_depth")}')
print(f'bypass_depth : {prog["scx_ops_bypass_depth"].value_()}') print(f'bypass_depth : {prog["scx_ops_bypass_depth"].value_()}')
print(f'nr_rejected : {read_atomic("scx_nr_rejected")}') print(f'nr_rejected : {read_atomic("scx_nr_rejected")}')
print(f'enable_seq : {read_atomic("scx_enable_seq")}') print(f'enable_seq : {read_atomic("scx_enable_seq")}')