From 8427acb6b5861d205abca7afa656a897bbae34b7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 24 Sep 2024 14:08:52 -1000 Subject: [PATCH 01/21] sched_ext: Factor out move_task_between_dsqs() from scx_dispatch_from_dsq() Pure reorganization. No functional changes. Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 116 +++++++++++++++++++++++++++++---------------- 1 file changed, 75 insertions(+), 41 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 3cd7c50a51c5..74d0ebc5e15e 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -2369,6 +2369,73 @@ static inline bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *r static inline bool consume_remote_task(struct rq *this_rq, struct task_struct *p, struct scx_dispatch_q *dsq, struct rq *task_rq) { return false; } #endif /* CONFIG_SMP */ +/** + * move_task_between_dsqs() - Move a task from one DSQ to another + * @p: target task + * @enq_flags: %SCX_ENQ_* + * @src_dsq: DSQ @p is currently on, must not be a local DSQ + * @dst_dsq: DSQ @p is being moved to, can be any DSQ + * + * Must be called with @p's task_rq and @src_dsq locked. If @dst_dsq is a local + * DSQ and @p is on a different CPU, @p will be migrated and thus its task_rq + * will change. As @p's task_rq is locked, this function doesn't need to use the + * holding_cpu mechanism. + * + * On return, @src_dsq is unlocked and only @p's new task_rq, which is the + * return value, is locked. + */ +static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, + struct scx_dispatch_q *src_dsq, + struct scx_dispatch_q *dst_dsq) +{ + struct rq *src_rq = task_rq(p), *dst_rq; + + BUG_ON(src_dsq->id == SCX_DSQ_LOCAL); + lockdep_assert_held(&src_dsq->lock); + lockdep_assert_rq_held(src_rq); + + if (dst_dsq->id == SCX_DSQ_LOCAL) { + dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); + if (!task_can_run_on_remote_rq(p, dst_rq, true)) { + dst_dsq = find_global_dsq(p); + dst_rq = src_rq; + } + } else { + /* no need to migrate if destination is a non-local DSQ */ + dst_rq = src_rq; + } + + /* + * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different + * CPU, @p will be migrated. + */ + if (dst_dsq->id == SCX_DSQ_LOCAL) { + /* @p is going from a non-local DSQ to a local DSQ */ + if (src_rq == dst_rq) { + task_unlink_from_dsq(p, src_dsq); + move_local_task_to_local_dsq(p, enq_flags, + src_dsq, dst_rq); + raw_spin_unlock(&src_dsq->lock); + } else { + raw_spin_unlock(&src_dsq->lock); + move_remote_task_to_local_dsq(p, enq_flags, + src_rq, dst_rq); + } + } else { + /* + * @p is going from a non-local DSQ to a non-local DSQ. As + * $src_dsq is already locked, do an abbreviated dequeue. 
+ */ + task_unlink_from_dsq(p, src_dsq); + p->scx.dsq = NULL; + raw_spin_unlock(&src_dsq->lock); + + dispatch_enqueue(dst_dsq, p, enq_flags); + } + + return dst_rq; +} + static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq) { struct task_struct *p; @@ -6033,7 +6100,7 @@ static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, u64 enq_flags) { struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; - struct rq *this_rq, *src_rq, *dst_rq, *locked_rq; + struct rq *this_rq, *src_rq, *locked_rq; bool dispatched = false; bool in_balance; unsigned long flags; @@ -6079,51 +6146,18 @@ static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, /* @p is still on $src_dsq and stable, determine the destination */ dst_dsq = find_dsq_for_dispatch(this_rq, dsq_id, p); - if (dst_dsq->id == SCX_DSQ_LOCAL) { - dst_rq = container_of(dst_dsq, struct rq, scx.local_dsq); - if (!task_can_run_on_remote_rq(p, dst_rq, true)) { - dst_dsq = find_global_dsq(p); - dst_rq = src_rq; - } - } else { - /* no need to migrate if destination is a non-local DSQ */ - dst_rq = src_rq; - } - /* - * Move @p into $dst_dsq. If $dst_dsq is the local DSQ of a different - * CPU, @p will be migrated. + * Apply vtime and slice updates before moving so that the new time is + * visible before inserting into $dst_dsq. @p is still on $src_dsq but + * this is safe as we're locking it. */ - if (dst_dsq->id == SCX_DSQ_LOCAL) { - /* @p is going from a non-local DSQ to a local DSQ */ - if (src_rq == dst_rq) { - task_unlink_from_dsq(p, src_dsq); - move_local_task_to_local_dsq(p, enq_flags, - src_dsq, dst_rq); - raw_spin_unlock(&src_dsq->lock); - } else { - raw_spin_unlock(&src_dsq->lock); - move_remote_task_to_local_dsq(p, enq_flags, - src_rq, dst_rq); - locked_rq = dst_rq; - } - } else { - /* - * @p is going from a non-local DSQ to a non-local DSQ. As - * $src_dsq is already locked, do an abbreviated dequeue. - */ - task_unlink_from_dsq(p, src_dsq); - p->scx.dsq = NULL; - raw_spin_unlock(&src_dsq->lock); - - if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) - p->scx.dsq_vtime = kit->vtime; - dispatch_enqueue(dst_dsq, p, enq_flags); - } - + if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_VTIME) + p->scx.dsq_vtime = kit->vtime; if (kit->cursor.flags & __SCX_DSQ_ITER_HAS_SLICE) p->scx.slice = kit->slice; + /* execute move */ + locked_rq = move_task_between_dsqs(p, enq_flags, src_dsq, dst_dsq); dispatched = true; out: if (in_balance) { From e418cd2b80f5b76706162c0cfde3e5f3efc19cbe Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Thu, 26 Sep 2024 10:15:04 +0800 Subject: [PATCH 02/21] sched/ext: Fix unmatch trailing comment of CONFIG_EXT_GROUP_SCHED The #endif trailing comment of CONFIG_EXT_GROUP_SCHED is unmatched, so fix it. Signed-off-by: Zhang Qiao Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 74d0ebc5e15e..0e8eac19daa4 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -589,7 +589,7 @@ struct sched_ext_ops { * Update @tg's weight to @weight. */ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); -#endif /* CONFIG_CGROUPS */ +#endif /* CONFIG_EXT_GROUP_SCHED */ /* * All online ops must come before ops.cpu_online(). 
From 161853a78bb2f304eb16f025ea08bd1bbe61516c Mon Sep 17 00:00:00 2001 From: Zhang Qiao Date: Thu, 26 Sep 2024 10:15:05 +0800 Subject: [PATCH 03/21] sched/ext: Use tg_cgroup() to elieminate duplicate code Use tg_cgroup() to eliminate duplicate code patterns in scx_bpf_task_cgroup(). Signed-off-by: Zhang Qiao Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 0e8eac19daa4..fe8acae35166 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -7130,15 +7130,8 @@ __bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) goto out; - /* - * A task_group may either be a cgroup or an autogroup. In the latter - * case, @tg->css.cgroup is %NULL. A task_group can't become the other - * kind once created. - */ - if (tg && tg->css.cgroup) - cgrp = tg->css.cgroup; - else - cgrp = &cgrp_dfl_root.cgrp; + cgrp = tg_cgrp(tg); + out: cgroup_get(cgrp); return cgrp; From ba1c9d327e7ffb7b931fc2b5209aae532ffee5a5 Mon Sep 17 00:00:00 2001 From: Tianchen Ding Date: Thu, 17 Oct 2024 10:44:12 +0800 Subject: [PATCH 04/21] sched_ext: Use btf_ids to resolve task_struct Save the searching time during bpf_scx_init. Signed-off-by: Tianchen Ding Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index fe8acae35166..06c45d6c5be1 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5324,7 +5324,6 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) extern struct btf *btf_vmlinux; static const struct btf_type *task_struct_type; -static u32 task_struct_type_id; static bool set_arg_maybe_null(const char *op, int arg_n, int off, int size, enum bpf_access_type type, @@ -5376,7 +5375,7 @@ static bool set_arg_maybe_null(const char *op, int arg_n, int off, int size, */ info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED; info->btf = btf_vmlinux; - info->btf_id = task_struct_type_id; + info->btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK]; return true; } @@ -5528,13 +5527,7 @@ static void bpf_scx_unreg(void *kdata, struct bpf_link *link) static int bpf_scx_init(struct btf *btf) { - s32 type_id; - - type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT); - if (type_id < 0) - return -EINVAL; - task_struct_type = btf_type_by_id(btf, type_id); - task_struct_type_id = type_id; + task_struct_type = btf_type_by_id(btf, btf_tracing_ids[BTF_TRACING_TYPE_TASK]); return 0; } From 21b8964826c4da3cc8ed89308f8838cfd221429f Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 18 Oct 2024 20:09:51 +0200 Subject: [PATCH 05/21] sched_ext: improve WAKE_SYNC behavior for default idle CPU selection In the sched_ext built-in idle CPU selection logic, when handling a WF_SYNC wakeup, we always attempt to migrate the task to the waker's CPU, as the waker is expected to yield the CPU after waking the task. However, it may be preferable to keep the task on its previous CPU if the waker's CPU is cache-affine. The same approach is also used by the fair class and in other scx schedulers, like scx_rusty and scx_bpfland. Therefore, apply the same logic to the built-in idle CPU selection policy as well. 
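For reference, the WF_SYNC handling added below boils down to one extra check ahead of the existing waker-CPU heuristic. A simplified sketch of the shape of the logic (the actual hunk also re-checks the waker's local DSQ and PF_EXITING):

    if (wake_flags & SCX_WAKE_SYNC) {
            cpu = smp_processor_id();

            /* keep @p where it is if prev_cpu shares cache with the waker and is idle */
            if (cpus_share_cache(cpu, prev_cpu) &&
                test_and_clear_cpu_idle(prev_cpu)) {
                    cpu = prev_cpu;
                    goto cpu_found;
            }
            /* otherwise fall back to waking @p on the waker's CPU when sensible */
    }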
Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 44 ++++++++++++++++++++++++++++++++------------ 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 06c45d6c5be1..a13a6461a290 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3132,20 +3132,40 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, } /* - * If WAKE_SYNC, the waker's local DSQ is empty, and the system is - * under utilized, wake up @p to the local DSQ of the waker. Checking - * only for an empty local DSQ is insufficient as it could give the - * wakee an unfair advantage when the system is oversaturated. - * Checking only for the presence of idle CPUs is also insufficient as - * the local DSQ of the waker could have tasks piled up on it even if - * there is an idle core elsewhere on the system. + * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. */ - cpu = smp_processor_id(); - if ((wake_flags & SCX_WAKE_SYNC) && - !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) && - cpu_rq(cpu)->scx.local_dsq.nr == 0) { - if (cpumask_test_cpu(cpu, p->cpus_ptr)) + if (wake_flags & SCX_WAKE_SYNC) { + cpu = smp_processor_id(); + + /* + * If the waker's CPU is cache affine and prev_cpu is idle, + * then avoid a migration. + */ + if (cpus_share_cache(cpu, prev_cpu) && + test_and_clear_cpu_idle(prev_cpu)) { + cpu = prev_cpu; goto cpu_found; + } + + /* + * If the waker's local DSQ is empty, and the system is under + * utilized, try to wake up @p to the local DSQ of the waker. + * + * Checking only for an empty local DSQ is insufficient as it + * could give the wakee an unfair advantage when the system is + * oversaturated. + * + * Checking only for the presence of idle CPUs is also + * insufficient as the local DSQ of the waker could have tasks + * piled up on it even if there is an idle core elsewhere on + * the system. + */ + if (!cpumask_empty(idle_masks.cpu) && + !(current->flags & PF_EXITING) && + cpu_rq(cpu)->scx.local_dsq.nr == 0) { + if (cpumask_test_cpu(cpu, p->cpus_ptr)) + goto cpu_found; + } } /* From b452ae4d20aefe96232440421faed37c9d3410a7 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 23 Oct 2024 13:19:07 +0200 Subject: [PATCH 06/21] sched_ext: Clarify ops.select_cpu() for single-CPU tasks Update ops.select_cpu() documentation to clarify that this method is not called for tasks that are restricted to run on a single CPU, as these tasks do not have the option to select a different CPU. Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index a13a6461a290..97231a65487f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -217,6 +217,11 @@ struct sched_ext_ops { * is dispatched, the ops.enqueue() callback will be skipped. Finally, * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the * local DSQ of whatever CPU is returned by this callback. + * + * Note that select_cpu() is never called for tasks that can only run + * on a single CPU or tasks with migration disabled, as they don't have + * the option to select a different CPU. See select_task_rq() for + * details. 
*/ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); From dfa4ed29b18c5f26cd311b0da7f049dbb2a2b33b Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Wed, 23 Oct 2024 01:47:18 +0200 Subject: [PATCH 07/21] sched_ext: Introduce LLC awareness to the default idle selection policy Rely on the scheduler topology information to implement basic LLC awareness in the sched_ext build-in idle selection policy. This allows schedulers using the built-in policy to make more informed decisions when selecting an idle CPU in systems with multiple LLCs, such as NUMA systems or chiplet-based architectures, and it helps keep tasks within the same LLC domain, thereby improving cache locality. For efficiency, LLC awareness is applied only to tasks that can run on all the CPUs in the system for now. If a task's affinity is modified from user space, it's the responsibility of user space to choose the appropriate optimized scheduling domain. Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 60 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 97231a65487f..d7ae816db6f2 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3124,9 +3124,39 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) goto retry; } +#ifdef CONFIG_SCHED_MC +/* + * Return the cpumask of CPUs usable by task @p in the same LLC domain of @cpu, + * or NULL if the LLC domain cannot be determined. + */ +static const struct cpumask *llc_domain(const struct task_struct *p, s32 cpu) +{ + struct sched_domain *sd = rcu_dereference(per_cpu(sd_llc, cpu)); + const struct cpumask *llc_cpus = sd ? sched_domain_span(sd) : NULL; + + /* + * Return the LLC domain only if the task is allowed to run on all + * CPUs. + */ + return p->nr_cpus_allowed == nr_cpu_ids ? llc_cpus : NULL; +} +#else /* CONFIG_SCHED_MC */ +static inline const struct cpumask *llc_domain(struct task_struct *p, s32 cpu) +{ + return NULL; +} +#endif /* CONFIG_SCHED_MC */ + +/* + * Built-in cpu idle selection policy. + * + * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because + * we never call ops.select_cpu() for them, see select_task_rq(). + */ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *found) { + const struct cpumask *llc_cpus = llc_domain(p, prev_cpu); s32 cpu; *found = false; @@ -3178,22 +3208,52 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, * partially idle @prev_cpu. */ if (sched_smt_active()) { + /* + * Keep using @prev_cpu if it's part of a fully idle core. + */ if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && test_and_clear_cpu_idle(prev_cpu)) { cpu = prev_cpu; goto cpu_found; } + /* + * Search for any fully idle core in the same LLC domain. + */ + if (llc_cpus) { + cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Search for any full idle core usable by the task. + */ cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); if (cpu >= 0) goto cpu_found; } + /* + * Use @prev_cpu if it's idle. + */ if (test_and_clear_cpu_idle(prev_cpu)) { cpu = prev_cpu; goto cpu_found; } + /* + * Search for any idle CPU in the same LLC domain. + */ + if (llc_cpus) { + cpu = scx_pick_idle_cpu(llc_cpus, 0); + if (cpu >= 0) + goto cpu_found; + } + + /* + * Search for any idle CPU usable by the task. 
+ */ cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); if (cpu >= 0) goto cpu_found; From cf583264d025d5c895bd9a364ae33622adececd7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Oct 2024 06:58:09 -1000 Subject: [PATCH 08/21] sched_ext: Rename CFI stubs to names that are recognized by BPF CFI stubs can be used to tag arguments with __nullable (and possibly other tags in the future) but for that to work the CFI stubs must have names that are recognized by BPF. Rename them. Signed-off-by: Tejun Heo Acked-by: David Vernet Acked-by: Martin KaFai Lau Acked-by: Alexei Starovoitov --- kernel/sched/ext.c | 132 ++++++++++++++++++++++----------------------- 1 file changed, 66 insertions(+), 66 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index d7ae816db6f2..00a05578d75a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5634,78 +5634,78 @@ static int bpf_scx_validate(void *kdata) return 0; } -static s32 select_cpu_stub(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } -static void enqueue_stub(struct task_struct *p, u64 enq_flags) {} -static void dequeue_stub(struct task_struct *p, u64 enq_flags) {} -static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {} -static void tick_stub(struct task_struct *p) {} -static void runnable_stub(struct task_struct *p, u64 enq_flags) {} -static void running_stub(struct task_struct *p) {} -static void stopping_stub(struct task_struct *p, bool runnable) {} -static void quiescent_stub(struct task_struct *p, u64 deq_flags) {} -static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; } -static bool core_sched_before_stub(struct task_struct *a, struct task_struct *b) { return false; } -static void set_weight_stub(struct task_struct *p, u32 weight) {} -static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {} -static void update_idle_stub(s32 cpu, bool idle) {} -static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) {} -static void cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) {} -static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } -static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} -static void enable_stub(struct task_struct *p) {} -static void disable_stub(struct task_struct *p) {} +static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } +static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {} +static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {} +static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *p) {} +static void sched_ext_ops__tick(struct task_struct *p) {} +static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {} +static void sched_ext_ops__running(struct task_struct *p) {} +static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {} +static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {} +static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to) { return false; } +static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; } +static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {} +static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {} +static void sched_ext_ops__update_idle(s32 cpu, bool idle) {} +static void 
sched_ext_ops__cpu_acquire(s32 cpu, struct scx_cpu_acquire_args *args) {} +static void sched_ext_ops__cpu_release(s32 cpu, struct scx_cpu_release_args *args) {} +static s32 sched_ext_ops__init_task(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } +static void sched_ext_ops__exit_task(struct task_struct *p, struct scx_exit_task_args *args) {} +static void sched_ext_ops__enable(struct task_struct *p) {} +static void sched_ext_ops__disable(struct task_struct *p) {} #ifdef CONFIG_EXT_GROUP_SCHED -static s32 cgroup_init_stub(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } -static void cgroup_exit_stub(struct cgroup *cgrp) {} -static s32 cgroup_prep_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } -static void cgroup_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} -static void cgroup_cancel_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} -static void cgroup_set_weight_stub(struct cgroup *cgrp, u32 weight) {} +static s32 sched_ext_ops__cgroup_init(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } +static void sched_ext_ops__cgroup_exit(struct cgroup *cgrp) {} +static s32 sched_ext_ops__cgroup_prep_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } +static void sched_ext_ops__cgroup_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} +static void sched_ext_ops__cgroup_cancel_move(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} +static void sched_ext_ops__cgroup_set_weight(struct cgroup *cgrp, u32 weight) {} #endif -static void cpu_online_stub(s32 cpu) {} -static void cpu_offline_stub(s32 cpu) {} -static s32 init_stub(void) { return -EINVAL; } -static void exit_stub(struct scx_exit_info *info) {} -static void dump_stub(struct scx_dump_ctx *ctx) {} -static void dump_cpu_stub(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {} -static void dump_task_stub(struct scx_dump_ctx *ctx, struct task_struct *p) {} +static void sched_ext_ops__cpu_online(s32 cpu) {} +static void sched_ext_ops__cpu_offline(s32 cpu) {} +static s32 sched_ext_ops__init(void) { return -EINVAL; } +static void sched_ext_ops__exit(struct scx_exit_info *info) {} +static void sched_ext_ops__dump(struct scx_dump_ctx *ctx) {} +static void sched_ext_ops__dump_cpu(struct scx_dump_ctx *ctx, s32 cpu, bool idle) {} +static void sched_ext_ops__dump_task(struct scx_dump_ctx *ctx, struct task_struct *p) {} static struct sched_ext_ops __bpf_ops_sched_ext_ops = { - .select_cpu = select_cpu_stub, - .enqueue = enqueue_stub, - .dequeue = dequeue_stub, - .dispatch = dispatch_stub, - .tick = tick_stub, - .runnable = runnable_stub, - .running = running_stub, - .stopping = stopping_stub, - .quiescent = quiescent_stub, - .yield = yield_stub, - .core_sched_before = core_sched_before_stub, - .set_weight = set_weight_stub, - .set_cpumask = set_cpumask_stub, - .update_idle = update_idle_stub, - .cpu_acquire = cpu_acquire_stub, - .cpu_release = cpu_release_stub, - .init_task = init_task_stub, - .exit_task = exit_task_stub, - .enable = enable_stub, - .disable = disable_stub, + .select_cpu = sched_ext_ops__select_cpu, + .enqueue = sched_ext_ops__enqueue, + .dequeue = sched_ext_ops__dequeue, + .dispatch = sched_ext_ops__dispatch, + .tick = sched_ext_ops__tick, + .runnable = sched_ext_ops__runnable, + .running = sched_ext_ops__running, + .stopping = sched_ext_ops__stopping, + .quiescent = sched_ext_ops__quiescent, + 
.yield = sched_ext_ops__yield, + .core_sched_before = sched_ext_ops__core_sched_before, + .set_weight = sched_ext_ops__set_weight, + .set_cpumask = sched_ext_ops__set_cpumask, + .update_idle = sched_ext_ops__update_idle, + .cpu_acquire = sched_ext_ops__cpu_acquire, + .cpu_release = sched_ext_ops__cpu_release, + .init_task = sched_ext_ops__init_task, + .exit_task = sched_ext_ops__exit_task, + .enable = sched_ext_ops__enable, + .disable = sched_ext_ops__disable, #ifdef CONFIG_EXT_GROUP_SCHED - .cgroup_init = cgroup_init_stub, - .cgroup_exit = cgroup_exit_stub, - .cgroup_prep_move = cgroup_prep_move_stub, - .cgroup_move = cgroup_move_stub, - .cgroup_cancel_move = cgroup_cancel_move_stub, - .cgroup_set_weight = cgroup_set_weight_stub, + .cgroup_init = sched_ext_ops__cgroup_init, + .cgroup_exit = sched_ext_ops__cgroup_exit, + .cgroup_prep_move = sched_ext_ops__cgroup_prep_move, + .cgroup_move = sched_ext_ops__cgroup_move, + .cgroup_cancel_move = sched_ext_ops__cgroup_cancel_move, + .cgroup_set_weight = sched_ext_ops__cgroup_set_weight, #endif - .cpu_online = cpu_online_stub, - .cpu_offline = cpu_offline_stub, - .init = init_stub, - .exit = exit_stub, - .dump = dump_stub, - .dump_cpu = dump_cpu_stub, - .dump_task = dump_task_stub, + .cpu_online = sched_ext_ops__cpu_online, + .cpu_offline = sched_ext_ops__cpu_offline, + .init = sched_ext_ops__init, + .exit = sched_ext_ops__exit, + .dump = sched_ext_ops__dump, + .dump_cpu = sched_ext_ops__dump_cpu, + .dump_task = sched_ext_ops__dump_task, }; static struct bpf_struct_ops bpf_sched_ext_ops = { From b7d0bbcf0c9828dd6974f8c1ccbdf2e13f3ad73f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 24 Oct 2024 06:58:09 -1000 Subject: [PATCH 09/21] sched_ext: Replace set_arg_maybe_null() with __nullable CFI stub tags ops.dispatch() and ops.yield() may be fed a NULL task_struct pointer. set_arg_maybe_null() is used to tell the verifier that they should be NULL checked before being dereferenced. BPF now has an a lot prettier way to express this - tagging arguments in CFI stubs with __nullable. Replace set_arg_maybe_null() with __nullable CFI stub tags. 
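From the BPF scheduler's point of view the contract is unchanged: an argument backed by a __nullable-tagged stub parameter is PTR_MAYBE_NULL and must be checked before being dereferenced, or the verifier rejects the program. A minimal sketch of a conforming ops.dispatch() implementation (BPF_STRUCT_OPS comes from the tools/sched_ext headers; SHARED_DSQ stands for a DSQ the scheduler created at init and is illustrative):

    void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
    {
            /*
             * @prev may be NULL, e.g. when the CPU was previously idle, so
             * it must be tested before any field access.
             */
            if (prev)
                    bpf_printk("cpu%d prev pid=%d", cpu, prev->pid);

            scx_bpf_consume(SHARED_DSQ);
    }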
Signed-off-by: Tejun Heo Acked-by: David Vernet Acked-by: Martin KaFai Lau Acked-by: Alexei Starovoitov --- kernel/sched/ext.c | 66 ++-------------------------------------------- 1 file changed, 2 insertions(+), 64 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 00a05578d75a..6705c2e67c99 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -5407,67 +5407,8 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) #include #include -extern struct btf *btf_vmlinux; static const struct btf_type *task_struct_type; -static bool set_arg_maybe_null(const char *op, int arg_n, int off, int size, - enum bpf_access_type type, - const struct bpf_prog *prog, - struct bpf_insn_access_aux *info) -{ - struct btf *btf = bpf_get_btf_vmlinux(); - const struct bpf_struct_ops_desc *st_ops_desc; - const struct btf_member *member; - const struct btf_type *t; - u32 btf_id, member_idx; - const char *mname; - - /* struct_ops op args are all sequential, 64-bit numbers */ - if (off != arg_n * sizeof(__u64)) - return false; - - /* btf_id should be the type id of struct sched_ext_ops */ - btf_id = prog->aux->attach_btf_id; - st_ops_desc = bpf_struct_ops_find(btf, btf_id); - if (!st_ops_desc) - return false; - - /* BTF type of struct sched_ext_ops */ - t = st_ops_desc->type; - - member_idx = prog->expected_attach_type; - if (member_idx >= btf_type_vlen(t)) - return false; - - /* - * Get the member name of this struct_ops program, which corresponds to - * a field in struct sched_ext_ops. For example, the member name of the - * dispatch struct_ops program (callback) is "dispatch". - */ - member = &btf_type_member(t)[member_idx]; - mname = btf_name_by_offset(btf_vmlinux, member->name_off); - - if (!strcmp(mname, op)) { - /* - * The value is a pointer to a type (struct task_struct) given - * by a BTF ID (PTR_TO_BTF_ID). It is trusted (PTR_TRUSTED), - * however, can be a NULL (PTR_MAYBE_NULL). The BPF program - * should check the pointer to make sure it is not NULL before - * using it, or the verifier will reject the program. - * - * Longer term, this is something that should be addressed by - * BTF, and be fully contained within the verifier. 
- */ - info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED; - info->btf = btf_vmlinux; - info->btf_id = btf_tracing_ids[BTF_TRACING_TYPE_TASK]; - - return true; - } - - return false; -} - static bool bpf_scx_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -5475,9 +5416,6 @@ static bool bpf_scx_is_valid_access(int off, int size, { if (type != BPF_READ) return false; - if (set_arg_maybe_null("dispatch", 1, off, size, type, prog, info) || - set_arg_maybe_null("yield", 1, off, size, type, prog, info)) - return true; if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) return false; if (off % size != 0) @@ -5637,13 +5575,13 @@ static int bpf_scx_validate(void *kdata) static s32 sched_ext_ops__select_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } static void sched_ext_ops__enqueue(struct task_struct *p, u64 enq_flags) {} static void sched_ext_ops__dequeue(struct task_struct *p, u64 enq_flags) {} -static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *p) {} +static void sched_ext_ops__dispatch(s32 prev_cpu, struct task_struct *prev__nullable) {} static void sched_ext_ops__tick(struct task_struct *p) {} static void sched_ext_ops__runnable(struct task_struct *p, u64 enq_flags) {} static void sched_ext_ops__running(struct task_struct *p) {} static void sched_ext_ops__stopping(struct task_struct *p, bool runnable) {} static void sched_ext_ops__quiescent(struct task_struct *p, u64 deq_flags) {} -static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to) { return false; } +static bool sched_ext_ops__yield(struct task_struct *from, struct task_struct *to__nullable) { return false; } static bool sched_ext_ops__core_sched_before(struct task_struct *a, struct task_struct *b) { return false; } static void sched_ext_ops__set_weight(struct task_struct *p, u32 weight) {} static void sched_ext_ops__set_cpumask(struct task_struct *p, const struct cpumask *mask) {} From 860a45219bce09d9ebac883cfcf9b5b0b8a8a999 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Tue, 29 Oct 2024 11:16:18 +0100 Subject: [PATCH 10/21] sched_ext: Introduce NUMA awareness to the default idle selection policy Similarly to commit dfa4ed29b18c ("sched_ext: Introduce LLC awareness to the default idle selection policy"), extend the built-in idle CPU selection policy to also prioritize CPUs within the same NUMA node. With this change applied, the built-in CPU idle selection policy follows this logic: - always prioritize CPUs from fully idle SMT cores, - select the same CPU if possible, - select a CPU within the same LLC domain, - select a CPU within the same NUMA node. Both NUMA and LLC awareness features are enabled only when the system has multiple NUMA nodes or multiple LLC domains. In the future, we may want to improve the NUMA node selection to account the node distance from prev_cpu. Currently, the logic only tries to keep tasks running on the same NUMA node. If all CPUs within a node are busy, the next NUMA node is chosen randomly. 
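Condensed, the fallback chain implemented below looks roughly like this for the fully-idle-core pass; the same cascade is then repeated without SCX_PICK_IDLE_CORE to accept any idle CPU (NULL checks on llc_cpus/numa_cpus, which are only set when the respective optimization applies, are omitted here):

    /* 1) prev_cpu, 2) same LLC, 3) same NUMA node, 4) anywhere allowed */
    if (cpumask_test_cpu(prev_cpu, idle_masks.smt) &&
        test_and_clear_cpu_idle(prev_cpu))
            return prev_cpu;

    cpu = scx_pick_idle_cpu(llc_cpus, SCX_PICK_IDLE_CORE);
    if (cpu < 0)
            cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE);
    if (cpu < 0)
            cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE);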
Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 136 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 119 insertions(+), 17 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 6705c2e67c99..fc7f15eefe54 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -870,6 +870,11 @@ static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); static DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); +#ifdef CONFIG_SMP +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_llc); +static DEFINE_STATIC_KEY_FALSE(scx_selcpu_topo_numa); +#endif + static struct static_key_false scx_has_op[SCX_OPI_END] = { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; @@ -3124,31 +3129,79 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) goto retry; } -#ifdef CONFIG_SCHED_MC /* - * Return the cpumask of CPUs usable by task @p in the same LLC domain of @cpu, - * or NULL if the LLC domain cannot be determined. + * Initialize topology-aware scheduling. + * + * Detect if the system has multiple LLC or multiple NUMA domains and enable + * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle + * selection policy. */ -static const struct cpumask *llc_domain(const struct task_struct *p, s32 cpu) +static void update_selcpu_topology(void) { - struct sched_domain *sd = rcu_dereference(per_cpu(sd_llc, cpu)); - const struct cpumask *llc_cpus = sd ? sched_domain_span(sd) : NULL; + bool enable_llc = false, enable_numa = false; + struct sched_domain *sd; + const struct cpumask *cpus; + s32 cpu = cpumask_first(cpu_online_mask); /* - * Return the LLC domain only if the task is allowed to run on all - * CPUs. + * We only need to check the NUMA node and LLC domain of the first + * available CPU to determine if they cover all CPUs. + * + * If all CPUs belong to the same NUMA node or share the same LLC + * domain, enabling NUMA or LLC optimizations is unnecessary. + * Otherwise, these optimizations can be enabled. */ - return p->nr_cpus_allowed == nr_cpu_ids ? llc_cpus : NULL; + rcu_read_lock(); + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (sd) { + cpus = sched_domain_span(sd); + if (cpumask_weight(cpus) < num_possible_cpus()) + enable_llc = true; + } + sd = highest_flag_domain(cpu, SD_NUMA); + if (sd) { + cpus = sched_group_span(sd->groups); + if (cpumask_weight(cpus) < num_possible_cpus()) + enable_numa = true; + } + rcu_read_unlock(); + + pr_debug("sched_ext: LLC idle selection %s\n", + enable_llc ? "enabled" : "disabled"); + pr_debug("sched_ext: NUMA idle selection %s\n", + enable_numa ? "enabled" : "disabled"); + + if (enable_llc) + static_branch_enable_cpuslocked(&scx_selcpu_topo_llc); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_llc); + if (enable_numa) + static_branch_enable_cpuslocked(&scx_selcpu_topo_numa); + else + static_branch_disable_cpuslocked(&scx_selcpu_topo_numa); } -#else /* CONFIG_SCHED_MC */ -static inline const struct cpumask *llc_domain(struct task_struct *p, s32 cpu) -{ - return NULL; -} -#endif /* CONFIG_SCHED_MC */ /* - * Built-in cpu idle selection policy. + * Built-in CPU idle selection policy: + * + * 1. Prioritize full-idle cores: + * - always prioritize CPUs from fully idle cores (both logical CPUs are + * idle) to avoid interference caused by SMT. + * + * 2. Reuse the same CPU: + * - prefer the last used CPU to take advantage of cached data (L1, L2) and + * branch prediction optimizations. + * + * 3. 
Pick a CPU within the same LLC (Last-Level Cache): + * - if the above conditions aren't met, pick a CPU that shares the same LLC + * to maintain cache locality. + * + * 4. Pick a CPU within the same NUMA node, if enabled: + * - choose a CPU from the same NUMA node to reduce memory access latency. + * + * Step 3 and 4 are performed only if the system has, respectively, multiple + * LLC domains / multiple NUMA nodes (see scx_selcpu_topo_llc and + * scx_selcpu_topo_numa). * * NOTE: tasks that can only run on 1 CPU are excluded by this logic, because * we never call ops.select_cpu() for them, see select_task_rq(). @@ -3156,7 +3209,8 @@ static inline const struct cpumask *llc_domain(struct task_struct *p, s32 cpu) static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *found) { - const struct cpumask *llc_cpus = llc_domain(p, prev_cpu); + const struct cpumask *llc_cpus = NULL; + const struct cpumask *numa_cpus = NULL; s32 cpu; *found = false; @@ -3166,6 +3220,30 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, return prev_cpu; } + /* + * Determine the scheduling domain only if the task is allowed to run + * on all CPUs. + * + * This is done primarily for efficiency, as it avoids the overhead of + * updating a cpumask every time we need to select an idle CPU (which + * can be costly in large SMP systems), but it also aligns logically: + * if a task's scheduling domain is restricted by user-space (through + * CPU affinity), the task will simply use the flat scheduling domain + * defined by user-space. + */ + if (p->nr_cpus_allowed >= num_possible_cpus()) { + if (static_branch_maybe(CONFIG_NUMA, &scx_selcpu_topo_numa)) + numa_cpus = cpumask_of_node(cpu_to_node(prev_cpu)); + + if (static_branch_maybe(CONFIG_SCHED_MC, &scx_selcpu_topo_llc)) { + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, prev_cpu)); + if (sd) + llc_cpus = sched_domain_span(sd); + } + } + /* * If WAKE_SYNC, try to migrate the wakee to the waker's CPU. */ @@ -3226,6 +3304,15 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, goto cpu_found; } + /* + * Search for any fully idle core in the same NUMA node. + */ + if (numa_cpus) { + cpu = scx_pick_idle_cpu(numa_cpus, SCX_PICK_IDLE_CORE); + if (cpu >= 0) + goto cpu_found; + } + /* * Search for any full idle core usable by the task. */ @@ -3251,6 +3338,15 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, goto cpu_found; } + /* + * Search for any idle CPU in the same NUMA node. + */ + if (numa_cpus) { + cpu = scx_pick_idle_cpu(numa_cpus, 0); + if (cpu >= 0) + goto cpu_found; + } + /* * Search for any idle CPU usable by the task. 
*/ @@ -3383,6 +3479,9 @@ static void handle_hotplug(struct rq *rq, bool online) atomic_long_inc(&scx_hotplug_seq); + if (scx_enabled()) + update_selcpu_topology(); + if (online && SCX_HAS_OP(cpu_online)) SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_online, cpu); else if (!online && SCX_HAS_OP(cpu_offline)) @@ -5202,6 +5301,9 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) static_branch_enable_cpuslocked(&scx_has_op[i]); check_hotplug_seq(ops); +#ifdef CONFIG_SMP + update_selcpu_topology(); +#endif cpus_read_unlock(); ret = validate_ops(ops); From f6ce6b949304bc7a54dbfea98402080c42bbc9a4 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 8 Nov 2024 01:01:36 +0100 Subject: [PATCH 11/21] sched_ext: Do not enable LLC/NUMA optimizations when domains overlap When the LLC and NUMA domains fully overlap, enabling both optimizations in the built-in idle CPU selection policy is redundant, as it leads to searching for an idle CPU within the same domain twice. Likewise, if all online CPUs are within a single LLC domain, LLC optimization is unnecessary. Therefore, detect overlapping domains and enable topology optimizations only when necessary. Moreover, rely on the online CPUs for this detection logic, instead of using the possible CPUs. Fixes: 860a45219bce ("sched_ext: Introduce NUMA awareness to the default idle selection policy") Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 85 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 72 insertions(+), 13 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index fc7f15eefe54..f154aaeb69e4 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3129,12 +3129,63 @@ static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) goto retry; } +/* + * Return true if the LLC domains do not perfectly overlap with the NUMA + * domains, false otherwise. + */ +static bool llc_numa_mismatch(void) +{ + int cpu; + + /* + * We need to scan all online CPUs to verify whether their scheduling + * domains overlap. + * + * While it is rare to encounter architectures with asymmetric NUMA + * topologies, CPU hotplugging or virtualized environments can result + * in asymmetric configurations. + * + * For example: + * + * NUMA 0: + * - LLC 0: cpu0..cpu7 + * - LLC 1: cpu8..cpu15 [offline] + * + * NUMA 1: + * - LLC 0: cpu16..cpu23 + * - LLC 1: cpu24..cpu31 + * + * In this case, if we only check the first online CPU (cpu0), we might + * incorrectly assume that the LLC and NUMA domains are fully + * overlapping, which is incorrect (as NUMA 1 has two distinct LLC + * domains). + */ + for_each_online_cpu(cpu) { + const struct cpumask *numa_cpus; + struct sched_domain *sd; + + sd = rcu_dereference(per_cpu(sd_llc, cpu)); + if (!sd) + return true; + + numa_cpus = cpumask_of_node(cpu_to_node(cpu)); + if (sd->span_weight != cpumask_weight(numa_cpus)) + return true; + } + + return false; +} + /* * Initialize topology-aware scheduling. * * Detect if the system has multiple LLC or multiple NUMA domains and enable * cache-aware / NUMA-aware scheduling optimizations in the default CPU idle * selection policy. + * + * Assumption: the kernel's internal topology representation assumes that each + * CPU belongs to a single LLC domain, and that each LLC domain is entirely + * contained within a single NUMA node. 
*/ static void update_selcpu_topology(void) { @@ -3144,26 +3195,34 @@ static void update_selcpu_topology(void) s32 cpu = cpumask_first(cpu_online_mask); /* - * We only need to check the NUMA node and LLC domain of the first - * available CPU to determine if they cover all CPUs. + * Enable LLC domain optimization only when there are multiple LLC + * domains among the online CPUs. If all online CPUs are part of a + * single LLC domain, the idle CPU selection logic can choose any + * online CPU without bias. * - * If all CPUs belong to the same NUMA node or share the same LLC - * domain, enabling NUMA or LLC optimizations is unnecessary. - * Otherwise, these optimizations can be enabled. + * Note that it is sufficient to check the LLC domain of the first + * online CPU to determine whether a single LLC domain includes all + * CPUs. */ rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_llc, cpu)); if (sd) { - cpus = sched_domain_span(sd); - if (cpumask_weight(cpus) < num_possible_cpus()) + if (sd->span_weight < num_online_cpus()) enable_llc = true; } - sd = highest_flag_domain(cpu, SD_NUMA); - if (sd) { - cpus = sched_group_span(sd->groups); - if (cpumask_weight(cpus) < num_possible_cpus()) - enable_numa = true; - } + + /* + * Enable NUMA optimization only when there are multiple NUMA domains + * among the online CPUs and the NUMA domains don't perfectly overlaps + * with the LLC domains. + * + * If all CPUs belong to the same NUMA node and the same LLC domain, + * enabling both NUMA and LLC optimizations is unnecessary, as checking + * for an idle CPU in the same domain twice is redundant. + */ + cpus = cpumask_of_node(cpu_to_node(cpu)); + if ((cpumask_weight(cpus) < num_online_cpus()) & llc_numa_mismatch()) + enable_numa = true; rcu_read_unlock(); pr_debug("sched_ext: LLC idle selection %s\n", From 6d594af5bff2e565cae538401142c69182026c38 Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Fri, 8 Nov 2024 20:51:44 +0100 Subject: [PATCH 12/21] sched_ext: Fix incorrect use of bitwise AND There is no reason to use a bitwise AND when checking the conditions to enable NUMA optimization for the built-in CPU idle selection policy, so use a logical AND instead. Fixes: f6ce6b949304 ("sched_ext: Do not enable LLC/NUMA optimizations when domains overlap") Reported-by: Nathan Chancellor Closes: https://lore.kernel.org/lkml/20241108181753.GA2681424@thelio-3990X/ Signed-off-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f154aaeb69e4..012a7fc77263 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3221,7 +3221,7 @@ static void update_selcpu_topology(void) * for an idle CPU in the same domain twice is redundant. */ cpus = cpumask_of_node(cpu_to_node(cpu)); - if ((cpumask_weight(cpus) < num_online_cpus()) & llc_numa_mismatch()) + if ((cpumask_weight(cpus) < num_online_cpus()) && llc_numa_mismatch()) enable_numa = true; rcu_read_unlock(); From 62dcbab8b0ef21729532600039fd514c09407092 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Nov 2024 11:48:11 -1000 Subject: [PATCH 13/21] sched_ext: Avoid live-locking bypass mode switching A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly banging on the same DSQ on a large NUMA system to the point where switching to the bypass mode can take a long time. 
Turning on the bypass mode requires dequeueing and re-enqueueing currently runnable tasks, if the DSQs that they are on are live-locked, this can take tens of seconds cascading into other failures. This was observed on 2 x Intel Sapphire Rapids machines with 224 logical CPUs. Inject artifical delays while the bypass mode is switching to guarantee timely completion. While at it, move __scx_ops_bypass_lock into scx_ops_bypass() and rename it to bypass_lock. Signed-off-by: Tejun Heo Reported-by: Valentin Andrei Reported-by: Patrick Lu --- kernel/sched/ext.c | 55 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 52 insertions(+), 3 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index c074a64c20f0..2d41f1917464 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -867,8 +867,8 @@ static DEFINE_MUTEX(scx_ops_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); +static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0); static int scx_ops_bypass_depth; -static DEFINE_RAW_SPINLOCK(__scx_ops_bypass_lock); static bool scx_ops_init_task_enabled; static bool scx_switching_all; DEFINE_STATIC_KEY_FALSE(__scx_switched_all); @@ -2474,10 +2474,47 @@ static struct rq *move_task_between_dsqs(struct task_struct *p, u64 enq_flags, return dst_rq; } +/* + * A poorly behaving BPF scheduler can live-lock the system by e.g. incessantly + * banging on the same DSQ on a large NUMA system to the point where switching + * to the bypass mode can take a long time. Inject artifical delays while the + * bypass mode is switching to guarantee timely completion. + */ +static void scx_ops_breather(struct rq *rq) +{ + u64 until; + + lockdep_assert_rq_held(rq); + + if (likely(!atomic_read(&scx_ops_breather_depth))) + return; + + raw_spin_rq_unlock(rq); + + until = ktime_get_ns() + NSEC_PER_MSEC; + + do { + int cnt = 1024; + while (atomic_read(&scx_ops_breather_depth) && --cnt) + cpu_relax(); + } while (atomic_read(&scx_ops_breather_depth) && + time_before64(ktime_get_ns(), until)); + + raw_spin_rq_lock(rq); +} + static bool consume_dispatch_q(struct rq *rq, struct scx_dispatch_q *dsq) { struct task_struct *p; retry: + /* + * This retry loop can repeatedly race against scx_ops_bypass() + * dequeueing tasks from @dsq trying to put the system into the bypass + * mode. On some multi-socket machines (e.g. 2x Intel 8480c), this can + * live-lock the machine into soft lockups. Give a breather. + */ + scx_ops_breather(rq); + /* * The caller can't expect to successfully consume a task if the task's * addition to @dsq isn't guaranteed to be visible somehow. Test @@ -4609,10 +4646,11 @@ bool task_should_scx(struct task_struct *p) */ static void scx_ops_bypass(bool bypass) { + static DEFINE_RAW_SPINLOCK(bypass_lock); int cpu; unsigned long flags; - raw_spin_lock_irqsave(&__scx_ops_bypass_lock, flags); + raw_spin_lock_irqsave(&bypass_lock, flags); if (bypass) { scx_ops_bypass_depth++; WARN_ON_ONCE(scx_ops_bypass_depth <= 0); @@ -4625,6 +4663,8 @@ static void scx_ops_bypass(bool bypass) goto unlock; } + atomic_inc(&scx_ops_breather_depth); + /* * No task property is changing. 
We just need to make sure all currently * queued tasks are re-queued according to the new scx_rq_bypassing() @@ -4680,8 +4720,10 @@ static void scx_ops_bypass(bool bypass) /* resched to restore ticks and idle state */ resched_cpu(cpu); } + + atomic_dec(&scx_ops_breather_depth); unlock: - raw_spin_unlock_irqrestore(&__scx_ops_bypass_lock, flags); + raw_spin_unlock_irqrestore(&bypass_lock, flags); } static void free_exit_info(struct scx_exit_info *ei) @@ -6334,6 +6376,13 @@ static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, raw_spin_rq_lock(src_rq); } + /* + * If the BPF scheduler keeps calling this function repeatedly, it can + * cause similar live-lock conditions as consume_dispatch_q(). Insert a + * breather if necessary. + */ + scx_ops_breather(src_rq); + locked_rq = src_rq; raw_spin_lock(&src_dsq->lock); From e32c260195e6ff72940ab7826e38e0a0066fc58f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 5 Nov 2024 11:49:04 -1000 Subject: [PATCH 14/21] sched_ext: Enable the ops breather and eject BPF scheduler on softlockup On 2 x Intel Sapphire Rapids machines with 224 logical CPUs, a poorly behaving BPF scheduler can live-lock the system by making multiple CPUs bang on the same DSQ to the point where soft-lockup detection triggers before SCX's own watchdog can take action. It also seems possible that the machine can be live-locked enough to prevent scx_ops_helper, which is an RT task, from running in a timely manner. Implement scx_softlockup() which is called when three quarters of soft-lockup threshold has passed. The function immediately enables the ops breather and triggers an ops error to initiate ejection of the BPF scheduler. The previous and this patch combined enable the kernel to reliably recover the system from live-lock conditions that can be triggered by a poorly behaving BPF scheduler on Intel dual socket systems. Signed-off-by: Tejun Heo Cc: Douglas Anderson Cc: Andrew Morton --- include/linux/sched/ext.h | 2 ++ kernel/sched/ext.c | 45 +++++++++++++++++++++++++++++++ kernel/watchdog.c | 8 ++++++ tools/sched_ext/scx_show_state.py | 2 ++ 4 files changed, 57 insertions(+) diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h index 1ddbde64a31b..65bc0a489cd2 100644 --- a/include/linux/sched/ext.h +++ b/include/linux/sched/ext.h @@ -205,11 +205,13 @@ struct sched_ext_entity { void sched_ext_free(struct task_struct *p); void print_scx_info(const char *log_lvl, struct task_struct *p); +void scx_softlockup(u32 dur_s); #else /* !CONFIG_SCHED_CLASS_EXT */ static inline void sched_ext_free(struct task_struct *p) {} static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} +static inline void scx_softlockup(u32 dur_s) {} #endif /* CONFIG_SCHED_CLASS_EXT */ #endif /* _LINUX_SCHED_EXT_H */ diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 2d41f1917464..02f39314ef8a 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -867,6 +867,7 @@ static DEFINE_MUTEX(scx_ops_enable_mutex); DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); +static unsigned long scx_in_softlockup; static atomic_t scx_ops_breather_depth = ATOMIC_INIT(0); static int scx_ops_bypass_depth; static bool scx_ops_init_task_enabled; @@ -4614,6 +4615,49 @@ bool task_should_scx(struct task_struct *p) return p->policy == SCHED_EXT; } +/** + * scx_softlockup - sched_ext softlockup handler + * + * On some multi-socket setups (e.g. 
2x Intel 8480c), the BPF scheduler can + * live-lock the system by making many CPUs target the same DSQ to the point + * where soft-lockup detection triggers. This function is called from + * soft-lockup watchdog when the triggering point is close and tries to unjam + * the system by enabling the breather and aborting the BPF scheduler. + */ +void scx_softlockup(u32 dur_s) +{ + switch (scx_ops_enable_state()) { + case SCX_OPS_ENABLING: + case SCX_OPS_ENABLED: + break; + default: + return; + } + + /* allow only one instance, cleared at the end of scx_ops_bypass() */ + if (test_and_set_bit(0, &scx_in_softlockup)) + return; + + printk_deferred(KERN_ERR "sched_ext: Soft lockup - CPU%d stuck for %us, disabling \"%s\"\n", + smp_processor_id(), dur_s, scx_ops.name); + + /* + * Some CPUs may be trapped in the dispatch paths. Enable breather + * immediately; otherwise, we might even be able to get to + * scx_ops_bypass(). + */ + atomic_inc(&scx_ops_breather_depth); + + scx_ops_error("soft lockup - CPU#%d stuck for %us", + smp_processor_id(), dur_s); +} + +static void scx_clear_softlockup(void) +{ + if (test_and_clear_bit(0, &scx_in_softlockup)) + atomic_dec(&scx_ops_breather_depth); +} + /** * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress * @@ -4724,6 +4768,7 @@ static void scx_ops_bypass(bool bypass) atomic_dec(&scx_ops_breather_depth); unlock: raw_spin_unlock_irqrestore(&bypass_lock, flags); + scx_clear_softlockup(); } static void free_exit_info(struct scx_exit_info *ei) diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 262691ba62b7..5a93d4c446b8 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -644,6 +644,14 @@ static int is_softlockup(unsigned long touch_ts, need_counting_irqs()) start_counting_irqs(); + /* + * A poorly behaving BPF scheduler can live-lock the system into + * soft lockups. Tell sched_ext to try ejecting the BPF + * scheduler when close to a soft lockup. + */ + if (time_after_eq(now, period_ts + get_softlockup_thresh() * 3 / 4)) + scx_softlockup(now - touch_ts); + /* Warn about unreasonable delays. */ if (time_after(now, period_ts + get_softlockup_thresh())) return now - touch_ts; diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py index c4b3fdda9a0b..b800d4f5f2e9 100644 --- a/tools/sched_ext/scx_show_state.py +++ b/tools/sched_ext/scx_show_state.py @@ -35,6 +35,8 @@ print(f'enabled : {read_static_key("__scx_ops_enabled")}') print(f'switching_all : {read_int("scx_switching_all")}') print(f'switched_all : {read_static_key("__scx_switched_all")}') print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})') +print(f'in_softlockup : {prog["scx_in_softlockup"].value_()}') +print(f'breather_depth: {read_atomic("scx_ops_breather_depth")}') print(f'bypass_depth : {prog["scx_ops_bypass_depth"].value_()}') print(f'nr_rejected : {read_atomic("scx_nr_rejected")}') print(f'enable_seq : {read_atomic("scx_enable_seq")}') From 153591f7036be19ddcdb102c34b9f3e876a236c7 Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Sat, 9 Nov 2024 11:25:14 +0900 Subject: [PATCH 15/21] sched_ext: Clarify sched_ext_ops table for userland scheduler Update the comments in sched_ext_ops to clarify this table is for a BPF scheduler and a userland scheduler should also rely on the sched_ext_ops table through the BPF scheduler. 
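Put differently, a userland scheduler still attaches through the same struct_ops table; the BPF side is a thin shim that forwards scheduling events to user space and applies the decisions it gets back. A rough sketch of such a shim's enqueue path, in the spirit of the scx_userland example (the ring buffer and struct names are illustrative):

    struct enqueued_task {
            s32 pid;
    };

    struct {
            __uint(type, BPF_MAP_TYPE_RINGBUF);
            __uint(max_entries, 4096);
    } enqueued SEC(".maps");

    void BPF_STRUCT_OPS(shim_enqueue, struct task_struct *p, u64 enq_flags)
    {
            struct enqueued_task *e;

            /* hand the enqueue decision off to the userland scheduler */
            e = bpf_ringbuf_reserve(&enqueued, sizeof(*e), 0);
            if (!e) {
                    /* user space can't keep up; fall back to the global DSQ */
                    scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
                    return;
            }

            e->pid = p->pid;
            bpf_ringbuf_submit(e, 0);
    }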
Signed-off-by: Changwoo Min Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 02f39314ef8a..f1a185ab5fbc 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -199,8 +199,10 @@ struct scx_dump_ctx { /** * struct sched_ext_ops - Operation table for BPF scheduler implementation * - * Userland can implement an arbitrary scheduling policy by implementing and - * loading operations in this table. + * A BPF scheduler can implement an arbitrary scheduling policy by + * implementing and loading operations in this table. Note that a userland + * scheduling policy can also be implemented using the BPF scheduler + * as a shim layer. */ struct sched_ext_ops { /** From f39489fea677ad78ca4ce1ab2d204a6639b868dc Mon Sep 17 00:00:00 2001 From: Changwoo Min Date: Sat, 9 Nov 2024 15:29:05 +0900 Subject: [PATCH 16/21] sched_ext: add a missing rcu_read_lock/unlock pair at scx_select_cpu_dfl() When getting an LLC CPU mask in the default CPU selection policy, scx_select_cpu_dfl(), a pointer to the sched_domain is dereferenced using rcu_read_lock() without holding rcu_read_lock(). Such an unprotected dereference often causes the following warning and can cause an invalid memory access in the worst case. Therefore, protect dereference of a sched_domain pointer using a pair of rcu_read_lock() and unlock(). [ 20.996135] ============================= [ 20.996345] WARNING: suspicious RCU usage [ 20.996563] 6.11.0-virtme #17 Tainted: G W [ 20.996576] ----------------------------- [ 20.996576] kernel/sched/ext.c:3323 suspicious rcu_dereference_check() usage! [ 20.996576] [ 20.996576] other info that might help us debug this: [ 20.996576] [ 20.996576] [ 20.996576] rcu_scheduler_active = 2, debug_locks = 1 [ 20.996576] 4 locks held by kworker/8:1/140: [ 20.996576] #0: ffff8b18c00dd348 ((wq_completion)pm){+.+.}-{0:0}, at: process_one_work+0x4a0/0x590 [ 20.996576] #1: ffffb3da01f67e58 ((work_completion)(&dev->power.work)){+.+.}-{0:0}, at: process_one_work+0x1ba/0x590 [ 20.996576] #2: ffffffffa316f9f0 (&rcu_state.gp_wq){..-.}-{2:2}, at: swake_up_one+0x15/0x60 [ 20.996576] #3: ffff8b1880398a60 (&p->pi_lock){-.-.}-{2:2}, at: try_to_wake_up+0x59/0x7d0 [ 20.996576] [ 20.996576] stack backtrace: [ 20.996576] CPU: 8 UID: 0 PID: 140 Comm: kworker/8:1 Tainted: G W 6.11.0-virtme #17 [ 20.996576] Tainted: [W]=WARN [ 20.996576] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Arch Linux 1.16.3-1-1 04/01/2014 [ 20.996576] Workqueue: pm pm_runtime_work [ 20.996576] Sched_ext: simple (disabling+all), task: runnable_at=-6ms [ 20.996576] Call Trace: [ 20.996576] [ 20.996576] dump_stack_lvl+0x6f/0xb0 [ 20.996576] lockdep_rcu_suspicious.cold+0x4e/0x96 [ 20.996576] scx_select_cpu_dfl+0x234/0x260 [ 20.996576] select_task_rq_scx+0xfb/0x190 [ 20.996576] select_task_rq+0x47/0x110 [ 20.996576] try_to_wake_up+0x110/0x7d0 [ 20.996576] swake_up_one+0x39/0x60 [ 20.996576] rcu_core+0xb08/0xe50 [ 20.996576] ? srso_alias_return_thunk+0x5/0xfbef5 [ 20.996576] ? 
mark_held_locks+0x40/0x70 [ 20.996576] handle_softirqs+0xd3/0x410 [ 20.996576] irq_exit_rcu+0x78/0xa0 [ 20.996576] sysvec_apic_timer_interrupt+0x73/0x80 [ 20.996576] [ 20.996576] [ 20.996576] asm_sysvec_apic_timer_interrupt+0x1a/0x20 [ 20.996576] RIP: 0010:_raw_spin_unlock_irqrestore+0x36/0x70 [ 20.996576] Code: f5 53 48 8b 74 24 10 48 89 fb 48 83 c7 18 e8 11 b4 36 ff 48 89 df e8 99 0d 37 ff f7 c5 00 02 00 00 75 17 9c 58 f6 c4 02 75 2b <65> ff 0d 5b 55 3c 5e 74 16 5b 5d e9 95 8e 28 00 e8 a5 ee 44 ff 9c [ 20.996576] RSP: 0018:ffffb3da01f67d20 EFLAGS: 00000246 [ 20.996576] RAX: 0000000000000002 RBX: ffffffffa4640220 RCX: 0000000000000040 [ 20.996576] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffffa1c7b27b [ 20.996576] RBP: 0000000000000246 R08: 0000000000000001 R09: 0000000000000000 [ 20.996576] R10: 0000000000000001 R11: 000000000000021c R12: 0000000000000246 [ 20.996576] R13: ffff8b1881363958 R14: 0000000000000000 R15: ffff8b1881363800 [ 20.996576] ? _raw_spin_unlock_irqrestore+0x4b/0x70 [ 20.996576] serial_port_runtime_resume+0xd4/0x1a0 [ 20.996576] ? __pfx_serial_port_runtime_resume+0x10/0x10 [ 20.996576] __rpm_callback+0x44/0x170 [ 20.996576] ? __pfx_serial_port_runtime_resume+0x10/0x10 [ 20.996576] rpm_callback+0x55/0x60 [ 20.996576] ? __pfx_serial_port_runtime_resume+0x10/0x10 [ 20.996576] rpm_resume+0x582/0x7b0 [ 20.996576] pm_runtime_work+0x7c/0xb0 [ 20.996576] process_one_work+0x1fb/0x590 [ 20.996576] worker_thread+0x18e/0x350 [ 20.996576] ? __pfx_worker_thread+0x10/0x10 [ 20.996576] kthread+0xe2/0x110 [ 20.996576] ? __pfx_kthread+0x10/0x10 [ 20.996576] ret_from_fork+0x34/0x50 [ 20.996576] ? __pfx_kthread+0x10/0x10 [ 20.996576] ret_from_fork_asm+0x1a/0x30 [ 20.996576] [ 21.056592] sched_ext: BPF scheduler "simple" disabled (unregistered from user space) Signed-off-by: Changwoo Min Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index f1a185ab5fbc..0c68ace02e4b 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3342,6 +3342,12 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, *found = false; + + /* + * This is necessary to protect llc_cpus. + */ + rcu_read_lock(); + /* * Determine the scheduling domain only if the task is allowed to run * on all CPUs. @@ -3476,9 +3482,12 @@ static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, if (cpu >= 0) goto cpu_found; + rcu_read_unlock(); return prev_cpu; cpu_found: + rcu_read_unlock(); + *found = true; return cpu; } From 72b85bf6a7f6f6c38c39a1e5b04bc1da1bf5016e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 9 Nov 2024 09:40:25 -1000 Subject: [PATCH 17/21] sched_ext: scx_bpf_dispatch_from_dsq_set_*() are allowed from unlocked context 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") added four kfuncs for dispatching while iterating. They are allowed from the dispatch and unlocked contexts but two of the kfuncs were only added in the dispatch section. Add missing declarations in the unlocked section. 
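These setters only make sense while iterating a DSQ, which is also possible from an unlocked context such as a BPF syscall program, hence the missing entries mattered. A usage sketch following the scx_qmap pattern (the DSQ IDs and the 5ms slice are illustrative):

    struct task_struct *p;

    bpf_for_each(scx_dsq, p, src_dsq_id, 0) {
            /* override the slice and vtime @p will carry into the new DSQ */
            scx_bpf_dispatch_from_dsq_set_slice(BPF_FOR_EACH_ITER, 5 * 1000 * 1000);
            scx_bpf_dispatch_from_dsq_set_vtime(BPF_FOR_EACH_ITER, p->scx.dsq_vtime);

            if (scx_bpf_dispatch_vtime_from_dsq(BPF_FOR_EACH_ITER, p, dst_dsq_id, 0))
                    break;
    }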
Signed-off-by: Tejun Heo Fixes: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") --- kernel/sched/ext.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 0c68ace02e4b..5dcb0c1c49ea 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6766,6 +6766,8 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_unlocked) BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) +BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime_from_dsq, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_unlocked) From cc26abb1a19adbb91b79d25a2e74976633ece429 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 11 Nov 2024 07:06:16 -1000 Subject: [PATCH 18/21] sched_ext: Rename scx_bpf_dispatch[_vtime]() to scx_bpf_dsq_insert[_vtime]() In sched_ext API, a repeatedly reported pain point is the overuse of the verb "dispatch" and confusion around "consume": - ops.dispatch() - scx_bpf_dispatch[_vtime]() - scx_bpf_consume() - scx_bpf_dispatch[_vtime]_from_dsq*() This overloading of the term is historical. Originally, there were only built-in DSQs and moving a task into a DSQ always dispatched it for execution. Using the verb "dispatch" for the kfuncs to move tasks into these DSQs made sense. Later, user DSQs were added and scx_bpf_dispatch[_vtime]() updated to be able to insert tasks into any DSQ. The only allowed DSQ to DSQ transfer was from a non-local DSQ to a local DSQ and this operation was named "consume". This was already confusing as a task could be dispatched to a user DSQ from ops.enqueue() and then the DSQ would have to be consumed in ops.dispatch(). Later addition of scx_bpf_dispatch_from_dsq*() made the confusion even worse as "dispatch" in this context meant moving a task to an arbitrary DSQ from a user DSQ. Clean up the API with the following renames: 1. scx_bpf_dispatch[_vtime]() -> scx_bpf_dsq_insert[_vtime]() 2. scx_bpf_consume() -> scx_bpf_dsq_move_to_local() 3. scx_bpf_dispatch[_vtime]_from_dsq*() -> scx_bpf_dsq_move[_vtime]*() This patch performs the first set of renames. Compatibility is maintained by: - The previous kfunc names are still provided by the kernel so that old binaries can run. Kernel generates a warning when the old names are used. - compat.bpf.h provides wrappers for the new names which automatically fall back to the old names when running on older kernels. They also trigger build error if old names are used for new builds. The compat features will be dropped after v6.15. v2: Documentation updates. Signed-off-by: Tejun Heo Acked-by: Andrea Righi Acked-by: Changwoo Min Acked-by: Johannes Bechberger Acked-by: Giovanni Gherdovich Cc: Dan Schatzberg Cc: Ming Yang --- Documentation/scheduler/sched-ext.rst | 50 +++++----- kernel/sched/ext.c | 111 +++++++++++++---------- tools/sched_ext/include/scx/common.bpf.h | 4 +- tools/sched_ext/include/scx/compat.bpf.h | 26 ++++++ tools/sched_ext/scx_central.bpf.c | 10 +- tools/sched_ext/scx_flatcg.bpf.c | 14 +-- tools/sched_ext/scx_qmap.bpf.c | 12 +-- tools/sched_ext/scx_simple.bpf.c | 14 +-- 8 files changed, 144 insertions(+), 97 deletions(-) diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst index 7b59bbd2e564..ee4af97701bf 100644 --- a/Documentation/scheduler/sched-ext.rst +++ b/Documentation/scheduler/sched-ext.rst @@ -130,7 +130,7 @@ optional. 
The following modified excerpt is from * Decide which CPU a task should be migrated to before being * enqueued (either at wakeup, fork time, or exec time). If an * idle core is found by the default ops.select_cpu() implementation, - * then dispatch the task directly to SCX_DSQ_LOCAL and skip the + * then insert the task directly into SCX_DSQ_LOCAL and skip the * ops.enqueue() callback. * * Note that this implementation has exactly the same behavior as the @@ -148,15 +148,15 @@ optional. The following modified excerpt is from cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &direct); if (direct) - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); return cpu; } /* - * Do a direct dispatch of a task to the global DSQ. This ops.enqueue() - * callback will only be invoked if we failed to find a core to dispatch - * to in ops.select_cpu() above. + * Do a direct insertion of a task to the global DSQ. This ops.enqueue() + * callback will only be invoked if we failed to find a core to insert + * into in ops.select_cpu() above. * * Note that this implementation has exactly the same behavior as the * default ops.enqueue implementation, which just dispatches the task @@ -166,7 +166,7 @@ optional. The following modified excerpt is from */ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) { - scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); } s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) @@ -202,7 +202,7 @@ and one local dsq per CPU (``SCX_DSQ_LOCAL``). The BPF scheduler can manage an arbitrary number of dsq's using ``scx_bpf_create_dsq()`` and ``scx_bpf_destroy_dsq()``. -A CPU always executes a task from its local DSQ. A task is "dispatched" to a +A CPU always executes a task from its local DSQ. A task is "inserted" into a DSQ. A non-local DSQ is "consumed" to transfer a task to the consuming CPU's local DSQ. @@ -229,26 +229,26 @@ The following briefly shows how a waking task is scheduled and executed. scheduler can wake up any cpu using the ``scx_bpf_kick_cpu()`` helper, using ``ops.select_cpu()`` judiciously can be simpler and more efficient. - A task can be immediately dispatched to a DSQ from ``ops.select_cpu()`` by - calling ``scx_bpf_dispatch()``. If the task is dispatched to - ``SCX_DSQ_LOCAL`` from ``ops.select_cpu()``, it will be dispatched to the + A task can be immediately inserted into a DSQ from ``ops.select_cpu()`` + by calling ``scx_bpf_dsq_insert()``. If the task is inserted into + ``SCX_DSQ_LOCAL`` from ``ops.select_cpu()``, it will be inserted into the local DSQ of whichever CPU is returned from ``ops.select_cpu()``. - Additionally, dispatching directly from ``ops.select_cpu()`` will cause the + Additionally, inserting directly from ``ops.select_cpu()`` will cause the ``ops.enqueue()`` callback to be skipped. Note that the scheduler core will ignore an invalid CPU selection, for example, if it's outside the allowed cpumask of the task. 2. Once the target CPU is selected, ``ops.enqueue()`` is invoked (unless the - task was dispatched directly from ``ops.select_cpu()``). ``ops.enqueue()`` + task was inserted directly from ``ops.select_cpu()``). 
``ops.enqueue()`` can make one of the following decisions: - * Immediately dispatch the task to either the global or local DSQ by - calling ``scx_bpf_dispatch()`` with ``SCX_DSQ_GLOBAL`` or + * Immediately insert the task into either the global or local DSQ by + calling ``scx_bpf_dsq_insert()`` with ``SCX_DSQ_GLOBAL`` or ``SCX_DSQ_LOCAL``, respectively. - * Immediately dispatch the task to a custom DSQ by calling - ``scx_bpf_dispatch()`` with a DSQ ID which is smaller than 2^63. + * Immediately insert the task into a custom DSQ by calling + ``scx_bpf_dsq_insert()`` with a DSQ ID which is smaller than 2^63. * Queue the task on the BPF side. @@ -257,11 +257,11 @@ The following briefly shows how a waking task is scheduled and executed. run, ``ops.dispatch()`` is invoked which can use the following two functions to populate the local DSQ. - * ``scx_bpf_dispatch()`` dispatches a task to a DSQ. Any target DSQ can - be used - ``SCX_DSQ_LOCAL``, ``SCX_DSQ_LOCAL_ON | cpu``, - ``SCX_DSQ_GLOBAL`` or a custom DSQ. While ``scx_bpf_dispatch()`` + * ``scx_bpf_dsq_insert()`` inserts a task to a DSQ. Any target DSQ can be + used - ``SCX_DSQ_LOCAL``, ``SCX_DSQ_LOCAL_ON | cpu``, + ``SCX_DSQ_GLOBAL`` or a custom DSQ. While ``scx_bpf_dsq_insert()`` currently can't be called with BPF locks held, this is being worked on - and will be supported. ``scx_bpf_dispatch()`` schedules dispatching + and will be supported. ``scx_bpf_dsq_insert()`` schedules insertion rather than performing them immediately. There can be up to ``ops.dispatch_max_batch`` pending tasks. @@ -288,12 +288,12 @@ built-in DSQs are used, there is no need to implement ``ops.dispatch()`` as a task is never queued on the BPF scheduler and both the local and global DSQs are consumed automatically. -``scx_bpf_dispatch()`` queues the task on the FIFO of the target DSQ. Use -``scx_bpf_dispatch_vtime()`` for the priority queue. Internal DSQs such as +``scx_bpf_dsq_insert()`` inserts the task on the FIFO of the target DSQ. Use +``scx_bpf_dsq_insert_vtime()`` for the priority queue. Internal DSQs such as ``SCX_DSQ_LOCAL`` and ``SCX_DSQ_GLOBAL`` do not support priority-queue -dispatching, and must be dispatched to with ``scx_bpf_dispatch()``. See the -function documentation and usage in ``tools/sched_ext/scx_simple.bpf.c`` for -more information. +dispatching, and must be dispatched to with ``scx_bpf_dsq_insert()``. See +the function documentation and usage in ``tools/sched_ext/scx_simple.bpf.c`` +for more information. Where to Look ============= diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 5dcb0c1c49ea..8c4f588f54e4 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -220,10 +220,10 @@ struct sched_ext_ops { * dispatch. While an explicit custom mechanism can be added, * select_cpu() serves as the default way to wake up idle CPUs. * - * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p - * is dispatched, the ops.enqueue() callback will be skipped. Finally, - * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the - * local DSQ of whatever CPU is returned by this callback. + * @p may be inserted into a DSQ directly by calling + * scx_bpf_dsq_insert(). If so, the ops.enqueue() will be skipped. + * Directly inserting into %SCX_DSQ_LOCAL will put @p in the local DSQ + * of the CPU returned by this operation. 
* * Note that select_cpu() is never called for tasks that can only run * on a single CPU or tasks with migration disabled, as they don't have @@ -237,12 +237,12 @@ struct sched_ext_ops { * @p: task being enqueued * @enq_flags: %SCX_ENQ_* * - * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch() - * or enqueue on the BPF scheduler. If not directly dispatched, the bpf - * scheduler owns @p and if it fails to dispatch @p, the task will - * stall. + * @p is ready to run. Insert directly into a DSQ by calling + * scx_bpf_dsq_insert() or enqueue on the BPF scheduler. If not directly + * inserted, the bpf scheduler owns @p and if it fails to dispatch @p, + * the task will stall. * - * If @p was dispatched from ops.select_cpu(), this callback is + * If @p was inserted into a DSQ from ops.select_cpu(), this callback is * skipped. */ void (*enqueue)(struct task_struct *p, u64 enq_flags); @@ -270,11 +270,11 @@ struct sched_ext_ops { * * Called when a CPU's local dsq is empty. The operation should dispatch * one or more tasks from the BPF scheduler into the DSQs using - * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using - * scx_bpf_consume(). + * scx_bpf_dsq_insert() and/or consume user DSQs into the local DSQ + * using scx_bpf_consume(). * - * The maximum number of times scx_bpf_dispatch() can be called without - * an intervening scx_bpf_consume() is specified by + * The maximum number of times scx_bpf_dsq_insert() can be called + * without an intervening scx_bpf_consume() is specified by * ops.dispatch_max_batch. See the comments on top of the two functions * for more details. * @@ -714,7 +714,7 @@ enum scx_enq_flags { /* * Set the following to trigger preemption when calling - * scx_bpf_dispatch() with a local dsq as the target. The slice of the + * scx_bpf_dsq_insert() with a local dsq as the target. The slice of the * current task is cleared to zero and the CPU is kicked into the * scheduling path. Implies %SCX_ENQ_HEAD. */ @@ -2322,7 +2322,7 @@ static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq, /* * We don't require the BPF scheduler to avoid dispatching to offline * CPUs mostly for convenience but also because CPUs can go offline - * between scx_bpf_dispatch() calls and here. Trigger error iff the + * between scx_bpf_dsq_insert() calls and here. Trigger error iff the * picked CPU is outside the allowed mask. */ if (!task_allowed_on_cpu(p, cpu)) { @@ -2658,7 +2658,7 @@ static void dispatch_to_local_dsq(struct rq *rq, struct scx_dispatch_q *dst_dsq, * Dispatching to local DSQs may need to wait for queueing to complete or * require rq lock dancing. As we don't wanna do either while inside * ops.dispatch() to avoid locking order inversion, we split dispatching into - * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the + * two parts. scx_bpf_dsq_insert() which is called by ops.dispatch() records the * task and its qseq. Once ops.dispatch() returns, this function is called to * finish up. * @@ -2690,7 +2690,7 @@ static void finish_dispatch(struct rq *rq, struct task_struct *p, /* * If qseq doesn't match, @p has gone through at least one * dispatch/dequeue and re-enqueue cycle between - * scx_bpf_dispatch() and here and we have no claim on it. + * scx_bpf_dsq_insert() and here and we have no claim on it. 
*/ if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) return; @@ -6258,7 +6258,7 @@ static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { .set = &scx_kfunc_ids_select_cpu, }; -static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) +static bool scx_dsq_insert_preamble(struct task_struct *p, u64 enq_flags) { if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) return false; @@ -6278,7 +6278,8 @@ static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) return true; } -static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags) +static void scx_dsq_insert_commit(struct task_struct *p, u64 dsq_id, + u64 enq_flags) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct task_struct *ddsp_task; @@ -6305,14 +6306,14 @@ static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags __bpf_kfunc_start_defs(); /** - * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ - * @p: task_struct to dispatch - * @dsq_id: DSQ to dispatch to + * scx_bpf_dsq_insert - Insert a task into the FIFO queue of a DSQ + * @p: task_struct to insert + * @dsq_id: DSQ to insert into * @slice: duration @p can run for in nsecs, 0 to keep the current value * @enq_flags: SCX_ENQ_* * - * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe - * to call this function spuriously. Can be called from ops.enqueue(), + * Insert @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe to + * call this function spuriously. Can be called from ops.enqueue(), * ops.select_cpu(), and ops.dispatch(). * * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch @@ -6321,14 +6322,14 @@ __bpf_kfunc_start_defs(); * ops.select_cpu() to be on the target CPU in the first place. * * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p - * will be directly dispatched to the corresponding dispatch queue after - * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be - * dispatched to the local DSQ of the CPU returned by ops.select_cpu(). + * will be directly inserted into the corresponding dispatch queue after + * ops.select_cpu() returns. If @p is inserted into SCX_DSQ_LOCAL, it will be + * inserted into the local DSQ of the CPU returned by ops.select_cpu(). * @enq_flags are OR'd with the enqueue flags on the enqueue path before the - * task is dispatched. + * task is inserted. * * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id - * and this function can be called upto ops.dispatch_max_batch times to dispatch + * and this function can be called upto ops.dispatch_max_batch times to insert * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. * @@ -6340,10 +6341,10 @@ __bpf_kfunc_start_defs(); * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with * scx_bpf_kick_cpu() to trigger scheduling. 
*/ -__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, - u64 enq_flags) +__bpf_kfunc void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, + u64 enq_flags) { - if (!scx_dispatch_preamble(p, enq_flags)) + if (!scx_dsq_insert_preamble(p, enq_flags)) return; if (slice) @@ -6351,30 +6352,38 @@ __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, else p->scx.slice = p->scx.slice ?: 1; - scx_dispatch_commit(p, dsq_id, enq_flags); + scx_dsq_insert_commit(p, dsq_id, enq_flags); +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, + u64 enq_flags) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()"); + scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags); } /** - * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ - * @p: task_struct to dispatch - * @dsq_id: DSQ to dispatch to + * scx_bpf_dsq_insert_vtime - Insert a task into the vtime priority queue of a DSQ + * @p: task_struct to insert + * @dsq_id: DSQ to insert into * @slice: duration @p can run for in nsecs, 0 to keep the current value * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ * @enq_flags: SCX_ENQ_* * - * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id. + * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id. * Tasks queued into the priority queue are ordered by @vtime and always * consumed after the tasks in the FIFO queue. All other aspects are identical - * to scx_bpf_dispatch(). + * to scx_bpf_dsq_insert(). * * @vtime ordering is according to time_before64() which considers wrapping. A * numerically larger vtime may indicate an earlier position in the ordering and * vice-versa. */ -__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, - u64 slice, u64 vtime, u64 enq_flags) +__bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, + u64 slice, u64 vtime, u64 enq_flags) { - if (!scx_dispatch_preamble(p, enq_flags)) + if (!scx_dsq_insert_preamble(p, enq_flags)) return; if (slice) @@ -6384,12 +6393,22 @@ __bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, p->scx.dsq_vtime = vtime; - scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); + scx_dsq_insert_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); +} + +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, + u64 slice, u64 vtime, u64 enq_flags) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()"); + scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags); } __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) +BTF_ID_FLAGS(func, scx_bpf_dsq_insert, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_insert_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) @@ -6527,9 +6546,9 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void) * to the current CPU's local DSQ for execution. Can only be called from * ops.dispatch(). * - * This function flushes the in-flight dispatches from scx_bpf_dispatch() before - * trying to consume the specified DSQ. It may also grab rq locks and thus can't - * be called under any BPF locks. 
+ * This function flushes the in-flight dispatches from scx_bpf_dsq_insert() + * before trying to consume the specified DSQ. It may also grab rq locks and + * thus can't be called under any BPF locks. * * Returns %true if a task has been consumed, %false if there isn't any task to * consume. @@ -6650,7 +6669,7 @@ __bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, * scx_bpf_dispatch_from_dsq_set_vtime() to update. * * All other aspects are identical to scx_bpf_dispatch_from_dsq(). See - * scx_bpf_dispatch_vtime() for more information on @vtime. + * scx_bpf_dsq_insert_vtime() for more information on @vtime. */ __bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 248ab790d143..16c7100c69e4 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -36,8 +36,8 @@ static inline void ___vmlinux_h_sanity_check___(void) s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; -void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; -void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym; +void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; +void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; bool scx_bpf_consume(u64 dsq_id) __ksym; diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index e5afe9efd3f3..183aa4c6f674 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -34,6 +34,32 @@ (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq) ? \ scx_bpf_dispatch_vtime_from_dsq((it), (p), (dsq_id), (enq_flags)) : false) +/* + * v6.13: The verb `dispatch` was too overloaded and confusing. kfuncs are + * renamed to unload the verb. + * + * Build error is triggered if old names are used. New binaries work with both + * new and old names. The compat macros will be removed on v6.15 release. + */ +void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; +void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; + +#define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \ + (bpf_ksym_exists(scx_bpf_dsq_insert) ? \ + scx_bpf_dsq_insert((p), (dsq_id), (slice), (enq_flags)) : \ + scx_bpf_dispatch___compat((p), (dsq_id), (slice), (enq_flags))) + +#define scx_bpf_dsq_insert_vtime(p, dsq_id, slice, vtime, enq_flags) \ + (bpf_ksym_exists(scx_bpf_dsq_insert_vtime) ? \ + scx_bpf_dsq_insert_vtime((p), (dsq_id), (slice), (vtime), (enq_flags)) : \ + scx_bpf_dispatch_vtime___compat((p), (dsq_id), (slice), (vtime), (enq_flags))) + +#define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \ + _Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()") + +#define scx_bpf_dispatch_vtime(p, dsq_id, slice, vtime, enq_flags) \ + _Static_assert(false, "scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()") + /* * Define sched_ext_ops. 
This may be expanded to define multiple variants for * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 8dd8eb73b6b8..153d9656c75f 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -118,14 +118,14 @@ void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) */ if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { __sync_fetch_and_add(&nr_locals, 1); - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, - enq_flags | SCX_ENQ_PREEMPT); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, + enq_flags | SCX_ENQ_PREEMPT); return; } if (bpf_map_push_elem(¢ral_q, &pid, 0)) { __sync_fetch_and_add(&nr_overflows, 1); - scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags); + scx_bpf_dsq_insert(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags); return; } @@ -158,7 +158,7 @@ static bool dispatch_to_cpu(s32 cpu) */ if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { __sync_fetch_and_add(&nr_mismatches, 1); - scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); + scx_bpf_dsq_insert(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); bpf_task_release(p); /* * We might run out of dispatch buffer slots if we continue dispatching @@ -172,7 +172,7 @@ static bool dispatch_to_cpu(s32 cpu) } /* dispatch to local and mark that @cpu doesn't need more */ - scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0); if (cpu != central_cpu) scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index b722baf6da4b..4dcb6c747b46 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -341,7 +341,7 @@ s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake if (is_idle) { set_bypassed_at(p, taskc); stat_inc(FCG_STAT_LOCAL); - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); } return cpu; @@ -377,10 +377,12 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) */ if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) { stat_inc(FCG_STAT_LOCAL); - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, + enq_flags); } else { stat_inc(FCG_STAT_GLOBAL); - scx_bpf_dispatch(p, FALLBACK_DSQ, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, FALLBACK_DSQ, SCX_SLICE_DFL, + enq_flags); } return; } @@ -391,7 +393,7 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) goto out_release; if (fifo_sched) { - scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags); } else { u64 tvtime = p->scx.dsq_vtime; @@ -402,8 +404,8 @@ void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL)) tvtime = cgc->tvtime_now - SCX_SLICE_DFL; - scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL, - tvtime, enq_flags); + scx_bpf_dsq_insert_vtime(p, cgrp->kn->id, SCX_SLICE_DFL, + tvtime, enq_flags); } cgrp_enqueued(cgrp, cgc); diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 5d1f880d1149..65c52cc4fffb 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -226,7 +226,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) */ if 
(tctx->force_local) { tctx->force_local = false; - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); return; } @@ -234,7 +234,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) if (!(enq_flags & SCX_ENQ_CPU_SELECTED) && (cpu = pick_direct_dispatch_cpu(p, scx_bpf_task_cpu(p))) >= 0) { __sync_fetch_and_add(&nr_ddsp_from_enq, 1); - scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | cpu, slice_ns, enq_flags); return; } @@ -247,7 +247,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) if (enq_flags & SCX_ENQ_REENQ) { s32 cpu; - scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags); + scx_bpf_dsq_insert(p, SHARED_DSQ, 0, enq_flags); cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); if (cpu >= 0) scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE); @@ -262,7 +262,7 @@ void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */ if (bpf_map_push_elem(ring, &pid, 0)) { - scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags); + scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, enq_flags); return; } @@ -385,7 +385,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) */ p = bpf_task_from_pid(2); if (p) { - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_ns, 0); bpf_task_release(p); return; } @@ -431,7 +431,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) update_core_sched_head_seq(p); __sync_fetch_and_add(&nr_dispatched, 1); - scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0); + scx_bpf_dsq_insert(p, SHARED_DSQ, slice_ns, 0); bpf_task_release(p); batch--; diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c index ed7e8d535fc5..ce86a4a690b5 100644 --- a/tools/sched_ext/scx_simple.bpf.c +++ b/tools/sched_ext/scx_simple.bpf.c @@ -31,10 +31,10 @@ UEI_DEFINE(uei); /* * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues - * (meaning, cannot be dispatched to with scx_bpf_dispatch_vtime()). We + * (meaning, cannot be dispatched to with scx_bpf_dsq_insert_vtime()). We * therefore create a separate DSQ with ID 0 that we dispatch to and consume - * from. If scx_simple only supported global FIFO scheduling, then we could - * just use SCX_DSQ_GLOBAL. + * from. If scx_simple only supported global FIFO scheduling, then we could just + * use SCX_DSQ_GLOBAL. 
*/ #define SHARED_DSQ 0 @@ -65,7 +65,7 @@ s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 w cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); if (is_idle) { stat_inc(0); /* count local queueing */ - scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); + scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); } return cpu; @@ -76,7 +76,7 @@ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) stat_inc(1); /* count global queueing */ if (fifo_sched) { - scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); + scx_bpf_dsq_insert(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); } else { u64 vtime = p->scx.dsq_vtime; @@ -87,8 +87,8 @@ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) vtime = vtime_now - SCX_SLICE_DFL; - scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, - enq_flags); + scx_bpf_dsq_insert_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, + enq_flags); } } From 5209c03c8ed215357a4827496a71fd32167d83ef Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 11 Nov 2024 07:06:16 -1000 Subject: [PATCH 19/21] sched_ext: Rename scx_bpf_consume() to scx_bpf_dsq_move_to_local() In sched_ext API, a repeatedly reported pain point is the overuse of the verb "dispatch" and confusion around "consume": - ops.dispatch() - scx_bpf_dispatch[_vtime]() - scx_bpf_consume() - scx_bpf_dispatch[_vtime]_from_dsq*() This overloading of the term is historical. Originally, there were only built-in DSQs and moving a task into a DSQ always dispatched it for execution. Using the verb "dispatch" for the kfuncs to move tasks into these DSQs made sense. Later, user DSQs were added and scx_bpf_dispatch[_vtime]() updated to be able to insert tasks into any DSQ. The only allowed DSQ to DSQ transfer was from a non-local DSQ to a local DSQ and this operation was named "consume". This was already confusing as a task could be dispatched to a user DSQ from ops.enqueue() and then the DSQ would have to be consumed in ops.dispatch(). Later addition of scx_bpf_dispatch_from_dsq*() made the confusion even worse as "dispatch" in this context meant moving a task to an arbitrary DSQ from a user DSQ. Clean up the API with the following renames: 1. scx_bpf_dispatch[_vtime]() -> scx_bpf_dsq_insert[_vtime]() 2. scx_bpf_consume() -> scx_bpf_dsq_move_to_local() 3. scx_bpf_dispatch[_vtime]_from_dsq*() -> scx_bpf_dsq_move[_vtime]*() This patch performs the second rename. Compatibility is maintained by: - The previous kfunc names are still provided by the kernel so that old binaries can run. Kernel generates a warning when the old names are used. - compat.bpf.h provides wrappers for the new names which automatically fall back to the old names when running on older kernels. They also trigger build error if old names are used for new builds. The compat features will be dropped after v6.15. v2: Comment and documentation updates. 
Signed-off-by: Tejun Heo Acked-by: Andrea Righi Acked-by: Changwoo Min Acked-by: Johannes Bechberger Acked-by: Giovanni Gherdovich Cc: Dan Schatzberg Cc: Ming Yang --- Documentation/scheduler/sched-ext.rst | 21 ++++++----- kernel/sched/ext.c | 45 +++++++++++++++--------- tools/sched_ext/include/scx/common.bpf.h | 2 +- tools/sched_ext/include/scx/compat.bpf.h | 11 ++++++ tools/sched_ext/scx_central.bpf.c | 4 +-- tools/sched_ext/scx_flatcg.bpf.c | 6 ++-- tools/sched_ext/scx_qmap.bpf.c | 4 +-- tools/sched_ext/scx_simple.bpf.c | 2 +- 8 files changed, 58 insertions(+), 37 deletions(-) diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst index ee4af97701bf..6cb8b676ce03 100644 --- a/Documentation/scheduler/sched-ext.rst +++ b/Documentation/scheduler/sched-ext.rst @@ -203,13 +203,12 @@ an arbitrary number of dsq's using ``scx_bpf_create_dsq()`` and ``scx_bpf_destroy_dsq()``. A CPU always executes a task from its local DSQ. A task is "inserted" into a -DSQ. A non-local DSQ is "consumed" to transfer a task to the consuming CPU's -local DSQ. +DSQ. A task in a non-local DSQ is "move"d into the target CPU's local DSQ. When a CPU is looking for the next task to run, if the local DSQ is not -empty, the first task is picked. Otherwise, the CPU tries to consume the -global DSQ. If that doesn't yield a runnable task either, ``ops.dispatch()`` -is invoked. +empty, the first task is picked. Otherwise, the CPU tries to move a task +from the global DSQ. If that doesn't yield a runnable task either, +``ops.dispatch()`` is invoked. Scheduling Cycle ---------------- @@ -265,15 +264,15 @@ The following briefly shows how a waking task is scheduled and executed. rather than performing them immediately. There can be up to ``ops.dispatch_max_batch`` pending tasks. - * ``scx_bpf_consume()`` tranfers a task from the specified non-local DSQ - to the dispatching DSQ. This function cannot be called with any BPF - locks held. ``scx_bpf_consume()`` flushes the pending dispatched tasks - before trying to consume the specified DSQ. + * ``scx_bpf_move_to_local()`` moves a task from the specified non-local + DSQ to the dispatching DSQ. This function cannot be called with any BPF + locks held. ``scx_bpf_move_to_local()`` flushes the pending insertions + tasks before trying to move from the specified DSQ. 4. After ``ops.dispatch()`` returns, if there are tasks in the local DSQ, the CPU runs the first one. If empty, the following steps are taken: - * Try to consume the global DSQ. If successful, run the task. + * Try to move from the global DSQ. If successful, run the task. * If ``ops.dispatch()`` has dispatched any tasks, retry #3. @@ -286,7 +285,7 @@ Note that the BPF scheduler can always choose to dispatch tasks immediately in ``ops.enqueue()`` as illustrated in the above simple example. If only the built-in DSQs are used, there is no need to implement ``ops.dispatch()`` as a task is never queued on the BPF scheduler and both the local and global -DSQs are consumed automatically. +DSQs are executed automatically. ``scx_bpf_dsq_insert()`` inserts the task on the FIFO of the target DSQ. Use ``scx_bpf_dsq_insert_vtime()`` for the priority queue. 
Internal DSQs such as diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 8c4f588f54e4..929475f9b1cd 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -264,17 +264,17 @@ struct sched_ext_ops { void (*dequeue)(struct task_struct *p, u64 deq_flags); /** - * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs + * dispatch - Dispatch tasks from the BPF scheduler and/or user DSQs * @cpu: CPU to dispatch tasks for * @prev: previous task being switched out * * Called when a CPU's local dsq is empty. The operation should dispatch * one or more tasks from the BPF scheduler into the DSQs using - * scx_bpf_dsq_insert() and/or consume user DSQs into the local DSQ - * using scx_bpf_consume(). + * scx_bpf_dsq_insert() and/or move from user DSQs into the local DSQ + * using scx_bpf_dsq_move_to_local(). * * The maximum number of times scx_bpf_dsq_insert() can be called - * without an intervening scx_bpf_consume() is specified by + * without an intervening scx_bpf_dsq_move_to_local() is specified by * ops.dispatch_max_batch. See the comments on top of the two functions * for more details. * @@ -282,7 +282,7 @@ struct sched_ext_ops { * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in * @prev->scx.flags, it is not enqueued yet and will be enqueued after * ops.dispatch() returns. To keep executing @prev, return without - * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST. + * dispatching or moving any tasks. Also see %SCX_OPS_ENQ_LAST. */ void (*dispatch)(s32 cpu, struct task_struct *prev); @@ -6372,13 +6372,17 @@ __bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, * @enq_flags: SCX_ENQ_* * * Insert @p into the vtime priority queue of the DSQ identified by @dsq_id. - * Tasks queued into the priority queue are ordered by @vtime and always - * consumed after the tasks in the FIFO queue. All other aspects are identical - * to scx_bpf_dsq_insert(). + * Tasks queued into the priority queue are ordered by @vtime. All other aspects + * are identical to scx_bpf_dsq_insert(). * * @vtime ordering is according to time_before64() which considers wrapping. A * numerically larger vtime may indicate an earlier position in the ordering and * vice-versa. + * + * A DSQ can only be used as a FIFO or priority queue at any given time and this + * function must not be called on a DSQ which already has one or more FIFO tasks + * queued and vice-versa. Also, the built-in DSQs (SCX_DSQ_LOCAL and + * SCX_DSQ_GLOBAL) cannot be used as priority queues. */ __bpf_kfunc void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) @@ -6539,21 +6543,20 @@ __bpf_kfunc void scx_bpf_dispatch_cancel(void) } /** - * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ - * @dsq_id: DSQ to consume + * scx_bpf_dsq_move_to_local - move a task from a DSQ to the current CPU's local DSQ + * @dsq_id: DSQ to move task from * - * Consume a task from the non-local DSQ identified by @dsq_id and transfer it - * to the current CPU's local DSQ for execution. Can only be called from - * ops.dispatch(). + * Move a task from the non-local DSQ identified by @dsq_id to the current CPU's + * local DSQ for execution. Can only be called from ops.dispatch(). * * This function flushes the in-flight dispatches from scx_bpf_dsq_insert() - * before trying to consume the specified DSQ. It may also grab rq locks and + * before trying to move from the specified DSQ. 
It may also grab rq locks and * thus can't be called under any BPF locks. * - * Returns %true if a task has been consumed, %false if there isn't any task to - * consume. + * Returns %true if a task has been moved, %false if there isn't any task to + * move. */ -__bpf_kfunc bool scx_bpf_consume(u64 dsq_id) +__bpf_kfunc bool scx_bpf_dsq_move_to_local(u64 dsq_id) { struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); struct scx_dispatch_q *dsq; @@ -6583,6 +6586,13 @@ __bpf_kfunc bool scx_bpf_consume(u64 dsq_id) } } +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc bool scx_bpf_consume(u64 dsq_id) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); + return scx_bpf_dsq_move_to_local(dsq_id); +} + /** * scx_bpf_dispatch_from_dsq_set_slice - Override slice when dispatching from DSQ * @it__iter: DSQ iterator in progress @@ -6684,6 +6694,7 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_dispatch) BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local) BTF_ID_FLAGS(func, scx_bpf_consume) BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 16c7100c69e4..535377649a22 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -40,7 +40,7 @@ void scx_bpf_dsq_insert(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_fl void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; -bool scx_bpf_consume(u64 dsq_id) __ksym; +bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym; void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index 183aa4c6f674..c00ed24a8721 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -43,6 +43,7 @@ */ void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; +bool scx_bpf_consume___compat(u64 dsq_id) __ksym __weak; #define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \ (bpf_ksym_exists(scx_bpf_dsq_insert) ? \ @@ -54,12 +55,22 @@ void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slic scx_bpf_dsq_insert_vtime((p), (dsq_id), (slice), (vtime), (enq_flags)) : \ scx_bpf_dispatch_vtime___compat((p), (dsq_id), (slice), (vtime), (enq_flags))) +#define scx_bpf_dsq_move_to_local(dsq_id) \ + (bpf_ksym_exists(scx_bpf_dsq_move_to_local) ? 
\ + scx_bpf_dsq_move_to_local((dsq_id)) : \ + scx_bpf_consume___compat((dsq_id))) + #define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \ _Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()") #define scx_bpf_dispatch_vtime(p, dsq_id, slice, vtime, enq_flags) \ _Static_assert(false, "scx_bpf_dispatch_vtime() renamed to scx_bpf_dsq_insert_vtime()") +#define scx_bpf_consume(dsq_id) ({ \ + _Static_assert(false, "scx_bpf_consume() renamed to scx_bpf_dsq_move_to_local()"); \ + false; \ +}) + /* * Define sched_ext_ops. This may be expanded to define multiple variants for * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c index 153d9656c75f..e6fad6211f6c 100644 --- a/tools/sched_ext/scx_central.bpf.c +++ b/tools/sched_ext/scx_central.bpf.c @@ -219,13 +219,13 @@ void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) } /* look for a task to run on the central CPU */ - if (scx_bpf_consume(FALLBACK_DSQ_ID)) + if (scx_bpf_dsq_move_to_local(FALLBACK_DSQ_ID)) return; dispatch_to_cpu(central_cpu); } else { bool *gimme; - if (scx_bpf_consume(FALLBACK_DSQ_ID)) + if (scx_bpf_dsq_move_to_local(FALLBACK_DSQ_ID)) return; gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index 4dcb6c747b46..4e3afcd260bf 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -665,7 +665,7 @@ static bool try_pick_next_cgroup(u64 *cgidp) goto out_free; } - if (!scx_bpf_consume(cgid)) { + if (!scx_bpf_dsq_move_to_local(cgid)) { bpf_cgroup_release(cgrp); stat_inc(FCG_STAT_PNC_EMPTY); goto out_stash; @@ -745,7 +745,7 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) goto pick_next_cgroup; if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) { - if (scx_bpf_consume(cpuc->cur_cgid)) { + if (scx_bpf_dsq_move_to_local(cpuc->cur_cgid)) { stat_inc(FCG_STAT_CNS_KEEP); return; } @@ -785,7 +785,7 @@ void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) pick_next_cgroup: cpuc->cur_at = now; - if (scx_bpf_consume(FALLBACK_DSQ)) { + if (scx_bpf_dsq_move_to_local(FALLBACK_DSQ)) { cpuc->cur_cgid = 0; return; } diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 65c52cc4fffb..08f121252c56 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -374,7 +374,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) if (dispatch_highpri(false)) return; - if (!nr_highpri_queued && scx_bpf_consume(SHARED_DSQ)) + if (!nr_highpri_queued && scx_bpf_dsq_move_to_local(SHARED_DSQ)) return; if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { @@ -439,7 +439,7 @@ void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) if (!batch || !scx_bpf_dispatch_nr_slots()) { if (dispatch_highpri(false)) return; - scx_bpf_consume(SHARED_DSQ); + scx_bpf_dsq_move_to_local(SHARED_DSQ); return; } if (!cpuc->dsp_cnt) diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c index ce86a4a690b5..31f915b286c6 100644 --- a/tools/sched_ext/scx_simple.bpf.c +++ b/tools/sched_ext/scx_simple.bpf.c @@ -94,7 +94,7 @@ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev) { - scx_bpf_consume(SHARED_DSQ); + scx_bpf_dsq_move_to_local(SHARED_DSQ); } void BPF_STRUCT_OPS(simple_running, struct 
task_struct *p) From 5cbb302880f50f3edf35f8c6a1d38b6948bf4d11 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 11 Nov 2024 07:06:16 -1000 Subject: [PATCH 20/21] sched_ext: Rename scx_bpf_dispatch[_vtime]_from_dsq*() -> scx_bpf_dsq_move[_vtime]*() In sched_ext API, a repeatedly reported pain point is the overuse of the verb "dispatch" and confusion around "consume": - ops.dispatch() - scx_bpf_dispatch[_vtime]() - scx_bpf_consume() - scx_bpf_dispatch[_vtime]_from_dsq*() This overloading of the term is historical. Originally, there were only built-in DSQs and moving a task into a DSQ always dispatched it for execution. Using the verb "dispatch" for the kfuncs to move tasks into these DSQs made sense. Later, user DSQs were added and scx_bpf_dispatch[_vtime]() updated to be able to insert tasks into any DSQ. The only allowed DSQ to DSQ transfer was from a non-local DSQ to a local DSQ and this operation was named "consume". This was already confusing as a task could be dispatched to a user DSQ from ops.enqueue() and then the DSQ would have to be consumed in ops.dispatch(). Later addition of scx_bpf_dispatch_from_dsq*() made the confusion even worse as "dispatch" in this context meant moving a task to an arbitrary DSQ from a user DSQ. Clean up the API with the following renames: 1. scx_bpf_dispatch[_vtime]() -> scx_bpf_dsq_insert[_vtime]() 2. scx_bpf_consume() -> scx_bpf_dsq_move_to_local() 3. scx_bpf_dispatch[_vtime]_from_dsq*() -> scx_bpf_dsq_move[_vtime]*() This patch performs the third set of renames. Compatibility is maintained by: - The previous kfunc names are still provided by the kernel so that old binaries can run. Kernel generates a warning when the old names are used. - compat.bpf.h provides wrappers for the new names which automatically fall back to the old names when running on older kernels. They also trigger build error if old names are used for new builds. - scx_bpf_dispatch[_vtime]_from_dsq*() were already wrapped in __COMPAT macros as they were introduced during v6.12 cycle. Wrap new API in __COMPAT macros too and trigger build errors on both __COMPAT prefixed and naked usages of the old names. The compat features will be dropped after v6.15. 
Signed-off-by: Tejun Heo Acked-by: Andrea Righi Acked-by: Changwoo Min Acked-by: Johannes Bechberger Acked-by: Giovanni Gherdovich Cc: Dan Schatzberg Cc: Ming Yang --- kernel/sched/ext.c | 97 +++++++++++++++++------- tools/sched_ext/include/scx/common.bpf.h | 12 +-- tools/sched_ext/include/scx/compat.bpf.h | 82 ++++++++++++++++---- tools/sched_ext/scx_qmap.bpf.c | 20 ++--- 4 files changed, 152 insertions(+), 59 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 929475f9b1cd..5081fbdab74d 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -6422,9 +6422,8 @@ static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { .set = &scx_kfunc_ids_enqueue_dispatch, }; -static bool scx_dispatch_from_dsq(struct bpf_iter_scx_dsq_kern *kit, - struct task_struct *p, u64 dsq_id, - u64 enq_flags) +static bool scx_dsq_move(struct bpf_iter_scx_dsq_kern *kit, + struct task_struct *p, u64 dsq_id, u64 enq_flags) { struct scx_dispatch_q *src_dsq = kit->dsq, *dst_dsq; struct rq *this_rq, *src_rq, *locked_rq; @@ -6594,16 +6593,16 @@ __bpf_kfunc bool scx_bpf_consume(u64 dsq_id) } /** - * scx_bpf_dispatch_from_dsq_set_slice - Override slice when dispatching from DSQ + * scx_bpf_dsq_move_set_slice - Override slice when moving between DSQs * @it__iter: DSQ iterator in progress - * @slice: duration the dispatched task can run for in nsecs + * @slice: duration the moved task can run for in nsecs * - * Override the slice of the next task that will be dispatched from @it__iter - * using scx_bpf_dispatch_from_dsq[_vtime](). If this function is not called, - * the previous slice duration is kept. + * Override the slice of the next task that will be moved from @it__iter using + * scx_bpf_dsq_move[_vtime](). If this function is not called, the previous + * slice duration is kept. */ -__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice( - struct bpf_iter_scx_dsq *it__iter, u64 slice) +__bpf_kfunc void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, + u64 slice) { struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; @@ -6611,18 +6610,26 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice( kit->cursor.flags |= __SCX_DSQ_ITER_HAS_SLICE; } +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_slice( + struct bpf_iter_scx_dsq *it__iter, u64 slice) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()"); + scx_bpf_dsq_move_set_slice(it__iter, slice); +} + /** - * scx_bpf_dispatch_from_dsq_set_vtime - Override vtime when dispatching from DSQ + * scx_bpf_dsq_move_set_vtime - Override vtime when moving between DSQs * @it__iter: DSQ iterator in progress * @vtime: task's ordering inside the vtime-sorted queue of the target DSQ * - * Override the vtime of the next task that will be dispatched from @it__iter - * using scx_bpf_dispatch_from_dsq_vtime(). If this function is not called, the - * previous slice vtime is kept. If scx_bpf_dispatch_from_dsq() is used to - * dispatch the next task, the override is ignored and cleared. + * Override the vtime of the next task that will be moved from @it__iter using + * scx_bpf_dsq_move_vtime(). If this function is not called, the previous slice + * vtime is kept. If scx_bpf_dsq_move() is used to dispatch the next task, the + * override is ignored and cleared. 
*/ -__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( - struct bpf_iter_scx_dsq *it__iter, u64 vtime) +__bpf_kfunc void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, + u64 vtime) { struct bpf_iter_scx_dsq_kern *kit = (void *)it__iter; @@ -6630,8 +6637,16 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( kit->cursor.flags |= __SCX_DSQ_ITER_HAS_VTIME; } +/* for backward compatibility, will be removed in v6.15 */ +__bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( + struct bpf_iter_scx_dsq *it__iter, u64 vtime) +{ + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()"); + scx_bpf_dsq_move_set_vtime(it__iter, vtime); +} + /** - * scx_bpf_dispatch_from_dsq - Move a task from DSQ iteration to a DSQ + * scx_bpf_dsq_move - Move a task from DSQ iteration to a DSQ * @it__iter: DSQ iterator in progress * @p: task to transfer * @dsq_id: DSQ to move @p to @@ -6646,8 +6661,7 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( * @p was obtained from the DSQ iteration. @p just has to be on the DSQ and have * been queued before the iteration started. * - * @p's slice is kept by default. Use scx_bpf_dispatch_from_dsq_set_slice() to - * update. + * @p's slice is kept by default. Use scx_bpf_dsq_move_set_slice() to update. * * Can be called from ops.dispatch() or any BPF context which doesn't hold a rq * lock (e.g. BPF timers or SYSCALL programs). @@ -6655,16 +6669,25 @@ __bpf_kfunc void scx_bpf_dispatch_from_dsq_set_vtime( * Returns %true if @p has been consumed, %false if @p had already been consumed * or dequeued. */ +__bpf_kfunc bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, + p, dsq_id, enq_flags); +} + +/* for backward compatibility, will be removed in v6.15 */ __bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) { - return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter, - p, dsq_id, enq_flags); + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); + return scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags); } /** - * scx_bpf_dispatch_vtime_from_dsq - Move a task from DSQ iteration to a PRIQ DSQ + * scx_bpf_dsq_move_vtime - Move a task from DSQ iteration to a PRIQ DSQ * @it__iter: DSQ iterator in progress * @p: task to transfer * @dsq_id: DSQ to move @p to @@ -6674,19 +6697,27 @@ __bpf_kfunc bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, * priority queue of the DSQ specified by @dsq_id. The destination must be a * user DSQ as only user DSQs support priority queue. * - * @p's slice and vtime are kept by default. Use - * scx_bpf_dispatch_from_dsq_set_slice() and - * scx_bpf_dispatch_from_dsq_set_vtime() to update. + * @p's slice and vtime are kept by default. Use scx_bpf_dsq_move_set_slice() + * and scx_bpf_dsq_move_set_vtime() to update. * - * All other aspects are identical to scx_bpf_dispatch_from_dsq(). See + * All other aspects are identical to scx_bpf_dsq_move(). See * scx_bpf_dsq_insert_vtime() for more information on @vtime. 
*/ +__bpf_kfunc bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, + struct task_struct *p, u64 dsq_id, + u64 enq_flags) +{ + return scx_dsq_move((struct bpf_iter_scx_dsq_kern *)it__iter, + p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); +} + +/* for backward compatibility, will be removed in v6.15 */ __bpf_kfunc bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) { - return scx_dispatch_from_dsq((struct bpf_iter_scx_dsq_kern *)it__iter, - p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); + printk_deferred_once(KERN_WARNING "sched_ext: scx_bpf_dispatch_from_dsq_vtime() renamed to scx_bpf_dsq_move_vtime()"); + return scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags); } __bpf_kfunc_end_defs(); @@ -6696,6 +6727,10 @@ BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) BTF_ID_FLAGS(func, scx_bpf_dsq_move_to_local) BTF_ID_FLAGS(func, scx_bpf_consume) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) @@ -6796,6 +6831,10 @@ __bpf_kfunc_end_defs(); BTF_KFUNCS_START(scx_kfunc_ids_unlocked) BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_slice) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_set_vtime) +BTF_ID_FLAGS(func, scx_bpf_dsq_move, KF_RCU) +BTF_ID_FLAGS(func, scx_bpf_dsq_move_vtime, KF_RCU) BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_slice) BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq_set_vtime) BTF_ID_FLAGS(func, scx_bpf_dispatch_from_dsq, KF_RCU) diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h index 535377649a22..2f36b7b6418d 100644 --- a/tools/sched_ext/include/scx/common.bpf.h +++ b/tools/sched_ext/include/scx/common.bpf.h @@ -41,10 +41,10 @@ void scx_bpf_dsq_insert_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 u32 scx_bpf_dispatch_nr_slots(void) __ksym; void scx_bpf_dispatch_cancel(void) __ksym; bool scx_bpf_dsq_move_to_local(u64 dsq_id) __ksym; -void scx_bpf_dispatch_from_dsq_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; -void scx_bpf_dispatch_from_dsq_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; -bool scx_bpf_dispatch_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; -bool scx_bpf_dispatch_vtime_from_dsq(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +void scx_bpf_dsq_move_set_slice(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym; +void scx_bpf_dsq_move_set_vtime(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym; +bool scx_bpf_dsq_move(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +bool scx_bpf_dsq_move_vtime(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; u32 scx_bpf_reenqueue_local(void) __ksym; void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; @@ -74,8 +74,8 @@ struct rq *scx_bpf_cpu_rq(s32 cpu) __ksym; struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym __weak; /* - * Use the following as @it__iter when calling - * 
scx_bpf_dispatch[_vtime]_from_dsq() from within bpf_for_each() loops. + * Use the following as @it__iter when calling scx_bpf_dsq_move[_vtime]() from + * within bpf_for_each() loops. */ #define BPF_FOR_EACH_ITER (&___it) diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h index c00ed24a8721..d56520100a26 100644 --- a/tools/sched_ext/include/scx/compat.bpf.h +++ b/tools/sched_ext/include/scx/compat.bpf.h @@ -20,30 +20,24 @@ (bpf_ksym_exists(scx_bpf_task_cgroup) ? \ scx_bpf_task_cgroup((p)) : NULL) -/* v6.12: 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()") */ -#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it, slice) \ - (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice) ? \ - scx_bpf_dispatch_from_dsq_set_slice((it), (slice)) : (void)0) -#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it, vtime) \ - (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime) ? \ - scx_bpf_dispatch_from_dsq_set_vtime((it), (vtime)) : (void)0) -#define __COMPAT_scx_bpf_dispatch_from_dsq(it, p, dsq_id, enq_flags) \ - (bpf_ksym_exists(scx_bpf_dispatch_from_dsq) ? \ - scx_bpf_dispatch_from_dsq((it), (p), (dsq_id), (enq_flags)) : false) -#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it, p, dsq_id, enq_flags) \ - (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq) ? \ - scx_bpf_dispatch_vtime_from_dsq((it), (p), (dsq_id), (enq_flags)) : false) - /* * v6.13: The verb `dispatch` was too overloaded and confusing. kfuncs are * renamed to unload the verb. * * Build error is triggered if old names are used. New binaries work with both * new and old names. The compat macros will be removed on v6.15 release. + * + * scx_bpf_dispatch_from_dsq() and friends were added during v6.12 by + * 4c30f5ce4f7a ("sched_ext: Implement scx_bpf_dispatch[_vtime]_from_dsq()"). + * Preserve __COMPAT macros until v6.15. */ void scx_bpf_dispatch___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym __weak; void scx_bpf_dispatch_vtime___compat(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym __weak; bool scx_bpf_consume___compat(u64 dsq_id) __ksym __weak; +void scx_bpf_dispatch_from_dsq_set_slice___compat(struct bpf_iter_scx_dsq *it__iter, u64 slice) __ksym __weak; +void scx_bpf_dispatch_from_dsq_set_vtime___compat(struct bpf_iter_scx_dsq *it__iter, u64 vtime) __ksym __weak; +bool scx_bpf_dispatch_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; +bool scx_bpf_dispatch_vtime_from_dsq___compat(struct bpf_iter_scx_dsq *it__iter, struct task_struct *p, u64 dsq_id, u64 enq_flags) __ksym __weak; #define scx_bpf_dsq_insert(p, dsq_id, slice, enq_flags) \ (bpf_ksym_exists(scx_bpf_dsq_insert) ? \ @@ -60,6 +54,34 @@ bool scx_bpf_consume___compat(u64 dsq_id) __ksym __weak; scx_bpf_dsq_move_to_local((dsq_id)) : \ scx_bpf_consume___compat((dsq_id))) +#define __COMPAT_scx_bpf_dsq_move_set_slice(it__iter, slice) \ + (bpf_ksym_exists(scx_bpf_dsq_move_set_slice) ? \ + scx_bpf_dsq_move_set_slice((it__iter), (slice)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_slice___compat) ? \ + scx_bpf_dispatch_from_dsq_set_slice___compat((it__iter), (slice)) : \ + (void)0)) + +#define __COMPAT_scx_bpf_dsq_move_set_vtime(it__iter, vtime) \ + (bpf_ksym_exists(scx_bpf_dsq_move_set_vtime) ? \ + scx_bpf_dsq_move_set_vtime((it__iter), (vtime)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq_set_vtime___compat) ? 
\ + scx_bpf_dispatch_from_dsq_set_vtime___compat((it__iter), (vtime)) : \ + (void) 0)) + +#define __COMPAT_scx_bpf_dsq_move(it__iter, p, dsq_id, enq_flags) \ + (bpf_ksym_exists(scx_bpf_dsq_move) ? \ + scx_bpf_dsq_move((it__iter), (p), (dsq_id), (enq_flags)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_from_dsq___compat) ? \ + scx_bpf_dispatch_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ + false)) + +#define __COMPAT_scx_bpf_dsq_move_vtime(it__iter, p, dsq_id, enq_flags) \ + (bpf_ksym_exists(scx_bpf_dsq_move_vtime) ? \ + scx_bpf_dsq_move_vtime((it__iter), (p), (dsq_id), (enq_flags)) : \ + (bpf_ksym_exists(scx_bpf_dispatch_vtime_from_dsq___compat) ? \ + scx_bpf_dispatch_vtime_from_dsq___compat((it__iter), (p), (dsq_id), (enq_flags)) : \ + false)) + #define scx_bpf_dispatch(p, dsq_id, slice, enq_flags) \ _Static_assert(false, "scx_bpf_dispatch() renamed to scx_bpf_dsq_insert()") @@ -71,6 +93,38 @@ bool scx_bpf_consume___compat(u64 dsq_id) __ksym __weak; false; \ }) +#define scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \ + _Static_assert(false, "scx_bpf_dispatch_from_dsq_set_slice() renamed to scx_bpf_dsq_move_set_slice()") + +#define scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \ + _Static_assert(false, "scx_bpf_dispatch_from_dsq_set_vtime() renamed to scx_bpf_dsq_move_set_vtime()") + +#define scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ + _Static_assert(false, "scx_bpf_dispatch_from_dsq() renamed to scx_bpf_dsq_move()"); \ + false; \ +}) + +#define scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ + _Static_assert(false, "scx_bpf_dispatch_vtime_from_dsq() renamed to scx_bpf_dsq_move_vtime()"); \ + false; \ +}) + +#define __COMPAT_scx_bpf_dispatch_from_dsq_set_slice(it__iter, slice) \ + _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_slice() renamed to __COMPAT_scx_bpf_dsq_move_set_slice()") + +#define __COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(it__iter, vtime) \ + _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq_set_vtime() renamed to __COMPAT_scx_bpf_dsq_move_set_vtime()") + +#define __COMPAT_scx_bpf_dispatch_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ + _Static_assert(false, "__COMPAT_scx_bpf_dispatch_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move()"); \ + false; \ +}) + +#define __COMPAT_scx_bpf_dispatch_vtime_from_dsq(it__iter, p, dsq_id, enq_flags) ({ \ + _Static_assert(false, "__COMPAT_scx_bpf_dispatch_vtime_from_dsq() renamed to __COMPAT_scx_bpf_dsq_move_vtime()"); \ + false; \ +}) + /* * Define sched_ext_ops. This may be expanded to define multiple variants for * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c index 08f121252c56..ee264947e0c3 100644 --- a/tools/sched_ext/scx_qmap.bpf.c +++ b/tools/sched_ext/scx_qmap.bpf.c @@ -294,10 +294,10 @@ static void update_core_sched_head_seq(struct task_struct *p) } /* - * To demonstrate the use of scx_bpf_dispatch_from_dsq(), implement silly - * selective priority boosting mechanism by scanning SHARED_DSQ looking for - * highpri tasks, moving them to HIGHPRI_DSQ and then consuming them first. This - * makes minor difference only when dsp_batch is larger than 1. + * To demonstrate the use of scx_bpf_dsq_move(), implement silly selective + * priority boosting mechanism by scanning SHARED_DSQ looking for highpri tasks, + * moving them to HIGHPRI_DSQ and then consuming them first. This makes minor + * difference only when dsp_batch is larger than 1. 
 *
 * scx_bpf_dispatch[_vtime]_from_dsq() are allowed both from ops.dispatch() and
 * non-rq-lock holding BPF programs. As demonstration, this function is called
@@ -318,11 +318,11 @@ static bool dispatch_highpri(bool from_timer)
 
 		if (tctx->highpri) {
 			/* exercise the set_*() and vtime interface too */
-			__COMPAT_scx_bpf_dispatch_from_dsq_set_slice(
+			__COMPAT_scx_bpf_dsq_move_set_slice(
 				BPF_FOR_EACH_ITER, slice_ns * 2);
-			__COMPAT_scx_bpf_dispatch_from_dsq_set_vtime(
+			__COMPAT_scx_bpf_dsq_move_set_vtime(
 				BPF_FOR_EACH_ITER, highpri_seq++);
-			__COMPAT_scx_bpf_dispatch_vtime_from_dsq(
+			__COMPAT_scx_bpf_dsq_move_vtime(
 				BPF_FOR_EACH_ITER, p, HIGHPRI_DSQ, 0);
 		}
 	}
@@ -340,9 +340,9 @@ static bool dispatch_highpri(bool from_timer)
 		else
 			cpu = scx_bpf_pick_any_cpu(p->cpus_ptr, 0);
 
-		if (__COMPAT_scx_bpf_dispatch_from_dsq(BPF_FOR_EACH_ITER, p,
-						       SCX_DSQ_LOCAL_ON | cpu,
-						       SCX_ENQ_PREEMPT)) {
+		if (__COMPAT_scx_bpf_dsq_move(BPF_FOR_EACH_ITER, p,
+					      SCX_DSQ_LOCAL_ON | cpu,
+					      SCX_ENQ_PREEMPT)) {
 			if (cpu == this_cpu) {
 				dispatched = true;
 				__sync_fetch_and_add(&nr_expedited_local, 1);

From 6b8950ef993bcf198d4a80cde0b2da805b75ed70 Mon Sep 17 00:00:00 2001
From: Zhao Mengmeng
Date: Fri, 15 Nov 2024 09:17:09 +0800
Subject: [PATCH 21/21] sched_ext: Replace scx_next_task_picked() with
 switch_class() in comment

scx_next_task_picked() has been replaced with switch_class(), but the
comment still references the old name, so update it.

Signed-off-by: Zhao Mengmeng
Signed-off-by: Tejun Heo
---
 kernel/sched/ext.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 5081fbdab74d..3c4a94e4258f 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -2759,7 +2759,7 @@ static int balance_one(struct rq *rq, struct task_struct *prev)
 		 * If the previous sched_class for the current CPU was not SCX,
 		 * notify the BPF scheduler that it again has control of the
 		 * core. This callback complements ->cpu_release(), which is
-		 * emitted in scx_next_task_picked().
+		 * emitted in switch_class().
 		 */
 		if (SCX_HAS_OP(cpu_acquire))
 			SCX_CALL_OP(0, cpu_acquire, cpu_of(rq), NULL);
@@ -6073,7 +6073,7 @@ static void kick_cpus_irq_workfn(struct irq_work *irq_work)
 	if (cpu != cpu_of(this_rq)) {
 		/*
 		 * Pairs with smp_store_release() issued by this CPU in
-		 * scx_next_task_picked() on the resched path.
+		 * switch_class() on the resched path.
 		 *
 		 * We busy-wait here to guarantee that no other task can
 		 * be scheduled on our core before the target CPU has
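
As a usage sketch of the renamed interface (illustrative only, not part of the
series): a scheduler scanning one user DSQ and moving selected tasks into a
PRIQ DSQ could look like the following. FALLBACK_DSQ_ID, URGENT_DSQ_ID,
urgent_vtime and promote_heavy_tasks() are made-up names for this sketch;
scx_bpf_dsq_move_set_slice(), scx_bpf_dsq_move_set_vtime(),
scx_bpf_dsq_move_vtime(), BPF_FOR_EACH_ITER and SCX_SLICE_DFL come from the
patches and headers above. The helper is assumed to be called from
ops.dispatch() or another non-rq-lock-holding BPF context, per the kfunc
documentation.

#include <scx/common.bpf.h>

char _license[] SEC("license") = "GPL";

enum {
	FALLBACK_DSQ_ID	= 0,	/* assumed FIFO user DSQ fed by ops.enqueue() */
	URGENT_DSQ_ID	= 1,	/* assumed PRIQ user DSQ consumed before it */
};

static u64 urgent_vtime;	/* illustrative vtime cursor for URGENT_DSQ_ID */

/* Move above-default-weight tasks from FALLBACK_DSQ_ID to URGENT_DSQ_ID. */
static void promote_heavy_tasks(void)
{
	struct task_struct *p;

	bpf_for_each(scx_dsq, p, FALLBACK_DSQ_ID, 0) {
		if (p->scx.weight <= 100)	/* 100 is the default weight */
			continue;

		/* stage slice/vtime updates; they apply to the next move */
		scx_bpf_dsq_move_set_slice(BPF_FOR_EACH_ITER, SCX_SLICE_DFL);
		scx_bpf_dsq_move_set_vtime(BPF_FOR_EACH_ITER, urgent_vtime++);

		/* returns false if @p was consumed or dequeued meanwhile */
		scx_bpf_dsq_move_vtime(BPF_FOR_EACH_ITER, p, URGENT_DSQ_ID, 0);
	}
}

A scheduler that must also load on kernels predating the rename would go
through the __COMPAT_scx_bpf_dsq_move*() wrappers from compat.bpf.h instead,
as scx_qmap does in the hunks above; those wrappers fall back to the
___compat declarations of the old kfuncs when the new ones are not exported.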