From a87fba4a707efee10e385df03733ef8db12538fc Mon Sep 17 00:00:00 2001
From: Eric Naim <dnaim@cachyos.org>
Date: Thu, 27 Feb 2025 21:23:05 +0800
Subject: [PATCH 07/12] itmt-core-ranking

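Rework ITMT core ranking (squash of the pending upstream series):

 - Move the "sched_itmt_enabled" toggle from the /proc/sys/kernel
   sysctl to debugfs (/sys/kernel/debug/x86/sched_itmt_enabled) and
   change its type from unsigned int to bool, so the handler reduces
   to debugfs_{read,write}_file_bool() plus a sched domain rebuild
   when the value actually changes.

 - Convert the itmt_update_mutex critical sections to guard(mutex)()
   scope-based locking, dropping the manual unlock-on-return paths.

 - Drop the x86_smt_flags() wrapper and pass cpu_smt_flags() directly
   when building the SMT topology level.

 - Stop caching asym_prefer_cpu in struct sched_group: find the
   highest-priority CPU of each group in update_sg_lb_stats() via the
   new update_sg_pick_asym_prefer() helper, so load balancing no
   longer depends on a value cached at domain-build time that can go
   stale when CPU priorities change.

 - While walking the group, only update the root domain overload
   indicator when actually balancing at the root domain, and only
   accumulate the NUMA-balancing counters on SD_NUMA domains, since
   only fbq_classify_group() consumes them.
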
Signed-off-by: Eric Naim <dnaim@cachyos.org>
---
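With this applied, the ITMT toggle lives under debugfs rather than
procfs. A minimal userspace sketch for flipping it, assuming debugfs
is mounted at /sys/kernel/debug and CONFIG_SCHED_MC_PRIO=y
(toggle_itmt.c is a hypothetical test helper, not part of this patch):

  /* toggle_itmt.c - write "0"/"1" (or "y"/"n") to the new bool file */
  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(int argc, char **argv)
  {
          const char *path = "/sys/kernel/debug/x86/sched_itmt_enabled";
          const char *val = argc > 1 ? argv[1] : "1";
          int fd = open(path, O_WRONLY);  /* needs root */

          if (fd < 0) {
                  perror(path);
                  return 1;
          }
          /* debugfs_write_file_bool() parses the value via kstrtobool() */
          if (write(fd, val, 1) != 1) {
                  perror("write");
                  close(fd);
                  return 1;
          }
          close(fd);
          return 0;
  }

Writing a different value triggers x86_topology_update and a sched
domain rebuild, per sched_itmt_enabled_write() below.
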
 arch/x86/include/asm/topology.h |  4 +-
 arch/x86/kernel/itmt.c          | 81 ++++++++++++++-------------------
 arch/x86/kernel/smpboot.c       |  8 +---
 kernel/sched/fair.c             | 42 +++++++++++++----
 kernel/sched/sched.h            |  1 -
 kernel/sched/topology.c         | 15 +-----
 6 files changed, 69 insertions(+), 82 deletions(-)

diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index fd41103ad342..63bab25a4896 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -250,7 +250,7 @@ extern bool x86_topology_update;
 #include <asm/percpu.h>
 
 DECLARE_PER_CPU_READ_MOSTLY(int, sched_core_priority);
-extern unsigned int __read_mostly sysctl_sched_itmt_enabled;
+extern bool __read_mostly sysctl_sched_itmt_enabled;
 
 /* Interface to set priority of a cpu */
 void sched_set_itmt_core_prio(int prio, int core_cpu);
@@ -263,7 +263,7 @@ void sched_clear_itmt_support(void);
 
 #else /* CONFIG_SCHED_MC_PRIO */
 
-#define sysctl_sched_itmt_enabled	0
+#define sysctl_sched_itmt_enabled	false
 static inline void sched_set_itmt_core_prio(int prio, int core_cpu)
 {
 }
diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c
index 51b805c727fc..9cea1fc36c18 100644
--- a/arch/x86/kernel/itmt.c
+++ b/arch/x86/kernel/itmt.c
@@ -19,6 +19,7 @@
 #include <linux/sched.h>
 #include <linux/cpumask.h>
 #include <linux/cpuset.h>
+#include <linux/debugfs.h>
 #include <linux/mutex.h>
 #include <linux/sysctl.h>
 #include <linux/nodemask.h>
@@ -34,49 +35,38 @@ static bool __read_mostly sched_itmt_capable;
  * of higher turbo frequency for cpus supporting Intel Turbo Boost Max
  * Technology 3.0.
  *
- * It can be set via /proc/sys/kernel/sched_itmt_enabled
+ * It can be set via /sys/kernel/debug/x86/sched_itmt_enabled
  */
-unsigned int __read_mostly sysctl_sched_itmt_enabled;
+bool __read_mostly sysctl_sched_itmt_enabled;
 
-static int sched_itmt_update_handler(const struct ctl_table *table, int write,
-				     void *buffer, size_t *lenp, loff_t *ppos)
+static ssize_t sched_itmt_enabled_write(struct file *filp,
+					const char __user *ubuf,
+					size_t cnt, loff_t *ppos)
 {
-	unsigned int old_sysctl;
-	int ret;
+	ssize_t result;
+	bool orig;
 
-	mutex_lock(&itmt_update_mutex);
+	guard(mutex)(&itmt_update_mutex);
 
-	if (!sched_itmt_capable) {
-		mutex_unlock(&itmt_update_mutex);
-		return -EINVAL;
-	}
-
-	old_sysctl = sysctl_sched_itmt_enabled;
-	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	orig = sysctl_sched_itmt_enabled;
+	result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);
 
-	if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) {
+	if (sysctl_sched_itmt_enabled != orig) {
 		x86_topology_update = true;
 		rebuild_sched_domains();
 	}
 
-	mutex_unlock(&itmt_update_mutex);
-
-	return ret;
+	return result;
 }
 
-static struct ctl_table itmt_kern_table[] = {
-	{
-		.procname	= "sched_itmt_enabled",
-		.data		= &sysctl_sched_itmt_enabled,
-		.maxlen		= sizeof(unsigned int),
-		.mode		= 0644,
-		.proc_handler	= sched_itmt_update_handler,
-		.extra1		= SYSCTL_ZERO,
-		.extra2		= SYSCTL_ONE,
-	},
+static const struct file_operations dfs_sched_itmt_fops = {
+	.read =         debugfs_read_file_bool,
+	.write =        sched_itmt_enabled_write,
+	.open =         simple_open,
+	.llseek =       default_llseek,
 };
 
-static struct ctl_table_header *itmt_sysctl_header;
+static struct dentry *dfs_sched_itmt;
 
 /**
  * sched_set_itmt_support() - Indicate platform supports ITMT
@@ -97,16 +87,18 @@ static struct ctl_table_header *itmt_sysctl_header;
  */
 int sched_set_itmt_support(void)
 {
-	mutex_lock(&itmt_update_mutex);
+	guard(mutex)(&itmt_update_mutex);
 
-	if (sched_itmt_capable) {
-		mutex_unlock(&itmt_update_mutex);
+	if (sched_itmt_capable)
 		return 0;
-	}
 
-	itmt_sysctl_header = register_sysctl("kernel", itmt_kern_table);
-	if (!itmt_sysctl_header) {
-		mutex_unlock(&itmt_update_mutex);
+	dfs_sched_itmt = debugfs_create_file_unsafe("sched_itmt_enabled",
+						    0644,
+						    arch_debugfs_dir,
+						    &sysctl_sched_itmt_enabled,
+						    &dfs_sched_itmt_fops);
+	if (IS_ERR_OR_NULL(dfs_sched_itmt)) {
+		dfs_sched_itmt = NULL;
 		return -ENOMEM;
 	}
 
@@ -117,8 +109,6 @@ int sched_set_itmt_support(void)
 	x86_topology_update = true;
 	rebuild_sched_domains();
 
-	mutex_unlock(&itmt_update_mutex);
-
 	return 0;
 }
 
@@ -134,18 +124,15 @@ int sched_set_itmt_support(void)
  */
 void sched_clear_itmt_support(void)
 {
-	mutex_lock(&itmt_update_mutex);
+	guard(mutex)(&itmt_update_mutex);
 
-	if (!sched_itmt_capable) {
-		mutex_unlock(&itmt_update_mutex);
+	if (!sched_itmt_capable)
 		return;
-	}
+
 	sched_itmt_capable = false;
 
-	if (itmt_sysctl_header) {
-		unregister_sysctl_table(itmt_sysctl_header);
-		itmt_sysctl_header = NULL;
-	}
+	debugfs_remove(dfs_sched_itmt);
+	dfs_sched_itmt = NULL;
 
 	if (sysctl_sched_itmt_enabled) {
 		/* disable sched_itmt if we are no longer ITMT capable */
@@ -153,8 +140,6 @@ void sched_clear_itmt_support(void)
 		x86_topology_update = true;
 		rebuild_sched_domains();
 	}
-
-	mutex_unlock(&itmt_update_mutex);
 }
 
 int arch_asym_cpu_priority(int cpu)
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index f1fac08fdef2..ef63b1c0b491 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -482,12 +482,6 @@ static int x86_core_flags(void)
 	return cpu_core_flags() | x86_sched_itmt_flags();
 }
 #endif
-#ifdef CONFIG_SCHED_SMT
-static int x86_smt_flags(void)
-{
-	return cpu_smt_flags();
-}
-#endif
 #ifdef CONFIG_SCHED_CLUSTER
 static int x86_cluster_flags(void)
 {
@@ -510,7 +504,7 @@ static void __init build_sched_topology(void)
 
 #ifdef CONFIG_SCHED_SMT
 	x86_topology[i++] = (struct sched_domain_topology_level){
-		cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT)
+		cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT)
 	};
 #endif
 #ifdef CONFIG_SCHED_CLUSTER
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 3a3116dca89c..a27896a05103 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -9864,6 +9864,8 @@ struct sg_lb_stats {
 	unsigned int group_weight;
 	enum group_type group_type;
 	unsigned int group_asym_packing;	/* Tasks should be moved to preferred CPU */
+	unsigned int asym_prefer_cpu;		/* Group CPU with highest asym priority */
+	int highest_asym_prio;			/* Asym priority of asym_prefer_cpu */
 	unsigned int group_smt_balance;		/* Task on busy SMT be moved */
 	unsigned long group_misfit_task_load;	/* A CPU has a task too big for its capacity */
 #ifdef CONFIG_NUMA_BALANCING
@@ -10193,7 +10195,7 @@ sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group
 	    (sgs->group_weight - sgs->idle_cpus != 1))
 		return false;
 
-	return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu);
+	return sched_asym(env->sd, env->dst_cpu, sgs->asym_prefer_cpu);
 }
 
 /* One group has more than one SMT CPU while the other group does not */
@@ -10274,6 +10276,17 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd)
 	return check_cpu_capacity(rq, sd);
 }
 
+static inline void
+update_sg_pick_asym_prefer(struct sg_lb_stats *sgs, int cpu)
+{
+	int asym_prio = arch_asym_cpu_priority(cpu);
+
+	if (asym_prio > sgs->highest_asym_prio) {
+		sgs->asym_prefer_cpu = cpu;
+		sgs->highest_asym_prio = asym_prio;
+	}
+}
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @env: The load balancing environment.
@@ -10290,11 +10303,13 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 				      bool *sg_overloaded,
 				      bool *sg_overutilized)
 {
-	int i, nr_running, local_group;
+	int i, nr_running, local_group, sd_flags = env->sd->flags;
+	bool balancing_at_rd = !env->sd->parent;
 
 	memset(sgs, 0, sizeof(*sgs));
 
 	local_group = group == sds->local;
+	sgs->highest_asym_prio = INT_MIN;
 
 	for_each_cpu_and(i, sched_group_span(group), env->cpus) {
 		struct rq *rq = cpu_rq(i);
@@ -10308,16 +10323,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 		nr_running = rq->nr_running;
 		sgs->sum_nr_running += nr_running;
 
-		if (nr_running > 1)
-			*sg_overloaded = 1;
+		if (sd_flags & SD_ASYM_PACKING)
+			update_sg_pick_asym_prefer(sgs, i);
 
 		if (cpu_overutilized(i))
 			*sg_overutilized = 1;
 
-#ifdef CONFIG_NUMA_BALANCING
-		sgs->nr_numa_running += rq->nr_numa_running;
-		sgs->nr_preferred_running += rq->nr_preferred_running;
-#endif
 		/*
 		 * No need to call idle_cpu() if nr_running is not 0
 		 */
@@ -10327,10 +10338,21 @@ static inline void update_sg_lb_stats(struct lb_env *env,
 			continue;
 		}
 
+		/* Overload indicator is only updated at root domain */
+		if (balancing_at_rd && nr_running > 1)
+			*sg_overloaded = 1;
+
+#ifdef CONFIG_NUMA_BALANCING
+		/* Only fbq_classify_group() uses this to classify NUMA groups */
+		if (sd_flags & SD_NUMA) {
+			sgs->nr_numa_running += rq->nr_numa_running;
+			sgs->nr_preferred_running += rq->nr_preferred_running;
+		}
+#endif
 		if (local_group)
 			continue;
 
-		if (env->sd->flags & SD_ASYM_CPUCAPACITY) {
+		if (sd_flags & SD_ASYM_CPUCAPACITY) {
 			/* Check for a misfit task on the cpu */
 			if (sgs->group_misfit_task_load < rq->misfit_task_load) {
 				sgs->group_misfit_task_load = rq->misfit_task_load;
@@ -10425,7 +10447,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 
 	case group_asym_packing:
 		/* Prefer to move from lowest priority CPU's work */
-		return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu);
+		return sched_asym_prefer(busiest->asym_prefer_cpu, sgs->asym_prefer_cpu);
 
 	case group_misfit_task:
 		/*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 18f3955ddb8f..6a9efb0fd86f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2056,7 +2056,6 @@ struct sched_group {
 	unsigned int		group_weight;
 	unsigned int		cores;
 	struct sched_group_capacity *sgc;
-	int			asym_prefer_cpu;	/* CPU of highest priority in group */
 	int			flags;
 
 	/*
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 9748a4c8d668..59b8157cb114 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1302,7 +1302,7 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
 	WARN_ON(!sg);
 
 	do {
-		int cpu, cores = 0, max_cpu = -1;
+		int cpu, cores = 0;
 
 		sg->group_weight = cpumask_weight(sched_group_span(sg));
 
@@ -1314,19 +1314,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
 #endif
 		}
 		sg->cores = cores;
-
-		if (!(sd->flags & SD_ASYM_PACKING))
-			goto next;
-
-		for_each_cpu(cpu, sched_group_span(sg)) {
-			if (max_cpu < 0)
-				max_cpu = cpu;
-			else if (sched_asym_prefer(cpu, max_cpu))
-				max_cpu = cpu;
-		}
-		sg->asym_prefer_cpu = max_cpu;
-
-next:
 		sg = sg->next;
 	} while (sg != sd->groups);
 
-- 
2.48.1
