Linux schedule 4: Load Balancing
4. Load Balancing
4.1 SMP Load Balancing
4.1.1 Scheduling Domains
4.1.1.1 The Concept of Scheduling Domains
Borrowing the description from the kernel's "Linux Scheduling Domains" documentation, the concept can be summarized as follows.
A complex high-end system can be built up, from the top down, like this:
- 1. It is a NUMA system: each node accesses different regions of system memory at different speeds.
- 2. It is also an SMP system, made up of several physical CPUs (physical packages). The physical CPUs share all of the system's memory, but each has its own cache.
- 3. Each physical CPU in turn consists of several cores (multi-core, also called chip-level multiprocessing, CMP). The cores are integrated on the same die; each usually has a private L1 cache but may share the L2 cache.
- 4. Each core may implement several hardware threads via an SMT-like technique, also called virtual CPUs (for example Intel's Hyper-Threading). Logically each hardware thread looks like a CPU, but the threads share almost everything: the L1 cache, and even the ALUs and the power budget.
CPUs therefore form a hierarchy of levels, and the closer two CPUs are in that hierarchy, the more resources they share. Migrating a task between CPUs has a cost: from the performance point of view, the more levels a migration crosses, the larger the loss. Migration cost also has to be weighed from the power point of view, which is what EAS (Energy Aware Scheduling) addresses.
4.1.1.2 arm64 cpu_topology
On arm64 the CPU topology is stored in the cpu_topology[] array:
/*
* cpu topology table
*/
struct cpu_topology cpu_topology[NR_CPUS];
struct cpu_topology {
int thread_id;
int core_id;
int cluster_id; // the cluster this cpu belongs to
unsigned int partno;
cpumask_t thread_sibling;
cpumask_t core_sibling; // sibling cpus at the MC (multi-core) level, i.e. within the same cluster
};
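As a quick way to inspect what ends up in this array, a minimal debug sketch (a hypothetical helper, assuming it runs in kernel context after the topology has been parsed) could be:

#include <linux/cpumask.h>
#include <linux/printk.h>
#include <linux/topology.h>	/* pulls in asm/topology.h, which declares cpu_topology[] on arm64 */

/* hypothetical helper: dump the parsed topology of every possible cpu */
static void dump_cpu_topology(void)
{
	int cpu;

	for_each_possible_cpu(cpu) {
		struct cpu_topology *t = &cpu_topology[cpu];

		pr_info("cpu%d: cluster_id=%d core_id=%d thread_id=%d core_sibling=%*pbl\n",
			cpu, t->cluster_id, t->core_id, t->thread_id,
			cpumask_pr_args(&t->core_sibling));
	}
}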
cpu_topology[] is populated by parse_dt_topology(), which parses the topology information provided in the dts:
kernel_init() -> kernel_init_freeable() -> smp_prepare_cpus() -> init_cpu_topology() -> parse_dt_topology()
↓
static int __init parse_dt_topology(void)
{
struct device_node *cn, *map;
int ret = 0;
int cpu;
/* (1) Find the root node "/cpus" of the cpu topology in the dts */
cn = of_find_node_by_path("/cpus");
if (!cn) {
pr_err("No CPU information found in DT\n");
return 0;
}
/*
* When topology is provided cpu-map is essentially a root
* cluster with restricted subnodes.
*/
/* (2) Find the "cpu-map" node */
map = of_get_child_by_name(cn, "cpu-map");
if (!map)
goto out;
/* (3) Parse the clusters under "cpu-map" */
ret = parse_cluster(map, 0);
if (ret != 0)
goto out_map;
/*
* Check that all cores are in the topology; the SMP code will
* only mark cores described in the DT as possible.
*/
for_each_possible_cpu(cpu)
if (cpu_topology[cpu].cluster_id == -1)
ret = -EINVAL;
out_map:
of_node_put(map);
out:
of_node_put(cn);
return ret;
}
|→
static int __init parse_cluster(struct device_node *cluster, int depth)
{
char name[10];
bool leaf = true;
bool has_cores = false;
struct device_node *c;
static int cluster_id __initdata;
int core_id = 0;
int i, ret;
/*
* First check for child clusters; we currently ignore any
* information about the nesting of clusters and present the
* scheduler with a flat list of them.
*/
i = 0;
/* (3.1) If there are nested clusters, recurse into them */
do {
snprintf(name, sizeof(name), "cluster%d", i);
c = of_get_child_by_name(cluster, name);
if (c) {
leaf = false;
ret = parse_cluster(c, depth + 1);
of_node_put(c);
if (ret != 0)
return ret;
}
i++;
} while (c);
/* Now check for cores */
i = 0;
do {
/* (3.2) Then look for the core-level nodes */
snprintf(name, sizeof(name), "core%d", i);
c = of_get_child_by_name(cluster, name);
if (c) {
has_cores = true;
if (depth == 0) {
pr_err("%s: cpu-map children should be clusters\n",
c->full_name);
of_node_put(c);
return -EINVAL;
}
if (leaf) {
/* (3.3) If this is a leaf cluster, go on to walk the cpu nodes inside the core */
ret = parse_core(c, cluster_id, core_id++);
} else {
pr_err("%s: Non-leaf cluster with core %s\n",
cluster->full_name, name);
ret = -EINVAL;
}
of_node_put(c);
if (ret != 0)
return ret;
}
i++;
} while (c);
if (leaf && !has_cores)
pr_warn("%s: empty cluster\n", cluster->full_name);
if (leaf)
cluster_id++;
return 0;
}
||→
static int __init parse_core(struct device_node *core, int cluster_id,
int core_id)
{
char name[10];
bool leaf = true;
int i = 0;
int cpu;
struct device_node *t;
do {
/* (3.3.1) If a thread level exists, parse the thread and cpu levels */
snprintf(name, sizeof(name), "thread%d", i);
t = of_get_child_by_name(core, name);
if (t) {
leaf = false;
cpu = get_cpu_for_node(t);
if (cpu >= 0) {
cpu_topology[cpu].cluster_id = cluster_id;
cpu_topology[cpu].core_id = core_id;
cpu_topology[cpu].thread_id = i;
} else {
pr_err("%s: Can't get CPU for thread\n",
t->full_name);
of_node_put(t);
return -EINVAL;
}
of_node_put(t);
}
i++;
} while (t);
/* (3.3.2) Otherwise parse the cpu level directly */
cpu = get_cpu_for_node(core);
if (cpu >= 0) {
if (!leaf) {
pr_err("%s: Core has both threads and CPU\n",
core->full_name);
return -EINVAL;
}
/* (3.3.3) The cpu's cluster_id/core_id are now known */
cpu_topology[cpu].cluster_id = cluster_id;
cpu_topology[cpu].core_id = core_id;
} else if (leaf) {
pr_err("%s: Can't get CPU for leaf core\n", core->full_name);
return -EINVAL;
}
return 0;
}
|||→
static int __init get_cpu_for_node(struct device_node *node)
{
struct device_node *cpu_node;
int cpu;
cpu_node = of_parse_phandle(node, "cpu", 0);
if (!cpu_node)
return -1;
for_each_possible_cpu(cpu) {
if (of_get_cpu_node(cpu, NULL) == cpu_node) {
of_node_put(cpu_node);
return cpu;
}
}
pr_crit("Unable to find CPU node for %s\n", cpu_node->full_name);
of_node_put(cpu_node);
return -1;
}
The sibling relationships at each level, cpu_topology[cpu].core_sibling/thread_sibling, are updated in update_siblings_masks():
kernel_init() -> kernel_init_freeable() -> smp_prepare_cpus() -> store_cpu_topology() -> update_siblings_masks()
↓
static void update_siblings_masks(unsigned int cpuid)
{
struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
int cpu;
/* update core and thread sibling masks */
for_each_possible_cpu(cpu) {
cpu_topo = &cpu_topology[cpu];
if (cpuid_topo->cluster_id != cpu_topo->cluster_id)
continue;
cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
if (cpu != cpuid)
cpumask_set_cpu(cpu, &cpuid_topo->core_sibling);
if (cpuid_topo->core_id != cpu_topo->core_id)
continue;
cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
if (cpu != cpuid)
cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
}
}
Taking mt6799 as an example, the topology is "4*A35 + 4*A53 + 2*A73", defined in the dts as follows:
mt6799.dtsi:
cpus {
#address-cells = <1>;
#size-cells = <0>;
cpu0: cpu@0 {
device_type = "cpu";
compatible = "arm,cortex-a35";
reg = <0x000>;
enable-method = "psci";
cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
cpu-release-addr = <0x0 0x40000200>;
clock-frequency = <1248000000>;
};
cpu1: cpu@001 {
device_type = "cpu";
compatible = "arm,cortex-a35";
reg = <0x001>;
enable-method = "psci";
cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
cpu-release-addr = <0x0 0x40000200>;
clock-frequency = <1248000000>;
};
cpu2: cpu@002 {
device_type = "cpu";
compatible = "arm,cortex-a35";
reg = <0x002>;
enable-method = "psci";
cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
cpu-release-addr = <0x0 0x40000200>;
clock-frequency = <1248000000>;
};
cpu3: cpu@003 {
device_type = "cpu";
compatible = "arm,cortex-a35";
reg = <0x003>;
enable-method = "psci";
cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
cpu-release-addr = <0x0 0x40000200>;
clock-frequency = <1248000000>;
};
cpu4: cpu@100 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x100>;
enable-method = "psci";
cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
cpu-release-addr = <0x0 0x40000200>;
clock-frequency = <1378000000>;
};
cpu5: cpu@101 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x101>;
enable-method = "psci";
cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
cpu-release-addr = <0x0 0x40000200>;
clock-frequency = <1378000000>;
};
cpu6: cpu@102 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x102>;
enable-method = "psci";
cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
cpu-release-addr = <0x0 0x40000200>;
clock-frequency = <1378000000>;
};
cpu7: cpu@103 {
device_type = "cpu";
compatible = "arm,cortex-a53";
reg = <0x103>;
enable-method = "psci";
cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
cpu-release-addr = <0x0 0x40000200>;
clock-frequency = <1378000000>;
};
cpu8: cpu@200 {
device_type = "cpu";
compatible = "arm,cortex-a73";
reg = <0x200>;
enable-method = "psci";
cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
cpu-release-addr = <0x0 0x40000200>;
clock-frequency = <1638000000>;
};
cpu9: cpu@201 {
device_type = "cpu";
compatible = "arm,cortex-a73";
reg = <0x201>;
enable-method = "psci";
cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
<&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
cpu-release-addr = <0x0 0x40000200>;
clock-frequency = <1638000000>;
};
cpu-map {
cluster0 {
core0 {
cpu = <&cpu0>;
};
core1 {
cpu = <&cpu1>;
};
core2 {
cpu = <&cpu2>;
};
core3 {
cpu = <&cpu3>;
};
};
cluster1 {
core0 {
cpu = <&cpu4>;
};
core1 {
cpu = <&cpu5>;
};
core2 {
cpu = <&cpu6>;
};
core3 {
cpu = <&cpu7>;
};
};
cluster2 {
core0 {
cpu = <&cpu8>;
};
core1 {
cpu = <&cpu9>;
};
};
};
- After parse_dt_topology() and update_siblings_masks() have run, cpu_topology[] holds the following values:
cpu 0 cluster_id = 0, core_id = 0, core_sibling = 0xf
cpu 1 cluster_id = 0, core_id = 1, core_sibling = 0xf
cpu 2 cluster_id = 0, core_id = 2, core_sibling = 0xf
cpu 3 cluster_id = 0, core_id = 3, core_sibling = 0xf
cpu 4 cluster_id = 1, core_id = 0, core_sibling = 0xf0
cpu 5 cluster_id = 1, core_id = 1, core_sibling = 0xf0
cpu 6 cluster_id = 1, core_id = 2, core_sibling = 0xf0
cpu 7 cluster_id = 1, core_id = 3, core_sibling = 0xf0
cpu 8 cluster_id = 2, core_id = 0, core_sibling = 0x300
cpu 9 cluster_id = 2, core_id = 1, core_sibling = 0x300
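These sibling masks are exactly what the scheduler topology levels consume: on arm64, cpu_coregroup_mask(), which the MC level in the next section uses, essentially just returns the core_sibling mask (as in arch/arm64/kernel/topology.c):

const struct cpumask *cpu_coregroup_mask(int cpu)
{
	return &cpu_topology[cpu].core_sibling;
}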
4.1.1.3 Scheduling Domains Initialization
In kernel_init_freeable(), smp_prepare_cpus() first initializes the cpu topology, then smp_init() brings up the secondary cpus, and right after that sched_init_smp() is called to initialize the system's scheduling domains.
There are three possible topology levels by default: SMT/MC/DIE. arm currently has no hardware multithreading, so only two levels are used: MC/DIE.
/*
* Topology list, bottom-up.
*/
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
{ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
{ cpu_cpu_mask, SD_INIT_NAME(DIE) },
{ NULL, },
};
arm64 uses the following SDTL (sched_domain_topology_level) table:
static struct sched_domain_topology_level arm64_topology[] = {
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) },
#endif
{ cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
{ NULL, },
};
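For reference, each entry above is a struct sched_domain_topology_level. In this EAS-enabled kernel it looks roughly like the sketch below (reconstructed for illustration; the energy callback only exists in EAS kernels, and the field order matches the initializers above):

typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
typedef int (*sched_domain_flags_f)(void);
typedef const struct sched_group_energy * const (*sched_domain_energy_f)(int cpu);

struct sched_domain_topology_level {
	sched_domain_mask_f   mask;     /* cpus spanned by this level for a given cpu */
	sched_domain_flags_f  sd_flags; /* extra SD_* flags for sd_init() */
	sched_domain_energy_f energy;   /* energy model table (EAS kernels only) */
	int                   flags;
	int                   numa_level;
	struct sd_data        data;     /* per-cpu sd/sg/sgc storage, filled by __sdt_alloc() */
#ifdef CONFIG_SCHED_DEBUG
	char                 *name;     /* "MC", "DIE", ... set by SD_INIT_NAME() */
#endif
};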
The scheduling-domain initialization code is analyzed below:
kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains(cpu_active_mask):
↓
static int init_sched_domains(const struct cpumask *cpu_map)
{
int err;
arch_update_cpu_topology();
/* (1) Currently only one sched-domain partition needs to be built */
ndoms_cur = 1;
doms_cur = alloc_sched_domains(ndoms_cur);
if (!doms_cur)
doms_cur = &fallback_doms;
/* (2) Build the sched domains for the cpu_active_mask passed in, minus isolated cpus */
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
err = build_sched_domains(doms_cur[0], NULL);
/* (3) Register "/proc/sys/kernel/sched_domain/" */
register_sched_domain_sysctl();
return err;
}
|→
static int build_sched_domains(const struct cpumask *cpu_map,
struct sched_domain_attr *attr)
{
enum s_alloc alloc_state;
struct sched_domain *sd;
struct s_data d;
struct rq *rq = NULL;
int i, ret = -ENOMEM;
/* (2.1) At each tl level, allocate sd, sg and sgc storage for every cpu */
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
if (alloc_state != sa_rootdomain)
goto error;
/* Set up domains for cpus specified by the cpu_map. */
for_each_cpu(i, cpu_map) {
struct sched_domain_topology_level *tl;
sd = NULL;
for_each_sd_topology(tl) {
/* (2.2) Initialize the sd:
build the parent/child links between the sds of the different tl levels,
and fill sd->span[] according to the tl->mask() function given by the SDTL
*/
sd = build_sched_domain(tl, cpu_map, attr, sd, i);
/* (2.2.1) Store the lowest tl level's sd into d.sd */
if (tl == sched_domain_topology)
*per_cpu_ptr(d.sd, i) = sd;
if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
sd->flags |= SD_OVERLAP;
if (cpumask_equal(cpu_map, sched_domain_span(sd)))
break;
}
}
/* Build the groups for the domains */
for_each_cpu(i, cpu_map) {
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
/* (2.3) Set sd->span_weight */
sd->span_weight = cpumask_weight(sched_domain_span(sd));
if (sd->flags & SD_OVERLAP) {
if (build_overlap_sched_groups(sd, i))
goto error;
} else {
/* (2.4) According to the span, build the sd <-> sg links at each tl level */
if (build_sched_groups(sd, i))
goto error;
}
}
}
/* Calculate CPU capacity for physical packages and nodes */
for (i = nr_cpumask_bits-1; i >= 0; i--) {
struct sched_domain_topology_level *tl = sched_domain_topology;
if (!cpumask_test_cpu(i, cpu_map))
continue;
for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) {
/* (2.5) Initialize the energy table that sg->sge points to */
init_sched_energy(i, sd, tl->energy);
/* (2.6) Mark the sd/sg/sgc that are actually referenced;
unreferenced sd/sg/sgc will be freed in __free_domain_allocs()
*/
claim_allocations(i, sd);
/* (2.7) Initialize sgc->capacity at each tl level */
init_sched_groups_capacity(i, sd);
}
}
/* Attach the domains */
rcu_read_lock();
/* (2.8) Attach the per-cpu d.sd to rq->sd
and d.rd to rq->rd
*/
for_each_cpu(i, cpu_map) {
rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i);
cpu_attach_domain(sd, d.rd, i);
}
rcu_read_unlock();
ret = 0;
error:
/* (2.9) Free what was left over by a failed allocation, or what was not claimed after a successful one */
__free_domain_allocs(&d, alloc_state, cpu_map);
return ret;
}
||→
static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
const struct cpumask *cpu_map)
{
memset(d, 0, sizeof(*d));
/* (2.1.1) At each tl level, allocate sd, sg and sgc for every cpu:
tl->data->sd, tl->data->sg, tl->data->sgc
*/
if (__sdt_alloc(cpu_map))
return sa_sd_storage;
/* (2.1.2) Allocate the per-cpu pointer array d->sd;
each d->sd entry will later point at the lowest tl level's tl->data->sd
*/
d->sd = alloc_percpu(struct sched_domain *);
if (!d->sd)
return sa_sd_storage;
/* (2.1.3) Allocate d->rd, both the pointer and the backing object,
rd = root_domain
*/
d->rd = alloc_rootdomain();
if (!d->rd)
return sa_sd;
return sa_rootdomain;
}
||→
struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
struct sched_domain *child, int cpu)
{
struct sched_domain *sd = sd_init(tl, cpu);
if (!sd)
return child;
/* (2.2.1) Initialize sd->span[] from tl->mask() */
cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
if (child) {
sd->level = child->level + 1;
sched_domain_level_max = max(sched_domain_level_max, sd->level);
/* (2.2.2) With multiple tl levels, set up the parent/child links between the sds;
for arm: the MC level tl->data->sd is the child, the DIE level tl->data->sd is the parent
*/
child->parent = sd;
sd->child = child;
if (!cpumask_subset(sched_domain_span(child),
sched_domain_span(sd))) {
pr_err("BUG: arch topology borken\n");
#ifdef CONFIG_SCHED_DEBUG
pr_err(" the %s domain not a subset of the %s domain\n",
child->name, sd->name);
#endif
/* Fixup, ensure @sd has at least @child cpus. */
cpumask_or(sched_domain_span(sd),
sched_domain_span(sd),
sched_domain_span(child));
}
}
set_domain_attribute(sd, attr);
return sd;
}
||→
static int
build_sched_groups(struct sched_domain *sd, int cpu)
{
struct sched_group *first = NULL, *last = NULL;
struct sd_data *sdd = sd->private;
const struct cpumask *span = sched_domain_span(sd);
struct cpumask *covered;
int i;
/* (2.4.1) Link sd and sg according to sd->span[]:
if the sd has no child, each cpu's sd is linked to its own sg;
if the sd has a child, each cpu's sd is linked to the sg of the first cpu of the child's span
*/
get_group(cpu, sdd, &sd->groups);
atomic_inc(&sd->groups->ref);
if (cpu != cpumask_first(span))
return 0;
lockdep_assert_held(&sched_domains_mutex);
covered = sched_domains_tmpmask;
cpumask_clear(covered);
/* (2.4.2) For the sgs that are referenced by an sd, fill in their sg->cpumask[] */
for_each_cpu(i, span) {
struct sched_group *sg;
int group, j;
if (cpumask_test_cpu(i, covered))
continue;
group = get_group(i, sdd, &sg);
cpumask_setall(sched_group_mask(sg));
for_each_cpu(j, span) {
if (get_group(j, sdd, NULL) != group)
continue;
cpumask_set_cpu(j, covered);
cpumask_set_cpu(j, sched_group_cpus(sg));
}
/* (2.4.3) Link the sgs of the same level that are referenced by an sd into a circular list */
if (!first)
first = sg;
if (last)
last->next = sg;
last = sg;
}
last->next = first;
return 0;
}
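The comments above lean on what get_group() does; its logic in this kernel generation is roughly the sketch below (reconstructed for illustration, not quoted verbatim): a leaf sd picks its own per-cpu sg, a non-leaf sd picks the sg of the first cpu of its child's span, and the reference taken here is what later tells claim_allocations() which sg/sgc are in use.

static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
{
	struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
	struct sched_domain *child = sd->child;

	/* a non-leaf sd represents its child's span by that span's first cpu */
	if (child)
		cpu = cpumask_first(sched_domain_span(child));

	if (sg) {
		*sg = *per_cpu_ptr(sdd->sg, cpu);
		(*sg)->sgc = *per_cpu_ptr(sdd->sgc, cpu);
		/* mark the sgc as referenced so claim_allocations() keeps it */
		atomic_set(&(*sg)->sgc->ref, 1);
	}

	return cpu;
}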
||→
static void init_sched_energy(int cpu, struct sched_domain *sd,
sched_domain_energy_f fn)
{
if (!(fn && fn(cpu)))
return;
if (cpu != group_balance_cpu(sd->groups))
return;
if (sd->child && !sd->child->groups->sge) {
pr_err("BUG: EAS setup broken for CPU%d\n", cpu);
#ifdef CONFIG_SCHED_DEBUG
pr_err(" energy data on %s but not on %s domain\n",
sd->name, sd->child->name);
#endif
return;
}
check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups));
/* (2.5.1) At each tl level, set sg->sge from tl->energy() */
sd->groups->sge = fn(cpu);
}
||→
static void claim_allocations(int cpu, struct sched_domain *sd)
{
struct sd_data *sdd = sd->private;
/* (2.6.1) NULL out the tl->data->sd/sg/sgc entries that are in use (so they survive);
the unused storage will be freed in __free_domain_allocs()
*/
WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
*per_cpu_ptr(sdd->sd, cpu) = NULL;
if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
*per_cpu_ptr(sdd->sg, cpu) = NULL;
if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
*per_cpu_ptr(sdd->sgc, cpu) = NULL;
}
||→
static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
struct sched_group *sg = sd->groups;
WARN_ON(!sg);
do {
/* (2.7.1) Update sg->group_weight */
sg->group_weight = cpumask_weight(sched_group_cpus(sg));
sg = sg->next;
} while (sg != sd->groups);
if (cpu != group_balance_cpu(sg))
return;
/* (2.7.2) Update sgc->capacity */
update_group_capacity(sd, cpu);
/* (2.7.3) Update sgc->nr_busy_cpus */
atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
}
|||→
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
unsigned long capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
interval = clamp(interval, 1UL, max_load_balance_interval);
sdg->sgc->next_update = jiffies + interval;
if (!child) {
/* (2.7.2.1) If the sd has no child it belongs to the lowest tl level:
call arch_scale_cpu_capacity() to get the cpu's maximum capacity, subtract what rt
tasks consume (rq->rt_avg), and store the result in this sd's sg->sgc->capacity
*/
update_cpu_capacity(sd, cpu);
return;
}
capacity = 0;
if (child->flags & SD_OVERLAP) {
/*
* SD_OVERLAP domains cannot assume that child groups
* span the current group.
*/
for_each_cpu(cpu, sched_group_cpus(sdg)) {
struct sched_group_capacity *sgc;
struct rq *rq = cpu_rq(cpu);
/*
* build_sched_domains() -> init_sched_groups_capacity()
* gets here before we've attached the domains to the
* runqueues.
*
* Use capacity_of(), which is set irrespective of domains
* in update_cpu_capacity().
*
* This avoids capacity from being 0 and
* causing divide-by-zero issues on boot.
*/
if (unlikely(!rq->sd)) {
capacity += capacity_of(cpu);
continue;
}
sgc = rq->sd->groups->sgc;
capacity += sgc->capacity;
}
} else {
/*
* !SD_OVERLAP domains can assume that child groups
* span the current group.
*/
/* (2.7.2.2) If the sd has a child (it is not the lowest tl level),
sgc->capacity is the sum of group->sgc->capacity over all child sgs
*/
group = child->groups;
do {
capacity += group->sgc->capacity;
group = group->next;
} while (group != child->groups);
}
sdg->sgc->capacity = capacity;
}
||||→
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
struct sched_group *sdg = sd->groups;
struct max_cpu_capacity *mcc;
unsigned long max_capacity;
int max_cap_cpu;
unsigned long flags;
/* (2.7.2.1.1) arch_scale_cpu_capacity() gives this cpu's maximum/original capacity */
cpu_rq(cpu)->cpu_capacity_orig = capacity;
mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
raw_spin_lock_irqsave(&mcc->lock, flags);
max_capacity = mcc->val;
max_cap_cpu = mcc->cpu;
if ((max_capacity > capacity && max_cap_cpu == cpu) ||
(max_capacity < capacity)) {
mcc->val = capacity;
mcc->cpu = cpu;
#ifdef CONFIG_SCHED_DEBUG
raw_spin_unlock_irqrestore(&mcc->lock, flags);
/* pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); */
goto skip_unlock;
#endif
}
raw_spin_unlock_irqrestore(&mcc->lock, flags);
skip_unlock: __attribute__ ((unused));
/* (2.7.2.1.2) Subtract the capacity consumed by rt:
rq->rt_avg / (sched_avg_period() + delta) is the fraction of the cpu taken by rt tasks,
what is left is the capacity available to cfs
*/
capacity *= scale_rt_capacity(cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
if (!capacity)
capacity = 1;
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
}
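scale_rt_capacity() implements the ratio mentioned in the comment above; a sketch of its logic in this kernel generation (reconstructed from memory, illustrative rather than authoritative): the rt share of the averaging window is computed from rq->rt_avg and subtracted from SCHED_CAPACITY_SCALE (1024). So if rt tasks have been using about 25% of the cpu, cpu_capacity ends up at roughly 75% of cpu_capacity_orig.

static unsigned long scale_rt_capacity(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	u64 total, used, age_stamp, avg;
	s64 delta;

	/* rq->rt_avg decays over time; age_stamp marks the start of the current window */
	age_stamp = READ_ONCE(rq->age_stamp);
	avg = READ_ONCE(rq->rt_avg);
	delta = __rq_clock_broken(rq) - age_stamp;

	if (unlikely(delta < 0))
		delta = 0;

	total = sched_avg_period() + delta;

	/* used = rt share of the window, scaled to SCHED_CAPACITY_SCALE (1024) */
	used = div_u64(avg, total >> SCHED_CAPACITY_SHIFT);

	if (likely(used < SCHED_CAPACITY_SCALE))
		return SCHED_CAPACITY_SCALE - used;

	return 1;
}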
init_sched_domains() builds the sched domains at boot time. When cpu hotplug changes the set of online cpus, partition_sched_domains() is called to rebuild them:
cpu_up() -> _cpu_up() -> __raw_notifier_call_chain() -> cpuset_cpu_active() -> cpuset_update_active_cpus() -> partition_sched_domains() -> build_sched_domains();
void __init sched_init_smp(void)
{
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
}
static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
switch (action) {
case CPU_ONLINE_FROZEN:
case CPU_DOWN_FAILED_FROZEN:
/*
* num_cpus_frozen tracks how many CPUs are involved in suspend
* resume sequence. As long as this is not the last online
* operation in the resume sequence, just build a single sched
* domain, ignoring cpusets.
*/
num_cpus_frozen--;
if (likely(num_cpus_frozen)) {
partition_sched_domains(1, NULL, NULL);
break;
}
/*
* This is the last CPU online operation. So fall through and
* restore the original sched domains by considering the
* cpuset configurations.
*/
case CPU_ONLINE:
cpuset_update_active_cpus(true);
break;
default:
return NOTIFY_DONE;
}
return NOTIFY_OK;
}
static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
void *hcpu)
{
unsigned long flags;
long cpu = (long)hcpu;
struct dl_bw *dl_b;
bool overflow;
int cpus;
switch (action) {
case CPU_DOWN_PREPARE:
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
cpus = dl_bw_cpus(cpu);
overflow = __dl_overflow(dl_b, cpus, 0, 0);
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
rcu_read_unlock_sched();
if (overflow)
return notifier_from_errno(-EBUSY);
cpuset_update_active_cpus(false);
break;
case CPU_DOWN_PREPARE_FROZEN:
num_cpus_frozen++;
partition_sched_domains(1, NULL, NULL);
break;
default:
return NOTIFY_DONE;
}
return NOTIFY_OK;
}
4.1.1.4 Scheduling Domains on mt6799
At boot, because "maxcpus=8" is passed on the cmdline, setup_max_cpus = 8 and SMP brings up only 8 cores; the other 2 big cores of mt6799 are started later. Let's look at what the scheduling domains look like while the system runs on these 8 cores.
During boot, every tl level allocates sd, sg and sgc storage for every cpu, but once the links are established some of the sg/sgc objects turn out to be unused. The unused ones are identified in claim_allocations() and freed by __free_domain_allocs() before build_sched_domains() returns.
kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains() -> build_sched_domains() -> __visit_domain_allocation_hell() -> __sdt_alloc():
[__sdt_alloc][tl MC] cpu0, &sd = 0xffffffc15663c600, &sg = 0xffffffc156062600, &sgc = 0xffffffc156062780
[__sdt_alloc][tl MC] cpu1, &sd = 0xffffffc15608f000, &sg = 0xffffffc156056780, &sgc = 0xffffffc156090000
[__sdt_alloc][tl MC] cpu2, &sd = 0xffffffc15608fc00, &sg = 0xffffffc156090d80, &sgc = 0xffffffc156090180
[__sdt_alloc][tl MC] cpu3, &sd = 0xffffffc15608f300, &sg = 0xffffffc156090c00, &sgc = 0xffffffc156090300
[__sdt_alloc][tl MC] cpu4, &sd = 0xffffffc15608f900, &sg = 0xffffffc156090a80, &sgc = 0xffffffc156090480
[__sdt_alloc][tl MC] cpu5, &sd = 0xffffffc15608f600, &sg = 0xffffffc156090900, &sgc = 0xffffffc156090600
[__sdt_alloc][tl MC] cpu6, &sd = 0xffffffc156091000, &sg = 0xffffffc156090780, &sgc = 0xffffffc156092000
[__sdt_alloc][tl MC] cpu7, &sd = 0xffffffc156091c00, &sg = 0xffffffc156092d80, &sgc = 0xffffffc156092180
[__sdt_alloc][tl DIE] cpu0, &sd = 0xffffffc156091300, &sg = 0xffffffc156092c00, &sgc = 0xffffffc156092300
[__sdt_alloc][tl DIE] cpu1, &sd = 0xffffffc156091900, &sg = 0xffffffc156092a80, &sgc = 0xffffffc156092480
[__sdt_alloc][tl DIE] cpu2, &sd = 0xffffffc156091600, &sg = 0xffffffc156092900, &sgc = 0xffffffc156092600
[__sdt_alloc][tl DIE] cpu3, &sd = 0xffffffc156093000, &sg = 0xffffffc156092780, &sgc = 0xffffffc156094000
[__sdt_alloc][tl DIE] cpu4, &sd = 0xffffffc156093c00, &sg = 0xffffffc156094d80, &sgc = 0xffffffc156094180
[__sdt_alloc][tl DIE] cpu5, &sd = 0xffffffc156093300, &sg = 0xffffffc156094c00, &sgc = 0xffffffc156094300
[__sdt_alloc][tl DIE] cpu6, &sd = 0xffffffc156093900, &sg = 0xffffffc156094a80, &sgc = 0xffffffc156094480
[__sdt_alloc][tl DIE] cpu7, &sd = 0xffffffc156093600, &sg = 0xffffffc156094900, &sgc = 0xffffffc156094600
After the links are established, the sd <-> sg relationships at each tl level are:
kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains() -> build_sched_domains() -> build_sched_groups():
[build_sched_domains][tl MC] cpu0, sd->groups=0xffffffc156062600, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf
[build_sched_domains][tl MC] cpu0, sg->sgc=0xffffffc156062780, sg->next=0xffffffc156056780, sg->group_weight=0, sg->cpumask[]=0x1
[build_sched_domains][tl MC] cpu0, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu0, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0
[build_sched_domains][tl MC] cpu0, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu1, sd->groups=0xffffffc156056780, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf
[build_sched_domains][tl MC] cpu1, sg->sgc=0xffffffc156090000, sg->next=0xffffffc156090d80, sg->group_weight=0, sg->cpumask[]=0x2
[build_sched_domains][tl MC] cpu1, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu1, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0
[build_sched_domains][tl MC] cpu1, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu2, sd->groups=0xffffffc156090d80, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf
[build_sched_domains][tl MC] cpu2, sg->sgc=0xffffffc156090180, sg->next=0xffffffc156090c00, sg->group_weight=0, sg->cpumask[]=0x4
[build_sched_domains][tl MC] cpu2, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu2, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0
[build_sched_domains][tl MC] cpu2, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu3, sd->groups=0xffffffc156090c00, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf
[build_sched_domains][tl MC] cpu3, sg->sgc=0xffffffc156090300, sg->next=0xffffffc156062600, sg->group_weight=0, sg->cpumask[]=0x8
[build_sched_domains][tl MC] cpu3, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu3, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0
[build_sched_domains][tl MC] cpu3, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu4, sd->groups=0xffffffc156090a80, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0
[build_sched_domains][tl MC] cpu4, sg->sgc=0xffffffc156090480, sg->next=0xffffffc156090900, sg->group_weight=0, sg->cpumask[]=0x10
[build_sched_domains][tl MC] cpu4, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu4, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0
[build_sched_domains][tl MC] cpu4, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu5, sd->groups=0xffffffc156090900, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0
[build_sched_domains][tl MC] cpu5, sg->sgc=0xffffffc156090600, sg->next=0xffffffc156090780, sg->group_weight=0, sg->cpumask[]=0x20
[build_sched_domains][tl MC] cpu5, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu5, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0
[build_sched_domains][tl MC] cpu5, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu6, sd->groups=0xffffffc156090780, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0
[build_sched_domains][tl MC] cpu6, sg->sgc=0xffffffc156092000, sg->next=0xffffffc156092d80, sg->group_weight=0, sg->cpumask[]=0x40
[build_sched_domains][tl MC] cpu6, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu6, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0
[build_sched_domains][tl MC] cpu6, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu7, sd->groups=0xffffffc156092d80, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0
[build_sched_domains][tl MC] cpu7, sg->sgc=0xffffffc156092180, sg->next=0xffffffc156090a80, sg->group_weight=0, sg->cpumask[]=0x80
[build_sched_domains][tl MC] cpu7, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu7, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0
[build_sched_domains][tl MC] cpu7, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl DIE] cpu0, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu0, sg->sgc=0xffffffc156092300, sg->next=0xffffffc156094d80, sg->group_weight=0, sg->cpumask[]=0xf
[build_sched_domains][tl DIE] cpu0, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl DIE] cpu0, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1
[build_sched_domains][tl DIE] cpu0, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu1, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu1, sg->sgc=0x0, sg->next=0xffffffc156092a80, sg->group_weight=0, sg->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu1, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu1, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1
[build_sched_domains][tl DIE] cpu1, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu2, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu2, sg->sgc=0x0, sg->next=0xffffffc156092900, sg->group_weight=0, sg->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu2, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu2, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1
[build_sched_domains][tl DIE] cpu2, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu3, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu3, sg->sgc=0x0, sg->next=0xffffffc156092780, sg->group_weight=0, sg->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu3, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu3, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1
[build_sched_domains][tl DIE] cpu3, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu4, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu4, sg->sgc=0xffffffc156094180, sg->next=0xffffffc156092c00, sg->group_weight=0, sg->cpumask[]=0xf0
[build_sched_domains][tl DIE] cpu4, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl DIE] cpu4, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1
[build_sched_domains][tl DIE] cpu4, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu5, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu5, sg->sgc=0x0, sg->next=0xffffffc156094c00, sg->group_weight=0, sg->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu5, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu5, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1
[build_sched_domains][tl DIE] cpu5, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu6, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu6, sg->sgc=0x0, sg->next=0xffffffc156094a80, sg->group_weight=0, sg->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu6, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu6, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1
[build_sched_domains][tl DIE] cpu6, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu7, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu7, sg->sgc=0x0, sg->next=0xffffffc156094900, sg->group_weight=0, sg->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu7, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu7, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0, sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1
[build_sched_domains][tl DIE] cpu7, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
Expressed graphically, the relationships are as follows:
The parameters inside each sched_domain are also very important; they are initialized in sd_init() and used heavily by SMP load balancing:
| sd parameter | tl MC level | tl DIE level |
| --- | --- | --- |
| sd->min_interval | 4 | 8 |
| sd->max_interval | 8 | 16 |
| sd->busy_factor | 32 | 32 |
| sd->imbalance_pct | 117 | 125 |
| sd->cache_nice_tries | 1 | 1 |
| sd->busy_idx | 2 | 2 |
| sd->idle_idx | 0 | 1 |
| sd->newidle_idx | 0 | 0 |
| sd->wake_idx | 0 | 0 |
| sd->forkexec_idx | 0 | 0 |
| sd->span_weight | 4 | 8 |
| sd->balance_interval | 4 | 8 |
| sd->level | 0 | 1 |
| sd->flags | 0x832f: SD_LOAD_BALANCE \| SD_BALANCE_NEWIDLE \| SD_BALANCE_EXEC \| SD_BALANCE_FORK \| SD_WAKE_AFFINE \| SD_SHARE_POWERDOMAIN \| SD_SHARE_PKG_RESOURCES \| SD_SHARE_CAP_STATES | 0x102f: SD_LOAD_BALANCE \| SD_BALANCE_NEWIDLE \| SD_BALANCE_EXEC \| SD_BALANCE_FORK \| SD_WAKE_AFFINE \| SD_PREFER_SIBLING |
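To see how a few of these fields are consumed, here is a sketch of how the periodic-balance interval is derived (reconstructed from this kernel generation, illustrative rather than authoritative): sd->balance_interval is stretched by sd->busy_factor on a busy cpu and then clamped, while load_balance() separately grows or resets sd->balance_interval between sd->min_interval and sd->max_interval depending on whether the last attempt migrated anything.

/* sketch: how the interval between balance attempts is computed */
static inline unsigned long
get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
{
	unsigned long interval = sd->balance_interval;	/* in ms */

	/* a busy cpu balances far less frequently than an idle one */
	if (cpu_busy)
		interval *= sd->busy_factor;

	interval = msecs_to_jiffies(interval);
	interval = clamp(interval, 1UL, max_load_balance_interval);

	return interval;
}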
update_top_cache_domain() also caches a few frequently used sd pointers; from the prints below we can see which level each cached pointer actually corresponds to:

| cached sd | definition | assigned value |
| --- | --- | --- |
| sd_busy | per_cpu(sd_busy, cpu) | this cpu's tl DIE level sd |
| sd_llc | per_cpu(sd_llc, cpu) | this cpu's tl MC level sd |
| sd_llc_size | per_cpu(sd_llc_size, cpu) | 4 |
| sd_llc_id | per_cpu(sd_llc_id, cpu) | 0/4 |
| sd_numa | per_cpu(sd_numa, cpu) | 0 |
| sd_asym | per_cpu(sd_asym, cpu) | 0 |
| sd_ea | per_cpu(sd_ea, cpu) | this cpu's tl DIE level sd |
| sd_scs | per_cpu(sd_scs, cpu) | this cpu's tl MC level sd |
static void update_top_cache_domain(int cpu)
{
struct sched_domain *sd;
struct sched_domain *busy_sd = NULL, *ea_sd = NULL;
int id = cpu;
int size = 1;
sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
busy_sd = sd->parent; /* sd_busy */
}
rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);
rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
per_cpu(sd_llc_size, cpu) = size;
per_cpu(sd_llc_id, cpu) = id;
sd = lowest_flag_domain(cpu, SD_NUMA);
rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
for_each_domain(cpu, sd) {
if (sd->groups->sge)
ea_sd = sd;
else
break;
}
rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
}
[update_top_cache_domain] cpu0, sd_busy=0xffffffc156091300, sd_llc=0xffffffc15663c600, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156091300, sd_scs=0xffffffc15663c600
[update_top_cache_domain] cpu1, sd_busy=0xffffffc156091900, sd_llc=0xffffffc15608f000, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156091900, sd_scs=0xffffffc15608f000
[update_top_cache_domain] cpu2, sd_busy=0xffffffc156091600, sd_llc=0xffffffc15608fc00, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156091600, sd_scs=0xffffffc15608fc00
[update_top_cache_domain] cpu3, sd_busy=0xffffffc156093000, sd_llc=0xffffffc15608f300, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093000, sd_scs=0xffffffc15608f300
[update_top_cache_domain] cpu4, sd_busy=0xffffffc156093c00, sd_llc=0xffffffc15608f900, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093c00, sd_scs=0xffffffc15608f900
[update_top_cache_domain] cpu5, sd_busy=0xffffffc156093300, sd_llc=0xffffffc15608f600, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093300, sd_scs=0xffffffc15608f600
[update_top_cache_domain] cpu6, sd_busy=0xffffffc156093900, sd_llc=0xffffffc156091000, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093900, sd_scs=0xffffffc156091000
[update_top_cache_domain] cpu7, sd_busy=0xffffffc156093600, sd_llc=0xffffffc156091c00, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093600, sd_scs=0xffffffc156091c00
The tables mt6799 uses when computing power (energy) and compute capacity are the following:
kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains() -> build_sched_domains() -> init_sched_energy()/init_sched_groups_capacity();
/* v1 FY */
struct upower_tbl_info upower_tbl_infos_FY[NR_UPOWER_BANK] = {
INIT_UPOWER_TBL_INFOS(UPOWER_BANK_LL, upower_tbl_ll_1_FY),
INIT_UPOWER_TBL_INFOS(UPOWER_BANK_L, upower_tbl_l_1_FY),
INIT_UPOWER_TBL_INFOS(UPOWER_BANK_B, upower_tbl_b_1_FY),
INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CLS_LL, upower_tbl_cluster_ll_1_FY),
INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CLS_L, upower_tbl_cluster_l_1_FY),
INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CLS_B, upower_tbl_cluster_b_1_FY),
INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CCI, upower_tbl_cci_1_FY),
};
/* ver1 */
/* FY table */
struct upower_tbl upower_tbl_ll_1_FY = {
.row = {
{.cap = 100, .volt = 75000, .dyn_pwr = 9994, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} },
{.cap = 126, .volt = 75000, .dyn_pwr = 12585, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} },
{.cap = 148, .volt = 75000, .dyn_pwr = 14806, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} },
{.cap = 167, .volt = 75000, .dyn_pwr = 16656, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} },
{.cap = 189, .volt = 75000, .dyn_pwr = 18877, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} },
{.cap = 212, .volt = 75000, .dyn_pwr = 21098, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} },
{.cap = 230, .volt = 75700, .dyn_pwr = 23379, .lkg_pwr = {13936, 13936, 13936, 13936, 13936, 13936} },
{.cap = 245, .volt = 78100, .dyn_pwr = 26490, .lkg_pwr = {14811, 14811, 14811, 14811, 14811, 14811} },
{.cap = 263, .volt = 81100, .dyn_pwr = 30729, .lkg_pwr = {15958, 15958, 15958, 15958, 15958, 15958} },
{.cap = 278, .volt = 83500, .dyn_pwr = 34409, .lkg_pwr = {16949, 16949, 16949, 16949, 16949, 16949} },
{.cap = 293, .volt = 86000, .dyn_pwr = 38447, .lkg_pwr = {18036, 18036, 18036, 18036, 18036, 18036} },
{.cap = 304, .volt = 88400, .dyn_pwr = 42166, .lkg_pwr = {19159, 19159, 19159, 19159, 19159, 19159} },
{.cap = 319, .volt = 90800, .dyn_pwr = 46657, .lkg_pwr = {20333, 20333, 20333, 20333, 20333, 20333} },
{.cap = 334, .volt = 93200, .dyn_pwr = 51442, .lkg_pwr = {21605, 21605, 21605, 21605, 21605, 21605} },
{.cap = 345, .volt = 95000, .dyn_pwr = 55230, .lkg_pwr = {22560, 22560, 22560, 22560, 22560, 22560} },
{.cap = 356, .volt = 97400, .dyn_pwr = 59928, .lkg_pwr = {24002, 24002, 24002, 24002, 24002, 24002} },
},
.lkg_idx = DEFAULT_LKG_IDX,
.row_num = UPOWER_OPP_NUM,
.nr_idle_states = NR_UPOWER_CSTATES,
.idle_states = {
{{0}, {7321} },
{{0}, {7321} },
{{0}, {7321} },
{{0}, {7321} },
{{0}, {7321} },
{{0}, {7321} },
},
};
struct upower_tbl upower_tbl_cluster_ll_1_FY = {
.row = {
{.cap = 100, .volt = 75000, .dyn_pwr = 3656, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} },
{.cap = 126, .volt = 75000, .dyn_pwr = 4604, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} },
{.cap = 148, .volt = 75000, .dyn_pwr = 5417, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} },
{.cap = 167, .volt = 75000, .dyn_pwr = 6094, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} },
{.cap = 189, .volt = 75000, .dyn_pwr = 6906, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} },
{.cap = 212, .volt = 75000, .dyn_pwr = 7719, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} },
{.cap = 230, .volt = 75700, .dyn_pwr = 8553, .lkg_pwr = {22134, 22134, 22134, 22134, 22134, 22134} },
{.cap = 245, .volt = 78100, .dyn_pwr = 9692, .lkg_pwr = {23523, 23523, 23523, 23523, 23523, 23523} },
{.cap = 263, .volt = 81100, .dyn_pwr = 11242, .lkg_pwr = {25344, 25344, 25344, 25344, 25344, 25344} },
{.cap = 278, .volt = 83500, .dyn_pwr = 12589, .lkg_pwr = {26919, 26919, 26919, 26919, 26919, 26919} },
{.cap = 293, .volt = 86000, .dyn_pwr = 14066, .lkg_pwr = {28646, 28646, 28646, 28646, 28646, 28646} },
{.cap = 304, .volt = 88400, .dyn_pwr = 15427, .lkg_pwr = {30430, 30430, 30430, 30430, 30430, 30430} },
{.cap = 319, .volt = 90800, .dyn_pwr = 17069, .lkg_pwr = {32293, 32293, 32293, 32293, 32293, 32293} },
{.cap = 334, .volt = 93200, .dyn_pwr = 18820, .lkg_pwr = {34314, 34314, 34314, 34314, 34314, 34314} },
{.cap = 345, .volt = 95000, .dyn_pwr = 20206, .lkg_pwr = {35830, 35830, 35830, 35830, 35830, 35830} },
{.cap = 356, .volt = 97400, .dyn_pwr = 21925, .lkg_pwr = {38121, 38121, 38121, 38121, 38121, 38121} },
},
.lkg_idx = DEFAULT_LKG_IDX,
.row_num = UPOWER_OPP_NUM,
.nr_idle_states = NR_UPOWER_CSTATES,
.idle_states = {
{{0}, {11628} },
{{0}, {11628} },
{{0}, {11628} },
{{0}, {11628} },
{{0}, {11628} },
{{0}, {11628} },
},
};
struct upower_tbl upower_tbl_l_1_FY = {
.row = {
{.cap = 116, .volt = 75000, .dyn_pwr = 16431, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} },
{.cap = 152, .volt = 75000, .dyn_pwr = 21486, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} },
{.cap = 179, .volt = 75000, .dyn_pwr = 25278, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} },
{.cap = 201, .volt = 75000, .dyn_pwr = 28437, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} },
{.cap = 228, .volt = 75000, .dyn_pwr = 32229, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} },
{.cap = 255, .volt = 75000, .dyn_pwr = 36021, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} },
{.cap = 282, .volt = 75700, .dyn_pwr = 40559, .lkg_pwr = {23423, 23423, 23423, 23423, 23423, 23423} },
{.cap = 304, .volt = 78100, .dyn_pwr = 46598, .lkg_pwr = {24968, 24968, 24968, 24968, 24968, 24968} },
{.cap = 331, .volt = 81100, .dyn_pwr = 54680, .lkg_pwr = {26999, 26999, 26999, 26999, 26999, 26999} },
{.cap = 349, .volt = 83500, .dyn_pwr = 61098, .lkg_pwr = {28760, 28760, 28760, 28760, 28760, 28760} },
{.cap = 371, .volt = 86000, .dyn_pwr = 68965, .lkg_pwr = {30698, 30698, 30698, 30698, 30698, 30698} },
{.cap = 393, .volt = 88400, .dyn_pwr = 77258, .lkg_pwr = {32706, 32706, 32706, 32706, 32706, 32706} },
{.cap = 416, .volt = 90800, .dyn_pwr = 86141, .lkg_pwr = {34808, 34808, 34808, 34808, 34808, 34808} },
{.cap = 438, .volt = 93200, .dyn_pwr = 95634, .lkg_pwr = {37097, 37097, 37097, 37097, 37097, 37097} },
{.cap = 452, .volt = 95000, .dyn_pwr = 102406, .lkg_pwr = {38814, 38814, 38814, 38814, 38814, 38814} },
{.cap = 474, .volt = 97400, .dyn_pwr = 112974, .lkg_pwr = {41424, 41424, 41424, 41424, 41424, 41424} },
},
.lkg_idx = DEFAULT_LKG_IDX,
.row_num = UPOWER_OPP_NUM,
.nr_idle_states = NR_UPOWER_CSTATES,
.idle_states = {
{{0}, {11926} },
{{0}, {11926} },
{{0}, {11926} },
{{0}, {11926} },
{{0}, {11926} },
{{0}, {11926} },
},
};
struct upower_tbl upower_tbl_cluster_l_1_FY = {
.row = {
{.cap = 116, .volt = 75000, .dyn_pwr = 2778, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} },
{.cap = 152, .volt = 75000, .dyn_pwr = 3633, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} },
{.cap = 179, .volt = 75000, .dyn_pwr = 4274, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} },
{.cap = 201, .volt = 75000, .dyn_pwr = 4808, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} },
{.cap = 228, .volt = 75000, .dyn_pwr = 5449, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} },
{.cap = 255, .volt = 75000, .dyn_pwr = 6090, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} },
{.cap = 282, .volt = 75700, .dyn_pwr = 6857, .lkg_pwr = {27058, 27058, 27058, 27058, 27058, 27058} },
{.cap = 304, .volt = 78100, .dyn_pwr = 7878, .lkg_pwr = {28843, 28843, 28843, 28843, 28843, 28843} },
{.cap = 331, .volt = 81100, .dyn_pwr = 9245, .lkg_pwr = {31188, 31188, 31188, 31188, 31188, 31188} },
{.cap = 349, .volt = 83500, .dyn_pwr = 10330, .lkg_pwr = {33223, 33223, 33223, 33223, 33223, 33223} },
{.cap = 371, .volt = 86000, .dyn_pwr = 11660, .lkg_pwr = {35461, 35461, 35461, 35461, 35461, 35461} },
{.cap = 393, .volt = 88400, .dyn_pwr = 13062, .lkg_pwr = {37781, 37781, 37781, 37781, 37781, 37781} },
{.cap = 416, .volt = 90800, .dyn_pwr = 14564, .lkg_pwr = {40209, 40209, 40209, 40209, 40209, 40209} },
{.cap = 438, .volt = 93200, .dyn_pwr = 16169, .lkg_pwr = {42854, 42854, 42854, 42854, 42854, 42854} },
{.cap = 452, .volt = 95000, .dyn_pwr = 17314, .lkg_pwr = {44837, 44837, 44837, 44837, 44837, 44837} },
{.cap = 474, .volt = 97400, .dyn_pwr = 19101, .lkg_pwr = {47852, 47852, 47852, 47852, 47852, 47852} },
},
.lkg_idx = DEFAULT_LKG_IDX,
.row_num = UPOWER_OPP_NUM,
.nr_idle_states = NR_UPOWER_CSTATES,
.idle_states = {
{{0}, {13776} },
{{0}, {13776} },
{{0}, {13776} },
{{0}, {13776} },
{{0}, {13776} },
{{0}, {13776} },
},
};
struct upower_tbl upower_tbl_b_1_FY = {
.row = {
{.cap = 211, .volt = 75000, .dyn_pwr = 61732, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} },
{.cap = 268, .volt = 75000, .dyn_pwr = 78352, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} },
{.cap = 317, .volt = 75000, .dyn_pwr = 92598, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} },
{.cap = 358, .volt = 75000, .dyn_pwr = 104469, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} },
{.cap = 406, .volt = 75000, .dyn_pwr = 118715, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} },
{.cap = 447, .volt = 75000, .dyn_pwr = 130587, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} },
{.cap = 504, .volt = 75700, .dyn_pwr = 149968, .lkg_pwr = {72438, 72438, 72438, 72438, 72438, 72438} },
{.cap = 561, .volt = 78100, .dyn_pwr = 177650, .lkg_pwr = {76806, 76806, 76806, 76806, 76806, 76806} },
{.cap = 634, .volt = 81100, .dyn_pwr = 216546, .lkg_pwr = {82521, 82521, 82521, 82521, 82521, 82521} },
{.cap = 691, .volt = 83500, .dyn_pwr = 250153, .lkg_pwr = {87447, 87447, 87447, 87447, 87447, 87447} },
{.cap = 748, .volt = 86000, .dyn_pwr = 287210, .lkg_pwr = {92841, 92841, 92841, 92841, 92841, 92841} },
{.cap = 805, .volt = 88400, .dyn_pwr = 326553, .lkg_pwr = {98397, 98397, 98397, 98397, 98397, 98397} },
{.cap = 861, .volt = 90800, .dyn_pwr = 368886, .lkg_pwr = {104190, 104190, 104190, 104190, 104190, 104190} },
{.cap = 918, .volt = 93200, .dyn_pwr = 414309, .lkg_pwr = {110456, 110456, 110456, 110456, 110456, 110456} },
{.cap = 959, .volt = 95000, .dyn_pwr = 449514, .lkg_pwr = {115156, 115156, 115156, 115156, 115156, 115156} },
{.cap = 1024, .volt = 97400, .dyn_pwr = 504548, .lkg_pwr = {122224, 122224, 122224, 122224, 122224, 122224} },
},
.lkg_idx = DEFAULT_LKG_IDX,
.row_num = UPOWER_OPP_NUM,
.nr_idle_states = NR_UPOWER_CSTATES,
.idle_states = {
{{0}, {38992} },
{{0}, {38992} },
{{0}, {38992} },
{{0}, {38992} },
{{0}, {38992} },
{{0}, {38992} },
},
};
struct upower_tbl upower_tbl_cluster_b_1_FY = {
.row = {
{.cap = 211, .volt = 75000, .dyn_pwr = 6408, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} },
{.cap = 268, .volt = 75000, .dyn_pwr = 8133, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} },
{.cap = 317, .volt = 75000, .dyn_pwr = 9612, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} },
{.cap = 358, .volt = 75000, .dyn_pwr = 10844, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} },
{.cap = 406, .volt = 75000, .dyn_pwr = 12323, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} },
{.cap = 447, .volt = 75000, .dyn_pwr = 13555, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} },
{.cap = 504, .volt = 75700, .dyn_pwr = 15567, .lkg_pwr = {28054, 28054, 28054, 28054, 28054, 28054} },
{.cap = 561, .volt = 78100, .dyn_pwr = 18440, .lkg_pwr = {29746, 29746, 29746, 29746, 29746, 29746} },
{.cap = 634, .volt = 81100, .dyn_pwr = 22478, .lkg_pwr = {31959, 31959, 31959, 31959, 31959, 31959} },
{.cap = 691, .volt = 83500, .dyn_pwr = 25966, .lkg_pwr = {33867, 33867, 33867, 33867, 33867, 33867} },
{.cap = 748, .volt = 86000, .dyn_pwr = 29813, .lkg_pwr = {35956, 35956, 35956, 35956, 35956, 35956} },
{.cap = 805, .volt = 88400, .dyn_pwr = 33897, .lkg_pwr = {38108, 38108, 38108, 38108, 38108, 38108} },
{.cap = 861, .volt = 90800, .dyn_pwr = 38291, .lkg_pwr = {40351, 40351, 40351, 40351, 40351, 40351} },
{.cap = 918, .volt = 93200, .dyn_pwr = 43006, .lkg_pwr = {42778, 42778, 42778, 42778, 42778, 42778} },
{.cap = 959, .volt = 95000, .dyn_pwr = 46661, .lkg_pwr = {44598, 44598, 44598, 44598, 44598, 44598} },
{.cap = 1024, .volt = 97400, .dyn_pwr = 52373, .lkg_pwr = {47335, 47335, 47335, 47335, 47335, 47335} },
},
.lkg_idx = DEFAULT_LKG_IDX,
.row_num = UPOWER_OPP_NUM,
.nr_idle_states = NR_UPOWER_CSTATES,
.idle_states = {
{{0}, {15101} },
{{0}, {15101} },
{{0}, {15101} },
{{0}, {15101} },
{{0}, {15101} },
{{0}, {15101} },
},
};
struct upower_tbl upower_tbl_cci_1_FY = {
.row = {
{.cap = 0, .volt = 75000, .dyn_pwr = 2708, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} },
{.cap = 0, .volt = 75000, .dyn_pwr = 3611, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} },
{.cap = 0, .volt = 75000, .dyn_pwr = 4288, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} },
{.cap = 0, .volt = 75000, .dyn_pwr = 5191, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} },
{.cap = 0, .volt = 75000, .dyn_pwr = 5868, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} },
{.cap = 0, .volt = 75000, .dyn_pwr = 6771, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} },
{.cap = 0, .volt = 75700, .dyn_pwr = 7588, .lkg_pwr = {16537, 16537, 16537, 16537, 16537, 16537} },
{.cap = 0, .volt = 78100, .dyn_pwr = 8811, .lkg_pwr = {17527, 17527, 17527, 17527, 17527, 17527} },
{.cap = 0, .volt = 81100, .dyn_pwr = 10292, .lkg_pwr = {18822, 18822, 18822, 18822, 18822, 18822} },
{.cap = 0, .volt = 83500, .dyn_pwr = 11750, .lkg_pwr = {19938, 19938, 19938, 19938, 19938, 19938} },
{.cap = 0, .volt = 86000, .dyn_pwr = 13354, .lkg_pwr = {21159, 21159, 21159, 21159, 21159, 21159} },
{.cap = 0, .volt = 88400, .dyn_pwr = 14737, .lkg_pwr = {22417, 22417, 22417, 22417, 22417, 22417} },
{.cap = 0, .volt = 90800, .dyn_pwr = 16540, .lkg_pwr = {23728, 23728, 23728, 23728, 23728, 23728} },
{.cap = 0, .volt = 93200, .dyn_pwr = 18472, .lkg_pwr = {25145, 25145, 25145, 25145, 25145, 25145} },
{.cap = 0, .volt = 95000, .dyn_pwr = 19916, .lkg_pwr = {26208, 26208, 26208, 26208, 26208, 26208} },
{.cap = 0, .volt = 97400, .dyn_pwr = 22077, .lkg_pwr = {27805, 27805, 27805, 27805, 27805, 27805} },
},
.lkg_idx = DEFAULT_LKG_IDX,
.row_num = UPOWER_OPP_NUM,
.nr_idle_states = NR_UPOWER_CSTATES,
.idle_states = {
{{0}, {8938} },
{{0}, {8938} },
{{0}, {8938} },
{{0}, {8938} },
{{0}, {8938} },
{{0}, {8938} },
},
};
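How the scheduler consumes these tables: in EAS kernels, sched_group_energy() charges a scheduling group a busy part plus an idle part for the selected capacity state. A hedged summary of that estimate, where \(u\) is the group utilization normalized to SCHED_CAPACITY_SCALE (1024), \(P_{busy}\) is the power of the matching row (presumably derived here from the .dyn_pwr and .lkg_pwr columns) and \(P_{idle}\) comes from .idle_states; the exact mapping of the MTK upower tables into sge->cap_states is an assumption of this sketch:

$$E_{sg} \approx \frac{u \cdot P_{busy} + (1024 - u) \cdot P_{idle}}{1024}$$

Note how the .cap column tops out at 1024 on the big cluster, matching the SCHED_CAPACITY_SCALE normalization used by the capacity code.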
4.1.2 Implementation of SMP Load Balancing
Load balancing depends on many parameters; the most important ones are listed below:
成员 | 所属结构 | 含义 | 更新/获取函数 | 计算方法 |
---|---|---|---|---|
rq->cpu_capacity_orig | rq | 本cpu总的计算能力 | init_sched_groups_capacity()/update_sd_lb_stats() -> update_group_capacity() -> update_cpu_capacity() | capacity = arch_scale_cpu_capacity(sd, cpu) |
rq->cpu_capacity | rq | 本cpu cfs的计算能力 = 总capacity - rt占用的capacity | init_sched_groups_capacity()/update_sd_lb_stats() -> update_group_capacity() -> update_cpu_capacity() | capacity *= scale_rt_capacity(cpu) |
rq->rd->max_cpu_capacity | rq->rd | root_domain中最大的cpu计算能力 | init_sched_groups_capacity()/update_sd_lb_stats() -> update_group_capacity() -> update_cpu_capacity() | |
rq->rd->overutilized | rq->rd | root_domain中是否有cpu的算力已不够用(util超出capacity的一定比例) | update_sd_lb_stats() | (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin) |
rq->rd->overload | rq->rd | root_domain中是否有cpu的runnable进程数大于1 | update_sd_lb_stats() | rq->nr_running > 1 |
rq->rt_avg | rq | 本cpu的rt平均负载 | sched_rt_avg_update()、scale_rt_capacity()(读取) | |
rq->cfs.runnable_load_avg | rq->cfs(cfs_rq) | 本cpu cfs_rq的runnable平均负载 | __update_load_avg()、weighted_cpuload() -> cfs_rq_runnable_load_avg() | (runnable时间*freq*weight)/LOAD_AVG_MAX |
rq->cfs.avg.load_avg | rq->cfs.avg | 本cpu cfs_rq的runnable平均负载 | __update_load_avg()、cfs_rq_load_avg() | (runnable时间*freq*weight)/LOAD_AVG_MAX |
rq->cfs.avg.loadwop_avg | rq->cfs.avg | 本cpu cfs_rq的runnable平均负载,不含weight | __update_load_avg() | (runnable时间*freq)/LOAD_AVG_MAX |
rq->cfs.avg.util_avg | rq->cfs.avg | 本cpu cfs_rq的running负载 | __update_load_avg()、cpu_util() -> __cpu_util() | (running时间*freq*capacity)/LOAD_AVG_MAX |
cfs_rq->nr_running | cfs_rq | 本cfs_rq这个层次runnable的se的数量 | enqueue_entity()/dequeue_entity() -> account_entity_enqueue() | |
cfs_rq->h_nr_running | cfs_rq | 本cfs_rq包含所有子cfs_rq nr_running的总和 | enqueue_task_fair()/dequeue_task_fair() | |
rq->nr_running | rq | 本cpu rq所有runnable的se的数量,包含所有子cfs_rq | enqueue_task_fair()/dequeue_task_fair() -> add_nr_running() | |
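为了更直观地理解上表中3种负载的区别,下面给一个简化的数值示例(按表中公式粗略估算,忽略PELT几何衰减的细节,数值为假设值):
假设某进程weight=2048(nice 0权重1024的两倍),运行在最高频率、capacity=1024的大核上,统计窗口内50%时间处于runnable、30%时间处于running:
- loadwop_avg ≈ 0.5 * 1024 = 512(不含weight分量);
- load_avg ≈ 0.5 * 2048 = 1024(含weight分量);
- util_avg ≈ 0.3 * 1024 ≈ 307(按running时间和capacity折算)。
如果同样的进程跑在capacity=430的小核上,util_avg会按比例缩小到约0.3 * 430 ≈ 129;而load_avg/loadwop_avg是每cpu归一化的相对值,上限仍然是1024。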
4.1.2.1、rebalance_domains()
mtk定义了3种power模式来兼容EAS:EAS模式(energy_aware())、HMP模式(sched_feat(SCHED_HMP))、hybrid模式(hybrid_support(),EAS、HMP同时共存);
hybrid_support()模式下:一般负载均衡交给EAS;如果cpu_rq(cpu)->rd->overutilized负载已经严重不均衡,交给HMP;
系统在scheduler_tick()中会定期的检测smp负载均衡的时间是否已到,如果到时触发SCHED_SOFTIRQ软中断:
void scheduler_tick(void)
{
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
trigger_load_balance(rq);
#endif
}
|→
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/
void trigger_load_balance(struct rq *rq)
{
/* Don't need to rebalance while attached to NULL domain */
if (unlikely(on_null_domain(rq)))
return;
if (time_after_eq(jiffies, rq->next_balance))
raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ_COMMON
if (nohz_kick_needed(rq))
nohz_balancer_kick();
#endif
}
SCHED_SOFTIRQ软中断的执行主体为run_rebalance_domains:
__init void init_sched_fair_class(void)
{
open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
}
/*
* run_rebalance_domains is triggered when needed from the scheduler tick.
* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
*/
static void run_rebalance_domains(struct softirq_action *h)
{
struct rq *this_rq = this_rq();
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
int this_cpu = smp_processor_id();
/* bypass load balance of HMP if EAS consideration */
if ((!energy_aware() && sched_feat(SCHED_HMP)) ||
(hybrid_support() && cpu_rq(this_cpu)->rd->overutilized))
hmp_force_up_migration(this_cpu);
/*
* If this cpu has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle cpus whose ticks are
* stopped. Do nohz_idle_balance *before* rebalance_domains to
* give the idle cpus a chance to load balance. Else we may
* load balance only within the local sched_domain hierarchy
* and abort nohz_idle_balance altogether if we pull some load.
*/
nohz_idle_balance(this_rq, idle);
rebalance_domains(this_rq, idle);
}
我们分析最核心的函数rebalance_domains():
需要重点提一下的是:负载计算部分计算了3种负载(load_avg、loadwop_avg、util_avg),rebalance_domains()主要使用其中的load_avg,并乘以(SCHED_CAPACITY_SCALE/capacity)加以转换。
- 1、逐级轮询本cpu的sd,判断本sd的时间间隔是否到期,如果到期做load_balance();
| tl层级 | cpu_busy? | sd->balance_interval | sd->busy_factor | sd balance interval |
|---|---|---|---|---|
| MC层级 | idle | 4 | 1 | 4ms |
| MC层级 | busy | 4 | 32 | 128ms |
| DIE层级 | idle | 8 | 1 | 8ms |
| DIE层级 | busy | 8 | 32 | 256ms |
| | | | | rq->next_balance = min(上述值) |
- 2、在load_balance()中,先通过should_we_balance()判断本cpu在本层级sd中是否适合充当dst_cpu:要么是本sg的第一个idle cpu,要么是本sg的第一个cpu。dst_cpu是作为目的cpu,让负载重的cpu把进程迁移过来;如果本cpu不符合条件则中断操作;
- 3、继续find_busiest_group(),在sg链表中找出负载最重的sg,核心计算在update_sd_lb_stats()、update_sg_lb_stats()中。如果dst_cpu所在的local_group负载大于busiest sg,或者大于sds平均负载,中断操作;否则计算需要迁移的负载env->imbalance,约为min((sds->avg - local), (busiest - sds->avg))再按group_capacity折算;
- 3.1、根据当前cpu的idle状态计算cpu load(rq->cpu_load[])时选用的index值:
tl层级 | busy_idx | idle_idx | newidle_idx |
---|---|---|---|
MC层级 | 2 | 0 | 0 |
DIE层级 | 2 | 1 | 0 |
- 3.2、计算sg负载sgs,选择sgs->avg_load最大的sg作为busiest_group。其中几个关键值的计算如下:
负载值 | 计算方法 | 说明 |
---|---|---|
sgs->group_load | += cpu_rq(cpu)->cpu_load[index-1] | 累加cpu的load值,相对值(每个cpu的最大值都是1024),且带weight分量 |
sgs->group_util | += cpu_rq(cpu)->cfs.avg.util_avg | 累加cpu cfs running值,绝对值(不同cluster,只有最大capacity能力的cpu最大值为1024) |
sgs->group_capacity | += (arch_scale_cpu_capacity(sd, cpu)*(1-rt_capacity)) | 累加cpu的capacity,绝对值(不同cluster,只有最大capacity能力的cpu最大值为1024) |
sgs->avg_load | = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity | group_load做了转换,和group_capacity成反比 |
- 3.3、在计算sg负载时,几个关键状态的计算如下:
状态值 | 计算方法 | 说明 |
---|---|---|
sgs->group_no_capacity | (sgs->group_capacity * 100) < (sgs->group_util * env->sd->imbalance_pct) | 预留一定空间(比例为imbalance_pct),sg运算能力已经不够了,sgs->group_type=group_overloaded |
dst_rq->rd->overutilized | (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin) | 预留一定空间(比例为capacity_margin),sg运算能力已经不够了 |
dst_rq->rd->overload | rq->nr_running > 1 | sg中任何一个cpu的runnable进程大于1 |
比例参数imbalance_pct、capacity_margin的值为:
tl层级 | sd->imbalance_pct (/100) | capacity_margin (/1024) |
---|---|---|
MC层级 | 117 | 1280 |
DIE层级 | 125 | 1280 |
- 3.4、计算env->imbalance,这个是rebalance需要迁移的负载量(本列表之后附一个数值示例):
负载值 | 计算方法 | 说明 |
---|---|---|
sds->total_load | += sgs->group_load | |
sds->total_capacity | += sgs->group_capacity | |
sds.avg_load | (SCHED_CAPACITY_SCALE * sds.total_load)/ sds.total_capacity | |
env->imbalance | min((busiest->avg_load - sds->avg_load)*busiest->group_capacity, (sds->avg_load - local->avg_load)*local->group_capacity) / SCHED_CAPACITY_SCALE | 前面乘1024/capacity、这里又乘capacity/1024并不矛盾:avg_load是归一化值,这里再折算回与task_h_load同量纲的绝对load,便于detach_tasks()直接比较 |
- 4、继续find_busiest_queue(),查找busiest sg中负载最重的cpu。
- 4.1、找出sg中weighted_cpuload(cpu)/capacity_of(cpu)值最大(即相对负载最重)的cpu:
负载值 | 计算方法 | 说明 |
---|---|---|
weighted_cpuload(cpu) | cpu_rq(cpu)->cfs.runnable_load_avg | cpu的load值,相对值(每个cpu的最大值都是1024),且带weight分量 |
capacity_of(cpu) | arch_scale_cpu_capacity(sd, cpu)*(1-rt_capacity) | cpu扣除rt占用后的capacity,绝对值(不同cluster,只有最大capacity能力的cpu最大值为1024) |
weighted_cpuload(cpu)/capacity_of(cpu) | 代码中用交叉相乘避免除法:wl*busiest_capacity > busiest_load*capacity | 该比值最大的cpu,就是busiest sg中的busiest cpu(rq) |
- 5、迁移busiest cpu的负载到本地dst cpu上,迁移的负载额度为env->imbalance:detach_tasks() -> attach_tasks();
- 6、处理几种因为进程亲和力(affinity)导致busiest cpu迁移不走足够进程的情况:LBF_DST_PINNED尝试把dst_cpu更改为本cpu同sg的其他cpu;LBF_SOME_PINNED表示当前层级不能完全均衡,让父sd做rebalance的概率增高;LBF_ALL_PINNED表示一个进程都不能迁移,去掉busiest cpu后重新进行load_balance();
- 7、如果经过各种尝试后还是没有一个进程迁移成功,最后尝试一次active_balance;
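下面是第3.2/3.4步计算的一个简化数值示例(数值为假设值,且假设load_above_capacity不起限制作用):
- local sg(dst_cpu所在):group_load=400,group_capacity=512,avg_load = 400*1024/512 = 800;
- busiest sg:group_load=1500,group_capacity=1024,avg_load = 1500*1024/1024 = 1500;
- sds.total_load=1900,total_capacity=1536,sds.avg_load = 1900*1024/1536 ≈ 1266;
- env->imbalance = min((1500-1266)*1024, (1266-800)*512)/1024 = min(239616, 238592)/1024 ≈ 233。
也就是说最多从busiest sg往local sg迁移约233的load,保证两边都不会越过sds的平均负载。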
/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
*
* Balancing parameters are set up in init_sched_domains.
* Balance的参数是在sched_domains初始化时设置的
*/
static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
int continue_balancing = 1;
int cpu = rq->cpu;
unsigned long interval;
struct sched_domain *sd;
/* 默认本cpu rq下一次的balance时间为60s以后 */
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
int need_serialize, need_decay = 0;
u64 max_cost = 0;
/* (1) 更新cpu rq中所有cfs_rq的最新负载 */
update_blocked_averages(cpu);
rcu_read_lock();
/* (2) 对本cpu每个层次的schedule_domain进行扫描 */
for_each_domain(cpu, sd) {
/* (3) 以1HZ的频率对sd->max_newidle_lb_cost进行老化,
老化公式: new = old * (253/256)
*/
/*
* Decay the newidle max times here because this is a regular
* visit to all the domains. Decay ~1% per second.
*/
if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
sd->max_newidle_lb_cost =
(sd->max_newidle_lb_cost * 253) / 256;
sd->next_decay_max_lb_cost = jiffies + HZ;
need_decay = 1;
}
max_cost += sd->max_newidle_lb_cost;
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
#ifndef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT
/* nohz CPU need GTS balance to migrate tasks for more than 2 clusters*/
/* Don't consider GTS balance if hybrid support */
if (hybrid_support()) {
if (sd->child || (!sd->child &&
(rcu_dereference(per_cpu(sd_scs, cpu)) == NULL)))
continue;
}
#endif
/* (4) 如果continue_balancing = 0,指示停止当前层级的load balance
因为sched_group中其他的cpu正在这个层次做load_balance
*/
/*
* Stop the load balance at this level. There is another
* CPU in our sched group which is doing load balancing more
* actively.
*/
if (!continue_balancing) {
if (need_decay)
continue;
break;
}
/* (5) 计算当前层次schedule_domain的balance间隔时间 */
interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
/* (6) 如果需要串行化(SD_SERIALIZE),做balance之前需要持锁 */
need_serialize = sd->flags & SD_SERIALIZE;
if (need_serialize) {
if (!spin_trylock(&balancing))
goto out;
}
/* (7) 如果本sd的balance间隔时间已到,进行实际的load_balance() */
if (time_after_eq(jiffies, sd->last_balance + interval)) {
if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
/*
* The LBF_DST_PINNED logic could have changed
* env->dst_cpu, so we can't know our idle
* state even if we migrated tasks. Update it.
*/
idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
}
sd->last_balance = jiffies;
interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
}
if (need_serialize)
spin_unlock(&balancing);
out:
/* (8) 如果sd的下一次balance时间在rq的下一次balance时间之前,需要更新rq的balance时间
rq的下一次balance时间:next_balance (默认是60s后)
本sd的下一次balance时间:sd->last_balance + interval
rq的下一次balance时间需要选取多个sd中时间最近的一个
*/
if (time_after(next_balance, sd->last_balance + interval)) {
next_balance = sd->last_balance + interval;
update_next_balance = 1;
}
}
if (need_decay) {
/*
* Ensure the rq-wide value also decays but keep it at a
* reasonable floor to avoid funnies with rq->avg_idle.
*/
rq->max_idle_balance_cost =
max((u64)sysctl_sched_migration_cost, max_cost);
}
rcu_read_unlock();
/* (8.1) 更新rq的balance时间 */
/*
* next_balance will be updated only when there is a need.
* When the cpu is attached to null domain for ex, it will not be
* updated.
*/
if (likely(update_next_balance)) {
rq->next_balance = next_balance;
#ifdef CONFIG_NO_HZ_COMMON
/*
* If this CPU has been elected to perform the nohz idle
* balance. Other idle CPUs have already rebalanced with
* nohz_idle_balance() and nohz.next_balance has been
* updated accordingly. This CPU is now running the idle load
* balance for itself and we need to update the
* nohz.next_balance accordingly.
*/
if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
nohz.next_balance = rq->next_balance;
#endif
}
}
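上面第(5)、(7)步反复调用的get_sd_balance_interval(),其计算方式大致如下。这里给出一段简化的示意代码(非内核原文,函数名带_demo后缀;省略了msecs_to_jiffies()转换以及clamp到max_load_balance_interval的步骤),数值与前面"sd balance interval"表格对应:
static unsigned long get_sd_balance_interval_demo(unsigned long balance_interval,
						  unsigned int busy_factor,
						  int cpu_busy)
{
	unsigned long interval = balance_interval;	/* 单位:ms */
	/* cpu busy时按busy_factor放大间隔,降低balance频率 */
	if (cpu_busy)
		interval *= busy_factor;
	return interval;
}
/* 例如MC层级:balance_interval=4、busy_factor=32,
 * idle时interval=4ms,busy时interval=4*32=128ms,与前表一致
 */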
|→
static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *continue_balancing)
{
int ld_moved, cur_ld_moved, active_balance = 0;
struct sched_domain *sd_parent = sd->parent;
struct sched_group *group;
struct rq *busiest;
unsigned long flags;
struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);
/* (7.1) 构造Load_balance需要的数据结构:
.sd = sd, //本cpu在本tl层次的sd
.dst_cpu = this_cpu, // 目的cpu是本cpu
.dst_rq = this_rq, // 目的rq是本cpu的rq
// load_balance的目的是找出负载最重的cpu,并将一部分负载迁移到本cpu上
*/
struct lb_env env = {
.sd = sd,
.dst_cpu = this_cpu,
.dst_rq = this_rq,
.dst_grpmask = sched_group_cpus(sd->groups),
.idle = idle,
.loop_break = sched_nr_migrate_break,
.cpus = cpus,
.fbq_type = all,
.tasks = LIST_HEAD_INIT(env.tasks),
};
/*
* For NEWLY_IDLE load_balancing, we don't need to consider
* other cpus in our group
*/
if (idle == CPU_NEWLY_IDLE)
env.dst_grpmask = NULL;
cpumask_copy(cpus, cpu_active_mask);
schedstat_inc(sd, lb_count[idle]);
redo:
/* (7.2) check当前cpu是否适合作为dst_cpu(即light cpu,需要分担其他cpu的负载) */
if (!should_we_balance(&env)) {
*continue_balancing = 0;
goto out_balanced;
}
/* (7.3) 找出本层级sched_group链表中,负载最重的(busiest)的sched_group */
group = find_busiest_group(&env);
if (!group) {
schedstat_inc(sd, lb_nobusyg[idle]);
goto out_balanced;
}
/* (7.4) 找出busiest sched_group中负载最重的cpu,即其对应的rq */
busiest = find_busiest_queue(&env, group);
if (!busiest) {
schedstat_inc(sd, lb_nobusyq[idle]);
goto out_balanced;
}
BUG_ON(busiest == env.dst_rq);
schedstat_add(sd, lb_imbalance[idle], env.imbalance);
env.src_cpu = busiest->cpu;
env.src_rq = busiest;
ld_moved = 0;
/* (7.5) 判断busiest cpu rq中的runnable进程数 > 1?
至少有进程可以迁移走
*/
if (busiest->nr_running > 1) {
/*
* Attempt to move tasks. If find_busiest_group has found
* an imbalance but busiest->nr_running <= 1, the group is
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
env.flags |= LBF_ALL_PINNED;
env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
more_balance:
raw_spin_lock_irqsave(&busiest->lock, flags);
/* (7.6) 从busiest rq中detach进程,
env->imbalance:需要迁移的负载大小
cur_ld_moved:实际迁移的进程数
*/
/*
* cur_ld_moved - load moved in current iteration
* ld_moved - cumulative load moved across iterations
*/
cur_ld_moved = detach_tasks(&env);
/* (7.7) busiest cpu负载减轻后,
在sched_freq中判断cpu频率是否可以调低
*/
/*
* We want to potentially lower env.src_cpu's OPP.
*/
if (cur_ld_moved)
update_capacity_of(env.src_cpu, SCHE_ONESHOT);
/*
* We've detached some tasks from busiest_rq. Every
* task is masked "TASK_ON_RQ_MIGRATING", so we can safely
* unlock busiest->lock, and we are able to be sure
* that nobody can manipulate the tasks in parallel.
* See task_rq_lock() family for the details.
*/
raw_spin_unlock(&busiest->lock);
/* (7.8) 把detach下来的任务attach到dst_cpu(本cpu)上 */
if (cur_ld_moved) {
attach_tasks(&env);
ld_moved += cur_ld_moved;
}
local_irq_restore(flags);
/* (7.9) LBF_NEED_BREAK设置,说明balance还没有完成,循环只是出来休息一下,
继续重新balance
*/
if (env.flags & LBF_NEED_BREAK) {
env.flags &= ~LBF_NEED_BREAK;
goto more_balance;
}
/* (7.10) 设置了LBF_DST_PINNED标志,并且env.imbalance > 0
说明src_cpu上有些进程因为affinity的原因不能迁移到dst_cpu但是能迁移到同sg的new_dst_cpu上
把dst_cpu更改为new_dst_cpu,重新开始balance流程
*/
/*
* Revisit (affine) tasks on src_cpu that couldn't be moved to
* us and move them to an alternate dst_cpu in our sched_group
* where they can run. The upper limit on how many times we
* iterate on same src_cpu is dependent on number of cpus in our
* sched_group.
*
* This changes load balance semantics a bit on who can move
* load to a given_cpu. In addition to the given_cpu itself
* (or a ilb_cpu acting on its behalf where given_cpu is
* nohz-idle), we now have balance_cpu in a position to move
* load to given_cpu. In rare situations, this may cause
* conflicts (balance_cpu and given_cpu/ilb_cpu deciding
* _independently_ and at _same_ time to move some load to
* given_cpu) causing exceess load to be moved to given_cpu.
* This however should not happen so much in practice and
* moreover subsequent load balance cycles should correct the
* excess load moved.
*/
if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {
/* Prevent to re-select dst_cpu via env's cpus */
cpumask_clear_cpu(env.dst_cpu, env.cpus);
env.dst_rq = cpu_rq(env.new_dst_cpu);
env.dst_cpu = env.new_dst_cpu;
env.flags &= ~LBF_DST_PINNED;
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
/*
* Go back to "more_balance" rather than "redo" since we
* need to continue with same src_cpu.
*/
goto more_balance;
}
/* (7.11) 设置了LBF_SOME_PINNED标志,说明有些进程因为affinity迁移失败,
设置当前sd的parent sd的 sgc->imbalance,让parent sd做rebalance的概率增高
*/
/*
* We failed to reach balance because of affinity.
*/
if (sd_parent) {
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
*group_imbalance = 1;
}
/* (7.12) 如果LBF_ALL_PINNED标志一直被置位,
说明busiest_cpu因为affinity没有一个进程迁移成功,哪怕迁移到dst_cpu同sg的其他cpu也没有一个成功
将busiest cpu从全局cpu mask去掉,重新做整个流程:find_busiest_group -> find_busiest_queue -> detach_tasks -> attach_tasks
*/
/* All tasks on this runqueue were pinned by CPU affinity */
if (unlikely(env.flags & LBF_ALL_PINNED)) {
cpumask_clear_cpu(cpu_of(busiest), cpus);
if (!cpumask_empty(cpus)) {
env.loop = 0;
env.loop_break = sched_nr_migrate_break;
goto redo;
}
goto out_all_pinned;
}
}
/* (7.13) 经过几轮的努力尝试,最终迁移的进程数ld_moved还是0,说明balance失败 */
if (!ld_moved) {
schedstat_inc(sd, lb_failed[idle]);
/*
* Increment the failure counter only on periodic balance.
* We do not want newidle balance, which can be very
* frequent, pollute the failure counter causing
* excessive cache_hot migrations and active balances.
*/
if (idle != CPU_NEWLY_IDLE)
if (env.src_grp_nr_running > 1)
sd->nr_balance_failed++;
/* (7.14) 最后一次尝试迁移一个进程 */
if (need_active_balance(&env)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
/* (7.15) 如果当前cpu不在busiest->curr进程的affinity之内,返回失败 */
/* don't kick the active_load_balance_cpu_stop,
* if the curr task on busiest cpu can't be
* moved to this_cpu
*/
if (!cpumask_test_cpu(this_cpu,
tsk_cpus_allowed(busiest->curr))) {
raw_spin_unlock_irqrestore(&busiest->lock,
flags);
env.flags |= LBF_ALL_PINNED;
goto out_one_pinned;
}
/*
* ->active_balance synchronizes accesses to
* ->active_balance_work. Once set, it's cleared
* only after active load balance is finished.
*/
if (!busiest->active_balance && !cpu_park(cpu_of(busiest))) {
busiest->active_balance = 1; /* load_balance */
busiest->push_cpu = this_cpu;
active_balance = 1;
}
raw_spin_unlock_irqrestore(&busiest->lock, flags);
/* (7.16) 通过stopper把busiest cpu上正在运行的进程(busiest->curr)推到当前cpu(this_cpu)上 */
if (active_balance) {
if (stop_one_cpu_dispatch(cpu_of(busiest),
active_load_balance_cpu_stop, busiest,
&busiest->active_balance_work)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
busiest->active_balance = 0;
active_balance = 0;
raw_spin_unlock_irqrestore(&busiest->lock, flags);
}
}
/*
* We've kicked active balancing, reset the failure
* counter.
*/
sd->nr_balance_failed = sd->cache_nice_tries+1;
}
} else
sd->nr_balance_failed = 0;
if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
} else {
/*
* If we've begun active balancing, start to back off. This
* case may not be covered by the all_pinned logic if there
* is only 1 task on the busy runqueue (because we don't call
* detach_tasks).
*/
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
}
goto out;
out_balanced:
/*
* We reach balance although we may have faced some affinity
* constraints. Clear the imbalance flag if it was set.
*/
if (sd_parent) {
int *group_imbalance = &sd_parent->groups->sgc->imbalance;
if (*group_imbalance)
*group_imbalance = 0;
}
out_all_pinned:
/*
* We reach balance because all tasks are pinned at this level so
* we can't migrate them. Let the imbalance flag set so parent level
* can try to migrate them.
*/
schedstat_inc(sd, lb_balanced[idle]);
sd->nr_balance_failed = 0;
out_one_pinned:
/* tune up the balancing interval */
if (((env.flags & LBF_ALL_PINNED) &&
sd->balance_interval < MAX_PINNED_INTERVAL) ||
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
ld_moved = 0;
out:
return ld_moved;
}
||→
static int should_we_balance(struct lb_env *env)
{
struct sched_group *sg = env->sd->groups;
struct cpumask *sg_cpus, *sg_mask;
int cpu, balance_cpu = -1;
/* (7.2.1) 如果本cpu为CPU_NEWLY_IDLE,直接符合迁移条件 */
/*
* In the newly idle case, we will allow all the cpu's
* to do the newly idle load balance.
*/
if (env->idle == CPU_NEWLY_IDLE)
return 1;
sg_cpus = sched_group_cpus(sg);
sg_mask = sched_group_mask(sg);
/* (7.2.2) 本sched_group的第一个idle cpu适合做load_balance */
/* Try to find first idle cpu */
for_each_cpu_and(cpu, sg_cpus, env->cpus) {
if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
continue;
balance_cpu = cpu;
break;
}
/* (7.2.3) 没有idle cpu,则选取本sched_group的第一个cpu做load_balance */
if (balance_cpu == -1)
balance_cpu = group_balance_cpu(sg);
/* (7.2.4) 不满足上述条件的cpu,不适合来启动load_balance */
/*
* First idle cpu or the first cpu(busiest) in this sched group
* is eligible for doing load balancing at this and above domains.
*/
return balance_cpu == env->dst_cpu;
}
||→
static struct sched_group *find_busiest_group(struct lb_env *env)
{
struct sg_lb_stats *local, *busiest;
struct sd_lb_stats sds;
int local_cpu = 0, busiest_cpu = 0;
struct cpumask *busiest_cpumask;
int same_clus = 0;
init_sd_lb_stats(&sds);
/* (7.3.1) 更新本层级sched_group链表中,每个sched_group的负载,
并选出busiest的一个sched_group
*/
/*
* Compute the various statistics relavent for load balancing at
* this level.
*/
update_sd_lb_stats(env, &sds);
local = &sds.local_stat;
busiest = &sds.busiest_stat;
if (sds.busiest) {
busiest_cpumask = sched_group_cpus(sds.busiest);
local_cpu = env->dst_cpu;
busiest_cpu = group_first_cpu(sds.busiest);
same_clus = is_the_same_domain(local_cpu, busiest_cpu);
mt_sched_printf(sched_lb, "%s: local_cpu=%d, busiest_cpu=%d, busiest_mask=%lu, same_cluster=%d",
__func__, local_cpu, busiest_cpu, busiest_cpumask->bits[0], same_clus);
}
/* (7.3.2) 如果EAS使能,跨cluster的任务迁移使用EAS来做 */
if (energy_aware() && !env->dst_rq->rd->overutilized && !same_clus)
goto out_balanced;
/* (7.3.3) */
/* ASYM feature bypasses nice load balance check */
if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
check_asym_packing(env, &sds))
return sds.busiest;
/* (7.3.4) busiest sg上没有负载,返回空 */
/* There is no busy sibling group to pull tasks from */
if (!sds.busiest || busiest->sum_nr_running == 0) {
if (!sds.busiest)
mt_sched_printf(sched_lb, "[%s] %d: fail no busiest ", __func__, env->src_cpu);
else
mt_sched_printf(sched_lb, "[%s] %d: fail busiest no task ", __func__, env->src_cpu);
goto out_balanced;
}
/* (7.3.5) sg链表里的平均负载 */
sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
/ sds.total_capacity;
/* (7.3.6) 如果busiest sg低一级别的因为cpu affinity没有balance成功,设置了group_imbalanced标志
强制在当前级别上进行balance
*/
/*
* If the busiest group is imbalanced the below checks don't
* work because they assume all things are equal, which typically
* isn't true due to cpus_allowed constraints and the like.
*/
if (busiest->group_type == group_imbalanced)
goto force_balance;
/* (7.3.7) 如果dest cpu/group很闲,busiest负载很重,
强制开展balance
*/
/* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
busiest->group_no_capacity)
goto force_balance;
/* (7.3.8) 如果dest_cpu所在sg的负载都大于busiest sg的负载,
放弃balance
*/
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
*/
if (local->avg_load >= busiest->avg_load)
goto out_balanced;
/* (7.3.9) 如果dest_cpu所在sg的负载都大于sg链表的平均负载,
放弃balance
*/
/*
* Don't pull any tasks if this group is already above the domain
* average load.
*/
if (local->avg_load >= sds.avg_load)
goto out_balanced;
/* (7.3.10) 如果dest_cpu为idle,但是dest_cpu所在的sg idle cpu数量小于busiest sg的idle cpu数量
放弃balance
*/
#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT
if ((env->idle == CPU_IDLE) || (env->idle == CPU_NEWLY_IDLE)) {
int i = (env->idle == CPU_IDLE) ? 1:0;
#else
if (env->idle == CPU_IDLE) {
#endif
/*
* This cpu is idle. If the busiest group is not overloaded
* and there is no imbalance between this and busiest group
* wrt idle cpus, it is balanced. The imbalance becomes
* significant if the diff is greater than 1 otherwise we
* might end up to just move the imbalance on another group
*/
#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT
if ((busiest->group_type != group_overloaded) &&
(local->idle_cpus < (busiest->idle_cpus + i)))
#else
if ((busiest->group_type != group_overloaded) &&
(local->idle_cpus <= (busiest->idle_cpus + 1)))
#endif
goto out_balanced;
} else {
/* (7.3.11) busiest->avg_load大于local->avg_load的比例没有超过env->sd->imbalance_pct
放弃balance
*/
/*
* In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
* imbalance_pct to be conservative.
*/
if (100 * busiest->avg_load <=
env->sd->imbalance_pct * local->avg_load)
goto out_balanced;
}
force_balance:
/* Looks like there is an imbalance. Compute it */
/* (7.3.12) 计算需要迁移的负载值env->imbalance */
calculate_imbalance(env, &sds);
#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT
env->imbalance = env->imbalance * SCHED_CAPACITY_SCALE
/ (sds.busiest->sgc->capacity / cpumask_weight(sched_group_cpus(sds.busiest)));
#endif
return sds.busiest;
out_balanced:
env->imbalance = 0;
return NULL;
}
|||→
static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
struct sched_domain *child = env->sd->child;
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats tmp_sgs;
int load_idx, prefer_sibling = 0;
bool overload = false, overutilized = false;
if (child && child->flags & SD_PREFER_SIBLING)
prefer_sibling = 1;
/* (7.3.1.1) 根据idle情况,选择计算cpu负载时的idx,
idx:是CPU层级负载this_rq->cpu_load[i]数组的index值
*/
load_idx = get_sd_load_idx(env->sd, env->idle);
/* (7.3.1.2) 逐个轮询本层级sched_group链表中的每个sched_group */
do {
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
/* (7.3.1.3) 如果sg是当前cpu所在的sg,则本sg称为local_group
使用专门的数据结构来存储local_group的信息:
sds->local = sg; // 使用sds->local来存储local_group
sgs = &sds->local_stat; // 使用sds->local_stat来存储local_group的统计
*/
local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
if (local_group) {
sds->local = sg;
sgs = &sds->local_stat;
/* (7.3.1.4) 更新local_group的capacity,更新的周期为sd->balance_interval
主要目的是动态减去RT进程消耗的capacity
*/
if (env->idle != CPU_NEWLY_IDLE ||
time_after_eq(jiffies, sg->sgc->next_update))
update_group_capacity(env->sd, env->dst_cpu);
}
/* (7.3.1.5) 更新当前sched_group的负载统计
sgs:sg统计数据放到sgs当中
overload:rq中runnable的进程>1,那么肯定有进程在等待
overutilized:cpu的capacity < util,运算能力不足
*/
update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
&overload, &overutilized);
/* (7.3.1.6) local_group不参与busiest sg的计算 */
if (local_group)
goto next_group;
/* (7.3.1.7) 如果设置了SD_PREFER_SIBLING标志,说明local_group希望其他人迁移任务到它身上,
提高其他sg的迁移优先级
*/
/*
* In case the child domain prefers tasks go to siblings
* first, lower the sg capacity so that we'll try
* and move all the excess tasks away. We lower the capacity
* of a group only if the local group has the capacity to fit
* these excess tasks. The extra check prevents the case where
* you always pull from the heaviest group when it is already
* under-utilized (possible with a large weight task outweighs
* the tasks on the system).
*/
if (prefer_sibling && sds->local &&
group_has_capacity(env, &sds->local_stat) &&
(sgs->sum_nr_running > 1)) {
sgs->group_no_capacity = 1;
sgs->group_type = group_classify(sg, sgs);
}
/* (7.3.1.8) 根据计算的sgs统计数据,找出busiest sg */
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
sds->busiest_stat = *sgs;
}
next_group:
/* (7.3.1.9) 更新sds中的负载、capacity统计 */
/* Now, start updating sd_lb_stats */
sds->total_load += sgs->group_load;
sds->total_capacity += sgs->group_capacity;
sg = sg->next;
} while (sg != env->sd->groups);
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
/* (7.3.1.10) 根据overload、overutilized来更新dst_cpu rq->rd中的对应值。
注意overload/overutilized并不是"最后一个sg"的值:它们在循环外初始化为false,
update_sg_lb_stats()中只会被置true,等效于对本sd内所有cpu求"或"
*/
if (!env->sd->parent) {
/* update overload indicator if we are at root domain */
if (env->dst_rq->rd->overload != overload)
env->dst_rq->rd->overload = overload;
/* Update over-utilization (tipping point, U >= 0) indicator */
if (env->dst_rq->rd->overutilized != overutilized)
env->dst_rq->rd->overutilized = overutilized;
} else {
if (!env->dst_rq->rd->overutilized && overutilized)
env->dst_rq->rd->overutilized = true;
}
}
||||→
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
int local_group, struct sg_lb_stats *sgs,
bool *overload, bool *overutilized)
{
unsigned long load;
int i;
memset(sgs, 0, sizeof(*sgs));
/* (7.3.1.5.1) 遍历sched_group中的每个cpu */
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
struct rq *rq = cpu_rq(i);
/* (7.3.1.5.2) 获取本cpu的负载rq->cpu_load[load_idx-1] */
/* Bias balancing toward cpus of our domain */
if (local_group)
/* 如果是local_group,负载往小的取:min(rq->cpu_load[load_idx-1], weighted_cpuload(cpu)) */
load = target_load(i, load_idx);
else
/* 如果不是local_group,负载往大的取:max(rq->cpu_load[load_idx-1], weighted_cpuload(cpu)) */
load = source_load(i, load_idx);
#ifdef CONFIG_MTK_SCHED_INTEROP
/* (7.3.1.5.3) 因为rq->cpu_load[]只包含cfs的负载,mtk尝试加上rt部分的负载
ooooo但是rq->cpu_capacity中已经减去了rt的部分,这里是否还需要??
*/
load += mt_rt_load(i);
#endif
/* (7.3.1.5.4) 累加sgs各项值:
sgs->group_load // runnable负载带weight分量,来自rq->cpu_load[](源头为cpu_rq(cpu)->cfs.runnable_load_avg)
sgs->group_util // running负载(cpu_rq(cpu)->cfs.avg.util_avg)
sgs->sum_nr_running // rq中cfs所有层次runnable se的总和(rq->cfs.h_nr_running)
sgs->sum_weighted_load // runnable负载带weight分量(weighted_cpuload(),即cpu_rq(cpu)->cfs.runnable_load_avg)
sgs->idle_cpus // idle状态的cpu计数
*/
#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT
sgs->group_load += (load * capacity_orig_of(i)) >> SCHED_CAPACITY_SHIFT;
#else
sgs->group_load += load;
#endif
sgs->group_util += cpu_util(i);
sgs->sum_nr_running += rq->cfs.h_nr_running;
/* (7.3.1.5.5) 如果rq中的进程数量>1,说明有进程处于runnable等待状态,
overload = true
*/
if (rq->nr_running > 1)
*overload = true;
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
sgs->sum_weighted_load += weighted_cpuload(i);
if (idle_cpu(i))
sgs->idle_cpus++;
/* (7.3.1.5.6) cpu的capacity小于cpu的running状态负载,
overutilized = true
*/
if (cpu_overutilized(i))
*overutilized = true;
}
/* (7.3.1.5.7) 更新汇总后sgs的统计数据:
sgs->group_capacity // sgs所有cpu capacity的累加
sgs->avg_load // 按照group_capacity,等比例放大group_load负载,capacity越小avg_load越大
sgs->load_per_task // sgs的平均每个进程的weight负载
sgs->group_weight // sgs的online cpu个数
sgs->group_no_capacity // sgs的capacity已经不够用,赶不上util
sgs->group_type // 严重级别 group_overloaded > group_imbalanced > group_other
// group_imbalanced: 下一等级的load_balance因为cpu_affinity的原因没有完成
*/
/* Adjust by relative CPU capacity of the group */
sgs->group_capacity = group->sgc->capacity;
sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;
if (sgs->sum_nr_running)
sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
sgs->group_weight = group->group_weight;
sgs->group_no_capacity = group_is_overloaded(env, sgs);
sgs->group_type = group_classify(group, sgs);
}
||||→
static bool update_sd_pick_busiest(struct lb_env *env,
struct sd_lb_stats *sds,
struct sched_group *sg,
struct sg_lb_stats *sgs)
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT
if (sgs->sum_nr_running == 0) {
mt_sched_printf(sched_lb_info, "[%s] sgs->sum_nr_running=%d",
__func__, sgs->sum_nr_running);
return false;
}
#endif
/* (7.3.1.9.1) 如果新的sgs group_type大于旧的busiest sgs,
新的sgs更busy
*/
if (sgs->group_type > busiest->group_type)
return true;
/* (7.3.1.9.2) 如果新的sgs group_type小于旧的busiest sgs,
旧的sgs更busy
*/
if (sgs->group_type < busiest->group_type)
return false;
/* (7.3.1.9.3) 在group_type相同的情况下,比较sgs->avg_load
sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / sgs->group_capacity,
其中group_load由各cpu的rq->cpu_load[load_idx-1]累加而来
*/
if (sgs->avg_load <= busiest->avg_load)
return false;
/* (7.3.1.9.4) 如果SD_ASYM_PACKING标志没有置位,
在group_type相同的情况下,sgs->avg_load值较大的为busiest sg
*/
/* This is the busiest node in its class. */
if (!(env->sd->flags & SD_ASYM_PACKING))
return true;
/* (7.3.1.9.5) ASYM_PACKING的意思是会把负载移到最低序号的cpu上,
如果sg的first cpu序号 > dst_cpu,则标记该sg为busiest;
多个sg的first cpu序号都 > dst_cpu时,选first cpu序号更小的sg
*/
/*
* ASYM_PACKING needs to move all the work to the lowest
* numbered CPUs in the group, therefore mark all groups
* higher than ourself as busy.
*/
if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
if (!sds->busiest)
return true;
if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
return true;
}
/* (7.3.1.9.6) 设置了ASYM_PACKING,且sg的first cpu序号 <= dst_cpu,
返回false
*/
return false;
}
|||→
static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
unsigned long max_pull, load_above_capacity = ~0UL;
struct sg_lb_stats *local, *busiest;
/* (7.3.12.1) local sgs和busiest sgs */
local = &sds->local_stat;
busiest = &sds->busiest_stat;
if (busiest->group_type == group_imbalanced) {
/*
* In the group_imb case we cannot rely on group-wide averages
* to ensure cpu-load equilibrium, look at wider averages. XXX
*/
busiest->load_per_task =
min(busiest->load_per_task, sds->avg_load);
}
/* (7.3.12.2) */
/*
* In the presence of smp nice balancing, certain scenarios can have
* max load less than avg load(as we skip the groups at or below
* its cpu_capacity, while calculating max_load..)
*/
if (busiest->avg_load <= sds->avg_load ||
local->avg_load >= sds->avg_load) {
env->imbalance = 0;
return fix_small_imbalance(env, sds);
}
/* (7.3.12.3) */
/*
* If there aren't any idle cpus, avoid creating some.
*/
if (busiest->group_type == group_overloaded &&
local->group_type == group_overloaded) {
load_above_capacity = busiest->sum_nr_running *
SCHED_LOAD_SCALE;
if (load_above_capacity > busiest->group_capacity)
load_above_capacity -= busiest->group_capacity;
else
load_above_capacity = ~0UL;
}
/* (7.3.12.4) env->imbalance的值等于min((sds->avg - local), (busiest - sds->avg))
在local和sds平均值,busiest和sds平均值,两个差值之间选择最小值
*/
/*
* We're trying to get all the cpus to the average_load, so we don't
* want to push ourselves above the average load, nor do we wish to
* reduce the max loaded cpu below the average load. At the same time,
* we also don't want to reduce the group load below the group capacity
* (so that we can implement power-savings policies etc). Thus we look
* for the minimum possible imbalance.
*/
max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);
/* How much load to actually move to equalise the imbalance */
env->imbalance = min(
max_pull * busiest->group_capacity,
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
/*
* if *imbalance is less than the average load per runnable task
* there is no guarantee that any tasks will be moved so we'll have
* a think about bumping its value to force at least one task to be
* moved
*/
if (env->imbalance < busiest->load_per_task)
return fix_small_imbalance(env, sds);
}
||→
static struct rq *find_busiest_queue(struct lb_env *env,
struct sched_group *group)
{
struct rq *busiest = NULL, *rq;
unsigned long busiest_load = 0, busiest_capacity = 1;
int i;
/* (7.4.1) 逐个遍历sg中的cpu */
for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
unsigned long capacity, wl;
enum fbq_type rt;
rq = cpu_rq(i);
rt = fbq_classify_rq(rq);
/*
* We classify groups/runqueues into three groups:
* - regular: there are !numa tasks
* - remote: there are numa tasks that run on the 'wrong' node
* - all: there is no distinction
*
* In order to avoid migrating ideally placed numa tasks,
* ignore those when there's better options.
*
* If we ignore the actual busiest queue to migrate another
* task, the next balance pass can still reduce the busiest
* queue by moving tasks around inside the node.
*
* If we cannot move enough load due to this classification
* the next pass will adjust the group classification and
* allow migration of more tasks.
*
* Both cases only affect the total convergence complexity.
*/
if (rt > env->fbq_type)
continue;
/* (7.4.2) 计算出cpu的capacity和weight_load */
capacity = capacity_of(i);
wl = weighted_cpuload(i);
#ifdef CONFIG_MTK_SCHED_INTEROP
wl += mt_rt_load(i);
#endif
/*
* When comparing with imbalance, use weighted_cpuload()
* which is not scaled with the cpu capacity.
*/
if (rq->nr_running == 1 && wl > env->imbalance &&
!check_cpu_capacity(rq, env->sd))
continue;
/* (7.4.3) 选出相对负载最重的cpu */
/*
* For the load comparisons with the other cpu's, consider
* the weighted_cpuload() scaled with the cpu capacity, so
* that the load can be moved away from the cpu that is
* potentially running at a lower capacity.
*
* Thus we're looking for max(wl_i / capacity_i), crosswise
* multiplication to rid ourselves of the division works out
* to: wl_i * capacity_j > wl_j * capacity_i; where j is
* our previous maximum.
*/
if (wl * busiest_capacity > busiest_load * capacity) {
busiest_load = wl;
busiest_capacity = capacity;
busiest = rq;
}
}
return busiest;
}
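对代码注释中的"crosswise multiplication"补充一个数值例子(数值为假设值):要比较wl_i/capacity_i的大小又不想做除法,可以交叉相乘。例如cpu0:wl=600、capacity=430;cpu1:wl=900、capacity=1024。比较600/430(≈1.40)和900/1024(≈0.88),等价于比较600*1024=614400和900*430=387000,前者更大,所以相对负载更重的cpu0被选为busiest rq。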
||→
static int detach_tasks(struct lb_env *env)
{
struct list_head *tasks = &env->src_rq->cfs_tasks;
struct task_struct *p;
unsigned long load;
int detached = 0;
lockdep_assert_held(&env->src_rq->lock);
if (env->imbalance <= 0)
return 0;
/* (7.6.1) 遍历busiest rq中的进程 */
while (!list_empty(tasks)) {
/* (7.6.2) 如果本次是idle/newidle balance(dst cpu为idle),且busiest rq上只剩最后一个进程,就不再迁移,避免把busiest cpu掏空 */
/*
* We don't want to steal all, otherwise we may be treated likewise,
* which could at worst lead to a livelock crash.
*/
if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
break;
p = list_first_entry(tasks, struct task_struct, se.group_node);
/* (7.6.3) 遍历任务最多不超过sysctl_sched_nr_migrate(32) */
env->loop++;
/* We've more or less seen every task there is, call it quits */
if (env->loop > env->loop_max)
break;
/* (7.6.4) 每sched_nr_migrate_break个任务遍历需要跳出休息一下,
如果没有达到env->loop_max,后面会重来
*/
/* take a breather every nr_migrate tasks */
if (env->loop > env->loop_break) {
env->loop_break += sched_nr_migrate_break;
env->flags |= LBF_NEED_BREAK;
break;
}
/* (7.6.5) 判断任务是否支持迁移? */
if (!can_migrate_task(p, env))
goto next;
/* (7.6.6) 获取p进程相对顶层cfs_rq的负载,
根据负载判断进程是否适合迁移
*/
load = task_h_load(p);
if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
goto next;
if ((load / 2) > env->imbalance)
goto next;
/* (7.6.7) detach 进程 */
detach_task(p, env);
list_add(&p->se.group_node, &env->tasks);
detached++;
env->imbalance -= load;
#ifdef CONFIG_PREEMPT
/*
* NEWIDLE balancing is a source of latency, so preemptible
* kernels will stop after the first task is detached to minimize
* the critical section.
*/
if (env->idle == CPU_NEWLY_IDLE)
break;
#endif
/*
* We only want to steal up to the prescribed amount of
* weighted load.
*/
if (env->imbalance <= 0)
break;
continue;
next:
list_move_tail(&p->se.group_node, tasks);
}
/*
* Right now, this is one of only two places we collect this stat
* so we can safely collect detach_one_task() stats here rather
* than inside detach_one_task().
*/
schedstat_add(env->sd, lb_gained[env->idle], detached);
return detached;
}
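对上面(7.6.6)中"(load / 2) > env->imbalance"的判断,补充一个数值例子(数值为假设值):假设env->imbalance=300,某进程task_h_load=700,则700/2=350 > 300,该进程被跳过,避免一次迁移过量;而task_h_load=500的进程(500/2=250 <= 300)会被迁移,迁移后env->imbalance = 300-500 = -200 <= 0,detach循环随即结束。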
|||→
static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
int tsk_cache_hot;
lockdep_assert_held(&env->src_rq->lock);
/*
* We do not migrate tasks that are:
* 1) throttled_lb_pair, or
* 2) cannot be migrated to this CPU due to cpus_allowed, or
* 3) running (obviously), or
* 4) are cache-hot on their current CPU.
*/
/* (7.6.5.1) 如果达到bandwith限制,返回失败 */
if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
return 0;
/* (7.6.5.2) 如果p进程的cpu affinity不允许迁移到dst_cpu,进一步处理 */
if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
int cpu;
schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
/* (7.6.5.3) LBF_SOME_PINNED标志,记录有些进程迁移失败 */
env->flags |= LBF_SOME_PINNED;
/* (7.6.5.5) 如果已经有其他的LBF_DST_PINNED动作,直接返回失败 */
/*
* Remember if this task can be migrated to any other cpu in
* our sched_group. We may want to revisit it if we couldn't
* meet load balance goals by pulling other tasks on src_cpu.
*
* Also avoid computing new_dst_cpu if we have already computed
* one in current iteration.
*/
if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
return 0;
/* (7.6.5.4) 如果dst_cpu同一sched_group中的其他cpu符合p的affinity,尝试更改dst_cpu,
设置LBF_DST_PINNED标志
*/
/* Prevent to re-select dst_cpu via env's cpus */
for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
env->flags |= LBF_DST_PINNED;
env->new_dst_cpu = cpu;
break;
}
}
return 0;
}
/* (7.6.5.6) 有任何符合affinity条件的p,清除LBF_ALL_PINNED标志 */
/* Record that we found atleast one task that could run on dst_cpu */
env->flags &= ~LBF_ALL_PINNED;
/* (7.6.5.7) 如果p在running状态,返回失败 */
if (task_running(env->src_rq, p)) {
schedstat_inc(p, se.statistics.nr_failed_migrations_running);
return 0;
}
/* (7.6.5.8) NUMA 相关的一些判断 */
/*
* Aggressive migration if:
* 1) destination numa is preferred
* 2) task is cache cold, or
* 3) too many balance attempts have failed.
*/
tsk_cache_hot = migrate_degrades_locality(p, env);
if (tsk_cache_hot == -1)
tsk_cache_hot = task_hot(p, env);
if (tsk_cache_hot <= 0 ||
env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
if (tsk_cache_hot == 1) {
schedstat_inc(env->sd, lb_hot_gained[env->idle]);
schedstat_inc(p, se.statistics.nr_forced_migrations);
}
return 1;
}
schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
return 0;
}
|||→
static unsigned long task_h_load(struct task_struct *p)
{
struct cfs_rq *cfs_rq = task_cfs_rq(p);
update_cfs_rq_h_load(cfs_rq);
/* (7.6.6.1) task_h_load的目的是在task_group使能时,rq中有多个层次的cfs_rq
如果进程p挂载在底层的cfs_rq中,把p的负载转换成顶层cfs_rq的相对负载
*/
return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
cfs_rq_load_avg(cfs_rq) + 1);
}
static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
u64 now = sched_clock_cpu(cpu_of(rq));
unsigned long load;
/* sched: change to jiffies */
now = now * HZ >> 30;
if (cfs_rq->last_h_load_update == now)
return;
/* 从底层cfs_rq到顶层cfs_rq建立起层次关系 */
cfs_rq->h_load_next = NULL;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
cfs_rq->h_load_next = se;
if (cfs_rq->last_h_load_update == now)
break;
}
if (!se) {
cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
cfs_rq->last_h_load_update = now;
}
/* 使用建立的关系,从顶层cfs_rq开始计算每个层次cfs_rq的相对顶层负载h_load */
while ((se = cfs_rq->h_load_next) != NULL) {
load = cfs_rq->h_load;
load = div64_ul(load * se->avg.load_avg,
cfs_rq_load_avg(cfs_rq) + 1);
cfs_rq = group_cfs_rq(se);
cfs_rq->h_load = load;
cfs_rq->last_h_load_update = now;
}
}
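对task_h_load()的折算补充一个两层task_group的数值示例(数值为假设值):进程p挂在子cfs_rq上,p->se.avg.load_avg=400,子cfs_rq的load_avg总和为800;代表该子cfs_rq的group se在顶层cfs_rq中的load_avg=300,顶层cfs_rq的load_avg=1200。则顶层h_load=1200,子cfs_rq->h_load ≈ 1200*300/(1200+1) ≈ 300,task_h_load(p) ≈ 400*300/(800+1) ≈ 150。即p虽然在自己的cfs_rq内占了一半的负载(400/800),但该group se在顶层只占300,折算到顶层后p的相对负载约为150。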
4.1.2.2、nohz_idle_balance()
每个cpu的负载均衡是在本cpu的tick任务scheduler_tick()中判断执行的,如果cpu进入了nohz模式scheduler_tick()被stop,那么本cpu没有机会去做rebalance_domains()。为了解决这个问题,系统设计了nohz_idle_balance(),在运行的cpu上判断进入nohz的cpu是否需要rebalance load,如果需要选择一个idle cpu来帮所有的nohz idle cpu做负载均衡。
在rebalance_domains()函数之前有一个nohz_idle_balance(),这是系统在条件满足的情况下让一个idle cpu做idle负载均衡。主要的原理如下:
- 1、cpu在进入nohz idle状态时,设置标志:
tick_nohz_idle_enter() -> set_cpu_sd_state_idle():
↓
void set_cpu_sd_state_idle(void)
{
struct sched_domain *sd;
int cpu = smp_processor_id();
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (!sd || sd->nohz_idle)
goto unlock;
/* (1.1) 进入nohz idle,设置sd->nohz_idle标志 */
sd->nohz_idle = 1;
/* (1.2) 减少sgc->nr_busy_cpus的计数 */
atomic_dec(&sd->groups->sgc->nr_busy_cpus);
unlock:
rcu_read_unlock();
}
tick_nohz_idle_enter() -> __tick_nohz_idle_enter() -> tick_nohz_stop_sched_tick() -> nohz_balance_enter_idle():
↓
void nohz_balance_enter_idle(int cpu)
{
/*
* If this cpu is going down, then nothing needs to be done.
*/
if (!cpu_active(cpu))
return;
if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
return;
/*
* If we're a completely isolated CPU, we don't play.
*/
if (on_null_domain(cpu_rq(cpu)))
return;
/* (2.1) 进入idle状态,设置nohz.idle_cpus_mask中对应的bit */
cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
/* (2.2) 进入idle状态,增加nohz.nr_cpus计数 */
atomic_inc(&nohz.nr_cpus);
/* (2.3) 设置cpu_rq(cpu)->nohz_flags中的NOHZ_TICK_STOPPED标志 */
set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
- 2、在trigger_load_balance()中判断,当前是否需要触发idle load balance:
void trigger_load_balance(struct rq *rq)
{
/* (1) 判断当前是否需要idle load balance */
if (nohz_kick_needed(rq))
/* (2) 选中一个idle cpu去做idle load balance */
nohz_balancer_kick();
}
|→
/*
* Current heuristic for kicking the idle load balancer in the presence
* of an idle cpu in the system.
* - This rq has more than one task.
* - This rq has at least one CFS task and the capacity of the CPU is
* significantly reduced because of RT tasks or IRQs.
* - At parent of LLC scheduler domain level, this cpu's scheduler group has
* multiple busy cpu.
* - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
* domain span are idle.
*/
static inline bool nohz_kick_needed(struct rq *rq)
{
unsigned long now = jiffies;
struct sched_domain *sd;
struct sched_group_capacity *sgc;
int nr_busy, cpu = rq->cpu;
bool kick = false;
/* (1.1) 如果当前cpu为idle状态,失败退出 */
if (unlikely(rq->idle_balance))
return false;
/* (1.2) 退出nohz状态:set_cpu_sd_state_busy()、nohz_balance_exit_idle(cpu)
是set_cpu_sd_state_idle()、nohz_balance_enter_idle()的反向操作
*/
/*
* We may be recently in ticked or tickless idle mode. At the first
* busy tick after returning from idle, we will update the busy stats.
*/
set_cpu_sd_state_busy();
nohz_balance_exit_idle(cpu);
/* (1.3) 如果进入nohz idle状态的cpu数量为0,失败退出 */
/*
* None are in tickless mode and hence no need for NOHZ idle load
* balancing.
*/
if (likely(!atomic_read(&nohz.nr_cpus)))
return false;
/* (1.4) nohz balance时间未到,失败退出 */
if (time_before(now, nohz.next_balance))
return false;
#if !defined(CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT) && defined(CONFIG_HMP)
/* for more than two clusters, still need wakup nohz CPUs and force balancing */
/*
* Bail out if there are no nohz CPUs in our
* HMP domain, since we will move tasks between
* domains through wakeup and force balancing
* as necessary based upon task load.
*/
if (sched_feat(SCHED_HMP) && cpumask_first_and(nohz.idle_cpus_mask,
&((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids)
return false;
#endif
/* (1.5) 当前cpu的进程>=2,返回成功 */
if (rq->nr_running >= 2 &&
(!energy_aware() || cpu_overutilized(cpu)))
return true;
/* (1.6) sd所在sg的nr_busy_cpus>1,返回成功 */
rcu_read_lock();
sd = rcu_dereference(per_cpu(sd_busy, cpu));
if (sd && !energy_aware()) {
sgc = sd->groups->sgc;
nr_busy = atomic_read(&sgc->nr_busy_cpus);
if (nr_busy > 1) {
kick = true;
goto unlock;
}
}
/* (1.7) 如果cfs层次上runnable的se个数>=1,且cpu的capacity被rt/irq明显占用(check_cpu_capacity),返回成功 */
sd = rcu_dereference(rq->sd);
if (sd) {
if ((rq->cfs.h_nr_running >= 1) &&
check_cpu_capacity(rq, sd)) {
kick = true;
goto unlock;
}
}
/* (1.8) 在sd_asym层级,如果span[]中第一个idle cpu的序号 < 本cpu序号,返回成功 */
sd = rcu_dereference(per_cpu(sd_asym, cpu));
if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
sched_domain_span(sd)) < cpu)) {
kick = true;
goto unlock;
}
unlock:
rcu_read_unlock();
return kick;
}
|→
static void nohz_balancer_kick(void)
{
int ilb_cpu;
nohz.next_balance++;
/* (2.1) 找到所有idle cpu中的第一个idle cpu */
ilb_cpu = find_new_ilb();
if (ilb_cpu >= nr_cpu_ids)
return;
/* (2.2) 给ilb_cpu的cpu_rq(cpu)->nohz_flags设置NOHZ_BALANCE_KICK标志位 */
if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
return;
/* (2.3) 使用ipi中断来唤醒ilb_cpu执行idle load balance */
/*
* Use smp_send_reschedule() instead of resched_cpu().
* This way we generate a sched IPI on the target cpu which
* is idle. And the softirq performing nohz idle load balance
* will be run before returning from the IPI.
*/
smp_send_reschedule(ilb_cpu);
return;
}
/* (2.3.1) ilb_cpu被唤醒后处理IPI_RESCHEDULE,
会触发一个SCHED_SOFTIRQ软中断来启动run_rebalance_domains()
*/
void handle_IPI(int ipinr, struct pt_regs *regs)
{
unsigned int cpu = smp_processor_id();
struct pt_regs *old_regs = set_irq_regs(regs);
if ((unsigned)ipinr < NR_IPI) {
trace_ipi_entry_rcuidle(ipi_types[ipinr]);
__inc_irq_stat(cpu, ipi_irqs[ipinr]);
}
switch (ipinr) {
case IPI_RESCHEDULE:
scheduler_ipi();
break;
}
↓
void scheduler_ipi(void)
{
/*
* Check if someone kicked us for doing the nohz idle load balance.
*/
if (unlikely(got_nohz_idle_kick())) {
this_rq()->idle_balance = 1;
raise_softirq_irqoff(SCHED_SOFTIRQ);
}
}
- 3、被选中的ilb_cpu被唤醒后,需要帮其他所有idle cpu完成rebalance_domains()工作:
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
int this_cpu = this_rq->cpu;
struct rq *rq;
int balance_cpu;
/* Earliest time when we have to do rebalance again */
unsigned long next_balance = jiffies + 60*HZ;
int update_next_balance = 0;
/* (1) 判断当前cpu是不是被选中被唤醒的ilb_cpu */
if (idle != CPU_IDLE ||
!test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
goto end;
/* (2) 轮询所有进入nohz状态的cpu */
for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
/* (3) 只服务非本cpu,且还是idle状态的cpu
ooooo本cpu也是idle状态,不需对本cpu做idle负载均衡?
ooooo给其他idle cpu的rq做了负载均衡后,什么时候唤醒其他idle cpu?
*/
if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
continue;
/* (4) 如果本cpu被设置了resched标志,说明有线程被唤醒,退出idle状态 */
/*
* If this cpu gets work to do, stop the load balancing
* work being done for other cpus. Next load
* balancing owner will pick it up.
*/
if (need_resched())
break;
/* (5) 需要做负载均衡的idle cpu(balance_cpu)对应的rq */
rq = cpu_rq(balance_cpu);
/* (6) 如果balance_cpu的rq->next_balance时间已到,替其做rebalance_domains() */
/*
* If time for next balance is due,
* do the balance.
*/
if (time_after_eq(jiffies, rq->next_balance)) {
raw_spin_lock_irq(&rq->lock);
update_rq_clock(rq);
/* (7) 更新idle cpu因为idle造成的负载衰减 */
update_idle_cpu_load(rq);
raw_spin_unlock_irq(&rq->lock);
/* (8) 对balance_cpu做负载均衡
ooooo做完负载均衡,什么时候唤醒balance_cpu??
*/
rebalance_domains(rq, CPU_IDLE);
}
if (time_after(next_balance, rq->next_balance)) {
next_balance = rq->next_balance;
update_next_balance = 1;
}
}
/* (9) 根据所有进入nohz idle cpu rq的最近的一次到期时间,更新nohz.next_balance */
/*
* next_balance will be updated only when there is a need.
* When the CPU is attached to null domain for ex, it will not be
* updated.
*/
if (likely(update_next_balance))
nohz.next_balance = next_balance;
end:
clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
4.1.2.3、select_task_rq_fair()
除了scheduler_tick()的时候会做负载均衡,另外一个时刻也会做负载均衡。就是fork新进程、wakeup休眠进程时,系统会根据负载均衡挑选一个最合适的cpu给进程运行,其核心函数就是select_task_rq_fair():
- 1、首先是使用EAS的方法来select_cpu,在EAS使能且没有overutilized时使用EAS方法:
需要重点提一下的是:负载计算部分计算了3种负载(load_avg、loadwop_avg、util_avg),EAS主要使用其中的util_avg,和capacity一起计算。
- 1.1、EAS遍历cluster和cpu,在能用最小capacity容纳下进程p负载util的cluster中,找到一个既满足进程p的affinity、又剩余capacity最多的cpu作为target_cpu;
首先找到能容纳进程p的util且capacity最小的cluster;
然后在目标cluster中找到加上进程p以后,剩余capacity最大的cpu;
prev_cpu(进程p上一次运行的cpu)作为src_cpu,上面选择的target_cpu作为dst_cpu,接着尝试计算进程p从prev_cpu迁移到target_cpu后系统的功耗差异。
- 1.2、计算负载变化前后,target_cpu和prev_cpu带来的power变化。如果没有power增加则返回target_cpu,如果有power增加则返回prev_cpu;
计算负载变化的函数energy_diff()循环很多比较复杂,仔细分析下来就是计算target_cpu/prev_cpu在“MC层次cpu所在sg链表”+“DIE层级cpu所在sg”,这两种范围在负载变化中的功耗差异:
energy_diff()的计算方法如下(本列表之后附一个数值示例):
负载值 | 计算方法 | 说明 |
---|---|---|
idle_idx | min(rq->idle_state_idx) | sg多个cpu中,idle_state_idx最小值 |
eenv->cap_idx | find_new_capacity() | 在负载变化后,根据sg多个cpu中的最大util值,匹配的cpu freq档位sg->sge->cap_states[eenv->cap_idx].cap |
group_util | += (__cpu_util << SCHED_CAPACITY_SHIFT)/sg->sge->cap_states[eenv->cap_idx].cap | 累加sg中cpu的util值,并且把util转换成capacity的反比 |
sg_busy_energy | (group_util * sg->sge->busy_power(group_first_cpu(sg), eenv, (sd->child) ? 1 : 0)) >> SCHED_CAPACITY_SHIFT | 使用group_util计算busy部分消耗的功耗 |
sg_idle_energy | ((SCHED_LOAD_SCALE - group_util) * sg->sge->idle_power(idle_idx, group_first_cpu(sg), eenv, (sd->child) ? 1 : 0)) >> SCHED_CAPACITY_SHIFT | 使用(SCHED_LOAD_SCALE - group_util)计算idle部分计算的功耗 |
total_energy | sg_busy_energy + sg_idle_energy | 单个sg的功耗,累计所有相关sg的功耗,总的差异就是进程P迁移以后的功耗差异 |
- 2、如果EAS不适用,使用传统的负载均衡方法来select_cpu:
- 2.1、find_idlest_group() -> find_idlest_cpu(),找出最合适的target_cpu;
- 2.2、否则退而求其次,用select_idle_sibling()找一个idle cpu作为target_cpu;
- 2.3、确定target_cpu后,继续使用hmp_select_task_rq_fair()来判断是否需要进行hmp迁移;
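按上表的公式给单个sg补充一个数值示例(busy_power/idle_power为假设值):设负载变化后某sg折算出的group_util=512,对应档位的busy_power=20000、idle_power=5000,则:
- sg_busy_energy = (512 * 20000) >> 10 = 10000;
- sg_idle_energy = ((1024 - 512) * 5000) >> 10 = 2500;
- total_energy = 12500。
对target_cpu和prev_cpu两种放置方案分别累加所有相关sg的total_energy,其差值就是energy_diff()的结果;只有在不增加功耗时才选择target_cpu,否则回退到prev_cpu。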
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
int new_cpu = prev_cpu; /* 默认new_cpu为prev_cpu */
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
int policy = 0;
#ifdef CONFIG_MTK_SCHED_VIP_TASKS
/* mtk: If task is VIP task, prefer most efficiency idle cpu */
if (is_vip_task(p)) {
int vip_idle_cpu;
vip_idle_cpu = find_idle_vip_cpu(p);
if (vip_idle_cpu >= 0)
return vip_idle_cpu;
}
#endif
/* (1) 优先使用EAS计算target cpu,
mtk 对EAS定义了3种模式:EAS模式(energy_aware())、HMP模式(sched_feat(SCHED_HMP))、hybrid_support(EAS、HMP同时共存);
hybrid_support()模式下:一般负载均衡交给EAS;如果cpu_rq(cpu)->rd->overutilized负载已经严重不均衡,交给HMP;
*/
/*
* Consider EAS if only EAS enabled, but HMP
* if hybrid enabled and system is over-utilized.
*/
if ((energy_aware() && !hybrid_support()) ||
(hybrid_support() && !cpu_rq(cpu)->rd->overutilized))
goto CONSIDER_EAS;
/* (2) 非EAS情况,fork使用hmp balance */
/* HMP fork balance:
* always put non-kernel forking tasks on a big domain
*/
if (sched_feat(SCHED_HMP) && p->mm && (sd_flag & SD_BALANCE_FORK)) {
new_cpu = hmp_fork_balance(p, prev_cpu);
/* to recover new_cpu value if something wrong */
if (new_cpu >= nr_cpu_ids)
new_cpu = prev_cpu;
else {
#ifdef CONFIG_MTK_SCHED_TRACERS
trace_sched_select_task_rq(p, (LB_FORK | new_cpu), prev_cpu, new_cpu);
#endif
return new_cpu;
}
}
CONSIDER_EAS:
/* (3) 如果唤醒flag中设置了SD_BALANCE_WAKE,优先使用唤醒cpu来运行进程p,
还需判断下面3个条件是否满足:
!wake_wide(p) // 当前cpu的唤醒次数没有超标
task_fits_max(p, cpu) // 当前cpu的capacity能容纳进程p的util
cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) // 当前cpu在进程在P的affinity中
EAS利用了want_affine这个标志,只要EAS使能,want_affine =1
*/
if (sd_flag & SD_BALANCE_WAKE)
want_affine = (!wake_wide(p) && task_fits_max(p, cpu) &&
cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) ||
energy_aware();
rcu_read_lock();
/* (4) 从下往上遍历当前cpu的sd,查询在哪个层次的sd进行负载均衡 */
for_each_domain(cpu, tmp) {
/* (4.1) 如果当前sd不支持负载均衡(SD_LOAD_BALANCE标志),退出 */
if (!(tmp->flags & SD_LOAD_BALANCE))
break;
/* (4.2) 优先找affine_sd,找到直接break;
需要符合以下3个条件:
want_affine // 前面计算出的want_affine为真
(tmp->flags & SD_WAKE_AFFINE) // 当前sd支持SD_WAKE_AFFINE标志
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)) // 当前sd->span[]中同时包含cpu、prev_cpu
*/
/*
* If both cpu and prev_cpu are part of this domain,
* cpu is a valid SD_WAKE_AFFINE target.
*/
if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
affine_sd = tmp;
break;
}
/* (4.3) 其次找一个符合sd_flag的sd */
if (tmp->flags & sd_flag)
sd = tmp;
/* (4.4) 如果以上都失败,直接跳出 */
else if (!want_affine)
break;
}
/* (5) 如果affine_sd成功找到
*/
if (affine_sd) {
sd = NULL; /* Prefer wake_affine over balance flags */
if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
new_cpu = cpu;
}
/* (6) 没有找到符合sd_flag的sd */
if (!sd) {
/* (6.1) EAS使能,且本cpu没有overutilized,
使用EAS负载均衡算法
*/
if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) {
new_cpu = energy_aware_wake_cpu(p, prev_cpu);
policy |= LB_EAS;
}
/* (6.2) 如果不能使用EAS,且sd_flag中设置SD_BALANCE_WAKE标志
尝试在唤醒的cpu上运行p进程,
ooooo前面辛苦计算的affine_sd没有派上用场?
*/
else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
if (true) {
#ifdef CONFIG_CGROUP_SCHEDTUNE
bool prefer_idle = schedtune_prefer_idle(p) > 0;
#else
bool prefer_idle = true;
#endif
int idle_cpu;
idle_cpu = find_best_idle_cpu(p, prefer_idle);
if (idle_cpu >= 0) {
new_cpu = idle_cpu;
policy |= LB_IDLEST;
} else {
new_cpu = select_max_spare_capacity_cpu(p, new_cpu);
policy |= LB_SPARE;
}
} else
/* (6.3) 不符合上述条件下的默认处理,尝试找一个idle cpu */
new_cpu = select_idle_sibling(p, new_cpu);
}
} else while (sd) {
/* (7) 找到符合sd_flag的sd */
struct sched_group *group;
int weight;
policy |= LB_SMP;
/* (7.1) */
if (!(sd->flags & sd_flag)) {
sd = sd->child;
continue;
}
/* (7.2) */
group = find_idlest_group(sd, p, cpu, sd_flag);
if (!group) {
sd = sd->child;
continue;
}
/* (7.3) */
new_cpu = find_idlest_cpu(group, p, cpu);
if (new_cpu == -1 || new_cpu == cpu) {
/* Now try balancing at a lower domain level of cpu */
sd = sd->child;
continue;
}
/* (7.4) */
/* Now try balancing at a lower domain level of new_cpu */
cpu = new_cpu;
weight = sd->span_weight;
sd = NULL;
for_each_domain(cpu, tmp) {
if (weight <= tmp->span_weight)
break;
if (tmp->flags & sd_flag)
sd = tmp;
}
/* while loop will break here if sd == NULL */
}
#ifdef CONFIG_MTK_SCHED_TRACERS
policy |= (new_cpu << LB_SMP_SHIFT);
#endif
rcu_read_unlock();
/* (8) 在EAS不能运行的情况下,再做一次HMP的select操作:
判断进程p是否符合hmp的迁移条件,如果符合就一次迁移到位,避免后续hmp的反复操作
*/
/* Consider hmp if no EAS or over-utiled in hybrid mode. */
if ((!energy_aware() && sched_feat(SCHED_HMP)) ||
(hybrid_support() && cpu_rq(cpu)->rd->overutilized)) {
new_cpu = hmp_select_task_rq_fair(sd_flag, p, prev_cpu, new_cpu);
#ifdef CONFIG_MTK_SCHED_TRACERS
policy |= (new_cpu << LB_HMP_SHIFT);
#endif
policy |= LB_HMP;
}
#ifdef CONFIG_MTK_SCHED_TRACERS
trace_sched_select_task_rq(p, policy, prev_cpu, new_cpu);
#endif
return new_cpu;
}
|→
inline int hmp_fork_balance(struct task_struct *p, int prev_cpu)
{
int new_cpu = prev_cpu;
int cpu = smp_processor_id();
/* (2.1) prev_cpu所在cluster是最快(fastest)的 */
if (hmp_cpu_is_fastest(prev_cpu)) {
/* prev_cpu is fastest domain */
struct hmp_domain *hmpdom;
__always_unused int lowest_ratio;
hmpdom = list_entry(
&hmp_cpu_domain(prev_cpu)->hmp_domains,
struct hmp_domain, hmp_domains);
/* (2.2) 尝试选出负载最小的cpu */
lowest_ratio = hmp_domain_min_load(hmpdom, &new_cpu);
if (new_cpu < nr_cpu_ids && cpumask_test_cpu(new_cpu, tsk_cpus_allowed(p)))
return new_cpu;
new_cpu = cpumask_any_and(&hmp_faster_domain(cpu)->cpus,
tsk_cpus_allowed(p));
if (new_cpu < nr_cpu_ids)
return new_cpu;
} else {
/* (2.3) 尝试选出prev_cpu所在cluster中负载最小的cpu */
/* prev_cpu is not fastest domain */
new_cpu = hmp_select_faster_cpu(p, prev_cpu);
if (new_cpu < nr_cpu_ids)
return new_cpu;
}
return new_cpu;
}
|→
static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
s64 this_load, load;
s64 this_eff_load, prev_eff_load;
int idx, this_cpu, prev_cpu;
struct task_group *tg;
unsigned long weight;
int balanced;
idx = sd->wake_idx;
this_cpu = smp_processor_id();
prev_cpu = task_cpu(p);
load = source_load(prev_cpu, idx);
this_load = target_load(this_cpu, idx);
/* (5.1) */
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
* of the current CPU:
*/
if (sync) {
tg = task_group(current);
weight = current->se.avg.load_avg;
this_load += effective_load(tg, this_cpu, -weight, -weight);
load += effective_load(tg, prev_cpu, 0, -weight);
}
tg = task_group(p);
weight = p->se.avg.load_avg;
/*
* In low-load situations, where prev_cpu is idle and this_cpu is idle
* due to the sync cause above having dropped this_load to 0, we'll
* always have an imbalance, but there's really nothing you can do
* about that, so that's good too.
*
* Otherwise check if either cpus are near enough in load to allow this
* task to be woken on this_cpu.
*/
this_eff_load = 100;
this_eff_load *= capacity_of(prev_cpu);
prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
prev_eff_load *= capacity_of(this_cpu);
if (this_load > 0) {
this_eff_load *= this_load +
effective_load(tg, this_cpu, weight, weight);
prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
}
balanced = this_eff_load <= prev_eff_load;
schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
if (!balanced)
return 0;
schedstat_inc(sd, ttwu_move_affine);
schedstat_inc(p, se.statistics.nr_wakeups_affine);
return 1;
}
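wake_affine()的核心是比较this_eff_load和prev_eff_load两个加权负载。下面用一个简化的数值例子演示这个比较,忽略effective_load()对task group的修正,负载、capacity、imbalance_pct数值均为假设值:
#include <stdio.h>

/* 演示wake_affine()中this_eff_load/prev_eff_load的比较方式(省略effective_load修正) */
int main(void)
{
	long this_load = 300, prev_load = 500;   /* target_load(this)/source_load(prev),假设值 */
	long cap_this = 1024, cap_prev = 430;    /* capacity_of(),假设值 */
	long imbalance_pct = 117;                /* sd->imbalance_pct,假设值 */

	long this_eff_load = 100 * cap_prev;
	long prev_eff_load = (100 + (imbalance_pct - 100) / 2) * cap_this;

	if (this_load > 0) {
		this_eff_load *= this_load;      /* 内核中这里还会加上effective_load()的修正 */
		prev_eff_load *= prev_load;
	}

	/* this_eff_load <= prev_eff_load 时认为唤醒到this_cpu是"balanced"的 */
	printf("this_eff_load=%ld prev_eff_load=%ld balanced=%d\n",
	       this_eff_load, prev_eff_load, this_eff_load <= prev_eff_load);
	return 0;
}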
|→
static int energy_aware_wake_cpu(struct task_struct *p, int target)
{
int target_max_cap = INT_MAX;
int target_cpu = task_cpu(p);
unsigned long min_util;
unsigned long new_util;
int i, cpu;
bool is_tiny = false;
int nrg_diff = 0;
int cluster_id = 0;
struct cpumask cluster_cpus;
int max_cap_cpu = 0;
int best_cpu = 0;
/* (6.1.1) 遍历cluster,找出capacity最小且能容纳下进程p(task_fits_max)的cluster,以其cpu作为best_cpu */
/*
* Find group with sufficient capacity. We only get here if no cpu is
* overutilized. We may end up overutilizing a cpu by adding the task,
* but that should not be any worse than select_idle_sibling().
* load_balance() should sort it out later as we get above the tipping
* point.
*/
cluster_id = arch_get_nr_clusters();
for (i = 0; i < cluster_id; i++) {
arch_get_cluster_cpus(&cluster_cpus, i);
max_cap_cpu = cpumask_first(&cluster_cpus);
/* Assuming all cpus are the same in group */
for_each_cpu(cpu, &cluster_cpus) {
if (!cpu_online(cpu))
continue;
if (capacity_of(max_cap_cpu) < target_max_cap &&
task_fits_max(p, max_cap_cpu)) {
best_cpu = cpu;
target_max_cap = capacity_of(max_cap_cpu);
}
break;
}
}
if (task_util(p) < TINY_TASK_THRESHOLD)
is_tiny = true;
/* Find cpu with sufficient capacity */
min_util = boosted_task_util(p);
if (!is_tiny)
/* (6.1.2) 根据best_cpu所在的cluster和进程p的affinity,
找出加上util(p)以后,剩余capacity最大的cpu:target_cpu
*/
target_cpu = select_max_spare_capacity_cpu(p, best_cpu);
else
/* (6.1.3) 根据cluster和进程p的affinity,
找出加上util(p)以后,当前freq的capacity能满足的第一个cpu:target_cpu
*/
for_each_cpu_and(i, tsk_cpus_allowed(p), &cluster_cpus) {
if (!cpu_online(i))
continue;
/*
* p's blocked utilization is still accounted for on prev_cpu
* so prev_cpu will receive a negative bias due to the double
* accounting. However, the blocked utilization may be zero.
*/
new_util = cpu_util(i) + task_util(p);
/*
* Ensure minimum capacity to grant the required boost.
* The target CPU can be already at a capacity level higher
* than the one required to boost the task.
*/
new_util = max(min_util, new_util);
#ifdef CONFIG_MTK_SCHED_INTEROP
if (cpu_rq(i)->rt.rt_nr_running && likely(!is_rt_throttle(i)))
continue;
#endif
if (new_util > capacity_orig_of(i))
continue;
if (new_util < capacity_curr_of(i)) {
target_cpu = i;
if (cpu_rq(i)->nr_running)
break;
}
/* cpu has capacity at higher OPP, keep it as fallback */
if (target_cpu == task_cpu(p))
target_cpu = i;
}
/* (6.1.4) 如果prev_cpu和target_cpu在同一个cluster,不做energy比较直接返回 */
/* no need energy calculation if the same domain */
if (is_the_same_domain(task_cpu(p), target_cpu))
return target_cpu;
/* no energy comparison if the same cluster */
if (target_cpu != task_cpu(p)) {
/* (6.1.5) 构造需要迁移的环境变量 */
struct energy_env eenv = {
.util_delta = task_util(p),
.src_cpu = task_cpu(p),
.dst_cpu = target_cpu,
.task = p,
};
/* Not enough spare capacity on previous cpu */
if (cpu_overutilized(task_cpu(p))) {
trace_energy_aware_wake_cpu(p, task_cpu(p), target_cpu,
(int)task_util(p), nrg_diff, true, is_tiny);
return target_cpu;
}
/* (6.1.6) 计算进程p从prev_cpu迁移到target_cpu后的功耗差值nrg_diff,
如果功耗增加,nrg_diff >= 0,返回prev_cpu即task_cpu(p),
如果功耗减少,返回新的target_cpu
*/
nrg_diff = energy_diff(&eenv);
if (nrg_diff >= 0) {
trace_energy_aware_wake_cpu(p, task_cpu(p), target_cpu,
(int)task_util(p), nrg_diff, false, is_tiny);
return task_cpu(p);
}
}
trace_energy_aware_wake_cpu(p, task_cpu(p), target_cpu, (int)task_util(p), nrg_diff, false, is_tiny);
return target_cpu;
}
||→
static inline int
energy_diff(struct energy_env *eenv)
{
unsigned int boost;
int nrg_delta;
/* Compute "absolute" energy diff */
__energy_diff(eenv);
/* Return energy diff when boost margin is 0 */
#ifdef CONFIG_CGROUP_SCHEDTUNE
boost = schedtune_task_boost(eenv->task);
#else
boost = get_sysctl_sched_cfs_boost();
#endif
if (boost == 0)
return eenv->nrg.diff;
/* Compute normalized energy diff */
nrg_delta = normalize_energy(eenv->nrg.diff);
eenv->nrg.delta = nrg_delta;
eenv->payoff = schedtune_accept_deltas(
eenv->nrg.delta,
eenv->cap.delta,
eenv->task);
/*
* When SchedTune is enabled, the energy_diff() function will return
* the computed energy payoff value. Since the energy_diff() return
* value is expected to be negative by its callers, this evaluation
* function return a negative value each time the evaluation return a
* positive payoff, which is the condition for the acceptance of
* a scheduling decision
*/
return -eenv->payoff;
}
static int __energy_diff(struct energy_env *eenv)
{
struct sched_domain *sd;
struct sched_group *sg;
int sd_cpu = -1, energy_before = 0, energy_after = 0;
/* (6.1.6.1) 构造迁移前的环境变量 */
struct energy_env eenv_before = {
.util_delta = 0,
.src_cpu = eenv->src_cpu,
.dst_cpu = eenv->dst_cpu,
.nrg = { 0, 0, 0, 0},
.cap = { 0, 0, 0 },
};
#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT
int i;
#endif
if (eenv->src_cpu == eenv->dst_cpu)
return 0;
#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT
/* To get max opp index of every cluster for power estimation of share buck */
for (i = 0; i < arch_get_nr_clusters(); i++) {
/* for energy before */
eenv_before.opp_idx[i] = mtk_cluster_capacity_idx(i, &eenv_before);
/* for energy after */
eenv->opp_idx[i] = mtk_cluster_capacity_idx(i, eenv);
mt_sched_printf(sched_eas_energy_calc, "cid=%d, before max_opp:%d, after max_opp:%d\n",
i, eenv_before.opp_idx[i], eenv->opp_idx[i]);
}
#endif
/* (6.1.6.2) sd来自于per_cpu缓存变量sd_ea,是cpu对应的顶层sd(tl DIE层) */
sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
if (!sd)
return 0; /* Error */
mt_sched_printf(sched_eas_energy_calc, "0. %s: move task from src=%d to dst=%d util=%d",
__func__, eenv->src_cpu, eenv->dst_cpu, eenv->util_delta);
sg = sd->groups;
/* (6.1.6.3) 遍历sg所在sg链表,找到符合条件的sg,
累加计算eenv_before、eenv相关sg的功耗
*/
do {
/* (6.1.6.4) 如果当前sg包含src_cpu或者dst_cpu,计算 */
if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
/* (6.1.6.5) 当前顶层sg为eenv的sg_top */
eenv_before.sg_top = eenv->sg_top = sg;
mt_sched_printf(sched_eas_energy_calc, "1. %s: src=%d dst=%d mask=0x%lx (before)",
__func__, eenv_before.src_cpu, eenv_before.dst_cpu, sg->cpumask[0]);
/* (6.1.6.6) 计算eenv_before负载下sg的power */
if (sched_group_energy(&eenv_before))
return 0; /* Invalid result abort */
energy_before += eenv_before.energy;
/* Keep track of SRC cpu (before) capacity */
eenv->cap.before = eenv_before.cap.before;
eenv->cap.delta = eenv_before.cap.delta;
mt_sched_printf(sched_eas_energy_calc, "2. %s: src=%d dst=%d mask=0x%lx (after)",
__func__, eenv->src_cpu, eenv->dst_cpu, sg->cpumask[0]);
/* (6.1.6.7) 计算eenv负载下sg的power */
if (sched_group_energy(eenv))
return 0; /* Invalid result abort */
energy_after += eenv->energy;
}
} while (sg = sg->next, sg != sd->groups);
/* (6.1.6.8) 计算energy_after - energy_before */
eenv->nrg.before = energy_before;
eenv->nrg.after = energy_after;
eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
eenv->payoff = 0;
trace_sched_energy_diff(eenv->task,
eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
eenv->cap.before, eenv->cap.after, eenv->cap.delta,
eenv->nrg.delta, eenv->payoff);
mt_sched_printf(sched_eas_energy_calc, "5. %s: nrg.diff=%d cap.delta=%d",
__func__, eenv->nrg.diff, eenv->cap.delta);
return eenv->nrg.diff;
}
|||→
static int sched_group_energy(struct energy_env *eenv)
{
struct sched_domain *sd;
int cpu, total_energy = 0;
struct cpumask visit_cpus;
struct sched_group *sg;
#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT
int only_lv1_sd = 0;
#endif
WARN_ON(!eenv->sg_top->sge);
cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));
/* (6.1.6.6.1) 根据sg_top顶层sd,找到需要计算的cpu集合visit_cpus,逐个遍历其中每一个cpu
ooooo这一套复杂的循环算法计算下来,其实就计算了几个power,以cpu0-cpu3为例:
4个底层sg的power + 1个顶层sg的power
*/
while (!cpumask_empty(&visit_cpus)) {
struct sched_group *sg_shared_cap = NULL;
/* (6.1.6.6.2) 选取visit_cpus中的第一个cpu */
cpu = cpumask_first(&visit_cpus);
sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
if (!sd) {
/* a corner racing with hotplug? sd doesn't exist in this cpu. */
return -EINVAL;
}
/*
* Is the group utilization affected by cpus outside this
* sched_group?
*/
sd = rcu_dereference(per_cpu(sd_scs, cpu));
#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT
/* Try to handle one CPU in this cluster by hotplug.
* In it there is only lv-1 sched_domain exist which having
* no share_cap_states.
*/
if (!sd) {
sd = rcu_dereference(per_cpu(sd_ea, cpu));
only_lv1_sd = 1;
}
#endif
if (!sd) {
/*
* We most probably raced with hotplug; returning a
* wrong energy estimation is better than entering an
* infinite loop.
*/
return -EINVAL;
}
if (sd->parent)
sg_shared_cap = sd->parent->groups;
/* (6.1.6.6.3) 从底层到顶层逐个遍历cpu所在的sd */
for_each_domain(cpu, sd) {
sg = sd->groups;
/* (6.1.6.6.4) 如果是顶层sd,只会计算一个sg */
/* Has this sched_domain already been visited? */
if (sd->child && group_first_cpu(sg) != cpu)
break;
/* (6.1.6.6.5) 逐个遍历该层次sg链表所在sg */
do {
unsigned long group_util;
int sg_busy_energy, sg_idle_energy;
int cap_idx, idle_idx;
if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
eenv->sg_cap = sg_shared_cap;
else
eenv->sg_cap = sg;
/* (6.1.6.6.6) 根据eenv指示的负载变化,找出满足该sg中最大负载cpu的capacity_index */
cap_idx = find_new_capacity(eenv, sg->sge);
if (sg->group_weight == 1) {
/* Remove capacity of src CPU (before task move) */
if (eenv->util_delta == 0 &&
cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
eenv->cap.delta -= eenv->cap.before;
}
/* Add capacity of dst CPU (after task move) */
if (eenv->util_delta != 0 &&
cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
eenv->cap.delta += eenv->cap.after;
}
}
/* (6.1.6.6.7) 找出sg所有cpu中最小的idle index */
idle_idx = group_idle_state(sg);
/* (6.1.6.6.8) 累加sg中所有cpu的相对负载,
最大负载为sg->sge->cap_states[eenv->cap_idx].cap
*/
group_util = group_norm_util(eenv, sg);
/* (6.1.6.6.9) 计算power = busy_power + idle_power */
#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT
/*
* To support power estimation for MTK soc.
* Consider share buck for dynamic power and SPARK/MCDI for static power.
*/
sg_busy_energy = (group_util *
sg->sge->busy_power(group_first_cpu(sg), eenv, (sd->child) ? 1 : 0))
>> SCHED_CAPACITY_SHIFT;
sg_idle_energy = ((SCHED_LOAD_SCALE - group_util) *
sg->sge->idle_power(idle_idx, group_first_cpu(sg), eenv, (sd->child) ? 1 : 0))
>> SCHED_CAPACITY_SHIFT;
#else
/* Power value had been separated to static + dynamic here */
sg_busy_energy = (group_util * (sg->sge->cap_states[cap_idx].dyn_pwr +
sg->sge->cap_states[cap_idx].lkg_pwr[sg->sge->lkg_idx]))
>> SCHED_CAPACITY_SHIFT;
sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) *
sg->sge->idle_states[idle_idx].power)
>> SCHED_CAPACITY_SHIFT;
#endif
total_energy += sg_busy_energy + sg_idle_energy;
mt_sched_printf(sched_eas_energy_calc, "busy_energy=%d idle_eneryg=%d (cost=%d)",
sg_busy_energy, sg_idle_energy, total_energy);
/* (6.1.6.6.10) 如果遍历了底层sd,从visit_cpus中去掉对应的sg cpu */
if (!sd->child)
cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT
/*
* We try to get correct energy estimation while racing with hotplug
* and avoid entering a infinite loop.
*/
if (only_lv1_sd) {
eenv->energy = total_energy;
return 0;
}
#endif
if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
goto next_cpu;
} while (sg = sg->next, sg != sd->groups);
}
/* (6.1.6.6.11) 如果遍历了cpu的底层到顶层sd,从visit_cpus中去掉对应的cpu */
next_cpu:
cpumask_clear_cpu(cpu, &visit_cpus);
continue;
}
eenv->energy = total_energy;
return 0;
}
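sched_group_energy()里单个sg的能量估算公式可以概括为:busy_energy = group_util * busy_power >> 10,idle_energy = (1024 - group_util) * idle_power >> 10。下面按非MTK分支(dyn_pwr + lkg_pwr / idle_states[].power)写一个简化示例,能量模型数值均为假设值:
#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10
#define SCHED_LOAD_SCALE     (1 << SCHED_CAPACITY_SHIFT)   /* 1024 */

/* 演示单个sched_group的能量估算:busy_energy + idle_energy,
 * dyn_pwr/lkg_pwr/idle_pwr对应sge->cap_states/idle_states中的能量模型,数值为假设 */
static int demo_sg_energy(int group_util, int dyn_pwr, int lkg_pwr, int idle_pwr)
{
	int busy = (group_util * (dyn_pwr + lkg_pwr)) >> SCHED_CAPACITY_SHIFT;
	int idle = ((SCHED_LOAD_SCALE - group_util) * idle_pwr) >> SCHED_CAPACITY_SHIFT;
	return busy + idle;
}

int main(void)
{
	/* group_util=512表示该sg相对其capacity一半busy */
	printf("energy=%d\n", demo_sg_energy(512, 300, 60, 10));
	return 0;
}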
|→
static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
int this_cpu, int sd_flag)
{
struct sched_group *idlest = NULL, *group = sd->groups;
struct sched_group *fit_group = NULL;
unsigned long min_load = ULONG_MAX, this_load = 0;
unsigned long fit_capacity = ULONG_MAX;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
/* (7.2.1) 选择load_idx */
if (sd_flag & SD_BALANCE_WAKE)
load_idx = sd->wake_idx;
/* (7.2.2) 当前cpu所在sd层次的sg,遍历sg所在的sg链表,选出负载最轻的idlest sg */
do {
unsigned long load, avg_load;
int local_group;
int i;
/* (7.2.3) 略过不符合p进程affinity的sg */
/* Skip over this group if it has no CPUs allowed */
if (!cpumask_intersects(sched_group_cpus(group),
tsk_cpus_allowed(p)))
continue;
/* (7.2.4) local_group等于本cpu所在的sg */
local_group = cpumask_test_cpu(this_cpu,
sched_group_cpus(group));
/* Tally up the load of all CPUs in the group */
avg_load = 0;
/* (7.2.5) 遍历sg中的所有cpu,累加负载 */
for_each_cpu(i, sched_group_cpus(group)) {
/* Bias balancing toward cpus of our domain */
if (local_group)
load = source_load(i, load_idx);
else
load = target_load(i, load_idx);
#ifdef CONFIG_MTK_SCHED_INTEROP
load += mt_rt_load(i);
#endif
avg_load += load;
/* (7.2.6) 如果EAS使能,找到能最小满足进程p的capacity sg */
/*
* Look for the most energy-efficient group that can fit the task.
*/
if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {
fit_capacity = capacity_of(i);
fit_group = group;
}
}
/* (7.2.7) 用累计的负载计算相对负载 */
/* Adjust by relative CPU capacity of the group */
avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;
/* (7.2.8) 计算idlest sg */
if (local_group) {
this_load = avg_load;
} else if (avg_load < min_load) {
min_load = avg_load;
idlest = group;
}
} while (group = group->next, group != sd->groups);
/* (7.2.9) EAS使能,返回fit_group */
if (energy_aware() && fit_group)
return fit_group;
/* (7.2.10) 如果idlest sg的负载没有明显低于本地sg(低出imbalance比例),则返回NULL,留在本地sg */
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
/* (7.2.11) 否则,返回idlest */
return idlest;
}
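find_idlest_group()中有两个关键计算:按sg的capacity把累加负载归一化(avg_load * SCHED_CAPACITY_SCALE / sgc->capacity),以及用imbalance_pct判断idlest sg是否明显比本地sg空闲。下面用假设的数值做一个简化演示:
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024

/* 演示find_idlest_group()的相对负载换算和imbalance判断,数值均为假设值 */
int main(void)
{
	unsigned long local_raw = 800, local_cap = 1740;   /* 本地sg累加负载/capacity */
	unsigned long other_raw = 300, other_cap = 880;    /* 另一个sg */
	unsigned long imbalance_pct = 125;
	unsigned long imbalance = 100 + (imbalance_pct - 100) / 2;   /* 112 */

	unsigned long this_load = local_raw * SCHED_CAPACITY_SCALE / local_cap;
	unsigned long min_load  = other_raw * SCHED_CAPACITY_SCALE / other_cap;

	/* 只有idlest sg负载明显低于本地sg时才返回idlest,否则返回NULL留在本地 */
	if (100 * this_load < imbalance * min_load)
		printf("stay local: this=%lu idlest=%lu\n", this_load, min_load);
	else
		printf("use idlest: this=%lu idlest=%lu\n", this_load, min_load);
	return 0;
}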
|→
static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
unsigned long load, min_load = ULONG_MAX;
unsigned int min_exit_latency = UINT_MAX;
u64 latest_idle_timestamp = 0;
int least_loaded_cpu = this_cpu;
int shallowest_idle_cpu = -1;
int i;
/* (7.3.1) 遍历sg中符合p进程affinity的cpu */
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
/* (7.3.2) 如果cpu的剩余capacity能容纳下p进程的load */
if (task_fits_spare(p, i)) {
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
/* (7.3.2.1) 优先选出idle状态,且退出idle开销最小的cpu */
if (idle && idle->exit_latency < min_exit_latency) {
/*
* We give priority to a CPU whose idle state
* has the smallest exit latency irrespective
* of any idle timestamp.
*/
min_exit_latency = idle->exit_latency;
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
} else if (idle_cpu(i) &&
(!idle || idle->exit_latency == min_exit_latency) &&
rq->idle_stamp > latest_idle_timestamp) {
/*
* If equal or no active idle state, then
* the most recently idled CPU might have
* a warmer cache.
*/
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
} else if (shallowest_idle_cpu == -1) {
/*
* If we haven't found an idle CPU yet
* pick a non-idle one that can fit the task as
* fallback.
*/
shallowest_idle_cpu = i;
}
/* (7.3.3) cpu的剩余capacity容纳不下进程p,选出负载最轻的cpu */
} else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(i);
#ifdef CONFIG_MTK_SCHED_INTEROP
load += mt_rt_load(i);
#endif
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
least_loaded_cpu = i;
}
}
}
return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}
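find_idlest_cpu()的选择优先级是:能容纳进程且idle、退出延迟最小的cpu > 同等退出延迟下最近进入idle的cpu > 负载最轻的cpu。下面是一个省略了task_fits_spare等判断的用户态简化示意,cpu数据均为假设值:
#include <stdio.h>
#include <limits.h>

/* 演示find_idlest_cpu()的优先级:退出延迟最小的idle cpu > 最近进入idle的cpu > 负载最轻的cpu */
struct demo_cpu {
	int idle;                  /* 是否idle */
	unsigned int exit_latency; /* idle state退出延迟 */
	unsigned long long idle_stamp;
	unsigned long load;        /* weighted_cpuload */
};

int main(void)
{
	struct demo_cpu cpus[3] = {
		{ .idle = 0, .load = 500 },
		{ .idle = 1, .exit_latency = 100, .idle_stamp = 900 },
		{ .idle = 1, .exit_latency = 20,  .idle_stamp = 500 },
	};
	unsigned int min_exit_latency = UINT_MAX;
	unsigned long long latest_idle_timestamp = 0;
	unsigned long min_load = ULONG_MAX;
	int shallowest_idle_cpu = -1, least_loaded_cpu = 0;

	for (int i = 0; i < 3; i++) {
		if (cpus[i].idle && cpus[i].exit_latency < min_exit_latency) {
			min_exit_latency = cpus[i].exit_latency;
			latest_idle_timestamp = cpus[i].idle_stamp;
			shallowest_idle_cpu = i;
		} else if (cpus[i].idle && cpus[i].exit_latency == min_exit_latency &&
			   cpus[i].idle_stamp > latest_idle_timestamp) {
			latest_idle_timestamp = cpus[i].idle_stamp;
			shallowest_idle_cpu = i;
		} else if (shallowest_idle_cpu == -1 && cpus[i].load < min_load) {
			min_load = cpus[i].load;
			least_loaded_cpu = i;
		}
	}
	printf("pick cpu%d\n",
	       shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu);
	return 0;
}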
|→
static int hmp_select_task_rq_fair(int sd_flag, struct task_struct *p,
int prev_cpu, int new_cpu)
{
struct list_head *pos;
struct sched_entity *se = &p->se;
struct cpumask fast_cpu_mask, slow_cpu_mask;
#ifdef CONFIG_HMP_TRACER
int cpu = 0;
for_each_online_cpu(cpu)
trace_sched_cfs_runnable_load(cpu, cfs_load(cpu), cfs_length(cpu));
#endif
/* error handling */
if (prev_cpu >= num_possible_cpus())
return new_cpu;
/*
* Skip all the checks if only one CPU is online.
* Otherwise, select the most appropriate CPU from cluster.
*/
if (num_online_cpus() == 1)
goto out;
/* (8.1) fastest hmp_domain只有一个,更慢的hmp_domain可能有多个;
以fastest domain作为fast_cpu_mask,依次把每个更慢的domain作为slow_cpu_mask,
逐个调用hmp_select_task_migration()判断进程p是否满足hmp迁移条件
*/
cpumask_clear(&fast_cpu_mask);
cpumask_clear(&slow_cpu_mask);
/* order: fast to slow hmp domain */
list_for_each(pos, &hmp_domains) {
struct hmp_domain *domain = list_entry(pos, struct hmp_domain, hmp_domains);
if (!cpumask_empty(&domain->cpus)) {
if (cpumask_empty(&fast_cpu_mask)) {
cpumask_copy(&fast_cpu_mask, &domain->possible_cpus);
} else {
cpumask_copy(&slow_cpu_mask, &domain->possible_cpus);
new_cpu = hmp_select_task_migration(sd_flag, p,
prev_cpu, new_cpu, &fast_cpu_mask, &slow_cpu_mask);
}
}
}
out:
/* it happens when num_online_cpus=1 */
if (new_cpu >= nr_cpu_ids) {
/* BUG_ON(1); */
new_cpu = prev_cpu;
}
cfs_nr_pending(new_cpu)++;
cfs_pending_load(new_cpu) += se_load(se);
return new_cpu;
}
||→
static int hmp_select_task_migration(int sd_flag, struct task_struct *p, int prev_cpu, int new_cpu,
struct cpumask *fast_cpu_mask, struct cpumask *slow_cpu_mask)
{
int step = 0;
struct sched_entity *se = &p->se;
int B_target = num_possible_cpus();
int L_target = num_possible_cpus();
struct clb_env clbenv;
/* (8.1.1) 找出fast_cpu_mask中负载最轻的cpu B_target,且符合p进程的affinity */
B_target = hmp_select_cpu(HMP_SELECT_RQ, p, fast_cpu_mask, prev_cpu, 0);
/* (8.1.2) 找出slow_cpu_mask中负载最轻的cpu L_target,且符合p进程的affinity */
L_target = hmp_select_cpu(HMP_SELECT_RQ, p, slow_cpu_mask, prev_cpu, 1);
/*
* Only one cluster exists or only one cluster is allowed for this task
* Case 1: return the runqueue whose load is minimum
* Case 2: return original CFS runqueue selection result
*/
if (B_target >= num_possible_cpus() && L_target >= num_possible_cpus())
goto out;
if (B_target >= num_possible_cpus())
goto select_slow;
if (L_target >= num_possible_cpus())
goto select_fast;
/*
* Two clusters exist and both clusters are allowed for this task
* Step 1: Move newly created task to the cpu where no tasks are running
* Step 2: Migrate heavy-load task to big
* Step 3: Migrate light-load task to LITTLE
* Step 4: Make sure the task stays in its previous hmp domain
*/
step = 1;
if (task_created(sd_flag) && !task_low_priority(p->prio)) {
if (!rq_length(B_target))
goto select_fast;
if (!rq_length(L_target))
goto select_slow;
}
/* (8.1.3) 计算如果L_target和B_target发生hmp迁移,各种负载和thershold的计算 */
memset(&clbenv, 0, sizeof(clbenv));
clbenv.flags |= HMP_SELECT_RQ;
cpumask_copy(&clbenv.lcpus, slow_cpu_mask);
cpumask_copy(&clbenv.bcpus, fast_cpu_mask);
clbenv.ltarget = L_target;
clbenv.btarget = B_target;
sched_update_clbstats(&clbenv);
/* (8.1.4) 判断进程p从L_target up到 B_target的可行性 */
step = 2;
if (hmp_up_migration(L_target, &B_target, se, &clbenv))
goto select_fast;
/* (8.1.5) 判断进程p从B_target down到 L_target的可行性 */
step = 3;
if (hmp_down_migration(B_target, &L_target, se, &clbenv))
goto select_slow;
/* (8.1.6) 如果prev_cpu是slowest */
step = 4;
if (hmp_cpu_is_slowest(prev_cpu))
goto select_slow;
goto select_fast;
/* (8.1.7) 返回 B_target */
select_fast:
new_cpu = B_target;
cpumask_clear(slow_cpu_mask);
goto out;
/* (8.1.8) 返回 L_target */
select_slow:
new_cpu = L_target;
cpumask_copy(fast_cpu_mask, slow_cpu_mask);
cpumask_clear(slow_cpu_mask);
goto out;
out:
#ifdef CONFIG_HMP_TRACER
trace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg);
#endif
return new_cpu;
}
4.2、HMP负载均衡
除了SMP load_balance()负载均衡以外,我们还希望在多个SMP cluster之间能遵守一种规则:heavy任务跑在big core上,light任务跑在little core上,这样能快速地达到一个合理的负载状态。这种算法就叫做HMP负载均衡。EAS会统一地考虑负载、性能、功耗,所以EAS使能后HMP就被禁用了。
HMP负载均衡的操作分两种:
- 1、heavy task从little cpu迁移到big cpu。这种叫做up操作,对应的函数hmp_force_up_migration();
- 2、light task从big cpu迁移到little cpu。这种叫做down操作,对应的函数hmp_force_down_migration();
4.2.1、hmp domain初始化
hmp在初始化的时候会为每个cluster分配一个hmp_domain,并把所有hmp_domain加入到全局链表hmp_domains中。hmp_domains链表构建完成以后,离链表头hmp_domains最近的hmp_domain对应速度最快的cluster,离表头越远的hmp_domain对应的cluster速度越慢。因为构造链表时是按照cluster id依次用list_add()头插加入的,速度最快的cluster的hmp_domain最后加入,所以离表头最近。
static int __init hmp_cpu_mask_setup(void)
{
struct hmp_domain *domain;
struct list_head *pos;
int dc, cpu;
pr_warn("Initializing HMP scheduler:\n");
/* Initialize hmp_domains using platform code */
/* (1) 调用arch相关的hmp_domains初始化函数 */
arch_get_hmp_domains(&hmp_domains);
if (list_empty(&hmp_domains)) {
pr_warn("HMP domain list is empty!\n");
return 0;
}
/* Print hmp_domains */
dc = 0;
list_for_each(pos, &hmp_domains) {
domain = list_entry(pos, struct hmp_domain, hmp_domains);
for_each_cpu(cpu, &domain->possible_cpus) {
/* (2) 给per_cpu变量hmp_cpu_domain赋值 */
per_cpu(hmp_cpu_domain, cpu) = domain;
}
dc++;
}
return 1;
}
|→
void __init arch_get_hmp_domains(struct list_head *hmp_domains_list)
{
struct hmp_domain *domain;
struct cpumask cpu_mask;
int id, maxid;
cpumask_clear(&cpu_mask);
maxid = arch_get_nr_clusters();
/*
* Initialize hmp_domains
* Must be ordered with respect to compute capacity.
* Fastest domain at head of list.
*/
/* (1.1) 按照cluster id初始化对应的hmp_domain */
for (id = 0; id < maxid; id++) {
arch_get_cluster_cpus(&cpu_mask, id);
domain = (struct hmp_domain *)
kmalloc(sizeof(struct hmp_domain), GFP_KERNEL);
cpumask_copy(&domain->possible_cpus, &cpu_mask);
cpumask_and(&domain->cpus, cpu_online_mask, &domain->possible_cpus);
/* (1.2) 将hmp_domain加入到全局链表hmp_domains_list即hmp_domains中 */
list_add(&domain->hmp_domains, hmp_domains_list);
}
}
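arch_get_hmp_domains()按cluster id递增依次调用list_add(),而list_add()是头插,所以最后加入的(最快的)cluster正好位于hmp_domains表头。下面用一个极简的头插链表演示这个顺序,demo_domain等命名是为演示而假设的:
#include <stdio.h>

/* 用头插链表演示:按cluster id(0在前)依次加入,最后加入的cluster会位于表头 */
struct demo_domain {
	int cluster_id;
	struct demo_domain *next;
};

static void demo_list_add(struct demo_domain **head, struct demo_domain *d)
{
	d->next = *head;      /* 头插,等价于内核list_add()加在表头 */
	*head = d;
}

int main(void)
{
	struct demo_domain doms[2] = { { .cluster_id = 0 }, { .cluster_id = 1 } };
	struct demo_domain *head = NULL, *pos;

	for (int id = 0; id < 2; id++)          /* 与arch_get_hmp_domains()一样按id递增加入 */
		demo_list_add(&head, &doms[id]);

	for (pos = head; pos; pos = pos->next)  /* 表头是最后加入的cluster1(假设其为big) */
		printf("cluster%d\n", pos->cluster_id);
	return 0;
}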
4.2.2、hmp_force_up_migration()
hmp_force_up_migration()的操作主要有以下几个步骤:
需要重点提一下的是:负载计算中维护了3种负载(load_avg、loadwop_avg、util_avg),HMP负载均衡主要使用其中的loadwop_avg。
- 1、根据当前cpu,选择fast_cpu_mask、slow_cpu_mask;
hmp_force_up_migration尝试把slow cpu上的heavy进程迁移到fast cpu上,关于slow、fast的选择有以下几种场景:
- 2、选择当前cpu的heaviest进程作为迁移进程p;并不会遍历cpu上所有进程去选出heaviest进程,只会在curr进程和cfs_rq中最多5个进程里选出heaviest;
- 3、根据fast_cpu_mask,选择一个负载最少的target cpu;
- 4、根据源cpu(curr_cpu)、目的cpu(target_cpu),计算负载;
重要的数据计算方法:
重要数据 | 所属结构 | 含义 | 更新/获取函数 | 计算方法 |
---|---|---|---|---|
clbenv->bstats.cpu_power | clbenv->bstats | B族cpu的绝对计算能力 | sched_update_clbstats() | arch_scale_cpu_capacity(NULL, clbenv->btarget) |
clbenv->lstats.cpu_power | clbenv->lstats | L族cpu的绝对计算能力 | sched_update_clbstats() | arch_scale_cpu_capacity(NULL, clbenv->ltarget) |
clbenv->bstats.cpu_capacity | clbenv->bstats | B族cpu的相对计算能力,大于1024 | sched_update_clbstats() | SCHED_CAPACITY_SCALE * clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1) |
clbenv->lstats.cpu_capacity | clbenv->lstats | L族cpu的相对计算能力,等于1024 | sched_update_clbstats() | SCHED_CAPACITY_SCALE |
clbs->ncpu | clbenv->bstats/clbenv->lstats | L族/B族online的cpu数量 | collect_cluster_stats() | if (cpu_online(cpu)) clbs->ncpu++; |
clbs->ntask | clbenv->bstats/clbenv->lstats | L族/B族所有online cpu中所有层级se的总和 | collect_cluster_stats() | clbs->ntask += cpu_rq(cpu)->cfs.h_nr_running; |
clbs->load_avg | clbenv->bstats/clbenv->lstats | L族/B族online cpu的平均runnable负载,不带weight | collect_cluster_stats() | sum(cpu_rq(cpu)->cfs.avg.loadwop_avg)/clbs->ncpu |
clbs->scaled_acap | clbenv->bstats/clbenv->lstats | L族/B族target cpu计算能力的剩余值 | collect_cluster_stats() | hmp_scale_down(clbs->cpu_capacity - cpu_rq(target)->cfs.avg.loadwop_avg) |
clbs->scaled_atask | clbenv->bstats/clbenv->lstats | L族/B族target cpu的task space的剩余值 | collect_cluster_stats() | hmp_scale_down(clbs->cpu_capacity - cpu_rq(target)->cfs.h_nr_running * cpu_rq(target)->cfs.avg.loadwop_avg) |
clbenv->bstats.threshold | clbenv->bstats | 进程要up迁移到B族的负载门限值 | adj_threshold() | HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask / ((b_nacap + l_nacap) * (b_natask + l_natask) + 1);b_nacap、b_natask会乘以一个放大系数(b_cpu_power/l_cpu_power),类似如cpu_capacity的计算 |
clbenv->lstats.threshold | clbenv->lstats | 进程要down迁移到L族的负载门限值 | adj_threshold() | HMP_MAX_LOAD * l_nacap * l_natask / ((b_nacap + l_nacap) * (b_natask + l_natask) + 1);b_nacap、b_natask会乘以一个放大系数(b_cpu_power/l_cpu_power),类似如cpu_capacity的计算 |
- 5、根据计算的负载情况,判断进程p是否符合up迁移条件((se_load(se) > B->threshold),等其他条件);
up-migration条件列表(hmp_up_migration()):
条件 | 含义 | 计算方法 | 计算解析 |
[1] Migration stabilizing | 如果target cpu刚做过up迁移,不适合再进行迁移 | if (!hmp_up_stable(*target_cpu)) check->result = 0; | (((now - hmp_last_up_migration(cpu)) >> 10) |
static void run_rebalance_domains(struct softirq_action *h)
{
struct rq *this_rq = this_rq();
enum cpu_idle_type idle = this_rq->idle_balance ?
CPU_IDLE : CPU_NOT_IDLE;
int this_cpu = smp_processor_id();
/* bypass load balance of HMP if EAS consideration */
/* (1) 在EAS不使能的情况下,尝试进行HMP负载均衡 */
if ((!energy_aware() && sched_feat(SCHED_HMP)) ||
(hybrid_support() && cpu_rq(this_cpu)->rd->overutilized))
hmp_force_up_migration(this_cpu);
/*
* If this cpu has a pending nohz_balance_kick, then do the
* balancing on behalf of the other idle cpus whose ticks are
* stopped. Do nohz_idle_balance *before* rebalance_domains to
* give the idle cpus a chance to load balance. Else we may
* load balance only within the local sched_domain hierarchy
* and abort nohz_idle_balance altogether if we pull some load.
*/
nohz_idle_balance(this_rq, idle);
rebalance_domains(this_rq, idle);
}
|→
static void hmp_force_up_migration(int this_cpu)
{
int curr_cpu, target_cpu;
struct sched_entity *se;
struct rq *target;
unsigned long flags;
unsigned int force = 0;
struct task_struct *p;
struct clb_env clbenv;
#ifdef CONFIG_SCHED_HMP_PLUS
struct sched_entity *orig;
#endif
if (!spin_trylock(&hmp_force_migration))
return;
#ifdef CONFIG_HMP_TRACER
for_each_online_cpu(curr_cpu)
trace_sched_cfs_runnable_load(curr_cpu, cfs_load(curr_cpu), cfs_length(curr_cpu));
#endif
/* Migrate heavy task from LITTLE to big */
/* (1.1) 逐个online cpu尝试进行heavy task从little cpu到big cpu的迁移 */
for_each_online_cpu(curr_cpu) {
struct hmp_domain *hmp_domain = NULL;
struct cpumask fast_cpu_mask, slow_cpu_mask;
cpumask_clear(&fast_cpu_mask);
cpumask_clear(&slow_cpu_mask);
/* (1.2) 如果当前cpu不属于速度最快(fastest)的domain,
则尝试进行up操作
*/
if (!hmp_cpu_is_fastest(curr_cpu)) {
/* current cpu is slow_cpu_mask*/
/* (1.2.1) 当前cpu所在的hmp_domain为slow_cpu_mask */
hmp_domain = hmp_cpu_domain(curr_cpu);
cpumask_copy(&slow_cpu_mask, &hmp_domain->possible_cpus);
/* (1.2.2) 向更快的方向找到第一个有online cpu的hmp_domain,作为fast_cpu_mask */
while (&hmp_domain->hmp_domains != hmp_domains.next) {
struct list_head *pos = &hmp_domain->hmp_domains;
hmp_domain = list_entry(pos->prev, struct hmp_domain, hmp_domains);
if (!cpumask_empty(&hmp_domain->cpus)) {
cpumask_copy(&fast_cpu_mask, &hmp_domain->possible_cpus);
break;
}
}
} else {
/* (1.3) 如果当前cpu属于速度最快(fastest)的domain,
则直接进行down操作
*/
hmp_force_down_migration(this_cpu);
continue;
}
if (!hmp_domain || hmp_domain == hmp_cpu_domain(curr_cpu))
continue;
if (cpumask_empty(&fast_cpu_mask) || cpumask_empty(&slow_cpu_mask))
continue;
force = 0;
/* (1.4) 取出当前cpu的当前cfs进程 */
target = cpu_rq(curr_cpu);
raw_spin_lock_irqsave(&target->lock, flags);
se = target->cfs.curr;
if (!se) {
raw_spin_unlock_irqrestore(&target->lock, flags);
continue;
}
/* Find task entity */
if (!entity_is_task(se)) {
struct cfs_rq *cfs_rq;
cfs_rq = group_cfs_rq(se);
while (cfs_rq) {
se = cfs_rq->curr;
cfs_rq = group_cfs_rq(se);
}
}
#ifdef CONFIG_SCHED_HMP_PLUS
orig = se;
/* (1.5) 或者取出当前cpu前5个cfs进程中,负载最重(heaviest)的进程 */
se = hmp_get_heaviest_task(se, -1);
if (!se) {
raw_spin_unlock_irqrestore(&target->lock, flags);
continue;
}
if (!entity_is_task(se))
p = task_of(orig);
else
#endif
p = task_of(se);
/* (1.6) 选择fast_cpu_mask domain中,负载最少的cpu */
target_cpu = hmp_select_cpu(HMP_GB, p, &fast_cpu_mask, -1, 0);
if (target_cpu >= num_possible_cpus()) {
raw_spin_unlock_irqrestore(&target->lock, flags);
continue;
}
/* Collect cluster information */
/* (1.7) up操作的对象已经选择好:
源little cpu:curr_cpu
目的big cpu:target_cpu
*/
memset(&clbenv, 0, sizeof(clbenv));
clbenv.flags |= HMP_GB;
clbenv.ltarget = curr_cpu;
clbenv.btarget = target_cpu;
cpumask_copy(&clbenv.lcpus, &slow_cpu_mask);
cpumask_copy(&clbenv.bcpus, &fast_cpu_mask);
/* (1.8) up操作前的数据计算 */
sched_update_clbstats(&clbenv);
/* Check migration threshold */
/* (1.9) 根据计算的数据,判断up操作的可行性 */
if (!target->active_balance &&
hmp_up_migration(curr_cpu, &target_cpu, se, &clbenv) &&
!cpu_park(cpu_of(target))) {
if (p->state != TASK_DEAD) {
/* 准备从target rq中迁移进程p到target_cpu,
设置rq正在处理负载balance标志active_balance */
get_task_struct(p);
target->active_balance = 1; /* force up */
target->push_cpu = target_cpu;
target->migrate_task = p;
force = 1;
trace_sched_hmp_migrate(p, target->push_cpu, 1);
hmp_next_up_delay(&p->se, target->push_cpu);
}
}
raw_spin_unlock_irqrestore(&target->lock, flags);
/* (1.10) 判断结果是可以进行up操作,
则调用hmp_force_up_cpu_stop()进行实际的up操作
*/
if (force) {
if (stop_one_cpu_dispatch(cpu_of(target),
hmp_force_up_cpu_stop,
target, &target->active_balance_work)) {
/* 迁移完成,清除标志 */
put_task_struct(p); /* out of rq->lock */
raw_spin_lock_irqsave(&target->lock, flags);
target->active_balance = 0;
force = 0;
raw_spin_unlock_irqrestore(&target->lock, flags);
}
} else
/* (1.11) 否则,再尝试进行down操作 */
hmp_force_down_migration(this_cpu);
}
#ifdef CONFIG_HMP_TRACER
trace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg);
#endif
spin_unlock(&hmp_force_migration);
}
||→
static const int hmp_max_tasks = 5;
static struct sched_entity *hmp_get_heaviest_task(
struct sched_entity *se, int target_cpu)
{
int num_tasks = hmp_max_tasks;
struct sched_entity *max_se = se;
unsigned long int max_ratio = se->avg.loadwop_avg;
const struct cpumask *hmp_target_mask = NULL;
struct hmp_domain *hmp;
/* (1.5.1) 如果本cpu是fastest cpu,则不用查找直接返回,
因为本函数的目的是找little cpu中的heaviest进程
*/
if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq)))
return max_se;
/* (1.5.2) 获取比本cpu faster一级的hmp_domain,作为进程亲和力判断的mask */
hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq));
hmp_target_mask = &hmp->cpus;
/* (1.5.3) 传入参数target_cpu = -1,
所以hmp_target_mask使用的是源cpu hmp_domain的hmp->cpus
*/
if (target_cpu >= 0) {
/* idle_balance gets run on a CPU while
* it is in the middle of being hotplugged
* out. Bail early in that case.
*/
if (!cpumask_test_cpu(target_cpu, hmp_target_mask))
return NULL;
hmp_target_mask = cpumask_of(target_cpu);
}
/* The currently running task is not on the runqueue */
/* (1.5.4) 从当前cpu的cfs红黑树中,连续5个进程和curr进程比较,选出heaviest进程
比较使用的负载为se->avg.loadwop_avg,不带weight分量
*/
se = __pick_first_entity(cfs_rq_of(se));
while (num_tasks && se) {
if (entity_is_task(se) && se->avg.loadwop_avg > max_ratio &&
cpumask_intersects(hmp_target_mask, tsk_cpus_allowed(task_of(se)))) {
max_se = se;
max_ratio = se->avg.loadwop_avg;
}
se = __pick_next_entity(se);
num_tasks--;
}
return max_se;
}
||→
static unsigned int hmp_select_cpu(unsigned int caller, struct task_struct *p,
struct cpumask *mask, int prev, int up)
{
int curr = 0;
int target = num_possible_cpus();
unsigned long curr_wload = 0;
unsigned long target_wload = 0;
struct cpumask srcp;
/* (1.6.1) 综合fast_cpu_mask、cpu_online_mask、tsk_cpus_allowed(p),
选取first cpu为target
*/
cpumask_and(&srcp, cpu_online_mask, mask);
target = cpumask_any_and(&srcp, tsk_cpus_allowed(p));
if (target >= num_possible_cpus())
goto out;
/*
* RT class is taken into account because CPU load is multiplied
* by the total number of CPU runnable tasks that includes RT tasks.
*/
/* (1.6.2) 计算target cpu所对应的load,
target_wload = (rq->cfs.avg.loadwop_avg + rq->cfs.avg.pending_load) * (rq->nr_running + rq->cfs.avg.nr_pending)
该负载会受RT进程的影响,因为rq->nr_running会统计包括RT进程的数量
*/
target_wload = hmp_inc(cfs_load(target));
target_wload += cfs_pending_load(target);
target_wload *= rq_length(target);
for_each_cpu(curr, mask) {
/* Check CPU status and task affinity */
if (!cpu_online(curr) || !cpumask_test_cpu(curr, tsk_cpus_allowed(p)))
continue;
/* For global load balancing, unstable CPU will be bypassed */
/* (1.6.3) 如果当前是up操作,如果cpu在短时间内进行了down操作,则不适合马上进行up操作 */
if (hmp_caller_is_gb(caller) && !hmp_cpu_stable(curr, up))
continue;
curr_wload = hmp_inc(cfs_load(curr));
curr_wload += cfs_pending_load(curr);
curr_wload *= rq_length(curr);
/* (1.6.4) 选择load最小的作为target cpu */
if (curr_wload < target_wload) {
target_wload = curr_wload;
target = curr;
/* (1.6.5) 在load同样小的情况下,选择prev cpu */
} else if (curr_wload == target_wload && curr == prev) {
target = curr;
}
}
out:
return target;
}
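hmp_select_cpu()比较的加权负载可以近似理解为:wload = (cfs负载 + pending负载) * rq长度,rq长度包含RT任务,所以RT任务多的cpu会被惩罚。下面是一个省略了hmp_inc等细节的简化数值演示,数组内容均为假设值:
#include <stdio.h>

/* 演示hmp_select_cpu()的加权负载比较:wload = (cfs负载+pending负载) * rq长度 */
int main(void)
{
	unsigned long loadwop[2]  = { 400, 300 };   /* cfs.avg.loadwop_avg,假设值 */
	unsigned long pending[2]  = { 0, 100 };     /* cfs.avg.pending_load,假设值 */
	unsigned long rq_len[2]   = { 2, 3 };       /* rq_length():含RT任务 */
	int target = 0, prev = 1;
	unsigned long target_wload = (loadwop[0] + pending[0]) * rq_len[0];

	for (int cpu = 1; cpu < 2; cpu++) {
		unsigned long wload = (loadwop[cpu] + pending[cpu]) * rq_len[cpu];
		/* 选最小wload,相等时优先prev cpu */
		if (wload < target_wload ||
		    (wload == target_wload && cpu == prev)) {
			target_wload = wload;
			target = cpu;
		}
	}
	printf("target=%d wload=%lu\n", target, target_wload);
	return 0;
}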
||→
static void sched_update_clbstats(struct clb_env *clbenv)
{
/* init cpu power and capacity */
/* (1.8.1) L族和B族的绝对运行能力和相对运算能力,
.cpu_power = 绝对运算能力
.cpu_capacity = 相对运算能力
*/
clbenv->bstats.cpu_power = (int) arch_scale_cpu_capacity(NULL, clbenv->btarget);
clbenv->lstats.cpu_power = (int) arch_scale_cpu_capacity(NULL, clbenv->ltarget);
clbenv->lstats.cpu_capacity = SCHED_CAPACITY_SCALE;
clbenv->bstats.cpu_capacity = SCHED_CAPACITY_SCALE * clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1);
/* (1.8.2) 统计L族和B族各自cluster的负载信息(ncpu、ntask、load_avg、scaled_acap、scaled_atask等) */
collect_cluster_stats(&clbenv->bstats, &clbenv->bcpus, clbenv->btarget);
collect_cluster_stats(&clbenv->lstats, &clbenv->lcpus, clbenv->ltarget);
/* (1.8.3) 根据L族和B族的剩余能力,动态调整up/down迁移的threshold */
adj_threshold(clbenv);
}
|||→
static void collect_cluster_stats(struct clb_stats *clbs, struct cpumask *cluster_cpus, int target)
{
#define HMP_RESOLUTION_SCALING (4)
#define hmp_scale_down(w) ((w) >> HMP_RESOLUTION_SCALING)
/* Update cluster informatics */
int cpu;
/* (1.8.2.1) 累加本族online cpu的值 */
for_each_cpu(cpu, cluster_cpus) {
if (cpu_online(cpu)) {
clbs->ncpu++;
clbs->ntask += cpu_rq(cpu)->cfs.h_nr_running;
clbs->load_avg += cpu_rq(cpu)->cfs.avg.loadwop_avg;
#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
clbs->nr_normal_prio_task += cfs_nr_normal_prio(cpu);
clbs->nr_dequeuing_low_prio += cfs_nr_dequeuing_low_prio(cpu);
#endif
}
}
if (!clbs->ncpu || target >= num_possible_cpus() || !cpumask_test_cpu(target, cluster_cpus))
return;
/*
* Calculate available CPU capacity
* Calculate available task space
*
* Why load ratio should be multiplied by the number of task ?
* The task is the entity of scheduling unit so that we should consider
* it in scheduler. Only considering task load is not enough.
* Thus, multiplying the number of tasks can adjust load ratio to a more
* reasonable value.
*/
/* (1.8.2.2) 计算本族剩余的cpu计算能力
acap = 相对计算能力(clbs->cpu_capacity) - target cpu的负载(rq->cfs.avg.loadwop_avg)
:注意clbs->cpu_capacity是B族/L族之间的相对值(L等于1024,B大于1024),而负载(rq->cfs.avg.loadwop_avg)是相对各自cpu的,B族和L族最大值都是1024,二者并不在同一量纲上
*/
clbs->load_avg /= clbs->ncpu;
clbs->acap = clbs->cpu_capacity - cpu_rq(target)->cfs.avg.loadwop_avg;
clbs->scaled_acap = hmp_scale_down(clbs->acap);
/* (1.8.2.3) 计算本族剩余的task空间
scaled_atask = 相对计算能力(clbs->cpu_capacity) - 本cpu的负载(rq->cfs.avg.loadwop_avg)*本cpu所有的进程数量(rq->cfs.h_nr_running)
ooooo这里的计算也不是在同一维度上的
*/
clbs->scaled_atask = cpu_rq(target)->cfs.h_nr_running * cpu_rq(target)->cfs.avg.loadwop_avg;
clbs->scaled_atask = clbs->cpu_capacity - clbs->scaled_atask;
clbs->scaled_atask = hmp_scale_down(clbs->scaled_atask);
mt_sched_printf(sched_log, "[%s] cpu/cluster:%d/%02lx load/len:%lu/%u stats:%d,%d,%d,%d,%d,%d,%d,%d\n",
__func__, target, *cpumask_bits(cluster_cpus),
cpu_rq(target)->cfs.avg.loadwop_avg,
cpu_rq(target)->cfs.h_nr_running,
clbs->ncpu, clbs->ntask, clbs->load_avg, clbs->cpu_capacity,
clbs->acap, clbs->scaled_acap, clbs->scaled_atask, clbs->threshold);
}
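collect_cluster_stats()中scaled_acap/scaled_atask的计算可以用下面的简化示例来理解,其中cpu_capacity、loadwop_avg、h_nr_running均为假设值:
#include <stdio.h>

#define HMP_RESOLUTION_SCALING 4
#define hmp_scale_down(w) ((w) >> HMP_RESOLUTION_SCALING)

/* 演示scaled_acap/scaled_atask的计算:剩余capacity和剩余task空间再做右移缩放 */
int main(void)
{
	int cpu_capacity = 1725;      /* 假设的B族相对capacity(L族为1024) */
	int target_loadwop = 600;     /* target cpu的cfs.avg.loadwop_avg,假设值 */
	int target_nr = 2;            /* target cpu的cfs.h_nr_running,假设值 */

	int acap = cpu_capacity - target_loadwop;
	int scaled_acap = hmp_scale_down(acap);
	int scaled_atask = hmp_scale_down(cpu_capacity - target_nr * target_loadwop);

	printf("acap=%d scaled_acap=%d scaled_atask=%d\n",
	       acap, scaled_acap, scaled_atask);
	return 0;
}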
|||→
/*
* Task Dynamic Migration Threshold Adjustment.
*
* If the workload between clusters is not balanced, adjust migration
* threshold in an attempt to move task precisely.
*
* Diff. = Max Threshold - Min Threshold
*
* Dynamic UP-Threshold =
* B_nacap B_natask
* Max Threshold - Diff. x ----------------- x -------------------
* B_nacap + L_nacap B_natask + L_natask
*
*
* Dynamic Down-Threshold =
* L_nacap L_natask
* Min Threshold + Diff. x ----------------- x -------------------
* B_nacap + L_nacap B_natask + L_natask
*/
static void adj_threshold(struct clb_env *clbenv)
{
#define POSITIVE(x) ((int)(x) < 0 ? 0 : (x))
unsigned long b_cap = 0, l_cap = 0;
int b_nacap, l_nacap, b_natask, l_natask;
b_cap = clbenv->bstats.cpu_power;
l_cap = clbenv->lstats.cpu_power;
/* (1.8.3.1) 把B族剩余cpu计算能力和task空间,转换成L族的相对值 */
b_nacap = POSITIVE(clbenv->bstats.scaled_acap *
clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1));
b_natask = POSITIVE(clbenv->bstats.scaled_atask *
clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1));
/* L族的值维持不变 */
l_nacap = POSITIVE(clbenv->lstats.scaled_acap);
l_natask = POSITIVE(clbenv->lstats.scaled_atask);
/* (1.8.3.2) 计算up的threshold,
up-threshold = HMP_MAX_LOAD - HMP_MAX_LOAD * (B族剩余capacity占比 * B族剩余task空间占比)
*/
clbenv->bstats.threshold = HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask /
((b_nacap + l_nacap) * (b_natask + l_natask) + 1);
/* (1.8.3.3) 计算down的threshold,
down-threshold = HMP_MAX_LOAD * (L族剩余capacity占比 * L族剩余task空间占比)
*/
clbenv->lstats.threshold = HMP_MAX_LOAD * l_nacap * l_natask /
((b_nacap + l_nacap) * (b_natask + l_natask) + 1);
mt_sched_printf(sched_log, "[%s]\tup/dl:%4d/%4d L(%d:%4lu) b(%d:%4lu)\n", __func__,
clbenv->bstats.threshold, clbenv->lstats.threshold,
clbenv->ltarget, l_cap, clbenv->btarget, b_cap);
}
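adj_threshold()的动态门限计算可以用一组假设的数值走一遍:B族剩余值先按cpu_power比例换算到L族量纲,再按剩余占比缩放HMP_MAX_LOAD(这里假设HMP_MAX_LOAD为1023)。示例如下:
#include <stdio.h>

#define HMP_MAX_LOAD 1023          /* 假设值 */
#define POSITIVE(x) ((int)(x) < 0 ? 0 : (x))

/* 演示动态up/down threshold的计算(adj_threshold),输入均为假设值 */
int main(void)
{
	int b_power = 1446, l_power = 858;            /* arch_scale_cpu_capacity,假设值 */
	int b_scaled_acap = 70, b_scaled_atask = 32;  /* B族剩余capacity/task空间,假设值 */
	int l_scaled_acap = 40, l_scaled_atask = 25;  /* L族剩余capacity/task空间,假设值 */

	/* B族的剩余值按cpu_power比例换算到L族的量纲上 */
	int b_nacap  = POSITIVE(b_scaled_acap  * b_power / (l_power + 1));
	int b_natask = POSITIVE(b_scaled_atask * b_power / (l_power + 1));
	int l_nacap  = POSITIVE(l_scaled_acap);
	int l_natask = POSITIVE(l_scaled_atask);

	int up_thres = HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask /
			((b_nacap + l_nacap) * (b_natask + l_natask) + 1);
	int down_thres = HMP_MAX_LOAD * l_nacap * l_natask /
			((b_nacap + l_nacap) * (b_natask + l_natask) + 1);

	/* B族越空闲,up_thres越低(越容易up迁移);L族越空闲,down_thres越高 */
	printf("up=%d down=%d\n", up_thres, down_thres);
	return 0;
}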
||→
/*
* Check whether this task should be migrated to big
* Briefly summarize the flow as below;
* 1) Migration stabilizing
* 2) Filter low-priority task
* 2.5) Keep all cpu busy
* 3) Check CPU capacity
* 4) Check dynamic migration threshold
*/
static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se,
struct clb_env *clbenv)
{
struct task_struct *p = task_of(se);
struct clb_stats *L, *B;
struct mcheck *check;
int curr_cpu = cpu;
#ifdef CONFIG_HMP_TRACER
unsigned int caller = clbenv->flags;
#endif
L = &clbenv->lstats;
B = &clbenv->bstats;
check = &clbenv->mcheck;
check->status = clbenv->flags;
check->status |= HMP_TASK_UP_MIGRATION;
check->result = 0;
/*
* No migration is needed if
* 1) There is only one cluster
* 2) Task is already in big cluster
* 3) It violates task affinity
*/
if (!L->ncpu || !B->ncpu
|| cpumask_test_cpu(curr_cpu, &clbenv->bcpus)
|| !cpumask_intersects(&clbenv->bcpus, tsk_cpus_allowed(p)))
goto out;
/* (1.9.1) 如果目标cpu短时间内已经执行了up操作,则为up unstable状态,退出 */
/*
* [1] Migration stabilizing
* Let the task load settle before doing another up migration.
* It can prevent a bunch of tasks from migrating to a unstable CPU.
*/
if (!hmp_up_stable(*target_cpu))
goto out;
/* (1.9.2) 过滤掉优先级较低的进程,不进行迁移操作。具体有3个条件:
(task_low_priority(p->prio) && \ // nice值大于5
(B->ntask >= B->ncpu || 0 != L->nr_normal_prio_task) && \ // B族进程数大于等于cpu数 || L族正常优先级的进程不为0
(p->se.avg.loadwop_avg < 800)) // 平均负载小于800
*/
/* [2] Filter low-priority task */
#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
if (hmp_low_prio_task_up_rejected(p, B, L)) {
check->status |= HMP_LOW_PRIORITY_FILTER;
goto trace;
}
#endif
/* (1.9.3) 如果B组的target cpu为idle,不用过多判断,直接准备迁移 */
/* [2.5]if big is idle, just go to big */
if (rq_length(*target_cpu) == 0) {
check->status |= HMP_BIG_IDLE;
check->status |= HMP_MIGRATION_APPROVED;
check->result = 1;
goto trace;
}
/* (1.9.4) 判断B族target cpu的capacity是否足够,
(se_load(se) + cfs_load(cpu)) < (B->cpu_capacity - (B->cpu_capacity >> 2))
// target cpu负载 + 要迁移的se负载 是否小于 3/4 B族cpu的capacity
*/
/*
* [3] Check CPU capacity
* Forbid up-migration if big CPU can't handle this task
*/
if (!hmp_task_fast_cpu_afford(B, se, *target_cpu)) {
check->status |= HMP_BIG_CAPACITY_INSUFFICIENT;
goto trace;
}
/* (1.9.5) 判断se的负载是否已经大于up-threshold(B->threshold) */
/*
* [4] Check dynamic migration threshold
* Migrate task from LITTLE to big if load is greater than up-threshold
*/
if (se_load(se) > B->threshold) {
check->status |= HMP_MIGRATION_APPROVED;
check->result = 1;
}
trace:
#ifdef CONFIG_HMP_TRACER
if (check->result && hmp_caller_is_gb(caller))
hmp_stats.nr_force_up++;
trace_sched_hmp_stats(&hmp_stats);
trace_sched_dynamic_threshold(task_of(se), B->threshold, check->status,
curr_cpu, *target_cpu, se_load(se), B, L);
trace_sched_dynamic_threshold_draw(B->threshold, L->threshold);
#endif
out:
return check->result;
}
||→
static int hmp_force_up_cpu_stop(void *data)
{
/* (1.10.1) 执行进程迁移 */
return hmp_active_task_migration_cpu_stop(data);
}
|||→
static int hmp_active_task_migration_cpu_stop(void *data)
{
struct rq *busiest_rq = data;
struct task_struct *p = NULL;
int busiest_cpu = cpu_of(busiest_rq);
int target_cpu = busiest_rq->push_cpu;
struct rq *target_rq = cpu_rq(target_cpu);
struct sched_domain *sd;
raw_spin_lock_irq(&busiest_rq->lock);
p = busiest_rq->migrate_task;
/* make sure the requested cpu hasn't gone down in the meantime */
if (unlikely(busiest_cpu != smp_processor_id() ||
!busiest_rq->active_balance)) {
goto out_unlock;
}
/* Is there any task to move? */
if (busiest_rq->nr_running <= 1)
goto out_unlock;
/* Are both target and busiest cpu online */
if (!cpu_online(busiest_cpu) || !cpu_online(target_cpu))
goto out_unlock;
/* Task has migrated meanwhile, abort forced migration */
if ((!p) || (task_rq(p) != busiest_rq))
goto out_unlock;
/*
* This condition is "impossible", if it occurs
* we need to fix it. Originally reported by
* Bjorn Helgaas on a 128-cpu setup.
*/
WARN_ON(busiest_rq == target_rq);
/* (1.10.1.1) 将源、目的rq lock住 */
/* move a task from busiest_rq to target_rq */
double_lock_balance(busiest_rq, target_rq);
/* (1.10.1.2) 搜索target cpu所在的某一层次的sd,其sd->span[]即包含源cpu又包含目的cpu */
/* Search for an sd spanning us and the target CPU. */
rcu_read_lock();
for_each_domain(target_cpu, sd) {
if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
break;
}
/* (1.10.1.3) 构造数据,在同一sd下进行迁移 */
if (likely(sd)) {
struct lb_env env = {
.sd = sd,
.dst_cpu = target_cpu,
.dst_rq = target_rq,
.src_cpu = busiest_rq->cpu,
.src_rq = busiest_rq,
.idle = CPU_IDLE,
};
schedstat_inc(sd, alb_count);
/* (1.10.1.4) 任务迁移 */
if (move_specific_task(&env, p))
schedstat_inc(sd, alb_pushed);
else
schedstat_inc(sd, alb_failed);
}
rcu_read_unlock();
double_unlock_balance(busiest_rq, target_rq);
out_unlock:
busiest_rq->active_balance = 0;
raw_spin_unlock_irq(&busiest_rq->lock);
put_task_struct(p);
return 0;
}
||||→
static int move_specific_task(struct lb_env *env, struct task_struct *pm)
{
struct task_struct *p, *n;
/* (1.10.1.4.1) 从源rq->cfs_tasks逐个取出任务,直到查到pm */
list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
/* (1.10.1.4.2) task group的throttled判断 */
if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
env->dst_cpu))
continue;
/* (1.10.1.4.3) 判断任务能否被迁移 */
if (!hmp_can_migrate_task(p, env))
continue;
/* Check if we found the right task */
if (p != pm)
continue;
/* (1.10.1.4.4) 迁移 */
move_task(p, env);
/*
* Right now, this is only the third place move_task()
* is called, so we can safely collect move_task()
* stats here rather than inside move_task().
*/
schedstat_inc(env->sd, lb_gained[env->idle]);
return 1;
}
return 0;
}
|||||→
static void move_task(struct task_struct *p, struct lb_env *env)
{
deactivate_task(env->src_rq, p, 0);
set_task_cpu(p, env->dst_cpu);
activate_task(env->dst_rq, p, 0);
check_preempt_curr(env->dst_rq, p, 0);
}
4.2.3、hmp_force_down_migration()
hmp_force_down_migration()的操作主要有以下几个步骤:
- 1、根据当前cpu,选择fast_cpu_mask、slow_cpu_mask;
hmp_force_down_migration尝试把fast cpu上的light进程迁移到slow cpu上,关于fast、slow的选择有以下几种场景:
- 2、选择当前cpu的lightest进程作为迁移进程p;并不会遍历cpu上所有进程去选出lightest进程,只会在curr进程和cfs_rq中最多5个进程里选出lightest;
- 3、根据slow_cpu_mask,选择一个负载最少的target cpu;
- 4、根据源cpu(curr_cpu)、目的cpu(target_cpu),计算负载;
重要的数据计算方法和hmp_force_up_migration()一致,参考上一节;
- 5、根据计算的负载情况,判断进程p是否符合down迁移条件((L->threshold >= se_load(se)),等其他条件);
down-migration条件列表(hmp_down_migration()):
条件 | 含义 | 计算方法 | 计算解析 |
[1] Migration stabilizing | 如果target cpu刚做过down迁移,不适合再进行迁移 | if (!hmp_down_stable(*target_cpu)) check->result = 0; | (((now - hmp_last_down_migration(cpu)) >> 10) |
static void hmp_force_down_migration(int this_cpu)
{
int target_cpu;
struct sched_entity *se;
struct rq *target;
unsigned long flags;
unsigned int force = 0;
struct task_struct *p;
struct clb_env clbenv;
#ifdef CONFIG_SCHED_HMP_PLUS
struct sched_entity *orig;
int B_cpu;
#endif
struct hmp_domain *hmp_domain = NULL;
struct cpumask fast_cpu_mask, slow_cpu_mask;
cpumask_clear(&fast_cpu_mask);
cpumask_clear(&slow_cpu_mask);
/* Migrate light task from big to LITTLE */
/* (1) 如果当前cpu不是最慢的cpu(slowest),则尝试down操作 */
if (!hmp_cpu_is_slowest(this_cpu)) {
/* (2) 当前cpu所在的hmp_domain为fast_cpu_mask */
hmp_domain = hmp_cpu_domain(this_cpu);
cpumask_copy(&fast_cpu_mask, &hmp_domain->possible_cpus);
/* (3) 查找相比当前最慢且online的hmp_domain作为slow_cpu_mask */
while (!list_is_last(&hmp_domain->hmp_domains, &hmp_domains)) {
struct list_head *pos = &hmp_domain->hmp_domains;
hmp_domain = list_entry(pos->next, struct hmp_domain, hmp_domains);
if (!cpumask_empty(&hmp_domain->cpus)) {
cpumask_copy(&slow_cpu_mask, &hmp_domain->possible_cpus);
break;
}
}
}
if (!hmp_domain || hmp_domain == hmp_cpu_domain(this_cpu))
return;
/* (4) 找不到可操作的fast_cpu_mask、slow_cpu_mask直接返回 */
if (cpumask_empty(&fast_cpu_mask) || cpumask_empty(&slow_cpu_mask))
return;
/* (5) 源cpu = this_cpu,源rq = target */
force = 0;
target = cpu_rq(this_cpu);
raw_spin_lock_irqsave(&target->lock, flags);
se = target->cfs.curr;
if (!se) {
raw_spin_unlock_irqrestore(&target->lock, flags);
return;
}
/* (6) 首先尝试使用curr进程作为down迁移的进程 */
/* Find task entity */
if (!entity_is_task(se)) {
struct cfs_rq *cfs_rq;
cfs_rq = group_cfs_rq(se);
while (cfs_rq) {
se = cfs_rq->curr;
cfs_rq = group_cfs_rq(se);
}
}
#ifdef CONFIG_SCHED_HMP_PLUS
/* (7) 在curr进程开始的5个进程中,挑负载最轻的进程作为down迁移进程 */
orig = se;
se = hmp_get_lightest_task(orig, 1);
if (!entity_is_task(se))
p = task_of(orig);
else
#endif
p = task_of(se);
#ifdef CONFIG_SCHED_HMP_PLUS
/* (8) 找出B族中负载最轻的cpu,如果其为idle状态,则放弃down操作
因为load_balance中的idle_balance会重新把任务迁移回idle的big cpu,避免相互的乒乓操作
*/
/* Don't offload to little if there is one idle big, let load balance to do it's work */
/* Also, to prevent idle_balance from leading to potential ping-pong */
B_cpu = hmp_select_cpu(HMP_GB, p, &fast_cpu_mask, this_cpu, 0);
if (B_cpu < nr_cpu_ids && !rq_length(B_cpu)) {
raw_spin_unlock_irqrestore(&target->lock, flags);
return;
}
#endif
/* (9) 找出L族中负载最轻的cpu作为target_cpu */
target_cpu = hmp_select_cpu(HMP_GB, p, &slow_cpu_mask, -1, 1);
if (target_cpu >= num_possible_cpus()) {
raw_spin_unlock_irqrestore(&target->lock, flags);
return;
}
/* (10) 迁移前对B族、L族负载和threshold的计算 */
/* Collect cluster information */
memset(&clbenv, 0, sizeof(clbenv));
clbenv.flags |= HMP_GB;
clbenv.btarget = this_cpu;
clbenv.ltarget = target_cpu;
cpumask_copy(&clbenv.lcpus, &slow_cpu_mask);
cpumask_copy(&clbenv.bcpus, &fast_cpu_mask);
sched_update_clbstats(&clbenv);
#ifdef CONFIG_SCHED_HMP_PLUS
if (cpu_rq(this_cpu)->cfs.h_nr_running < 2) {
raw_spin_unlock_irqrestore(&target->lock, flags);
return;
}
#endif
/* (11) 检查down操作的迁移条件是否成立,hmp_down_migration() */
/* Check migration threshold */
if (!target->active_balance &&
hmp_down_migration(this_cpu, &target_cpu, se, &clbenv) &&
!cpu_park(cpu_of(target))) {
if (p->state != TASK_DEAD) {
get_task_struct(p);
target->active_balance = 1; /* force down */
target->push_cpu = target_cpu;
target->migrate_task = p;
force = 1;
trace_sched_hmp_migrate(p, target->push_cpu, 1);
hmp_next_down_delay(&p->se, target->push_cpu);
}
}
raw_spin_unlock_irqrestore(&target->lock, flags);
/* (12) 条件成立进行实际的down迁移操作hmp_force_down_cpu_stop() */
if (force) {
if (stop_one_cpu_dispatch(cpu_of(target),
hmp_force_down_cpu_stop,
target, &target->active_balance_work)) {
put_task_struct(p); /* out of rq->lock */
raw_spin_lock_irqsave(&target->lock, flags);
target->active_balance = 0;
force = 0;
raw_spin_unlock_irqrestore(&target->lock, flags);
}
}
}
|→
static struct sched_entity *hmp_get_lightest_task(
struct sched_entity *se, int migrate_down)
{
int num_tasks = hmp_max_tasks;
struct sched_entity *min_se = se;
unsigned long int min_ratio = se->avg.loadwop_avg;
const struct cpumask *hmp_target_mask = NULL;
if (migrate_down) {
struct hmp_domain *hmp;
/* (7.1) 如果cpu是最慢cpu(slowest)则直接退出,
因为本函数的目的是找出faster cpu中lightest进程
*/
if (hmp_cpu_is_slowest(cpu_of(se->cfs_rq->rq)))
return min_se;
/* (7.2) 将更slow一级的hmp_domain作为进程cpu亲和力的mask */
hmp = hmp_slower_domain(cpu_of(se->cfs_rq->rq));
hmp_target_mask = &hmp->cpus;
}
/* The currently running task is not on the runqueue */
se = __pick_first_entity(cfs_rq_of(se));
/* (7.3) 从当前cpu的cfs红黑树中,连续5个进程和curr进程比较,选出lightest进程
比较使用的负载为se->avg.loadwop_avg,不带weight分量
*/
while (num_tasks && se) {
if (entity_is_task(se) &&
(se->avg.loadwop_avg < min_ratio && hmp_target_mask &&
cpumask_intersects(hmp_target_mask, tsk_cpus_allowed(task_of(se))))) {
min_se = se;
min_ratio = se->avg.loadwop_avg;
}
se = __pick_next_entity(se);
num_tasks--;
}
return min_se;
}
|→
/*
* Check whether this task should be migrated to LITTLE
* Briefly summarize the flow as below;
* 1) Migration stabilizing
* 1.5) Keep all cpu busy
* 2) Filter low-priority task
* 3) Check CPU capacity
* 4) Check dynamic migration threshold
*/
static unsigned int hmp_down_migration(int cpu, int *target_cpu, struct sched_entity *se,
struct clb_env *clbenv)
{
struct task_struct *p = task_of(se);
struct clb_stats *L, *B;
struct mcheck *check;
int curr_cpu = cpu;
unsigned int caller = clbenv->flags;
L = &clbenv->lstats;
B = &clbenv->bstats;
check = &clbenv->mcheck;
check->status = caller;
check->status |= HMP_TASK_DOWN_MIGRATION;
check->result = 0;
/*
* No migration is needed if
* 1) There is only one cluster
* 2) Task is already in LITTLE cluster
* 3) It violates task affinity
*/
if (!L->ncpu || !B->ncpu
|| cpumask_test_cpu(curr_cpu, &clbenv->lcpus)
|| !cpumask_intersects(&clbenv->lcpus, tsk_cpus_allowed(p)))
goto out;
/* (11.1) 目的little cpu target_cpu近期如果有做过down操作,不适合再做down迁移 */
/*
* [1] Migration stabilizing
* Let the task load settle before doing another down migration.
* It can prevent a bunch of tasks from migrating to a unstable CPU.
*/
if (!hmp_down_stable(*target_cpu))
goto out;
/* (11.2) 如果big busy,little idle则不用进行threshold判断 */
/* [1.5]if big is busy and little is idle, just go to little */
if (rq_length(*target_cpu) == 0 && caller == HMP_SELECT_RQ && rq_length(curr_cpu) > 0) {
struct rq *curr_rq = cpu_rq(curr_cpu);
/* (11.2.1) 如果big cpu上的curr进程不是heavy进程而唤醒的进程p是heavy进程,则不准许down迁移;
其余情况直接准许down迁移。heavy进程的判断标准为:负载>=650
*/
/* if current big core is not heavy task and wake up task is heavy task no go to little */
if (!(!is_heavy_task(curr_rq->curr) && is_heavy_task(p))) {
check->status |= HMP_BIG_BUSY_LITTLE_IDLE;
check->status |= HMP_MIGRATION_APPROVED;
check->result = 1;
goto trace;
}
}
/* (11.3) 低优先级进程,如果满足以下条件,准许迁移:
(task_low_priority(p->prio) && !B->nr_dequeuing_low_prio && \ // nice值大于5
B->ntask >= B->ncpu && 0 != L->nr_normal_prio_task && \ // B和L都不是特别空闲
(p->se.avg.loadwop_avg < 800)) // 待迁移进程p的负载小于800
*/
/* [2] Filter low-priority task */
#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
if (hmp_low_prio_task_down_allowed(p, B, L)) {
cfs_nr_dequeuing_low_prio(curr_cpu)++;
check->status |= HMP_LOW_PRIORITY_FILTER;
check->status |= HMP_MIGRATION_APPROVED;
check->result = 1;
goto trace;
}
#endif
/*
* [3] Check CPU capacity
* Forbid down-migration if either of the following conditions is true
* 1) big cpu is not oversubscribed (if big CPU seems to have spare
* cycles, do not force this task to run on LITTLE CPU, but
* keep it staying in its previous cluster instead)
* 2) LITTLE cpu doesn't have available capacity for this new task
*/
/* (11.4) 如果big cpu有足够的空闲周期,不需要强制把light任务迁移到little cpu上
cfs_load(cpu) < (B->cpu_capacity - (B->cpu_capacity >> 2))
*/
if (!hmp_fast_cpu_oversubscribed(caller, B, se, curr_cpu)) {
check->status |= HMP_BIG_NOT_OVERSUBSCRIBED;
goto trace;
}
/* (11.5) 判断L族cpu的capacity是否足够容纳需要迁移的进程,
(L->acap > 0 && L->acap >= se_load(se))
*/
if (!hmp_task_slow_cpu_afford(L, se)) {
check->status |= HMP_LITTLE_CAPACITY_INSUFFICIENT;
goto trace;
}
/* (11.6) 判断se的负载是否已经小于down-threshold(L->threshold) */
/*
* [4] Check dynamic migration threshold
* Migrate task from big to LITTLE if load ratio is less than
* or equal to down-threshold
*/
if (L->threshold >= se_load(se)) {
check->status |= HMP_MIGRATION_APPROVED;
check->result = 1;
}
trace:
#ifdef CONFIG_HMP_TRACER
if (check->result && hmp_caller_is_gb(caller))
hmp_stats.nr_force_down++;
trace_sched_hmp_stats(&hmp_stats);
trace_sched_dynamic_threshold(task_of(se), L->threshold, check->status,
curr_cpu, *target_cpu, se_load(se), B, L);
trace_sched_dynamic_threshold_draw(B->threshold, L->threshold);
#endif
out:
return check->result;
}
4.2.4、hmp_select_task_rq_fair()
4.3、cpu freq调整
前面讲的负载均衡的手段都是负载迁移,把负载迁移到最idle或者最省power的cpu上。另外一种方式就是调整cpu的freq,从而改变cpu的curr_capacity,来满足性能和功耗的需求。
cpu的频率调整是基于3个层次的:cpufreq governor、cpufreq core、cpufreq driver。
- 1、cpufreq governor决定cpu调频的算法,计算负载、根据负载的变化来动态调整频率;
- 2、cpufreq core对通用层进行了一些封装,比如cpufreq_policy的封装;
- 3、cpufreq driver是底层操作的实现,比如freq_table的初始化、cpu target频率的配置;
如果是MTK平台,cpufreq driver除了接受governor的频率调整还需要接受ppm的频率调整,它的框图大概如下:
4.3.1、cpufreq core & cpufreq driver
cpufreq core层次最核心的就是每个cpu有一个自己的cpufreq_policy policy,放在per_cpu(cpufreq_cpu_data, cpu)变量中。实际上cpufreq_policy是一个cluster对应一个的,因为在现有的架构中,同一个cluster cpu都是同一个频率,所以同cluster中所有cpu的per_cpu(cpufreq_cpu_data, cpu)都指向同一个cpufreq_policy。
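“同一个cluster的所有cpu共享同一个cpufreq_policy”这一点,可以用一个指针数组直观理解:per_cpu(cpufreq_cpu_data, cpu)相当于每个cpu一个指针,同cluster的指针都指向同一个policy对象。下面是一个用户态的简化示意,cluster划分和频率数值均为假设:
#include <stdio.h>

/* 演示"同cluster的cpu共享同一个cpufreq_policy":
 * 用指针数组模拟per_cpu(cpufreq_cpu_data, cpu),cluster划分为假设值 */
struct demo_policy {
	unsigned int cur;          /* 当前频率(kHz) */
};

int main(void)
{
	struct demo_policy policy_ll = { .cur = 1092000 };   /* LITTLE cluster */
	struct demo_policy policy_b  = { .cur = 1989000 };   /* big cluster */
	struct demo_policy *cpufreq_cpu_data[8];

	for (int cpu = 0; cpu < 8; cpu++)     /* 假设cpu0-3为LITTLE,cpu4-7为big */
		cpufreq_cpu_data[cpu] = (cpu < 4) ? &policy_ll : &policy_b;

	/* 通过任意一个cpu修改policy,同cluster所有cpu看到的都是同一份数据 */
	cpufreq_cpu_data[5]->cur = 1700000;
	printf("cpu4 cur=%u cpu7 cur=%u\n",
	       cpufreq_cpu_data[4]->cur, cpufreq_cpu_data[7]->cur);
	return 0;
}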
4.3.1.1、cpufreq_policy policy初始化
struct cpufreq_policy {
/* CPUs sharing clock, require sw coordination */
cpumask_var_t cpus; /* Online CPUs only */
cpumask_var_t related_cpus; /* Online + Offline CPUs */
cpumask_var_t real_cpus; /* Related and present */
unsigned int shared_type; /* ACPI: ANY or ALL affected CPUs
should set cpufreq */
unsigned int cpu; /* cpu managing this policy, must be online */
struct clk *clk;
struct cpufreq_cpuinfo cpuinfo;/* see above */
unsigned int min; /* in kHz */
unsigned int max; /* in kHz */
unsigned int cur; /* in kHz, only needed if cpufreq
* governors are used */
unsigned int restore_freq; /* = policy->cur before transition */
unsigned int suspend_freq; /* freq to set during suspend */
unsigned int policy; /* see above */
unsigned int last_policy; /* policy before unplug */
struct cpufreq_governor *governor; /* see below */
void *governor_data;
bool governor_enabled; /* governor start/stop flag */
char last_governor[CPUFREQ_NAME_LEN]; /* last governor used */
struct work_struct update; /* if update_policy() needs to be
* called, but you're in IRQ context */
struct cpufreq_user_policy user_policy;
struct cpufreq_frequency_table *freq_table;
struct list_head policy_list;
struct kobject kobj;
struct completion kobj_unregister;
/*
* The rules for this semaphore:
* - Any routine that wants to read from the policy structure will
* do a down_read on this semaphore.
* - Any routine that will write to the policy structure and/or may take away
* the policy altogether (eg. CPU hotplug), will hold this lock in write
* mode before doing so.
*
* Additional rules:
* - Lock should not be held across
* __cpufreq_governor(data, CPUFREQ_GOV_POLICY_EXIT);
*/
struct rw_semaphore rwsem;
/* Synchronization for frequency transitions */
bool transition_ongoing; /* Tracks transition status */
spinlock_t transition_lock;
wait_queue_head_t transition_wait;
struct task_struct *transition_task; /* Task which is doing the transition */
/* cpufreq-stats */
struct cpufreq_stats *stats;
/* For cpufreq driver's internal use */
void *driver_data;
}
在系统初始化化的时候初始化online cpu的cpufreq_policy,cpu在hotplug online的时候也会重新初始化cpufreq_policy。
- 1、在mtk的cpufreq_driver驱动初始化函数_mt_cpufreq_pdrv_probe()中注册了_mt_cpufreq_driver:
static int _mt_cpufreq_pdrv_probe(struct platform_device *pdev)
{
/* 注册cpufreq_driver */
cpufreq_register_driver(&_mt_cpufreq_driver);
/* 注册ppm的回调 */
mt_ppm_register_client(PPM_CLIENT_DVFS, &ppm_limit_callback);
}
static struct cpufreq_driver _mt_cpufreq_driver = {
.flags = CPUFREQ_ASYNC_NOTIFICATION,
.verify = _mt_cpufreq_verify,
.target = _mt_cpufreq_target,
.init = _mt_cpufreq_init,
.exit = _mt_cpufreq_exit,
.get = _mt_cpufreq_get,
.name = "mt-cpufreq",
.attr = _mt_cpufreq_attr,
};
- 2、在驱动注册cpufreq_register_driver()过程中会初始化online cpu的cpufreq_policy:
_mt_cpufreq_pdrv_probe() -> cpufreq_register_driver() -> subsys_interface_register() -> cpufreq_add_dev() -> cpufreq_online()
↓
static int cpufreq_online(unsigned int cpu)
{
struct cpufreq_policy *policy;
bool new_policy;
unsigned long flags;
unsigned int j;
int ret;
pr_debug("%s: bringing CPU%u online\n", __func__, cpu);
/* (1) 检查per_cpu(cpufreq_cpu_data, cpu)中的cpufreq_policy,
如果为NULL,重新分配空间
*/
/* Check if this CPU already has a policy to manage it */
policy = per_cpu(cpufreq_cpu_data, cpu);
if (policy) {
WARN_ON(!cpumask_test_cpu(cpu, policy->related_cpus));
if (!policy_is_inactive(policy))
return cpufreq_add_policy_cpu(policy, cpu);
/* This is the only online CPU for the policy. Start over. */
new_policy = false;
down_write(&policy->rwsem);
policy->cpu = cpu;
policy->governor = NULL;
up_write(&policy->rwsem);
} else {
new_policy = true;
policy = cpufreq_policy_alloc(cpu);
if (!policy)
return -ENOMEM;
}
cpumask_copy(policy->cpus, cpumask_of(cpu));
/* (2) 调用cpufreq_driver的init回调(这里即_mt_cpufreq_init)来初始化cpufreq_policy,
这步比较重要,policy中与硬件相关的数据(如频率表等)由driver负责填充
*/
/* call driver. From then on the cpufreq must be able
* to accept all calls to ->verify and ->setpolicy for this CPU
*/
ret = cpufreq_driver->init(policy);
if (ret) {
pr_debug("initialization failed\n");
goto out_free_policy;
}
down_write(&policy->rwsem);
/* (3) 如果cpufreq_policy是新分配空间的,
做一些相应的初始化工作
*/
if (new_policy) {
/* related_cpus should at least include policy->cpus. */
cpumask_copy(policy->related_cpus, policy->cpus);
/* Remember CPUs present at the policy creation time. */
cpumask_and(policy->real_cpus, policy->cpus, cpu_present_mask);
/* Name and add the kobject */
ret = kobject_add(&policy->kobj, cpufreq_global_kobject,
"policy%u",
cpumask_first(policy->related_cpus));
if (ret) {
pr_err("%s: failed to add policy->kobj: %d\n", __func__,
ret);
goto out_exit_policy;
}
}
/*
* affected cpus must always be the one, which are online. We aren't
* managing offline cpus here.
*/
cpumask_and(policy->cpus, policy->cpus, cpu_online_mask);
if (new_policy) {
policy->user_policy.min = policy->min;
policy->user_policy.max = policy->max;
write_lock_irqsave(&cpufreq_driver_lock, flags);
/* (3.1) 同一个cluster中所有cpu的per_cpu(cpufreq_cpu_data, j),共享同一个cpufreq_policy */
for_each_cpu(j, policy->related_cpus)
per_cpu(cpufreq_cpu_data, j) = policy;
write_unlock_irqrestore(&cpufreq_driver_lock, flags);
}
/* (4) 获取cpufreq_policy的当前频率
*/
if (cpufreq_driver->get && !cpufreq_driver->setpolicy) {
policy->cur = cpufreq_driver->get(policy->cpu);
if (!policy->cur) {
pr_err("%s: ->get() failed\n", __func__);
goto out_exit_policy;
}
}
/*
* Sometimes boot loaders set CPU frequency to a value outside of
* frequency table present with cpufreq core. In such cases CPU might be
* unstable if it has to run on that frequency for long duration of time
* and so its better to set it to a frequency which is specified in
* freq-table. This also makes cpufreq stats inconsistent as
* cpufreq-stats would fail to register because current frequency of CPU
* isn't found in freq-table.
*
* Because we don't want this change to effect boot process badly, we go
* for the next freq which is >= policy->cur ('cur' must be set by now,
* otherwise we will end up setting freq to lowest of the table as 'cur'
* is initialized to zero).
*
* We are passing target-freq as "policy->cur - 1" otherwise
* __cpufreq_driver_target() would simply fail, as policy->cur will be
* equal to target-freq.
*/
if ((cpufreq_driver->flags & CPUFREQ_NEED_INITIAL_FREQ_CHECK)
&& has_target()) {
/* Are we running at unknown frequency ? */
ret = cpufreq_frequency_table_get_index(policy, policy->cur);
if (ret == -EINVAL) {
/* Warn user and fix it */
pr_warn("%s: CPU%d: Running at unlisted freq: %u KHz\n",
__func__, policy->cpu, policy->cur);
ret = __cpufreq_driver_target(policy, policy->cur - 1,
CPUFREQ_RELATION_L);
/*
* Reaching here after boot in a few seconds may not
* mean that system will remain stable at "unknown"
* frequency for longer duration. Hence, a BUG_ON().
*/
BUG_ON(ret);
pr_warn("%s: CPU%d: Unlisted initial frequency changed to: %u KHz\n",
__func__, policy->cpu, policy->cur);
}
}
blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
CPUFREQ_START, policy);
if (new_policy) {
ret = cpufreq_add_dev_interface(policy);
if (ret)
goto out_exit_policy;
blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
CPUFREQ_CREATE_POLICY, policy);
write_lock_irqsave(&cpufreq_driver_lock, flags);
list_add(&policy->policy_list, &cpufreq_policy_list);
write_unlock_irqrestore(&cpufreq_driver_lock, flags);
}
/* (5) 调用cpufreq governor的初始化函数,来初始化cpufreq_policy
*/
ret = cpufreq_init_policy(policy);
if (ret) {
pr_err("%s: Failed to initialize policy for cpu: %d (%d)\n",
__func__, cpu, ret);
/* cpufreq_policy_free() will notify based on this */
new_policy = false;
goto out_exit_policy;
}
up_write(&policy->rwsem);
kobject_uevent(&policy->kobj, KOBJ_ADD);
/* Callback for handling stuff after policy is ready */
if (cpufreq_driver->ready)
cpufreq_driver->ready(policy);
pr_debug("initialization complete\n");
return 0;
out_exit_policy:
up_write(&policy->rwsem);
if (cpufreq_driver->exit)
cpufreq_driver->exit(policy);
out_free_policy:
cpufreq_policy_free(policy, !new_policy);
return ret;
}
|→
static int _mt_cpufreq_init(struct cpufreq_policy *policy)
{
int ret = -EINVAL;
unsigned long flags;
FUNC_ENTER(FUNC_LV_MODULE);
policy->shared_type = CPUFREQ_SHARED_TYPE_ANY;
cpumask_setall(policy->cpus);
policy->cpuinfo.transition_latency = 1000;
{
enum mt_cpu_dvfs_id id = _get_cpu_dvfs_id(policy->cpu);
struct mt_cpu_dvfs *p = id_to_cpu_dvfs(id);
unsigned int lv = _mt_cpufreq_get_cpu_level();
struct opp_tbl_info *opp_tbl_info;
struct opp_tbl_m_info *opp_tbl_m_info;
struct opp_tbl_m_info *opp_tbl_m_cci_info;
struct mt_cpu_dvfs *p_cci;
cpufreq_ver("DVFS: _mt_cpufreq_init: %s(cpu_id = %d)\n", cpu_dvfs_get_name(p), p->cpu_id);
opp_tbl_info = &opp_tbls[id][lv];
p->cpu_level = lv;
/* (2.1) 给policy->freq_table赋值
给policy->cpus赋值
给policy->related_cpus赋值
*/
ret = _mt_cpufreq_setup_freqs_table(policy,
opp_tbl_info->opp_tbl, opp_tbl_info->size);
/* (2.2) 给policy->cpuinfo.max_freq赋值
给policy->cpuinfo.min_freq赋值
*/
policy->cpuinfo.max_freq = cpu_dvfs_get_max_freq(p);
policy->cpuinfo.min_freq = cpu_dvfs_get_min_freq(p);
opp_tbl_m_info = &opp_tbls_m[id][lv];
p->freq_tbl = opp_tbl_m_info->opp_tbl_m;
cpufreq_lock(flags);
/* Sync p */
if (_mt_cpufreq_sync_opp_tbl_idx(p) >= 0)
if (p->idx_normal_max_opp == -1)
p->idx_normal_max_opp = p->idx_opp_tbl;
/* (2.3) 给policy->cur赋值
给policy->max赋值
给policy->min赋值
*/
policy->cur = cpu_dvfs_get_cur_freq(p); /* use cur phy freq is better */
policy->max = cpu_dvfs_get_freq_by_idx(p, p->idx_opp_ppm_limit);
policy->min = cpu_dvfs_get_freq_by_idx(p, p->idx_opp_ppm_base);
p->mt_policy = policy;
p->armpll_is_available = 1;
#ifdef CONFIG_HYBRID_CPU_DVFS
if (turbo_flag && cpu_dvfs_is(p, MT_CPU_DVFS_B) && !turbo_is_inited) {
unsigned int turbo_f, turbo_v;
turbo_f = ((cpu_dvfs_get_max_freq(p) * 104 / 100) / 13) * 13 / 1000;
if (picachu_need_higher_volt(MT_PICACHU_DOMAIN2))
turbo_v = MAX_VPROC_VOLT;
else
turbo_v = MAX_VPROC_VOLT - 2000;
/* turbo_v = p->opp_tbl[0].cpufreq_volt; */
cpuhvfs_set_turbo_scale(turbo_f * 1000, turbo_v);
turbo_is_inited = 1;
}
#endif
/* Sync cci */
if (cci_is_inited == 0) {
p_cci = id_to_cpu_dvfs(MT_CPU_DVFS_CCI);
/* init cci freq idx */
if (_mt_cpufreq_sync_opp_tbl_idx(p_cci) >= 0)
if (p_cci->idx_normal_max_opp == -1)
p_cci->idx_normal_max_opp = p_cci->idx_opp_tbl;
opp_tbl_m_cci_info = &opp_tbls_m[MT_CPU_DVFS_CCI][lv];
p_cci->freq_tbl = opp_tbl_m_cci_info->opp_tbl_m;
p_cci->mt_policy = NULL;
p_cci->armpll_is_available = 1;
cci_is_inited = 1;
}
#ifdef CONFIG_HYBRID_CPU_DVFS
cpuhvfs_set_cluster_on_off(arch_get_cluster_id(p->cpu_id), 1);
#endif
cpufreq_unlock(flags);
}
if (ret)
cpufreq_err("failed to setup frequency table\n");
FUNC_EXIT(FUNC_LV_MODULE);
return ret;
}
||→
static int _mt_cpufreq_setup_freqs_table(struct cpufreq_policy *policy,
struct mt_cpu_freq_info *freqs, int num)
{
struct mt_cpu_dvfs *p;
int ret = 0;
FUNC_ENTER(FUNC_LV_LOCAL);
p = id_to_cpu_dvfs(_get_cpu_dvfs_id(policy->cpu));
#ifdef CONFIG_CPU_FREQ
ret = cpufreq_frequency_table_cpuinfo(policy, p->freq_tbl_for_cpufreq);
/* (2.1.1) 给policy->freq_table赋值
*/
if (!ret)
policy->freq_table = p->freq_tbl_for_cpufreq;
/* (2.1.2) 根据本cpu所在cluster中有哪些cpu,
给policy->cpus赋值
给policy->related_cpus赋值
*/
cpumask_copy(policy->cpus, topology_core_cpumask(policy->cpu));
cpumask_copy(policy->related_cpus, policy->cpus);
#endif
FUNC_EXIT(FUNC_LV_LOCAL);
return 0;
}
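上面(2.1.1)中policy->freq_table指向的是一张cpufreq_frequency_table频率表,它的大致形式如下(demo_freq_tbl及其中的频点为假设值,仅作示意,并非实际平台的OPP表):
#include <linux/cpufreq.h>

/* 示意:cpufreq_frequency_table的大致形式,频点为假设值 */
static struct cpufreq_frequency_table demo_freq_tbl[] = {
	{ .driver_data = 0, .frequency = 1989000 },	/* 单位kHz */
	{ .driver_data = 1, .frequency = 1690000 },
	{ .driver_data = 2, .frequency = 1248000 },
	{ .driver_data = 3, .frequency =  793000 },
	{ .frequency = CPUFREQ_TABLE_END },		/* 结束标记 */
};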
- 3、在cpufreq_online()初始化完cpufreq_policy之后,最后会调用cpufreq_init_policy()继续完成governor的初始化:
static int cpufreq_init_policy(struct cpufreq_policy *policy)
{
struct cpufreq_governor *gov = NULL;
struct cpufreq_policy new_policy;
memcpy(&new_policy, policy, sizeof(*policy));
/* (5.1) 使用last或者default的governor,
给new_policy.governor赋值
*/
/* Update governor of new_policy to the governor used before hotplug */
gov = find_governor(policy->last_governor);
if (gov)
pr_debug("Restoring governor %s for cpu %d\n",
policy->governor->name, policy->cpu);
else
gov = CPUFREQ_DEFAULT_GOVERNOR;
new_policy.governor = gov;
/* Use the default policy if there is no last_policy. */
if (cpufreq_driver->setpolicy) {
if (policy->last_policy)
new_policy.policy = policy->last_policy;
else
cpufreq_parse_governor(gov->name, &new_policy.policy,
NULL);
}
/* (5.2) 启动governor来使用cpufreq_policy */
/* set default policy */
return cpufreq_set_policy(policy, &new_policy);
}
|→
static int cpufreq_set_policy(struct cpufreq_policy *policy,
struct cpufreq_policy *new_policy)
{
struct cpufreq_governor *old_gov;
int ret;
pr_debug("setting new policy for CPU %u: %u - %u kHz\n",
new_policy->cpu, new_policy->min, new_policy->max);
memcpy(&new_policy->cpuinfo, &policy->cpuinfo, sizeof(policy->cpuinfo));
/* (5.2.1) 对policy、new_policy的一堆合法性判断 */
/*
* This check works well when we store new min/max freq attributes,
* because new_policy is a copy of policy with one field updated.
*/
if (new_policy->min > new_policy->max)
return -EINVAL;
/* verify the cpu speed can be set within this limit */
ret = cpufreq_driver->verify(new_policy);
if (ret)
return ret;
/* adjust if necessary - all reasons */
blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
CPUFREQ_ADJUST, new_policy);
/*
* verify the cpu speed can be set within this limit, which might be
* different to the first one
*/
ret = cpufreq_driver->verify(new_policy);
if (ret)
return ret;
/* notification of the new policy */
blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
CPUFREQ_NOTIFY, new_policy);
scale_freq_capacity(new_policy, NULL);
policy->min = new_policy->min;
policy->max = new_policy->max;
trace_cpu_frequency_limits(policy->max, policy->min, policy->cpu);
pr_debug("new min and max freqs are %u - %u kHz\n",
policy->min, policy->max);
if (cpufreq_driver->setpolicy) {
policy->policy = new_policy->policy;
pr_debug("setting range\n");
return cpufreq_driver->setpolicy(new_policy);
}
if (new_policy->governor == policy->governor)
goto out;
pr_debug("governor switch\n");
/* (5.2.2) 如果旧的governor在工作中,
依次调用 CPUFREQ_GOV_STOP、CPUFREQ_GOV_POLICY_EXIT停止旧的governor
*/
/* save old, working values */
old_gov = policy->governor;
/* end old governor */
if (old_gov) {
ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
if (ret) {
/* This can happen due to race with other operations */
pr_debug("%s: Failed to Stop Governor: %s (%d)\n",
__func__, old_gov->name, ret);
return ret;
}
up_write(&policy->rwsem);
ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
down_write(&policy->rwsem);
if (ret) {
pr_err("%s: Failed to Exit Governor: %s (%d)\n",
__func__, old_gov->name, ret);
return ret;
}
}
/* (5.2.3) 依次调用 CPUFREQ_GOV_POLICY_INIT、CPUFREQ_GOV_START让新的governor开工
*/
/* start new governor */
policy->governor = new_policy->governor;
ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT);
if (!ret) {
ret = __cpufreq_governor(policy, CPUFREQ_GOV_START);
if (!ret)
goto out;
up_write(&policy->rwsem);
__cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
down_write(&policy->rwsem);
}
/* new governor failed, so re-start old one */
pr_debug("starting governor %s failed\n", policy->governor->name);
if (old_gov) {
policy->governor = old_gov;
if (__cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT))
policy->governor = NULL;
else
__cpufreq_governor(policy, CPUFREQ_GOV_START);
}
return ret;
out:
pr_debug("governor: change or update limits\n");
return __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
}
||→
static int __cpufreq_governor(struct cpufreq_policy *policy,
unsigned int event)
{
int ret;
/* __cpufreq_governor()调用的各种命令,最后调用的都是governor注册的governor()回调函数 */
ret = policy->governor->governor(policy, event);
return ret;
}
- 4、以interactive governor为例,说明policy->governor->governor()对CPUFREQ_GOV_POLICY_INIT、CPUFREQ_GOV_START、CPUFREQ_GOV_STOP、CPUFREQ_GOV_POLICY_EXIT这几个命令的实现:
struct cpufreq_governor cpufreq_gov_interactive = {
.name = "interactive",
.governor = cpufreq_governor_interactive,
.max_transition_latency = 10000000,
.owner = THIS_MODULE,
};
↓
static int cpufreq_governor_interactive(struct cpufreq_policy *policy,
unsigned int event)
{
int rc;
unsigned int j;
struct cpufreq_interactive_cpuinfo *pcpu;
struct cpufreq_frequency_table *freq_table;
struct cpufreq_interactive_tunables *tunables;
unsigned long flags;
if (have_governor_per_policy())
tunables = policy->governor_data;
else
tunables = common_tunables;
WARN_ON(!tunables && (event != CPUFREQ_GOV_POLICY_INIT));
switch (event) {
/* (1) CPUFREQ_GOV_POLICY_INIT命令的实现:
初始化tunables,tunables是interactive governor在计算时使用的各种参数
相关的sysfs注册
*/
case CPUFREQ_GOV_POLICY_INIT:
if (have_governor_per_policy()) {
WARN_ON(tunables);
} else if (tunables) {
tunables->usage_count++;
policy->governor_data = tunables;
return 0;
}
tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
if (!tunables) {
pr_err("%s: POLICY_INIT: kzalloc failed\n", __func__);
return -ENOMEM;
}
tunables->usage_count = 1;
tunables->above_hispeed_delay = default_above_hispeed_delay;
tunables->nabove_hispeed_delay =
ARRAY_SIZE(default_above_hispeed_delay);
tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD;
tunables->target_loads = default_target_loads;
tunables->ntarget_loads = ARRAY_SIZE(default_target_loads);
tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME;
tunables->timer_rate = DEFAULT_TIMER_RATE;
tunables->boostpulse_duration_val = DEFAULT_MIN_SAMPLE_TIME;
tunables->timer_slack_val = DEFAULT_TIMER_SLACK;
spin_lock_init(&tunables->target_loads_lock);
spin_lock_init(&tunables->above_hispeed_delay_lock);
policy->governor_data = tunables;
if (!have_governor_per_policy()) {
common_tunables = tunables;
}
rc = sysfs_create_group(get_governor_parent_kobj(policy),
get_sysfs_attr());
if (rc) {
kfree(tunables);
policy->governor_data = NULL;
if (!have_governor_per_policy()) {
common_tunables = NULL;
}
return rc;
}
if (!policy->governor->initialized) {
idle_notifier_register(&cpufreq_interactive_idle_nb);
cpufreq_register_notifier(&cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
}
break;
/* (2) CPUFREQ_GOV_POLICY_EXIT命令的实现:
remove相关的sysfs
*/
case CPUFREQ_GOV_POLICY_EXIT:
if (!--tunables->usage_count) {
if (policy->governor->initialized == 1) {
cpufreq_unregister_notifier(&cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
idle_notifier_unregister(&cpufreq_interactive_idle_nb);
}
#ifdef CONFIG_MEIZU_BSP
}
#else
sysfs_remove_group(get_governor_parent_kobj(policy),
get_sysfs_attr());
kfree(tunables);
common_tunables = NULL;
}
policy->governor_data = NULL;
#endif //CONFIG_MEIZU_BSP
break;
/* (3) CPUFREQ_GOV_START命令的实现:
因为同一个cluster中的多个cpu是共享一个cpufreq_policy的,
所以使用同一个cpufreq_policy来初始化cluster中多个online cpu的per_cpu(cpuinfo, j)变量:
pcpu->target_freq // 当前频率
pcpu->freq_table // 频率表
并且启动cpu上的interactive_timer=pcpu->cpu_timer:
cpufreq_interactive_timer_start(tunables, j);
*/
case CPUFREQ_GOV_START:
mutex_lock(&gov_lock);
freq_table = cpufreq_frequency_get_table(policy->cpu);
if (tunables && !tunables->hispeed_freq)
tunables->hispeed_freq = policy->max;
for_each_cpu(j, policy->cpus) {
pcpu = &per_cpu(cpuinfo, j);
pcpu->policy = policy;
pcpu->target_freq = policy->cur;
pcpu->freq_table = freq_table;
pcpu->floor_freq = pcpu->target_freq;
pcpu->pol_floor_val_time =
ktime_to_us(ktime_get());
pcpu->loc_floor_val_time = pcpu->pol_floor_val_time;
pcpu->pol_hispeed_val_time = pcpu->pol_floor_val_time;
pcpu->loc_hispeed_val_time = pcpu->pol_floor_val_time;
down_write(&pcpu->enable_sem);
del_timer_sync(&pcpu->cpu_timer);
del_timer_sync(&pcpu->cpu_slack_timer);
cpufreq_interactive_timer_start(tunables, j);
pcpu->governor_enabled = 1;
up_write(&pcpu->enable_sem);
}
mutex_unlock(&gov_lock);
break;
/* (4) CPUFREQ_GOV_STOP命令的实现:
如果同一个cluster中的多个cpu都已经offline,停掉对应的governor:
停掉cpu上的interactive_timer=pcpu->cpu_timer
*/
case CPUFREQ_GOV_STOP:
mutex_lock(&gov_lock);
for_each_cpu(j, policy->cpus) {
pcpu = &per_cpu(cpuinfo, j);
down_write(&pcpu->enable_sem);
pcpu->governor_enabled = 0;
del_timer_sync(&pcpu->cpu_timer);
del_timer_sync(&pcpu->cpu_slack_timer);
up_write(&pcpu->enable_sem);
}
mutex_unlock(&gov_lock);
break;
case CPUFREQ_GOV_LIMITS:
if (policy->max < policy->cur)
__cpufreq_driver_target(policy,
policy->max, CPUFREQ_RELATION_H);
else if (policy->min > policy->cur)
__cpufreq_driver_target(policy,
policy->min, CPUFREQ_RELATION_L);
for_each_cpu(j, policy->cpus) {
pcpu = &per_cpu(cpuinfo, j);
down_read(&pcpu->enable_sem);
if (pcpu->governor_enabled == 0) {
up_read(&pcpu->enable_sem);
continue;
}
spin_lock_irqsave(&pcpu->target_freq_lock, flags);
if (policy->max < pcpu->target_freq)
pcpu->target_freq = policy->max;
else if (policy->min > pcpu->target_freq)
pcpu->target_freq = policy->min;
spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
up_read(&pcpu->enable_sem);
}
break;
}
return 0;
}
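把上面几条命令串起来,cpufreq core在cpufreq_set_policy()中切换governor时,对这些回调的调用次序大致如下(示意代码,demo_governor_switch_sequence为举例用的函数名,实际逻辑在上面的cpufreq_set_policy()中):
/* 示意:governor切换时各命令的调用次序,对应上面(5.2.2)/(5.2.3)的注释 */
static void demo_governor_switch_sequence(struct cpufreq_policy *policy,
					  struct cpufreq_governor *new_gov)
{
	__cpufreq_governor(policy, CPUFREQ_GOV_STOP);		/* 停掉旧governor(如interactive的timer) */
	__cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);	/* 释放旧governor的tunables/sysfs */

	policy->governor = new_gov;
	__cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT);	/* 初始化新governor */
	__cpufreq_governor(policy, CPUFREQ_GOV_START);		/* 启动新governor */
	__cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);		/* 通知当前的min/max限制 */
}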
4.3.1.2、cpufreq的频率配置
cpufreq一个重要的作用就是能把用户需要的cpu频率配置下去,这部分的代码也需要cpufreq core和cpufreq driver的配合。频率调整也叫DVFS(Dynamic Voltage and Frequency Scaling),需要按照对应关系把电压和频率一起配置下去。
具体的代码解析如下:
int __cpufreq_driver_target(struct cpufreq_policy *policy,
unsigned int target_freq,
unsigned int relation)
{
unsigned int old_target_freq = target_freq;
int retval = -EINVAL;
if (cpufreq_disabled())
return -ENODEV;
/* (1) target目标频率在policy中的合法性检测 */
/* Make sure that target_freq is within supported range */
if (target_freq > policy->max)
target_freq = policy->max;
if (target_freq < policy->min)
target_freq = policy->min;
pr_debug("target for CPU %u: %u kHz, relation %u, requested %u kHz\n",
policy->cpu, target_freq, relation, old_target_freq);
/* (2) 如果当前频率就是target频率,不用调整直接返回 */
/*
* This might look like a redundant call as we are checking it again
* after finding index. But it is left intentionally for cases where
* exactly same freq is called again and so we can save on few function
* calls.
*/
if (target_freq == policy->cur)
return 0;
/* Save last value to restore later on errors */
policy->restore_freq = policy->cur;
if (cpufreq_driver->target)
/* (3) 调用实际的驱动target()函数来调整cpu频率 */
retval = cpufreq_driver->target(policy, target_freq, relation);
else if (cpufreq_driver->target_index) {
struct cpufreq_frequency_table *freq_table;
int index;
freq_table = cpufreq_frequency_get_table(policy->cpu);
if (unlikely(!freq_table)) {
pr_err("%s: Unable to find freq_table\n", __func__);
goto out;
}
retval = cpufreq_frequency_table_target(policy, freq_table,
target_freq, relation, &index);
if (unlikely(retval)) {
pr_err("%s: Unable to find matching freq\n", __func__);
goto out;
}
if (freq_table[index].frequency == policy->cur) {
retval = 0;
goto out;
}
retval = __target_index(policy, freq_table, index);
}
out:
return retval;
}
|→
static int _mt_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq,
unsigned int relation)
{
struct mt_cpu_dvfs *p;
int ret;
unsigned int new_opp_idx;
p = id_to_cpu_dvfs(_get_cpu_dvfs_id(policy->cpu));
if (!p)
return -EINVAL;
/* (3.1) 驱动根据频率电压表,配置target频率和对应电压 */
ret = cpufreq_frequency_table_target(policy, p->freq_tbl_for_cpufreq,
target_freq, relation, &new_opp_idx);
if (ret || new_opp_idx >= p->nr_opp_tbl)
return -EINVAL;
if (dvfs_disable_flag || p->dvfs_disable_by_suspend || p->dvfs_disable_by_procfs)
return -EPERM;
_mt_cpufreq_dvfs_request_wrapper(p, new_opp_idx, MT_CPU_DVFS_NORMAL, NULL);
return 0;
}
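结合上面的调频入口,内核中其他模块(包括governor)发起一次调频请求的典型写法类似下面的示意代码(demo_request_freq为举例用的函数名):
#include <linux/cpufreq.h>

/* 示意:请求把policy调整到target_khz(仅为示意代码) */
static int demo_request_freq(struct cpufreq_policy *policy, unsigned int target_khz)
{
	/* cpufreq_driver_target()是__cpufreq_driver_target()的加锁封装:
	 * 内部先持有policy->rwsem写锁,再走上面分析的调频流程
	 */
	return cpufreq_driver_target(policy, target_khz, CPUFREQ_RELATION_L);
}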
4.3.2、interactive governor
在所有的cpufreq governor中最有名气的就是interactive governor了,因为几乎所有的Android系统都在使用它。
interactive的思想就是使用cpu的负载来调整cpu频率,核心就是:使用一个20ms的定时器来计算cpu占用率,根据cpu占用率的不同threshold来调整不同档位的频率。
interactive的负载计算和调频的整个流程大概如下:
- 1、计算cpu的累加负载。每20ms采样一次,每次采样统计增加的active_time和当前频率的乘积:cputime_speedadj += active_time * cur_freq;
- 2、计算cpu的占用率。当前cpu占用率 = (累加负载*100)/(累加时间*当前频率),即 cpu_load = (cputime_speedadj*100)/(delta_time*cur_freq);
- 3、如果cpu_load达到高门限go_hispeed_load(99%)或者发生boost,直接调节频率到hispeed_freq(最高频率);
- 4、其他情况下使用choose_freq()公式计算新频率:new_freq = cur_freq*(cpu_load/DEFAULT_TARGET_LOAD(90));new_freq = cpufreq_frequency_table_target(new_freq, CPUFREQ_RELATION_L);
- 5、如果当前频率已经达到hispeed_freq,还需要往上调整,必须在之前的频率上保持above_hispeed_delay(20ms);如果当前频率已经达到hispeed_freq,还需要往下调整,必须在之前的频率上保持min_sample_time(80ms);
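把上面第1~4步的公式串起来,可以得到下面这段演示用的计算代码(demo_interactive_calc为举例用的函数,做了简化,忽略了已处于hispeed_freq以上时的分支,并非内核源码):
#include <linux/types.h>
#include <asm/div64.h>

/* 示意:interactive的占用率计算和选频公式(演示用,简化版) */
static unsigned int demo_interactive_calc(u64 cputime_speedadj,		/* 累加负载:Σ(active_time*cur_freq) */
					  unsigned int delta_time,	/* 累加时间:active_time+idle_time */
					  unsigned int cur_freq,
					  unsigned int hispeed_freq)
{
	unsigned int loadadjfreq, cpu_load, new_freq;

	do_div(cputime_speedadj, delta_time);			/* 平均频率 = 累加负载/累加时间 */
	loadadjfreq = (unsigned int)cputime_speedadj * 100;
	cpu_load = loadadjfreq / cur_freq;			/* cpu占用率(%) */

	if (cpu_load >= 99 && cur_freq < hispeed_freq)		/* go_hispeed_load = 99 */
		new_freq = hispeed_freq;
	else
		new_freq = loadadjfreq / 90;			/* = cur_freq*cpu_load/90,之后再按RELATION_L选档 */

	return new_freq;
}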
interactive governor从原理上看,有以下问题:
- 1、20ms的采样时间过长,负载变化到频率调整的反应时间过长;
- 2、负载累加计算有问题,历史负载没有老化机制,历史负载的权重和当前一样,造成当前的负载变化不真实;
- 3、计算cpu占用率=总历史负载/(总时间*当前频率),算法不合理,历史负载对当前的影响太大。如果之前是高频率,现在变成低频率,那么cpu_load计算出来的值可能超过100%;如果之前是低频率,现在是高频率,那么cpu_load计算出来的值也会被大大拉低;
- 4、choose_freq()的计算公式有重大漏洞。比如我们cpu频率表={800M, 900M},当前cur_freq=800m cur_load=100%,那么newfreq = (cur_freq*cur_load)/90 = 889M,使用CPUFREQ_RELATION_L选择档位,选择到还是800M根本不能向高档位前进。这是算法的一个漏洞,如果cpu不同档位的频率差值大于(100/90),那么正常往上调频是调不上去的,会被CPUFREQ_RELATION_L参数拦下来。所以实际的interactive调频,都是使用go_hispeed_load(99%)调到最高值的,再使用choose_freq()来降频。
所以interactive governor后来逐渐被基于调度器负载信息的schedutil governor所取代。
4.3.2.1、interactive governor的初始化
- 1、interactive的一部分初始化在cpufreq_interactive_init()当中:
static int __init cpufreq_interactive_init(void)
{
unsigned int i;
struct cpufreq_interactive_cpuinfo *pcpu;
struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
/* (1) 初始化percpu变量per_cpu(cpuinfo, i):
每个cpu创建负载计算定时器pcpu->cpu_timer
其他的锁
*/
/* Initalize per-cpu timers */
for_each_possible_cpu(i) {
pcpu = &per_cpu(cpuinfo, i);
init_timer_deferrable(&pcpu->cpu_timer);
pcpu->cpu_timer.function = cpufreq_interactive_timer;
pcpu->cpu_timer.data = i;
init_timer(&pcpu->cpu_slack_timer);
pcpu->cpu_slack_timer.function = cpufreq_interactive_nop_timer;
spin_lock_init(&pcpu->load_lock);
spin_lock_init(&pcpu->target_freq_lock);
init_rwsem(&pcpu->enable_sem);
}
spin_lock_init(&speedchange_cpumask_lock);
mutex_init(&gov_lock);
/* (2) 创建频率调整进程speedchange_task,
把耗时的频率调整工作单独放到一个进程中去做
*/
speedchange_task =
kthread_create(cpufreq_interactive_speedchange_task, NULL,
"cfinteractive");
if (IS_ERR(speedchange_task))
return PTR_ERR(speedchange_task);
sched_setscheduler_nocheck(speedchange_task, SCHED_FIFO, ¶m);
get_task_struct(speedchange_task);
/* NB: wake up so the thread does not look hung to the freezer */
wake_up_process(speedchange_task);
return cpufreq_register_governor(&cpufreq_gov_interactive);
}
- 2、interactive另一部分初始化在cpufreq_governor_interactive()中的CPUFREQ_GOV_POLICY_INIT、CPUFREQ_GOV_START命令,在cpu online时执行:
static int cpufreq_governor_interactive(struct cpufreq_policy *policy,
unsigned int event)
{
switch (event) {
/* (1) CPUFREQ_GOV_POLICY_INIT命令初始化interactive governor最核心的参数
*/
case CPUFREQ_GOV_POLICY_INIT:
if (have_governor_per_policy()) {
WARN_ON(tunables);
} else if (tunables) {
tunables->usage_count++;
policy->governor_data = tunables;
return 0;
}
tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
if (!tunables) {
pr_err("%s: POLICY_INIT: kzalloc failed\n", __func__);
return -ENOMEM;
}
tunables->usage_count = 1;
tunables->above_hispeed_delay = default_above_hispeed_delay;
tunables->nabove_hispeed_delay =
ARRAY_SIZE(default_above_hispeed_delay);
tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD;
tunables->target_loads = default_target_loads;
tunables->ntarget_loads = ARRAY_SIZE(default_target_loads);
tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME;
tunables->timer_rate = DEFAULT_TIMER_RATE; // interactive负载计算timer默认时间为20ms
tunables->boostpulse_duration_val = DEFAULT_MIN_SAMPLE_TIME;
tunables->timer_slack_val = DEFAULT_TIMER_SLACK;
spin_lock_init(&tunables->target_loads_lock);
spin_lock_init(&tunables->above_hispeed_delay_lock);
policy->governor_data = tunables;
if (!have_governor_per_policy()) {
common_tunables = tunables;
}
rc = sysfs_create_group(get_governor_parent_kobj(policy),
get_sysfs_attr());
if (rc) {
kfree(tunables);
policy->governor_data = NULL;
if (!have_governor_per_policy()) {
common_tunables = NULL;
}
return rc;
}
if (!policy->governor->initialized) {
idle_notifier_register(&cpufreq_interactive_idle_nb);
cpufreq_register_notifier(&cpufreq_notifier_block,
CPUFREQ_TRANSITION_NOTIFIER);
}
break;
/* (2) CPUFREQ_GOV_START命令启动interactive负载计算的timer
*/
case CPUFREQ_GOV_START:
mutex_lock(&gov_lock);
freq_table = cpufreq_frequency_get_table(policy->cpu);
if (tunables && !tunables->hispeed_freq)
tunables->hispeed_freq = policy->max;
for_each_cpu(j, policy->cpus) {
pcpu = &per_cpu(cpuinfo, j);
pcpu->policy = policy;
pcpu->target_freq = policy->cur;
pcpu->freq_table = freq_table;
pcpu->floor_freq = pcpu->target_freq;
pcpu->pol_floor_val_time =
ktime_to_us(ktime_get());
pcpu->loc_floor_val_time = pcpu->pol_floor_val_time;
pcpu->pol_hispeed_val_time = pcpu->pol_floor_val_time;
pcpu->loc_hispeed_val_time = pcpu->pol_floor_val_time;
down_write(&pcpu->enable_sem);
del_timer_sync(&pcpu->cpu_timer);
del_timer_sync(&pcpu->cpu_slack_timer);
cpufreq_interactive_timer_start(tunables, j);
pcpu->governor_enabled = 1;
up_write(&pcpu->enable_sem);
}
mutex_unlock(&gov_lock);
break;
}
return 0;
}
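对照上面CPUFREQ_GOV_POLICY_INIT中的赋值,interactive几个核心tunables的默认取值和含义可以概括如下(demo_interactive_tunables只是用来说明的示意结构,数值对应DEFAULT_*宏的默认值):
/* 示意:interactive核心tunables的默认值及含义(仅用于说明) */
struct demo_interactive_tunables {
	unsigned int timer_rate;		/* 20000us:负载采样周期20ms */
	unsigned int go_hispeed_load;		/* 99:cpu_load达到99%直接跳到hispeed_freq */
	unsigned int hispeed_freq;		/* 默认取policy->max */
	unsigned int above_hispeed_delay;	/* 20000us:超过hispeed_freq后再升频需等20ms */
	unsigned int min_sample_time;		/* 80000us:降频前需在当前频率维持80ms */
	unsigned int target_load;		/* 90:choose_freq()的目标负载 */
};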
4.3.2.2、interactive governor的算法
interactive governor的核心算法在20ms周期的定时器处理函数cpufreq_interactive_timer()中:
static void cpufreq_interactive_timer(unsigned long data)
{
u64 now;
unsigned int delta_time;
u64 cputime_speedadj;
int cpu_load;
struct cpufreq_interactive_cpuinfo *pcpu =
&per_cpu(cpuinfo, data);
struct cpufreq_interactive_tunables *tunables =
pcpu->policy->governor_data;
unsigned int new_freq;
unsigned int loadadjfreq;
unsigned int index;
unsigned long flags;
u64 max_fvtime;
int j;
unsigned int max_t_freq = 0;
#ifdef CPUDVFS_POWER_MODE
/* default(normal), low power, just make, performance(sports) */
int min_sample_t[4] = { 80, 20, 20, 80 };
int ppb_idx;
#endif
if (!down_read_trylock(&pcpu->enable_sem))
return;
if (!pcpu->governor_enabled)
goto exit;
spin_lock_irqsave(&pcpu->load_lock, flags);
/* (1) 累加cpu上自从cpu_up()以来的负载:
pcpu->cputime_speedadj += active_time * pcpu->policy->cur;
即 pcpu->cputime_speedadj = (active_time * cur_freq)[第1次采样] + ... + (active_time * cur_freq)[第n次采样];
每个采样周期为20ms,累加:第1个20ms中的active_time*cur_cpu_freq + 第2个20ms中的active_time*cur_cpu_freq + ... + 第n个20ms中的active_time*cur_cpu_freq
*/
now = update_load(data);
/* (2) 自从cpu_up()以来的总的时间
delta_time = active_time + ilde_time
*/
delta_time = (unsigned int)(now - pcpu->cputime_speedadj_timestamp);
cputime_speedadj = pcpu->cputime_speedadj;
spin_unlock_irqrestore(&pcpu->load_lock, flags);
if (WARN_ON_ONCE(!delta_time))
goto rearm;
spin_lock_irqsave(&pcpu->target_freq_lock, flags);
/* (3) 总的负载/总时间 = 平均频率 */
do_div(cputime_speedadj, delta_time);
/* (4) (平均频率 * 100)/当前频率 = 当前cpu的占用率
*/
loadadjfreq = (unsigned int)cputime_speedadj * 100;
cpu_load = loadadjfreq / pcpu->policy->cur;
tunables->boosted = tunables->boost_val || now < tunables->boostpulse_endtime;
#ifdef CPUDVFS_POWER_MODE
ppb_idx = mt_cpufreq_get_ppb_state();
{
unsigned int idx = mt_cpufreq_ppb_hispeed_freq(data, ppb_idx);
tunables->hispeed_freq = pcpu->freq_table[idx].frequency;
tunables->min_sample_time = min_sample_t[ppb_idx] * USEC_PER_MSEC;
if (hispeed_freq_perf != 0)
tunables->hispeed_freq = hispeed_freq_perf;
if (min_sample_time_perf != 0)
tunables->min_sample_time = min_sample_time_perf;
}
#endif
/* (5) 如果cpu占用率达到go_hispeed_load(99%),或者在boost状态,
频率直接调整到最高频率hispeed_freq
*/
if (cpu_load >= tunables->go_hispeed_load || tunables->boosted) {
if (pcpu->policy->cur < tunables->hispeed_freq) {
new_freq = tunables->hispeed_freq;
} else {
new_freq = choose_freq(pcpu, loadadjfreq);
if (new_freq < tunables->hispeed_freq)
new_freq = tunables->hispeed_freq;
}
/* (6) 否则使用choose_freq()根据当前负载来计算对应的频率
*/
} else {
new_freq = choose_freq(pcpu, loadadjfreq);
if (new_freq > tunables->hispeed_freq &&
pcpu->policy->cur < tunables->hispeed_freq)
new_freq = tunables->hispeed_freq;
}
/* (7) 如果计算出的新频率 > hispeed_freq,不能马上调整,
在hispeed_freq以上的频率上必须待满above_hispeed_delay(20ms),才能继续往上调整频率
*/
if (pcpu->policy->cur >= tunables->hispeed_freq &&
new_freq > pcpu->policy->cur &&
now - pcpu->pol_hispeed_val_time <
freq_to_above_hispeed_delay(tunables, pcpu->policy->cur)) {
trace_cpufreq_interactive_notyet(
data, cpu_load, pcpu->target_freq,
pcpu->policy->cur, new_freq);
spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
goto rearm;
}
pcpu->loc_hispeed_val_time = now;
if (cpufreq_frequency_table_target(pcpu->policy, pcpu->freq_table,
new_freq, CPUFREQ_RELATION_L,
&index)) {
spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
goto rearm;
}
new_freq = pcpu->freq_table[index].frequency;
/* (8) 如果之前的频率 > hispeed_freq,或者发生boost
现在需要往低调频,之前的频率需要待满min_sample_time(80ms)
*/
/*
* Do not scale below floor_freq unless we have been at or above the
* floor frequency for the minimum sample time since last validated.
*/
max_fvtime = max(pcpu->pol_floor_val_time, pcpu->loc_floor_val_time);
if (new_freq < pcpu->floor_freq &&
pcpu->target_freq >= pcpu->policy->cur) {
if (now - max_fvtime < tunables->min_sample_time) {
trace_cpufreq_interactive_notyet(
data, cpu_load, pcpu->target_freq,
pcpu->policy->cur, new_freq);
spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
goto rearm;
}
}
/*
* Update the timestamp for checking whether speed has been held at
* or above the selected frequency for a minimum of min_sample_time,
* if not boosted to hispeed_freq. If boosted to hispeed_freq then we
* allow the speed to drop as soon as the boostpulse duration expires
* (or the indefinite boost is turned off).
*/
if (!tunables->boosted || new_freq > tunables->hispeed_freq) {
pcpu->floor_freq = new_freq;
if (pcpu->target_freq >= pcpu->policy->cur ||
new_freq >= pcpu->policy->cur)
pcpu->loc_floor_val_time = now;
}
/* (9) 如果当前cpu往低调整频率,判断当前policy是否需要更新,
因为多个cpu共享一个policy,取最大期望频率cpu的值作为整个policy的调整值
*/
if (pcpu->target_freq == new_freq &&
pcpu->target_freq <= pcpu->policy->cur) {
max_t_freq = 0;
for_each_cpu(j, pcpu->policy->cpus) {
struct cpufreq_interactive_cpuinfo *pjcpu;
pjcpu = &per_cpu(cpuinfo, j);
max_t_freq = max(max_t_freq, pjcpu->target_freq);
}
if (max_t_freq != pcpu->policy->cur)
goto pass_t;
trace_cpufreq_interactive_already(
data, cpu_load, pcpu->target_freq,
pcpu->policy->cur, new_freq);
spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
goto rearm;
}
pass_t:
trace_cpufreq_interactive_target(data, cpu_load, pcpu->target_freq,
pcpu->policy->cur, new_freq);
/* (10) 如果policy需要更新唤醒speedchange_task来执行调频动作 */
pcpu->target_freq = new_freq;
spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
spin_lock_irqsave(&speedchange_cpumask_lock, flags);
cpumask_set_cpu(data, &speedchange_cpumask);
spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);
wake_up_process(speedchange_task);
rearm:
if (!timer_pending(&pcpu->cpu_timer))
cpufreq_interactive_timer_resched(pcpu);
exit:
up_read(&pcpu->enable_sem);
return;
}
|→
static unsigned int choose_freq(struct cpufreq_interactive_cpuinfo *pcpu,
unsigned int loadadjfreq)
{
unsigned int freq = pcpu->policy->cur;
unsigned int prevfreq, freqmin, freqmax;
unsigned int tl;
int index;
freqmin = 0;
freqmax = UINT_MAX;
do {
prevfreq = freq;
/* (6.1) tl = 90,loadadjfreq = (平均频率 * 100)
即 newfreq = (平均频率 * 100) / 90
再用cpufreq_frequency_table_target(CPUFREQ_RELATION_L)选档,
相当于newfreq向低档位取整。
注意:这里带来一个非常严重的问题,如果相邻档位之间的比值大于100/90,向上调频将调不上去
*/
tl = freq_to_targetload(pcpu->policy->governor_data, freq);
/*
* Find the lowest frequency where the computed load is less
* than or equal to the target load.
*/
if (cpufreq_frequency_table_target(
pcpu->policy, pcpu->freq_table, loadadjfreq / tl,
CPUFREQ_RELATION_L, &index))
break;
freq = pcpu->freq_table[index].frequency;
if (freq > prevfreq) {
/* The previous frequency is too low. */
freqmin = prevfreq;
if (freq >= freqmax) {
/*
* Find the highest frequency that is less
* than freqmax.
*/
if (cpufreq_frequency_table_target(
pcpu->policy, pcpu->freq_table,
freqmax - 1, CPUFREQ_RELATION_H,
&index))
break;
freq = pcpu->freq_table[index].frequency;
if (freq == freqmin) {
/*
* The first frequency below freqmax
* has already been found to be too
* low. freqmax is the lowest speed
* we found that is fast enough.
*/
freq = freqmax;
break;
}
}
} else if (freq < prevfreq) {
/* The previous frequency is high enough. */
freqmax = prevfreq;
if (freq <= freqmin) {
/*
* Find the lowest frequency that is higher
* than freqmin.
*/
if (cpufreq_frequency_table_target(
pcpu->policy, pcpu->freq_table,
freqmin + 1, CPUFREQ_RELATION_L,
&index))
break;
freq = pcpu->freq_table[index].frequency;
/*
* If freqmax is the first frequency above
* freqmin then we have already found that
* this speed is fast enough.
*/
if (freq == freqmax)
break;
}
}
/* If same frequency chosen as previous then done. */
} while (freq != prevfreq);
return freq;
}
4.4、cpu hotplug调整
还有一种调节负载的方式是cpu hotplug:
- 1、cpu被hotplug掉的功耗小于cpu进入idle的功耗;如果整个cluster的cpu都offline,cluster也可以poweroff;所以hotplug能够节省功耗;
- 2、但是hotplug是有开销的:hotplug动作比较慢,耗时甚至达到ms级别,另外进程的迁移也是有开销的;cpu的hotplug必须遵循顺序插拔的规则,如果先拔掉负载重的cpu也是不合理的;
- 3、MTK的技术限制必须使用hotplug:MTK平台只有在剩一个online cpu的情况下才能进入深度idle模式,所以MTK平台必须支持hotplug;而samsung、qualcomm在多核online的情况下可以进入深度idle,所以一般不支持cpu hotplug;
4.4.1、hotplug 底层实现
4.4.1.1、cpu_up()/cpu_down()
kernel对hotplug的支持是很完善的,标准接口cpu_up()/cpu_down()可以进行hotplug。
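内核态直接使用这两个接口的方式大致如下(demo_plug_cpu为举例用的函数,假设CONFIG_HOTPLUG_CPU已打开);用户态也可以通过/sys/devices/system/cpu/cpuX/online节点触发同样的流程:
#include <linux/cpu.h>
#include <linux/types.h>

/* 示意:内核态调用hotplug标准接口(仅为示意代码) */
static int demo_plug_cpu(unsigned int cpu, bool online)
{
	int ret;

	if (online)
		ret = cpu_up(cpu);	/* 上线:走CPU_UP_PREPARE/CPU_ONLINE等notifier */
	else
		ret = cpu_down(cpu);	/* 下线:过程中会调用migrate_tasks()迁走runnable进程 */

	return ret;
}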
4.4.1.2、hotplug 进程迁移
在cpu_down()时,需要调用migration_call() -> migrate_tasks()把cpu上所有runnable进程迁移到其他cpu;在cpu_up()时,并不需要在函数中迁移进程,直接等待负载均衡算法的迁移。
static void migrate_tasks(struct rq *dead_rq)
{
struct rq *rq = dead_rq;
struct task_struct *next, *stop = rq->stop;
int dest_cpu;
/*
* Fudge the rq selection such that the below task selection loop
* doesn't get stuck on the currently eligible stop task.
*
* We're currently inside stop_machine() and the rq is either stuck
* in the stop_machine_cpu_stop() loop, or we're executing this code,
* either way we should never end up calling schedule() until we're
* done here.
*/
rq->stop = NULL;
/*
* put_prev_task() and pick_next_task() sched
* class method both need to have an up-to-date
* value of rq->clock[_task]
*/
update_rq_clock(rq);
unthrottle_offline_rt_rqs(rq);
for (;;) {
/*
* There's this thread running, bail when that's the only
* remaining thread.
*/
if (rq->nr_running == 1)
break;
/* (1) 逐个从rq中获取task = next */
/*
* pick_next_task assumes pinned rq->lock.
*/
lockdep_pin_lock(&rq->lock);
next = pick_next_task(rq, &fake_task);
BUG_ON(!next);
next->sched_class->put_prev_task(rq, next);
/*
* Rules for changing task_struct::cpus_allowed are holding
* both pi_lock and rq->lock, such that holding either
* stabilizes the mask.
*
* Drop rq->lock is not quite as disastrous as it usually is
* because !cpu_active at this point, which means load-balance
* will not interfere. Also, stop-machine.
*/
lockdep_unpin_lock(&rq->lock);
raw_spin_unlock(&rq->lock);
raw_spin_lock(&next->pi_lock);
raw_spin_lock(&rq->lock);
/*
* Since we're inside stop-machine, _nothing_ should have
* changed the task, WARN if weird stuff happened, because in
* that case the above rq->lock drop is a fail too.
*/
if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
raw_spin_unlock(&next->pi_lock);
continue;
}
/* (2) 找到最适合next进程迁移的目的cpu */
/* Find suitable destination for @next, with force if needed. */
dest_cpu = select_fallback_rq(dead_rq->cpu, next);
/* (3) 实施进程迁移 */
rq = __migrate_task(rq, next, dest_cpu);
if (rq != dead_rq) {
raw_spin_unlock(&rq->lock);
rq = dead_rq;
raw_spin_lock(&rq->lock);
}
raw_spin_unlock(&next->pi_lock);
}
rq->stop = stop;
}
|→
static int select_fallback_rq(int cpu, struct task_struct *p)
{
int nid = cpu_to_node(cpu);
const struct cpumask *nodemask = NULL;
enum { cpuset, possible, fail } state = cpuset;
int dest_cpu;
/*
* If the node that the cpu is on has been offlined, cpu_to_node()
* will return -1. There is no cpu on the node, and we should
* select the cpu on the other node.
*/
if (nid != -1) {
nodemask = cpumask_of_node(nid);
/* Look for allowed, online CPU in same node. */
for_each_cpu(dest_cpu, nodemask) {
if (!cpu_online(dest_cpu))
continue;
if (!cpu_active(dest_cpu))
continue;
if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
return dest_cpu;
}
}
for (;;) {
/* (2.1) 最好的情况:在tsk_cpus_allowed(p)中能找到online cpu迁移 */
/* Any allowed, online CPU? */
for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
if (!cpu_online(dest_cpu))
continue;
if (!cpu_active(dest_cpu))
continue;
goto out;
}
/* No more Mr. Nice Guy. */
switch (state) {
/* (2.2) 其次的情况:在cpuset中能找到online cpu迁移 */
case cpuset:
if (IS_ENABLED(CONFIG_CPUSETS)) {
cpuset_cpus_allowed_fallback(p);
state = possible;
break;
}
/* (2.3) 最差的情况:在系统所有cpu中能找到online cpu迁移 */
/* fall-through */
case possible:
do_set_cpus_allowed(p, cpu_possible_mask);
state = fail;
break;
case fail:
BUG();
break;
}
}
out:
if (state != cpuset) {
/*
* Don't tell them about moving exiting tasks or
* kernel threads (both mm NULL), since they never
* leave kernel.
*/
if (p->mm && printk_ratelimit()) {
printk_deferred("process %d (%s) no longer affine to cpu%d\n",
task_pid_nr(p), p->comm, cpu);
}
}
return dest_cpu;
}
4.4.2、MTK hotplug算法
在有了hotplug底层的cpu_up()、cpu_down()实现以后,在此之上还需要有一套算法根据cpu的负载来动态hotplug。MTK这套算法比较齐全,主要分为HICA、hps_algo_main两部分。
4.4.2.1、HICA/PPM
HICA和hps的关系,其实是HICA决定了一种大的mode,而hps在大的mode中实现精细化的调整。
比如MT6799的HICA支持3种模式:
- 1、LL_ONLY。 // 只开小核
- 2、L_ONLY。 // 只开中核
- 3、ALL。 // LL、L、B三个cluster的10个核都可以使用
HICA在mt_ppm_hica_update_algo_data()中计算负载,根据负载变化来决定mode:
_hps_task_main() -> mt_ppm_hica_update_algo_data()
↓
void mt_ppm_hica_update_algo_data(unsigned int cur_loads,
unsigned int cur_nr_heavy_task, unsigned int cur_tlp)
{
struct ppm_power_state_data *state_info = ppm_get_power_state_info();
struct ppm_state_transfer_data *data;
enum ppm_power_state cur_state;
enum ppm_mode cur_mode;
int i, j;
FUNC_ENTER(FUNC_LV_HICA);
ppm_lock(&hica_policy.lock);
ppm_hica_algo_data.ppm_cur_loads = cur_loads;
ppm_hica_algo_data.ppm_cur_tlp = cur_tlp;
ppm_hica_algo_data.ppm_cur_nr_heavy_task = cur_nr_heavy_task;
cur_state = ppm_hica_algo_data.cur_state;
cur_mode = ppm_main_info.cur_mode;
ppm_dbg(HICA, "cur_loads = %d, cur_tlp = %d, cur_nr_heavy_task = %d, cur_state = %s, cur_mode = %d\n",
cur_loads, cur_tlp, cur_nr_heavy_task, ppm_get_power_state_name(cur_state), cur_mode);
if (!ppm_main_info.is_enabled || !hica_policy.is_enabled || ppm_main_info.is_in_suspend ||
cur_state == PPM_POWER_STATE_NONE)
goto end;
#if defined(CONFIG_MACH_MT6757) || defined(CONFIG_MACH_KIBOPLUS)
if (setup_max_cpus == 4)
goto end;
#endif
#ifdef PPM_IC_SEGMENT_CHECK
if (ppm_main_info.fix_state_by_segment != PPM_POWER_STATE_NONE)
goto end;
#endif
/* skip HICA if DVFS is not ready (we cannot get current freq...) */
if (!ppm_main_info.client_info[PPM_CLIENT_DVFS].limit_cb)
goto end;
/* Power state is fixed by user, skip HICA state calculation */
if (fix_power_state != PPM_POWER_STATE_NONE)
goto end;
/* (1) 从transfer_by_perf到transfer_by_pwr逐个遍历判断当前state是否需要改变 */
for (i = 0; i < 2; i++) {
data = (i == 0) ? state_info[cur_state].transfer_by_perf
: state_info[cur_state].transfer_by_pwr;
/* (2) 如果当前state有几种变化逐个遍历,比如:
当前state为ALL,
可以ALL -> LL_ONLY
也可以ALL -> L_ONLY
*/
for (j = 0; j < data->size; j++) {
if (!data->transition_data[j].transition_rule
|| !((1 << cur_mode) & data->transition_data[j].mode_mask))
continue;
/* (3) 如果state变化,获取新的state返回 */
if (data->transition_data[j].transition_rule(
ppm_hica_algo_data, &data->transition_data[j])) {
ppm_hica_algo_data.new_state = data->transition_data[j].next_state;
ppm_dbg(HICA, "[%s(%d)] Need state transfer: %s --> %s\n",
(i == 0) ? "PERF" : "PWR",
j,
ppm_get_power_state_name(cur_state),
ppm_get_power_state_name(ppm_hica_algo_data.new_state)
);
goto end;
/* (4) 如果state不变化,维持当前state,继续遍历*/
} else {
ppm_hica_algo_data.new_state = cur_state;
#ifdef PPM_HICA_2P0
ppm_dbg(HICA, "[%s(%d)]hold in %s state, capacity_hold_cnt = %d, bigtsk_hold_cnt = %d, freq_hold_cnt = %d\n",
(i == 0) ? "PERF" : "PWR",
j,
ppm_get_power_state_name(cur_state),
data->transition_data[j].capacity_hold_cnt,
data->transition_data[j].bigtsk_hold_cnt,
data->transition_data[j].freq_hold_cnt
);
#else
#if PPM_HICA_VARIANT_SUPPORT
ppm_dbg(HICA, "[%s(%d)]hold in %s state, loading_cnt = %d, freq_cnt = %d, overutil_l_hold_cnt = %d, .overutil_h_hold_cnt = %d\n",
(i == 0) ? "PERF" : "PWR",
j,
ppm_get_power_state_name(cur_state),
data->transition_data[j].loading_hold_cnt,
data->transition_data[j].freq_hold_cnt,
data->transition_data[j].overutil_l_hold_cnt,
data->transition_data[j].overutil_h_hold_cnt
);
#else
ppm_dbg(HICA, "[%s(%d)]hold in %s state, loading_cnt = %d, freq_cnt = %d\n",
(i == 0) ? "PERF" : "PWR",
j,
ppm_get_power_state_name(cur_state),
data->transition_data[j].loading_hold_cnt,
data->transition_data[j].freq_hold_cnt
);
#endif
#endif
}
}
}
end:
ppm_unlock(&hica_policy.lock);
FUNC_EXIT(FUNC_LV_HICA);
}
关于计算state的函数和阈值定义在表中,除了heavy_task和big_task,基本是计算util/capacity的cpu占用情况:
struct ppm_power_state_data pwr_state_info_SB[NR_PPM_POWER_STATE] = {
[0] = {
.name = __stringify(LL_ONLY),
.state = PPM_POWER_STATE_LL_ONLY,
PWR_STATE_INFO(LL_ONLY, SB)
},
[1] = {
.name = __stringify(L_ONLY),
.state = PPM_POWER_STATE_L_ONLY,
PWR_STATE_INFO(L_ONLY, SB)
},
[2] = {
.name = __stringify(ALL),
.state = PPM_POWER_STATE_ALL,
PWR_STATE_INFO(ALL, SB)
},
};
static struct ppm_state_transfer state_pwr_transfer_ALL[] = {
TRANS_DATA(
LL_ONLY,
PPM_MODE_MASK_ALL_MODE,
ppm_trans_rule_ALL_to_LL_ONLY,
PPM_DEFAULT_HOLD_TIME,
PPM_CAPACITY_DOWN,
PPM_DEFAULT_BIGTSK_TIME,
0,
0,
0
),
TRANS_DATA(
L_ONLY,
PPM_MODE_MASK_ALL_MODE,
ppm_trans_rule_ALL_to_L_ONLY,
PPM_DEFAULT_HOLD_TIME,
PPM_CAPACITY_DOWN,
PPM_DEFAULT_BIGTSK_TIME,
2,
4,
0
),
};
STATE_TRANSFER_DATA_PWR(ALL);
static struct ppm_state_transfer state_perf_transfer_ALL[] = {
TRANS_DATA(NONE, 0, NULL, 0, 0, 0, 0, 0, 0),
};
STATE_TRANSFER_DATA_PERF(ALL);
/* 举例:当前state为ALL
尝试从power的角度从ALL切换到LL_ONLY:ppm_trans_rule_ALL_to_LL_ONLY()
尝试从power的角度从ALL切换到L_ONLY:ppm_trans_rule_ALL_to_L_ONLY()
*/
static bool ppm_trans_rule_ALL_to_LL_ONLY(
struct ppm_hica_algo_data data, struct ppm_state_transfer *settings)
{
/* keep in ALL state if root cluster is fixed at L or B */
if (ppm_main_info.fixed_root_cluster == PPM_CLUSTER_L
|| ppm_main_info.fixed_root_cluster == PPM_CLUSTER_B)
return false;
/* (1) 从heavy task负载判断是否需要切换模式 */
#if PPM_HEAVY_TASK_INDICATE_SUPPORT
{
unsigned int heavy_task, i;
for_each_ppm_clusters(i) {
heavy_task = hps_get_hvytsk(i);
if (heavy_task) {
ppm_dbg(HICA, "Stay in ALL due to cluster%d heavy task = %d\n",
i, heavy_task);
trace_ppm_hica(
ppm_get_power_state_name(PPM_POWER_STATE_ALL),
ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY),
-1, -1, -1, -1, heavy_task, -1, false);
settings->capacity_hold_cnt = 0;
return false;
}
}
}
#endif
/* (2) 从big task负载判断是否需要切换模式 */
#if PPM_BIG_TASK_INDICATE_SUPPORT
{
unsigned int big_task_L = hps_get_bigtsk(PPM_CLUSTER_L);
unsigned int big_task_B = hps_get_bigtsk(PPM_CLUSTER_B);
if (big_task_L || big_task_B) {
ppm_dbg(HICA, "Stay in ALL due to L/B big task = %d/%d\n",
big_task_L, big_task_B);
trace_ppm_hica(
ppm_get_power_state_name(PPM_POWER_STATE_ALL),
ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY),
-1, -1, big_task_L, big_task_B, -1, -1, false);
settings->capacity_hold_cnt = 0;
return false;
}
}
#endif
/* (3) 从util/capacity负载判断是否需要切换模式 */
{
/* check capacity */
unsigned long usage, usage_total = 0, capacity = 0, dummy;
unsigned int i;
for_each_ppm_clusters(i) {
if (sched_get_cluster_util(i, &usage, &dummy)) {
ppm_err("Get cluster %d util failed\n", i);
return false;
}
usage_total += usage;
if (i == PPM_CLUSTER_LL)
capacity = dummy;
}
ppm_dbg(HICA, "usage_total = %ld, LL capacity = %ld\n", usage_total, capacity);
/* (3.1) 总利用率usage_total低于门限值(capacity * settings->capacity_bond / 100)的次数
是否已经达到settings->capacity_hold_time,
如果连续满足该条件则进行state切换
*/
if (usage_total < capacity * settings->capacity_bond / 100) {
settings->capacity_hold_cnt++;
if (settings->capacity_hold_cnt >= settings->capacity_hold_time) {
trace_ppm_hica(
ppm_get_power_state_name(PPM_POWER_STATE_ALL),
ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY),
usage_total, capacity, -1, -1, -1, -1, true);
return true;
}
} else
settings->capacity_hold_cnt = 0;
trace_ppm_hica(
ppm_get_power_state_name(PPM_POWER_STATE_ALL),
ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY),
usage_total, capacity, -1, -1, -1, -1, false);
}
return false;
}
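以ALL→LL_ONLY中的capacity判断为例,其核心条件可以抽象成下面的小段示意代码(demo_should_switch为举例用的函数,门限和计数实际由TRANS_DATA表配置):
#include <linux/types.h>

/* 示意:util/capacity切换条件的抽象(仅为示意,非源码) */
static bool demo_should_switch(unsigned long usage_total, unsigned long ll_capacity,
			       unsigned int bond_percent,	/* 对应settings->capacity_bond */
			       unsigned int *hold_cnt,		/* 对应settings->capacity_hold_cnt */
			       unsigned int hold_time)		/* 对应settings->capacity_hold_time */
{
	if (usage_total < ll_capacity * bond_percent / 100) {
		if (++(*hold_cnt) >= hold_time)	/* 连续hold_time次低于门限才切换 */
			return true;
	} else {
		*hold_cnt = 0;			/* 一旦超过门限,重新开始计数 */
	}
	return false;
}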
新的state计算完成后,是通过以下通道配置下去的:
_hps_task_main() -> mt_ppm_main() -> ppm_hica_update_limit_cb() -> ppm_hica_set_default_limit_by_state()
↓
void ppm_hica_set_default_limit_by_state(enum ppm_power_state state,
struct ppm_policy_data *policy)
{
unsigned int i;
struct ppm_power_state_data *state_info = ppm_get_power_state_info();
FUNC_ENTER(FUNC_LV_HICA);
for (i = 0; i < policy->req.cluster_num; i++) {
if (state >= PPM_POWER_STATE_NONE) {
if (state > NR_PPM_POWER_STATE)
ppm_err("@%s: Invalid PPM state(%d)\n", __func__, state);
policy->req.limit[i].min_cpu_core = get_cluster_min_cpu_core(i);
policy->req.limit[i].max_cpu_core = get_cluster_max_cpu_core(i);
policy->req.limit[i].min_cpufreq_idx = get_cluster_min_cpufreq_idx(i);
policy->req.limit[i].max_cpufreq_idx = get_cluster_max_cpufreq_idx(i);
#ifdef PPM_DISABLE_CLUSTER_MIGRATION
/* keep at least 1 LL */
if (i == 0)
policy->req.limit[i].min_cpu_core = 1;
#endif
/* (1) HICA根据新的state,配置对应的min_cpu_core/max_cpu_core到本policy当中 */
} else {
policy->req.limit[i].min_cpu_core =
state_info[state].cluster_limit->state_limit[i].min_cpu_core;
policy->req.limit[i].max_cpu_core =
state_info[state].cluster_limit->state_limit[i].max_cpu_core;
policy->req.limit[i].min_cpufreq_idx =
state_info[state].cluster_limit->state_limit[i].min_cpufreq_idx;
policy->req.limit[i].max_cpufreq_idx =
state_info[state].cluster_limit->state_limit[i].max_cpufreq_idx;
}
}
#ifdef PPM_IC_SEGMENT_CHECK
/* ignore HICA min freq setting for L cluster in L_ONLY state */
if (state == PPM_POWER_STATE_L_ONLY && ppm_main_info.fix_state_by_segment == PPM_POWER_STATE_L_ONLY)
policy->req.limit[1].min_cpufreq_idx = get_cluster_min_cpufreq_idx(1);
#endif
FUNC_EXIT(FUNC_LV_HICA);
}
/*==============================================================*/
/* Local Variables */
/*==============================================================*/
/* cluster limit for each power state */
static const struct ppm_cluster_limit state_limit_LL_ONLY[] = {
[0] = LIMIT(15, 0, 1, 4),
[1] = LIMIT(15, 0, 0, 0),
[2] = LIMIT(15, 0, 0, 0),
};
STATE_LIMIT(LL_ONLY);
static const struct ppm_cluster_limit state_limit_L_ONLY[] = {
[0] = LIMIT(15, 0, 0, 0),
[1] = LIMIT(8, 0, 1, 4),
[2] = LIMIT(15, 0, 0, 0),
};
STATE_LIMIT(L_ONLY);
static const struct ppm_cluster_limit state_limit_ALL[] = {
[0] = LIMIT(15, 0, 0, 4),
[1] = LIMIT(15, 0, 0, 4),
[2] = LIMIT(15, 0, 0, 2),
};
STATE_LIMIT(ALL);
_hps_task_main() -> mt_ppm_main() -> ppm_limit_callback()
↓
static void ppm_limit_callback(struct ppm_client_req req)
{
struct ppm_client_req *p = (struct ppm_client_req *)&req;
int i;
/* (2) 将HICA state对应的policy配置到hps限制中hps_sys.cluster_info[i].ref_base_value/ref_limit_value */
mutex_lock(&hps_ctxt.para_lock);
hps_sys.ppm_root_cluster = p->root_cluster;
for (i = 0; i < p->cluster_num; i++) {
/*
* hps_warn("ppm_limit_callback -> cluster%d: has_advise_core = %d, [%d, %d]\n",
* i, p->cpu_limit[i].has_advise_core,
* p->cpu_limit[i].min_cpu_core, p->cpu_limit[i].max_cpu_core);
*/
#ifdef _TRACE_
trace_ppm_limit_callback_update(i, p->cpu_limit[i].has_advise_core,
p->cpu_limit[i].min_cpu_core, p->cpu_limit[i].max_cpu_core);
#endif
if (!p->cpu_limit[i].has_advise_core) {
hps_sys.cluster_info[i].ref_base_value = p->cpu_limit[i].min_cpu_core;
hps_sys.cluster_info[i].ref_limit_value = p->cpu_limit[i].max_cpu_core;
} else {
hps_sys.cluster_info[i].ref_base_value =
hps_sys.cluster_info[i].ref_limit_value =
p->cpu_limit[i].advise_cpu_core;
}
}
mutex_unlock(&hps_ctxt.para_lock);
hps_ctxt.is_interrupt = 1;
hps_task_wakeup_nolock();
}
4.4.2.2、hps_algo_main
_hps_task_main() -> hps_algo_main()
↓
void hps_algo_main(void)
{
unsigned int i, val, base_val, action_print, origin_root, action_break;
char str_online[64], str_ref_limit[64], str_ref_base[64], str_criteria_limit[64],
str_criteria_base[64], str_target[64], str_hvytsk[64], str_pwrseq[64], str_bigtsk[64];
char *online_ptr = str_online;
char *criteria_limit_ptr = str_criteria_limit;
char *criteria_base_ptr = str_criteria_base;
char *ref_limit_ptr = str_ref_limit;
char *ref_base_ptr = str_ref_base;
char *hvytsk_ptr = str_hvytsk;
char *target_ptr = str_target;
char *pwrseq_ptr = str_pwrseq;
char *bigtsk_ptr = str_bigtsk;
static unsigned int hrtbt_dbg;
#ifdef CONFIG_MEIZU_BSP
static unsigned long int j;
#endif //CONFIG_MEIZU_BSP
#ifdef CONFIG_MTK_ICCS_SUPPORT
unsigned char real_online_power_state_bitmask = 0;
unsigned char real_target_power_state_bitmask = 0;
unsigned char iccs_online_power_state_bitmask = 0;
unsigned char iccs_target_power_state_bitmask = iccs_get_target_power_state_bitmask();
unsigned char target_cache_shared_state_bitmask = 0;
#endif
/* Initial value */
base_val = action_print = action_break = hps_sys.total_online_cores = 0;
hps_sys.up_load_avg = hps_sys.down_load_avg = hps_sys.tlp_avg = hps_sys.rush_cnt = 0;
hps_sys.action_id = origin_root = 0;
/*
* run algo or not by hps_ctxt.enabled
*/
if ((u64) ktime_to_ms(ktime_sub(ktime_get(), hps_ctxt.hps_hrt_ktime)) >= HPS_HRT_DBG_MS)
action_print = hrtbt_dbg = 1;
else
hrtbt_dbg = 0;
mutex_lock(&hps_ctxt.lock);
hps_ctxt.action = ACTION_NONE;
atomic_set(&hps_ctxt.is_ondemand, 0);
if (!hps_ctxt.enabled)
goto HPS_END;
if (hps_ctxt.eas_indicator) {
/*Set cpu cores by scheduler*/
goto HPS_ALGO_END;
}
/*
* algo - begin
*/
/*Back up limit and base value for check */
mutex_lock(&hps_ctxt.para_lock);
if ((hps_sys.cluster_info[0].base_value == 0) &&
(hps_sys.cluster_info[1].base_value == 0) &&
(hps_sys.cluster_info[2].base_value == 0) &&
(hps_sys.cluster_info[0].limit_value == 0) &&
(hps_sys.cluster_info[1].limit_value == 0) &&
(hps_sys.cluster_info[2].limit_value == 0)) {
hps_sys.cluster_info[0].base_value = hps_sys.cluster_info[0].ref_base_value = 0;
hps_sys.cluster_info[1].base_value = hps_sys.cluster_info[1].ref_base_value = 0;
hps_sys.cluster_info[2].base_value = hps_sys.cluster_info[2].ref_base_value = 0;
hps_sys.cluster_info[0].limit_value = hps_sys.cluster_info[0].ref_limit_value = 4;
hps_sys.cluster_info[1].limit_value = hps_sys.cluster_info[1].ref_limit_value = 4;
hps_sys.cluster_info[2].limit_value = hps_sys.cluster_info[2].ref_limit_value = 0;
}
for (i = 0; i < hps_sys.cluster_num; i++) {
hps_sys.cluster_info[i].base_value = hps_sys.cluster_info[i].ref_base_value;
hps_sys.cluster_info[i].limit_value = hps_sys.cluster_info[i].ref_limit_value;
}
for (i = 0; i < hps_sys.cluster_num; i++) {
base_val += hps_sys.cluster_info[i].base_value;
hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].online_core_num =
0;
hps_sys.cluster_info[i].online_core_num =
hps_get_cluster_cpus(hps_sys.cluster_info[i].cluster_id);
hps_sys.total_online_cores += hps_sys.cluster_info[i].online_core_num;
}
mutex_unlock(&hps_ctxt.para_lock);
/* Determine root cluster */
origin_root = hps_sys.root_cluster_id;
hps_define_root_cluster(&hps_sys);
#ifdef CONFIG_MACH_MT6799
if (hps_ctxt.smart_det_enabled) {
mutex_lock(&hps_ctxt.para_lock);
hps_sys.root_cluster_id = 1;/*Change root to L cluster when smart detection is enabled*/
mutex_unlock(&hps_ctxt.para_lock);
}
#endif
if (origin_root != hps_sys.root_cluster_id)
hps_sys.action_id = HPS_SYS_CHANGE_ROOT;
/*
* update history - tlp
*/
val = hps_ctxt.tlp_history[hps_ctxt.tlp_history_index];
hps_ctxt.tlp_history[hps_ctxt.tlp_history_index] = hps_ctxt.cur_tlp;
hps_ctxt.tlp_sum += hps_ctxt.cur_tlp;
hps_ctxt.tlp_history_index =
(hps_ctxt.tlp_history_index + 1 ==
hps_ctxt.tlp_times) ? 0 : hps_ctxt.tlp_history_index + 1;
++hps_ctxt.tlp_count;
if (hps_ctxt.tlp_count > hps_ctxt.tlp_times) {
WARN_ON(hps_ctxt.tlp_sum < val);
hps_ctxt.tlp_sum -= val;
hps_ctxt.tlp_avg = hps_ctxt.tlp_sum / hps_ctxt.tlp_times;
} else {
hps_ctxt.tlp_avg = hps_ctxt.tlp_sum / hps_ctxt.tlp_count;
}
if (hps_ctxt.stats_dump_enabled)
hps_ctxt_print_algo_stats_tlp(0);
/*Determine eas enabled or not*/
if (!hps_ctxt.eas_enabled)
hps_sys.hps_sys_ops[2].enabled = 0;
for (i = 0 ; i < hps_sys.cluster_num ; i++)
hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].online_core_num;
/* (1) 逐个调用 hps_sys_ops()根据各种算法来判断当前cpu是否需要hotplug */
for (i = 0; i < hps_sys.func_num; i++) {
if (hps_sys.hps_sys_ops[i].enabled == 1) {
if (hps_sys.hps_sys_ops[i].hps_sys_func_ptr()) {
hps_sys.action_id = hps_sys.hps_sys_ops[i].func_id;
break;
}
}
}
/*
if (hps_ctxt.heavy_task_enabled)
if (hps_algo_heavytsk_det())
hps_sys.action_id = 0xE1;
*/
if (hps_ctxt.big_task_enabled)
if (hps_algo_big_task_det())
hps_sys.action_id = 0xE2;
if (hps_sys.action_id == 0)
goto HPS_END;
HPS_ALGO_END:
#ifdef CONFIG_MACH_MT6799
if (hps_ctxt.smart_det_enabled) {
if (hps_sys.cluster_info[2].bigTsk_value <= 1) {
mutex_lock(&hps_ctxt.para_lock);
hps_sys.cluster_info[2].target_core_num = 1;
mutex_unlock(&hps_ctxt.para_lock);
}
}
#endif
/*
* algo - end
*/
/* (2) 对limit进行判断,HICA的值就配置到这里 */
/*Base and limit check */
hps_check_base_limit(&hps_sys);
/* Ensure that root cluster must have at least one online cpu */
if (hps_sys.cluster_info[hps_sys.root_cluster_id].target_core_num <= 0)
hps_sys.cluster_info[hps_sys.root_cluster_id].target_core_num = 1;
#ifdef CONFIG_MTK_ICCS_SUPPORT
real_online_power_state_bitmask = 0;
real_target_power_state_bitmask = 0;
for (i = 0; i < hps_sys.cluster_num; i++) {
real_online_power_state_bitmask |= ((hps_sys.cluster_info[i].online_core_num > 0) << i);
real_target_power_state_bitmask |= ((hps_sys.cluster_info[i].target_core_num > 0) << i);
}
iccs_online_power_state_bitmask = iccs_target_power_state_bitmask;
iccs_target_power_state_bitmask = real_target_power_state_bitmask;
iccs_get_target_state(&iccs_target_power_state_bitmask, &target_cache_shared_state_bitmask);
/*
* pr_err("[%s] iccs_target_power_state_bitmask: 0x%x\n", __func__, iccs_target_power_state_bitmask);
*/
for (i = 0; i < hps_sys.cluster_num; i++) {
hps_sys.cluster_info[i].iccs_state = (((real_online_power_state_bitmask >> i) & 1) << 3) |
(((real_target_power_state_bitmask >> i) & 1) << 2) |
(((iccs_online_power_state_bitmask >> i) & 1) << 1) |
(((iccs_target_power_state_bitmask >> i) & 1) << 0);
/*
* pr_err("[%s] cluster: 0x%x iccs_state: 0x%x\n", __func__, i, hps_sys.cluster_info[i].iccs_state);
*/
if (hps_get_iccs_pwr_status(i) == 0x1)
iccs_cluster_on_off(i, 1);
else if (hps_get_iccs_pwr_status(i) == 0x2)
iccs_cluster_on_off(i, 0);
}
#endif
/* (3) 经过各种算法计算后目标值是target_core_num,而当前值是online_core_num;
如果不一致,进行cpu_up()/cpu_down()操作
*/
#if 1 /*Make sure that priority of power on action is higher than power down. */
for (i = 0; i < hps_sys.cluster_num; i++) {
if (hps_sys.cluster_info[i].target_core_num >
hps_sys.cluster_info[i].online_core_num) {
if (hps_algo_do_cluster_action(i) == 1) {
action_print = action_break = 1;
break;
}
action_print = 1;
}
}
if (!action_break) {
for (i = 0; i < hps_sys.cluster_num; i++) {
if (hps_sys.cluster_info[i].target_core_num <
hps_sys.cluster_info[i].online_core_num) {
if (hps_algo_do_cluster_action(i) == 1) {
action_print = action_break = 1;
break;
}
action_print = 1;
}
}
}
#else
/*Process root cluster first */
if (hps_sys.cluster_info[hps_sys.root_cluster_id].target_core_num !=
hps_sys.cluster_info[hps_sys.root_cluster_id].online_core_num) {
if (hps_algo_do_cluster_action(hps_sys.root_cluster_id) == 1)
action_break = 1;
else
action_break = 0;
action_print = 1;
}
for (i = 0; i < hps_sys.cluster_num; i++) {
if (i == hps_sys.root_cluster_id)
continue;
if (hps_sys.cluster_info[i].target_core_num !=
hps_sys.cluster_info[i].online_core_num) {
if (hps_algo_do_cluster_action(i) == 1)
action_break = 1;
else
action_break = 0;
action_print = 1;
}
}
#endif
#ifdef CONFIG_MTK_ICCS_SUPPORT
for (i = 0; i < hps_sys.cluster_num; i++) {
if (hps_get_cluster_cpus(hps_sys.cluster_info[i].cluster_id) !=
hps_sys.cluster_info[i].target_core_num) {
if (hps_get_cluster_cpus(hps_sys.cluster_info[i].cluster_id) == 0)
iccs_target_power_state_bitmask &= ~(1 << i);
else if (hps_sys.cluster_info[i].target_core_num == 0)
iccs_target_power_state_bitmask |= (1 << i);
}
}
/*
* pr_err("[%s] iccs_target_power_state_bitmask: 0x%x\n", __func__, iccs_target_power_state_bitmask);
*/
iccs_set_target_power_state_bitmask(iccs_target_power_state_bitmask);
#endif
HPS_END:
if (action_print || hrtbt_dbg) {
int online, target, ref_limit, ref_base, criteria_limit, criteria_base, hvytsk, pwrseq, bigtsk;
mutex_lock(&hps_ctxt.para_lock);
online = target = criteria_limit = criteria_base = 0;
for (i = 0; i < hps_sys.cluster_num; i++) {
if (i == origin_root)
online =
sprintf(online_ptr, "<%d>",
hps_sys.cluster_info[i].online_core_num);
else
online =
sprintf(online_ptr, "(%d)",
hps_sys.cluster_info[i].online_core_num);
if (i == hps_sys.root_cluster_id)
target =
sprintf(target_ptr, "<%d>",
hps_sys.cluster_info[i].target_core_num);
else
target =
sprintf(target_ptr, "(%d)",
hps_sys.cluster_info[i].target_core_num);
criteria_limit =
sprintf(criteria_limit_ptr, "(%d)",
hps_sys.cluster_info[i].limit_value);
criteria_base =
sprintf(criteria_base_ptr, "(%d)", hps_sys.cluster_info[i].base_value);
ref_limit =
sprintf(ref_limit_ptr, "(%d)", hps_sys.cluster_info[i].ref_limit_value);
ref_base =
sprintf(ref_base_ptr, "(%d)", hps_sys.cluster_info[i].ref_base_value);
hvytsk = sprintf(hvytsk_ptr, "(%d)", hps_sys.cluster_info[i].hvyTsk_value);
bigtsk = sprintf(bigtsk_ptr, "(%d)", hps_sys.cluster_info[i].bigTsk_value);
if (i == 0)
pwrseq = sprintf(pwrseq_ptr, "(%d->", hps_sys.cluster_info[i].pwr_seq);
else if ((i != 0) && (i != (hps_sys.cluster_num - 1)))
pwrseq = sprintf(pwrseq_ptr, "%d->", hps_sys.cluster_info[i].pwr_seq);
else if (i == (hps_sys.cluster_num - 1))
pwrseq = sprintf(pwrseq_ptr, "%d) ", hps_sys.cluster_info[i].pwr_seq);
online_ptr += online;
target_ptr += target;
criteria_limit_ptr += criteria_limit;
criteria_base_ptr += criteria_base;
ref_limit_ptr += ref_limit;
ref_base_ptr += ref_base;
hvytsk_ptr += hvytsk;
bigtsk_ptr += bigtsk;
pwrseq_ptr += pwrseq;
}
mutex_unlock(&hps_ctxt.para_lock);
if (action_print) {
hps_set_funct_ctrl();
if (action_break)
hps_warn
("(0x%X)%s action break!! (%u)(%u)(%u) %s %s%s-->%s%s (%u)(%u)(%u)(%u) %s\n",
((hps_ctxt.hps_func_control << 12) | hps_sys.action_id),
str_online, hps_ctxt.cur_loads,
hps_ctxt.cur_tlp, hps_ctxt.cur_iowait, str_hvytsk,
str_criteria_limit, str_criteria_base,
str_ref_limit, str_ref_base,
hps_sys.up_load_avg,
hps_sys.down_load_avg, hps_sys.tlp_avg, hps_sys.rush_cnt,
str_target);
else {
char str1[256];
char str2[256];
snprintf(str1, sizeof(str1),
"(0x%X)%s action end (%u)(%u)(%u) %s %s[%u][%u](%u) %s %s%s (%u)(%u)(%u)(%u)",
((hps_ctxt.hps_func_control << 12) | hps_sys.action_id),
str_online, hps_ctxt.cur_loads,
hps_ctxt.cur_tlp, hps_ctxt.cur_iowait,
str_hvytsk, str_bigtsk, hps_ctxt.is_screen_off,
hps_ctxt.is_idle, hps_ctxt.idle_ratio,
str_pwrseq, str_criteria_limit, str_criteria_base,
hps_sys.up_load_avg,
hps_sys.down_load_avg,
hps_sys.tlp_avg, hps_sys.rush_cnt);
snprintf(str2, sizeof(str2),
"[%u,%u|%u,%u|%u,%u][%u,%u,%u] [%u,%u,%u] [%u,%u,%u] [%u,%u,%u] %s",
hps_sys.cluster_info[0].up_threshold,
hps_sys.cluster_info[0].down_threshold,
hps_sys.cluster_info[1].up_threshold,
hps_sys.cluster_info[1].down_threshold,
hps_sys.cluster_info[2].up_threshold,
hps_sys.cluster_info[2].down_threshold,
hps_sys.cluster_info[0].loading,
hps_sys.cluster_info[1].loading,
hps_sys.cluster_info[2].loading,
hps_sys.cluster_info[0].rel_load,
hps_sys.cluster_info[1].rel_load,
hps_sys.cluster_info[2].rel_load,
hps_sys.cluster_info[0].abs_load,
hps_sys.cluster_info[1].abs_load,
hps_sys.cluster_info[2].abs_load,
/* sched-assist hotplug: for debug */
hps_sys.cluster_info[0].sched_load,
hps_sys.cluster_info[1].sched_load,
hps_sys.cluster_info[2].sched_load,
str_target);
#ifdef CONFIG_MEIZU_BSP
if (printk_timed_ratelimit(&j, 500))
hps_warn("%s%s\n", str1, str2);
#else
hps_warn("%s%s\n", str1, str2);
#endif //CONFIG_MEIZU_BSP
#ifdef _TRACE_
trace_hps_update(hps_sys.action_id, str_online, hps_ctxt.cur_loads,
hps_ctxt.cur_tlp, hps_ctxt.cur_iowait, str_hvytsk,
str_criteria_limit, str_criteria_base,
hps_sys.up_load_avg, hps_sys.down_load_avg,
hps_sys.tlp_avg,
hps_sys.rush_cnt,
hps_sys.cluster_info[0].up_threshold,
hps_sys.cluster_info[0].down_threshold,
hps_sys.cluster_info[1].up_threshold,
hps_sys.cluster_info[1].down_threshold,
hps_sys.cluster_info[2].up_threshold,
hps_sys.cluster_info[2].down_threshold,
hps_sys.cluster_info[0].loading, hps_sys.cluster_info[1].loading,
hps_sys.cluster_info[2].loading,
hps_ctxt.up_times, hps_ctxt.down_times, str_target);
#endif
}
hps_ctxt_reset_stas_nolock();
}
}
#if HPS_HRT_BT_EN
if (hrtbt_dbg && (action_print)) {
hps_set_funct_ctrl();
hps_warn("(0x%X)%s HRT_BT_DBG (%u)(%u)(%u) %s %s %s %s%s (%u)(%u)(%u)(%u) %s\n",
((hps_ctxt.hps_func_control << 12) | hps_sys.action_id),
str_online, hps_ctxt.cur_loads, hps_ctxt.cur_tlp,
hps_ctxt.cur_iowait, str_hvytsk, str_bigtsk, str_pwrseq, str_criteria_limit,
str_criteria_base, hps_sys.up_load_avg, hps_sys.down_load_avg,
hps_sys.tlp_avg, hps_sys.rush_cnt, str_target);
hrtbt_dbg = 0;
hps_ctxt.hps_hrt_ktime = ktime_get();
}
#endif
action_print = 0;
action_break = 0;
mutex_unlock(&hps_ctxt.lock);
}
hps_algo_main() currently dispatches to the following algorithm functions (an illustrative sketch of how this table is walked follows it):
static int (*hps_func[]) (void) = {
/*hps_algo_perf_indicator, hps_algo_rush_boost, hps_algo_eas, hps_algo_up, hps_algo_down};*/
hps_algo_perf_indicator, hps_algo_rush_boost, hps_algo_eas};
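For orientation, below is a minimal sketch of how a priority table like hps_func[] can be walked. It is illustrative only: the loop, the helper name hps_run_algo_table_sketch() and the action_id bookkeeping are assumptions, not the actual hps_algo_main() code. What it does show is the contract visible in the functions below: each callback returns 1 when it has written new target_core_num values for step (3) above to apply.
/* Illustrative sketch only, not the real hps_algo_main() dispatch code.
 * Each algorithm callback returns 1 when it has updated
 * hps_sys.cluster_info[].target_core_num; the caller then applies the
 * result with cpu_up()/cpu_down() as in step (3) above.
 */
static void hps_run_algo_table_sketch(void)
{
	unsigned int i;

	for (i = 0; i < ARRAY_SIZE(hps_func); i++) {
		if (hps_func[i]())
			hps_sys.action_id = i + 1;	/* hypothetical bookkeeping */
	}
}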
/* (1) Enforce the minimum core count requested via perf: the target is the larger of base_value and online_core_num (worked example after this function) */
static int hps_algo_perf_indicator(void)
{
unsigned int i;
if (atomic_read(&hps_ctxt.is_ondemand) != 0) { /* for ondemand request */
atomic_set(&hps_ctxt.is_ondemand, 0);
mutex_lock(&hps_ctxt.para_lock);
for (i = 0; i < hps_sys.cluster_num; i++)
hps_sys.cluster_info[i].target_core_num =
max(hps_sys.cluster_info[i].base_value, hps_sys.cluster_info[i].online_core_num);
mutex_unlock(&hps_ctxt.para_lock);
return 1;
}
return 0;
}
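A standalone illustration of the max() rule above; all numbers are made up and perf_target() is just a local helper for the example, not part of the driver:
#include <stdio.h>

/* Illustration only: the perf indicator never lowers the current online
 * core count, it only raises the target up to the requested floor.
 */
static int perf_target(int base_value, int online_core_num)
{
	return base_value > online_core_num ? base_value : online_core_num;
}

int main(void)
{
	printf("%d\n", perf_target(2, 4)); /* 4: keep the current online count */
	printf("%d\n", perf_target(4, 2)); /* 4: raise to the perf floor       */
	return 0;
}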
/* (2) Decide whether to trigger rush boost, based on whether the current load has reached the boost threshold (worked example after this function) */
static int hps_algo_rush_boost(void)
{
int val, base_val;
unsigned int idx, total_rel_load;
idx = total_rel_load = 0;
for (idx = 0 ; idx < hps_sys.cluster_num ; idx++)
total_rel_load += hps_sys.cluster_info[idx].rel_load;
if (!hps_ctxt.rush_boost_enabled)
return 0;
base_val = cal_base_cores();
if (total_rel_load > hps_ctxt.rush_boost_threshold * hps_sys.total_online_cores)
++hps_ctxt.rush_count;
else
hps_ctxt.rush_count = 0;
if (hps_ctxt.rush_boost_times == 1)
hps_ctxt.tlp_avg = hps_ctxt.cur_tlp;
if ((hps_ctxt.rush_count >= hps_ctxt.rush_boost_times) &&
(hps_sys.total_online_cores * 100 < hps_ctxt.tlp_avg)) {
val = hps_ctxt.tlp_avg / 100 + (hps_ctxt.tlp_avg % 100 ? 1 : 0);
WARN_ON(!(val > hps_sys.total_online_cores));
if (val > num_possible_cpus())
val = num_possible_cpus();
if (val > base_val)
val -= base_val;
else
val = 0;
hps_sys.tlp_avg = hps_ctxt.tlp_avg;
hps_sys.rush_cnt = hps_ctxt.rush_count;
hps_cal_core_num(&hps_sys, val, base_val);
/* [MET] debug for geekbench */
met_tag_oneshot(0, "sched_rush_boost", 1);
return 1;
} else {
/* [MET] debug for geekbench */
met_tag_oneshot(0, "sched_rush_boost", 0);
return 0;
}
}
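To make the arithmetic above concrete, here is a standalone sketch of the core-count calculation in hps_algo_rush_boost(); the numbers are invented and hps_cal_core_num() itself is not reproduced:
#include <stdio.h>

int main(void)
{
	int tlp_avg = 460;	/* average task-level parallelism, scaled by 100 */
	int possible = 8;	/* num_possible_cpus() on this example system    */
	int base_val = 4;	/* cal_base_cores() result, assumed value        */
	int val;

	/* round tlp_avg/100 up: 460 -> 5 cores worth of runnable tasks */
	val = tlp_avg / 100 + (tlp_avg % 100 ? 1 : 0);
	if (val > possible)
		val = possible;
	/* the driver then passes 'val - base_val' (extra cores above the
	 * base values) together with base_val to hps_cal_core_num()
	 */
	val = (val > base_val) ? val - base_val : 0;
	printf("extra cores above base: %d\n", val);	/* 1 */
	return 0;
}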
/* (3) Compute the required number of online cpus per cluster from its load (worked example after this function) */
static int hps_algo_eas(void)
{
int val, ret, i;
ret = 0;
for (i = 0 ; i < hps_sys.cluster_num ; i++) {
hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].online_core_num;
/*if up_threshold > loading > down_threshold ==> No action*/
if ((hps_sys.cluster_info[i].loading <
(hps_sys.cluster_info[i].up_threshold*hps_sys.cluster_info[i].online_core_num)) &&
(hps_sys.cluster_info[i].loading >
(hps_sys.cluster_info[i].down_threshold*hps_sys.cluster_info[i].online_core_num)))
continue;
/*if loading > up_threshod ==> power on cores*/
if ((hps_sys.cluster_info[i].loading >
(hps_sys.cluster_info[i].up_threshold*hps_sys.cluster_info[i].online_core_num))) {
val = hps_sys.cluster_info[i].loading / hps_sys.cluster_info[i].up_threshold;
if (hps_sys.cluster_info[i].loading % hps_sys.cluster_info[i].up_threshold)
val++;
if (val <= hps_sys.cluster_info[i].limit_value)
hps_sys.cluster_info[i].target_core_num = val;
else
hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].limit_value;
ret = 1;
} else if ((hps_sys.cluster_info[i].loading <
(hps_sys.cluster_info[i].down_threshold*hps_sys.cluster_info[i].online_core_num))) {
/*if loading < down_threshod ==> power off cores*/
if (!hps_sys.cluster_info[i].loading) {
hps_sys.cluster_info[i].target_core_num = 0;
continue;
}
val = hps_sys.cluster_info[i].loading / hps_sys.cluster_info[i].down_threshold;
if (hps_sys.cluster_info[i].loading % hps_sys.cluster_info[i].down_threshold)
val++;
if (val >= hps_sys.cluster_info[i].base_value)
hps_sys.cluster_info[i].target_core_num = val;
else
hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].base_value;
ret = 1;
}
}
#if 0
/*Check with big task criteria*/
for (i = 1 ; i < hps_sys.cluster_num ; i++) {
if ((!hps_sys.cluster_info[i].bigTsk_value) &&
(!(hps_sys.cluster_info[i].loading / hps_sys.cluster_info[i].down_threshold)))
hps_sys.cluster_info[i].target_core_num = 0;
}
#endif
return ret;
}
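Plugging made-up numbers into the per-cluster decision above gives a feel for the thresholds; the snippet is a standalone illustration, not driver code:
#include <stdio.h>

int main(void)
{
	int loading = 250;		/* summed cluster load                     */
	int up_threshold = 80;		/* per-core up threshold                   */
	int down_threshold = 40;	/* per-core down threshold                 */
	int online = 2;			/* online_core_num                         */
	int limit = 4, base = 1;	/* limit_value / base_value of the cluster */
	int target = online, val;

	if (loading > up_threshold * online) {		/* 250 > 160: power on cores */
		val = loading / up_threshold;
		if (loading % up_threshold)
			val++;				/* 250/80 rounds up to 4     */
		target = (val <= limit) ? val : limit;
	} else if (loading < down_threshold * online) {	/* power off cores */
		val = loading / down_threshold;
		if (loading % down_threshold)
			val++;
		target = (val >= base) ? val : base;
	}
	printf("target_core_num = %d\n", target);	/* 4 */
	return 0;
}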
4.5、NUMA load balancing
The arm platforms discussed here do not use NUMA, so NUMA load balancing is not analyzed for now.