4. Load Balancing

4.1. SMP Load Balancing

4.1.1. Scheduling Domains

4.1.1.1. The concept of Scheduling Domains

Borrowing the description from the Linux "Scheduling Domains" documentation, the concept can be summarized as follows.

一个复杂的高端系统由上到下可以这样构成:

  • 1、它是一个 NUMA 架构的系统,系统中的每个 Node 访问系统中不同区域的内存有不同的速度。
  • 2、同时它又是一个 SMP 系统。由多个物理 CPU(Physical Package) 构成。这些物理 CPU 共享系统中所有的内存。但都有自己独立的 Cache 。
  • 3、每个物理 CPU 又由多个核 (Core) 构成,即 Multi-core 技术或者叫 Chip-level Multi processor(CMP) 。这些核都被集成在一块 die 里面。一般有自己独立的 L1 Cache,但可能共享 L2 Cache 。
  • 4、每个核中又通过 SMT 之类的技术实现多个硬件线程,或者叫 Virtual CPU( 比如 Intel 的 Hyper-threading 技术 ) 。这些硬件线程,逻辑上看是就是一个 CPU 。它们之间几乎所有的东西都共享。包括 L1 Cache,甚至是逻辑运算单元 (ALU) 以及 Power 。

可以看到cpu是有多个层级的,cpu和越近的层级之间共享的资源越多。所以进程在cpu之间迁移是有代价的,从性能的角度看,迁移跨越的层级越大性能损失越大。另外还需要从功耗的角度来考虑进程迁移的代价,这就是EAS考虑的。

4.1.1.2. arm64 cpu_topology

On arm64 the cpu topology is stored in the cpu_topology[] array:

/*
 * cpu topology table
 */
struct cpu_topology cpu_topology[NR_CPUS];


struct cpu_topology {
    int thread_id;
    int core_id;
    int cluster_id;                 // cluster this cpu belongs to
    unsigned int partno;
    cpumask_t thread_sibling;
    cpumask_t core_sibling;         // sibling cpus at the MC level (i.e. within the same cluster)
};
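
For reference, the rest of the kernel does not usually read cpu_topology[] directly but goes through the topology accessor macros. The lines below reflect the arm64 definitions of this kernel era (arch/arm64/include/asm/topology.h); treat them as a reference sketch rather than a verbatim quote:

#define topology_physical_package_id(cpu)   (cpu_topology[cpu].cluster_id)
#define topology_core_id(cpu)               (cpu_topology[cpu].core_id)
#define topology_core_cpumask(cpu)          (&cpu_topology[cpu].core_sibling)
#define topology_sibling_cpumask(cpu)       (&cpu_topology[cpu].thread_sibling)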

cpu_topology[] is populated by parse_dt_topology(), which parses the topology information in the dts:

kernel_init() -> kernel_init_freeable() -> smp_prepare_cpus() -> init_cpu_topology() -> parse_dt_topology()

↓

static int __init parse_dt_topology(void)
{
    struct device_node *cn, *map;
    int ret = 0;
    int cpu;

    /* (1) find the root node of the cpu topology in the dts: "/cpus" */
    cn = of_find_node_by_path("/cpus");
    if (!cn) {
        pr_err("No CPU information found in DT\n");
        return 0;
    }

    /*
     * When topology is provided cpu-map is essentially a root
     * cluster with restricted subnodes.
     */
    /* (2) find the "cpu-map" node */
    map = of_get_child_by_name(cn, "cpu-map");
    if (!map)
        goto out;

    /* (3) parse the clusters inside "cpu-map" */
    ret = parse_cluster(map, 0);
    if (ret != 0)
        goto out_map;

    /*
     * Check that all cores are in the topology; the SMP code will
     * only mark cores described in the DT as possible.
     */
    for_each_possible_cpu(cpu)
        if (cpu_topology[cpu].cluster_id == -1)
            ret = -EINVAL;

out_map:
    of_node_put(map);
out:
    of_node_put(cn);
    return ret;
}

|→

static int __init parse_cluster(struct device_node *cluster, int depth)
{
    char name[10];
    bool leaf = true;
    bool has_cores = false;
    struct device_node *c;
    static int cluster_id __initdata;
    int core_id = 0;
    int i, ret;

    /*
     * First check for child clusters; we currently ignore any
     * information about the nesting of clusters and present the
     * scheduler with a flat list of them.
     */
    i = 0;
    /* (3.1) if there are nested clusters, keep recursing into them */
    do {
        snprintf(name, sizeof(name), "cluster%d", i);
        c = of_get_child_by_name(cluster, name);
        if (c) {
            leaf = false;
            ret = parse_cluster(c, depth + 1);
            of_node_put(c);
            if (ret != 0)
                return ret;
        }
        i++;
    } while (c);

    /* Now check for cores */
    i = 0;
    do {
        /* (3.2) then parse the core-level nodes */
        snprintf(name, sizeof(name), "core%d", i);
        c = of_get_child_by_name(cluster, name);
        if (c) {
            has_cores = true;

            if (depth == 0) {
                pr_err("%s: cpu-map children should be clusters\n",
                       c->full_name);
                of_node_put(c);
                return -EINVAL;
            }

            if (leaf) {
                /* (3.3) if this is a leaf cluster, go on to parse the cpu nodes inside the core */
                ret = parse_core(c, cluster_id, core_id++);
            } else {
                pr_err("%s: Non-leaf cluster with core %s\n",
                       cluster->full_name, name);
                ret = -EINVAL;
            }

            of_node_put(c);
            if (ret != 0)
                return ret;
        }
        i++;
    } while (c);

    if (leaf && !has_cores)
        pr_warn("%s: empty cluster\n", cluster->full_name);

    if (leaf)
        cluster_id++;

    return 0;
}

||→

static int __init parse_core(struct device_node *core, int cluster_id,
                 int core_id)
{
    char name[10];
    bool leaf = true;
    int i = 0;
    int cpu;
    struct device_node *t;

    do {
        /* (3.3.1) if a thread level exists, parse the thread and cpu levels */
        snprintf(name, sizeof(name), "thread%d", i);
        t = of_get_child_by_name(core, name);
        if (t) {
            leaf = false;
            cpu = get_cpu_for_node(t);
            if (cpu >= 0) {
                cpu_topology[cpu].cluster_id = cluster_id;
                cpu_topology[cpu].core_id = core_id;
                cpu_topology[cpu].thread_id = i;
            } else {
                pr_err("%s: Can't get CPU for thread\n",
                       t->full_name);
                of_node_put(t);
                return -EINVAL;
            }
            of_node_put(t);
        }
        i++;
    } while (t);

    /* (3.3.2) otherwise parse the cpu level directly */
    cpu = get_cpu_for_node(core);
    if (cpu >= 0) {
        if (!leaf) {
            pr_err("%s: Core has both threads and CPU\n",
                   core->full_name);
            return -EINVAL;
        }

        /* (3.3.3) the cpu's cluster_id/core_id are now known */
        cpu_topology[cpu].cluster_id = cluster_id;
        cpu_topology[cpu].core_id = core_id;
    } else if (leaf) {
        pr_err("%s: Can't get CPU for leaf core\n", core->full_name);
        return -EINVAL;
    }

    return 0;
}

|||→

static int __init get_cpu_for_node(struct device_node *node)
{
    struct device_node *cpu_node;
    int cpu;

    cpu_node = of_parse_phandle(node, "cpu", 0);
    if (!cpu_node)
        return -1;

    for_each_possible_cpu(cpu) {
        if (of_get_cpu_node(cpu, NULL) == cpu_node) {
            of_node_put(cpu_node);
            return cpu;
        }
    }

    pr_crit("Unable to find CPU node for %s\n", cpu_node->full_name);

    of_node_put(cpu_node);
    return -1;
}

The same-level sibling relationships, cpu_topology[cpu].core_sibling and thread_sibling, are filled in by update_siblings_masks():

kernel_init() -> kernel_init_freeable() -> smp_prepare_cpus() -> store_cpu_topology() -> update_siblings_masks()

↓

static void update_siblings_masks(unsigned int cpuid)
{
    struct cpu_topology *cpu_topo, *cpuid_topo = &cpu_topology[cpuid];
    int cpu;

    /* update core and thread sibling masks */
    for_each_possible_cpu(cpu) {
        cpu_topo = &cpu_topology[cpu];

        if (cpuid_topo->cluster_id != cpu_topo->cluster_id)
            continue;

        cpumask_set_cpu(cpuid, &cpu_topo->core_sibling);
        if (cpu != cpuid)
            cpumask_set_cpu(cpu, &cpuid_topo->core_sibling);

        if (cpuid_topo->core_id != cpu_topo->core_id)
            continue;

        cpumask_set_cpu(cpuid, &cpu_topo->thread_sibling);
        if (cpu != cpuid)
            cpumask_set_cpu(cpu, &cpuid_topo->thread_sibling);
    }
}

Take mt6799 as an example. Its topology is "4*A35 + 4*A53 + 2*A73", defined in the dts as follows:

mt6799.dtsi:

cpus {
        #address-cells = <1>;
        #size-cells = <0>;

        cpu0: cpu@0 {
            device_type = "cpu";
            compatible = "arm,cortex-a35";
            reg = <0x000>;
            enable-method = "psci";
            cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
                      <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
            cpu-release-addr = <0x0 0x40000200>;
            clock-frequency = <1248000000>;
        };

        cpu1: cpu@001 {
            device_type = "cpu";
            compatible = "arm,cortex-a35";
            reg = <0x001>;
            enable-method = "psci";
            cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
                      <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
            cpu-release-addr = <0x0 0x40000200>;
            clock-frequency = <1248000000>;
        };

        cpu2: cpu@002 {
            device_type = "cpu";
            compatible = "arm,cortex-a35";
            reg = <0x002>;
            enable-method = "psci";
            cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
                      <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
            cpu-release-addr = <0x0 0x40000200>;
            clock-frequency = <1248000000>;
        };

        cpu3: cpu@003 {
            device_type = "cpu";
            compatible = "arm,cortex-a35";
            reg = <0x003>;
            enable-method = "psci";
            cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
                      <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
            cpu-release-addr = <0x0 0x40000200>;
            clock-frequency = <1248000000>;
        };

        cpu4: cpu@100 {
            device_type = "cpu";
            compatible = "arm,cortex-a53";
            reg = <0x100>;
            enable-method = "psci";
            cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
                      <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
            cpu-release-addr = <0x0 0x40000200>;
            clock-frequency = <1378000000>;
        };

        cpu5: cpu@101 {
            device_type = "cpu";
            compatible = "arm,cortex-a53";
            reg = <0x101>;
            enable-method = "psci";
            cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
                      <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
            cpu-release-addr = <0x0 0x40000200>;
            clock-frequency = <1378000000>;
        };

        cpu6: cpu@102 {
            device_type = "cpu";
            compatible = "arm,cortex-a53";
            reg = <0x102>;
            enable-method = "psci";
            cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
                      <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
            cpu-release-addr = <0x0 0x40000200>;
            clock-frequency = <1378000000>;
        };

        cpu7: cpu@103 {
            device_type = "cpu";
            compatible = "arm,cortex-a53";
            reg = <0x103>;
            enable-method = "psci";
            cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
                      <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
            cpu-release-addr = <0x0 0x40000200>;
            clock-frequency = <1378000000>;
        };

        cpu8: cpu@200 {
            device_type = "cpu";
            compatible = "arm,cortex-a73";
            reg = <0x200>;
            enable-method = "psci";
            cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
                      <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
            cpu-release-addr = <0x0 0x40000200>;
            clock-frequency = <1638000000>;
        };

        cpu9: cpu@201 {
            device_type = "cpu";
            compatible = "arm,cortex-a73";
            reg = <0x201>;
            enable-method = "psci";
            cpu-idle-states = <&LEGACY_MCDI &LEGACY_SODI &LEGACY_SODI3 &LEGACY_DPIDLE>,
                      <&LEGACY_SUSPEND &MCDI &SODI &SODI3 &DPIDLE &SUSPEND>;
            cpu-release-addr = <0x0 0x40000200>;
            clock-frequency = <1638000000>;
        };

        cpu-map {
            cluster0 {
                core0 {
                    cpu = <&cpu0>;
                };


                core1 {
                    cpu = <&cpu1>;
                };

                core2 {
                    cpu = <&cpu2>;
                };

                core3 {
                    cpu = <&cpu3>;
                };

            };

            cluster1 {
                core0 {
                    cpu = <&cpu4>;
                };

                core1 {
                    cpu = <&cpu5>;
                };

                core2 {
                    cpu = <&cpu6>;
                };

                core3 {
                    cpu = <&cpu7>;
                };

            };

            cluster2 {
                core0 {
                    cpu = <&cpu8>;
                };

                core1 {
                    cpu = <&cpu9>;
                };

            };
        };
};

After parse_dt_topology() and update_siblings_masks() have run, cpu_topology[] holds the following values (a small standalone sketch of how the sibling masks follow from the cluster ids is given after this list):
cpu 0 cluster_id = 0, core_id = 0, core_sibling = 0xf
cpu 1 cluster_id = 0, core_id = 1, core_sibling = 0xf
cpu 2 cluster_id = 0, core_id = 2, core_sibling = 0xf
cpu 3 cluster_id = 0, core_id = 3, core_sibling = 0xf
cpu 4 cluster_id = 1, core_id = 0, core_sibling = 0xf0
cpu 5 cluster_id = 1, core_id = 1, core_sibling = 0xf0
cpu 6 cluster_id = 1, core_id = 2, core_sibling = 0xf0
cpu 7 cluster_id = 1, core_id = 3, core_sibling = 0xf0
cpu 8 cluster_id = 2, core_id = 0, core_sibling = 0x300
cpu 9 cluster_id = 2, core_id = 1, core_sibling = 0x300
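
The sibling masks above follow directly from the cluster_id values. The standalone program below (illustrative only, not kernel code) mirrors the update_siblings_masks() rule for this 4+4+2 layout and prints the same 0xf/0xf0/0x300 masks:

#include <stdio.h>

int main(void)
{
    /* cluster_id per cpu as assigned by parse_dt_topology() for mt6799 */
    const int cluster_id[10] = { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2 };
    unsigned int core_sibling[10] = { 0 };
    int cpu, other;

    /* same rule as update_siblings_masks(): cpus that share a cluster_id
     * set each other's bits in core_sibling */
    for (cpu = 0; cpu < 10; cpu++)
        for (other = 0; other < 10; other++)
            if (cluster_id[cpu] == cluster_id[other])
                core_sibling[cpu] |= 1u << other;

    for (cpu = 0; cpu < 10; cpu++)
        printf("cpu %d core_sibling = 0x%x\n", cpu, core_sibling[cpu]);

    return 0;
}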

4.1.1.3. Initialization of the Scheduling Domains

In kernel_init_freeable(), smp_prepare_cpus() first initializes the cpu topology, smp_init() then brings up the secondary cpus, and right after that sched_init_smp() initializes the system's Scheduling Domains.

By default three topology levels are available: SMT/MC/DIE. arm does not currently support hardware multithreading, so only two levels are used: MC/DIE.

/*
 * Topology list, bottom-up.
 */
static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
    { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) },
#endif
#ifdef CONFIG_SCHED_MC
    { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
    { cpu_cpu_mask, SD_INIT_NAME(DIE) },
    { NULL, },
};

The SDTL (sched_domain_topology_level) table used by arm64 is:

static struct sched_domain_topology_level arm64_topology[] = {
#ifdef CONFIG_SCHED_MC
    { cpu_coregroup_mask, cpu_corepower_flags, cpu_core_energy, SD_INIT_NAME(MC) },
#endif
    { cpu_cpu_mask, NULL, cpu_cluster_energy, SD_INIT_NAME(DIE) },
    { NULL, },
};
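
For this SDTL to make sense it helps to know what the two mask callbacks return. On arm64 of this era they resolve to the cpu_topology[] fields built above, so the MC level spans one cluster and the DIE level spans all cpus of the node. The following is a lightly trimmed sketch (usual kernel headers assumed), not a verbatim quote:

/* arch/arm64/kernel/topology.c (sketch): MC level = one cluster */
const struct cpumask *cpu_coregroup_mask(int cpu)
{
    return &cpu_topology[cpu].core_sibling;
}

/* kernel/sched/sched.h (sketch): DIE level = every cpu of this node */
static inline const struct cpumask *cpu_cpu_mask(int cpu)
{
    return cpumask_of_node(cpu_to_node(cpu));
}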

The Scheduling Domains initialization code is analyzed below:

kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains(cpu_active_mask):

↓

static int init_sched_domains(const struct cpumask *cpu_map)
{
    int err;

    arch_update_cpu_topology();

    /* (1) only one sched-domain partition to set up for now */
    ndoms_cur = 1;
    doms_cur = alloc_sched_domains(ndoms_cur);
    if (!doms_cur)
        doms_cur = &fallback_doms;

    /* (2) build the sched_domains from the cpu_active_mask passed in */
    cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
    err = build_sched_domains(doms_cur[0], NULL);

    /* (3) register "/proc/sys/kernel/sched_domain/" */
    register_sched_domain_sysctl();

    return err;
}

|→

static int build_sched_domains(const struct cpumask *cpu_map,
                   struct sched_domain_attr *attr)
{
    enum s_alloc alloc_state;
    struct sched_domain *sd;
    struct s_data d;
    struct rq *rq = NULL;
    int i, ret = -ENOMEM;

    /* (2.1) at every topology level (tl), allocate sd/sg/sgc storage for every cpu */
    alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
    if (alloc_state != sa_rootdomain)
        goto error;

    /* Set up domains for cpus specified by the cpu_map. */
    for_each_cpu(i, cpu_map) {
        struct sched_domain_topology_level *tl;

        sd = NULL;
        for_each_sd_topology(tl) {
            /* (2.2) initialize the sd:
                set up the parent/child links between the sd's of the different tl's,
                and fill sd->span[] from the tl->mask() function provided by the SDTL
             */
            sd = build_sched_domain(tl, cpu_map, attr, sd, i);

            /* (2.2.1) record the lowest tl's sd in d.sd */
            if (tl == sched_domain_topology)
                *per_cpu_ptr(d.sd, i) = sd;
            if (tl->flags & SDTL_OVERLAP || sched_feat(FORCE_SD_OVERLAP))
                sd->flags |= SD_OVERLAP;
            if (cpumask_equal(cpu_map, sched_domain_span(sd)))
                break;
        }
    }

    /* Build the groups for the domains */
    for_each_cpu(i, cpu_map) {
        for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
            /* (2.3) set sd->span_weight */
            sd->span_weight = cpumask_weight(sched_domain_span(sd));
            if (sd->flags & SD_OVERLAP) {
                if (build_overlap_sched_groups(sd, i))
                    goto error;
            } else {
                /* (2.4) based on the span, build the sd<->sg links at each tl level */
                if (build_sched_groups(sd, i))
                    goto error;
            }
        }
    }

    /* Calculate CPU capacity for physical packages and nodes */
    for (i = nr_cpumask_bits-1; i >= 0; i--) {
        struct sched_domain_topology_level *tl = sched_domain_topology;

        if (!cpumask_test_cpu(i, cpu_map))
            continue;

        for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) {
            /* (2.5) initialize the energy table that sg->sge points to */
            init_sched_energy(i, sd, tl->energy);
            /* (2.6) mark the sd/sg/sgc that are actually referenced;
                unreferenced ones will be freed in __free_domain_allocs()
             */
            claim_allocations(i, sd);
            /* (2.7) initialize sgc->capacity at each tl level */
            init_sched_groups_capacity(i, sd);
        }
    }

    /* Attach the domains */
    rcu_read_lock();
    /* (2.8) attach d.sd to rq->sd
        and d.rd to rq->rd
     */
    for_each_cpu(i, cpu_map) {
        rq = cpu_rq(i);
        sd = *per_cpu_ptr(d.sd, i);
        cpu_attach_domain(sd, d.rd, i);
    }
    rcu_read_unlock();

    ret = 0;
error:
    /* (2.9) free what failed to allocate, or any surplus allocations on success */
    __free_domain_allocs(&d, alloc_state, cpu_map);
    return ret;
}

||→

static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
                           const struct cpumask *cpu_map)
{
    memset(d, 0, sizeof(*d));

    /* (2.1.1) at every tl level, allocate sd/sg/sgc for every cpu:
        tl->data->sd, tl->data->sg, tl->data->sgc
     */
    if (__sdt_alloc(cpu_map))
        return sa_sd_storage;

    /* (2.1.2) allocate the per-cpu pointer array d->sd;
        each entry will later point to the lowest tl's tl->data->sd
     */
    d->sd = alloc_percpu(struct sched_domain *);
    if (!d->sd)
        return sa_sd_storage;

    /* (2.1.3) allocate d->rd (both the pointer and the backing object),
        rd = root_domain
     */
    d->rd = alloc_rootdomain();
    if (!d->rd)
        return sa_sd;
    return sa_rootdomain;
}

||→

struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
        const struct cpumask *cpu_map, struct sched_domain_attr *attr,
        struct sched_domain *child, int cpu)
{
    struct sched_domain *sd = sd_init(tl, cpu);
    if (!sd)
        return child;

    /* (2.2.1) initialize sd->span[] from tl->mask() */
    cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
    if (child) {
        sd->level = child->level + 1;
        sched_domain_level_max = max(sched_domain_level_max, sd->level);

        /* (2.2.2) with multiple tl levels, link the sd's as parent/child;
            on arm the MC-level tl->data->sd is the child and the DIE-level one is the parent
         */
        child->parent = sd;
        sd->child = child;

        if (!cpumask_subset(sched_domain_span(child),
                    sched_domain_span(sd))) {
            pr_err("BUG: arch topology borken\n");
#ifdef CONFIG_SCHED_DEBUG
            pr_err("     the %s domain not a subset of the %s domain\n",
                    child->name, sd->name);
#endif
            /* Fixup, ensure @sd has at least @child cpus. */
            cpumask_or(sched_domain_span(sd),
                   sched_domain_span(sd),
                   sched_domain_span(child));
        }

    }
    set_domain_attribute(sd, attr);

    return sd;
}

||→

static int
build_sched_groups(struct sched_domain *sd, int cpu)
{
    struct sched_group *first = NULL, *last = NULL;
    struct sd_data *sdd = sd->private;
    const struct cpumask *span = sched_domain_span(sd);
    struct cpumask *covered;
    int i;

    /* (2.4.1) link sd and sg according to sd->span[]:
        if the sd has no child, each cpu's sd links to its own sg;
        if the sd has a child, each cpu's sd links to the sg of the first cpu of the child's span
     */
    get_group(cpu, sdd, &sd->groups);
    atomic_inc(&sd->groups->ref);

    if (cpu != cpumask_first(span))
        return 0;

    lockdep_assert_held(&sched_domains_mutex);
    covered = sched_domains_tmpmask;

    cpumask_clear(covered);

    /* (2.4.2) for the sg's that are linked to an sd, fill in sg->cpumask[] */
    for_each_cpu(i, span) {
        struct sched_group *sg;
        int group, j;

        if (cpumask_test_cpu(i, covered))
            continue;

        group = get_group(i, sdd, &sg);
        cpumask_setall(sched_group_mask(sg));

        for_each_cpu(j, span) {
            if (get_group(j, sdd, NULL) != group)
                continue;

            cpumask_set_cpu(j, covered);
            cpumask_set_cpu(j, sched_group_cpus(sg));
        }

        /* (2.4.3) chain the sg's of the same level into a circular list */
        if (!first)
            first = sg;
        if (last)
            last->next = sg;
        last = sg;
    }
    last->next = first;

    return 0;
}

||→

static void init_sched_energy(int cpu, struct sched_domain *sd,
                  sched_domain_energy_f fn)
{
    if (!(fn && fn(cpu)))
        return;

    if (cpu != group_balance_cpu(sd->groups))
        return;

    if (sd->child && !sd->child->groups->sge) {
        pr_err("BUG: EAS setup broken for CPU%d\n", cpu);
#ifdef CONFIG_SCHED_DEBUG
        pr_err("     energy data on %s but not on %s domain\n",
            sd->name, sd->child->name);
#endif
        return;
    }

    check_sched_energy_data(cpu, fn, sched_group_cpus(sd->groups));

    /* (2.5.1) at each tl level, set sg->sge from tl->energy() */
    sd->groups->sge = fn(cpu);
}

||→

static void claim_allocations(int cpu, struct sched_domain *sd)
{
    struct sd_data *sdd = sd->private;

    /* (2.6.1) NULL out the tl->data->sd/sg/sgc entries that are in use (so the generic
        free path skips them); whatever is left will be freed in __free_domain_allocs()
     */

    WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
    *per_cpu_ptr(sdd->sd, cpu) = NULL;

    if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
        *per_cpu_ptr(sdd->sg, cpu) = NULL;

    if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
        *per_cpu_ptr(sdd->sgc, cpu) = NULL;
}

||→

static void init_sched_groups_capacity(int cpu, struct sched_domain *sd)
{
    struct sched_group *sg = sd->groups;

    WARN_ON(!sg);

    do {
        /* (2.7.1) update sg->group_weight */
        sg->group_weight = cpumask_weight(sched_group_cpus(sg));
        sg = sg->next;
    } while (sg != sd->groups);

    if (cpu != group_balance_cpu(sg))
        return;

    /* (2.7.2) update sgc->capacity */
    update_group_capacity(sd, cpu);

    /* (2.7.3) initialize sgc->nr_busy_cpus */
    atomic_set(&sg->sgc->nr_busy_cpus, sg->group_weight);
}

|||→

void update_group_capacity(struct sched_domain *sd, int cpu)
{
    struct sched_domain *child = sd->child;
    struct sched_group *group, *sdg = sd->groups;
    unsigned long capacity;
    unsigned long interval;

    interval = msecs_to_jiffies(sd->balance_interval);
    interval = clamp(interval, 1UL, max_load_balance_interval);
    sdg->sgc->next_update = jiffies + interval;

    if (!child) {
        /* (2.7.2.1) if the sd has no child it is the lowest tl:
            take the maximum compute capacity from arch_scale_cpu_capacity(),
            subtract the rt consumption (rq->rt_avg), and use the result as
            this sd's sg->sgc->capacity
         */
        update_cpu_capacity(sd, cpu);
        return;
    }

    capacity = 0;

    if (child->flags & SD_OVERLAP) {
        /*
         * SD_OVERLAP domains cannot assume that child groups
         * span the current group.
         */

        for_each_cpu(cpu, sched_group_cpus(sdg)) {
            struct sched_group_capacity *sgc;
            struct rq *rq = cpu_rq(cpu);

            /*
             * build_sched_domains() -> init_sched_groups_capacity()
             * gets here before we've attached the domains to the
             * runqueues.
             *
             * Use capacity_of(), which is set irrespective of domains
             * in update_cpu_capacity().
             *
             * This avoids capacity from being 0 and
             * causing divide-by-zero issues on boot.
             */
            if (unlikely(!rq->sd)) {
                capacity += capacity_of(cpu);
                continue;
            }

            sgc = rq->sd->groups->sgc;
            capacity += sgc->capacity;
        }
    } else  {
        /*
         * !SD_OVERLAP domains can assume that child groups
         * span the current group.
         */ 

        /*  (2.7.2.2) if the sd has a child (it is not the lowest tl),
            sgc->capacity is the sum of all the child sg's group->sgc->capacity
         */
        group = child->groups;
        do {
            capacity += group->sgc->capacity;
            group = group->next;
        } while (group != child->groups);
    }

    sdg->sgc->capacity = capacity;
}

||||→

static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
    unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
    struct sched_group *sdg = sd->groups;
    struct max_cpu_capacity *mcc;
    unsigned long max_capacity;
    int max_cap_cpu;
    unsigned long flags;

    /* (2.7.2.1.1) record this cpu's maximum/original capacity
        as returned by arch_scale_cpu_capacity()
     */
    cpu_rq(cpu)->cpu_capacity_orig = capacity;

    mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;

    raw_spin_lock_irqsave(&mcc->lock, flags);
    max_capacity = mcc->val;
    max_cap_cpu = mcc->cpu;

    if ((max_capacity > capacity && max_cap_cpu == cpu) ||
        (max_capacity < capacity)) {
        mcc->val = capacity;
        mcc->cpu = cpu;
#ifdef CONFIG_SCHED_DEBUG
        raw_spin_unlock_irqrestore(&mcc->lock, flags);
        /* pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity); */
        goto skip_unlock;
#endif
    }
    raw_spin_unlock_irqrestore(&mcc->lock, flags);

skip_unlock: __attribute__ ((unused));
    /* (2.7.2.1.2) subtract the capacity consumed by rt:
        rq->rt_avg / (sched_avg_period() + delta) is the fraction of the cpu taken by rt tasks,
        what remains is the capacity available to cfs
     */
    capacity *= scale_rt_capacity(cpu);
    capacity >>= SCHED_CAPACITY_SHIFT;

    if (!capacity)
        capacity = 1;

    cpu_rq(cpu)->cpu_capacity = capacity;
    sdg->sgc->capacity = capacity;
}

init_sched_domains() builds the sched_domains at boot time. When the set of online cpus changes because of cpu hotplug, partition_sched_domains() is called to rebuild the system's sched_domains:

cpu_up() -> _cpu_up() -> __raw_notifier_call_chain() -> cpuset_cpu_active() -> cpuset_update_active_cpus() -> partition_sched_domains() -> build_sched_domains();

void __init sched_init_smp(void)
{
    /* ... */
    hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
    hotcpu_notifier(cpuset_cpu_inactive, CPU_PRI_CPUSET_INACTIVE);
    /* ... */
}

static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action,
                 void *hcpu)
{
    switch (action) {
    case CPU_ONLINE_FROZEN:
    case CPU_DOWN_FAILED_FROZEN:

        /*
         * num_cpus_frozen tracks how many CPUs are involved in suspend
         * resume sequence. As long as this is not the last online
         * operation in the resume sequence, just build a single sched
         * domain, ignoring cpusets.
         */
        num_cpus_frozen--;
        if (likely(num_cpus_frozen)) {
            partition_sched_domains(1, NULL, NULL);
            break;
        }

        /*
         * This is the last CPU online operation. So fall through and
         * restore the original sched domains by considering the
         * cpuset configurations.
         */

    case CPU_ONLINE:
        cpuset_update_active_cpus(true);
        break;
    default:
        return NOTIFY_DONE;
    }
    return NOTIFY_OK;
}

static int cpuset_cpu_inactive(struct notifier_block *nfb, unsigned long action,
                   void *hcpu)
{
    unsigned long flags;
    long cpu = (long)hcpu;
    struct dl_bw *dl_b;
    bool overflow;
    int cpus;

    switch (action) {
    case CPU_DOWN_PREPARE:
        rcu_read_lock_sched();
        dl_b = dl_bw_of(cpu);

        raw_spin_lock_irqsave(&dl_b->lock, flags);
        cpus = dl_bw_cpus(cpu);
        overflow = __dl_overflow(dl_b, cpus, 0, 0);
        raw_spin_unlock_irqrestore(&dl_b->lock, flags);

        rcu_read_unlock_sched();

        if (overflow)
            return notifier_from_errno(-EBUSY);
        cpuset_update_active_cpus(false);
        break;
    case CPU_DOWN_PREPARE_FROZEN:
        num_cpus_frozen++;
        partition_sched_domains(1, NULL, NULL);
        break;
    default:
        return NOTIFY_DONE;
    }
    return NOTIFY_OK;
}

4.1.1.4. Scheduling Domains on mt6799

Because "maxcpus=8" is passed on the cmdline, setup_max_cpus = 8 and SMP bring-up only starts 8 cores; the other 2 big cores of mt6799 are brought online later. Let us look at what the Scheduling Domains look like while only these 8 cores are up.

During the build, every tl level allocates sd/sg/sgc memory for every cpu, but once the effective links are established some of the sg/sgc objects remain unused. The unused objects are marked in claim_allocations() and then freed by __free_domain_allocs() before build_sched_domains() returns.

kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains() -> build_sched_domains() ->  __visit_domain_allocation_hell() -> __sdt_alloc():

[__sdt_alloc][tl MC] cpu0, &sd = 0xffffffc15663c600, &sg = 0xffffffc156062600, &sgc = 0xffffffc156062780 
[__sdt_alloc][tl MC] cpu1, &sd = 0xffffffc15608f000, &sg = 0xffffffc156056780, &sgc = 0xffffffc156090000 
[__sdt_alloc][tl MC] cpu2, &sd = 0xffffffc15608fc00, &sg = 0xffffffc156090d80, &sgc = 0xffffffc156090180 
[__sdt_alloc][tl MC] cpu3, &sd = 0xffffffc15608f300, &sg = 0xffffffc156090c00, &sgc = 0xffffffc156090300 
[__sdt_alloc][tl MC] cpu4, &sd = 0xffffffc15608f900, &sg = 0xffffffc156090a80, &sgc = 0xffffffc156090480 
[__sdt_alloc][tl MC] cpu5, &sd = 0xffffffc15608f600, &sg = 0xffffffc156090900, &sgc = 0xffffffc156090600 
[__sdt_alloc][tl MC] cpu6, &sd = 0xffffffc156091000, &sg = 0xffffffc156090780, &sgc = 0xffffffc156092000 
[__sdt_alloc][tl MC] cpu7, &sd = 0xffffffc156091c00, &sg = 0xffffffc156092d80, &sgc = 0xffffffc156092180 

[__sdt_alloc][tl DIE] cpu0, &sd = 0xffffffc156091300, &sg = 0xffffffc156092c00, &sgc = 0xffffffc156092300 
[__sdt_alloc][tl DIE] cpu1, &sd = 0xffffffc156091900, &sg = 0xffffffc156092a80, &sgc = 0xffffffc156092480 
[__sdt_alloc][tl DIE] cpu2, &sd = 0xffffffc156091600, &sg = 0xffffffc156092900, &sgc = 0xffffffc156092600 
[__sdt_alloc][tl DIE] cpu3, &sd = 0xffffffc156093000, &sg = 0xffffffc156092780, &sgc = 0xffffffc156094000 
[__sdt_alloc][tl DIE] cpu4, &sd = 0xffffffc156093c00, &sg = 0xffffffc156094d80, &sgc = 0xffffffc156094180 
[__sdt_alloc][tl DIE] cpu5, &sd = 0xffffffc156093300, &sg = 0xffffffc156094c00, &sgc = 0xffffffc156094300 
[__sdt_alloc][tl DIE] cpu6, &sd = 0xffffffc156093900, &sg = 0xffffffc156094a80, &sgc = 0xffffffc156094480 
[__sdt_alloc][tl DIE] cpu7, &sd = 0xffffffc156093600, &sg = 0xffffffc156094900, &sgc = 0xffffffc156094600 

After the links are established, the sd/sg relationships at each tl level are:

kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains() -> build_sched_domains() -> build_sched_groups():

[build_sched_domains][tl MC] cpu0, sd->groups=0xffffffc156062600, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf
[build_sched_domains][tl MC] cpu0, sg->sgc=0xffffffc156062780, sg->next=0xffffffc156056780, sg->group_weight=0, sg->cpumask[]=0x1
[build_sched_domains][tl MC] cpu0, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu0, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 
[build_sched_domains][tl MC] cpu0, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu1, sd->groups=0xffffffc156056780, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf
[build_sched_domains][tl MC] cpu1, sg->sgc=0xffffffc156090000, sg->next=0xffffffc156090d80, sg->group_weight=0, sg->cpumask[]=0x2
[build_sched_domains][tl MC] cpu1, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu1, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 
[build_sched_domains][tl MC] cpu1, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu2, sd->groups=0xffffffc156090d80, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf
[build_sched_domains][tl MC] cpu2, sg->sgc=0xffffffc156090180, sg->next=0xffffffc156090c00, sg->group_weight=0, sg->cpumask[]=0x4
[build_sched_domains][tl MC] cpu2, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu2, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 
[build_sched_domains][tl MC] cpu2, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu3, sd->groups=0xffffffc156090c00, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf
[build_sched_domains][tl MC] cpu3, sg->sgc=0xffffffc156090300, sg->next=0xffffffc156062600, sg->group_weight=0, sg->cpumask[]=0x8
[build_sched_domains][tl MC] cpu3, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu3, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 
[build_sched_domains][tl MC] cpu3, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu4, sd->groups=0xffffffc156090a80, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0
[build_sched_domains][tl MC] cpu4, sg->sgc=0xffffffc156090480, sg->next=0xffffffc156090900, sg->group_weight=0, sg->cpumask[]=0x10
[build_sched_domains][tl MC] cpu4, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu4, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 
[build_sched_domains][tl MC] cpu4, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu5, sd->groups=0xffffffc156090900, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0
[build_sched_domains][tl MC] cpu5, sg->sgc=0xffffffc156090600, sg->next=0xffffffc156090780, sg->group_weight=0, sg->cpumask[]=0x20
[build_sched_domains][tl MC] cpu5, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu5, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 
[build_sched_domains][tl MC] cpu5, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu6, sd->groups=0xffffffc156090780, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0
[build_sched_domains][tl MC] cpu6, sg->sgc=0xffffffc156092000, sg->next=0xffffffc156092d80, sg->group_weight=0, sg->cpumask[]=0x40
[build_sched_domains][tl MC] cpu6, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu6, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 
[build_sched_domains][tl MC] cpu6, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|
[build_sched_domains][tl MC] cpu7, sd->groups=0xffffffc156092d80, sd->span_weight=4, sd->balance_interval=4, sd->span[]=0xf0
[build_sched_domains][tl MC] cpu7, sg->sgc=0xffffffc156092180, sg->next=0xffffffc156090a80, sg->group_weight=0, sg->cpumask[]=0x80
[build_sched_domains][tl MC] cpu7, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl MC] cpu7, sd->min_interval=4, sd->max_interval=8, sd->busy_factor=32, sd->imbalance_pct=117, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=0, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=4, sd->balance_interval=4, sd->level=0 
[build_sched_domains][tl MC] cpu7, sd->flags=0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES|


[build_sched_domains][tl DIE] cpu0, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu0, sg->sgc=0xffffffc156092300, sg->next=0xffffffc156094d80, sg->group_weight=0, sg->cpumask[]=0xf
[build_sched_domains][tl DIE] cpu0, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl DIE] cpu0, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 
[build_sched_domains][tl DIE] cpu0, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu1, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu1, sg->sgc=0x0, sg->next=0xffffffc156092a80, sg->group_weight=0, sg->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu1, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu1, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 
[build_sched_domains][tl DIE] cpu1, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu2, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu2, sg->sgc=0x0, sg->next=0xffffffc156092900, sg->group_weight=0, sg->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu2, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu2, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 
[build_sched_domains][tl DIE] cpu2, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu3, sd->groups=0xffffffc156092c00, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu3, sg->sgc=0x0, sg->next=0xffffffc156092780, sg->group_weight=0, sg->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu3, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu3, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 
[build_sched_domains][tl DIE] cpu3, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu4, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu4, sg->sgc=0xffffffc156094180, sg->next=0xffffffc156092c00, sg->group_weight=0, sg->cpumask[]=0xf0
[build_sched_domains][tl DIE] cpu4, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x3ff
[build_sched_domains][tl DIE] cpu4, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 
[build_sched_domains][tl DIE] cpu4, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu5, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu5, sg->sgc=0x0, sg->next=0xffffffc156094c00, sg->group_weight=0, sg->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu5, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu5, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 
[build_sched_domains][tl DIE] cpu5, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu6, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu6, sg->sgc=0x0, sg->next=0xffffffc156094a80, sg->group_weight=0, sg->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu6, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu6, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 
[build_sched_domains][tl DIE] cpu6, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|
[build_sched_domains][tl DIE] cpu7, sd->groups=0xffffffc156094d80, sd->span_weight=8, sd->balance_interval=8, sd->span[]=0xff
[build_sched_domains][tl DIE] cpu7, sg->sgc=0x0, sg->next=0xffffffc156094900, sg->group_weight=0, sg->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu7, sgc->capacity=0, sgc->next_update=0, sgc->nr_busy_cpus=0, sgc->cpumask[]=0x0
[build_sched_domains][tl DIE] cpu7, sd->min_interval=8, sd->max_interval=16, sd->busy_factor=32, sd->imbalance_pct=125, sd->cache_nice_tries=1, sd->busy_idx=2, sd->idle_idx=1, sd->newidle_idx=0, sd->wake_idx=0,  sd->forkexec_idx=0, sd->span_weight=8, sd->balance_interval=8, sd->level=1 
[build_sched_domains][tl DIE] cpu7, sd->flags=0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING|

Expressed graphically, the relationships look like this:

[Figure: sd/sg/sgc links at the MC and DIE levels (original image not reproduced)]

The parameters inside each sched_domain are also important; they are initialized in sd_init() and are used heavily by the SMP load-balancing code:

sd field                tl MC level    tl DIE level
sd->min_interval        4              8
sd->max_interval        8              16
sd->busy_factor         32             32
sd->imbalance_pct       117            125
sd->cache_nice_tries    1              1
sd->busy_idx            2              2
sd->idle_idx            0              1
sd->newidle_idx         0              0
sd->wake_idx            0              0
sd->forkexec_idx        0              0
sd->span_weight         4              8
sd->balance_interval    4              8
sd->level               0              1
sd->flags               MC:  0x832f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_SHARE_POWERDOMAIN|SD_SHARE_PKG_RESOURCES|SD_SHARE_CAP_STATES
                        DIE: 0x102f: SD_LOAD_BALANCE|SD_BALANCE_NEWIDLE|SD_BALANCE_EXEC|SD_BALANCE_FORK|SD_WAKE_AFFINE|SD_PREFER_SIBLING
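
To give these numbers some meaning, here is a sketch of how two of them are consumed on the periodic balance path. It is modelled on get_sd_balance_interval() of this kernel era (the helper name sd_interval_jiffies is made up): balance_interval is in ms and is stretched by busy_factor when the cpu is busy, so a busy MC-level domain rebalances roughly every 4*32 ms while an idle one retries about every 4 ms. Similarly, imbalance_pct (117/125) means the busiest group's load must exceed imbalance_pct/100 times the local group's load before a balance is attempted.

static unsigned long sd_interval_jiffies(struct sched_domain *sd, int cpu_busy)
{
    unsigned long interval = sd->balance_interval;   /* ms: 4 (MC) / 8 (DIE) */

    if (cpu_busy)
        interval *= sd->busy_factor;                 /* 32 at both levels */

    /* scale ms to jiffies and keep it within sane bounds */
    interval = msecs_to_jiffies(interval);
    return clamp(interval, 1UL, max_load_balance_interval);
}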

update_top_cache_domain() additionally caches a few frequently used sd pointers; the debug prints below show which level each cached pointer actually corresponds to:

cached sd     per-cpu variable              value on this platform
sd_busy       per_cpu(sd_busy, cpu)         this cpu's tl DIE level sd
sd_llc        per_cpu(sd_llc, cpu)          this cpu's tl MC level sd
sd_llc_size   per_cpu(sd_llc_size, cpu)     4
sd_llc_id     per_cpu(sd_llc_id, cpu)       0 / 4
sd_numa       per_cpu(sd_numa, cpu)         0
sd_asym       per_cpu(sd_asym, cpu)         0
sd_ea         per_cpu(sd_ea, cpu)           this cpu's tl DIE level sd
sd_scs        per_cpu(sd_scs, cpu)          this cpu's tl MC level sd

static void update_top_cache_domain(int cpu)
{
    struct sched_domain *sd;
    struct sched_domain *busy_sd = NULL, *ea_sd = NULL;
    int id = cpu;
    int size = 1;

    sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
    if (sd) {
        id = cpumask_first(sched_domain_span(sd));
        size = cpumask_weight(sched_domain_span(sd));
        busy_sd = sd->parent; /* sd_busy */
    }
    rcu_assign_pointer(per_cpu(sd_busy, cpu), busy_sd);

    rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
    per_cpu(sd_llc_size, cpu) = size;
    per_cpu(sd_llc_id, cpu) = id;

    sd = lowest_flag_domain(cpu, SD_NUMA);
    rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);

    sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
    rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);

    for_each_domain(cpu, sd) {
        if (sd->groups->sge)
            ea_sd = sd;
        else
            break;
    }
    rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);

    sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
    rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
}
[update_top_cache_domain] cpu0, sd_busy=0xffffffc156091300, sd_llc=0xffffffc15663c600, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156091300, sd_scs=0xffffffc15663c600
[update_top_cache_domain] cpu1, sd_busy=0xffffffc156091900, sd_llc=0xffffffc15608f000, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156091900, sd_scs=0xffffffc15608f000
[update_top_cache_domain] cpu2, sd_busy=0xffffffc156091600, sd_llc=0xffffffc15608fc00, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156091600, sd_scs=0xffffffc15608fc00
[update_top_cache_domain] cpu3, sd_busy=0xffffffc156093000, sd_llc=0xffffffc15608f300, sd_llc_size=4, sd_llc_id=0, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093000, sd_scs=0xffffffc15608f300
[update_top_cache_domain] cpu4, sd_busy=0xffffffc156093c00, sd_llc=0xffffffc15608f900, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093c00, sd_scs=0xffffffc15608f900
[update_top_cache_domain] cpu5, sd_busy=0xffffffc156093300, sd_llc=0xffffffc15608f600, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093300, sd_scs=0xffffffc15608f600
[update_top_cache_domain] cpu6, sd_busy=0xffffffc156093900, sd_llc=0xffffffc156091000, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093900, sd_scs=0xffffffc156091000
[update_top_cache_domain] cpu7, sd_busy=0xffffffc156093600, sd_llc=0xffffffc156091c00, sd_llc_size=4, sd_llc_id=4, sd_numa=0x0, sd_asym=0x0, sd_ea=0xffffffc156093600, sd_scs=0xffffffc156091c00
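
As an example of why these pointers are cached: wake-up placement wants "an idle cpu that shares the last-level cache with the target", which is exactly the span of per_cpu(sd_llc, target); the real consumer in this kernel era is select_idle_sibling() in kernel/sched/fair.c. Below is a simplified sketch, not the kernel function (find_idle_llc_cpu is a made-up name; the caller is assumed to hold rcu_read_lock()):

static int find_idle_llc_cpu(int target)
{
    struct sched_domain *sd = rcu_dereference(per_cpu(sd_llc, target));
    int cpu;

    if (!sd)
        return target;

    /* scan the MC-level span (one cluster on mt6799) for an idle cpu */
    for_each_cpu(cpu, sched_domain_span(sd)) {
        if (idle_cpu(cpu))
            return cpu;
    }
    return target;
}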

The table entries that mt6799 uses when computing energy and capacity are as follows:

kernel_init() -> kernel_init_freeable() -> sched_init_smp() -> init_sched_domains() -> build_sched_domains() -> init_sched_energy()/init_sched_groups_capacity();


/* v1 FY */
struct upower_tbl_info upower_tbl_infos_FY[NR_UPOWER_BANK] = {
    INIT_UPOWER_TBL_INFOS(UPOWER_BANK_LL, upower_tbl_ll_1_FY),
    INIT_UPOWER_TBL_INFOS(UPOWER_BANK_L, upower_tbl_l_1_FY),
    INIT_UPOWER_TBL_INFOS(UPOWER_BANK_B, upower_tbl_b_1_FY),
    INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CLS_LL, upower_tbl_cluster_ll_1_FY),
    INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CLS_L, upower_tbl_cluster_l_1_FY),
    INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CLS_B, upower_tbl_cluster_b_1_FY),
    INIT_UPOWER_TBL_INFOS(UPOWER_BANK_CCI, upower_tbl_cci_1_FY),
};

/* ver1 */
/* FY table */
struct upower_tbl upower_tbl_ll_1_FY = {
    .row = {
        {.cap = 100, .volt = 75000, .dyn_pwr = 9994, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} },
        {.cap = 126, .volt = 75000, .dyn_pwr = 12585, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} },
        {.cap = 148, .volt = 75000, .dyn_pwr = 14806, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} },
        {.cap = 167, .volt = 75000, .dyn_pwr = 16656, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} },
        {.cap = 189, .volt = 75000, .dyn_pwr = 18877, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} },
        {.cap = 212, .volt = 75000, .dyn_pwr = 21098, .lkg_pwr = {13681, 13681, 13681, 13681, 13681, 13681} },
        {.cap = 230, .volt = 75700, .dyn_pwr = 23379, .lkg_pwr = {13936, 13936, 13936, 13936, 13936, 13936} },
        {.cap = 245, .volt = 78100, .dyn_pwr = 26490, .lkg_pwr = {14811, 14811, 14811, 14811, 14811, 14811} },
        {.cap = 263, .volt = 81100, .dyn_pwr = 30729, .lkg_pwr = {15958, 15958, 15958, 15958, 15958, 15958} },
        {.cap = 278, .volt = 83500, .dyn_pwr = 34409, .lkg_pwr = {16949, 16949, 16949, 16949, 16949, 16949} },
        {.cap = 293, .volt = 86000, .dyn_pwr = 38447, .lkg_pwr = {18036, 18036, 18036, 18036, 18036, 18036} },
        {.cap = 304, .volt = 88400, .dyn_pwr = 42166, .lkg_pwr = {19159, 19159, 19159, 19159, 19159, 19159} },
        {.cap = 319, .volt = 90800, .dyn_pwr = 46657, .lkg_pwr = {20333, 20333, 20333, 20333, 20333, 20333} },
        {.cap = 334, .volt = 93200, .dyn_pwr = 51442, .lkg_pwr = {21605, 21605, 21605, 21605, 21605, 21605} },
        {.cap = 345, .volt = 95000, .dyn_pwr = 55230, .lkg_pwr = {22560, 22560, 22560, 22560, 22560, 22560} },
        {.cap = 356, .volt = 97400, .dyn_pwr = 59928, .lkg_pwr = {24002, 24002, 24002, 24002, 24002, 24002} },
    },
    .lkg_idx = DEFAULT_LKG_IDX,
    .row_num = UPOWER_OPP_NUM,
    .nr_idle_states = NR_UPOWER_CSTATES,
    .idle_states = {
        {{0}, {7321} },
        {{0}, {7321} },
        {{0}, {7321} },
        {{0}, {7321} },
        {{0}, {7321} },
        {{0}, {7321} },
    },
};
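
Taking the LL table above as an example, here is a minimal sketch of how one row can be folded into an energy estimate. It mirrors the general shape of the EAS group-energy calculation of this era (busy power charged in proportion to utilization, idle-state power charged for the remaining time), but the function and parameter names are made up for illustration and lkg_idx is assumed to pick the leakage column:

/* illustrative only: energy estimate for one cpu running at one OPP row */
static unsigned long estimate_row_energy(unsigned long util,     /* 0..cap, clamped by caller */
                                         unsigned long cap,      /* .cap */
                                         unsigned long dyn_pwr,  /* .dyn_pwr */
                                         unsigned long lkg_pwr,  /* .lkg_pwr[lkg_idx] */
                                         unsigned long idle_pwr) /* .idle_states power */
{
    unsigned long busy_energy = util * (dyn_pwr + lkg_pwr) / cap;
    unsigned long idle_energy = (cap - util) * idle_pwr / cap;

    return busy_energy + idle_energy;
}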

struct upower_tbl upower_tbl_cluster_ll_1_FY = {
    .row = {
        {.cap = 100, .volt = 75000, .dyn_pwr = 3656, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} },
        {.cap = 126, .volt = 75000, .dyn_pwr = 4604, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} },
        {.cap = 148, .volt = 75000, .dyn_pwr = 5417, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} },
        {.cap = 167, .volt = 75000, .dyn_pwr = 6094, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} },
        {.cap = 189, .volt = 75000, .dyn_pwr = 6906, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} },
        {.cap = 212, .volt = 75000, .dyn_pwr = 7719, .lkg_pwr = {21729, 21729, 21729, 21729, 21729, 21729} },
        {.cap = 230, .volt = 75700, .dyn_pwr = 8553, .lkg_pwr = {22134, 22134, 22134, 22134, 22134, 22134} },
        {.cap = 245, .volt = 78100, .dyn_pwr = 9692, .lkg_pwr = {23523, 23523, 23523, 23523, 23523, 23523} },
        {.cap = 263, .volt = 81100, .dyn_pwr = 11242, .lkg_pwr = {25344, 25344, 25344, 25344, 25344, 25344} },
        {.cap = 278, .volt = 83500, .dyn_pwr = 12589, .lkg_pwr = {26919, 26919, 26919, 26919, 26919, 26919} },
        {.cap = 293, .volt = 86000, .dyn_pwr = 14066, .lkg_pwr = {28646, 28646, 28646, 28646, 28646, 28646} },
        {.cap = 304, .volt = 88400, .dyn_pwr = 15427, .lkg_pwr = {30430, 30430, 30430, 30430, 30430, 30430} },
        {.cap = 319, .volt = 90800, .dyn_pwr = 17069, .lkg_pwr = {32293, 32293, 32293, 32293, 32293, 32293} },
        {.cap = 334, .volt = 93200, .dyn_pwr = 18820, .lkg_pwr = {34314, 34314, 34314, 34314, 34314, 34314} },
        {.cap = 345, .volt = 95000, .dyn_pwr = 20206, .lkg_pwr = {35830, 35830, 35830, 35830, 35830, 35830} },
        {.cap = 356, .volt = 97400, .dyn_pwr = 21925, .lkg_pwr = {38121, 38121, 38121, 38121, 38121, 38121} },
    },
    .lkg_idx = DEFAULT_LKG_IDX,
    .row_num = UPOWER_OPP_NUM,
    .nr_idle_states = NR_UPOWER_CSTATES,
    .idle_states = {
        {{0}, {11628} },
        {{0}, {11628} },
        {{0}, {11628} },
        {{0}, {11628} },
        {{0}, {11628} },
        {{0}, {11628} },
    },
};

struct upower_tbl upower_tbl_l_1_FY = {
    .row = {
        {.cap = 116, .volt = 75000, .dyn_pwr = 16431, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} },
        {.cap = 152, .volt = 75000, .dyn_pwr = 21486, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} },
        {.cap = 179, .volt = 75000, .dyn_pwr = 25278, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} },
        {.cap = 201, .volt = 75000, .dyn_pwr = 28437, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} },
        {.cap = 228, .volt = 75000, .dyn_pwr = 32229, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} },
        {.cap = 255, .volt = 75000, .dyn_pwr = 36021, .lkg_pwr = {22973, 22973, 22973, 22973, 22973, 22973} },
        {.cap = 282, .volt = 75700, .dyn_pwr = 40559, .lkg_pwr = {23423, 23423, 23423, 23423, 23423, 23423} },
        {.cap = 304, .volt = 78100, .dyn_pwr = 46598, .lkg_pwr = {24968, 24968, 24968, 24968, 24968, 24968} },
        {.cap = 331, .volt = 81100, .dyn_pwr = 54680, .lkg_pwr = {26999, 26999, 26999, 26999, 26999, 26999} },
        {.cap = 349, .volt = 83500, .dyn_pwr = 61098, .lkg_pwr = {28760, 28760, 28760, 28760, 28760, 28760} },
        {.cap = 371, .volt = 86000, .dyn_pwr = 68965, .lkg_pwr = {30698, 30698, 30698, 30698, 30698, 30698} },
        {.cap = 393, .volt = 88400, .dyn_pwr = 77258, .lkg_pwr = {32706, 32706, 32706, 32706, 32706, 32706} },
        {.cap = 416, .volt = 90800, .dyn_pwr = 86141, .lkg_pwr = {34808, 34808, 34808, 34808, 34808, 34808} },
        {.cap = 438, .volt = 93200, .dyn_pwr = 95634, .lkg_pwr = {37097, 37097, 37097, 37097, 37097, 37097} },
        {.cap = 452, .volt = 95000, .dyn_pwr = 102406, .lkg_pwr = {38814, 38814, 38814, 38814, 38814, 38814} },
        {.cap = 474, .volt = 97400, .dyn_pwr = 112974, .lkg_pwr = {41424, 41424, 41424, 41424, 41424, 41424} },
    },
    .lkg_idx = DEFAULT_LKG_IDX,
    .row_num = UPOWER_OPP_NUM,
    .nr_idle_states = NR_UPOWER_CSTATES,
    .idle_states = {
        {{0}, {11926} },
        {{0}, {11926} },
        {{0}, {11926} },
        {{0}, {11926} },
        {{0}, {11926} },
        {{0}, {11926} },
    },
};

struct upower_tbl upower_tbl_cluster_l_1_FY = {
    .row = {
        {.cap = 116, .volt = 75000, .dyn_pwr = 2778, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} },
        {.cap = 152, .volt = 75000, .dyn_pwr = 3633, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} },
        {.cap = 179, .volt = 75000, .dyn_pwr = 4274, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} },
        {.cap = 201, .volt = 75000, .dyn_pwr = 4808, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} },
        {.cap = 228, .volt = 75000, .dyn_pwr = 5449, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} },
        {.cap = 255, .volt = 75000, .dyn_pwr = 6090, .lkg_pwr = {26537, 26537, 26537, 26537, 26537, 26537} },
        {.cap = 282, .volt = 75700, .dyn_pwr = 6857, .lkg_pwr = {27058, 27058, 27058, 27058, 27058, 27058} },
        {.cap = 304, .volt = 78100, .dyn_pwr = 7878, .lkg_pwr = {28843, 28843, 28843, 28843, 28843, 28843} },
        {.cap = 331, .volt = 81100, .dyn_pwr = 9245, .lkg_pwr = {31188, 31188, 31188, 31188, 31188, 31188} },
        {.cap = 349, .volt = 83500, .dyn_pwr = 10330, .lkg_pwr = {33223, 33223, 33223, 33223, 33223, 33223} },
        {.cap = 371, .volt = 86000, .dyn_pwr = 11660, .lkg_pwr = {35461, 35461, 35461, 35461, 35461, 35461} },
        {.cap = 393, .volt = 88400, .dyn_pwr = 13062, .lkg_pwr = {37781, 37781, 37781, 37781, 37781, 37781} },
        {.cap = 416, .volt = 90800, .dyn_pwr = 14564, .lkg_pwr = {40209, 40209, 40209, 40209, 40209, 40209} },
        {.cap = 438, .volt = 93200, .dyn_pwr = 16169, .lkg_pwr = {42854, 42854, 42854, 42854, 42854, 42854} },
        {.cap = 452, .volt = 95000, .dyn_pwr = 17314, .lkg_pwr = {44837, 44837, 44837, 44837, 44837, 44837} },
        {.cap = 474, .volt = 97400, .dyn_pwr = 19101, .lkg_pwr = {47852, 47852, 47852, 47852, 47852, 47852} },
    },
    .lkg_idx = DEFAULT_LKG_IDX,
    .row_num = UPOWER_OPP_NUM,
    .nr_idle_states = NR_UPOWER_CSTATES,
    .idle_states = {
        {{0}, {13776} },
        {{0}, {13776} },
        {{0}, {13776} },
        {{0}, {13776} },
        {{0}, {13776} },
        {{0}, {13776} },
    },
};

struct upower_tbl upower_tbl_b_1_FY = {
    .row = {
        {.cap = 211, .volt = 75000, .dyn_pwr = 61732, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} },
        {.cap = 268, .volt = 75000, .dyn_pwr = 78352, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} },
        {.cap = 317, .volt = 75000, .dyn_pwr = 92598, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} },
        {.cap = 358, .volt = 75000, .dyn_pwr = 104469, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} },
        {.cap = 406, .volt = 75000, .dyn_pwr = 118715, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} },
        {.cap = 447, .volt = 75000, .dyn_pwr = 130587, .lkg_pwr = {71164, 71164, 71164, 71164, 71164, 71164} },
        {.cap = 504, .volt = 75700, .dyn_pwr = 149968, .lkg_pwr = {72438, 72438, 72438, 72438, 72438, 72438} },
        {.cap = 561, .volt = 78100, .dyn_pwr = 177650, .lkg_pwr = {76806, 76806, 76806, 76806, 76806, 76806} },
        {.cap = 634, .volt = 81100, .dyn_pwr = 216546, .lkg_pwr = {82521, 82521, 82521, 82521, 82521, 82521} },
        {.cap = 691, .volt = 83500, .dyn_pwr = 250153, .lkg_pwr = {87447, 87447, 87447, 87447, 87447, 87447} },
        {.cap = 748, .volt = 86000, .dyn_pwr = 287210, .lkg_pwr = {92841, 92841, 92841, 92841, 92841, 92841} },
        {.cap = 805, .volt = 88400, .dyn_pwr = 326553, .lkg_pwr = {98397, 98397, 98397, 98397, 98397, 98397} },
        {.cap = 861, .volt = 90800, .dyn_pwr = 368886, .lkg_pwr = {104190, 104190, 104190, 104190, 104190, 104190} },
        {.cap = 918, .volt = 93200, .dyn_pwr = 414309, .lkg_pwr = {110456, 110456, 110456, 110456, 110456, 110456} },
        {.cap = 959, .volt = 95000, .dyn_pwr = 449514, .lkg_pwr = {115156, 115156, 115156, 115156, 115156, 115156} },
        {.cap = 1024, .volt = 97400, .dyn_pwr = 504548, .lkg_pwr = {122224, 122224, 122224, 122224, 122224, 122224} },
    },
    .lkg_idx = DEFAULT_LKG_IDX,
    .row_num = UPOWER_OPP_NUM,
    .nr_idle_states = NR_UPOWER_CSTATES,
    .idle_states = {
        {{0}, {38992} },
        {{0}, {38992} },
        {{0}, {38992} },
        {{0}, {38992} },
        {{0}, {38992} },
        {{0}, {38992} },
    },
};

struct upower_tbl upower_tbl_cluster_b_1_FY = {
    .row = {
        {.cap = 211, .volt = 75000, .dyn_pwr = 6408, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} },
        {.cap = 268, .volt = 75000, .dyn_pwr = 8133, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} },
        {.cap = 317, .volt = 75000, .dyn_pwr = 9612, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} },
        {.cap = 358, .volt = 75000, .dyn_pwr = 10844, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} },
        {.cap = 406, .volt = 75000, .dyn_pwr = 12323, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} },
        {.cap = 447, .volt = 75000, .dyn_pwr = 13555, .lkg_pwr = {27561, 27561, 27561, 27561, 27561, 27561} },
        {.cap = 504, .volt = 75700, .dyn_pwr = 15567, .lkg_pwr = {28054, 28054, 28054, 28054, 28054, 28054} },
        {.cap = 561, .volt = 78100, .dyn_pwr = 18440, .lkg_pwr = {29746, 29746, 29746, 29746, 29746, 29746} },
        {.cap = 634, .volt = 81100, .dyn_pwr = 22478, .lkg_pwr = {31959, 31959, 31959, 31959, 31959, 31959} },
        {.cap = 691, .volt = 83500, .dyn_pwr = 25966, .lkg_pwr = {33867, 33867, 33867, 33867, 33867, 33867} },
        {.cap = 748, .volt = 86000, .dyn_pwr = 29813, .lkg_pwr = {35956, 35956, 35956, 35956, 35956, 35956} },
        {.cap = 805, .volt = 88400, .dyn_pwr = 33897, .lkg_pwr = {38108, 38108, 38108, 38108, 38108, 38108} },
        {.cap = 861, .volt = 90800, .dyn_pwr = 38291, .lkg_pwr = {40351, 40351, 40351, 40351, 40351, 40351} },
        {.cap = 918, .volt = 93200, .dyn_pwr = 43006, .lkg_pwr = {42778, 42778, 42778, 42778, 42778, 42778} },
        {.cap = 959, .volt = 95000, .dyn_pwr = 46661, .lkg_pwr = {44598, 44598, 44598, 44598, 44598, 44598} },
        {.cap = 1024, .volt = 97400, .dyn_pwr = 52373, .lkg_pwr = {47335, 47335, 47335, 47335, 47335, 47335} },
    },
    .lkg_idx = DEFAULT_LKG_IDX,
    .row_num = UPOWER_OPP_NUM,
    .nr_idle_states = NR_UPOWER_CSTATES,
    .idle_states = {
        {{0}, {15101} },
        {{0}, {15101} },
        {{0}, {15101} },
        {{0}, {15101} },
        {{0}, {15101} },
        {{0}, {15101} },
    },
};

struct upower_tbl upower_tbl_cci_1_FY = {
    .row = {
        {.cap = 0, .volt = 75000, .dyn_pwr = 2708, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} },
        {.cap = 0, .volt = 75000, .dyn_pwr = 3611, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} },
        {.cap = 0, .volt = 75000, .dyn_pwr = 4288, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} },
        {.cap = 0, .volt = 75000, .dyn_pwr = 5191, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} },
        {.cap = 0, .volt = 75000, .dyn_pwr = 5868, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} },
        {.cap = 0, .volt = 75000, .dyn_pwr = 6771, .lkg_pwr = {16248, 16248, 16248, 16248, 16248, 16248} },
        {.cap = 0, .volt = 75700, .dyn_pwr = 7588, .lkg_pwr = {16537, 16537, 16537, 16537, 16537, 16537} },
        {.cap = 0, .volt = 78100, .dyn_pwr = 8811, .lkg_pwr = {17527, 17527, 17527, 17527, 17527, 17527} },
        {.cap = 0, .volt = 81100, .dyn_pwr = 10292, .lkg_pwr = {18822, 18822, 18822, 18822, 18822, 18822} },
        {.cap = 0, .volt = 83500, .dyn_pwr = 11750, .lkg_pwr = {19938, 19938, 19938, 19938, 19938, 19938} },
        {.cap = 0, .volt = 86000, .dyn_pwr = 13354, .lkg_pwr = {21159, 21159, 21159, 21159, 21159, 21159} },
        {.cap = 0, .volt = 88400, .dyn_pwr = 14737, .lkg_pwr = {22417, 22417, 22417, 22417, 22417, 22417} },
        {.cap = 0, .volt = 90800, .dyn_pwr = 16540, .lkg_pwr = {23728, 23728, 23728, 23728, 23728, 23728} },
        {.cap = 0, .volt = 93200, .dyn_pwr = 18472, .lkg_pwr = {25145, 25145, 25145, 25145, 25145, 25145} },
        {.cap = 0, .volt = 95000, .dyn_pwr = 19916, .lkg_pwr = {26208, 26208, 26208, 26208, 26208, 26208} },
        {.cap = 0, .volt = 97400, .dyn_pwr = 22077, .lkg_pwr = {27805, 27805, 27805, 27805, 27805, 27805} },
    },
    .lkg_idx = DEFAULT_LKG_IDX,
    .row_num = UPOWER_OPP_NUM,
    .nr_idle_states = NR_UPOWER_CSTATES,
    .idle_states = {
        {{0}, {8938} },
        {{0}, {8938} },
        {{0}, {8938} },
        {{0}, {8938} },
        {{0}, {8938} },
        {{0}, {8938} },
    },
};
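
上面这些表格只是原始数据。为了直观理解一行数据的含义,下面给出一个示意性的小程序(非内核实现,其中"满载功耗 ≈ dyn_pwr + lkg_pwr[lkg_idx]"只是为了演示而做的假设,lkg_pwr只取其中一个idle等级的值):用最低OPP那一行比较L/B两个cluster每单位capacity的功耗,可以看出小核在能效上的优势,这也是EAS在选核时参考这些表格的出发点。

/* 示意程序:仅用于说明upower表中一行数据的含义,并非内核实现 */
#include <stdio.h>

struct opp { int cap; int volt; long dyn_pwr; long lkg_pwr; };

/* 假设:某一OPP上cpu满载时的功耗 ≈ dyn_pwr + lkg_pwr[lkg_idx] */
static double power_per_cap(struct opp o)
{
    return (double)(o.dyn_pwr + o.lkg_pwr) / o.cap;  /* 每单位capacity的功耗 */
}

int main(void)
{
    /* 数值取自上面的 upower_tbl_l_1_FY / upower_tbl_b_1_FY 的最低OPP */
    struct opp l0 = { 116, 75000, 16431, 22973 };
    struct opp b0 = { 211, 75000, 61732, 71164 };

    printf("L cluster opp0: %.1f (功耗/单位capacity)\n", power_per_cap(l0));
    printf("B cluster opp0: %.1f (功耗/单位capacity)\n", power_per_cap(b0));
    return 0;
}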

4.1.2、SMP负载均衡的实现

负载均衡和很多参数相关,下面列出了其中最重要的一些参数:

| 成员 | 所属结构 | 含义 | 更新/获取函数 | 计算方法 |
| --- | --- | --- | --- | --- |
| rq->cpu_capacity_orig | rq | 本cpu总的计算能力 | init_sched_groups_capacity()/update_sd_lb_stats() -> update_group_capacity() -> update_cpu_capacity() | capacity = arch_scale_cpu_capacity(sd, cpu) |
| rq->cpu_capacity | rq | 本cpu cfs的计算能力 = 总capacity - rt占用的capacity | init_sched_groups_capacity()/update_sd_lb_stats() -> update_group_capacity() -> update_cpu_capacity() | capacity *= scale_rt_capacity(cpu) |
| rq->rd->max_cpu_capacity | rq->rd | root_domain中最大的cpu计算能力 | init_sched_groups_capacity()/update_sd_lb_stats() -> update_group_capacity() -> update_cpu_capacity() | |
| rq->rd->overutilized | rq->rd | root_domain中是否有cpu的算力已经不够用 | update_sd_lb_stats() | |
| rq->rd->overload | rq->rd | root_domain中是否有cpu的runnable进程数大于1 | update_sd_lb_stats() | |
| rq->rt_avg | rq | 本cpu的rt平均负载 | weighted_cpuload() -> cfs_rq_runnable_load_avg() | |
| rq->cfs.runnable_load_avg | rq->cfs(cfs_rq) | 本cpu cfs_rq的runnable平均负载 | __update_load_avg()、cfs_rq_load_avg() | (runnable时间*freq*weight)/LOAD_AVG_MAX |
| rq->cfs.avg.load_avg | rq->cfs.avg | 本cpu cfs_rq的runnable平均负载 | __update_load_avg() | (runnable时间*freq*weight)/LOAD_AVG_MAX |
| rq->cfs.avg.loadwop_avg | rq->cfs.avg | 本cpu cfs_rq的runnable平均负载,不含weight | __update_load_avg() | (runnable时间*freq)/LOAD_AVG_MAX |
| rq->cfs.avg.util_avg | rq->cfs.avg | 本cpu cfs_rq的running负载 | __update_load_avg()、cpu_util() -> __cpu_util() | (running时间*freq*capacity)/LOAD_AVG_MAX |
| cfs_rq->nr_running | cfs_rq | 本cfs_rq这个层次runnable的se的数量 | enqueue_entity()/dequeue_entity() -> account_entity_enqueue() | |
| cfs_rq->h_nr_running | cfs_rq | 本cfs_rq包含的所有子cfs_rq nr_running的总和 | enqueue_task_fair()/dequeue_task_fair() | |
| rq->nr_running | rq | 本cpu rq所有runnable的se的数量,包含所有子cfs_rq | enqueue_task_fair()/dequeue_task_fair() -> add_nr_running() | |
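
表中load_avg/util_avg一类的"(runnable时间*freq*weight)/LOAD_AVG_MAX"公式来自PELT的几何级数累加。下面是一个极度简化的示意程序(非内核实现,假设freq恒为最大值、每个周期固定1024us,忽略周期内的细分处理),演示一个50%时间runnable、weight=1024的se,其load_avg会收敛到约512:

/* 示意:PELT思想的极简模拟(y^32 = 0.5的几何级数衰减),并非内核实现 */
#include <stdio.h>

#define LOAD_AVG_MAX 47742   /* 几何级数收敛的最大值(内核中的常数) */

int main(void)
{
    double y = 0.9785720620;      /* y^32 ≈ 0.5 */
    double sum = 0.0;
    int weight = 1024;            /* nice 0 的weight */
    int runnable_pct = 50;        /* 假设该se在每个周期里有50%时间runnable */

    /* 模拟若干个1024us周期的累加:sum = sum*y + runnable_contrib */
    for (int i = 0; i < 1000; i++)
        sum = sum * y + 1024.0 * runnable_pct / 100;

    /* load_avg ≈ (累加和/LOAD_AVG_MAX) * weight,稳定后约等于 runnable% * weight */
    printf("load_avg ~= %.0f (期望约 %d)\n",
           sum / LOAD_AVG_MAX * weight, weight * runnable_pct / 100);
    return 0;
}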

4.1.2.1、rebalance_domains()

mtk定义了3种power模式来兼容EAS:EAS模式(energy_aware())、HMP模式(sched_feat(SCHED_HMP))、hybrid_support模式(EAS、HMP同时共存);

在hybrid_support()模式下:一般的负载均衡交给EAS处理;如果cpu_rq(cpu)->rd->overutilized表明负载已经严重不均衡,则交给HMP处理;

系统在scheduler_tick()中会定期地检测smp负载均衡的时间是否已到,如果到期则触发SCHED_SOFTIRQ软中断:

void scheduler_tick(void)
{


#ifdef CONFIG_SMP
    rq->idle_balance = idle_cpu(cpu);
    trigger_load_balance(rq);
#endif

}

|→

/*
 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
 */
void trigger_load_balance(struct rq *rq)
{
    /* Don't need to rebalance while attached to NULL domain */
    if (unlikely(on_null_domain(rq)))
        return;

    if (time_after_eq(jiffies, rq->next_balance))
        raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ_COMMON
    if (nohz_kick_needed(rq))
        nohz_balancer_kick();
#endif
}

SCHED_SOFTIRQ软中断的执行主体为run_rebalance_domains:

__init void init_sched_fair_class(void)
{

    open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);

}

/*
 * run_rebalance_domains is triggered when needed from the scheduler tick.
 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
 */
static void run_rebalance_domains(struct softirq_action *h)
{
    struct rq *this_rq = this_rq();
    enum cpu_idle_type idle = this_rq->idle_balance ?
                        CPU_IDLE : CPU_NOT_IDLE;
    int this_cpu = smp_processor_id();

    /* bypass load balance of HMP if EAS consideration */
    if ((!energy_aware() && sched_feat(SCHED_HMP)) ||
            (hybrid_support() && cpu_rq(this_cpu)->rd->overutilized))
        hmp_force_up_migration(this_cpu);

    /*
     * If this cpu has a pending nohz_balance_kick, then do the
     * balancing on behalf of the other idle cpus whose ticks are
     * stopped. Do nohz_idle_balance *before* rebalance_domains to
     * give the idle cpus a chance to load balance. Else we may
     * load balance only within the local sched_domain hierarchy
     * and abort nohz_idle_balance altogether if we pull some load.
     */
    nohz_idle_balance(this_rq, idle);
    rebalance_domains(this_rq, idle);
}

我们分析最核心的函数rebalance_domains():

需要重点提一下的是:负载计算时统计了3种负载(load_avg、loadwop_avg、util_avg),rebalance_domains()主要使用其中的load_avg,并乘以(SCHED_CAPACITY_SCALE/capacity)加以转换。

  • 1、逐级轮询本cpu的sd,判断本sd的时间间隔是否到期,如果到期做load_balance();

| tl层级 | cpu_busy? | sd->balance_interval | sd->busy_factor | sd balance interval |
| --- | --- | --- | --- | --- |
| MC层级 | idle | 4 | 1 | 4ms |
| MC层级 | busy | 4 | 32 | 128ms |
| DIE层级 | idle | 8 | 1 | 8ms |
| DIE层级 | busy | 8 | 32 | 256ms |
| | | | | rq->next_balance = min(上述值) |
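
上表中"sd balance interval"一列可以用下面的小程序验证(示意代码,忽略了内核中msecs_to_jiffies换算和clamp上限等细节):

/* 示意:周期balance间隔的计算方式(对应上表),并非内核实现 */
#include <stdio.h>

/* 参数取自上表:MC/DIE层级的balance_interval与busy_factor */
static unsigned long sd_interval_ms(unsigned long balance_interval,
                                    unsigned int busy_factor, int cpu_busy)
{
    unsigned long interval = balance_interval;   /* 单位:ms */

    if (cpu_busy)                 /* busy cpu把间隔放大busy_factor倍 */
        interval *= busy_factor;

    return interval;              /* 内核中还会转成jiffies并做clamp */
}

int main(void)
{
    printf("MC idle: %lums, MC busy: %lums\n",
           sd_interval_ms(4, 32, 0), sd_interval_ms(4, 32, 1));
    printf("DIE idle: %lums, DIE busy: %lums\n",
           sd_interval_ms(8, 32, 0), sd_interval_ms(8, 32, 1));
    return 0;
}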

  • 2、在load_balance()中判断本层级sd中本cpu的当前情况是否适合充当dst_cpu,在should_we_balance()中做各种判断。可以做dst_cpu的条件是:要么是本sg的第一个idle cpu,要么是本sg的第一个cpu。dst_cpu作为目的cpu,让负载高的cpu把进程迁移过来;如果本cpu不符合条件则中断操作;

  • 3、继续find_busiest_group(),在sg链表中找出负载最重的sg。核心计算在update_sd_lb_stats()、update_sg_lb_stats()中。如果dst_cpu所在的local_group负载大于busiest sg,或者大于sds平均负载,中断操作;如果成功,则计算需要迁移的负载env->imbalance,其值为min((sds->avg - local), (busiest - sds->avg));


  • 3.1、根据当前cpu的idle状态计算cpu load(rq->cpu_load[])时选用的index值:

| tl层级 | busy_idx | idle_idx | newidle_idx |
| --- | --- | --- | --- |
| MC层级 | 2 | 0 | 0 |
| DIE层级 | 2 | 1 | 0 |

- 3.2、计算sg负载sgs,选择sgs->avg_load最大的sg作为busiest_group。其中几个关键值的计算如下:

| 负载值 | 计算方法 | 说明 |
| --- | --- | --- |
| sgs->group_load | += cpu_rq(cpu)->cpu_load[index-1] | 累加cpu的load值,相对值(每个cpu的最大值都是1024),且带weight分量 |
| sgs->group_util | += cpu_rq(cpu)->cfs.avg.util_avg | 累加cpu cfs running值,绝对值(不同cluster,只有最大capacity能力的cpu最大值为1024) |
| sgs->group_capacity | += (arch_scale_cpu_capacity(sd, cpu)*(1-rt_capacity)) | 累加cpu的capacity,绝对值(不同cluster,只有最大capacity能力的cpu最大值为1024) |
| sgs->avg_load | = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity | group_load做了转换,和group_capacity成反比 |

- 3.3、在计算sg负载时,几个关键状态的计算如下:

| 状态值 | 计算方法 | 说明 |
| --- | --- | --- |
| sgs->group_no_capacity | (sgs->group_capacity * 100) < (sgs->group_util * env->sd->imbalance_pct) | 预留一定空间(比例为imbalance_pct),sg运算能力已经不够了,sgs->group_type=group_overloaded |
| dst_rq->rd->overutilized | (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin) | 预留一定空间(比例为capacity_margin),cpu运算能力已经不够了 |
| dst_rq->rd->overload | rq->nr_running > 1 | sg中任何一个cpu的runnable进程大于1 |

比例参数imbalance_pct、capacity_margin的值为:

| tl层级 | sd->imbalance_pct (/100) | capacity_margin (/1024) |
| --- | --- | --- |
| MC层级 | 117 | 1280 |
| DIE层级 | 125 | 1280 |
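
结合3.3节的公式和上表的门限值,可以用下面的示意程序感受一下这两个判断的触发点(非内核实现,capacity取430只是一个假设值):

/* 示意:上面两个门限判断的数值含义,并非内核实现 */
#include <stdio.h>
#include <stdbool.h>

/* 对应3.3节:capacity预留一定比例后与util比较 */
static bool group_no_capacity(long capacity, long util, int imbalance_pct)
{
    return capacity * 100 < util * imbalance_pct;
}

static bool cpu_overutilized(long capacity, long util, int capacity_margin)
{
    return capacity * 1024 < util * capacity_margin;
}

int main(void)
{
    /* 假设一个capacity为430的cpu/sg,观察util涨到多少会触发判断 */
    long cap = 430;

    for (long util = 300; util <= 430; util += 10)
        printf("util=%ld no_capacity(MC,117)=%d overutilized(1280)=%d\n",
               util,
               (int)group_no_capacity(cap, util, 117),
               (int)cpu_overutilized(cap, util, 1280));
    return 0;
}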

- 3.4、计算env->imbalance,这个是rebalance需要迁移的负载量:

| 负载值 | 计算方法 | 说明 |
| --- | --- | --- |
| sds->total_load | += sgs->group_load | |
| sds->total_capacity | += sgs->group_capacity | |
| sds.avg_load | = (SCHED_CAPACITY_SCALE * sds.total_load) / sds.total_capacity | |
| env->imbalance | = min((busiest->avg_load - sds->avg_load)*busiest->group_capacity, (sds->avg_load - local->avg_load)*local->group_capacity) / SCHED_CAPACITY_SCALE | 感觉这里计算有bug啊,前面是1024/capacity,后面是capacity/1024,很混乱(量纲换算见后面calculate_imbalance()代码后的示例) |

  • 4、继续find_busiest_queue(),查找busiest sg中负载最重的cpu。


  • 4.1、找出sg中 weighted_cpuload/capacity_of 比值最大的cpu(代码中用交叉相乘代替除法):

| 负载值 | 计算方法 | 说明 |
| --- | --- | --- |
| weighted_cpuload(cpu) | cpu_rq(cpu)->cfs->runnable_load_avg | cpu的load值,相对值(每个cpu的最大值都是1024),且带weight分量 |
| capacity_of(cpu) | arch_scale_cpu_capacity(sd, cpu)*(1-rt_capacity) | cpu的capacity,绝对值(不同cluster,只有最大capacity能力的cpu最大值为1024) |
| weighted_cpuload(cpu)/capacity_of(cpu) | 取比值最大的cpu | 即busiest sg中的busiest cpu,其rq为busiest rq |
  • 5、迁移busiest cpu的负载到本地dst cpu上,迁移的负载额度为env->imbalance:detach_tasks() -> attach_tasks();

  • 6、处理几种因为进程亲和力(affinity)导致busiest cpu不能迁移走足够进程的情况:LBF_DST_PINNED尝试把dst_cpu更改为本地cpu同sg的其他cpu;LBF_SOME_PINNED表示当前层级不能完全均衡,尝试让父sd来均衡;LBF_ALL_PINNED表示一个进程都不能迁移,把busiest cpu从候选mask中去掉后重新进行load_balance();

  • 7、如果经过各种尝试后还是没有一个进程迁移成功,最后尝试一次active_balance;

/*
 * It checks each scheduling domain to see if it is due to be balanced,
 * and initiates a balancing operation if so.
 *
 * Balancing parameters are set up in init_sched_domains.
 * Balance的参数是在sched_domains初始化时设置的
 */
static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
{
    int continue_balancing = 1;
    int cpu = rq->cpu;
    unsigned long interval;
    struct sched_domain *sd;
    /* 默认本cpu rq下一次的balance时间为60s以后 */
    /* Earliest time when we have to do rebalance again */
    unsigned long next_balance = jiffies + 60*HZ;
    int update_next_balance = 0;
    int need_serialize, need_decay = 0;
    u64 max_cost = 0;

    /* (1) 更新cpu rq中所有cfs_rq的最新负载 */
    update_blocked_averages(cpu);

    rcu_read_lock();
    /* (2) 对本cpu每个层次的schedule_domain进行扫描 */
    for_each_domain(cpu, sd) {

        /* (3) 以1HZ的频率对sd->max_newidle_lb_cost进行老化,
            老化公式: new = old * (253/256)
         */
        /*
         * Decay the newidle max times here because this is a regular
         * visit to all the domains. Decay ~1% per second.
         */
        if (time_after(jiffies, sd->next_decay_max_lb_cost)) {
            sd->max_newidle_lb_cost =
                (sd->max_newidle_lb_cost * 253) / 256;
            sd->next_decay_max_lb_cost = jiffies + HZ;
            need_decay = 1;
        }
        max_cost += sd->max_newidle_lb_cost;

        if (!(sd->flags & SD_LOAD_BALANCE))
            continue;

#ifndef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT
        /* nohz CPU need GTS balance to migrate tasks for more than 2 clusters*/
        /* Don't consider GTS balance if hybrid support */
        if (hybrid_support()) {
            if (sd->child || (!sd->child &&
                (rcu_dereference(per_cpu(sd_scs, cpu)) == NULL)))
            continue;
        }
#endif

        /* (4) 如果continue_balancing = 0,指示停止当前层级的load balance
            因为shed_group中其他的cpu正在这个层次做load_balance
         */
        /*
         * Stop the load balance at this level. There is another
         * CPU in our sched group which is doing load balancing more
         * actively.
         */
        if (!continue_balancing) {
            if (need_decay)
                continue;
            break;
        }

        /* (5) 计算当前层次schedule_domain的balance间隔时间 */
        interval = get_sd_balance_interval(sd, idle != CPU_IDLE);

        /* (6) 如果需要串行化(SD_SERIALIZE),做balance之前需要持锁 */
        need_serialize = sd->flags & SD_SERIALIZE;
        if (need_serialize) {
            if (!spin_trylock(&balancing))
                goto out;
        }

        /* (7) 如果本sd的balance间隔时间已到,进行实际的load_balance() */
        if (time_after_eq(jiffies, sd->last_balance + interval)) {
            if (load_balance(cpu, rq, sd, idle, &continue_balancing)) {
                /*
                 * The LBF_DST_PINNED logic could have changed
                 * env->dst_cpu, so we can't know our idle
                 * state even if we migrated tasks. Update it.
                 */
                idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
            }
            sd->last_balance = jiffies;
            interval = get_sd_balance_interval(sd, idle != CPU_IDLE);
        }
        if (need_serialize)
            spin_unlock(&balancing);
out:
        /* (8) 如果sd下一次balance时间在rq的balance时间之前,需要更新rq的balance时间
            rq的下一次balance时间:next_balance  (默认是60s后)
            本sd的下一次balance时间:sd->last_balance + interval
            rq的下一次balance时间需要选取多个sd中时间最近的一个
         */
        if (time_after(next_balance, sd->last_balance + interval)) {
            next_balance = sd->last_balance + interval;
            update_next_balance = 1;
        }
    }
    if (need_decay) {
        /*
         * Ensure the rq-wide value also decays but keep it at a
         * reasonable floor to avoid funnies with rq->avg_idle.
         */
        rq->max_idle_balance_cost =
            max((u64)sysctl_sched_migration_cost, max_cost);
    }
    rcu_read_unlock();

    /* (8.1) 更新rq的balance时间 */
    /*
     * next_balance will be updated only when there is a need.
     * When the cpu is attached to null domain for ex, it will not be
     * updated.
     */
    if (likely(update_next_balance)) {
        rq->next_balance = next_balance;

#ifdef CONFIG_NO_HZ_COMMON
        /*
         * If this CPU has been elected to perform the nohz idle
         * balance. Other idle CPUs have already rebalanced with
         * nohz_idle_balance() and nohz.next_balance has been
         * updated accordingly. This CPU is now running the idle load
         * balance for itself and we need to update the
         * nohz.next_balance accordingly.
         */
        if ((idle == CPU_IDLE) && time_after(nohz.next_balance, rq->next_balance))
            nohz.next_balance = rq->next_balance;
#endif
    }
}

|→

static int load_balance(int this_cpu, struct rq *this_rq,
            struct sched_domain *sd, enum cpu_idle_type idle,
            int *continue_balancing)
{
    int ld_moved, cur_ld_moved, active_balance = 0;
    struct sched_domain *sd_parent = sd->parent;
    struct sched_group *group;
    struct rq *busiest;
    unsigned long flags;
    struct cpumask *cpus = this_cpu_cpumask_var_ptr(load_balance_mask);

    /* (7.1) 构造Load_balance需要的数据结构:
        .sd     = sd,   //本cpu在本tl层次的sd
        .dst_cpu    = this_cpu,   // 目的cpu是本cpu
        .dst_rq     = this_rq,    // 目的rq是本cpu的rq

        // load_balance的目的是找出负载最重的cpu,并将一部分负载迁移到本cpu上
     */
    struct lb_env env = {
        .sd     = sd,
        .dst_cpu    = this_cpu,
        .dst_rq     = this_rq,
        .dst_grpmask    = sched_group_cpus(sd->groups),
        .idle       = idle,
        .loop_break = sched_nr_migrate_break,
        .cpus       = cpus,
        .fbq_type   = all,
        .tasks      = LIST_HEAD_INIT(env.tasks),
    };

    /*
     * For NEWLY_IDLE load_balancing, we don't need to consider
     * other cpus in our group
     */
    if (idle == CPU_NEWLY_IDLE)
        env.dst_grpmask = NULL;

    cpumask_copy(cpus, cpu_active_mask);

    schedstat_inc(sd, lb_count[idle]);

redo:
    /* (7.2) check当前cpu是否适合作为dst_cpu(即light cpu,需要分担其他cpu的负载) */
    if (!should_we_balance(&env)) {
        *continue_balancing = 0;
        goto out_balanced;
    }

    /* (7.3) 找出本层级sched_group链表中,负载最重的(busiest)的sched_group */
    group = find_busiest_group(&env);
    if (!group) {
        schedstat_inc(sd, lb_nobusyg[idle]);
        goto out_balanced;
    }

    /* (7.4) 找出busiest sched_group中sched_group的rq,即负载最重cpu对应的rq */
    busiest = find_busiest_queue(&env, group);
    if (!busiest) {
        schedstat_inc(sd, lb_nobusyq[idle]);
        goto out_balanced;
    }

    BUG_ON(busiest == env.dst_rq);

    schedstat_add(sd, lb_imbalance[idle], env.imbalance);

    env.src_cpu = busiest->cpu;
    env.src_rq = busiest;

    ld_moved = 0;
    /* (7.5) 判断busiest cpu rq中的runnable进程数 > 1?
        至少有进程可以迁移走
     */
    if (busiest->nr_running > 1) {
        /*
         * Attempt to move tasks. If find_busiest_group has found
         * an imbalance but busiest->nr_running <= 1, the group is
         * still unbalanced. ld_moved simply stays zero, so it is
         * correctly treated as an imbalance.
         */
        env.flags |= LBF_ALL_PINNED;
        env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);

more_balance:
        raw_spin_lock_irqsave(&busiest->lock, flags);

        /* (7.6) 从busiest rq中detach进程, 
            env->imbalance:需要迁移的负载大小
            cur_ld_moved:实际迁移的进程数
         */
        /*
         * cur_ld_moved - load moved in current iteration
         * ld_moved     - cumulative load moved across iterations
         */
        cur_ld_moved = detach_tasks(&env);

        /* (7.7) busiest cpu负载减轻后,
            在sched_freq中判断cpu频率是否可以调低
         */
        /*
         * We want to potentially lower env.src_cpu's OPP.
         */
        if (cur_ld_moved)
            update_capacity_of(env.src_cpu, SCHE_ONESHOT);

        /*
         * We've detached some tasks from busiest_rq. Every
         * task is masked "TASK_ON_RQ_MIGRATING", so we can safely
         * unlock busiest->lock, and we are able to be sure
         * that nobody can manipulate the tasks in parallel.
         * See task_rq_lock() family for the details.
         */

        raw_spin_unlock(&busiest->lock);

        /* (7.8) 把迁移过来的任务attach到dst_cpu上 */
        if (cur_ld_moved) {
            attach_tasks(&env);
            ld_moved += cur_ld_moved;
        }

        local_irq_restore(flags);

        /* (7.9) LBF_NEED_BREAK设置,说明balance还没有完成,循环只是出来休息一下,
            继续重新balance
         */
        if (env.flags & LBF_NEED_BREAK) {
            env.flags &= ~LBF_NEED_BREAK;
            goto more_balance;
        }

        /* (7.10) 设置了LBF_DST_PINNED标志,并且env.imbalance > 0
            说明src_cpu上有些进程因为affinity的原因不能迁移到dst_cpu但是能迁移到同sg的new_dst_cpu上
            把dst_cpu更改为new_dst_cpu,重新开始balance流程
         */
        /*
         * Revisit (affine) tasks on src_cpu that couldn't be moved to
         * us and move them to an alternate dst_cpu in our sched_group
         * where they can run. The upper limit on how many times we
         * iterate on same src_cpu is dependent on number of cpus in our
         * sched_group.
         *
         * This changes load balance semantics a bit on who can move
         * load to a given_cpu. In addition to the given_cpu itself
         * (or a ilb_cpu acting on its behalf where given_cpu is
         * nohz-idle), we now have balance_cpu in a position to move
         * load to given_cpu. In rare situations, this may cause
         * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
         * _independently_ and at _same_ time to move some load to
         * given_cpu) causing exceess load to be moved to given_cpu.
         * This however should not happen so much in practice and
         * moreover subsequent load balance cycles should correct the
         * excess load moved.
         */
        if ((env.flags & LBF_DST_PINNED) && env.imbalance > 0) {

            /* Prevent to re-select dst_cpu via env's cpus */
            cpumask_clear_cpu(env.dst_cpu, env.cpus);

            env.dst_rq   = cpu_rq(env.new_dst_cpu);
            env.dst_cpu  = env.new_dst_cpu;
            env.flags   &= ~LBF_DST_PINNED;
            env.loop     = 0;
            env.loop_break   = sched_nr_migrate_break;

            /*
             * Go back to "more_balance" rather than "redo" since we
             * need to continue with same src_cpu.
             */
            goto more_balance;
        }

        /* (7.11) 设置了LBF_SOME_PINNED标志,说明有些进程因为affinity迁移失败,  
            设置当前sd的parent sd的 sgc->imbalance,让parent sd做rebalance的概率增高
         */
        /*
         * We failed to reach balance because of affinity.
         */
        if (sd_parent) {
            int *group_imbalance = &sd_parent->groups->sgc->imbalance;

            if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0)
                *group_imbalance = 1;
        }

        /* (7.12) 如果LBF_ALL_PINNED标志一直被置位,
            说明busiest_cpu因为affinity没有一个进程迁移成功,哪怕迁移到dst_cpu同sg的其他cpu也没有一个成功
            将busiest cpu从全局cpu mask去掉,重新做整个流程:find_busiest_group -> find_busiest_queue -> detach_tasks -> attach_tasks
         */
        /* All tasks on this runqueue were pinned by CPU affinity */
        if (unlikely(env.flags & LBF_ALL_PINNED)) {
            cpumask_clear_cpu(cpu_of(busiest), cpus);
            if (!cpumask_empty(cpus)) {
                env.loop = 0;
                env.loop_break = sched_nr_migrate_break;
                goto redo;
            }
            goto out_all_pinned;
        }
    }

    /* (7.13) 经过几轮的努力尝试,最终迁移的进程数ld_moved还是0,说明balance失败 */
    if (!ld_moved) {
        schedstat_inc(sd, lb_failed[idle]);
        /*
         * Increment the failure counter only on periodic balance.
         * We do not want newidle balance, which can be very
         * frequent, pollute the failure counter causing
         * excessive cache_hot migrations and active balances.
         */
        if (idle != CPU_NEWLY_IDLE)
            if (env.src_grp_nr_running > 1)
                sd->nr_balance_failed++;

        /* (7.14) 最后一次尝试迁移一个进程 */
        if (need_active_balance(&env)) {
            raw_spin_lock_irqsave(&busiest->lock, flags);

            /* (7.15) 如果当前cpu不在busiest->curr进程的affinity之内,返回失败 */
            /* don't kick the active_load_balance_cpu_stop,
             * if the curr task on busiest cpu can't be
             * moved to this_cpu
             */
            if (!cpumask_test_cpu(this_cpu,
                    tsk_cpus_allowed(busiest->curr))) {
                raw_spin_unlock_irqrestore(&busiest->lock,
                                flags);
                env.flags |= LBF_ALL_PINNED;
                goto out_one_pinned;
            }

            /*
             * ->active_balance synchronizes accesses to
             * ->active_balance_work.  Once set, it's cleared
             * only after active load balance is finished.
             */
            if (!busiest->active_balance && !cpu_park(cpu_of(busiest))) {
                busiest->active_balance = 1; /* load_balance */
                busiest->push_cpu = this_cpu;
                active_balance = 1;
            }
            raw_spin_unlock_irqrestore(&busiest->lock, flags);

            /* (7.16) 将busiest->curr进程迁移到当前cpu(this_cpu)上 */
            if (active_balance) {
                if (stop_one_cpu_dispatch(cpu_of(busiest),
                    active_load_balance_cpu_stop, busiest,
                    &busiest->active_balance_work)) {
                    raw_spin_lock_irqsave(&busiest->lock, flags);
                    busiest->active_balance = 0;
                    active_balance = 0;
                    raw_spin_unlock_irqrestore(&busiest->lock, flags);
                }
            }

            /*
             * We've kicked active balancing, reset the failure
             * counter.
             */
            sd->nr_balance_failed = sd->cache_nice_tries+1;
        }
    } else
        sd->nr_balance_failed = 0;

    if (likely(!active_balance)) {
        /* We were unbalanced, so reset the balancing interval */
        sd->balance_interval = sd->min_interval;
    } else {
        /*
         * If we've begun active balancing, start to back off. This
         * case may not be covered by the all_pinned logic if there
         * is only 1 task on the busy runqueue (because we don't call
         * detach_tasks).
         */
        if (sd->balance_interval < sd->max_interval)
            sd->balance_interval *= 2;
    }

    goto out;

out_balanced:
    /*
     * We reach balance although we may have faced some affinity
     * constraints. Clear the imbalance flag if it was set.
     */
    if (sd_parent) {
        int *group_imbalance = &sd_parent->groups->sgc->imbalance;

        if (*group_imbalance)
            *group_imbalance = 0;
    }

out_all_pinned:
    /*
     * We reach balance because all tasks are pinned at this level so
     * we can't migrate them. Let the imbalance flag set so parent level
     * can try to migrate them.
     */
    schedstat_inc(sd, lb_balanced[idle]);

    sd->nr_balance_failed = 0;

out_one_pinned:
    /* tune up the balancing interval */
    if (((env.flags & LBF_ALL_PINNED) &&
            sd->balance_interval < MAX_PINNED_INTERVAL) ||
            (sd->balance_interval < sd->max_interval))
        sd->balance_interval *= 2;

    ld_moved = 0;
out:
    return ld_moved;
}

||→

static int should_we_balance(struct lb_env *env)
{
    struct sched_group *sg = env->sd->groups;
    struct cpumask *sg_cpus, *sg_mask;
    int cpu, balance_cpu = -1;

    /* (7.2.1) 如果本cpu为CPU_NEWLY_IDLE,直接符合迁移条件 */
    /*
     * In the newly idle case, we will allow all the cpu's
     * to do the newly idle load balance.
     */
    if (env->idle == CPU_NEWLY_IDLE)
        return 1;

    sg_cpus = sched_group_cpus(sg);
    sg_mask = sched_group_mask(sg);
    /* (7.2.2) 本sched_group的第一个idle cpu适合做load_balance */
    /* Try to find first idle cpu */
    for_each_cpu_and(cpu, sg_cpus, env->cpus) {
        if (!cpumask_test_cpu(cpu, sg_mask) || !idle_cpu(cpu))
            continue;

        balance_cpu = cpu;
        break;
    }

    /* (7.2.3) 没有idle cpu,则选取本sched_group的第一个cpu做load_balance */
    if (balance_cpu == -1)
        balance_cpu = group_balance_cpu(sg);

    /* (7.2.4) 不满足上述条件的cpu,不适合来启动load_balance */
    /*
     * First idle cpu or the first cpu(busiest) in this sched group
     * is eligible for doing load balancing at this and above domains.
     */
    return balance_cpu == env->dst_cpu;
}

||→

static struct sched_group *find_busiest_group(struct lb_env *env)
{
    struct sg_lb_stats *local, *busiest;
    struct sd_lb_stats sds;
    int local_cpu = 0, busiest_cpu = 0;
    struct cpumask *busiest_cpumask;
    int same_clus = 0;

    init_sd_lb_stats(&sds);

    /* (7.3.1) 更新本层级sched_group链表中,每个sched_group的负载,
        并选出busiest的一个sched_group
     */
    /*
     * Compute the various statistics relavent for load balancing at
     * this level.
     */
    update_sd_lb_stats(env, &sds);

    local = &sds.local_stat;
    busiest = &sds.busiest_stat;

    if (sds.busiest) {
        busiest_cpumask = sched_group_cpus(sds.busiest);
        local_cpu = env->dst_cpu;
        busiest_cpu = group_first_cpu(sds.busiest);

        same_clus = is_the_same_domain(local_cpu, busiest_cpu);
        mt_sched_printf(sched_lb, "%s: local_cpu=%d, busiest_cpu=%d, busiest_mask=%lu, same_cluster=%d",
                __func__, local_cpu, busiest_cpu, busiest_cpumask->bits[0], same_clus);
    }

    /* (7.3.2) 如果EAS使能,跨cluster的任务迁移使用EAS来做 */
    if (energy_aware() && !env->dst_rq->rd->overutilized && !same_clus)
        goto out_balanced;

    /* (7.3.3) */
    /* ASYM feature bypasses nice load balance check */
    if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
        check_asym_packing(env, &sds))
        return sds.busiest;

    /* (7.3.4) busiest sg上没有负载,返回空 */
    /* There is no busy sibling group to pull tasks from */
    if (!sds.busiest || busiest->sum_nr_running == 0) {
        if (!sds.busiest)
            mt_sched_printf(sched_lb, "[%s] %d: fail no busiest ", __func__, env->src_cpu);
        else
            mt_sched_printf(sched_lb, "[%s] %d: fail busiest no task ", __func__, env->src_cpu);
        goto out_balanced;
    }

    /* (7.3.5) sg链表里的平均负载 */
    sds.avg_load = (SCHED_CAPACITY_SCALE * sds.total_load)
                        / sds.total_capacity;

    /* (7.3.6) 如果busiest sg低一级别的因为cpu affinity没有balance成功,设置了group_imbalanced标志 
        强制在当前级别上进行balance
     */
    /*
     * If the busiest group is imbalanced the below checks don't
     * work because they assume all things are equal, which typically
     * isn't true due to cpus_allowed constraints and the like.
     */
    if (busiest->group_type == group_imbalanced)
        goto force_balance;

    /* (7.3.7) 如果dest cpu/group很闲,busiest负载很重,  
        强制开展balance
     */
    /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
    if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
        busiest->group_no_capacity)
        goto force_balance;

    /* (7.3.8)  如果dest_cpu所在sg的负载都大于busiest sg的负载,
        放弃balance
     */
    /*
     * If the local group is busier than the selected busiest group
     * don't try and pull any tasks.
     */
    if (local->avg_load >= busiest->avg_load)
        goto out_balanced;

    /* (7.3.9)  如果dest_cpu所在sg的负载都大于sg链表的平均负载,
        放弃balance
     */
    /*
     * Don't pull any tasks if this group is already above the domain
     * average load.
     */
    if (local->avg_load >= sds.avg_load)
        goto out_balanced;

    /* (7.3.10)  如果dest_cpu为idle,但是dest_cpu所在的sg idle cpu数量小于busiest sg的idle cpu数量
        放弃balance
     */
#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT
    if ((env->idle == CPU_IDLE) || (env->idle == CPU_NEWLY_IDLE)) {
        int i = (env->idle == CPU_IDLE) ? 1:0;
#else
    if (env->idle == CPU_IDLE) {
#endif
        /*
         * This cpu is idle. If the busiest group is not overloaded
         * and there is no imbalance between this and busiest group
         * wrt idle cpus, it is balanced. The imbalance becomes
         * significant if the diff is greater than 1 otherwise we
         * might end up to just move the imbalance on another group
         */
#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT
        if ((busiest->group_type != group_overloaded) &&
            (local->idle_cpus < (busiest->idle_cpus + i)))
#else
        if ((busiest->group_type != group_overloaded) &&
                (local->idle_cpus <= (busiest->idle_cpus + 1)))
#endif
            goto out_balanced;
    } else {

        /* (7.3.11)  busiest->avg_load大于local->avg_load的比例没有超过env->sd->imbalance_pct
            放弃balance
        */
        /*
         * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
         * imbalance_pct to be conservative.
         */
        if (100 * busiest->avg_load <=
                env->sd->imbalance_pct * local->avg_load)
            goto out_balanced;
    }

force_balance:
    /* Looks like there is an imbalance. Compute it */
    /* (7.3.12) 计算需要迁移的负载值env->imbalance */
    calculate_imbalance(env, &sds);
#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT
    env->imbalance = env->imbalance * SCHED_CAPACITY_SCALE
        / (sds.busiest->sgc->capacity / cpumask_weight(sched_group_cpus(sds.busiest)));
#endif

    return sds.busiest;

out_balanced:
    env->imbalance = 0;
    return NULL;
}

|||→

static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
{
    struct sched_domain *child = env->sd->child;
    struct sched_group *sg = env->sd->groups;
    struct sg_lb_stats tmp_sgs;
    int load_idx, prefer_sibling = 0;
    bool overload = false, overutilized = false;

    if (child && child->flags & SD_PREFER_SIBLING)
        prefer_sibling = 1;

    /* (7.3.1.1) 根据idle情况,选择计算cpu负载时的idx,
        idx:是CPU层级负载this_rq->cpu_load[i]数组的index值
     */
    load_idx = get_sd_load_idx(env->sd, env->idle);

    /* (7.3.1.2) 逐个轮询本层级sched_group链表中的每个sched_group */
    do {
        struct sg_lb_stats *sgs = &tmp_sgs;
        int local_group;

        /* (7.3.1.3) 如果sg是当前cpu所在的sg,则本sg称为local_group 
            使用专门的数据结构来存储local_group的信息:
            sds->local = sg;        // 使用sds->local来存储local_group
            sgs = &sds->local_stat; // 使用sds->local_stat来存储local_group的统计
         */
        local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
        if (local_group) {
            sds->local = sg;
            sgs = &sds->local_stat;

            /* (7.3.1.4) 更新local_group的capacity,更新的周期为sd->balance_interval 
                主要目的是动态减去RT进程消耗的capacity
             */
            if (env->idle != CPU_NEWLY_IDLE ||
                time_after_eq(jiffies, sg->sgc->next_update))
                update_group_capacity(env->sd, env->dst_cpu);
        }

        /* (7.3.1.5) 更新当前sched_group的负载统计 
            sgs:sg统计数据放到sgs当中
            overload:rq中runnable的进程>1,那么肯定有进程在等待
            overutilized:cpu的capacity < util,运算能力不足
         */
        update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
                        &overload, &overutilized);

        /* (7.3.1.6) local_group不参与busiest sg的计算 */
        if (local_group)
            goto next_group;

        /* (7.3.1.7) 如果设置了SD_PREFER_SIBLING标志,说明local_group希望其他人迁移任务到它身上,
            提高其他sg的迁移优先级
         */
        /*
         * In case the child domain prefers tasks go to siblings
         * first, lower the sg capacity so that we'll try
         * and move all the excess tasks away. We lower the capacity
         * of a group only if the local group has the capacity to fit
         * these excess tasks. The extra check prevents the case where
         * you always pull from the heaviest group when it is already
         * under-utilized (possible with a large weight task outweighs
         * the tasks on the system).
         */
        if (prefer_sibling && sds->local &&
            group_has_capacity(env, &sds->local_stat) &&
            (sgs->sum_nr_running > 1)) {
            sgs->group_no_capacity = 1;
            sgs->group_type = group_classify(sg, sgs);
        }

        /* (7.3.1.8) 根据计算的sgs统计数据,找出busiest sg */
        if (update_sd_pick_busiest(env, sds, sg, sgs)) {
            sds->busiest = sg;
            sds->busiest_stat = *sgs;
        }

next_group:
        /* (7.3.1.9) 更新sds中的负载、capacity统计 */
        /* Now, start updating sd_lb_stats */
        sds->total_load += sgs->group_load;
        sds->total_capacity += sgs->group_capacity;

        sg = sg->next;
    } while (sg != env->sd->groups);

    if (env->sd->flags & SD_NUMA)
        env->fbq_type = fbq_classify_group(&sds->busiest_stat);

    env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;

    /* (7.3.1.10) 根据遍历过程中累积出来的overload、overutilized值
        来更新dst_cpu rq->rd中的对应值。
        注意:update_sg_lb_stats()只会把这两个bool置成true、从不清回false,
        所以这里反映的是整个sd中是否有任何cpu overload/overutilized,并非只看最后一个sg
     */
    if (!env->sd->parent) {
        /* update overload indicator if we are at root domain */
        if (env->dst_rq->rd->overload != overload)
            env->dst_rq->rd->overload = overload;

        /* Update over-utilization (tipping point, U >= 0) indicator */
        if (env->dst_rq->rd->overutilized != overutilized)
            env->dst_rq->rd->overutilized = overutilized;
    } else {
        if (!env->dst_rq->rd->overutilized && overutilized)
            env->dst_rq->rd->overutilized = true;
    }
}

||||→

static inline void update_sg_lb_stats(struct lb_env *env,
            struct sched_group *group, int load_idx,
            int local_group, struct sg_lb_stats *sgs,
            bool *overload, bool *overutilized)
{
    unsigned long load;
    int i;

    memset(sgs, 0, sizeof(*sgs));

    /*  (7.3.1.5.1) 遍历sched_group中的每个cpu */
    for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
        struct rq *rq = cpu_rq(i);

        /* (7.3.1.5.2) 获取本cpu的负载rq->cpu_load[load_idx-1] */
        /* Bias balancing toward cpus of our domain */
        if (local_group)
            /* 如果是local_group,负载往小的取:min(rq->cpu_load[load_idx-1], weighted_cpuload(cpu)) */
            load = target_load(i, load_idx);
        else
            /* 如果不是local_group,负载往大的取:max(rq->cpu_load[load_idx-1], weighted_cpuload(cpu)) */
            load = source_load(i, load_idx);

#ifdef CONFIG_MTK_SCHED_INTEROP
        /* (7.3.1.5.3) 因为rq->cpu_load[]只包含cfs的负载,mtk尝试加上rt部分的负载
            ooooo但是rq->cpu_capacity中已经减去了rt的部分,这里是否还需要??
         */
        load += mt_rt_load(i);
#endif

        /* (7.3.1.5.4) 累加sgs各项值:
            sgs->group_load        // runnable负载,带weight分量,来自rq->cpu_load[]/weighted_cpuload()(源头是cfs.runnable_load_avg)
            sgs->group_util        // running负载(cpu_util(cpu),即cpu_rq(cpu)->cfs.avg.util_avg)
            sgs->sum_nr_running    // rq中所有cfs se的总和(rq->cfs.h_nr_running)
            sgs->sum_weighted_load // runnable负载,带weight分量(weighted_cpuload(cpu),即cfs.runnable_load_avg)
            sgs->idle_cpus         // idle状态的cpu计数
         */
#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT
        sgs->group_load += (load * capacity_orig_of(i)) >> SCHED_CAPACITY_SHIFT;
#else
        sgs->group_load += load;
#endif
        sgs->group_util += cpu_util(i);
        sgs->sum_nr_running += rq->cfs.h_nr_running;

        /* (7.3.1.5.5) 如果rq中进程数量>1,则就会有进程处于runnable状态,
            overload = true
         */
        if (rq->nr_running > 1)
            *overload = true;

#ifdef CONFIG_NUMA_BALANCING
        sgs->nr_numa_running += rq->nr_numa_running;
        sgs->nr_preferred_running += rq->nr_preferred_running;
#endif
        sgs->sum_weighted_load += weighted_cpuload(i);
        if (idle_cpu(i))
            sgs->idle_cpus++;

        /* (7.3.1.5.6) cpu的capacity小于cpu的running状态负载,
            overutilized = true
         */
        if (cpu_overutilized(i))
            *overutilized = true;
    }

    /* (7.3.1.5.7) 更新汇总后sgs的统计数据:
        sgs->group_capacity     // sgs所有cpu capacity的累加
        sgs->avg_load           // 按照group_capacity,等比例放大group_load负载,capacity越小avg_load越大
        sgs->load_per_task      // sgs的平均每个进程的weight负载
        sgs->group_weight       // sgs的online cpu个数
        sgs->group_no_capacity  // sgs的capacity已经不够用,赶不上util
        sgs->group_type         // 严重级别 group_overloaded > group_imbalanced > group_other
                                // group_imbalanced: 下一等级的load_balance因为cpu_affinity的原因没有完成
     */
    /* Adjust by relative CPU capacity of the group */
    sgs->group_capacity = group->sgc->capacity;
    sgs->avg_load = (sgs->group_load*SCHED_CAPACITY_SCALE) / sgs->group_capacity;

    if (sgs->sum_nr_running)
        sgs->load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;

    sgs->group_weight = group->group_weight;

    sgs->group_no_capacity = group_is_overloaded(env, sgs);
    sgs->group_type = group_classify(group, sgs);
}


||||→

static bool update_sd_pick_busiest(struct lb_env *env,
                   struct sd_lb_stats *sds,
                   struct sched_group *sg,
                   struct sg_lb_stats *sgs)
{
    struct sg_lb_stats *busiest = &sds->busiest_stat;

#ifdef CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT
    if (sgs->sum_nr_running == 0) {
        mt_sched_printf(sched_lb_info, "[%s] sgs->sum_nr_running=%d",
            __func__, sgs->sum_nr_running);
        return false;
    }
#endif

    /* (7.3.1.9.1) 如果新的sgs group_type大于旧的busiest sgs,
        新的sgs更busy
     */
    if (sgs->group_type > busiest->group_type)
        return true;

    /* (7.3.1.9.2) 如果新的sgs group_type小于旧的busiest sgs,
        旧的sgs更busy
     */
    if (sgs->group_type < busiest->group_type)
        return false;

    /* (7.3.1.9.3) 在group_type相同的情况下,比较sgs->avg_load
        sgs->avg_load = sgs->group_load * SCHED_CAPACITY_SCALE / sgs->group_capacity
        其中group_load由各cpu的rq->cpu_load[load_idx-1]累加而来
     */
    if (sgs->avg_load <= busiest->avg_load)
        return false;

    /* (7.3.1.9.4) 如果SD_ASYM_PACKING标志没有置位,
        在group_type相同的情况下,sgs->avg_load值较大的为busiest sg
     */
    /* This is the busiest node in its class. */
    if (!(env->sd->flags & SD_ASYM_PACKING))
        return true;

    /* (7.3.1.9.5) ASYM_PACKING的意思是会把负载移到最低序号的cpu上,
        如果sg的first cpu序号 > dst_cpu,则标记为busiest;
        多个sg的first cpu序号都 > dst_cpu时,选择first cpu序号更小的sg
     */
    /*
     * ASYM_PACKING needs to move all the work to the lowest
     * numbered CPUs in the group, therefore mark all groups
     * higher than ourself as busy.
     */
    if (sgs->sum_nr_running && env->dst_cpu < group_first_cpu(sg)) {
        if (!sds->busiest)
            return true;

        if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
            return true;
    }

    /* (7.3.1.9.6) 设置了ASYM_PACKING,且如果sg的first cpu序号 <= dst_cpu,
        返回false
     */
    return false;
}

|||→

static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
{
    unsigned long max_pull, load_above_capacity = ~0UL;
    struct sg_lb_stats *local, *busiest;

    /* (7.3.12.1) local sgs和busiest sgs */
    local = &sds->local_stat;
    busiest = &sds->busiest_stat;

    if (busiest->group_type == group_imbalanced) {
        /*
         * In the group_imb case we cannot rely on group-wide averages
         * to ensure cpu-load equilibrium, look at wider averages. XXX
         */
        busiest->load_per_task =
            min(busiest->load_per_task, sds->avg_load);
    }

    /* (7.3.12.2) */
    /*
     * In the presence of smp nice balancing, certain scenarios can have
     * max load less than avg load(as we skip the groups at or below
     * its cpu_capacity, while calculating max_load..)
     */
    if (busiest->avg_load <= sds->avg_load ||
        local->avg_load >= sds->avg_load) {
        env->imbalance = 0;
        return fix_small_imbalance(env, sds);
    }

    /* (7.3.12.3) */
    /*
     * If there aren't any idle cpus, avoid creating some.
     */
    if (busiest->group_type == group_overloaded &&
        local->group_type   == group_overloaded) {
        load_above_capacity = busiest->sum_nr_running *
                    SCHED_LOAD_SCALE;
        if (load_above_capacity > busiest->group_capacity)
            load_above_capacity -= busiest->group_capacity;
        else
            load_above_capacity = ~0UL;
    }

    /* (7.3.12.4) env->imbalance的值等于min((sds->avg - local), (busiest - sds->avg))
        在local和sds平均值,busiest和sds平均值,两个差值之间选择最小值
     */
    /*
     * We're trying to get all the cpus to the average_load, so we don't
     * want to push ourselves above the average load, nor do we wish to
     * reduce the max loaded cpu below the average load. At the same time,
     * we also don't want to reduce the group load below the group capacity
     * (so that we can implement power-savings policies etc). Thus we look
     * for the minimum possible imbalance.
     */
    max_pull = min(busiest->avg_load - sds->avg_load, load_above_capacity);

    /* How much load to actually move to equalise the imbalance */
    env->imbalance = min(
        max_pull * busiest->group_capacity,
        (sds->avg_load - local->avg_load) * local->group_capacity
    ) / SCHED_CAPACITY_SCALE;

    /*
     * if *imbalance is less than the average load per runnable task
     * there is no guarantee that any tasks will be moved so we'll have
     * a think about bumping its value to force at least one task to be
     * moved
     */
    if (env->imbalance < busiest->load_per_task)
        return fix_small_imbalance(env, sds);
}
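
calculate_imbalance()中的量纲换算可以用下面的示意程序走一遍(数值均为假设):avg_load是"每单位capacity的load再放大SCHED_CAPACITY_SCALE倍",乘以group_capacity再除以SCHED_CAPACITY_SCALE刚好换算回普通的load量纲,所以前面3.4节表格里担心的"1024/capacity与capacity/1024混用"从量纲上看其实是自洽的:

/* 示意:calculate_imbalance()中各量的量纲换算(非内核实现) */
#include <stdio.h>

#define SCHED_CAPACITY_SCALE 1024

int main(void)
{
    /* 假设:busiest sg两个cpu共1800的load、800的capacity;
       local sg两个cpu共600的load、2048的capacity */
    long busiest_load = 1800, busiest_cap = 800;
    long local_load   = 600,  local_cap   = 2048;

    long busiest_avg = busiest_load * SCHED_CAPACITY_SCALE / busiest_cap; /* 2304 */
    long local_avg   = local_load   * SCHED_CAPACITY_SCALE / local_cap;   /* 300  */
    long sds_avg     = (busiest_load + local_load) * SCHED_CAPACITY_SCALE
                       / (busiest_cap + local_cap);                       /* ~862 */

    /* avg_load是"每单位capacity的load"(放大1024倍),
       乘以group_capacity再除以1024,就换算回普通的load量纲 */
    long pull_from_busiest = (busiest_avg - sds_avg) * busiest_cap
                             / SCHED_CAPACITY_SCALE;
    long room_in_local     = (sds_avg - local_avg) * local_cap
                             / SCHED_CAPACITY_SCALE;
    long imbalance = pull_from_busiest < room_in_local ?
                     pull_from_busiest : room_in_local;

    printf("busiest_avg=%ld local_avg=%ld sds_avg=%ld imbalance=%ld\n",
           busiest_avg, local_avg, sds_avg, imbalance);
    return 0;
}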

||→

static struct rq *find_busiest_queue(struct lb_env *env,
                     struct sched_group *group)
{
    struct rq *busiest = NULL, *rq;
    unsigned long busiest_load = 0, busiest_capacity = 1;
    int i;

    /* (7.4.1) 逐个遍历sg中的cpu */
    for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
        unsigned long capacity, wl;
        enum fbq_type rt;

        rq = cpu_rq(i);
        rt = fbq_classify_rq(rq);

        /*
         * We classify groups/runqueues into three groups:
         *  - regular: there are !numa tasks
         *  - remote:  there are numa tasks that run on the 'wrong' node
         *  - all:     there is no distinction
         *
         * In order to avoid migrating ideally placed numa tasks,
         * ignore those when there's better options.
         *
         * If we ignore the actual busiest queue to migrate another
         * task, the next balance pass can still reduce the busiest
         * queue by moving tasks around inside the node.
         *
         * If we cannot move enough load due to this classification
         * the next pass will adjust the group classification and
         * allow migration of more tasks.
         *
         * Both cases only affect the total convergence complexity.
         */
        if (rt > env->fbq_type)
            continue;

        /* (7.4.2) 计算出cpu的capacity和weight_load */
        capacity = capacity_of(i);

        wl = weighted_cpuload(i);

#ifdef CONFIG_MTK_SCHED_INTEROP
        wl += mt_rt_load(i);
#endif

        /*
         * When comparing with imbalance, use weighted_cpuload()
         * which is not scaled with the cpu capacity.
         */

        if (rq->nr_running == 1 && wl > env->imbalance &&
            !check_cpu_capacity(rq, env->sd))
            continue;

        /* (7.4.3) 选出相对负载最重的cpu */
        /*
         * For the load comparisons with the other cpu's, consider
         * the weighted_cpuload() scaled with the cpu capacity, so
         * that the load can be moved away from the cpu that is
         * potentially running at a lower capacity.
         *
         * Thus we're looking for max(wl_i / capacity_i), crosswise
         * multiplication to rid ourselves of the division works out
         * to: wl_i * capacity_j > wl_j * capacity_i;  where j is
         * our previous maximum.
         */
        if (wl * busiest_capacity > busiest_load * capacity) {
            busiest_load = wl;
            busiest_capacity = capacity;
            busiest = rq;
        }
    }

    return busiest;
}
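
find_busiest_queue()用交叉相乘代替除法来比较wl_i/capacity_i,下面的示意程序(数值均为假设)演示了为什么capacity小但相对负载高的cpu会被选为busiest:

/* 示意:find_busiest_queue()中"交叉相乘代替除法"的比较方式(非内核实现) */
#include <stdio.h>

struct cpu_stat { int cpu; unsigned long wl; unsigned long capacity; };

int main(void)
{
    /* 假设同一sg内三个cpu的weighted_cpuload和capacity */
    struct cpu_stat cpus[] = {
        { 0, 700, 1024 },
        { 1, 500,  430 },   /* 相对负载最高:500/430 ≈ 1.16 */
        { 2, 600, 1024 },
    };
    unsigned long busiest_load = 0, busiest_capacity = 1;
    int busiest = -1;

    for (int i = 0; i < 3; i++) {
        /* wl_i/cap_i > wl_b/cap_b  <=>  wl_i*cap_b > wl_b*cap_i */
        if (cpus[i].wl * busiest_capacity > busiest_load * cpus[i].capacity) {
            busiest_load = cpus[i].wl;
            busiest_capacity = cpus[i].capacity;
            busiest = cpus[i].cpu;
        }
    }
    printf("busiest cpu = %d\n", busiest);   /* 预期输出1 */
    return 0;
}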

||→

static int detach_tasks(struct lb_env *env)
{
    struct list_head *tasks = &env->src_rq->cfs_tasks;
    struct task_struct *p;
    unsigned long load;
    int detached = 0;

    lockdep_assert_held(&env->src_rq->lock);

    if (env->imbalance <= 0)
        return 0;

    /* (7.6.1) 遍历busiest rq中的进程 */
    while (!list_empty(tasks)) {

        /* (7.6.2) 如果dst cpu处于idle状态(idle/newidle balance),不要把busiest rq上的任务全部偷走(至少留一个),避免互相偷任务导致livelock */
        /*
         * We don't want to steal all, otherwise we may be treated likewise,
         * which could at worst lead to a livelock crash.
         */
        if (env->idle != CPU_NOT_IDLE && env->src_rq->nr_running <= 1)
            break;

        p = list_first_entry(tasks, struct task_struct, se.group_node);

        /* (7.6.3) 遍历任务最多不超过sysctl_sched_nr_migrate(32) */
        env->loop++;
        /* We've more or less seen every task there is, call it quits */
        if (env->loop > env->loop_max)
            break;

        /* (7.6.4) 每sched_nr_migrate_break个任务遍历需要跳出休息一下,
            如果没有达到env->loop_max,后面会重来
         */
        /* take a breather every nr_migrate tasks */
        if (env->loop > env->loop_break) {
            env->loop_break += sched_nr_migrate_break;
            env->flags |= LBF_NEED_BREAK;
            break;
        }

        /* (7.6.5) 判断任务是否支持迁移? */
        if (!can_migrate_task(p, env))
            goto next;

        /* (7.6.6) 获取p进程相对顶层cfs_rq的负载, 
            根据负载判断进程是否适合迁移
         */
        load = task_h_load(p);

        if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
            goto next;

        if ((load / 2) > env->imbalance)
            goto next;

        /* (7.6.7) detach 进程 */
        detach_task(p, env);
        list_add(&p->se.group_node, &env->tasks);

        detached++;
        env->imbalance -= load;

#ifdef CONFIG_PREEMPT
        /*
         * NEWIDLE balancing is a source of latency, so preemptible
         * kernels will stop after the first task is detached to minimize
         * the critical section.
         */
        if (env->idle == CPU_NEWLY_IDLE)
            break;
#endif

        /*
         * We only want to steal up to the prescribed amount of
         * weighted load.
         */
        if (env->imbalance <= 0)
            break;

        continue;
next:
        list_move_tail(&p->se.group_node, tasks);
    }

    /*
     * Right now, this is one of only two places we collect this stat
     * so we can safely collect detach_one_task() stats here rather
     * than inside detach_one_task().
     */
    schedstat_add(env->sd, lb_gained[env->idle], detached);

    return detached;
}

|||→

static
int can_migrate_task(struct task_struct *p, struct lb_env *env)
{
    int tsk_cache_hot;

    lockdep_assert_held(&env->src_rq->lock);

    /*
     * We do not migrate tasks that are:
     * 1) throttled_lb_pair, or
     * 2) cannot be migrated to this CPU due to cpus_allowed, or
     * 3) running (obviously), or
     * 4) are cache-hot on their current CPU.
     */
    /* (7.6.5.1) 如果达到bandwith限制,返回失败 */
    if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
        return 0;

    /* (7.6.5.2) 如果p进程的cpu affinity不允许迁移到dst_cpu,进一步处理 */
    if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
        int cpu;

        schedstat_inc(p, se.statistics.nr_failed_migrations_affine);

        /* (7.6.5.3) LBF_SOME_PINNED标志,记录有些进程迁移失败 */
        env->flags |= LBF_SOME_PINNED;

        /* (7.6.5.5) 如果已经有其他的LBF_DST_PINNED动作,直接返回失败 */
        /*
         * Remember if this task can be migrated to any other cpu in
         * our sched_group. We may want to revisit it if we couldn't
         * meet load balance goals by pulling other tasks on src_cpu.
         *
         * Also avoid computing new_dst_cpu if we have already computed
         * one in current iteration.
         */
        if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
            return 0;

        /* (7.6.5.4) 如果dst_cpu同一sched_group中的其他cpu符合p的affinity,尝试更改dst_cpu,
            设置LBF_DST_PINNED标志
         */
        /* Prevent to re-select dst_cpu via env's cpus */
        for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
            if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
                env->flags |= LBF_DST_PINNED;
                env->new_dst_cpu = cpu;
                break;
            }
        }

        return 0;
    }

    /* (7.6.5.6) 有任何符合affinity条件的p,清除LBF_ALL_PINNED标志 */
    /* Record that we found atleast one task that could run on dst_cpu */
    env->flags &= ~LBF_ALL_PINNED;

    /* (7.6.5.7) 如果p在running状态,返回失败 */
    if (task_running(env->src_rq, p)) {
        schedstat_inc(p, se.statistics.nr_failed_migrations_running);
        return 0;
    }

    /* (7.6.5.8) NUMA 相关的一些判断  */
    /*
     * Aggressive migration if:
     * 1) destination numa is preferred
     * 2) task is cache cold, or
     * 3) too many balance attempts have failed.
     */
    tsk_cache_hot = migrate_degrades_locality(p, env);
    if (tsk_cache_hot == -1)
        tsk_cache_hot = task_hot(p, env);

    if (tsk_cache_hot <= 0 ||
        env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
        if (tsk_cache_hot == 1) {
            schedstat_inc(env->sd, lb_hot_gained[env->idle]);
            schedstat_inc(p, se.statistics.nr_forced_migrations);
        }
        return 1;
    }

    schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
    return 0;
}

|||→

static unsigned long task_h_load(struct task_struct *p)
{
    struct cfs_rq *cfs_rq = task_cfs_rq(p);

    update_cfs_rq_h_load(cfs_rq);
    /* (7.6.6.1) task_h_load的目的是在task_group使能时,rq中有多个层次的cfs_rq 
        如果进程p挂载在底层的cfs_rq中,把p的负载转换成顶层cfs_rq的相对负载
     */
    return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
            cfs_rq_load_avg(cfs_rq) + 1);
}

static void update_cfs_rq_h_load(struct cfs_rq *cfs_rq)
{
    struct rq *rq = rq_of(cfs_rq);
    struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
    u64 now = sched_clock_cpu(cpu_of(rq));
    unsigned long load;

    /* sched: change to jiffies */
    now = now * HZ >> 30;

    if (cfs_rq->last_h_load_update == now)
        return;

    /* 从底层cfs_rq到顶层cfs_rq建立起层次关系 */
    cfs_rq->h_load_next = NULL;
    for_each_sched_entity(se) {
        cfs_rq = cfs_rq_of(se);
        cfs_rq->h_load_next = se;
        if (cfs_rq->last_h_load_update == now)
            break;
    }

    if (!se) {
        cfs_rq->h_load = cfs_rq_load_avg(cfs_rq);
        cfs_rq->last_h_load_update = now;
    }

    /* 使用建立的关系,从顶层cfs_rq开始计算每个层次cfs_rq的相对顶层负载h_load */
    while ((se = cfs_rq->h_load_next) != NULL) {
        load = cfs_rq->h_load;
        load = div64_ul(load * se->avg.load_avg,
            cfs_rq_load_avg(cfs_rq) + 1);
        cfs_rq = group_cfs_rq(se);
        cfs_rq->h_load = load;
        cfs_rq->last_h_load_update = now;
    }
}
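
这里用一组假设的数值帮助理解task_h_load()的换算(数值仅为示意):假设进程p挂在某个task_group的底层cfs_rq上,p->se.avg.load_avg = 100,该底层cfs_rq折算到顶层的负载cfs_rq->h_load = 512,底层cfs_rq_load_avg(cfs_rq) = 1024,则task_h_load(p) = 100 * 512 / (1024 + 1) = 49(整数除法),即进程p折算到顶层cfs_rq的相对负载约为49。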


4.1.2.2、nohz_idle_balance()

每个cpu的负载均衡是在本cpu的tick任务scheduler_tick()中判断执行的;如果cpu进入了nohz模式,scheduler_tick()被stop,那么本cpu就没有机会去做rebalance_domains()。为了解决这个问题,系统设计了nohz_idle_balance():在正在运行的cpu上判断进入nohz的cpu是否需要rebalance load,如果需要,则选择一个idle cpu来帮所有的nohz idle cpu做负载均衡。
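
整个nohz idle balance的触发路径可以梳理为如下调用链(示意,具体细节以下文各函数的代码为准):

scheduler_tick() -> trigger_load_balance()
    -> nohz_kick_needed()                       /* 判断是否需要kick一个idle cpu */
    -> nohz_balancer_kick()                     /* 选中ilb_cpu,发送reschedule IPI */
        -> smp_send_reschedule(ilb_cpu)
            -> handle_IPI() -> scheduler_ipi()  /* ilb_cpu被IPI唤醒 */
                -> raise_softirq_irqoff(SCHED_SOFTIRQ)
                    -> run_rebalance_domains()
                        -> nohz_idle_balance()  /* 替所有nohz idle cpu做rebalance_domains() */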

在rebalance_domains()函数之前有一个nohz_idle_balance(),这是系统在条件满足的情况下让一个idle cpu做idle负载均衡。主要的原理如下:

  • 1、cpu在进入nohz idle状态时,设置标志:


tick_nohz_idle_enter() -> set_cpu_sd_state_idle():

↓

void set_cpu_sd_state_idle(void)
{
    struct sched_domain *sd;
    int cpu = smp_processor_id();

    rcu_read_lock();
    sd = rcu_dereference(per_cpu(sd_busy, cpu));

    if (!sd || sd->nohz_idle)
        goto unlock;

    /* (1.1) 进入nohz idle,设置sd->nohz_idle标志 */
    sd->nohz_idle = 1;

    /* (1.2) 减少sgc->nr_busy_cpus的计数 */
    atomic_dec(&sd->groups->sgc->nr_busy_cpus);
unlock:
    rcu_read_unlock();
}


tick_nohz_idle_enter() -> __tick_nohz_idle_enter() -> tick_nohz_stop_sched_tick() -> nohz_balance_enter_idle():

↓

void nohz_balance_enter_idle(int cpu)
{
    /*
     * If this cpu is going down, then nothing needs to be done.
     */
    if (!cpu_active(cpu))
        return;

    if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
        return;

    /*
     * If we're a completely isolated CPU, we don't play.
     */
    if (on_null_domain(cpu_rq(cpu)))
        return;

    /* (2.1) 进入idle状态,设置nohz.idle_cpus_mask中对应的bit */
    cpumask_set_cpu(cpu, nohz.idle_cpus_mask);

    /* (2.2) 进入idle状态,增加nohz.nr_cpus计数 */
    atomic_inc(&nohz.nr_cpus);

    /* (2.3) 设置cpu_rq(cpu)->nohz_flags中的NOHZ_TICK_STOPPED标志 */
    set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
}
  • 2、在trigger_load_balance()中判断,当前是否需要触发idle load balance:


void trigger_load_balance(struct rq *rq)
{

    /* (1) 判断当前是否需要idle load balance */
    if (nohz_kick_needed(rq))

        /* (2) 选中一个idle cpu去做idle load balance */
        nohz_balancer_kick();

}

|→

/*
 * Current heuristic for kicking the idle load balancer in the presence
 * of an idle cpu in the system.
 *   - This rq has more than one task.
 *   - This rq has at least one CFS task and the capacity of the CPU is
 *     significantly reduced because of RT tasks or IRQs.
 *   - At parent of LLC scheduler domain level, this cpu's scheduler group has
 *     multiple busy cpu.
 *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
 *     domain span are idle.
 */
static inline bool nohz_kick_needed(struct rq *rq)
{
    unsigned long now = jiffies;
    struct sched_domain *sd;
    struct sched_group_capacity *sgc;
    int nr_busy, cpu = rq->cpu;
    bool kick = false;

    /* (1.1) 如果当前cpu为idle状态,失败退出 */
    if (unlikely(rq->idle_balance))
        return false;


    /* (1.2) 退出nohz状态:set_cpu_sd_state_busy()、nohz_balance_exit_idle(cpu)
        是set_cpu_sd_state_idle()、nohz_balance_enter_idle()的反向操作
     */
   /*
    * We may be recently in ticked or tickless idle mode. At the first
    * busy tick after returning from idle, we will update the busy stats.
    */
    set_cpu_sd_state_busy();
    nohz_balance_exit_idle(cpu);

    /* (1.3) 如果进入nohz idle状态的cpu数量为0,失败退出 */
    /*
     * None are in tickless mode and hence no need for NOHZ idle load
     * balancing.
     */
    if (likely(!atomic_read(&nohz.nr_cpus)))
        return false;

    /* (1.4) nohz balance时间未到,失败退出 */
    if (time_before(now, nohz.next_balance))
        return false;

#if !defined(CONFIG_MTK_LOAD_BALANCE_ENHANCEMENT) && defined(CONFIG_HMP)
    /* for more than two clusters, still need wakup nohz CPUs and force balancing */
    /*
     * Bail out if there are no nohz CPUs in our
     * HMP domain, since we will move tasks between
     * domains through wakeup and force balancing
     * as necessary based upon task load.
     */
    if (sched_feat(SCHED_HMP) && cpumask_first_and(nohz.idle_cpus_mask,
                &((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids)
        return false;
#endif

    /* (1.5) 当前cpu的进程>=2,返回成功 */
    if (rq->nr_running >= 2 &&
        (!energy_aware() || cpu_overutilized(cpu)))
        return true;

    /* (1.6) sd所在sg的nr_busy_cpus>1,返回成功 */
    rcu_read_lock();
    sd = rcu_dereference(per_cpu(sd_busy, cpu));
    if (sd && !energy_aware()) {
        sgc = sd->groups->sgc;
        nr_busy = atomic_read(&sgc->nr_busy_cpus);

        if (nr_busy > 1) {
            kick = true;
            goto unlock;
        }

    }

    /* (1.7) 如果所有层次的se个数>=1,且capacity在减少,返回成功 */
    sd = rcu_dereference(rq->sd);
    if (sd) {
        if ((rq->cfs.h_nr_running >= 1) &&
                check_cpu_capacity(rq, sd)) {
            kick = true;
            goto unlock;
        }
    }

    /* (1.8) SD_ASYM_PACKING:如果sd_asym的span[]中存在编号比当前cpu小的nohz idle cpu,返回成功 */
    sd = rcu_dereference(per_cpu(sd_asym, cpu));
    if (sd && (cpumask_first_and(nohz.idle_cpus_mask,
                  sched_domain_span(sd)) < cpu)) {
        kick = true;
        goto unlock;
    }

unlock:
    rcu_read_unlock();
    return kick;
}


|→

static void nohz_balancer_kick(void)
{
    int ilb_cpu;

    nohz.next_balance++;

    /* (2.1) 找到所有idle cpu中的第一个idle cpu */
    ilb_cpu = find_new_ilb();

    if (ilb_cpu >= nr_cpu_ids)
        return;

    /* (2.2) 给ilb_cpu的cpu_rq(cpu)->nohz_flags设置NOHZ_BALANCE_KICK标志位 */
    if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
        return;

    /* (2.3) 使用ipi中断来唤醒ilb_cpu执行idle load balance */
    /*
     * Use smp_send_reschedule() instead of resched_cpu().
     * This way we generate a sched IPI on the target cpu which
     * is idle. And the softirq performing nohz idle load balance
     * will be run before returning from the IPI.
     */
    smp_send_reschedule(ilb_cpu);
    return;
}



/* (2.3.1) ilb_cpu被唤醒后处理IPI_RESCHEDULE,
    会触发一个SCHED_SOFTIRQ软中断来启动run_rebalance_domains()
 */

void handle_IPI(int ipinr, struct pt_regs *regs)
{
    unsigned int cpu = smp_processor_id();
    struct pt_regs *old_regs = set_irq_regs(regs);

    if ((unsigned)ipinr < NR_IPI) {
        trace_ipi_entry_rcuidle(ipi_types[ipinr]);
        __inc_irq_stat(cpu, ipi_irqs[ipinr]);
    }

    switch (ipinr) {
    case IPI_RESCHEDULE:
        scheduler_ipi();
        break;

}

↓

void scheduler_ipi(void)
{

    /*
     * Check if someone kicked us for doing the nohz idle load balance.
     */
    if (unlikely(got_nohz_idle_kick())) {
        this_rq()->idle_balance = 1;
        raise_softirq_irqoff(SCHED_SOFTIRQ);
    }

}
  • 3、被选中的ilb_cpu被唤醒后,需要帮其他所有idle cpu完成rebalance_domains()工作:


static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
{
    int this_cpu = this_rq->cpu;
    struct rq *rq;
    int balance_cpu;
    /* Earliest time when we have to do rebalance again */
    unsigned long next_balance = jiffies + 60*HZ;
    int update_next_balance = 0;

    /* (1) 判断当前cpu是不是被选中被唤醒的ilb_cpu */
    if (idle != CPU_IDLE ||
        !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
        goto end;

    /* (2) 轮询所有进入nohz状态的cpu */
    for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {

        /* (3) 只服务非本cpu,且还是idle状态的cpu 
            ooooo本cpu也是idle状态,不需对本cpu做idle负载均衡?
            ooooo给其他idle cpu的rq做了负载均衡后,什么时候唤醒其他idle cpu?
         */
        if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
            continue;

        /* (4) 如果本cpu被设置了resched标志,说明有线程被唤醒,退出idle状态 */
        /*
         * If this cpu gets work to do, stop the load balancing
         * work being done for other cpus. Next load
         * balancing owner will pick it up.
         */
        if (need_resched())
            break;

        /* (5) 需要做负载均衡的idle进程balance_cpu */
        rq = cpu_rq(balance_cpu);

        /* (6) 如果balance_cpu的rq->next_balance时间已到,替其做rebalance_domains() */
        /*
         * If time for next balance is due,
         * do the balance.
         */
        if (time_after_eq(jiffies, rq->next_balance)) {
            raw_spin_lock_irq(&rq->lock);
            update_rq_clock(rq);

            /* (7) 更新idle cpu因为idle造成的负载衰减 */
            update_idle_cpu_load(rq);
            raw_spin_unlock_irq(&rq->lock);

            /* (8) 对balance_cpu做负载均衡 
                ooooo做完负载均衡,什么时候唤醒balance_cpu??
             */
            rebalance_domains(rq, CPU_IDLE);
        }

        if (time_after(next_balance, rq->next_balance)) {
            next_balance = rq->next_balance;
            update_next_balance = 1;
        }
    }

    /* (9) 根据所有进入nohz idle cpu rq的最近的一次到期时间,更新nohz.next_balance */
    /*
     * next_balance will be updated only when there is a need.
     * When the CPU is attached to null domain for ex, it will not be
     * updated.
     */
    if (likely(update_next_balance))
        nohz.next_balance = next_balance;
end:
    clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
}
4.1.2.3、select_task_rq_fair()

除了scheduler_tick()的时候会做负载均衡,还有另外一个时刻也会做负载均衡:即fork新进程、wakeup休眠进程时,系统会根据负载情况挑选一个最合适的cpu给进程运行,其核心函数就是select_task_rq_fair()。
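
作为背景,这里先补充一下标准内核中select_task_rq()的几个典型调用时机(示意,具体以实际内核版本为准):

try_to_wake_up()    -> select_task_rq(p, ..., SD_BALANCE_WAKE, wake_flags)   /* 唤醒休眠进程 */
wake_up_new_task()  -> select_task_rq(p, ..., SD_BALANCE_FORK, 0)            /* fork新进程 */
sched_exec()        -> select_task_rq(p, ..., SD_BALANCE_EXEC, 0)            /* exec */

对于CFS类进程,select_task_rq()会回调到select_task_rq_fair(),其选核过程大致如下: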

  • 1、首先是使用EAS的方法来select_cpu,在EAS使能且没有overutilized时使用EAS方法:

需要重点提一下的是:负载计算中一共计算了3种负载(load_avg、loadwop_avg、util_avg),EAS主要使用其中的util_avg,和capacity一起参与计算。
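
下面是一段最小化的示意代码(非内核原码;cpu_util()/task_util()/capacity_orig_of()沿用下文内核代码中出现的接口),说明util_avg和capacity是如何配合做"容纳性"判断的:

/* 示意:判断把进程p放到cpu上之后,该cpu的capacity是否还容纳得下 */
static inline bool task_fits_on_cpu(struct task_struct *p, int cpu)
{
    unsigned long new_util;

    /* cpu上已有的util加上进程p自身的util */
    new_util = cpu_util(cpu) + task_util(p);

    /* 与cpu的原始capacity比较:能容纳则该cpu是一个合法的候选 */
    return new_util <= capacity_orig_of(cpu);
}

下文energy_aware_wake_cpu()中"new_util > capacity_orig_of(i)则跳过"的判断,用的就是同样的思路。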

  • 1.1、EAS遍历cluster和cpu,找到一个既能满足进程p的affinity又能容纳下进程p的负载util,属于能用最小capacity满足的cluster其中剩余capacity最多的target_cpu;

首先找到能容纳进程p的util且capacity最小的cluster。

然后在目标cluster中找到加上进程p以后,剩余capacity最大的cpu作为target_cpu。

pre_cpu是进程p上一次运行的cpu,作为src_cpu;上面选择的target_cpu作为dst_cpu;接下来尝试计算进程p从pre_cpu迁移到target_cpu后系统的功耗差异。

  • 1.2、计算负载变化前后,target_cpu和prev_cpu带来的power变化。如果没有power增加则返回target_cpu,如果有power增加则返回prev_cpu;

计算负载变化的函数energy_diff()循环很多、比较复杂,仔细分析下来,就是计算target_cpu/prev_cpu所在的“MC层次cpu所在sg链表”+“DIE层级cpu所在sg”这两种范围,在负载变化前后的功耗差异。

energy_diff()的计算方法如下:

负载值 | 计算方法 | 说明
idle_idx | min(rq->idle_state_idx) | sg多个cpu中idle_state_idx的最小值
eenv->cap_idx | find_new_capacity() | 在负载变化后,根据sg多个cpu中的最大util值匹配到的cpu freq档位,对应capacity为sg->sge->cap_states[eenv->cap_idx].cap
group_util | += (__cpu_util << SCHED_CAPACITY_SHIFT)/sg->sge->cap_states[eenv->cap_idx].cap | 累加sg中各cpu的util值,并把util折算成相对于该档capacity的比例
sg_busy_energy | (group_util * sg->sge->busy_power(group_first_cpu(sg), eenv, (sd->child) ? 1 : 0)) >> SCHED_CAPACITY_SHIFT | 使用group_util计算busy部分消耗的功耗
sg_idle_energy | ((SCHED_LOAD_SCALE - group_util) * sg->sge->idle_power(idle_idx, group_first_cpu(sg), eenv, (sd->child) ? 1 : 0)) >> SCHED_CAPACITY_SHIFT | 使用(SCHED_LOAD_SCALE - group_util)计算idle部分消耗的功耗
total_energy | sg_busy_energy + sg_idle_energy | 单个sg的功耗;累加所有相关sg的功耗,迁移前后的差值就是进程p迁移以后的功耗差异
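
用一组假设的数值走一遍上面的计算(仅用于说明方法,不代表真实平台参数):假设某个sg在匹配到的freq档位下group_util = 512(相对于SCHED_LOAD_SCALE = 1024),该档位的busy功耗参数为600、idle功耗参数为100,则:

sg_busy_energy = (512 * 600) >> SCHED_CAPACITY_SHIFT = 300
sg_idle_energy = ((1024 - 512) * 100) >> SCHED_CAPACITY_SHIFT = 50
total_energy   = 300 + 50 = 350

对迁移前(eenv_before)和迁移后(eenv)分别做上述计算并累加所有相关sg,两者之差就是energy_diff()得到的nrg.diff。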
  • 2、如果EAS不适用,使用传统的负载均衡方法来select_cpu:
  • 2.1、find_idlest_group() -> find_idlest_cpu() 找出最合适的target_cpu;
  • 2.2、退而求其次,使用select_idle_sibling()将就找到一个idle cpu作为target_cpu;
  • 2.3、确定target_cpu后,继续使用hmp_select_task_rq_fair()来判断是否需要进行hmp迁移;
static int
select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
{
    struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
    int cpu = smp_processor_id();
    int new_cpu = prev_cpu;  /* 默认new_cpu为prev_cpu */
    int want_affine = 0;
    int sync = wake_flags & WF_SYNC;
    int policy = 0;

#ifdef CONFIG_MTK_SCHED_VIP_TASKS
    /* mtk: If task is VIP task, prefer most efficiency idle cpu */
    if (is_vip_task(p)) {
        int vip_idle_cpu;

        vip_idle_cpu = find_idle_vip_cpu(p);
        if (vip_idle_cpu >= 0)
            return vip_idle_cpu;
    }
#endif

    /* (1) 优先使用EAS计算target cpu, 
        mtk 对EAS定义了3种模式:EAS模式(energy_aware())、HMP模式(sched_feat(SCHED_HMP))、hybrid_support(EAS、HMP同时共存);
        hybrid_support()模式下:一般负载均衡交给EAS;如果cpu_rq(cpu)->rd->overutilized负载已经严重不均衡,交给HMP;
     */
    /*
     *  Consider EAS if only EAS enabled, but HMP
     *  if hybrid enabled and system is over-utilized.
     */
    if ((energy_aware() && !hybrid_support()) ||
            (hybrid_support() && !cpu_rq(cpu)->rd->overutilized))
        goto CONSIDER_EAS;

    /* (2) 非EAS情况,fork使用hmp balance */
    /* HMP fork balance:
     * always put non-kernel forking tasks on a big domain
     */
    if (sched_feat(SCHED_HMP) && p->mm && (sd_flag & SD_BALANCE_FORK)) {
        new_cpu = hmp_fork_balance(p, prev_cpu);

        /* to recover new_cpu value if something wrong */
        if (new_cpu >= nr_cpu_ids)
            new_cpu = prev_cpu;
        else {
#ifdef CONFIG_MTK_SCHED_TRACERS
            trace_sched_select_task_rq(p, (LB_FORK | new_cpu), prev_cpu, new_cpu);
#endif
            return new_cpu;
        }
    }

CONSIDER_EAS:

    /* (3) 如果唤醒flag中设置了SD_BALANCE_WAKE,优先使用唤醒cpu来运行进程p,
        还需判断下面3个条件是否满足:
        !wake_wide(p)           // 当前cpu的唤醒次数没有超标
        task_fits_max(p, cpu)   // 当前cpu的capacity能容纳进程p的util
        cpumask_test_cpu(cpu, tsk_cpus_allowed(p)) // 当前cpu在进程在P的affinity中
        EAS利用了want_affine这个标志,只要EAS使能,want_affine =1
     */
    if (sd_flag & SD_BALANCE_WAKE)
        want_affine = (!wake_wide(p) && task_fits_max(p, cpu) &&
                  cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) ||
                  energy_aware();

    rcu_read_lock();
    /* (4) 从下往上遍历当前cpu的sd,查询在哪个层次的sd进行负载均衡 */
    for_each_domain(cpu, tmp) {

        /* (4.1 如果当前sd不支持负载均SD_LOAD_BALANCE,退出) */
        if (!(tmp->flags & SD_LOAD_BALANCE))
            break;

        /* (4.2) 优先找affine_sd,找到直接break;
            需要符合以下3个条件:
            want_affine                     //
            (tmp->flags & SD_WAKE_AFFINE)   // 当前sd支持SD_WAKE_AFFINE标志
            cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))  //当前sd->span[]中同时包含cpu、pre_cpu
         */
        /*
         * If both cpu and prev_cpu are part of this domain,
         * cpu is a valid SD_WAKE_AFFINE target.
         */
        if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
            cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
            affine_sd = tmp;
            break;
        }

        /* (4.3) 其次找一个符合sd_flag的sd */
        if (tmp->flags & sd_flag)
            sd = tmp;
        /* (4.4) 如果以上都失败,直接跳出 */
        else if (!want_affine)
            break;
    }

    /* (5) 如果affine_sd成功找到
     */
    if (affine_sd) {
        sd = NULL; /* Prefer wake_affine over balance flags */
        if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
            new_cpu = cpu;
    }

    /* (6) 没有找到符合sd_flag的sd */
    if (!sd) {
        /* (6.1) EAS使能,且本cpu没有overutilized, 
            使用EAS负载均衡算法
         */
        if (energy_aware() && !cpu_rq(cpu)->rd->overutilized) {
            new_cpu = energy_aware_wake_cpu(p, prev_cpu);
            policy |= LB_EAS;
        }
        /* (6.2) 如果不能使用EAS,且sd_flag中设置SD_BALANCE_WAKE标志 
            尝试在唤醒的cpu上运行p进程,
            ooooo前面辛苦计算的affine_sd没有派上用场?
         */
        else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
            if (true) {
#ifdef CONFIG_CGROUP_SCHEDTUNE
                bool prefer_idle = schedtune_prefer_idle(p) > 0;
#else
                bool prefer_idle = true;
#endif
                int idle_cpu;


                idle_cpu = find_best_idle_cpu(p, prefer_idle);
                if (idle_cpu >= 0) {
                    new_cpu = idle_cpu;
                    policy |= LB_IDLEST;
                } else {
                    new_cpu = select_max_spare_capacity_cpu(p, new_cpu);
                    policy |= LB_SPARE;
                }
            } else
            /* (6.3) 不符合上述条件下的默认处理,尝试找一个idle cpu */
                new_cpu = select_idle_sibling(p, new_cpu);
        }
    } else while (sd) {
    /* (7) 找到符合sd_flag的sd */
        struct sched_group *group;
        int weight;

        policy |= LB_SMP;

        /* (7.1) */
        if (!(sd->flags & sd_flag)) {
            sd = sd->child;
            continue;
        }

        /* (7.2) */
        group = find_idlest_group(sd, p, cpu, sd_flag);
        if (!group) {
            sd = sd->child;
            continue;
        }

        /* (7.3) */
        new_cpu = find_idlest_cpu(group, p, cpu);
        if (new_cpu == -1 || new_cpu == cpu) {
            /* Now try balancing at a lower domain level of cpu */
            sd = sd->child;
            continue;
        }

        /* (7.4) */
        /* Now try balancing at a lower domain level of new_cpu */
        cpu = new_cpu;
        weight = sd->span_weight;
        sd = NULL;
        for_each_domain(cpu, tmp) {
            if (weight <= tmp->span_weight)
                break;
            if (tmp->flags & sd_flag)
                sd = tmp;
        }
        /* while loop will break here if sd == NULL */
    }
#ifdef CONFIG_MTK_SCHED_TRACERS
    policy |= (new_cpu << LB_SMP_SHIFT);
#endif

    rcu_read_unlock();


    /* (8) 在EAS不能运行的情况下,在做一次HMP的select操作:
        判断进程p是否符合hmp的迁移条件,如果符合一次迁移到位,避免后续hmp的操作
     */
    /*  Consider hmp if no EAS  or over-utiled in hybrid mode. */
    if ((!energy_aware() && sched_feat(SCHED_HMP)) ||
        (hybrid_support() && cpu_rq(cpu)->rd->overutilized)) {

        new_cpu = hmp_select_task_rq_fair(sd_flag, p, prev_cpu, new_cpu);
#ifdef CONFIG_MTK_SCHED_TRACERS
        policy |= (new_cpu << LB_HMP_SHIFT);
#endif
        policy |= LB_HMP;
    }

#ifdef CONFIG_MTK_SCHED_TRACERS
    trace_sched_select_task_rq(p, policy, prev_cpu, new_cpu);
#endif

    return new_cpu;
}

|→

inline int hmp_fork_balance(struct task_struct *p, int prev_cpu)
{
    int new_cpu = prev_cpu;
    int cpu = smp_processor_id();

    /* (2.1) prev_cpu所在cluster是最快(fastest)的  */
    if (hmp_cpu_is_fastest(prev_cpu)) {
        /* prev_cpu is fastest domain */
        struct hmp_domain *hmpdom;
        __always_unused int lowest_ratio;

        hmpdom = list_entry(
                &hmp_cpu_domain(prev_cpu)->hmp_domains,
                struct hmp_domain, hmp_domains);

        /* (2.2) 尝试选出负载最小的cpu */
        lowest_ratio = hmp_domain_min_load(hmpdom, &new_cpu);

        if (new_cpu < nr_cpu_ids && cpumask_test_cpu(new_cpu, tsk_cpus_allowed(p)))
            return new_cpu;

        new_cpu = cpumask_any_and(&hmp_faster_domain(cpu)->cpus,
                tsk_cpus_allowed(p));

        if (new_cpu < nr_cpu_ids)
            return new_cpu;
    } else {
        /* (2.3) prev_cpu不在最快的cluster,尝试从更快(faster)的cluster中选出一个cpu */
        /* prev_cpu is not fastest domain */
        new_cpu = hmp_select_faster_cpu(p, prev_cpu);

        if (new_cpu < nr_cpu_ids)
            return new_cpu;
    }

    return new_cpu;
}

|→

static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
{
    s64 this_load, load;
    s64 this_eff_load, prev_eff_load;
    int idx, this_cpu, prev_cpu;
    struct task_group *tg;
    unsigned long weight;
    int balanced;

    idx   = sd->wake_idx;
    this_cpu  = smp_processor_id();
    prev_cpu  = task_cpu(p);
    load      = source_load(prev_cpu, idx);
    this_load = target_load(this_cpu, idx);

    /* (5.1) */
    /*
     * If sync wakeup then subtract the (maximum possible)
     * effect of the currently running task from the load
     * of the current CPU:
     */
    if (sync) {
        tg = task_group(current);
        weight = current->se.avg.load_avg;

        this_load += effective_load(tg, this_cpu, -weight, -weight);
        load += effective_load(tg, prev_cpu, 0, -weight);
    }

    tg = task_group(p);
    weight = p->se.avg.load_avg;

    /*
     * In low-load situations, where prev_cpu is idle and this_cpu is idle
     * due to the sync cause above having dropped this_load to 0, we'll
     * always have an imbalance, but there's really nothing you can do
     * about that, so that's good too.
     *
     * Otherwise check if either cpus are near enough in load to allow this
     * task to be woken on this_cpu.
     */
    this_eff_load = 100;
    this_eff_load *= capacity_of(prev_cpu);

    prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
    prev_eff_load *= capacity_of(this_cpu);

    if (this_load > 0) {
        this_eff_load *= this_load +
            effective_load(tg, this_cpu, weight, weight);

        prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
    }

    balanced = this_eff_load <= prev_eff_load;

    schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);

    if (!balanced)
        return 0;

    schedstat_inc(sd, ttwu_move_affine);
    schedstat_inc(p, se.statistics.nr_wakeups_affine);

    return 1;
}
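
wake_affine()的判断本质上是一次带不均衡系数的交叉加权负载比较。举一组假设数值(仅为示意):若sd->imbalance_pct = 125,则prev侧的系数为100 + (125 - 100)/2 = 112,this侧的系数为100;this_eff_load = 100 * capacity_of(prev_cpu) * this侧负载,prev_eff_load = 112 * capacity_of(this_cpu) * prev侧负载。只有this_eff_load <= prev_eff_load(即this_cpu侧的加权负载不超过prev_cpu侧放宽12%后的加权负载)时才认为balanced,wake_affine()返回1,允许把进程唤醒到this_cpu上运行。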

|→

static int energy_aware_wake_cpu(struct task_struct *p, int target)
{
    int target_max_cap = INT_MAX;
    int target_cpu = task_cpu(p);
    unsigned long min_util;
    unsigned long new_util;
    int i, cpu;
    bool is_tiny = false;
    int nrg_diff = 0;
    int cluster_id = 0;
    struct cpumask cluster_cpus;
    int max_cap_cpu = 0;
    int best_cpu = 0;

    /* (6.1.1) 遍历cluster和cpu,找出一个capacity最小的cpu能容纳下util(p)为best_cpu */
    /*
     * Find group with sufficient capacity. We only get here if no cpu is
     * overutilized. We may end up overutilizing a cpu by adding the task,
     * but that should not be any worse than select_idle_sibling().
     * load_balance() should sort it out later as we get above the tipping
     * point.
     */
    cluster_id = arch_get_nr_clusters();
    for (i = 0; i < cluster_id; i++) {
        arch_get_cluster_cpus(&cluster_cpus, i);
        max_cap_cpu = cpumask_first(&cluster_cpus);

        /* Assuming all cpus are the same in group */
        for_each_cpu(cpu, &cluster_cpus) {

            if (!cpu_online(cpu))
                continue;

            if (capacity_of(max_cap_cpu) < target_max_cap &&
            task_fits_max(p, max_cap_cpu)) {
                best_cpu = cpu;
                target_max_cap = capacity_of(max_cap_cpu);
            }
            break;
        }
    }

    if (task_util(p) < TINY_TASK_THRESHOLD)
        is_tiny = true;

    /* Find cpu with sufficient capacity */
    min_util = boosted_task_util(p);
    if (!is_tiny)
        /* (6.1.2) 根据best_cpu所在的cluster和进程p的affinity,
            找出加上util(p)以后,剩余capacity最大的cpu:target_cpu
         */
        target_cpu = select_max_spare_capacity_cpu(p, best_cpu);
    else
        /* (6.1.3) 根据cluster和进程p的affinity,
            找出加上util(p)以后,当前freq的capacity能满足的第一个cpu:target_cpu
         */
        for_each_cpu_and(i, tsk_cpus_allowed(p), &cluster_cpus) {

            if (!cpu_online(i))
                continue;

            /*
             * p's blocked utilization is still accounted for on prev_cpu
             * so prev_cpu will receive a negative bias due to the double
             * accounting. However, the blocked utilization may be zero.
             */
            new_util = cpu_util(i) + task_util(p);

            /*
             * Ensure minimum capacity to grant the required boost.
             * The target CPU can be already at a capacity level higher
             * than the one required to boost the task.
             */
            new_util = max(min_util, new_util);

#ifdef CONFIG_MTK_SCHED_INTEROP
            if (cpu_rq(i)->rt.rt_nr_running && likely(!is_rt_throttle(i)))
                continue;
#endif
            if (new_util > capacity_orig_of(i))
                continue;

            if (new_util < capacity_curr_of(i)) {
                target_cpu = i;
                if (cpu_rq(i)->nr_running)
                    break;
            }

            /* cpu has capacity at higher OPP, keep it as fallback */
            if (target_cpu == task_cpu(p))
                target_cpu = i;
        }

    /* (6.1.4) 如果pre_cpu和target_cpu是同一个cluster,直接成功返回 */
    /* no need energy calculation if the same domain */
    if (is_the_same_domain(task_cpu(p), target_cpu))
        return target_cpu;

    /* no energy comparison if the same cluster */
    if (target_cpu != task_cpu(p)) {

        /* (6.1.5) 构造需要迁移的环境变量  */
        struct energy_env eenv = {
            .util_delta = task_util(p),
            .src_cpu    = task_cpu(p),
            .dst_cpu    = target_cpu,
            .task       = p,
        };

        /* Not enough spare capacity on previous cpu */
        if (cpu_overutilized(task_cpu(p))) {
            trace_energy_aware_wake_cpu(p, task_cpu(p), target_cpu,
                    (int)task_util(p), nrg_diff, true, is_tiny);
            return target_cpu;
        }

        /* (6.1.6) 计算进程p从pre_cpu迁移到target_cpu后的功耗差值nrg_diff,
            如果功耗增加,nrg_diff >= 0,返回pre_cpu即task_cpu(p),
            如果功耗减少,返回新的target_cpu
         */
        nrg_diff = energy_diff(&eenv);
        if (nrg_diff >= 0) {
            trace_energy_aware_wake_cpu(p, task_cpu(p), target_cpu,
                    (int)task_util(p), nrg_diff, false, is_tiny);
            return task_cpu(p);
        }
    }

    trace_energy_aware_wake_cpu(p, task_cpu(p), target_cpu, (int)task_util(p), nrg_diff, false, is_tiny);
    return target_cpu;
}

||→

static inline int
energy_diff(struct energy_env *eenv)
{
    unsigned int boost;
    int nrg_delta;

    /* Conpute "absolute" energy diff */
    __energy_diff(eenv);

    /* Return energy diff when boost margin is 0 */
#ifdef CONFIG_CGROUP_SCHEDTUNE
    boost = schedtune_task_boost(eenv->task);
#else
    boost = get_sysctl_sched_cfs_boost();
#endif
    if (boost == 0)
        return eenv->nrg.diff;

    /* Compute normalized energy diff */
    nrg_delta = normalize_energy(eenv->nrg.diff);
    eenv->nrg.delta = nrg_delta;

    eenv->payoff = schedtune_accept_deltas(
            eenv->nrg.delta,
            eenv->cap.delta,
            eenv->task);

    /*
     * When SchedTune is enabled, the energy_diff() function will return
     * the computed energy payoff value. Since the energy_diff() return
     * value is expected to be negative by its callers, this evaluation
     * function return a negative value each time the evaluation return a
     * positive payoff, which is the condition for the acceptance of
     * a scheduling decision
     */
    return -eenv->payoff;
}

static int __energy_diff(struct energy_env *eenv)
{
    struct sched_domain *sd;
    struct sched_group *sg;
    int sd_cpu = -1, energy_before = 0, energy_after = 0;

    /* (6.1.6.1) 构造迁移前的环境变量  */
    struct energy_env eenv_before = {
        .util_delta = 0,
        .src_cpu    = eenv->src_cpu,
        .dst_cpu    = eenv->dst_cpu,
        .nrg        = { 0, 0, 0, 0},
        .cap        = { 0, 0, 0 },
    };
#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT
    int i;
#endif

    if (eenv->src_cpu == eenv->dst_cpu)
        return 0;

#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT
    /* To get max opp index of every cluster for power estimation of share buck */
    for (i = 0; i < arch_get_nr_clusters(); i++) {
        /* for energy before */
        eenv_before.opp_idx[i]  = mtk_cluster_capacity_idx(i, &eenv_before);

        /* for energy after */
        eenv->opp_idx[i]  = mtk_cluster_capacity_idx(i, eenv);

        mt_sched_printf(sched_eas_energy_calc, "cid=%d, before max_opp:%d, after max_opp:%d\n",
                    i, eenv_before.opp_idx[i], eenv->opp_idx[i]);
    }
#endif

    /* (6.1.6.2) sd来至于cache sd_ea,是cpu对应的顶层sd(tl DIE层) */
    sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
    sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));

    if (!sd)
        return 0; /* Error */


    mt_sched_printf(sched_eas_energy_calc, "0. %s: move task from src=%d to dst=%d util=%d",
                __func__, eenv->src_cpu, eenv->dst_cpu, eenv->util_delta);

    sg = sd->groups;

    /* (6.1.6.3) 遍历sg所在sg链表,找到符合条件的sg, 
        累加计算eenv_before、eenv相关sg的功耗
     */ 
    do {
        /* (6.1.6.4) 如果当前sg包含src_cpu或者dst_cpu,计算 */
        if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {

            /* (6.1.6.5) 当前顶层sg为eenv的sg_top  */
            eenv_before.sg_top = eenv->sg_top = sg;

            mt_sched_printf(sched_eas_energy_calc, "1. %s: src=%d dst=%d mask=0x%lx (before)",
                    __func__,  eenv_before.src_cpu, eenv_before.dst_cpu, sg->cpumask[0]);

            /* (6.1.6.6) 计算eenv_before负载下sg的power */
            if (sched_group_energy(&eenv_before))
                return 0; /* Invalid result abort */
            energy_before += eenv_before.energy;

            /* Keep track of SRC cpu (before) capacity */
            eenv->cap.before = eenv_before.cap.before;
            eenv->cap.delta = eenv_before.cap.delta;


            mt_sched_printf(sched_eas_energy_calc, "2. %s: src=%d dst=%d mask=0x%lx (after)",
                    __func__,  eenv->src_cpu, eenv->dst_cpu, sg->cpumask[0]);
            /* (6.1.6.7) 计算eenv负载下sg的power */
            if (sched_group_energy(eenv))
                return 0; /* Invalid result abort */
            energy_after += eenv->energy;
        }
    } while (sg = sg->next, sg != sd->groups);

    /* (6.1.6.8) 计算energy_after - energy_before */
    eenv->nrg.before = energy_before;
    eenv->nrg.after = energy_after;
    eenv->nrg.diff = eenv->nrg.after - eenv->nrg.before;
    eenv->payoff = 0;

    trace_sched_energy_diff(eenv->task,
                eenv->src_cpu, eenv->dst_cpu, eenv->util_delta,
                eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff,
                eenv->cap.before, eenv->cap.after, eenv->cap.delta,
                eenv->nrg.delta, eenv->payoff);

    mt_sched_printf(sched_eas_energy_calc, "5. %s: nrg.diff=%d cap.delta=%d",
                __func__, eenv->nrg.diff, eenv->cap.delta);

    return eenv->nrg.diff;
}

|||→

static int sched_group_energy(struct energy_env *eenv)
{
    struct sched_domain *sd;
    int cpu, total_energy = 0;
    struct cpumask visit_cpus;
    struct sched_group *sg;
#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT
    int only_lv1_sd = 0;
#endif

    WARN_ON(!eenv->sg_top->sge);

    cpumask_copy(&visit_cpus, sched_group_cpus(eenv->sg_top));

    /* (6.1.6.6.1) 根据sg_top顶层sd,找到需要计算的cpu集合visit_cpus,逐个遍历其中每一个cpu
        ooooo这一套复杂的循环算法计算下来,其实就计算了几个power,以cpu0-cpu3为例:
        4个底层sg的power + 1个顶层sg的power
     */ 
    while (!cpumask_empty(&visit_cpus)) {
        struct sched_group *sg_shared_cap = NULL;

        /* (6.1.6.6.2) 选取visit_cpus中的第一个cpu */
        cpu = cpumask_first(&visit_cpus);

        sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
        if (!sd) {
            /* a corner racing with hotplug? sd doesn't exist in this cpu. */

            return -EINVAL;
        }

        /*
         * Is the group utilization affected by cpus outside this
         * sched_group?
         */
        sd = rcu_dereference(per_cpu(sd_scs, cpu));
#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT
        /* Try to handle one CPU in this cluster by hotplug.
         * In it there is only lv-1 sched_domain exist which having
         * no share_cap_states.
         */
        if (!sd) {
            sd = rcu_dereference(per_cpu(sd_ea, cpu));
            only_lv1_sd = 1;
        }
#endif
        if (!sd) {
            /*
             * We most probably raced with hotplug; returning a
             * wrong energy estimation is better than entering an
             * infinite loop.
             */
            return -EINVAL;
        }

        if (sd->parent)
            sg_shared_cap = sd->parent->groups;

        /* (6.1.6.6.3) 从底层到顶层逐个遍历cpu所在的sd */
        for_each_domain(cpu, sd) {
            sg = sd->groups;

            /* (6.1.6.6.4) 如果是顶层sd,只会计算一个sg */
            /* Has this sched_domain already been visited? */
            if (sd->child && group_first_cpu(sg) != cpu)
                break;

            /* (6.1.6.6.5) 逐个遍历该层次sg链表所在sg */
            do {
                unsigned long group_util;
                int sg_busy_energy, sg_idle_energy;
                int cap_idx, idle_idx;

                if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
                    eenv->sg_cap = sg_shared_cap;
                else
                    eenv->sg_cap = sg;

                /* (6.1.6.6.6) 根据eenv指示的负载变化,找出满足该sg中最大负载cpu的capacity_index */
                cap_idx = find_new_capacity(eenv, sg->sge);

                if (sg->group_weight == 1) {
                    /* Remove capacity of src CPU (before task move) */
                    if (eenv->util_delta == 0 &&
                        cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) {
                        eenv->cap.before = sg->sge->cap_states[cap_idx].cap;
                        eenv->cap.delta -= eenv->cap.before;
                    }
                    /* Add capacity of dst CPU  (after task move) */
                    if (eenv->util_delta != 0 &&
                        cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) {
                        eenv->cap.after = sg->sge->cap_states[cap_idx].cap;
                        eenv->cap.delta += eenv->cap.after;
                    }
                }

                /* (6.1.6.6.7) 找出sg所有cpu中最小的idle index */
                idle_idx = group_idle_state(sg);

                /* (6.1.6.6.8) 累加sg中所有cpu的相对负载,
                    最大负载为sg->sge->cap_states[eenv->cap_idx].cap
                 */
                group_util = group_norm_util(eenv, sg);

                /* (6.1.6.6.9) 计算power = busy_power + idle_power */
#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT
                /*
                 * To support power estimation for MTK soc.
                 * Consider share buck for dynamic power and SPARK/MCDI for static power.
                 */
                sg_busy_energy = (group_util *
                    sg->sge->busy_power(group_first_cpu(sg), eenv, (sd->child) ? 1 : 0))
                                >> SCHED_CAPACITY_SHIFT;
                sg_idle_energy = ((SCHED_LOAD_SCALE - group_util) *
                    sg->sge->idle_power(idle_idx, group_first_cpu(sg), eenv, (sd->child) ? 1 : 0))
                                >> SCHED_CAPACITY_SHIFT;
#else
                /* Power value had been separated to static + dynamic here */
                sg_busy_energy = (group_util * (sg->sge->cap_states[cap_idx].dyn_pwr +
                        sg->sge->cap_states[cap_idx].lkg_pwr[sg->sge->lkg_idx]))
                                >> SCHED_CAPACITY_SHIFT;
                sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) *
                        sg->sge->idle_states[idle_idx].power)
                                >> SCHED_CAPACITY_SHIFT;
#endif

                total_energy += sg_busy_energy + sg_idle_energy;

                mt_sched_printf(sched_eas_energy_calc, "busy_energy=%d idle_eneryg=%d (cost=%d)",
                            sg_busy_energy, sg_idle_energy, total_energy);

                /* (6.1.6.6.10) 如果遍历了底层sd,从visit_cpus中去掉对应的sg cpu */
                if (!sd->child)
                    cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));

#ifdef CONFIG_MTK_SCHED_EAS_POWER_SUPPORT
                /*
                 * We try to get correct energy estimation while racing with hotplug
                 * and avoid entering a infinite loop.
                 */
                if (only_lv1_sd) {
                    eenv->energy = total_energy;
                    return 0;
                }
#endif

                if (cpumask_equal(sched_group_cpus(sg), sched_group_cpus(eenv->sg_top)))
                    goto next_cpu;

            } while (sg = sg->next, sg != sd->groups);
        }

        /* (6.1.6.6.11) 如果遍历了cpu的底层到顶层sd,从visit_cpus中去掉对应的cpu */
next_cpu:
        cpumask_clear_cpu(cpu, &visit_cpus);
        continue;
    }

    eenv->energy = total_energy;
    return 0;
}

|→

static struct sched_group *
find_idlest_group(struct sched_domain *sd, struct task_struct *p,
          int this_cpu, int sd_flag)
{
    struct sched_group *idlest = NULL, *group = sd->groups;
    struct sched_group *fit_group = NULL;
    unsigned long min_load = ULONG_MAX, this_load = 0;
    unsigned long fit_capacity = ULONG_MAX;
    int load_idx = sd->forkexec_idx;
    int imbalance = 100 + (sd->imbalance_pct-100)/2;

    /* (7.2.1) 选择load_idx */
    if (sd_flag & SD_BALANCE_WAKE)
        load_idx = sd->wake_idx;

    /* (7.2.2) 当前cpu所在sd层次的sg,遍历sg所在的sg链表,选出负载最轻的idlest sg */
    do {
        unsigned long load, avg_load;
        int local_group;
        int i;

        /* (7.2.3) 略过不符合p进程affinity的sg */
        /* Skip over this group if it has no CPUs allowed */
        if (!cpumask_intersects(sched_group_cpus(group),
                    tsk_cpus_allowed(p)))
            continue;

        /* (7.2.4) local_group等于本cpu所在的sg */
        local_group = cpumask_test_cpu(this_cpu,
                           sched_group_cpus(group));

        /* Tally up the load of all CPUs in the group */
        avg_load = 0;

        /* (7.2.5) 遍历sg中的所有cpu,累加负载 */
        for_each_cpu(i, sched_group_cpus(group)) {
            /* Bias balancing toward cpus of our domain */
            if (local_group)
                load = source_load(i, load_idx);
            else
                load = target_load(i, load_idx);

#ifdef CONFIG_MTK_SCHED_INTEROP
            load += mt_rt_load(i);
#endif
            avg_load += load;

            /* (7.2.6) 如果EAS使能,找到能最小满足进程p的capacity sg */
            /*
             * Look for most energy-efficient group that can fit
             * that can fit the task.
             */
            if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {
                fit_capacity = capacity_of(i);
                fit_group = group;
            }
        }

        /* (7.2.7) 用累计的负载计算相对负载 */
        /* Adjust by relative CPU capacity of the group */
        avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity;

        /* (7.2.8) 计算idlest sg */
        if (local_group) {
            this_load = avg_load;
        } else if (avg_load < min_load) {
            min_load = avg_load;
            idlest = group;
        }
    } while (group = group->next, group != sd->groups);

    /* (7.2.9) EAS使能,返回fit_group */
    if (energy_aware() && fit_group)
        return fit_group;

    if (!idlest || 100*this_load < imbalance*min_load)
        return NULL;

    /* (7.2.11) 否则,返回idlest */
    return idlest;
}

|→

static int
find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
{
    unsigned long load, min_load = ULONG_MAX;
    unsigned int min_exit_latency = UINT_MAX;
    u64 latest_idle_timestamp = 0;
    int least_loaded_cpu = this_cpu;
    int shallowest_idle_cpu = -1;
    int i;

    /* (7.3.1) 遍历sg中符合p进程affinity的cpu */
    /* Traverse only the allowed CPUs */
    for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {

        /* (7.3.2) 如果cpu的剩余capacity能容纳下p进程的load */
        if (task_fits_spare(p, i)) {
            struct rq *rq = cpu_rq(i);
            struct cpuidle_state *idle = idle_get_state(rq);

            /* (7.3.2.1) 优先选出idle状态,且退出idle开销最小的cpu */
            if (idle && idle->exit_latency < min_exit_latency) {
                /*
                 * We give priority to a CPU whose idle state
                 * has the smallest exit latency irrespective
                 * of any idle timestamp.
                 */
                min_exit_latency = idle->exit_latency;
                latest_idle_timestamp = rq->idle_stamp;
                shallowest_idle_cpu = i;
            } else if (idle_cpu(i) &&
                   (!idle || idle->exit_latency == min_exit_latency) &&
                   rq->idle_stamp > latest_idle_timestamp) {
                /*
                 * If equal or no active idle state, then
                 * the most recently idled CPU might have
                 * a warmer cache.
                 */
                latest_idle_timestamp = rq->idle_stamp;
                shallowest_idle_cpu = i;
            } else if (shallowest_idle_cpu == -1) {
                /*
                 * If we haven't found an idle CPU yet
                 * pick a non-idle one that can fit the task as
                 * fallback.
                 */
                shallowest_idle_cpu = i;
            }

        /* (7.3.3) cpu的剩余capacity容纳不下进程p,选出负载最轻的cpu */
        } else if (shallowest_idle_cpu == -1) {
            load = weighted_cpuload(i);
#ifdef CONFIG_MTK_SCHED_INTEROP
            load += mt_rt_load(i);
#endif
            if (load < min_load || (load == min_load && i == this_cpu)) {
                min_load = load;
                least_loaded_cpu = i;
            }
        }
    }

    return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
}

|→

static int hmp_select_task_rq_fair(int sd_flag, struct task_struct *p,
        int prev_cpu, int new_cpu)
{
    struct list_head *pos;
    struct sched_entity *se = &p->se;
    struct cpumask fast_cpu_mask, slow_cpu_mask;

#ifdef CONFIG_HMP_TRACER
    int cpu = 0;

    for_each_online_cpu(cpu)
        trace_sched_cfs_runnable_load(cpu, cfs_load(cpu), cfs_length(cpu));
#endif

    /* error handling */
    if (prev_cpu >= num_possible_cpus())
        return new_cpu;

    /*
     * Skip all the checks if only one CPU is online.
     * Otherwise, select the most appropriate CPU from cluster.
     */
    if (num_online_cpus() == 1)
        goto out;

    /* (8.1) 找出fastest hmp_domain,只有一个, 
        找出slow hmp_domain,有多个,
        在一个fast_cpu_mask和多个slow_cpu_mask之间,逐个尝试hmp_select_task_migration()
        p进程是否会满足hmp迁移
     */
    cpumask_clear(&fast_cpu_mask);
    cpumask_clear(&slow_cpu_mask);
    /* order: fast to slow hmp domain */
    list_for_each(pos, &hmp_domains) {
        struct hmp_domain *domain = list_entry(pos, struct hmp_domain, hmp_domains);

        if (!cpumask_empty(&domain->cpus)) {
            if (cpumask_empty(&fast_cpu_mask)) {
                cpumask_copy(&fast_cpu_mask, &domain->possible_cpus);
            } else {
                cpumask_copy(&slow_cpu_mask, &domain->possible_cpus);
                new_cpu = hmp_select_task_migration(sd_flag, p,
                    prev_cpu, new_cpu, &fast_cpu_mask, &slow_cpu_mask);
            }
        }
    }

out:
    /* it happens when num_online_cpus=1 */
    if (new_cpu >= nr_cpu_ids) {
        /* BUG_ON(1); */
        new_cpu = prev_cpu;
    }

    cfs_nr_pending(new_cpu)++;
    cfs_pending_load(new_cpu) += se_load(se);

    return new_cpu;

}

||→

static int hmp_select_task_migration(int sd_flag, struct task_struct *p, int prev_cpu, int new_cpu,
        struct cpumask *fast_cpu_mask, struct cpumask *slow_cpu_mask)
{
    int step = 0;
    struct sched_entity *se = &p->se;
    int B_target = num_possible_cpus();
    int L_target = num_possible_cpus();
    struct clb_env clbenv;

    /* (8.1.1) 找出fast_cpu_mask中负载最轻的cpu B_target,且符合p进程的affinity */
    B_target = hmp_select_cpu(HMP_SELECT_RQ, p, fast_cpu_mask, prev_cpu, 0);

    /* (8.1.2) 找出slow_cpu_mask中负载最轻的cpu L_target,且符合p进程的affinity */
    L_target = hmp_select_cpu(HMP_SELECT_RQ, p, slow_cpu_mask, prev_cpu, 1);

    /*
     * Only one cluster exists or only one cluster is allowed for this task
     * Case 1: return the runqueue whose load is minimum
     * Case 2: return original CFS runqueue selection result
     */
    if (B_target >= num_possible_cpus() && L_target >= num_possible_cpus())
        goto out;
    if (B_target >= num_possible_cpus())
        goto select_slow;
    if (L_target >= num_possible_cpus())
        goto select_fast;

    /*
     * Two clusters exist and both clusters are allowed for this task
     * Step 1: Move newly created task to the cpu where no tasks are running
     * Step 2: Migrate heavy-load task to big
     * Step 3: Migrate light-load task to LITTLE
     * Step 4: Make sure the task stays in its previous hmp domain
     */
    step = 1;
    if (task_created(sd_flag) && !task_low_priority(p->prio)) {
        if (!rq_length(B_target))
            goto select_fast;
        if (!rq_length(L_target))
            goto select_slow;
    }

    /* (8.1.3) 计算如果L_target和B_target发生hmp迁移,各种负载和thershold的计算 */
    memset(&clbenv, 0, sizeof(clbenv));
    clbenv.flags |= HMP_SELECT_RQ;
    cpumask_copy(&clbenv.lcpus, slow_cpu_mask);
    cpumask_copy(&clbenv.bcpus, fast_cpu_mask);
    clbenv.ltarget = L_target;
    clbenv.btarget = B_target;
    sched_update_clbstats(&clbenv);

    /* (8.1.4) 判断进程p从L_target up到 B_target的可行性 */
    step = 2;
    if (hmp_up_migration(L_target, &B_target, se, &clbenv))
        goto select_fast;

    /* (8.1.5) 判断进程p从B_target down到 L_target的可行性 */
    step = 3;
    if (hmp_down_migration(B_target, &L_target, se, &clbenv))
        goto select_slow;

    /* (8.1.6) 如果prev_cpu是slowest */
    step = 4;
    if (hmp_cpu_is_slowest(prev_cpu))
        goto select_slow;
    goto select_fast;

    /* (8.1.7) 返回 B_target */
select_fast:
    new_cpu = B_target;
    cpumask_clear(slow_cpu_mask);
    goto out;

    /* (8.1.8) 返回 L_target */
select_slow:
    new_cpu = L_target;
    cpumask_copy(fast_cpu_mask, slow_cpu_mask);
    cpumask_clear(slow_cpu_mask);
    goto out;

out:
#ifdef CONFIG_HMP_TRACER
    trace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg);
#endif
    return new_cpu;
}

4.2、HMP负载均衡

除了SMP load_balance()负载均衡以外,我们还希望在多个SMP cluster之间能遵守一种规则:heavy任务跑在big core上,light任务跑在little core上,这样能快速的达到一个合理的负载状态。这种算法就叫做HMP负载均衡,EAS会统一的考虑负载、性能、功耗,EAS使能后HMP就被禁用了。

HMP负载均衡的操作分两种:

  • 1、heavy task从little cpu迁移到big cpu。这种叫做up操作,对应的函数hmp_force_up_migration();
  • 2、light task从big cpu迁移到little cpu。这种叫做down操作,对应的函数hmp_force_down_migration();

4.2.1、hmp domain初始化


hmp在初始化的时候会为每个cluster分配一个hmp_domain,把所有hmp_domain加入到全局链表hmp_domains中。hmp_domains链表构建完成以后,离链表头hmp_domains最近的hmp_domain对应速度最快的cluster,离表头越远的hmp_domain对应的cluster速度越慢。因为在构造链表时是按照cluster id依次头插(list_add)加入的,速度最快的cluster的hmp_domain最后加入,所以离表头最近。
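
以一个假设的双cluster平台为例(假设cluster0为little即cpu0-3,cluster1为big即cpu4-7,仅为示意):arch_get_hmp_domains()按cluster id从0开始依次调用list_add()头插,构建完成后的链表顺序为:

hmp_domains(表头) -> hmp_domain(big: cpu4-7) -> hmp_domain(little: cpu0-3)

因此像后面hmp_force_up_migration()、hmp_select_task_rq_fair()中那样从表头开始list_for_each()遍历,就是按“从快到慢”的顺序访问各个cluster。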

static int __init hmp_cpu_mask_setup(void)
{
    struct hmp_domain *domain;
    struct list_head *pos;
    int dc, cpu;

    pr_warn("Initializing HMP scheduler:\n");

    /* Initialize hmp_domains using platform code */
    /* (1) 调用arch相关的hmp_domains初始化函数 */
    arch_get_hmp_domains(&hmp_domains);
    if (list_empty(&hmp_domains)) {
        pr_warn("HMP domain list is empty!\n");
        return 0;
    }

    /* Print hmp_domains */
    dc = 0;
    list_for_each(pos, &hmp_domains) {
        domain = list_entry(pos, struct hmp_domain, hmp_domains);

        for_each_cpu(cpu, &domain->possible_cpus) {
            /* (2) 给per_cpu变量hmp_cpu_domain赋值 */
            per_cpu(hmp_cpu_domain, cpu) = domain;
        }
        dc++;
    }

    return 1;
}

|→

void __init arch_get_hmp_domains(struct list_head *hmp_domains_list)
{
    struct hmp_domain *domain;
    struct cpumask cpu_mask;
    int id, maxid;

    cpumask_clear(&cpu_mask);
    maxid = arch_get_nr_clusters();

    /*
     * Initialize hmp_domains
     * Must be ordered with respect to compute capacity.
     * Fastest domain at head of list.
     */
    /* (1.1) 按照cluster id初始化对应的hmp_domain */
    for (id = 0; id < maxid; id++) {
        arch_get_cluster_cpus(&cpu_mask, id);
        domain = (struct hmp_domain *)
            kmalloc(sizeof(struct hmp_domain), GFP_KERNEL);
        cpumask_copy(&domain->possible_cpus, &cpu_mask);
        cpumask_and(&domain->cpus, cpu_online_mask, &domain->possible_cpus);

        /* (1.2) 将hmp_domain加入到全局链表hmp_domains_list即hmp_domains中 */
        list_add(&domain->hmp_domains, hmp_domains_list);
    }
}

4.2.2、hmp_force_up_migration()

hmp_force_up_migration()的操作主要有以下几个步骤:

需要重点提一下的是:负载计算中一共计算了3种负载(load_avg、loadwop_avg、util_avg),HMP负载均衡主要使用其中的loadwop_avg。

  • 1、根据当前cpu,选择fast_cpu_mask、slow_cpu_mask;

hmp_force_up_migration尝试把slow cpu上的heavy进程迁移到fast cpu上;关于slow、fast的选择有几种场景,具体见下面hmp_force_up_migration()代码中fast_cpu_mask/slow_cpu_mask的确定过程。

  • 2、选择当前cpu的heaviest进程作为迁移进程p;并不会遍历cpu上所有进程去选出heaviest进程,只会查询curr进程和cfs_rq中5个进程中的heaviest;

  • 3、根据fast_cpu_mask,选择一个负载最少的target cpu;


  • 4、根据源cpu(curr_cpu)、目的cpu(target_cpu),计算负载;

重要的数据计算方法:

重要数据 | 所属结构 | 含义 | 更新/获取函数 | 计算方法
clbenv->bstats.cpu_power | clbenv->bstats | B族cpu的绝对计算能力 | sched_update_clbstats() | arch_scale_cpu_capacity(NULL, clbenv->btarget)
clbenv->lstats.cpu_power | clbenv->lstats | L族cpu的绝对计算能力 | sched_update_clbstats() | arch_scale_cpu_capacity(NULL, clbenv->ltarget)
clbenv->bstats.cpu_capacity | clbenv->bstats | B族cpu的相对计算能力,大于1024 | sched_update_clbstats() | SCHED_CAPACITY_SCALE * clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1)
clbenv->lstats.cpu_capacity | clbenv->lstats | L族cpu的相对计算能力,等于1024 | sched_update_clbstats() | SCHED_CAPACITY_SCALE
clbs->ncpu | clbenv->bstats/clbenv->lstats | L族/B族online的cpu数量 | collect_cluster_stats() | if (cpu_online(cpu)) clbs->ncpu++;
clbs->ntask | clbenv->bstats/clbenv->lstats | L族/B族所有online cpu上各层级se的总和 | collect_cluster_stats() | clbs->ntask += cpu_rq(cpu)->cfs.h_nr_running;
clbs->load_avg | clbenv->bstats/clbenv->lstats | L族/B族online cpu的平均runnable负载,不带weight | collect_cluster_stats() | sum(cpu_rq(cpu)->cfs.avg.loadwop_avg)/clbs->ncpu
clbs->scaled_acap | clbenv->bstats/clbenv->lstats | L族/B族target cpu计算能力的剩余值 | collect_cluster_stats() | hmp_scale_down(clbs->cpu_capacity - cpu_rq(target)->cfs.avg.loadwop_avg)
clbs->scaled_atask | clbenv->bstats/clbenv->lstats | L族/B族target cpu的task space的剩余值 | collect_cluster_stats() | hmp_scale_down(clbs->cpu_capacity - cpu_rq(target)->cfs.h_nr_running * cpu_rq(target)->cfs.avg.loadwop_avg)
clbenv->bstats.threshold | clbenv->bstats | 进程要up迁移到B族的负载门限值 | adj_threshold() | HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask / ((b_nacap + l_nacap) * (b_natask + l_natask) + 1);其中b_nacap、b_natask会乘以一个放大系数(b_cpu_power/l_cpu_power),类似于cpu_capacity的计算
clbenv->lstats.threshold | clbenv->lstats | 进程要down迁移到L族的负载门限值 | adj_threshold() | HMP_MAX_LOAD * l_nacap * l_natask / ((b_nacap + l_nacap) * (b_natask + l_natask) + 1);其中b_nacap、b_natask会乘以一个放大系数(b_cpu_power/l_cpu_power),类似于cpu_capacity的计算
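
以一组假设的cpu_power数值为例(仅为示意):若B族cpu的arch_scale_cpu_capacity() = 1024,L族cpu的arch_scale_cpu_capacity() = 460,则:

clbenv->lstats.cpu_capacity = SCHED_CAPACITY_SCALE = 1024
clbenv->bstats.cpu_capacity = 1024 * 1024 / (460 + 1) ≈ 2274

即B族cpu的相对计算能力约为L族的2.2倍;threshold计算中的b_nacap、b_natask也会按同样的比例(b_cpu_power/l_cpu_power)放大。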

  • 5、根据计算的负载情况,判断进程p是否符合up迁移条件((se_load(se) > B->threshold),等其他条件);

up-migration条件列表(hmp_up_migration()):

条件 | 含义 | 计算方法 | 计算解析
[1] Migration stabilizing | 如果target cpu刚做过up迁移,不适合再进行迁移 | if (!hmp_up_stable(*target_cpu)) check->result = 0; | (((now - hmp_last_up_migration(cpu)) >> 10)

hmp_force_up_migration()的调用入口在SCHED_SOFTIRQ软中断的处理函数run_rebalance_domains()中:
static void run_rebalance_domains(struct softirq_action *h)
{
    struct rq *this_rq = this_rq();
    enum cpu_idle_type idle = this_rq->idle_balance ?
                        CPU_IDLE : CPU_NOT_IDLE;
    int this_cpu = smp_processor_id();

    /* bypass load balance of HMP if EAS consideration */
    /* (1) 在EAS不使能的情况下,尝试进行HMP负载均衡 */
    if ((!energy_aware() && sched_feat(SCHED_HMP)) ||
            (hybrid_support() && cpu_rq(this_cpu)->rd->overutilized))
        hmp_force_up_migration(this_cpu);

    /*
     * If this cpu has a pending nohz_balance_kick, then do the
     * balancing on behalf of the other idle cpus whose ticks are
     * stopped. Do nohz_idle_balance *before* rebalance_domains to
     * give the idle cpus a chance to load balance. Else we may
     * load balance only within the local sched_domain hierarchy
     * and abort nohz_idle_balance altogether if we pull some load.
     */
    nohz_idle_balance(this_rq, idle);
    rebalance_domains(this_rq, idle);
}

|→

static void hmp_force_up_migration(int this_cpu)
{
    int curr_cpu, target_cpu;
    struct sched_entity *se;
    struct rq *target;
    unsigned long flags;
    unsigned int force = 0;
    struct task_struct *p;
    struct clb_env clbenv;
#ifdef CONFIG_SCHED_HMP_PLUS
    struct sched_entity *orig;
#endif

    if (!spin_trylock(&hmp_force_migration))
        return;

#ifdef CONFIG_HMP_TRACER
    for_each_online_cpu(curr_cpu)
        trace_sched_cfs_runnable_load(curr_cpu, cfs_load(curr_cpu), cfs_length(curr_cpu));
#endif

    /* Migrate heavy task from LITTLE to big */
    /* (1.1) 逐个online cpu尝试进行heavy task从little cpu到big cpu的迁移 */
    for_each_online_cpu(curr_cpu) {
        struct hmp_domain *hmp_domain = NULL;
        struct cpumask fast_cpu_mask, slow_cpu_mask;

        cpumask_clear(&fast_cpu_mask);
        cpumask_clear(&slow_cpu_mask);
        /* (1.2) 如果当前cpu不属于速度最快(fastest)的domain,
            则尝试进行up操作
         */
        if (!hmp_cpu_is_fastest(curr_cpu)) {
            /* current cpu is slow_cpu_mask*/
            /* (1.2.1) 当前cpu所在的hmp_domain为slow_cpu_mask */
            hmp_domain = hmp_cpu_domain(curr_cpu);
            cpumask_copy(&slow_cpu_mask, &hmp_domain->possible_cpus);

            /* (1.2.2) 最fastest且online的hmp_domain为fast_cpu_mask */
            while (&hmp_domain->hmp_domains != hmp_domains.next) {
                struct list_head *pos = &hmp_domain->hmp_domains;

                hmp_domain = list_entry(pos->prev, struct hmp_domain, hmp_domains);
                if (!cpumask_empty(&hmp_domain->cpus)) {
                    cpumask_copy(&fast_cpu_mask, &hmp_domain->possible_cpus);
                    break;
                }
            }
        } else {
        /* (1.3) 如果当前cpu属于速度最快(fastest)的domain,
            则直接进行down操作
         */
            hmp_force_down_migration(this_cpu);
            continue;
        }
        if (!hmp_domain || hmp_domain == hmp_cpu_domain(curr_cpu))
            continue;

        if (cpumask_empty(&fast_cpu_mask) || cpumask_empty(&slow_cpu_mask))
            continue;

        force = 0;
        /* (1.4) 取出当前cpu的当前cfs进程 */
        target = cpu_rq(curr_cpu);
        raw_spin_lock_irqsave(&target->lock, flags);
        se = target->cfs.curr;
        if (!se) {
            raw_spin_unlock_irqrestore(&target->lock, flags);
            continue;
        }

        /* Find task entity */
        if (!entity_is_task(se)) {
            struct cfs_rq *cfs_rq;

            cfs_rq = group_cfs_rq(se);
            while (cfs_rq) {
                se = cfs_rq->curr;
                cfs_rq = group_cfs_rq(se);
            }
        }
#ifdef CONFIG_SCHED_HMP_PLUS
        orig = se;
        /* (1.5) 进一步从当前cpu的curr进程和cfs红黑树的前5个进程中,取出负载最重(heaviest)的进程 */
        se = hmp_get_heaviest_task(se, -1);
        if (!se) {
            raw_spin_unlock_irqrestore(&target->lock, flags);
            continue;
        }
        if (!entity_is_task(se))
            p = task_of(orig);
        else
#endif
            p = task_of(se);

        /* (1.6) 选择fast_cpu_mask domain中,负载最少的cpu */
        target_cpu = hmp_select_cpu(HMP_GB, p, &fast_cpu_mask, -1, 0);
        if (target_cpu >= num_possible_cpus()) {
            raw_spin_unlock_irqrestore(&target->lock, flags);
            continue;
        }

        /* Collect cluster information */
        /* (1.7) up操作的对象已经选择好:
            源little cpu:curr_cpu
            目的big cpu:target_cpu
         */
        memset(&clbenv, 0, sizeof(clbenv));
        clbenv.flags |= HMP_GB;
        clbenv.ltarget = curr_cpu;
        clbenv.btarget = target_cpu;
        cpumask_copy(&clbenv.lcpus, &slow_cpu_mask);
        cpumask_copy(&clbenv.bcpus, &fast_cpu_mask);
        /* (1.8) up操作前的数据计算 */
        sched_update_clbstats(&clbenv);

        /* Check migration threshold */
        /* (1.9) 根据计算的数据,判断up操作的可行性 */
        if (!target->active_balance &&
                hmp_up_migration(curr_cpu, &target_cpu, se, &clbenv) &&
                !cpu_park(cpu_of(target))) {
            if (p->state != TASK_DEAD) {
                /* 准备从target rq中迁移进程p到target_cpu,
                    设置rq正在处理负载balance标志active_balance */
                get_task_struct(p);
                target->active_balance = 1; /* force up */
                target->push_cpu = target_cpu;
                target->migrate_task = p;
                force = 1;
                trace_sched_hmp_migrate(p, target->push_cpu, 1);
                hmp_next_up_delay(&p->se, target->push_cpu);
            }
        }

        raw_spin_unlock_irqrestore(&target->lock, flags);
        /* (1.10) 判断结果是可以进行up操作,
            则调用hmp_force_up_cpu_stop()进行实际的up操作 
         */
        if (force) {
            if (stop_one_cpu_dispatch(cpu_of(target),
                        hmp_force_up_cpu_stop,
                        target, &target->active_balance_work)) {
                /* 迁移完成,清除标志 */
                put_task_struct(p); /* out of rq->lock */
                raw_spin_lock_irqsave(&target->lock, flags);
                target->active_balance = 0;
                force = 0;
                raw_spin_unlock_irqrestore(&target->lock, flags);
            }
        } else
        /* (1.11) 否则,再尝试进行down操作 */
            hmp_force_down_migration(this_cpu);
    }

#ifdef CONFIG_HMP_TRACER
    trace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg);
#endif
    spin_unlock(&hmp_force_migration);

}

||→

static const int hmp_max_tasks = 5;
static struct sched_entity *hmp_get_heaviest_task(
        struct sched_entity *se, int target_cpu)
{
    int num_tasks = hmp_max_tasks;
    struct sched_entity *max_se = se;
    unsigned long int max_ratio = se->avg.loadwop_avg;
    const struct cpumask *hmp_target_mask = NULL;
    struct hmp_domain *hmp;

    /* (1.5.1) 如果本cpu是fastest cpu,则不用查找直接返回,
        因为本函数的目的是找little cpu中的heaviest进程
     */
    if (hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq)))
        return max_se;

    /* (1.5.2) 获取比本cpu fater一级cpu的hmp_domain,作为进程亲和力判断的mask */
    hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq));
    hmp_target_mask = &hmp->cpus;
    /* (1.5.3) 传入参数target_cpu = -1,
        所以hmp_target_mask使用的是源cpu hmp_domain的hmp->cpus 
     */
    if (target_cpu >= 0) {
        /* idle_balance gets run on a CPU while
         * it is in the middle of being hotplugged
         * out. Bail early in that case.
         */
        if (!cpumask_test_cpu(target_cpu, hmp_target_mask))
            return NULL;
        hmp_target_mask = cpumask_of(target_cpu);
    }
    /* The currently running task is not on the runqueue */
    /* (1.5.4) 从当前cpu的cfs红黑树中,连续5个进程和curr进程比较,选出heaviest进程 
        比较使用的负载为se->avg.loadwop_avg,不带weight分量
     */
    se = __pick_first_entity(cfs_rq_of(se));
    while (num_tasks && se) {
        if (entity_is_task(se) && se->avg.loadwop_avg > max_ratio &&
                cpumask_intersects(hmp_target_mask, tsk_cpus_allowed(task_of(se)))) {
            max_se = se;
            max_ratio = se->avg.loadwop_avg;
        }
        se = __pick_next_entity(se);
        num_tasks--;
    }
    return max_se;
}

||→

static unsigned int hmp_select_cpu(unsigned int caller, struct task_struct *p,
        struct cpumask *mask, int prev, int up)
{
    int curr = 0;
    int target = num_possible_cpus();
    unsigned long curr_wload = 0;
    unsigned long target_wload = 0;
    struct cpumask srcp;

    /* (1.6.1) 综合fast_cpu_mask、cpu_online_mask、tsk_cpus_allowed(p),
        选取first cpu为target
     */
    cpumask_and(&srcp, cpu_online_mask, mask);
    target = cpumask_any_and(&srcp, tsk_cpus_allowed(p));
    if (target >= num_possible_cpus())
        goto out;

    /*
     * RT class is taken into account because CPU load is multiplied
     * by the total number of CPU runnable tasks that includes RT tasks.
     */
    /*  (1.6.2) 计算target cpu所对应的load,
        target_wload = (rq->cfs.avg.loadwop_avg + rq->cfs.avg.pending_load) * (rq->nr_running + rq->cfs.avg.nr_pending)
        该负载会受RT进程的影响,因为rq->nr_running会统计包括RT进程的数量
     */
    target_wload = hmp_inc(cfs_load(target));
    target_wload += cfs_pending_load(target);
    target_wload *= rq_length(target);
    for_each_cpu(curr, mask) {
        /* Check CPU status and task affinity */
        if (!cpu_online(curr) || !cpumask_test_cpu(curr, tsk_cpus_allowed(p)))
            continue;

        /* For global load balancing, unstable CPU will be bypassed */
        /* (1.6.3) 全局负载均衡(GB)时跳过不稳定的cpu:如果该cpu短时间内刚发生过同方向的迁移(up时查up、down时查down),视为不稳定,不适合马上作为迁移目标 */
        if (hmp_caller_is_gb(caller) && !hmp_cpu_stable(curr, up))
            continue;

        curr_wload = hmp_inc(cfs_load(curr));
        curr_wload += cfs_pending_load(curr);
        curr_wload *= rq_length(curr);
        /* (1.6.4) 选择load最小的作为target cpu */
        if (curr_wload < target_wload) {
            target_wload = curr_wload;
            target = curr;
        /* (1.6.5) 在load同样小的情况下,选择prev cpu */
        } else if (curr_wload == target_wload && curr == prev) {
            target = curr;
        }
    }

out:
    return target;
}
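
为帮助理解上面注释(1.6.2)中的加权负载公式,这里给一个简化的数值例子(数值均为假设,忽略代码中的细节修正项):

#include <stdio.h>

/* 示意:按注释(1.6.2)中的公式为两个候选cpu计算加权负载,数值均为假设
   wload = (loadwop_avg + pending_load) * rq_length */
int main(void)
{
    /* 假设cpu4:负载300、无pending负载、rq上有2个任务 */
    unsigned long wload4 = (300 + 0) * 2;    /* = 600 */
    /* 假设cpu5:负载500、pending负载100、rq上只有1个任务 */
    unsigned long wload5 = (500 + 100) * 1;  /* = 600 */

    /* 两者加权负载相同,此时hmp_select_cpu()按(1.6.5)优先选择prev cpu */
    printf("cpu4: %lu, cpu5: %lu\n", wload4, wload5);
    return 0;
}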

||→

static void sched_update_clbstats(struct clb_env *clbenv)
{
    /* init cpu power and capacity */
    /* (1.8.1) L族和B族的绝对运行能力和相对运算能力,
        .cpu_power = 绝对运算能力
        .cpu_capacity = 相对运算能力
     */
    clbenv->bstats.cpu_power = (int) arch_scale_cpu_capacity(NULL, clbenv->btarget);
    clbenv->lstats.cpu_power = (int) arch_scale_cpu_capacity(NULL, clbenv->ltarget);
    clbenv->lstats.cpu_capacity = SCHED_CAPACITY_SCALE;
    clbenv->bstats.cpu_capacity = SCHED_CAPACITY_SCALE * clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1);

    /* (1.8.2) 分别统计L族和B族的cluster信息(online cpu数、任务数、平均负载、剩余capacity/task空间等) */
    collect_cluster_stats(&clbenv->bstats, &clbenv->bcpus, clbenv->btarget);
    collect_cluster_stats(&clbenv->lstats, &clbenv->lcpus, clbenv->ltarget);

    /* (1.8.3) 根据L族和B族的剩余能力,动态调整up/down迁移的threshold */
    adj_threshold(clbenv);
}

|||→

static void collect_cluster_stats(struct clb_stats *clbs, struct cpumask *cluster_cpus, int target)
{
#define HMP_RESOLUTION_SCALING (4)
#define hmp_scale_down(w) ((w) >> HMP_RESOLUTION_SCALING)

    /* Update cluster informatics */
    int cpu;

    /* (1.8.2.1) 累加本族online cpu的值 */
    for_each_cpu(cpu, cluster_cpus) {
        if (cpu_online(cpu)) {
            clbs->ncpu++;
            clbs->ntask += cpu_rq(cpu)->cfs.h_nr_running;
            clbs->load_avg += cpu_rq(cpu)->cfs.avg.loadwop_avg;
#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
            clbs->nr_normal_prio_task += cfs_nr_normal_prio(cpu);
            clbs->nr_dequeuing_low_prio += cfs_nr_dequeuing_low_prio(cpu);
#endif
        }
    }

    if (!clbs->ncpu || target >= num_possible_cpus() || !cpumask_test_cpu(target, cluster_cpus))
        return;

    /*
     * Calculate available CPU capacity
     * Calculate available task space
     *
     * Why load ratio should be multiplied by the number of task ?
     * The task is the entity of scheduling unit so that we should consider
     * it in scheduler. Only considering task load is not enough.
     * Thus, multiplying the number of tasks can adjust load ratio to a more
     * reasonable value.
     */
    /* (1.8.2.2) 计算本族剩余的cpu计算能力 
        acap = 相对计算能力(clbs->cpu_capacity) - target cpu的负载(rq->cfs.avg.loadwop_avg)
        注意:clbs->cpu_capacity是B族相对L族归一化后的值(L为1024,B大于1024),
        而负载(rq->cfs.avg.loadwop_avg)是相对本cpu自身最大能力的值(B族和L族的最大值都是1024),
        两者并不在同一个量纲上,这里直接相减只是一种近似做法
     */
    clbs->load_avg /= clbs->ncpu;
    clbs->acap = clbs->cpu_capacity - cpu_rq(target)->cfs.avg.loadwop_avg;
    clbs->scaled_acap = hmp_scale_down(clbs->acap);

    /* (1.8.2.3) 计算本族剩余的task空间
        scaled_atask = 相对计算能力(clbs->cpu_capacity) - target cpu的负载(rq->cfs.avg.loadwop_avg) * target cpu上cfs进程数量(rq->cfs.h_nr_running)
        注意:这里的相减同样不在同一量纲上,也是一种近似估算
     */
    clbs->scaled_atask = cpu_rq(target)->cfs.h_nr_running * cpu_rq(target)->cfs.avg.loadwop_avg;
    clbs->scaled_atask = clbs->cpu_capacity - clbs->scaled_atask;
    clbs->scaled_atask = hmp_scale_down(clbs->scaled_atask);

    mt_sched_printf(sched_log, "[%s] cpu/cluster:%d/%02lx load/len:%lu/%u stats:%d,%d,%d,%d,%d,%d,%d,%d\n",
            __func__, target, *cpumask_bits(cluster_cpus),
            cpu_rq(target)->cfs.avg.loadwop_avg,
            cpu_rq(target)->cfs.h_nr_running,
            clbs->ncpu, clbs->ntask, clbs->load_avg, clbs->cpu_capacity,
            clbs->acap, clbs->scaled_acap, clbs->scaled_atask, clbs->threshold);
}

|||→

/*
 * Task Dynamic Migration Threshold Adjustment.
 *
 * If the workload between clusters is not balanced, adjust migration
 * threshold in an attempt to move task precisely.
 *
 * Diff. = Max Threshold - Min Threshold
 *
 * Dynamic UP-Threshold =
 *                               B_nacap               B_natask
 * Max Threshold - Diff. x  -----------------  x  -------------------
 *                          B_nacap + L_nacap     B_natask + L_natask
 *
 *
 * Dynamic Down-Threshold =
 *                               L_nacap               L_natask
 * Min Threshold + Diff. x  -----------------  x  -------------------
 *                          B_nacap + L_nacap     B_natask + L_natask
 */
static void adj_threshold(struct clb_env *clbenv)
{
#define POSITIVE(x) ((int)(x) < 0 ? 0 : (x))

    unsigned long b_cap = 0, l_cap = 0;
    int b_nacap, l_nacap, b_natask, l_natask;

    b_cap = clbenv->bstats.cpu_power;
    l_cap = clbenv->lstats.cpu_power;

    /* (1.8.3.1) 把B族剩余cpu计算能力和task空间,转换成L族的相对值 */
    b_nacap = POSITIVE(clbenv->bstats.scaled_acap *
            clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1));
    b_natask = POSITIVE(clbenv->bstats.scaled_atask *
            clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1));

    /* L族的值维持不变 */      
    l_nacap = POSITIVE(clbenv->lstats.scaled_acap);
    l_natask = POSITIVE(clbenv->lstats.scaled_atask);

    /* (1.8.3.2) 计算up的threshold, 
        up-threshold = HMP_MAX_LOAD - HMP_MAX_LOAD * B族剩余能力占比
        (B族越空闲,up-threshold越低,越容易触发up迁移)
     */
    clbenv->bstats.threshold = HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask /
        ((b_nacap + l_nacap) * (b_natask + l_natask) + 1);

    /* (1.8.3.3) 计算down的threshold, 
        down-threshold = HMP_MAX_LOAD * L族剩余能力占比
        (L族越空闲,down-threshold越高,越容易触发down迁移)
     */
    clbenv->lstats.threshold = HMP_MAX_LOAD * l_nacap * l_natask /
        ((b_nacap + l_nacap) * (b_natask + l_natask) + 1);

    mt_sched_printf(sched_log, "[%s]\tup/dl:%4d/%4d L(%d:%4lu) b(%d:%4lu)\n", __func__,
            clbenv->bstats.threshold, clbenv->lstats.threshold,
            clbenv->ltarget, l_cap, clbenv->btarget, b_cap);
}
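
为了更直观地理解上面的动态threshold计算,下面给一个简化的数值演算(纯示意:HMP_MAX_LOAD取1023,b_nacap等剩余值均为假设,实际以内核代码中的定义和运行时统计为准):

#include <stdio.h>

/* 假设值:仅用于演示adj_threshold()中的公式,并非真实运行数据 */
#define HMP_MAX_LOAD 1023

int main(void)
{
    /* 假设B族换算到L族坐标后的剩余capacity/task空间,以及L族自身的剩余值 */
    int b_nacap = 100, b_natask = 120;
    int l_nacap = 60,  l_natask = 40;

    /* up-threshold = HMP_MAX_LOAD - HMP_MAX_LOAD * B族剩余占比 */
    int up_threshold = HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask /
        ((b_nacap + l_nacap) * (b_natask + l_natask) + 1);

    /* down-threshold = HMP_MAX_LOAD * L族剩余占比 */
    int down_threshold = HMP_MAX_LOAD * l_nacap * l_natask /
        ((b_nacap + l_nacap) * (b_natask + l_natask) + 1);

    printf("up-threshold = %d, down-threshold = %d\n",
           up_threshold, down_threshold);
    return 0;
}

按这组假设值算出up-threshold=544、down-threshold=95:B族越空闲,up-threshold越低,任务越容易被迁上big;L族越空闲,down-threshold越高,任务越容易被迁回little。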

||→

/*
 * Check whether this task should be migrated to big
 * Briefly summarize the flow as below;
 * 1) Migration stabilizing
 * 2) Filter low-priority task
 * 2.5) Keep all cpu busy
 * 3) Check CPU capacity
 * 4) Check dynamic migration threshold
 */
static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se,
        struct clb_env *clbenv)
{
    struct task_struct *p = task_of(se);
    struct clb_stats *L, *B;
    struct mcheck *check;
    int curr_cpu = cpu;
#ifdef CONFIG_HMP_TRACER
    unsigned int caller = clbenv->flags;
#endif

    L = &clbenv->lstats;
    B = &clbenv->bstats;
    check = &clbenv->mcheck;

    check->status = clbenv->flags;
    check->status |= HMP_TASK_UP_MIGRATION;
    check->result = 0;

    /*
     * No migration is needed if
     * 1) There is only one cluster
     * 2) Task is already in big cluster
     * 3) It violates task affinity
     */
    if (!L->ncpu || !B->ncpu
            || cpumask_test_cpu(curr_cpu, &clbenv->bcpus)
            || !cpumask_intersects(&clbenv->bcpus, tsk_cpus_allowed(p)))
        goto out;

    /* (1.9.1) 如果目标cpu短时间内已经执行了up操作,则为up unstable状态,退出 */
    /*
     * [1] Migration stabilizing
     * Let the task load settle before doing another up migration.
     * It can prevent a bunch of tasks from migrating to a unstable CPU.
     */
    if (!hmp_up_stable(*target_cpu))
        goto out;

    /* (1.9.2) 过滤掉优先级较低的进程,不进行迁移操作。具体有3个条件:
        (task_low_priority(p->prio) && \    // nice值大于5
        (B->ntask >= B->ncpu || 0 != L->nr_normal_prio_task) && \  // B组进程数大于cpu数 || L组正常优先级进程数不为0
        (p->se.avg.loadwop_avg < 800))  // 平均负载小于800
     */
    /* [2] Filter low-priority task */
#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
    if (hmp_low_prio_task_up_rejected(p, B, L)) {
        check->status |= HMP_LOW_PRIORITY_FILTER;
        goto trace;
    }
#endif

    /* (1.9.3) 如果B组的target cpu为idle,不用过多判断,直接准备迁移 */
    /* [2.5]if big is idle, just go to big */
    if (rq_length(*target_cpu) == 0) {
        check->status |= HMP_BIG_IDLE;
        check->status |= HMP_MIGRATION_APPROVED;
        check->result = 1;
        goto trace;
    }

    /* (1.9.4) 判断B族target cpu的capacity是否足够,
        (se_load(se) + cfs_load(cpu)) < (B->cpu_capacity - (B->cpu_capacity >> 2))
        // target cpu负载 + 要迁移的se负载 是否小于 3/4 B族cpu的capacity
     */
    /*
     * [3] Check CPU capacity
     * Forbid up-migration if big CPU can't handle this task
     */
    if (!hmp_task_fast_cpu_afford(B, se, *target_cpu)) {
        check->status |= HMP_BIG_CAPACITY_INSUFFICIENT;
        goto trace;
    }

    /* (1.9.5) 判断se的负载是否已经大于up-threshold(B->threshold) */
    /*
     * [4] Check dynamic migration threshold
     * Migrate task from LITTLE to big if load is greater than up-threshold
     */
    if (se_load(se) > B->threshold) {
        check->status |= HMP_MIGRATION_APPROVED;
        check->result = 1;
    }

trace:
#ifdef CONFIG_HMP_TRACER
    if (check->result && hmp_caller_is_gb(caller))
        hmp_stats.nr_force_up++;
    trace_sched_hmp_stats(&hmp_stats);
    trace_sched_dynamic_threshold(task_of(se), B->threshold, check->status,
            curr_cpu, *target_cpu, se_load(se), B, L);
    trace_sched_dynamic_threshold_draw(B->threshold, L->threshold);
#endif
out:
    return check->result;
}

||→

static int hmp_force_up_cpu_stop(void *data)
{
    /* (1.10.1) 执行进程迁移 */
    return hmp_active_task_migration_cpu_stop(data);
}

|||→

static int hmp_active_task_migration_cpu_stop(void *data)
{
    struct rq *busiest_rq = data;
    struct task_struct *p = NULL;
    int busiest_cpu = cpu_of(busiest_rq);
    int target_cpu = busiest_rq->push_cpu;
    struct rq *target_rq = cpu_rq(target_cpu);
    struct sched_domain *sd;

    raw_spin_lock_irq(&busiest_rq->lock);
    p = busiest_rq->migrate_task;
    /* make sure the requested cpu hasn't gone down in the meantime */
    if (unlikely(busiest_cpu != smp_processor_id() ||
                !busiest_rq->active_balance)) {
        goto out_unlock;
    }
    /* Is there any task to move? */
    if (busiest_rq->nr_running <= 1)
        goto out_unlock;
    /* Are both target and busiest cpu online */
    if (!cpu_online(busiest_cpu) || !cpu_online(target_cpu))
        goto out_unlock;
    /* Task has migrated meanwhile, abort forced migration */
    if ((!p) || (task_rq(p) != busiest_rq))
        goto out_unlock;
    /*
     * This condition is "impossible", if it occurs
     * we need to fix it. Originally reported by
     * Bjorn Helgaas on a 128-cpu setup.
     */
    WARN_ON(busiest_rq == target_rq);

    /* (1.10.1.1) 将源、目的rq lock住 */
    /* move a task from busiest_rq to target_rq */
    double_lock_balance(busiest_rq, target_rq);

    /* (1.10.1.2) 搜索target cpu所在的某一层次的sd,其sd->span[]即包含源cpu又包含目的cpu */
    /* Search for an sd spanning us and the target CPU. */
    rcu_read_lock();
    for_each_domain(target_cpu, sd) {
        if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
            break;
    }

    /* (1.10.1.3) 构造数据,在同一sd下进行迁移 */
    if (likely(sd)) {
        struct lb_env env = {
            .sd             = sd,
            .dst_cpu        = target_cpu,
            .dst_rq         = target_rq,
            .src_cpu        = busiest_rq->cpu,
            .src_rq         = busiest_rq,
            .idle           = CPU_IDLE,
        };

        schedstat_inc(sd, alb_count);

        /* (1.10.1.4) 任务迁移 */
        if (move_specific_task(&env, p))
            schedstat_inc(sd, alb_pushed);
        else
            schedstat_inc(sd, alb_failed);
    }
    rcu_read_unlock();
    double_unlock_balance(busiest_rq, target_rq);
out_unlock:
    busiest_rq->active_balance = 0;
    raw_spin_unlock_irq(&busiest_rq->lock);

    put_task_struct(p);
    return 0;
}

||||→

static int move_specific_task(struct lb_env *env, struct task_struct *pm)
{
    struct task_struct *p, *n;

    /* (1.10.1.4.1) 从源rq->cfs_tasks逐个取出任务,直到查到pm */
    list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {

        /* (1.10.1.4.2) task group的throttled判断 */
        if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
                    env->dst_cpu))
            continue;

        /* (1.10.1.4.3) 判断任务能否被迁移 */
        if (!hmp_can_migrate_task(p, env))
            continue;
        /* Check if we found the right task */
        if (p != pm)
            continue;

        /* (1.10.1.4.4) 迁移 */
        move_task(p, env);
        /*
         * Right now, this is only the third place move_task()
         * is called, so we can safely collect move_task()
         * stats here rather than inside move_task().
         */
        schedstat_inc(env->sd, lb_gained[env->idle]);
        return 1;
    }
    return 0;
}

|||||→

static void move_task(struct task_struct *p, struct lb_env *env)
{
    deactivate_task(env->src_rq, p, 0);
    set_task_cpu(p, env->dst_cpu);
    activate_task(env->dst_rq, p, 0);
    check_preempt_curr(env->dst_rq, p, 0);
}

4.2.3、hmp_force_down_migration()

hmp_force_down_migration()的操作主要有以下几个步骤:

  • 1、根据当前cpu,选择fast_cpu_mask、slow_cpu_mask;

hmp_force_down_migration尝试把fast cpu上的light进程迁移到slow cpu上,关于fast、slow的选择有以下几种场景:

(图:fast_cpu_mask/slow_cpu_mask 选择的几种场景示意)

  • 2、选择当前cpu的lightest进程作为迁移进程p;并不会遍历cpu上所有进程去选出lightest进程,只会查询curr进程和cfs_rq中5个进程中的lightest;

  • 3、根据slow_cpu_mask,选择一个负载最少的target cpu;


  • 4、根据源cpu(curr_cpu)、目的cpu(target_cpu),计算负载;

重要的数据计算方法和hmp_force_up_migration()一致,参考上一节;

  • 5、根据计算的负载情况,判断进程p是否符合down迁移条件((L->threshold >= se_load(se)),等其他条件);

down-migration条件列表(hmp_down_migration()):

条件 | 含义 | 计算方法 | 计算解析
[1] Migration stabilizing | 如果target cpu刚做过down迁移,不适合再进行迁移 | if (!hmp_down_stable(*target_cpu)) check->result = 0; | (((now - hmp_last_down_migration(cpu)) >> 10)
[1.5] Keep all cpu busy | big busy且little idle(HMP_SELECT_RQ路径)时,只要不属于“curr非heavy而p是heavy”的情况,直接准许迁移 | if (rq_length(*target_cpu) == 0 && caller == HMP_SELECT_RQ && rq_length(curr_cpu) > 0) check->result = 1; | 见下面代码注释(11.2)
[2] Filter low-priority task | 低优先级且负载较小的进程直接准许down迁移 | if (hmp_low_prio_task_down_allowed(p, B, L)) check->result = 1; | 见下面代码注释(11.3)
[3] Check CPU capacity | big cpu不够忙、或者little cpu容量不足时禁止迁移 | if (!hmp_fast_cpu_oversubscribed(caller, B, se, curr_cpu) || !hmp_task_slow_cpu_afford(L, se)) check->result = 0; | 见下面代码注释(11.4)(11.5)
[4] Check dynamic migration threshold | 进程负载小于等于down-threshold(L->threshold)时准许迁移 | if (L->threshold >= se_load(se)) check->result = 1; | 见下面代码注释(11.6)

static void hmp_force_down_migration(int this_cpu)
{
    int target_cpu;
    struct sched_entity *se;
    struct rq *target;
    unsigned long flags;
    unsigned int force = 0;
    struct task_struct *p;
    struct clb_env clbenv;
#ifdef CONFIG_SCHED_HMP_PLUS
    struct sched_entity *orig;
    int B_cpu;
#endif
    struct hmp_domain *hmp_domain = NULL;
    struct cpumask fast_cpu_mask, slow_cpu_mask;

    cpumask_clear(&fast_cpu_mask);
    cpumask_clear(&slow_cpu_mask);

    /* Migrate light task from big to LITTLE */
    /* (1) 如果当前cpu不是最慢的cpu(slowest),则尝试down操作 */
    if (!hmp_cpu_is_slowest(this_cpu)) {

        /* (2) 当前cpu所在的hmp_domain为fast_cpu_mask */
        hmp_domain = hmp_cpu_domain(this_cpu);
        cpumask_copy(&fast_cpu_mask, &hmp_domain->possible_cpus);

        /* (3) 沿hmp_domains链表往更慢的方向逐级查找,找到第一个有online cpu的hmp_domain作为slow_cpu_mask */
        while (!list_is_last(&hmp_domain->hmp_domains, &hmp_domains)) {
            struct list_head *pos = &hmp_domain->hmp_domains;

            hmp_domain = list_entry(pos->next, struct hmp_domain, hmp_domains);

            if (!cpumask_empty(&hmp_domain->cpus)) {
                cpumask_copy(&slow_cpu_mask, &hmp_domain->possible_cpus);
                break;
            }
        }
    }

    if (!hmp_domain || hmp_domain == hmp_cpu_domain(this_cpu))
        return;

    /* (4) 找不到可操作的fast_cpu_mask、slow_cpu_mask直接返回 */
    if (cpumask_empty(&fast_cpu_mask) || cpumask_empty(&slow_cpu_mask))
        return;

    /* (5) 源cpu = this_cpu,源rq = target */
    force = 0;
    target = cpu_rq(this_cpu);
    raw_spin_lock_irqsave(&target->lock, flags);
    se = target->cfs.curr;
    if (!se) {
        raw_spin_unlock_irqrestore(&target->lock, flags);
        return;
    }

    /* (6) 首先尝试使用curr进程作为down迁移的进程 */
    /* Find task entity */
    if (!entity_is_task(se)) {
        struct cfs_rq *cfs_rq;

        cfs_rq = group_cfs_rq(se);
        while (cfs_rq) {
            se = cfs_rq->curr;
            cfs_rq = group_cfs_rq(se);
        }
    }
#ifdef CONFIG_SCHED_HMP_PLUS
    /* (7) 在curr进程开始的5个进程中,挑负载最轻的进程作为down迁移进程 */
    orig = se;
    se = hmp_get_lightest_task(orig, 1);
    if (!entity_is_task(se))
        p = task_of(orig);
    else
#endif
        p = task_of(se);
#ifdef CONFIG_SCHED_HMP_PLUS
    /* (8) 找出B族中负载最轻的cpu,如果其为idle状态,则放弃down操作 
        因为load_balance中的idle_balance会重新把任务迁移回idle的big cpu,避免相互的乒乓操作
     */
    /* Don't offload to little if there is one idle big, let load balance to do it's work */
    /* Also, to prevent idle_balance from leading to potential ping-pong */
    B_cpu = hmp_select_cpu(HMP_GB, p, &fast_cpu_mask, this_cpu, 0);
    if (B_cpu < nr_cpu_ids && !rq_length(B_cpu)) {
        raw_spin_unlock_irqrestore(&target->lock, flags);
        return;
    }
#endif

    /* (9) 找出L族中负载最轻的cpu作为target_cpu */
    target_cpu = hmp_select_cpu(HMP_GB, p, &slow_cpu_mask, -1, 1);
    if (target_cpu >= num_possible_cpus()) {
        raw_spin_unlock_irqrestore(&target->lock, flags);
        return;
    }

    /* (10) 迁移前对B族、L族负载和threshold的计算 */
    /* Collect cluster information */
    memset(&clbenv, 0, sizeof(clbenv));
    clbenv.flags |= HMP_GB;
    clbenv.btarget = this_cpu;
    clbenv.ltarget = target_cpu;
    cpumask_copy(&clbenv.lcpus, &slow_cpu_mask);
    cpumask_copy(&clbenv.bcpus, &fast_cpu_mask);
    sched_update_clbstats(&clbenv);

#ifdef CONFIG_SCHED_HMP_PLUS
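    /* (10.5) 当前big cpu上cfs可运行任务不足2个,没有多余的任务可供down迁移,直接放弃 */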
    if (cpu_rq(this_cpu)->cfs.h_nr_running < 2) {
        raw_spin_unlock_irqrestore(&target->lock, flags);
        return;
    }
#endif

    /* (11) 检查down操作的迁移条件是否成立,hmp_down_migration() */
    /* Check migration threshold */
    if (!target->active_balance &&
            hmp_down_migration(this_cpu, &target_cpu, se, &clbenv) &&
            !cpu_park(cpu_of(target))) {
        if (p->state != TASK_DEAD) {
            get_task_struct(p);
            target->active_balance = 1; /* force down */
            target->push_cpu = target_cpu;
            target->migrate_task = p;
            force = 1;
            trace_sched_hmp_migrate(p, target->push_cpu, 1);
            hmp_next_down_delay(&p->se, target->push_cpu);
        }
    }
    raw_spin_unlock_irqrestore(&target->lock, flags);

    /* (12) 条件成立进行实际的down迁移操作hmp_force_down_cpu_stop() */
    if (force) {
        if (stop_one_cpu_dispatch(cpu_of(target),
                    hmp_force_down_cpu_stop,
                    target, &target->active_balance_work)) {
            put_task_struct(p); /* out of rq->lock */
            raw_spin_lock_irqsave(&target->lock, flags);
            target->active_balance = 0;
            force = 0;
            raw_spin_unlock_irqrestore(&target->lock, flags);
        }
    }

}

|→

static struct sched_entity *hmp_get_lightest_task(
        struct sched_entity *se, int migrate_down)
{
    int num_tasks = hmp_max_tasks;
    struct sched_entity *min_se = se;
    unsigned long int min_ratio = se->avg.loadwop_avg;
    const struct cpumask *hmp_target_mask = NULL;

    if (migrate_down) {
        struct hmp_domain *hmp;

        /* (7.1) 如果cpu是最慢cpu(slowest)则直接退出,
            因为本函数的目的是找出faster cpu中lightest进程
         */
        if (hmp_cpu_is_slowest(cpu_of(se->cfs_rq->rq)))
            return min_se;

        /* (7.2) 将更slow一级的hmp_domain作为进程cpu亲和力的mask */
        hmp = hmp_slower_domain(cpu_of(se->cfs_rq->rq));
        hmp_target_mask = &hmp->cpus;
    }
    /* The currently running task is not on the runqueue */
    se = __pick_first_entity(cfs_rq_of(se));

    /* (7.3) 从当前cpu的cfs红黑树中,连续5个进程和curr进程比较,选出lightest进程 
        比较使用的负载为se->avg.loadwop_avg,不带weight分量
     */
    while (num_tasks && se) {
        if (entity_is_task(se) &&
                (se->avg.loadwop_avg < min_ratio && hmp_target_mask &&
                 cpumask_intersects(hmp_target_mask, tsk_cpus_allowed(task_of(se))))) {
            min_se = se;
            min_ratio = se->avg.loadwop_avg;
        }
        se = __pick_next_entity(se);
        num_tasks--;
    }
    return min_se;
}

|→

/*
 * Check whether this task should be migrated to LITTLE
 * Briefly summarize the flow as below;
 * 1) Migration stabilizing
 * 1.5) Keep all cpu busy
 * 2) Filter low-priority task
 * 3) Check CPU capacity
 * 4) Check dynamic migration threshold
 */
static unsigned int hmp_down_migration(int cpu, int *target_cpu, struct sched_entity *se,
        struct clb_env *clbenv)
{
    struct task_struct *p = task_of(se);
    struct clb_stats *L, *B;
    struct mcheck *check;
    int curr_cpu = cpu;
    unsigned int caller = clbenv->flags;

    L = &clbenv->lstats;
    B = &clbenv->bstats;
    check = &clbenv->mcheck;

    check->status = caller;
    check->status |= HMP_TASK_DOWN_MIGRATION;
    check->result = 0;

    /*
     * No migration is needed if
     * 1) There is only one cluster
     * 2) Task is already in LITTLE cluster
     * 3) It violates task affinity
     */
    if (!L->ncpu || !B->ncpu
            || cpumask_test_cpu(curr_cpu, &clbenv->lcpus)
            || !cpumask_intersects(&clbenv->lcpus, tsk_cpus_allowed(p)))
        goto out;

    /* (11.1) 目的little cpu target_cpu近期如果有做过down操作,不适合再做down迁移 */
    /*
     * [1] Migration stabilizing
     * Let the task load settle before doing another down migration.
     * It can prevent a bunch of tasks from migrating to a unstable CPU.
     */
    if (!hmp_down_stable(*target_cpu))
        goto out;

    /* (11.2) 如果big busy,little idle则不用进行threshold判断 */
    /* [1.5]if big is busy and little is idle, just go to little */
    if (rq_length(*target_cpu) == 0 && caller == HMP_SELECT_RQ && rq_length(curr_cpu) > 0) {
        struct rq *curr_rq = cpu_rq(curr_cpu);

        /* (11.2.1) 只有当“big cpu上的curr进程不是heavy进程、而待迁移进程p是heavy进程”时才不准许down迁移
            (heavy进程应该留在big上);其余情况直接准许down迁移。
            heavy进程的判断标准为:负载>=650
         */
        /* if current big core is not heavy task and wake up task is heavy task no go to little */
        if (!(!is_heavy_task(curr_rq->curr) && is_heavy_task(p))) {
            check->status |= HMP_BIG_BUSY_LITTLE_IDLE;
            check->status |= HMP_MIGRATION_APPROVED;
            check->result = 1;
            goto trace;
        }
    }

    /* (11.3) 低优先级进程,如果满足以下条件,准许迁移:
        (task_low_priority(p->prio) && !B->nr_dequeuing_low_prio && \   // nice值大于5
         B->ntask >= B->ncpu && 0 != L->nr_normal_prio_task && \        // B和L都不是特别空闲
         (p->se.avg.loadwop_avg < 800))                                 // L上准备迁移的进程负载小于800
     */
    /* [2] Filter low-priority task */
#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
    if (hmp_low_prio_task_down_allowed(p, B, L)) {
        cfs_nr_dequeuing_low_prio(curr_cpu)++;
        check->status |= HMP_LOW_PRIORITY_FILTER;
        check->status |= HMP_MIGRATION_APPROVED;
        check->result = 1;
        goto trace;
    }
#endif

    /*
     * [3] Check CPU capacity
     * Forbid down-migration if either of the following conditions is true
     * 1) big cpu is not oversubscribed (if big CPU seems to have spare
     *    cycles, do not force this task to run on LITTLE CPU, but
     *    keep it staying in its previous cluster instead)
     * 2) LITTLE cpu doesn't have available capacity for this new task
     */
    /* (11.4) 如果big cpu有足够的空闲周期,不需要强制把light任务迁移到little cpu上 
        cfs_load(cpu) < (B->cpu_capacity - (B->cpu_capacity >> 2))
     */
    if (!hmp_fast_cpu_oversubscribed(caller, B, se, curr_cpu)) {
        check->status |= HMP_BIG_NOT_OVERSUBSCRIBED;
        goto trace;
    }

    /* (11.5) 判断L族cpu的capacity是否足够容纳需要迁移的进程,
        (L->acap > 0 && L->acap >= se_load(se))
     */
    if (!hmp_task_slow_cpu_afford(L, se)) {
        check->status |= HMP_LITTLE_CAPACITY_INSUFFICIENT;
        goto trace;
    }


    /* (11.6) 判断se的负载是否已经小于down-threshold(L->threshold) */
    /*
     * [4] Check dynamic migration threshold
     * Migrate task from big to LITTLE if load ratio is less than
     * or equal to down-threshold
     */
    if (L->threshold >= se_load(se)) {
        check->status |= HMP_MIGRATION_APPROVED;
        check->result = 1;
    }

trace:
#ifdef CONFIG_HMP_TRACER
    if (check->result && hmp_caller_is_gb(caller))
        hmp_stats.nr_force_down++;
    trace_sched_hmp_stats(&hmp_stats);
    trace_sched_dynamic_threshold(task_of(se), L->threshold, check->status,
            curr_cpu, *target_cpu, se_load(se), B, L);
    trace_sched_dynamic_threshold_draw(B->threshold, L->threshold);
#endif
out:
    return check->result;
}

4.2.4、hmp_select_task_rq_fair()

4.3、cpu freq调整

前面讲的负载均衡的手段都是负载迁移,把负载迁移到最idle或者最省power的cpu上。另外一种方式就是调整cpu的freq,从而改变cpu的curr_capacity,来满足性能和功耗的需求。

cpu的频率调整是基于3个层次的:cpufreq governor、cpufreq core、cpufreq driver。

  • 1、cpufreq governor决定cpu调频的算法,计算负载、根据负载的变化来动态调整频率;
  • 2、cpufreq core对通用层进行了一些封装,比如cpufreq_policy的封装;
  • 3、cpufreq driver是底层操作的实现,比如freq_table的初始化、cpu target频率的配置;

(图:cpufreq governor、cpufreq core、cpufreq driver的分层框图;下面再给出一个最简的governor骨架作为示意)
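
为了更直观地看清这三层之间的调用关系,下面给出一个极简的governor骨架作为示意(demo_governor、cpufreq_gov_demo等名字均为假设,并非内核中真实存在的governor;回调接口形式对应本文内核版本的.governor(policy, event)):

#include <linux/cpufreq.h>
#include <linux/module.h>

/* 示意:一个几乎什么都不做的governor,展示governor如何挂入cpufreq core,
   以及调频请求如何通过core的__cpufreq_driver_target()转交给cpufreq driver */
static int demo_governor(struct cpufreq_policy *policy, unsigned int event)
{
    switch (event) {
    case CPUFREQ_GOV_POLICY_INIT:   /* 分配/初始化governor私有参数 */
    case CPUFREQ_GOV_POLICY_EXIT:   /* 释放私有参数 */
    case CPUFREQ_GOV_STOP:          /* 停止采样定时器等 */
        break;
    case CPUFREQ_GOV_START:
    case CPUFREQ_GOV_LIMITS:
        /* 调频决策最终都通过__cpufreq_driver_target()下发给driver->target() */
        return __cpufreq_driver_target(policy, policy->min, CPUFREQ_RELATION_L);
    }
    return 0;
}

static struct cpufreq_governor cpufreq_gov_demo = {
    .name = "demo",
    .governor = demo_governor,
    .owner = THIS_MODULE,
};

static int __init demo_gov_init(void)
{
    return cpufreq_register_governor(&cpufreq_gov_demo);
}
module_init(demo_gov_init);
MODULE_LICENSE("GPL");

可以看到governor只负责“何时调、调到多少”的决策,真正的频率切换统一由core的__cpufreq_driver_target()转交给cpufreq driver的target()回调完成。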

如果是MTK平台,cpufreq driver除了接受governor的频率调整还需要接受ppm的频率调整,它的框图大概如下:

(图:MTK平台上cpufreq driver同时接受governor和ppm调频请求的框图)

4.3.1、cpufreq core & cpufreq driver

cpufreq core层次最核心的就是每个cpu有一个自己的cpufreq_policy policy,放在per_cpu(cpufreq_cpu_data, cpu)变量中。实际上cpufreq_policy是一个cluster对应一个的,因为在现有的架构中,同一个cluster cpu都是同一个频率,所以同cluster中所有cpu的per_cpu(cpufreq_cpu_data, cpu)都指向同一个cpufreq_policy。

(图:同一cluster中各cpu的per_cpu(cpufreq_cpu_data, cpu)指向同一个cpufreq_policy)
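
可以用一小段示意代码验证这种共享关系(dump_policy_sharing()是假设的示例函数,假定在某个内核模块的上下文中调用):

#include <linux/kernel.h>
#include <linux/cpu.h>
#include <linux/cpufreq.h>

/* 示意:遍历online cpu,打印每个cpu对应的cpufreq_policy指针。
   同一cluster内的cpu会打印出同一个policy地址,related_cpus也相同 */
static void dump_policy_sharing(void)
{
    int cpu;

    for_each_online_cpu(cpu) {
        struct cpufreq_policy *policy = cpufreq_cpu_get(cpu);

        if (!policy)
            continue;
        pr_info("cpu%d: policy=%p related_cpus=%*pbl\n",
            cpu, policy, cpumask_pr_args(policy->related_cpus));
        cpufreq_cpu_put(policy);
    }
}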

4.3.1.1、cpufreq_policy policy初始化
struct cpufreq_policy {
    /* CPUs sharing clock, require sw coordination */
    cpumask_var_t       cpus;   /* Online CPUs only */
    cpumask_var_t       related_cpus; /* Online + Offline CPUs */
    cpumask_var_t       real_cpus; /* Related and present */

    unsigned int        shared_type; /* ACPI: ANY or ALL affected CPUs
                        should set cpufreq */
    unsigned int        cpu;    /* cpu managing this policy, must be online */

    struct clk      *clk;
    struct cpufreq_cpuinfo  cpuinfo;/* see above */

    unsigned int        min;    /* in kHz */
    unsigned int        max;    /* in kHz */
    unsigned int        cur;    /* in kHz, only needed if cpufreq
                     * governors are used */
    unsigned int        restore_freq; /* = policy->cur before transition */
    unsigned int        suspend_freq; /* freq to set during suspend */

    unsigned int        policy; /* see above */
    unsigned int        last_policy; /* policy before unplug */
    struct cpufreq_governor *governor; /* see below */
    void            *governor_data;
    bool            governor_enabled; /* governor start/stop flag */
    char            last_governor[CPUFREQ_NAME_LEN]; /* last governor used */

    struct work_struct  update; /* if update_policy() needs to be
                     * called, but you're in IRQ context */

    struct cpufreq_user_policy user_policy;
    struct cpufreq_frequency_table  *freq_table;

    struct list_head        policy_list;
    struct kobject      kobj;
    struct completion   kobj_unregister;

    /*
     * The rules for this semaphore:
     * - Any routine that wants to read from the policy structure will
     *   do a down_read on this semaphore.
     * - Any routine that will write to the policy structure and/or may take away
     *   the policy altogether (eg. CPU hotplug), will hold this lock in write
     *   mode before doing so.
     *
     * Additional rules:
     * - Lock should not be held across
     *     __cpufreq_governor(data, CPUFREQ_GOV_POLICY_EXIT);
     */
    struct rw_semaphore rwsem;

    /* Synchronization for frequency transitions */
    bool            transition_ongoing; /* Tracks transition status */
    spinlock_t      transition_lock;
    wait_queue_head_t   transition_wait;
    struct task_struct  *transition_task; /* Task which is doing the transition */

    /* cpufreq-stats */
    struct cpufreq_stats    *stats;

    /* For cpufreq driver's internal use */
    void            *driver_data;
};

在系统初始化的时候初始化online cpu的cpufreq_policy,cpu在hotplug online的时候也会重新初始化cpufreq_policy。

  • 1、在mtk的cpufreq_driver驱动初始化函数_mt_cpufreq_pdrv_probe()中注册了_mt_cpufreq_driver:
static int _mt_cpufreq_pdrv_probe(struct platform_device *pdev)
{

    /* 注册cpufreq_driver */
    cpufreq_register_driver(&_mt_cpufreq_driver);

    /* 注册ppm的回调 */
    mt_ppm_register_client(PPM_CLIENT_DVFS, &ppm_limit_callback);

}

static struct cpufreq_driver _mt_cpufreq_driver = {
    .flags = CPUFREQ_ASYNC_NOTIFICATION,
    .verify = _mt_cpufreq_verify,
    .target = _mt_cpufreq_target,
    .init = _mt_cpufreq_init,
    .exit = _mt_cpufreq_exit,
    .get = _mt_cpufreq_get,
    .name = "mt-cpufreq",
    .attr = _mt_cpufreq_attr,
};
  • 2、在驱动注册cpufreq_register_driver()过程中会初始化online cpu的cpufreq_policy:
_mt_cpufreq_pdrv_probe() -> cpufreq_register_driver() -> subsys_interface_register() -> cpufreq_add_dev() -> cpufreq_online()

↓

static int cpufreq_online(unsigned int cpu)
{
    struct cpufreq_policy *policy;
    bool new_policy;
    unsigned long flags;
    unsigned int j;
    int ret;

    pr_debug("%s: bringing CPU%u online\n", __func__, cpu);

    /* (1) 检查per_cpu(cpufreq_cpu_data, cpu)中的cpufreq_policy, 
        如果为NULL,重新分配空间
     */
    /* Check if this CPU already has a policy to manage it */
    policy = per_cpu(cpufreq_cpu_data, cpu);
    if (policy) {
        WARN_ON(!cpumask_test_cpu(cpu, policy->related_cpus));
        if (!policy_is_inactive(policy))
            return cpufreq_add_policy_cpu(policy, cpu);

        /* This is the only online CPU for the policy.  Start over. */
        new_policy = false;
        down_write(&policy->rwsem);
        policy->cpu = cpu;
        policy->governor = NULL;
        up_write(&policy->rwsem);
    } else {
        new_policy = true;
        policy = cpufreq_policy_alloc(cpu);
        if (!policy)
            return -ENOMEM;
    }

    cpumask_copy(policy->cpus, cpumask_of(cpu));

    /* (2) 调用cpufreq_driver的初始化函数来初始化cpufreq_policy, 
        这步比较重要,初始化了以下的数据(见下面的_mt_cpufreq_init()):
        policy->freq_table、policy->cpus、policy->related_cpus、
        policy->cpuinfo.max_freq/min_freq、policy->cur/max/min
     */
    /* call driver. From then on the cpufreq must be able
     * to accept all calls to ->verify and ->setpolicy for this CPU
     */
    ret = cpufreq_driver->init(policy);
    if (ret) {
        pr_debug("initialization failed\n");
        goto out_free_policy;
    }

    down_write(&policy->rwsem);

    /* (3) 如果cpufreq_policy是新分配空间的,
        做一些相应的初始化工作
     */
    if (new_policy) {
        /* related_cpus should at least include policy->cpus. */
        cpumask_copy(policy->related_cpus, policy->cpus);
        /* Remember CPUs present at the policy creation time. */
        cpumask_and(policy->real_cpus, policy->cpus, cpu_present_mask);

        /* Name and add the kobject */
        ret = kobject_add(&policy->kobj, cpufreq_global_kobject,
                  "policy%u",
                  cpumask_first(policy->related_cpus));
        if (ret) {
            pr_err("%s: failed to add policy->kobj: %d\n", __func__,
                   ret);
            goto out_exit_policy;
        }
    }

    /*
     * affected cpus must always be the one, which are online. We aren't
     * managing offline cpus here.
     */
    cpumask_and(policy->cpus, policy->cpus, cpu_online_mask);

    if (new_policy) {
        policy->user_policy.min = policy->min;
        policy->user_policy.max = policy->max;

        write_lock_irqsave(&cpufreq_driver_lock, flags);

        /* (3.1) 同一个cluster中所有cpu的per_cpu(cpufreq_cpu_data, j),共享同一个cpufreq_policy */
        for_each_cpu(j, policy->related_cpus)
            per_cpu(cpufreq_cpu_data, j) = policy;
        write_unlock_irqrestore(&cpufreq_driver_lock, flags);
    }

    /* (4) 获取cpufreq_policy的当前频率
     */
    if (cpufreq_driver->get && !cpufreq_driver->setpolicy) {
        policy->cur = cpufreq_driver->get(policy->cpu);
        if (!policy->cur) {
            pr_err("%s: ->get() failed\n", __func__);
            goto out_exit_policy;
        }
    }

    /*
     * Sometimes boot loaders set CPU frequency to a value outside of
     * frequency table present with cpufreq core. In such cases CPU might be
     * unstable if it has to run on that frequency for long duration of time
     * and so its better to set it to a frequency which is specified in
     * freq-table. This also makes cpufreq stats inconsistent as
     * cpufreq-stats would fail to register because current frequency of CPU
     * isn't found in freq-table.
     *
     * Because we don't want this change to effect boot process badly, we go
     * for the next freq which is >= policy->cur ('cur' must be set by now,
     * otherwise we will end up setting freq to lowest of the table as 'cur'
     * is initialized to zero).
     *
     * We are passing target-freq as "policy->cur - 1" otherwise
     * __cpufreq_driver_target() would simply fail, as policy->cur will be
     * equal to target-freq.
     */
    if ((cpufreq_driver->flags & CPUFREQ_NEED_INITIAL_FREQ_CHECK)
        && has_target()) {
        /* Are we running at unknown frequency ? */
        ret = cpufreq_frequency_table_get_index(policy, policy->cur);
        if (ret == -EINVAL) {
            /* Warn user and fix it */
            pr_warn("%s: CPU%d: Running at unlisted freq: %u KHz\n",
                __func__, policy->cpu, policy->cur);
            ret = __cpufreq_driver_target(policy, policy->cur - 1,
                CPUFREQ_RELATION_L);

            /*
             * Reaching here after boot in a few seconds may not
             * mean that system will remain stable at "unknown"
             * frequency for longer duration. Hence, a BUG_ON().
             */
            BUG_ON(ret);
            pr_warn("%s: CPU%d: Unlisted initial frequency changed to: %u KHz\n",
                __func__, policy->cpu, policy->cur);
        }
    }

    blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
                     CPUFREQ_START, policy);

    if (new_policy) {
        ret = cpufreq_add_dev_interface(policy);
        if (ret)
            goto out_exit_policy;
        blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
                CPUFREQ_CREATE_POLICY, policy);

        write_lock_irqsave(&cpufreq_driver_lock, flags);
        list_add(&policy->policy_list, &cpufreq_policy_list);
        write_unlock_irqrestore(&cpufreq_driver_lock, flags);
    }

    /* (5) 调用cpufreq governor的初始化函数,来初始化cpufreq_policy
     */
    ret = cpufreq_init_policy(policy);
    if (ret) {
        pr_err("%s: Failed to initialize policy for cpu: %d (%d)\n",
               __func__, cpu, ret);
        /* cpufreq_policy_free() will notify based on this */
        new_policy = false;
        goto out_exit_policy;
    }

    up_write(&policy->rwsem);

    kobject_uevent(&policy->kobj, KOBJ_ADD);

    /* Callback for handling stuff after policy is ready */
    if (cpufreq_driver->ready)
        cpufreq_driver->ready(policy);

    pr_debug("initialization complete\n");

    return 0;

out_exit_policy:
    up_write(&policy->rwsem);

    if (cpufreq_driver->exit)
        cpufreq_driver->exit(policy);
out_free_policy:
    cpufreq_policy_free(policy, !new_policy);
    return ret;
}

|→

static int _mt_cpufreq_init(struct cpufreq_policy *policy)
{
    int ret = -EINVAL;
    unsigned long flags;

    FUNC_ENTER(FUNC_LV_MODULE);

    policy->shared_type = CPUFREQ_SHARED_TYPE_ANY;
    cpumask_setall(policy->cpus);

    policy->cpuinfo.transition_latency = 1000;

    {
        enum mt_cpu_dvfs_id id = _get_cpu_dvfs_id(policy->cpu);
        struct mt_cpu_dvfs *p = id_to_cpu_dvfs(id);
        unsigned int lv = _mt_cpufreq_get_cpu_level();
        struct opp_tbl_info *opp_tbl_info;
        struct opp_tbl_m_info *opp_tbl_m_info;
        struct opp_tbl_m_info *opp_tbl_m_cci_info;
        struct mt_cpu_dvfs *p_cci;

        cpufreq_ver("DVFS: _mt_cpufreq_init: %s(cpu_id = %d)\n", cpu_dvfs_get_name(p), p->cpu_id);

        opp_tbl_info = &opp_tbls[id][lv];

        p->cpu_level = lv;

        /* (2.1) 给policy->freq_table赋值 
            给policy->cpus赋值
            给policy->related_cpus赋值
         */
        ret = _mt_cpufreq_setup_freqs_table(policy,
                            opp_tbl_info->opp_tbl, opp_tbl_info->size);

        /* (2.2) 给policy->cpuinfo.max_freq赋值 
            给policy->cpuinfo.min_freq赋值
         */
        policy->cpuinfo.max_freq = cpu_dvfs_get_max_freq(p);
        policy->cpuinfo.min_freq = cpu_dvfs_get_min_freq(p);

        opp_tbl_m_info = &opp_tbls_m[id][lv];
        p->freq_tbl = opp_tbl_m_info->opp_tbl_m;

        cpufreq_lock(flags);
        /* Sync p */
        if (_mt_cpufreq_sync_opp_tbl_idx(p) >= 0)
            if (p->idx_normal_max_opp == -1)
                p->idx_normal_max_opp = p->idx_opp_tbl;

        /* (2.3) 给policy->cur赋值 
            给policy->max赋值
            给policy->min赋值
         */
        policy->cur = cpu_dvfs_get_cur_freq(p); /* use cur phy freq is better */
        policy->max = cpu_dvfs_get_freq_by_idx(p, p->idx_opp_ppm_limit);
        policy->min = cpu_dvfs_get_freq_by_idx(p, p->idx_opp_ppm_base);
        p->mt_policy = policy;
        p->armpll_is_available = 1;

#ifdef CONFIG_HYBRID_CPU_DVFS
        if (turbo_flag && cpu_dvfs_is(p, MT_CPU_DVFS_B) && !turbo_is_inited) {
            unsigned int turbo_f, turbo_v;

            turbo_f = ((cpu_dvfs_get_max_freq(p) * 104 / 100) / 13) * 13 / 1000;

            if (picachu_need_higher_volt(MT_PICACHU_DOMAIN2))
                turbo_v = MAX_VPROC_VOLT;
            else
                turbo_v = MAX_VPROC_VOLT - 2000;
            /* turbo_v = p->opp_tbl[0].cpufreq_volt; */
            cpuhvfs_set_turbo_scale(turbo_f * 1000, turbo_v);
            turbo_is_inited = 1;
        }
#endif

        /* Sync cci */
        if (cci_is_inited == 0) {
            p_cci = id_to_cpu_dvfs(MT_CPU_DVFS_CCI);

            /* init cci freq idx */
            if (_mt_cpufreq_sync_opp_tbl_idx(p_cci) >= 0)
                if (p_cci->idx_normal_max_opp == -1)
                    p_cci->idx_normal_max_opp = p_cci->idx_opp_tbl;

            opp_tbl_m_cci_info = &opp_tbls_m[MT_CPU_DVFS_CCI][lv];
            p_cci->freq_tbl = opp_tbl_m_cci_info->opp_tbl_m;
            p_cci->mt_policy = NULL;
            p_cci->armpll_is_available = 1;
            cci_is_inited = 1;
        }
#ifdef CONFIG_HYBRID_CPU_DVFS
        cpuhvfs_set_cluster_on_off(arch_get_cluster_id(p->cpu_id), 1);
#endif
        cpufreq_unlock(flags);
    }

    if (ret)
        cpufreq_err("failed to setup frequency table\n");

    FUNC_EXIT(FUNC_LV_MODULE);

    return ret;
}

||→

static int _mt_cpufreq_setup_freqs_table(struct cpufreq_policy *policy,
                     struct mt_cpu_freq_info *freqs, int num)
{
    struct mt_cpu_dvfs *p;
    int ret = 0;

    FUNC_ENTER(FUNC_LV_LOCAL);

    p = id_to_cpu_dvfs(_get_cpu_dvfs_id(policy->cpu));

#ifdef CONFIG_CPU_FREQ
    ret = cpufreq_frequency_table_cpuinfo(policy, p->freq_tbl_for_cpufreq);

    /* (2.1.1) 给policy->freq_table赋值 
     */
    if (!ret)
        policy->freq_table = p->freq_tbl_for_cpufreq;

    /* (2.1.2) 根据cpu相同cluster中有哪些cpu 
        给policy->cpus赋值
        给policy->related_cpus赋值
     */
    cpumask_copy(policy->cpus, topology_core_cpumask(policy->cpu));
    cpumask_copy(policy->related_cpus, policy->cpus);
#endif

    FUNC_EXIT(FUNC_LV_LOCAL);

    return 0;
}
  • 3、在cpufreq_online()初始化完cpufreq_policy,最后会调用cpufreq_init_policy()继续governor的初始化:
static int cpufreq_init_policy(struct cpufreq_policy *policy)
{
    struct cpufreq_governor *gov = NULL;
    struct cpufreq_policy new_policy;

    memcpy(&new_policy, policy, sizeof(*policy));

    /* (5.1) 使用last或者default的governor,
        给new_policy.governor赋值
     */
    /* Update governor of new_policy to the governor used before hotplug */
    gov = find_governor(policy->last_governor);
    if (gov)
        pr_debug("Restoring governor %s for cpu %d\n",
                policy->governor->name, policy->cpu);
    else
        gov = CPUFREQ_DEFAULT_GOVERNOR;

    new_policy.governor = gov;

    /* Use the default policy if there is no last_policy. */
    if (cpufreq_driver->setpolicy) {
        if (policy->last_policy)
            new_policy.policy = policy->last_policy;
        else
            cpufreq_parse_governor(gov->name, &new_policy.policy,
                           NULL);
    }

    /* (5.2) 启动governor来使用cpufreq_policy */
    /* set default policy */
    return cpufreq_set_policy(policy, &new_policy);
}

|→

static int cpufreq_set_policy(struct cpufreq_policy *policy,
                struct cpufreq_policy *new_policy)
{
    struct cpufreq_governor *old_gov;
    int ret;

    pr_debug("setting new policy for CPU %u: %u - %u kHz\n",
         new_policy->cpu, new_policy->min, new_policy->max);

    memcpy(&new_policy->cpuinfo, &policy->cpuinfo, sizeof(policy->cpuinfo));

    /* (5.2.1) 对policy、new_policy的一堆合法性判断 */
    /*
    * This check works well when we store new min/max freq attributes,
    * because new_policy is a copy of policy with one field updated.
    */
    if (new_policy->min > new_policy->max)
        return -EINVAL;

    /* verify the cpu speed can be set within this limit */
    ret = cpufreq_driver->verify(new_policy);
    if (ret)
        return ret;

    /* adjust if necessary - all reasons */
    blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
            CPUFREQ_ADJUST, new_policy);

    /*
     * verify the cpu speed can be set within this limit, which might be
     * different to the first one
     */
    ret = cpufreq_driver->verify(new_policy);
    if (ret)
        return ret;

    /* notification of the new policy */
    blocking_notifier_call_chain(&cpufreq_policy_notifier_list,
            CPUFREQ_NOTIFY, new_policy);

    scale_freq_capacity(new_policy, NULL);

    policy->min = new_policy->min;
    policy->max = new_policy->max;
    trace_cpu_frequency_limits(policy->max, policy->min, policy->cpu);

    pr_debug("new min and max freqs are %u - %u kHz\n",
         policy->min, policy->max);

    if (cpufreq_driver->setpolicy) {
        policy->policy = new_policy->policy;
        pr_debug("setting range\n");
        return cpufreq_driver->setpolicy(new_policy);
    }

    if (new_policy->governor == policy->governor)
        goto out;

    pr_debug("governor switch\n");

    /* (5.2.2) 如果旧的governor在工作中,
        依次调用 CPUFREQ_GOV_STOP、CPUFREQ_GOV_POLICY_EXIT停止旧的governor
     */
    /* save old, working values */
    old_gov = policy->governor;
    /* end old governor */
    if (old_gov) {
        ret = __cpufreq_governor(policy, CPUFREQ_GOV_STOP);
        if (ret) {
            /* This can happen due to race with other operations */
            pr_debug("%s: Failed to Stop Governor: %s (%d)\n",
                 __func__, old_gov->name, ret);
            return ret;
        }

        up_write(&policy->rwsem);
        ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
        down_write(&policy->rwsem);

        if (ret) {
            pr_err("%s: Failed to Exit Governor: %s (%d)\n",
                   __func__, old_gov->name, ret);
            return ret;
        }
    }

    /* (5.2.3) 依次调用 CPUFREQ_GOV_POLICY_INIT、CPUFREQ_GOV_START让新的governor开工
     */
    /* start new governor */
    policy->governor = new_policy->governor;
    ret = __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT);
    if (!ret) {
        ret = __cpufreq_governor(policy, CPUFREQ_GOV_START);
        if (!ret)
            goto out;

        up_write(&policy->rwsem);
        __cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT);
        down_write(&policy->rwsem);
    }

    /* new governor failed, so re-start old one */
    pr_debug("starting governor %s failed\n", policy->governor->name);
    if (old_gov) {
        policy->governor = old_gov;
        if (__cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT))
            policy->governor = NULL;
        else
            __cpufreq_governor(policy, CPUFREQ_GOV_START);
    }

    return ret;

 out:
    pr_debug("governor: change or update limits\n");
    return __cpufreq_governor(policy, CPUFREQ_GOV_LIMITS);
}

||→

static int __cpufreq_governor(struct cpufreq_policy *policy,
                    unsigned int event)
{

    /* __cpufreq_governor()调用的各种命令最后调用的都是governor的具体函数 */
    ret = policy->governor->governor(policy, event);
}
  • 4、以interactive governor为例,说明policy->governor->governor()对CPUFREQ_GOV_POLICY_INIT、CPUFREQ_GOV_START、CPUFREQ_GOV_STOP、CPUFREQ_GOV_POLICY_EXIT这几个命令的实现:
struct cpufreq_governor cpufreq_gov_interactive = {
    .name = "interactive",
    .governor = cpufreq_governor_interactive,
    .max_transition_latency = 10000000,
    .owner = THIS_MODULE,
};

↓

static int cpufreq_governor_interactive(struct cpufreq_policy *policy,
        unsigned int event)
{
    int rc;
    unsigned int j;
    struct cpufreq_interactive_cpuinfo *pcpu;
    struct cpufreq_frequency_table *freq_table;
    struct cpufreq_interactive_tunables *tunables;
    unsigned long flags;

    if (have_governor_per_policy())
        tunables = policy->governor_data;
    else
        tunables = common_tunables;

    WARN_ON(!tunables && (event != CPUFREQ_GOV_POLICY_INIT));

    switch (event) {

    /* (1) CPUFREQ_GOV_POLICY_INIT命令的实现:
        初始化tunables,tunables是interactive governor在计算时使用的各种参数
        相关的sysfs注册
     */
    case CPUFREQ_GOV_POLICY_INIT:
        if (have_governor_per_policy()) {
            WARN_ON(tunables);
        } else if (tunables) {
            tunables->usage_count++;
            policy->governor_data = tunables;
            return 0;
        }

        tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
        if (!tunables) {
            pr_err("%s: POLICY_INIT: kzalloc failed\n", __func__);
            return -ENOMEM;
        }

        tunables->usage_count = 1;
        tunables->above_hispeed_delay = default_above_hispeed_delay;
        tunables->nabove_hispeed_delay =
            ARRAY_SIZE(default_above_hispeed_delay);
        tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD;
        tunables->target_loads = default_target_loads;
        tunables->ntarget_loads = ARRAY_SIZE(default_target_loads);
        tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME;
        tunables->timer_rate = DEFAULT_TIMER_RATE;
        tunables->boostpulse_duration_val = DEFAULT_MIN_SAMPLE_TIME;
        tunables->timer_slack_val = DEFAULT_TIMER_SLACK;

        spin_lock_init(&tunables->target_loads_lock);
        spin_lock_init(&tunables->above_hispeed_delay_lock);

        policy->governor_data = tunables;
        if (!have_governor_per_policy()) {
            common_tunables = tunables;
        }

        rc = sysfs_create_group(get_governor_parent_kobj(policy),
                get_sysfs_attr());
        if (rc) {
            kfree(tunables);
            policy->governor_data = NULL;
            if (!have_governor_per_policy()) {
                common_tunables = NULL;
            }
            return rc;
        }

        if (!policy->governor->initialized) {
            idle_notifier_register(&cpufreq_interactive_idle_nb);
            cpufreq_register_notifier(&cpufreq_notifier_block,
                    CPUFREQ_TRANSITION_NOTIFIER);
        }

        break;

    /* (2) CPUFREQ_GOV_POLICY_EXIT命令的实现:
        remove相关的sysfs
     */
    case CPUFREQ_GOV_POLICY_EXIT:
        if (!--tunables->usage_count) {
            if (policy->governor->initialized == 1) {
                cpufreq_unregister_notifier(&cpufreq_notifier_block,
                        CPUFREQ_TRANSITION_NOTIFIER);
                idle_notifier_unregister(&cpufreq_interactive_idle_nb);
            }
#ifdef CONFIG_MEIZU_BSP
        }
#else
            sysfs_remove_group(get_governor_parent_kobj(policy),
                    get_sysfs_attr());

            kfree(tunables);
            common_tunables = NULL;
        }

        policy->governor_data = NULL;
#endif //CONFIG_MEIZU_BSP
        break;

    /* (3) CPUFREQ_GOV_START命令的实现:
        因为同一个cluster中的多个cpu是共享一个cpufreq_policy的,
        所以使用同一个cpufreq_policy来初始化cluster中多个online cpu的per_cpu(cpuinfo, j)变量:
        pcpu->target_freq    // 当前频率
        pcpu->freq_table     // 频率表
        并且启动cpu上的interactive_timer=pcpu->cpu_timer:
        cpufreq_interactive_timer_start(tunables, j);
     */
    case CPUFREQ_GOV_START:
        mutex_lock(&gov_lock);

        freq_table = cpufreq_frequency_get_table(policy->cpu);
        if (tunables && !tunables->hispeed_freq)
            tunables->hispeed_freq = policy->max;

        for_each_cpu(j, policy->cpus) {
            pcpu = &per_cpu(cpuinfo, j);
            pcpu->policy = policy;
            pcpu->target_freq = policy->cur;
            pcpu->freq_table = freq_table;
            pcpu->floor_freq = pcpu->target_freq;
            pcpu->pol_floor_val_time =
                ktime_to_us(ktime_get());
            pcpu->loc_floor_val_time = pcpu->pol_floor_val_time;
            pcpu->pol_hispeed_val_time = pcpu->pol_floor_val_time;
            pcpu->loc_hispeed_val_time = pcpu->pol_floor_val_time;
            down_write(&pcpu->enable_sem);
            del_timer_sync(&pcpu->cpu_timer);
            del_timer_sync(&pcpu->cpu_slack_timer);
            cpufreq_interactive_timer_start(tunables, j);
            pcpu->governor_enabled = 1;
            up_write(&pcpu->enable_sem);
        }

        mutex_unlock(&gov_lock);
        break;

    /* (4) CPUFREQ_GOV_STOP命令的实现:
        如果同一个cluster中的多个cpu都已经offline,停掉对应的governor:
        停掉cpu上的interactive_timer=pcpu->cpu_timer
     */
    case CPUFREQ_GOV_STOP:
        mutex_lock(&gov_lock);
        for_each_cpu(j, policy->cpus) {
            pcpu = &per_cpu(cpuinfo, j);
            down_write(&pcpu->enable_sem);
            pcpu->governor_enabled = 0;
            del_timer_sync(&pcpu->cpu_timer);
            del_timer_sync(&pcpu->cpu_slack_timer);
            up_write(&pcpu->enable_sem);
        }

        mutex_unlock(&gov_lock);
        break;

    case CPUFREQ_GOV_LIMITS:
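        /* (5) CPUFREQ_GOV_LIMITS命令的实现:
            policy的min/max发生变化时,先把当前频率钳制到新的[min, max]范围内,
            再把各cpu的pcpu->target_freq也钳制到该范围
         */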
        if (policy->max < policy->cur)
            __cpufreq_driver_target(policy,
                    policy->max, CPUFREQ_RELATION_H);
        else if (policy->min > policy->cur)
            __cpufreq_driver_target(policy,
                    policy->min, CPUFREQ_RELATION_L);
        for_each_cpu(j, policy->cpus) {
            pcpu = &per_cpu(cpuinfo, j);

            down_read(&pcpu->enable_sem);
            if (pcpu->governor_enabled == 0) {
                up_read(&pcpu->enable_sem);
                continue;
            }

            spin_lock_irqsave(&pcpu->target_freq_lock, flags);
            if (policy->max < pcpu->target_freq)
                pcpu->target_freq = policy->max;
            else if (policy->min > pcpu->target_freq)
                pcpu->target_freq = policy->min;

            spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
            up_read(&pcpu->enable_sem);
        }
        break;
    }
4.3.1.2、cpufreq的频率配置

cpufreq一个重要的作用就是能把用户需要的cpu频率配置下去,这部分的代码也需要cpufreq core和cpufreq driver的配合。频率调整也叫DVFS(Dynamic Voltage and Frequency Scaling),需要按照对应关系把电压和频率一起配置下去。
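
下面用一个示意性的“频率-电压”表(OPP表)说明这种对应关系(struct demo_opp及其中的数值均为假设,MTK实际的表项定义以代码中的opp_tbl为准):

/* 示意:一张简化的OPP(频率-电压)表,字段名与数值均为假设 */
struct demo_opp {
    unsigned int freq_khz;  /* 目标频率,单位kHz */
    unsigned int volt_uv;   /* 该频率对应的最低工作电压,单位uV */
};

static const struct demo_opp demo_opp_tbl[] = {
    { 1989000, 1000000 },
    { 1742000,  950000 },
    { 1508000,  900000 },
    { 1209000,  850000 },
    {  902000,  800000 },
};

/* 一般的DVFS顺序:升频时先升电压再升频率,降频时先降频率再降电压 */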

具体的代码解析如下:

int __cpufreq_driver_target(struct cpufreq_policy *policy,
                unsigned int target_freq,
                unsigned int relation)
{
    unsigned int old_target_freq = target_freq;
    int retval = -EINVAL;

    if (cpufreq_disabled())
        return -ENODEV;

    /* (1) target目标频率在policy中的合法性检测 */
    /* Make sure that target_freq is within supported range */
    if (target_freq > policy->max)
        target_freq = policy->max;
    if (target_freq < policy->min)
        target_freq = policy->min;

    pr_debug("target for CPU %u: %u kHz, relation %u, requested %u kHz\n",
         policy->cpu, target_freq, relation, old_target_freq);

    /* (2) 如果当前频率就是target频率,不用调整直接返回 */
    /*
     * This might look like a redundant call as we are checking it again
     * after finding index. But it is left intentionally for cases where
     * exactly same freq is called again and so we can save on few function
     * calls.
     */
    if (target_freq == policy->cur)
        return 0;

    /* Save last value to restore later on errors */
    policy->restore_freq = policy->cur;

    if (cpufreq_driver->target)
        /* (3) 调用实际的驱动target()函数来调整cpu频率 */
        retval = cpufreq_driver->target(policy, target_freq, relation);
    else if (cpufreq_driver->target_index) {
        struct cpufreq_frequency_table *freq_table;
        int index;

        freq_table = cpufreq_frequency_get_table(policy->cpu);
        if (unlikely(!freq_table)) {
            pr_err("%s: Unable to find freq_table\n", __func__);
            goto out;
        }

        retval = cpufreq_frequency_table_target(policy, freq_table,
                target_freq, relation, &index);
        if (unlikely(retval)) {
            pr_err("%s: Unable to find matching freq\n", __func__);
            goto out;
        }

        if (freq_table[index].frequency == policy->cur) {
            retval = 0;
            goto out;
        }

        retval = __target_index(policy, freq_table, index);
    }

out:
    return retval;
}

|→

static int _mt_cpufreq_target(struct cpufreq_policy *policy, unsigned int target_freq,
                  unsigned int relation)
{
    struct mt_cpu_dvfs *p;
    int ret;
    unsigned int new_opp_idx;

    p = id_to_cpu_dvfs(_get_cpu_dvfs_id(policy->cpu));
    if (!p)
        return -EINVAL;

    /* (3.1) 根据频率表把target_freq换算成档位new_opp_idx,随后通过_mt_cpufreq_dvfs_request_wrapper()按频率-电压对应关系完成实际的调频调压 */
    ret = cpufreq_frequency_table_target(policy, p->freq_tbl_for_cpufreq,
                         target_freq, relation, &new_opp_idx);
    if (ret || new_opp_idx >= p->nr_opp_tbl)
        return -EINVAL;

    if (dvfs_disable_flag || p->dvfs_disable_by_suspend || p->dvfs_disable_by_procfs)
        return -EPERM;

    _mt_cpufreq_dvfs_request_wrapper(p, new_opp_idx, MT_CPU_DVFS_NORMAL, NULL);

    return 0;
}

4.3.2、interactive governor

在所有的cpufreq governor中最有名气的就是interactive governor了,因为几乎所有的Android系统都在使用。

interactive的思想就是使用cpu的负载来调整cpu频率,核心就是:使用一个20ms的定时器来计算cpu占用率,根据cpu占用率的不同threshold来调整不同档位的频率。

(图:interactive governor负载采样与cpu占用率计算示意)

interactive的负载计算方法如上图所示。interactive的整个计算方法大概如下(列表之后附一个简化的数值演算):

  • 1、计算cpu的累加负载。每20ms采样一次,每次采样统计增加的active_time和当前频率的乘积:cputime_speedadj += active_time * cur_freq;
  • 2、计算cpu的占用率。当前cpu占用率 = (累加负载*100)/(累加时间*当前频率),即cpu_load = (loadadjfreq*100)/(delta_time*cur_freq);
  • 3、如果cpu_load达到高门限go_hispeed_load(99%)或者发生boost,直接调节频率到hispeed_freq(最高频率);
  • 4、其他情况下使用choose_freq()公式计算新频率:new_freq = cur_freq*(cpu_load/DEFAULT_TARGET_LOAD(90));new_freq = cpufreq_frequency_table_target(new_freq, CPUFREQ_RELATION_L);
  • 5、如果当前频率已经达到hispeed_freq,还需要往上调整,必须在之前的频率上保持above_hispeed_delay(20ms);如果当前频率已经达到hispeed_freq,还需要往下调整,必须在之前的频率上保持min_sample_time(80ms);
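
下面按上述步骤用一组假设的数值走一遍计算流程(timer_rate=20ms、当前频率1000MHz、窗口内active 15ms等均为假设值):

#include <stdio.h>

/* 示意:interactive负载计算和choose_freq()选频的简化数值演算,数值均为假设 */
int main(void)
{
    unsigned long long cur_freq = 1000000;   /* 当前频率1000MHz,单位kHz */
    unsigned long long delta_time = 20000;   /* 采样窗口20ms,单位us */
    unsigned long long active_time = 15000;  /* 窗口内cpu处于active的时间15ms */

    /* 1. 累加负载:cputime_speedadj += active_time * cur_freq */
    unsigned long long cputime_speedadj = active_time * cur_freq;

    /* 2. cpu占用率:cpu_load = (loadadjfreq * 100) / (delta_time * cur_freq) */
    unsigned int cpu_load = (unsigned int)(cputime_speedadj * 100 /
                                           (delta_time * cur_freq));

    /* 3. 未达到go_hispeed_load(99%),按choose_freq()公式:
       new_freq = cur_freq * cpu_load / DEFAULT_TARGET_LOAD(90),
       再用CPUFREQ_RELATION_L在频率表中选出对应档位 */
    unsigned long long new_freq = cur_freq * cpu_load / 90;

    printf("cpu_load = %u%%, new_freq = %llu kHz\n", cpu_load, new_freq);
    return 0;
}

按这组假设值算出cpu_load=75%、new_freq≈833MHz,最后由CPUFREQ_RELATION_L在频率表中选出对应的档位。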

interactive governor从原理上看,有以下问题:

  • 1、20ms的采样时间过长,负载变化到频率调整的反应时间过长;
  • 2、负载累加计算有问题,历史负载没有老化机制,历史负载的权重和当前一样,造成当前的负载变化不真实;
  • 3、计算cpu占用率 = 总历史负载/(总时间*当前频率),这个算法不合理,历史负载对当前的影响太大。如果之前是高频率,现在变成低频率,那么cpu_load计算出来的值可能超过100%;如果之前是低频率,现在是高频率,那么cpu_load计算出来的值也会被大大拉低;
  • 4、choose_freq()的计算公式有重大漏洞。比如cpu频率表={800M, 900M},当前cur_freq=800M、cur_load=100%,那么new_freq = (cur_freq*cur_load)/90 = 889M,使用CPUFREQ_RELATION_L选择档位,选择到的还是800M,根本不能向高档位前进。这是算法的一个漏洞,如果cpu相邻档位的频率比值大于(100/90),那么正常往上调频是调不上去的,会被CPUFREQ_RELATION_L参数拦下来。所以实际的interactive调频,都是使用go_hispeed_load(99%)调到最高值的,再使用choose_freq()来降频。

所以interactive governor会逐渐被基于调度器负载信息的cpufreq governor(例如schedutil)所取代。

4.3.2.1、interactive governor的初始化
  • 1、interactive的一部分初始化在cpufreq_interactive_init()当中:
static int __init cpufreq_interactive_init(void)
{
    unsigned int i;
    struct cpufreq_interactive_cpuinfo *pcpu;
    struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };

    /* (1) 初始化percpu变量per_cpu(cpuinfo, i): 
        为每个cpu创建负载计算定时器pcpu->cpu_timer和pcpu->cpu_slack_timer,
        并初始化相关的自旋锁和读写信号量
     */
    /* Initalize per-cpu timers */
    for_each_possible_cpu(i) {
        pcpu = &per_cpu(cpuinfo, i);
        init_timer_deferrable(&pcpu->cpu_timer);
        pcpu->cpu_timer.function = cpufreq_interactive_timer;
        pcpu->cpu_timer.data = i;
        init_timer(&pcpu->cpu_slack_timer);
        pcpu->cpu_slack_timer.function = cpufreq_interactive_nop_timer;
        spin_lock_init(&pcpu->load_lock);
        spin_lock_init(&pcpu->target_freq_lock);
        init_rwsem(&pcpu->enable_sem);
    }

    spin_lock_init(&speedchange_cpumask_lock);
    mutex_init(&gov_lock);

    /* (2) Create the frequency-change thread speedchange_task,
        so that the time-consuming frequency switch runs in a dedicated kthread
     */
    speedchange_task =
        kthread_create(cpufreq_interactive_speedchange_task, NULL,
                   "cfinteractive");
    if (IS_ERR(speedchange_task))
        return PTR_ERR(speedchange_task);

    sched_setscheduler_nocheck(speedchange_task, SCHED_FIFO, &param);
    get_task_struct(speedchange_task);

    /* NB: wake up so the thread does not look hung to the freezer */
    wake_up_process(speedchange_task);

    return cpufreq_register_governor(&cpufreq_gov_interactive);
}
  • 2、The rest of the initialization is done in cpufreq_governor_interactive() under the CPUFREQ_GOV_POLICY_INIT and CPUFREQ_GOV_START events, which run when a cpu comes online:

static int cpufreq_governor_interactive(struct cpufreq_policy *policy,
        unsigned int event)
{


    switch (event) {
    /* (1) CPUFREQ_GOV_POLICY_INIT initializes the core tunables of the interactive governor
     */
    case CPUFREQ_GOV_POLICY_INIT:
        if (have_governor_per_policy()) {
            WARN_ON(tunables);
        } else if (tunables) {
            tunables->usage_count++;
            policy->governor_data = tunables;
            return 0;
        }

        tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
        if (!tunables) {
            pr_err("%s: POLICY_INIT: kzalloc failed\n", __func__);
            return -ENOMEM;
        }

        tunables->usage_count = 1;
        tunables->above_hispeed_delay = default_above_hispeed_delay;
        tunables->nabove_hispeed_delay =
            ARRAY_SIZE(default_above_hispeed_delay);
        tunables->go_hispeed_load = DEFAULT_GO_HISPEED_LOAD;
        tunables->target_loads = default_target_loads;
        tunables->ntarget_loads = ARRAY_SIZE(default_target_loads);
        tunables->min_sample_time = DEFAULT_MIN_SAMPLE_TIME;
        tunables->timer_rate = DEFAULT_TIMER_RATE;          // the interactive load-sampling timer defaults to 20ms
        tunables->boostpulse_duration_val = DEFAULT_MIN_SAMPLE_TIME;
        tunables->timer_slack_val = DEFAULT_TIMER_SLACK;

        spin_lock_init(&tunables->target_loads_lock);
        spin_lock_init(&tunables->above_hispeed_delay_lock);

        policy->governor_data = tunables;
        if (!have_governor_per_policy()) {
            common_tunables = tunables;
        }

        rc = sysfs_create_group(get_governor_parent_kobj(policy),
                get_sysfs_attr());
        if (rc) {
            kfree(tunables);
            policy->governor_data = NULL;
            if (!have_governor_per_policy()) {
                common_tunables = NULL;
            }
            return rc;
        }

        if (!policy->governor->initialized) {
            idle_notifier_register(&cpufreq_interactive_idle_nb);
            cpufreq_register_notifier(&cpufreq_notifier_block,
                    CPUFREQ_TRANSITION_NOTIFIER);
        }

        break;


    /* (2) CPUFREQ_GOV_START starts the interactive load-sampling timer
     */
    case CPUFREQ_GOV_START:
        mutex_lock(&gov_lock);

        freq_table = cpufreq_frequency_get_table(policy->cpu);
        if (tunables && !tunables->hispeed_freq)
            tunables->hispeed_freq = policy->max;

        for_each_cpu(j, policy->cpus) {
            pcpu = &per_cpu(cpuinfo, j);
            pcpu->policy = policy;
            pcpu->target_freq = policy->cur;
            pcpu->freq_table = freq_table;
            pcpu->floor_freq = pcpu->target_freq;
            pcpu->pol_floor_val_time =
                ktime_to_us(ktime_get());
            pcpu->loc_floor_val_time = pcpu->pol_floor_val_time;
            pcpu->pol_hispeed_val_time = pcpu->pol_floor_val_time;
            pcpu->loc_hispeed_val_time = pcpu->pol_floor_val_time;
            down_write(&pcpu->enable_sem);
            del_timer_sync(&pcpu->cpu_timer);
            del_timer_sync(&pcpu->cpu_slack_timer);
            cpufreq_interactive_timer_start(tunables, j);
            pcpu->governor_enabled = 1;
            up_write(&pcpu->enable_sem);
        }

        mutex_unlock(&gov_lock);
        break;


    }
4.3.2.2、The interactive governor algorithm

The core of the interactive governor runs in the 20ms periodic timer handler cpufreq_interactive_timer():

static void cpufreq_interactive_timer(unsigned long data)
{
    u64 now;
    unsigned int delta_time;
    u64 cputime_speedadj;
    int cpu_load;
    struct cpufreq_interactive_cpuinfo *pcpu =
        &per_cpu(cpuinfo, data);
    struct cpufreq_interactive_tunables *tunables =
        pcpu->policy->governor_data;
    unsigned int new_freq;
    unsigned int loadadjfreq;
    unsigned int index;
    unsigned long flags;
    u64 max_fvtime;
    int j;
    unsigned int max_t_freq = 0;

#ifdef CPUDVFS_POWER_MODE
    /* default(normal), low power, just make, performance(sports) */
    int min_sample_t[4] = { 80, 20, 20, 80 };
    int ppb_idx;
#endif

    if (!down_read_trylock(&pcpu->enable_sem))
        return;
    if (!pcpu->governor_enabled)
        goto exit;

    spin_lock_irqsave(&pcpu->load_lock, flags);

    /* (1) Accumulate the load on this cpu since cpu_up():
        pcpu->cputime_speedadj += active_time * pcpu->policy->cur;
        i.e. pcpu->cputime_speedadj = (active_time * cur_freq) of sample 1 + ... + (active_time * cur_freq) of sample n;
        each sampling window is 20ms, so this sums active_time*cur_cpu_freq of the 1st 20ms window + the 2nd 20ms window + ... + the n-th 20ms window
     */
    now = update_load(data);

    /* (2) Total time since cpu_up():
        delta_time = active_time + idle_time
     */
    delta_time = (unsigned int)(now - pcpu->cputime_speedadj_timestamp);
    cputime_speedadj = pcpu->cputime_speedadj;
    spin_unlock_irqrestore(&pcpu->load_lock, flags);

    if (WARN_ON_ONCE(!delta_time))
        goto rearm;

    spin_lock_irqsave(&pcpu->target_freq_lock, flags);

    /* (3) total load / total time = average frequency */
    do_div(cputime_speedadj, delta_time);

    /* (4) (average frequency * 100) / current frequency = current cpu utilization
     */
    loadadjfreq = (unsigned int)cputime_speedadj * 100;
    cpu_load = loadadjfreq / pcpu->policy->cur;
    tunables->boosted = tunables->boost_val || now < tunables->boostpulse_endtime;

#ifdef CPUDVFS_POWER_MODE
    ppb_idx = mt_cpufreq_get_ppb_state();

    {
        unsigned int idx = mt_cpufreq_ppb_hispeed_freq(data, ppb_idx);

        tunables->hispeed_freq = pcpu->freq_table[idx].frequency;
        tunables->min_sample_time = min_sample_t[ppb_idx] * USEC_PER_MSEC;

        if (hispeed_freq_perf != 0)
            tunables->hispeed_freq = hispeed_freq_perf;
        if (min_sample_time_perf != 0)
            tunables->min_sample_time = min_sample_time_perf;
    }
#endif

    /* (5) If cpu utilization reaches go_hispeed_load (99%), or boost is active,
        jump directly to the high speed hispeed_freq
     */
    if (cpu_load >= tunables->go_hispeed_load || tunables->boosted) {
        if (pcpu->policy->cur < tunables->hispeed_freq) {
            new_freq = tunables->hispeed_freq;
        } else {
            new_freq = choose_freq(pcpu, loadadjfreq);

            if (new_freq < tunables->hispeed_freq)
                new_freq = tunables->hispeed_freq;
        }

    /* (6) Otherwise use choose_freq() to derive the frequency from the current load
     */
    } else {
        new_freq = choose_freq(pcpu, loadadjfreq);
        if (new_freq > tunables->hispeed_freq &&
                pcpu->policy->cur < tunables->hispeed_freq)
            new_freq = tunables->hispeed_freq;
    }

    /* (7) If the cpu is already at or above hispeed_freq and the new frequency is higher still, do not raise it immediately:
        the cpu must have stayed at the current speed for above_hispeed_delay (20ms) before scaling further up
     */
    if (pcpu->policy->cur >= tunables->hispeed_freq &&
        new_freq > pcpu->policy->cur &&
        now - pcpu->pol_hispeed_val_time <
        freq_to_above_hispeed_delay(tunables, pcpu->policy->cur)) {
        trace_cpufreq_interactive_notyet(
            data, cpu_load, pcpu->target_freq,
            pcpu->policy->cur, new_freq);
        spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
        goto rearm;
    }

    pcpu->loc_hispeed_val_time = now;

    if (cpufreq_frequency_table_target(pcpu->policy, pcpu->freq_table,
                       new_freq, CPUFREQ_RELATION_L,
                       &index)) {
        spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
        goto rearm;
    }

    new_freq = pcpu->freq_table[index].frequency;

    /* (8) If the frequency was previously raised (to hispeed_freq or by a boost) and we now want to scale down,
        the previous (floor) frequency must have been held for min_sample_time (80ms) first
     */
    /*
     * Do not scale below floor_freq unless we have been at or above the
     * floor frequency for the minimum sample time since last validated.
     */
    max_fvtime = max(pcpu->pol_floor_val_time, pcpu->loc_floor_val_time);
    if (new_freq < pcpu->floor_freq &&
        pcpu->target_freq >= pcpu->policy->cur) {
        if (now - max_fvtime < tunables->min_sample_time) {
            trace_cpufreq_interactive_notyet(
                data, cpu_load, pcpu->target_freq,
                pcpu->policy->cur, new_freq);
            spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
            goto rearm;
        }
    }

    /*
     * Update the timestamp for checking whether speed has been held at
     * or above the selected frequency for a minimum of min_sample_time,
     * if not boosted to hispeed_freq.  If boosted to hispeed_freq then we
     * allow the speed to drop as soon as the boostpulse duration expires
     * (or the indefinite boost is turned off).
     */

    if (!tunables->boosted || new_freq > tunables->hispeed_freq) {
        pcpu->floor_freq = new_freq;
        if (pcpu->target_freq >= pcpu->policy->cur ||
            new_freq >= pcpu->policy->cur)
            pcpu->loc_floor_val_time = now;
    }

    /* (9) When this cpu wants to lower its frequency, check whether the shared policy really needs updating:
        several cpus share one policy, and the policy follows the cpu with the highest requested frequency
     */
    if (pcpu->target_freq == new_freq &&
            pcpu->target_freq <= pcpu->policy->cur) {
        max_t_freq = 0;
        for_each_cpu(j, pcpu->policy->cpus) {
            struct cpufreq_interactive_cpuinfo *pjcpu;

            pjcpu = &per_cpu(cpuinfo, j);
            max_t_freq = max(max_t_freq, pjcpu->target_freq);
        }

        if (max_t_freq != pcpu->policy->cur)
            goto pass_t;

        trace_cpufreq_interactive_already(
            data, cpu_load, pcpu->target_freq,
            pcpu->policy->cur, new_freq);
        spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
        goto rearm;
    }
pass_t:
    trace_cpufreq_interactive_target(data, cpu_load, pcpu->target_freq,
                     pcpu->policy->cur, new_freq);

    /* (10) If the policy needs updating, wake up speedchange_task to carry out the frequency change */
    pcpu->target_freq = new_freq;
    spin_unlock_irqrestore(&pcpu->target_freq_lock, flags);
    spin_lock_irqsave(&speedchange_cpumask_lock, flags);
    cpumask_set_cpu(data, &speedchange_cpumask);
    spin_unlock_irqrestore(&speedchange_cpumask_lock, flags);
    wake_up_process(speedchange_task);

rearm:
    if (!timer_pending(&pcpu->cpu_timer))
        cpufreq_interactive_timer_resched(pcpu);

exit:
    up_read(&pcpu->enable_sem);
    return;
}

|→

static unsigned int choose_freq(struct cpufreq_interactive_cpuinfo *pcpu,
        unsigned int loadadjfreq)
{
    unsigned int freq = pcpu->policy->cur;
    unsigned int prevfreq, freqmin, freqmax;
    unsigned int tl;
    int index;

    freqmin = 0;
    freqmax = UINT_MAX;

    do {
        prevfreq = freq;

        /* (6.1) tl = 90 and loadadjfreq = (average frequency * 100),
            so newfreq = (average frequency * 100) / 90.

            The result is then mapped onto the table with
            cpufreq_frequency_table_target(CPUFREQ_RELATION_L),
            which effectively rounds newfreq towards the lower step.

            Note: this causes a serious problem; if the ratio between adjacent
            frequency steps is larger than 100/90, upward scaling never happens.
         */
        tl = freq_to_targetload(pcpu->policy->governor_data, freq);

        /*
         * Find the lowest frequency where the computed load is less
         * than or equal to the target load.
         */

        if (cpufreq_frequency_table_target(
                pcpu->policy, pcpu->freq_table, loadadjfreq / tl,
                CPUFREQ_RELATION_L, &index))
            break;
        freq = pcpu->freq_table[index].frequency;

        if (freq > prevfreq) {
            /* The previous frequency is too low. */
            freqmin = prevfreq;

            if (freq >= freqmax) {
                /*
                 * Find the highest frequency that is less
                 * than freqmax.
                 */
                if (cpufreq_frequency_table_target(
                        pcpu->policy, pcpu->freq_table,
                        freqmax - 1, CPUFREQ_RELATION_H,
                        &index))
                    break;
                freq = pcpu->freq_table[index].frequency;

                if (freq == freqmin) {
                    /*
                     * The first frequency below freqmax
                     * has already been found to be too
                     * low.  freqmax is the lowest speed
                     * we found that is fast enough.
                     */
                    freq = freqmax;
                    break;
                }
            }
        } else if (freq < prevfreq) {
            /* The previous frequency is high enough. */
            freqmax = prevfreq;

            if (freq <= freqmin) {
                /*
                 * Find the lowest frequency that is higher
                 * than freqmin.
                 */
                if (cpufreq_frequency_table_target(
                        pcpu->policy, pcpu->freq_table,
                        freqmin + 1, CPUFREQ_RELATION_L,
                        &index))
                    break;
                freq = pcpu->freq_table[index].frequency;

                /*
                 * If freqmax is the first frequency above
                 * freqmin then we have already found that
                 * this speed is fast enough.
                 */
                if (freq == freqmax)
                    break;
            }
        }

        /* If same frequency chosen as previous then done. */
    } while (freq != prevfreq);

    return freq;
}

4.4、CPU hotplug adjustment

Another way of adapting to load is cpu hotplug:

  • 1、A hot-unplugged cpu consumes less power than a cpu sitting in idle, and if every cpu in a cluster is offline the whole cluster can be powered off, so hotplug saves power;
  • 2、But hotplug has a cost: a slow hotplug operation can take on the order of milliseconds, task migration is not free, and cpus must be plugged and unplugged in a fixed order, so it would be unreasonable to unplug a heavily loaded cpu first;
  • 3、MTK is forced to use hotplug by a platform limitation: MTK platforms can enter deep idle only when a single cpu remains online, so they must support hotplug; Samsung and Qualcomm platforms can enter deep idle with several cpus online, so they generally do not use cpu hotplug;

4.4.1、Low-level hotplug implementation

4.4.1.1、cpu_up()/cpu_down()

The kernel's hotplug support is mature: the standard cpu_up()/cpu_down() interfaces bring a cpu up or take it down, and user space reaches the same paths through sysfs (see the sketch below).

(figure: cpu_up()/cpu_down() call flow)
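
As a minimal illustration of that interface (a sketch of the standard sysfs path, not MTK-specific code), a user-space program can trigger the same cpu_up()/cpu_down() paths by writing to /sys/devices/system/cpu/cpuN/online:

#include <stdio.h>

/* Bring a cpu online (1) or take it offline (0) via sysfs.
 * Writing "1" ends up in cpu_up(), writing "0" in cpu_down(). */
static int set_cpu_online(int cpu, int online)
{
    char path[64];
    FILE *f;

    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online", cpu);
    f = fopen(path, "w");
    if (!f)
        return -1;
    fprintf(f, "%d", online ? 1 : 0);
    fclose(f);
    return 0;
}

int main(void)
{
    if (set_cpu_online(1, 0))   /* hot-unplug cpu1 (needs root) */
        perror("offline cpu1");
    if (set_cpu_online(1, 1))   /* plug cpu1 back in */
        perror("online cpu1");
    return 0;
}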

4.4.1.2、Task migration during hotplug

On cpu_down(), migration_call() -> migrate_tasks() must move every runnable task off the dying cpu to other cpus; on cpu_up() no explicit migration is done in that path, the normal load-balancing algorithms move tasks over later.

static void migrate_tasks(struct rq *dead_rq)
{
    struct rq *rq = dead_rq;
    struct task_struct *next, *stop = rq->stop;
    int dest_cpu;

    /*
     * Fudge the rq selection such that the below task selection loop
     * doesn't get stuck on the currently eligible stop task.
     *
     * We're currently inside stop_machine() and the rq is either stuck
     * in the stop_machine_cpu_stop() loop, or we're executing this code,
     * either way we should never end up calling schedule() until we're
     * done here.
     */
    rq->stop = NULL;

    /*
     * put_prev_task() and pick_next_task() sched
     * class method both need to have an up-to-date
     * value of rq->clock[_task]
     */
    update_rq_clock(rq);
    unthrottle_offline_rt_rqs(rq);

    for (;;) {
        /*
         * There's this thread running, bail when that's the only
         * remaining thread.
         */
        if (rq->nr_running == 1)
            break;

        /* (1) Pick tasks off the rq one by one: task = next */
        /*
         * pick_next_task assumes pinned rq->lock.
         */
        lockdep_pin_lock(&rq->lock);
        next = pick_next_task(rq, &fake_task);
        BUG_ON(!next);
        next->sched_class->put_prev_task(rq, next);

        /*
         * Rules for changing task_struct::cpus_allowed are holding
         * both pi_lock and rq->lock, such that holding either
         * stabilizes the mask.
         *
         * Drop rq->lock is not quite as disastrous as it usually is
         * because !cpu_active at this point, which means load-balance
         * will not interfere. Also, stop-machine.
         */
        lockdep_unpin_lock(&rq->lock);
        raw_spin_unlock(&rq->lock);
        raw_spin_lock(&next->pi_lock);
        raw_spin_lock(&rq->lock);

        /*
         * Since we're inside stop-machine, _nothing_ should have
         * changed the task, WARN if weird stuff happened, because in
         * that case the above rq->lock drop is a fail too.
         */
        if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
            raw_spin_unlock(&next->pi_lock);
            continue;
        }

        /* (2) Find the most suitable destination cpu for next */
        /* Find suitable destination for @next, with force if needed. */
        dest_cpu = select_fallback_rq(dead_rq->cpu, next);

        /* (3) Perform the migration */
        rq = __migrate_task(rq, next, dest_cpu);
        if (rq != dead_rq) {
            raw_spin_unlock(&rq->lock);
            rq = dead_rq;
            raw_spin_lock(&rq->lock);
        }
        raw_spin_unlock(&next->pi_lock);
    }

    rq->stop = stop;
}

|→

static int select_fallback_rq(int cpu, struct task_struct *p)
{
    int nid = cpu_to_node(cpu);
    const struct cpumask *nodemask = NULL;
    enum { cpuset, possible, fail } state = cpuset;
    int dest_cpu;

    /*
     * If the node that the cpu is on has been offlined, cpu_to_node()
     * will return -1. There is no cpu on the node, and we should
     * select the cpu on the other node.
     */
    if (nid != -1) {
        nodemask = cpumask_of_node(nid);

        /* Look for allowed, online CPU in same node. */
        for_each_cpu(dest_cpu, nodemask) {
            if (!cpu_online(dest_cpu))
                continue;
            if (!cpu_active(dest_cpu))
                continue;
            if (cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
                return dest_cpu;
        }
    }

    for (;;) {

        /* (2.1) Best case: an online cpu can be found within tsk_cpus_allowed(p) */
        /* Any allowed, online CPU? */
        for_each_cpu(dest_cpu, tsk_cpus_allowed(p)) {
            if (!cpu_online(dest_cpu))
                continue;
            if (!cpu_active(dest_cpu))
                continue;
            goto out;
        }

        /* No more Mr. Nice Guy. */
        switch (state) {

        /* (2.2) Next best: fall back to the task's cpuset to find an online cpu */
        case cpuset:
            if (IS_ENABLED(CONFIG_CPUSETS)) {
                cpuset_cpus_allowed_fallback(p);
                state = possible;
                break;
            }

        /* (2.3) Worst case: allow every possible cpu in the system and pick an online one */
            /* fall-through */
        case possible:
            do_set_cpus_allowed(p, cpu_possible_mask);
            state = fail;
            break;

        case fail:
            BUG();
            break;
        }
    }

out:
    if (state != cpuset) {
        /*
         * Don't tell them about moving exiting tasks or
         * kernel threads (both mm NULL), since they never
         * leave kernel.
         */
        if (p->mm && printk_ratelimit()) {
            printk_deferred("process %d (%s) no longer affine to cpu%d\n",
                    task_pid_nr(p), p->comm, cpu);
        }
    }

    return dest_cpu;
}

4.4.2、MTK hotplug algorithm

With the low-level cpu_up()/cpu_down() implementation in place, a policy layer is still needed on top of it to hotplug cpus dynamically according to load. MTK's implementation is fairly complete and consists of two parts: HICA and hps_algo_main.

(figure: MTK HICA / hps hotplug framework)

4.4.2.1、HICA/PPM

The relationship between HICA and hps is that HICA decides a coarse mode, and hps then performs fine-grained adjustment within that mode.

For example, on MT6799 HICA supports three modes:

  • 1、LL_ONLY.  // only the little (LL) cluster is used
  • 2、L_ONLY.   // only the middle (L) cluster is used
  • 3、ALL.      // all cores of the LL, L and B clusters (10 in total) may be used

HICA evaluates the load in mt_ppm_hica_update_algo_data() and decides the mode from how the load changes:

_hps_task_main() -> mt_ppm_hica_update_algo_data()

↓

void mt_ppm_hica_update_algo_data(unsigned int cur_loads,
                    unsigned int cur_nr_heavy_task, unsigned int cur_tlp)
{
    struct ppm_power_state_data *state_info = ppm_get_power_state_info();
    struct ppm_state_transfer_data *data;
    enum ppm_power_state cur_state;
    enum ppm_mode cur_mode;
    int i, j;

    FUNC_ENTER(FUNC_LV_HICA);

    ppm_lock(&hica_policy.lock);

    ppm_hica_algo_data.ppm_cur_loads = cur_loads;
    ppm_hica_algo_data.ppm_cur_tlp = cur_tlp;
    ppm_hica_algo_data.ppm_cur_nr_heavy_task = cur_nr_heavy_task;

    cur_state = ppm_hica_algo_data.cur_state;
    cur_mode = ppm_main_info.cur_mode;

    ppm_dbg(HICA, "cur_loads = %d, cur_tlp = %d, cur_nr_heavy_task = %d, cur_state = %s, cur_mode = %d\n",
        cur_loads, cur_tlp, cur_nr_heavy_task, ppm_get_power_state_name(cur_state), cur_mode);

    if (!ppm_main_info.is_enabled || !hica_policy.is_enabled || ppm_main_info.is_in_suspend ||
        cur_state == PPM_POWER_STATE_NONE)
        goto end;

#if defined(CONFIG_MACH_MT6757) || defined(CONFIG_MACH_KIBOPLUS)
    if (setup_max_cpus == 4)
        goto end;
#endif

#ifdef PPM_IC_SEGMENT_CHECK
    if (ppm_main_info.fix_state_by_segment != PPM_POWER_STATE_NONE)
        goto end;
#endif

    /* skip HICA if DVFS is not ready (we cannot get current freq...) */
    if (!ppm_main_info.client_info[PPM_CLIENT_DVFS].limit_cb)
        goto end;

    /* Power state is fixed by user, skip HICA state calculation */
    if (fix_power_state != PPM_POWER_STATE_NONE)
        goto end;

    /* (1) Walk transfer_by_perf first and then transfer_by_pwr to decide whether the current state needs to change */
    for (i = 0; i < 2; i++) {
        data = (i == 0) ? state_info[cur_state].transfer_by_perf
                : state_info[cur_state].transfer_by_pwr;

        /* (2) If the current state has several possible transitions, try them one by one, e.g.:
            the current state is ALL,
            it may go ALL -> LL_ONLY
            or ALL -> L_ONLY
         */
        for (j = 0; j < data->size; j++) {
            if (!data->transition_data[j].transition_rule
                || !((1 << cur_mode) & data->transition_data[j].mode_mask))
                continue;

            /* (3) If the state changes, record the new state and return */
            if (data->transition_data[j].transition_rule(
                ppm_hica_algo_data, &data->transition_data[j])) {
                ppm_hica_algo_data.new_state = data->transition_data[j].next_state;
                ppm_dbg(HICA, "[%s(%d)] Need state transfer: %s --> %s\n",
                    (i == 0) ? "PERF" : "PWR",
                    j,
                    ppm_get_power_state_name(cur_state),
                    ppm_get_power_state_name(ppm_hica_algo_data.new_state)
                    );
                goto end;

            /* (4) If the state does not change, keep the current state and keep iterating */
            } else {
                ppm_hica_algo_data.new_state = cur_state;
#ifdef PPM_HICA_2P0
                ppm_dbg(HICA, "[%s(%d)]hold in %s state, capacity_hold_cnt = %d, bigtsk_hold_cnt = %d, freq_hold_cnt = %d\n",
                    (i == 0) ? "PERF" : "PWR",
                    j,
                    ppm_get_power_state_name(cur_state),
                    data->transition_data[j].capacity_hold_cnt,
                    data->transition_data[j].bigtsk_hold_cnt,
                    data->transition_data[j].freq_hold_cnt
                    );
#else
#if PPM_HICA_VARIANT_SUPPORT
                ppm_dbg(HICA, "[%s(%d)]hold in %s state, loading_cnt = %d, freq_cnt = %d, overutil_l_hold_cnt = %d, .overutil_h_hold_cnt = %d\n",
                    (i == 0) ? "PERF" : "PWR",
                    j,
                    ppm_get_power_state_name(cur_state),
                    data->transition_data[j].loading_hold_cnt,
                    data->transition_data[j].freq_hold_cnt,
                    data->transition_data[j].overutil_l_hold_cnt,
                    data->transition_data[j].overutil_h_hold_cnt
                    );
#else
                ppm_dbg(HICA, "[%s(%d)]hold in %s state, loading_cnt = %d, freq_cnt = %d\n",
                    (i == 0) ? "PERF" : "PWR",
                    j,
                    ppm_get_power_state_name(cur_state),
                    data->transition_data[j].loading_hold_cnt,
                    data->transition_data[j].freq_hold_cnt
                    );
#endif
#endif
            }
        }
    }

end:
    ppm_unlock(&hica_policy.lock);
    FUNC_EXIT(FUNC_LV_HICA);
}

The transition functions and thresholds used to compute the state are defined in tables; apart from the heavy_task and big_task checks, they essentially evaluate cpu usage as util/capacity:

struct ppm_power_state_data pwr_state_info_SB[NR_PPM_POWER_STATE] = {
    [0] = {
        .name = __stringify(LL_ONLY),
        .state = PPM_POWER_STATE_LL_ONLY,
        PWR_STATE_INFO(LL_ONLY, SB)
    },
    [1] = {
        .name = __stringify(L_ONLY),
        .state = PPM_POWER_STATE_L_ONLY,
        PWR_STATE_INFO(L_ONLY, SB)
    },
    [2] = {
        .name = __stringify(ALL),
        .state = PPM_POWER_STATE_ALL,
        PWR_STATE_INFO(ALL, SB)
    },
};

static struct ppm_state_transfer state_pwr_transfer_ALL[] = {
    TRANS_DATA(
        LL_ONLY,
        PPM_MODE_MASK_ALL_MODE,
        ppm_trans_rule_ALL_to_LL_ONLY,
        PPM_DEFAULT_HOLD_TIME,
        PPM_CAPACITY_DOWN,
        PPM_DEFAULT_BIGTSK_TIME,
        0,
        0,
        0
        ),
    TRANS_DATA(
        L_ONLY,
        PPM_MODE_MASK_ALL_MODE,
        ppm_trans_rule_ALL_to_L_ONLY,
        PPM_DEFAULT_HOLD_TIME,
        PPM_CAPACITY_DOWN,
        PPM_DEFAULT_BIGTSK_TIME,
        2,
        4,
        0
        ),
};
STATE_TRANSFER_DATA_PWR(ALL);

static struct ppm_state_transfer state_perf_transfer_ALL[] = {
    TRANS_DATA(NONE, 0, NULL, 0, 0, 0, 0, 0, 0),
};
STATE_TRANSFER_DATA_PERF(ALL);



/* Example: the current state is ALL
    from the power point of view, try switching ALL -> LL_ONLY: ppm_trans_rule_ALL_to_LL_ONLY()
    from the power point of view, try switching ALL -> L_ONLY:  ppm_trans_rule_ALL_to_L_ONLY()
 */
static bool ppm_trans_rule_ALL_to_LL_ONLY(
    struct ppm_hica_algo_data data, struct ppm_state_transfer *settings)
{
    /* keep in ALL state if root cluster is fixed at L or B */
    if (ppm_main_info.fixed_root_cluster == PPM_CLUSTER_L
        || ppm_main_info.fixed_root_cluster == PPM_CLUSTER_B)
        return false;

    /* (1) Decide from the heavy-task count whether the mode may be switched */
#if PPM_HEAVY_TASK_INDICATE_SUPPORT
    {
        unsigned int heavy_task, i;

        for_each_ppm_clusters(i) {
            heavy_task = hps_get_hvytsk(i);
            if (heavy_task) {
                ppm_dbg(HICA, "Stay in ALL due to cluster%d heavy task = %d\n",
                    i, heavy_task);
                trace_ppm_hica(
                    ppm_get_power_state_name(PPM_POWER_STATE_ALL),
                    ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY),
                    -1, -1, -1, -1, heavy_task, -1, false);
                settings->capacity_hold_cnt = 0;
                return false;
            }
        }
    }
#endif

    /* (2) Decide from the big-task count whether the mode may be switched */
#if PPM_BIG_TASK_INDICATE_SUPPORT
    {
        unsigned int big_task_L = hps_get_bigtsk(PPM_CLUSTER_L);
        unsigned int big_task_B = hps_get_bigtsk(PPM_CLUSTER_B);

        if (big_task_L || big_task_B) {
            ppm_dbg(HICA, "Stay in ALL due to L/B big task = %d/%d\n",
                big_task_L, big_task_B);
            trace_ppm_hica(
                ppm_get_power_state_name(PPM_POWER_STATE_ALL),
                ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY),
                -1, -1, big_task_L, big_task_B, -1, -1, false);
            settings->capacity_hold_cnt = 0;
            return false;
        }
    }
#endif

    /* (3) Decide from util/capacity whether the mode should be switched */
    {
        /* check capacity */
        unsigned long usage, usage_total = 0, capacity = 0, dummy;
        unsigned int i;

        for_each_ppm_clusters(i) {
            if (sched_get_cluster_util(i, &usage, &dummy)) {
                ppm_err("Get cluster %d util failed\n", i);
                return false;
            }
            usage_total += usage;
            if (i == PPM_CLUSTER_LL)
                capacity = dummy;
        }
        ppm_dbg(HICA, "usage_total = %ld, LL capacity = %ld\n", usage_total, capacity);

        /* (3.1) If usage_total has stayed below the threshold (LL capacity * settings->capacity_bond%)
            for settings->capacity_hold_time consecutive evaluations, perform the state switch
         */
        if (usage_total < capacity * settings->capacity_bond / 100) {
            settings->capacity_hold_cnt++;
            if (settings->capacity_hold_cnt >= settings->capacity_hold_time) {
                trace_ppm_hica(
                    ppm_get_power_state_name(PPM_POWER_STATE_ALL),
                    ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY),
                    usage_total, capacity, -1, -1, -1, -1, true);
                return true;
            }
        } else
            settings->capacity_hold_cnt = 0;

        trace_ppm_hica(
            ppm_get_power_state_name(PPM_POWER_STATE_ALL),
            ppm_get_power_state_name(PPM_POWER_STATE_LL_ONLY),
            usage_total, capacity, -1, -1, -1, -1, false);
    }

    return false;
}

Once the new state has been computed, it is pushed down through the following path:

_hps_task_main() -> mt_ppm_main() -> ppm_hica_update_limit_cb() -> ppm_hica_set_default_limit_by_state()

↓

void ppm_hica_set_default_limit_by_state(enum ppm_power_state state,
                    struct ppm_policy_data *policy)
{
    unsigned int i;
    struct ppm_power_state_data *state_info = ppm_get_power_state_info();

    FUNC_ENTER(FUNC_LV_HICA);

    for (i = 0; i < policy->req.cluster_num; i++) {
        if (state >= PPM_POWER_STATE_NONE) {
            if (state > NR_PPM_POWER_STATE)
                ppm_err("@%s: Invalid PPM state(%d)\n", __func__, state);

            policy->req.limit[i].min_cpu_core = get_cluster_min_cpu_core(i);
            policy->req.limit[i].max_cpu_core = get_cluster_max_cpu_core(i);
            policy->req.limit[i].min_cpufreq_idx = get_cluster_min_cpufreq_idx(i);
            policy->req.limit[i].max_cpufreq_idx = get_cluster_max_cpufreq_idx(i);

#ifdef PPM_DISABLE_CLUSTER_MIGRATION
            /* keep at least 1 LL */
            if (i == 0)
                policy->req.limit[i].min_cpu_core = 1;
#endif
        /* (1) Based on the new state, HICA writes the corresponding min_cpu_core/max_cpu_core limits into this policy */
        } else {
            policy->req.limit[i].min_cpu_core =
                state_info[state].cluster_limit->state_limit[i].min_cpu_core;
            policy->req.limit[i].max_cpu_core =
                state_info[state].cluster_limit->state_limit[i].max_cpu_core;
            policy->req.limit[i].min_cpufreq_idx =
                state_info[state].cluster_limit->state_limit[i].min_cpufreq_idx;
            policy->req.limit[i].max_cpufreq_idx =
                state_info[state].cluster_limit->state_limit[i].max_cpufreq_idx;
        }
    }

#ifdef PPM_IC_SEGMENT_CHECK
        /* ignore HICA min freq setting for L cluster in L_ONLY state */
        if (state == PPM_POWER_STATE_L_ONLY && ppm_main_info.fix_state_by_segment == PPM_POWER_STATE_L_ONLY)
            policy->req.limit[1].min_cpufreq_idx = get_cluster_min_cpufreq_idx(1);
#endif

    FUNC_EXIT(FUNC_LV_HICA);
}



/*==============================================================*/
/* Local Variables                      */
/*==============================================================*/
/* cluster limit for each power state */
static const struct ppm_cluster_limit state_limit_LL_ONLY[] = {
    [0] = LIMIT(15, 0, 1, 4),
    [1] = LIMIT(15, 0, 0, 0),
    [2] = LIMIT(15, 0, 0, 0),
};
STATE_LIMIT(LL_ONLY);

static const struct ppm_cluster_limit state_limit_L_ONLY[] = {
    [0] = LIMIT(15, 0, 0, 0),
    [1] = LIMIT(8, 0, 1, 4),
    [2] = LIMIT(15, 0, 0, 0),
};
STATE_LIMIT(L_ONLY);

static const struct ppm_cluster_limit state_limit_ALL[] = {
    [0] = LIMIT(15, 0, 0, 4),
    [1] = LIMIT(15, 0, 0, 4),
    [2] = LIMIT(15, 0, 0, 2),
};
STATE_LIMIT(ALL);






_hps_task_main() -> mt_ppm_main() -> ppm_limit_callback()

↓

static void ppm_limit_callback(struct ppm_client_req req)
{
    struct ppm_client_req *p = (struct ppm_client_req *)&req;
    int i;

    /* (2) Copy the policy limits of the HICA state into the hps constraints hps_sys.cluster_info[i].ref_base_value/ref_limit_value */
    mutex_lock(&hps_ctxt.para_lock);
    hps_sys.ppm_root_cluster = p->root_cluster;
    for (i = 0; i < p->cluster_num; i++) {
        /*
         * hps_warn("ppm_limit_callback -> cluster%d: has_advise_core = %d, [%d, %d]\n",
         *  i, p->cpu_limit[i].has_advise_core,
         *  p->cpu_limit[i].min_cpu_core, p->cpu_limit[i].max_cpu_core);
         */
#ifdef _TRACE_
        trace_ppm_limit_callback_update(i, p->cpu_limit[i].has_advise_core,
            p->cpu_limit[i].min_cpu_core, p->cpu_limit[i].max_cpu_core);
#endif
        if (!p->cpu_limit[i].has_advise_core) {
            hps_sys.cluster_info[i].ref_base_value = p->cpu_limit[i].min_cpu_core;
            hps_sys.cluster_info[i].ref_limit_value = p->cpu_limit[i].max_cpu_core;
        } else {
            hps_sys.cluster_info[i].ref_base_value =
                hps_sys.cluster_info[i].ref_limit_value =
                p->cpu_limit[i].advise_cpu_core;
        }
    }
    mutex_unlock(&hps_ctxt.para_lock);
    hps_ctxt.is_interrupt = 1;
    hps_task_wakeup_nolock();

}
4.4.2.2、hps_algo_main

_hps_task_main() -> hps_algo_main()

↓

void hps_algo_main(void)
{
    unsigned int i, val, base_val, action_print, origin_root, action_break;
    char str_online[64], str_ref_limit[64], str_ref_base[64], str_criteria_limit[64],
        str_criteria_base[64], str_target[64], str_hvytsk[64], str_pwrseq[64], str_bigtsk[64];
    char *online_ptr = str_online;
    char *criteria_limit_ptr = str_criteria_limit;
    char *criteria_base_ptr = str_criteria_base;
    char *ref_limit_ptr = str_ref_limit;
    char *ref_base_ptr = str_ref_base;
    char *hvytsk_ptr = str_hvytsk;
    char *target_ptr = str_target;
    char *pwrseq_ptr = str_pwrseq;
    char *bigtsk_ptr = str_bigtsk;
    static unsigned int hrtbt_dbg;
#ifdef CONFIG_MEIZU_BSP
    static unsigned long int j;
#endif //CONFIG_MEIZU_BSP
#ifdef CONFIG_MTK_ICCS_SUPPORT
    unsigned char real_online_power_state_bitmask = 0;
    unsigned char real_target_power_state_bitmask = 0;
    unsigned char iccs_online_power_state_bitmask = 0;
    unsigned char iccs_target_power_state_bitmask = iccs_get_target_power_state_bitmask();
    unsigned char target_cache_shared_state_bitmask = 0;
#endif

    /* Initial value */
    base_val = action_print = action_break = hps_sys.total_online_cores = 0;
    hps_sys.up_load_avg = hps_sys.down_load_avg = hps_sys.tlp_avg = hps_sys.rush_cnt = 0;
    hps_sys.action_id = origin_root = 0;
    /*
     * run algo or not by hps_ctxt.enabled
     */
    if ((u64) ktime_to_ms(ktime_sub(ktime_get(), hps_ctxt.hps_hrt_ktime)) >= HPS_HRT_DBG_MS)
        action_print = hrtbt_dbg = 1;
    else
        hrtbt_dbg = 0;

    mutex_lock(&hps_ctxt.lock);
    hps_ctxt.action = ACTION_NONE;
    atomic_set(&hps_ctxt.is_ondemand, 0);

    if (!hps_ctxt.enabled)
        goto HPS_END;
    if (hps_ctxt.eas_indicator) {
        /*Set cpu cores by scheduler*/
        goto HPS_ALGO_END;
    }
    /*
     * algo - begin
     */
    /*Back up limit and base value for check */

    mutex_lock(&hps_ctxt.para_lock);
    if ((hps_sys.cluster_info[0].base_value == 0) &&
        (hps_sys.cluster_info[1].base_value == 0) &&
        (hps_sys.cluster_info[2].base_value == 0) &&
        (hps_sys.cluster_info[0].limit_value == 0) &&
        (hps_sys.cluster_info[1].limit_value == 0) &&
        (hps_sys.cluster_info[2].limit_value == 0)) {
        hps_sys.cluster_info[0].base_value = hps_sys.cluster_info[0].ref_base_value = 0;
        hps_sys.cluster_info[1].base_value = hps_sys.cluster_info[1].ref_base_value = 0;
        hps_sys.cluster_info[2].base_value = hps_sys.cluster_info[2].ref_base_value = 0;
        hps_sys.cluster_info[0].limit_value = hps_sys.cluster_info[0].ref_limit_value = 4;
        hps_sys.cluster_info[1].limit_value = hps_sys.cluster_info[1].ref_limit_value = 4;
        hps_sys.cluster_info[2].limit_value = hps_sys.cluster_info[2].ref_limit_value = 0;
    }
    for (i = 0; i < hps_sys.cluster_num; i++) {
        hps_sys.cluster_info[i].base_value = hps_sys.cluster_info[i].ref_base_value;
        hps_sys.cluster_info[i].limit_value = hps_sys.cluster_info[i].ref_limit_value;
    }
    for (i = 0; i < hps_sys.cluster_num; i++) {
        base_val += hps_sys.cluster_info[i].base_value;
        hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].online_core_num =
            0;
        hps_sys.cluster_info[i].online_core_num =
            hps_get_cluster_cpus(hps_sys.cluster_info[i].cluster_id);
        hps_sys.total_online_cores += hps_sys.cluster_info[i].online_core_num;
    }


    mutex_unlock(&hps_ctxt.para_lock);
    /* Determine root cluster */
    origin_root = hps_sys.root_cluster_id;
    hps_define_root_cluster(&hps_sys);
#ifdef CONFIG_MACH_MT6799
    if (hps_ctxt.smart_det_enabled) {
        mutex_lock(&hps_ctxt.para_lock);
        hps_sys.root_cluster_id = 1;/*Change root to L cluster when smart detection is enabled*/
        mutex_unlock(&hps_ctxt.para_lock);
    }
#endif

    if (origin_root != hps_sys.root_cluster_id)
        hps_sys.action_id = HPS_SYS_CHANGE_ROOT;

    /*
     * update history - tlp
     */
    val = hps_ctxt.tlp_history[hps_ctxt.tlp_history_index];
    hps_ctxt.tlp_history[hps_ctxt.tlp_history_index] = hps_ctxt.cur_tlp;
    hps_ctxt.tlp_sum += hps_ctxt.cur_tlp;
    hps_ctxt.tlp_history_index =
        (hps_ctxt.tlp_history_index + 1 ==
         hps_ctxt.tlp_times) ? 0 : hps_ctxt.tlp_history_index + 1;
    ++hps_ctxt.tlp_count;
    if (hps_ctxt.tlp_count > hps_ctxt.tlp_times) {
        WARN_ON(hps_ctxt.tlp_sum < val);
        hps_ctxt.tlp_sum -= val;
        hps_ctxt.tlp_avg = hps_ctxt.tlp_sum / hps_ctxt.tlp_times;
    } else {
        hps_ctxt.tlp_avg = hps_ctxt.tlp_sum / hps_ctxt.tlp_count;
    }
    if (hps_ctxt.stats_dump_enabled)
        hps_ctxt_print_algo_stats_tlp(0);

    /*Determine eas enabled or not*/
    if (!hps_ctxt.eas_enabled)
        hps_sys.hps_sys_ops[2].enabled = 0;

    for (i = 0 ; i < hps_sys.cluster_num ; i++)
        hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].online_core_num;


    /* (1) Call each enabled hps_sys_ops entry in turn; these algorithms decide whether cpus need to be hotplugged */
    for (i = 0; i < hps_sys.func_num; i++) {
        if (hps_sys.hps_sys_ops[i].enabled == 1) {
            if (hps_sys.hps_sys_ops[i].hps_sys_func_ptr()) {
                hps_sys.action_id = hps_sys.hps_sys_ops[i].func_id;
                break;
            }
        }
    }
/*
    if (hps_ctxt.heavy_task_enabled)
        if (hps_algo_heavytsk_det())
            hps_sys.action_id = 0xE1;
*/

    if (hps_ctxt.big_task_enabled)
        if (hps_algo_big_task_det())
            hps_sys.action_id = 0xE2;

    if (hps_sys.action_id == 0)
        goto HPS_END;

HPS_ALGO_END:

#ifdef CONFIG_MACH_MT6799
    if (hps_ctxt.smart_det_enabled) {
        if (hps_sys.cluster_info[2].bigTsk_value <= 1) {
            mutex_lock(&hps_ctxt.para_lock);
            hps_sys.cluster_info[2].target_core_num = 1;
            mutex_unlock(&hps_ctxt.para_lock);
        }
    }
#endif



    /*
     * algo - end
     */

    /* (2) Base and limit check: apply the base/limit constraints; this is where the HICA values take effect */
    hps_check_base_limit(&hps_sys);

    /* Ensure that root cluster must one online cpu at less */
    if (hps_sys.cluster_info[hps_sys.root_cluster_id].target_core_num <= 0)
        hps_sys.cluster_info[hps_sys.root_cluster_id].target_core_num = 1;

#ifdef CONFIG_MTK_ICCS_SUPPORT
    real_online_power_state_bitmask = 0;
    real_target_power_state_bitmask = 0;
    for (i = 0; i < hps_sys.cluster_num; i++) {
        real_online_power_state_bitmask |= ((hps_sys.cluster_info[i].online_core_num > 0) << i);
        real_target_power_state_bitmask |= ((hps_sys.cluster_info[i].target_core_num > 0) << i);
    }
    iccs_online_power_state_bitmask = iccs_target_power_state_bitmask;
    iccs_target_power_state_bitmask = real_target_power_state_bitmask;
    iccs_get_target_state(&iccs_target_power_state_bitmask, &target_cache_shared_state_bitmask);

    /*
     * pr_err("[%s] iccs_target_power_state_bitmask: 0x%x\n", __func__, iccs_target_power_state_bitmask);
     */

    for (i = 0; i < hps_sys.cluster_num; i++) {
        hps_sys.cluster_info[i].iccs_state = (((real_online_power_state_bitmask >> i) & 1) << 3) |
                             (((real_target_power_state_bitmask >> i) & 1) << 2) |
                             (((iccs_online_power_state_bitmask >> i) & 1) << 1) |
                             (((iccs_target_power_state_bitmask >> i) & 1) << 0);

        /*
         * pr_err("[%s] cluster: 0x%x iccs_state: 0x%x\n", __func__, i, hps_sys.cluster_info[i].iccs_state);
         */

        if (hps_get_iccs_pwr_status(i) == 0x1)
            iccs_cluster_on_off(i, 1);
        else if (hps_get_iccs_pwr_status(i) == 0x2)
            iccs_cluster_on_off(i, 0);
    }
#endif

    /* (3) After the algorithms have run, the desired value is target_core_num and the current value is online_core_num;
        if they differ, perform cpu_up()/cpu_down()
     */
#if 1               /*Make sure that priority of power on action is higher than power down. */
    for (i = 0; i < hps_sys.cluster_num; i++) {
        if (hps_sys.cluster_info[i].target_core_num >
            hps_sys.cluster_info[i].online_core_num) {
            if (hps_algo_do_cluster_action(i) == 1) {
                action_print = action_break = 1;
                break;
            }
            action_print = 1;
        }
    }
    if (!action_break) {
        for (i = 0; i < hps_sys.cluster_num; i++) {
            if (hps_sys.cluster_info[i].target_core_num <
                hps_sys.cluster_info[i].online_core_num) {
                if (hps_algo_do_cluster_action(i) == 1) {
                    action_print = action_break = 1;
                    break;
                }

                action_print = 1;
            }
        }
    }
#else
    /*Process root cluster first */
    if (hps_sys.cluster_info[hps_sys.root_cluster_id].target_core_num !=
        hps_sys.cluster_info[hps_sys.root_cluster_id].online_core_num) {
        if (hps_algo_do_cluster_action(hps_sys.root_cluster_id) == 1)
            action_break = 1;
        else
            action_break = 0;
        action_print = 1;
    }

    for (i = 0; i < hps_sys.cluster_num; i++) {
        if (i == hps_sys.root_cluster_id)
            continue;
        if (hps_sys.cluster_info[i].target_core_num !=
            hps_sys.cluster_info[i].online_core_num) {
            if (hps_algo_do_cluster_action(i) == 1)
                action_break = 1;
            else
                action_break = 0;
            action_print = 1;
        }
    }

#endif
#ifdef CONFIG_MTK_ICCS_SUPPORT
    for (i = 0; i < hps_sys.cluster_num; i++) {
        if (hps_get_cluster_cpus(hps_sys.cluster_info[i].cluster_id) !=
                hps_sys.cluster_info[i].target_core_num) {
            if (hps_get_cluster_cpus(hps_sys.cluster_info[i].cluster_id) == 0)
                iccs_target_power_state_bitmask &= ~(1 << i);
            else if (hps_sys.cluster_info[i].target_core_num == 0)
                iccs_target_power_state_bitmask |= (1 << i);
        }
    }
    /*
     * pr_err("[%s] iccs_target_power_state_bitmask: 0x%x\n", __func__, iccs_target_power_state_bitmask);
     */
    iccs_set_target_power_state_bitmask(iccs_target_power_state_bitmask);
#endif
HPS_END:
    if (action_print || hrtbt_dbg) {
        int online, target, ref_limit, ref_base, criteria_limit, criteria_base, hvytsk, pwrseq, bigtsk;

        mutex_lock(&hps_ctxt.para_lock);

        online = target = criteria_limit = criteria_base = 0;
        for (i = 0; i < hps_sys.cluster_num; i++) {
            if (i == origin_root)
                online =
                    sprintf(online_ptr, "<%d>",
                        hps_sys.cluster_info[i].online_core_num);
            else
                online =
                    sprintf(online_ptr, "(%d)",
                        hps_sys.cluster_info[i].online_core_num);

            if (i == hps_sys.root_cluster_id)
                target =
                    sprintf(target_ptr, "<%d>",
                        hps_sys.cluster_info[i].target_core_num);
            else
                target =
                    sprintf(target_ptr, "(%d)",
                        hps_sys.cluster_info[i].target_core_num);

            criteria_limit =
                sprintf(criteria_limit_ptr, "(%d)",
                    hps_sys.cluster_info[i].limit_value);
            criteria_base =
                sprintf(criteria_base_ptr, "(%d)", hps_sys.cluster_info[i].base_value);
            ref_limit =
                sprintf(ref_limit_ptr, "(%d)", hps_sys.cluster_info[i].ref_limit_value);
            ref_base =
                sprintf(ref_base_ptr, "(%d)", hps_sys.cluster_info[i].ref_base_value);
            hvytsk = sprintf(hvytsk_ptr, "(%d)", hps_sys.cluster_info[i].hvyTsk_value);
            bigtsk = sprintf(bigtsk_ptr, "(%d)", hps_sys.cluster_info[i].bigTsk_value);
            if (i == 0)
                pwrseq = sprintf(pwrseq_ptr, "(%d->", hps_sys.cluster_info[i].pwr_seq);
            else if ((i != 0) && (i != (hps_sys.cluster_num - 1)))
                pwrseq = sprintf(pwrseq_ptr, "%d->", hps_sys.cluster_info[i].pwr_seq);
            else if (i == (hps_sys.cluster_num - 1))
                pwrseq = sprintf(pwrseq_ptr, "%d) ", hps_sys.cluster_info[i].pwr_seq);

            online_ptr += online;
            target_ptr += target;
            criteria_limit_ptr += criteria_limit;
            criteria_base_ptr += criteria_base;
            ref_limit_ptr += ref_limit;
            ref_base_ptr += ref_base;
            hvytsk_ptr += hvytsk;
            bigtsk_ptr += bigtsk;
            pwrseq_ptr += pwrseq;
        }
        mutex_unlock(&hps_ctxt.para_lock);
        if (action_print) {
            hps_set_funct_ctrl();
            if (action_break)
                hps_warn
                    ("(0x%X)%s action break!! (%u)(%u)(%u) %s %s%s-->%s%s (%u)(%u)(%u)(%u) %s\n",
                     ((hps_ctxt.hps_func_control << 12) | hps_sys.action_id),
                     str_online, hps_ctxt.cur_loads,
                     hps_ctxt.cur_tlp, hps_ctxt.cur_iowait, str_hvytsk,
                     str_criteria_limit, str_criteria_base,
                     str_ref_limit, str_ref_base,
                     hps_sys.up_load_avg,
                     hps_sys.down_load_avg, hps_sys.tlp_avg, hps_sys.rush_cnt,
                     str_target);
            else {
                char str1[256];
                char str2[256];

                snprintf(str1, sizeof(str1),
    "(0x%X)%s action end (%u)(%u)(%u) %s %s[%u][%u](%u) %s %s%s (%u)(%u)(%u)(%u)",
                        ((hps_ctxt.hps_func_control << 12) | hps_sys.action_id),
                        str_online, hps_ctxt.cur_loads,
                        hps_ctxt.cur_tlp, hps_ctxt.cur_iowait,
                        str_hvytsk, str_bigtsk, hps_ctxt.is_screen_off,
                        hps_ctxt.is_idle, hps_ctxt.idle_ratio,
                        str_pwrseq, str_criteria_limit, str_criteria_base,
                        hps_sys.up_load_avg,
                        hps_sys.down_load_avg,
                        hps_sys.tlp_avg, hps_sys.rush_cnt);

                snprintf(str2, sizeof(str2),
    "[%u,%u|%u,%u|%u,%u][%u,%u,%u] [%u,%u,%u] [%u,%u,%u] [%u,%u,%u] %s",
                        hps_sys.cluster_info[0].up_threshold,
                        hps_sys.cluster_info[0].down_threshold,
                        hps_sys.cluster_info[1].up_threshold,
                        hps_sys.cluster_info[1].down_threshold,
                        hps_sys.cluster_info[2].up_threshold,
                        hps_sys.cluster_info[2].down_threshold,
                        hps_sys.cluster_info[0].loading,
                        hps_sys.cluster_info[1].loading,
                        hps_sys.cluster_info[2].loading,
                        hps_sys.cluster_info[0].rel_load,
                        hps_sys.cluster_info[1].rel_load,
                        hps_sys.cluster_info[2].rel_load,
                        hps_sys.cluster_info[0].abs_load,
                        hps_sys.cluster_info[1].abs_load,
                        hps_sys.cluster_info[2].abs_load,
                        /* sched-assist hotplug: for debug */
                        hps_sys.cluster_info[0].sched_load,
                        hps_sys.cluster_info[1].sched_load,
                        hps_sys.cluster_info[2].sched_load,
                        str_target);
#ifdef CONFIG_MEIZU_BSP
                if (printk_timed_ratelimit(&j, 500))
                    hps_warn("%s%s\n", str1, str2);
#else
                    hps_warn("%s%s\n", str1, str2);
#endif //CONFIG_MEIZU_BSP
#ifdef _TRACE_
                trace_hps_update(hps_sys.action_id, str_online, hps_ctxt.cur_loads,
                        hps_ctxt.cur_tlp, hps_ctxt.cur_iowait, str_hvytsk,
                        str_criteria_limit, str_criteria_base,
                        hps_sys.up_load_avg, hps_sys.down_load_avg,
                        hps_sys.tlp_avg,
                        hps_sys.rush_cnt,
                        hps_sys.cluster_info[0].up_threshold,
                        hps_sys.cluster_info[0].down_threshold,
                        hps_sys.cluster_info[0].up_threshold,
                        hps_sys.cluster_info[0].down_threshold,
                        hps_sys.cluster_info[2].up_threshold,
                        hps_sys.cluster_info[2].down_threshold,
                        hps_sys.cluster_info[0].loading, hps_sys.cluster_info[1].loading,
                        hps_sys.cluster_info[2].loading,
                        hps_ctxt.up_times, hps_ctxt.down_times, str_target);
#endif
            }
            hps_ctxt_reset_stas_nolock();
        }
    }
#if HPS_HRT_BT_EN
    if (hrtbt_dbg && (action_print)) {
        hps_set_funct_ctrl();
        hps_warn("(0x%X)%s HRT_BT_DBG (%u)(%u)(%u) %s %s %s %s%s (%u)(%u)(%u)(%u) %s\n",
             ((hps_ctxt.hps_func_control << 12) | hps_sys.action_id),
             str_online, hps_ctxt.cur_loads, hps_ctxt.cur_tlp,
             hps_ctxt.cur_iowait, str_hvytsk, str_bigtsk, str_pwrseq, str_criteria_limit,
             str_criteria_base, hps_sys.up_load_avg, hps_sys.down_load_avg,
             hps_sys.tlp_avg, hps_sys.rush_cnt, str_target);
        hrtbt_dbg = 0;
        hps_ctxt.hps_hrt_ktime = ktime_get();
    }
#endif
    action_print = 0;
    action_break = 0;
    mutex_unlock(&hps_ctxt.lock);
}

hps_algo_main() currently runs the following algorithms:

static int (*hps_func[]) (void) = {
/*hps_algo_perf_indicator, hps_algo_rush_boost, hps_algo_eas, hps_algo_up, hps_algo_down};*/
hps_algo_perf_indicator, hps_algo_rush_boost, hps_algo_eas};


/* (1) Honour the minimum core count requested by the perf indicator */
static int hps_algo_perf_indicator(void)
{
    unsigned int i;

    if (atomic_read(&hps_ctxt.is_ondemand) != 0) { /* for ondemand request */
        atomic_set(&hps_ctxt.is_ondemand, 0);

        mutex_lock(&hps_ctxt.para_lock);
        for (i = 0; i < hps_sys.cluster_num; i++)
            hps_sys.cluster_info[i].target_core_num =
                max(hps_sys.cluster_info[i].base_value, hps_sys.cluster_info[i].online_core_num);

        mutex_unlock(&hps_ctxt.para_lock);

        return 1;
    }
    return 0;
}

/* (2) Decide whether to start rush boost, based on whether the current load crosses the boost threshold */
static int hps_algo_rush_boost(void)
{
    int val, base_val;
    unsigned int idx, total_rel_load;

    idx = total_rel_load = 0;
    for (idx = 0 ; idx < hps_sys.cluster_num ; idx++)
        total_rel_load += hps_sys.cluster_info[idx].rel_load;

    if (!hps_ctxt.rush_boost_enabled)
        return 0;
    base_val = cal_base_cores();

    if (total_rel_load > hps_ctxt.rush_boost_threshold * hps_sys.total_online_cores)
        ++hps_ctxt.rush_count;
    else
        hps_ctxt.rush_count = 0;
    if (hps_ctxt.rush_boost_times == 1)
        hps_ctxt.tlp_avg = hps_ctxt.cur_tlp;

    if ((hps_ctxt.rush_count >= hps_ctxt.rush_boost_times) &&
        (hps_sys.total_online_cores * 100 < hps_ctxt.tlp_avg)) {
        val = hps_ctxt.tlp_avg / 100 + (hps_ctxt.tlp_avg % 100 ? 1 : 0);
        WARN_ON(!(val > hps_sys.total_online_cores));
        if (val > num_possible_cpus())
            val = num_possible_cpus();
        if (val > base_val)
            val -= base_val;
        else
            val = 0;
        hps_sys.tlp_avg = hps_ctxt.tlp_avg;
        hps_sys.rush_cnt = hps_ctxt.rush_count;
        hps_cal_core_num(&hps_sys, val, base_val);


        /* [MET] debug for geekbench */
        met_tag_oneshot(0, "sched_rush_boost", 1);

        return 1;
    } else {
        /* [MET] debug for geekbench */
        met_tag_oneshot(0, "sched_rush_boost", 0);
        return 0;
    }
}

/* (3) Compute the number of online cpus needed from the load */
static int hps_algo_eas(void)
{
    int val, ret, i;

    ret = 0;
    for (i = 0 ; i < hps_sys.cluster_num ; i++) {
        hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].online_core_num;

        /*if up_threshold > loading > down_threshold ==> No action*/
        if ((hps_sys.cluster_info[i].loading <
        (hps_sys.cluster_info[i].up_threshold*hps_sys.cluster_info[i].online_core_num)) &&
        (hps_sys.cluster_info[i].loading >
        (hps_sys.cluster_info[i].down_threshold*hps_sys.cluster_info[i].online_core_num)))
        continue;

        /*if loading > up_threshod ==> power on cores*/
        if ((hps_sys.cluster_info[i].loading >
            (hps_sys.cluster_info[i].up_threshold*hps_sys.cluster_info[i].online_core_num))) {
            val = hps_sys.cluster_info[i].loading / hps_sys.cluster_info[i].up_threshold;
            if (hps_sys.cluster_info[i].loading % hps_sys.cluster_info[i].up_threshold)
                val++;
            if (val <= hps_sys.cluster_info[i].limit_value)
                hps_sys.cluster_info[i].target_core_num = val;
            else
                hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].limit_value;
            ret = 1;
        } else if ((hps_sys.cluster_info[i].loading <
            (hps_sys.cluster_info[i].down_threshold*hps_sys.cluster_info[i].online_core_num))) {
        /*if loading < down_threshod ==> power off cores*/
            if (!hps_sys.cluster_info[i].loading) {
                hps_sys.cluster_info[i].target_core_num = 0;
                continue;
            }
            val = hps_sys.cluster_info[i].loading / hps_sys.cluster_info[i].down_threshold;
            if (hps_sys.cluster_info[i].loading % hps_sys.cluster_info[i].down_threshold)
                val++;
            if (val >= hps_sys.cluster_info[i].base_value)
                hps_sys.cluster_info[i].target_core_num = val;
            else
                hps_sys.cluster_info[i].target_core_num = hps_sys.cluster_info[i].base_value;
            ret = 1;
        }
    }

#if 0
    /*Check with big task criteriai*/
    for (i = 1 ; i < hps_sys.cluster_num ; i++) {
        if ((!hps_sys.cluster_info[i].bigTsk_value) &&
        (!(hps_sys.cluster_info[i].loading / hps_sys.cluster_info[i].down_threshold)))
            hps_sys.cluster_info[i].target_core_num = 0;
    }
#endif
    return ret;
}
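
The per-cluster core-count arithmetic in hps_algo_eas() is essentially a ceiling division of the cluster loading by the per-core threshold, clamped between the cluster's base and limit values. A minimal standalone sketch with made-up numbers (not the MTK code itself):

#include <stdio.h>

/* Ceiling division of loading by a per-core threshold, clamped to [base, limit]. */
static int eas_target_cores(int loading, int up_threshold, int base, int limit)
{
    int val = loading / up_threshold;

    if (loading % up_threshold)
        val++;                  /* a partial core's worth of load still needs a core */
    if (val < base)
        val = base;
    if (val > limit)
        val = limit;
    return val;
}

int main(void)
{
    /* loading 250 against an 80-per-core up_threshold -> ceil(250/80) = 4 cores */
    printf("target = %d\n", eas_target_cores(250, 80, 1, 4));
    /* a very heavy load is clamped by the cluster limit (here 4) */
    printf("target = %d\n", eas_target_cores(700, 80, 1, 4));
    return 0;
}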

4.5、NUMA load balancing

NUMA is not used on the ARM platforms discussed here, so it is not analysed for now.
