Linux Power Management: PSCI Initialization and Multi-Core Boot Flow

Published: 2025-05-22

Contents

I. PSCI Initialization Flow

1. PSCI Device Tree Node

2. PSCI Kernel Initialization Flow

get_set_conduit_method

set_conduit

psci_probe

II. CPU PSCI Operations Initialization Flow

1. CPU Device Tree Nodes

2. struct cpu_operations

3. Kernel Flow

cpu_read_bootcpu_ops

smp_init_cpus

III. CPU PSCI Multi-Core Boot Flow

1. Boot CPU Startup Flow

2. Secondary CPU Startup Flow


QEMU version: qemu-7.2.0

Linux version: linux-5.4.239

        This article analyzes how the PSCI power management interface is implemented in the Linux kernel on the ARM64 architecture, and how Linux brings up CPUs through the PSCI interface.

I. PSCI Initialization Flow

    PSCI (Power State Coordination Interface) is a power management interface specification defined by ARM. Linux issues smc/hvc instructions to trap into a higher Exception Level (the hypervisor at EL2 or the secure firmware at EL3), where the actual PSCI functions are implemented. The following sections briefly analyze the PSCI device tree node and the PSCI initialization flow in the Linux kernel source.
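
        As a concrete illustration of the conduit mechanism, a PSCI call follows the SMC Calling Convention: the function ID goes in x0, arguments in x1-x3, and the result comes back in x0. A minimal bare-metal sketch (my own illustration, not kernel code; the clobber list is simplified compared with the kernel's arm_smccc_hvc) issuing PSCI_VERSION (0x84000000) through the hvc conduit could look like this:

/*
 * Minimal sketch, assumed bare-metal EL1 code rather than kernel code:
 * a PSCI call per the SMC Calling Convention -- function ID in x0,
 * arguments in x1-x3, result returned in x0.
 */
static inline unsigned long psci_version_hvc(void)
{
	register unsigned long x0 asm("x0") = 0x84000000UL;	/* PSCI_VERSION */
	register unsigned long x1 asm("x1") = 0;
	register unsigned long x2 asm("x2") = 0;
	register unsigned long x3 asm("x3") = 0;

	/* Trap to EL2; with method = "smc" this would be "smc #0" instead. */
	asm volatile("hvc #0"
		     : "+r" (x0)
		     : "r" (x1), "r" (x2), "r" (x3)
		     : "memory");

	return x0;	/* e.g. 0x00010000 for PSCI 1.0 */
}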

1. PSCI Device Tree Node

psci {
		migrate = <0xc4000005>;
		cpu_on = <0xc4000003>;
		cpu_off = <0x84000002>;
		cpu_suspend = <0xc4000001>;
		method = "hvc";
		compatible = "arm,psci-1.0\0arm,psci-0.2\0arm,psci";
};	

migrate: the PSCI function ID (see the header excerpt after this list) used for the MIGRATE call, asking the Trusted OS to migrate to another CPU.
cpu_on: the function ID for CPU_ON, used to power on a CPU.
cpu_off: the function ID for CPU_OFF, used to power down the calling CPU.
cpu_suspend: the function ID for CPU_SUSPEND, used to put a CPU into a low-power state.
method: how PSCI functions are invoked; "hvc" (Hypervisor Call) traps into the hypervisor, while "smc" (Secure Monitor Call) traps into EL3 firmware such as ATF.
compatible: the PSCI specification versions this node is compatible with; the kernel matches the newest supported entry first, i.e. "arm,psci-1.0".
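
        For reference, the values above are the standard PSCI 0.2 function IDs; an abridged, lightly paraphrased excerpt from include/uapi/linux/psci.h is shown below. With "arm,psci-0.2"/"arm,psci-1.0" firmware the kernel uses these spec-defined IDs directly, and the per-function properties are only needed by the legacy "arm,psci" (v0.1) binding.

#define PSCI_0_2_FN_BASE		0x84000000
#define PSCI_0_2_FN(n)			(PSCI_0_2_FN_BASE + (n))
#define PSCI_0_2_FN64_BASE		0xC4000000
#define PSCI_0_2_FN64(n)		(PSCI_0_2_FN64_BASE + (n))

#define PSCI_0_2_FN_CPU_OFF		PSCI_0_2_FN(2)		/* 0x84000002 */
#define PSCI_0_2_FN64_CPU_SUSPEND	PSCI_0_2_FN64(1)	/* 0xC4000001 */
#define PSCI_0_2_FN64_CPU_ON		PSCI_0_2_FN64(3)	/* 0xC4000003 */
#define PSCI_0_2_FN64_MIGRATE		PSCI_0_2_FN64(5)	/* 0xC4000005 */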

2. PSCI Kernel Initialization Flow

The analysis starts from start_kernel:

start_kernel                           init/main.c

        setup_arch                    arch/arm64/kernel/setup.c

                setup_machine_fdt(__fdt_pointer)        

                psci_dt_init

        In setup_arch, setup_machine_fdt is called first to parse the device tree; its __fdt_pointer argument was saved earlier in arch/arm64/kernel/head.S:

/*
 * The following fragment of code is executed with the MMU enabled.
 *
 *   x0 = __PHYS_OFFSET
 */
__primary_switched:
	adrp	x4, init_thread_union
	add	sp, x4, #THREAD_SIZE
	adr_l	x5, init_task
	msr	sp_el0, x5			// Save thread_info

	adr_l	x8, vectors			// load VBAR_EL1 with virtual
	msr	vbar_el1, x8			// vector table address
	isb

	stp	xzr, x30, [sp, #-16]!
	mov	x29, sp

	str_l	x21, __fdt_pointer, x5		// Save FDT pointer

	ldr_l	x4, kimage_vaddr		// Save the offset between
	sub	x4, x4, x0			// the kernel virtual and
	str_l	x4, kimage_voffset, x5		// physical mappings

	// Clear BSS
	adr_l	x0, __bss_start
	mov	x1, xzr
	adr_l	x2, __bss_stop
	sub	x2, x2, x0
	bl	__pi_memset
	dsb	ishst				// Make zero page visible to PTW

#ifdef CONFIG_KASAN
	bl	kasan_early_init
#endif
#ifdef CONFIG_RANDOMIZE_BASE
	tst	x23, ~(MIN_KIMG_ALIGN - 1)	// already running randomized?
	b.ne	0f
	mov	x0, x21				// pass FDT address in x0
	bl	kaslr_early_init		// parse FDT for KASLR options
	cbz	x0, 0f				// KASLR disabled? just proceed
	orr	x23, x23, x0			// record KASLR offset
	ldp	x29, x30, [sp], #16		// we must enable KASLR, return
	ret					// to __primary_switch()
0:
#endif
	add	sp, sp, #16
	mov	x29, #0
	mov	x30, #0
	b	start_kernel
ENDPROC(__primary_switched)

After the device tree has been parsed, psci_dt_init is called to initialize PSCI:

// drivers/firmware/psci/psci.c

typedef int (*psci_initcall_t)(const struct device_node *);

static const struct of_device_id psci_of_match[] __initconst = {
	{ .compatible = "arm,psci",	.data = psci_0_1_init},
	{ .compatible = "arm,psci-0.2",	.data = psci_0_2_init},
	{ .compatible = "arm,psci-1.0",	.data = psci_1_0_init},
	{},
};

int __init psci_dt_init(void)
{
	struct device_node *np;
	const struct of_device_id *matched_np;
	psci_initcall_t init_fn;
	int ret;

	np = of_find_matching_node_and_match(NULL, psci_of_match, &matched_np);

	if (!np || !of_device_is_available(np))
		return -ENODEV;

	init_fn = (psci_initcall_t)matched_np->data;
	ret = init_fn(np);

	of_node_put(np);
	return ret;
}

        of_find_matching_node_and_match looks up the device tree node that matches the psci_of_match table and fills in matched_np. Since the node's compatible list starts with "arm,psci-1.0", that entry is matched first, with the older strings kept for backward compatibility.

        With matched_np resolved, init_fn is taken from matched_np->data, i.e. init_fn = psci_1_0_init (compatible = "arm,psci-1.0").

Finally init_fn, i.e. psci_1_0_init, is called:

static int __init psci_1_0_init(struct device_node *np)
{
	int err;

	err = psci_0_2_init(np);
	if (err)
		return err;

	if (psci_has_osi_support())
		pr_info("OSI mode supported.\n");

	return 0;
}

psci_1_0_init then enters psci_0_2_init:

/*
 * PSCI init function for PSCI versions >=0.2
 *
 * Probe based on PSCI PSCI_VERSION function
 */
static int __init psci_0_2_init(struct device_node *np)
{
	int err;

	err = get_set_conduit_method(np);
	if (err)
		return err;

	/*
	 * Starting with v0.2, the PSCI specification introduced a call
	 * (PSCI_VERSION) that allows probing the firmware version, so
	 * that PSCI function IDs and version specific initialization
	 * can be carried out according to the specific version reported
	 * by firmware
	 */
	return psci_probe();
}

        psci_0_2_init initializes the PSCI function set for firmware versions >= 0.2 (for PSCI < v0.2 the kernel falls back to psci_0_1_init). Its two steps, get_set_conduit_method and psci_probe, are examined below:

get_set_conduit_method

static int get_set_conduit_method(struct device_node *np)
{
	const char *method;

	pr_info("probing for conduit method from DT.\n");

	if (of_property_read_string(np, "method", &method)) {
		pr_warn("missing \"method\" property\n");
		return -ENXIO;
	}

	if (!strcmp("hvc", method)) {
		set_conduit(PSCI_CONDUIT_HVC);
	} else if (!strcmp("smc", method)) {
		set_conduit(PSCI_CONDUIT_SMC);
	} else {
		pr_warn("invalid \"method\" property: %s\n", method);
		return -EINVAL;
	}
	return 0;
}

        This function reads the "method" property of the PSCI device tree node np to pick the conduit instruction: "hvc" traps into the hypervisor, while "smc" traps into EL3 firmware such as ATF; which one is used depends on how the PSCI node is configured.

set_conduit

typedef unsigned long (psci_fn)(unsigned long, unsigned long,
				unsigned long, unsigned long);
static psci_fn *invoke_psci_fn;

static unsigned long __invoke_psci_fn_hvc(unsigned long function_id,
			unsigned long arg0, unsigned long arg1,
			unsigned long arg2)
{
	struct arm_smccc_res res;

	arm_smccc_hvc(function_id, arg0, arg1, arg2, 0, 0, 0, 0, &res);
	return res.a0;
}

static unsigned long __invoke_psci_fn_smc(unsigned long function_id,
			unsigned long arg0, unsigned long arg1,
			unsigned long arg2)
{
	struct arm_smccc_res res;

	arm_smccc_smc(function_id, arg0, arg1, arg2, 0, 0, 0, 0, &res);
	return res.a0;
}

static void set_conduit(enum psci_conduit conduit)
{
	switch (conduit) {
	case PSCI_CONDUIT_HVC:
		invoke_psci_fn = __invoke_psci_fn_hvc;
		break;
	case PSCI_CONDUIT_SMC:
		invoke_psci_fn = __invoke_psci_fn_smc;
		break;
	default:
		WARN(1, "Unexpected PSCI conduit %d\n", conduit);
	}

	psci_ops.conduit = conduit;
}

        set_conduit initializes the function pointer invoke_psci_fn according to the chosen conduit, so that later PSCI calls simply go through this pointer.
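
        As an example of how invoke_psci_fn is consumed later, the CPU_ON helper in the same file boils down to roughly the following (a simplified sketch based on drivers/firmware/psci/psci.c; the real code splits this across a small __psci_cpu_on wrapper):

static int psci_cpu_on(unsigned long cpuid, unsigned long entry_point)
{
	u32 fn = psci_function_id[PSCI_FN_CPU_ON];
	int err;

	/* Ends up in __invoke_psci_fn_hvc() or __invoke_psci_fn_smc() */
	err = invoke_psci_fn(fn, cpuid, entry_point, 0);
	return psci_to_linux_errno(err);
}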

psci_probe

Before analyzing psci_probe, look at the central PSCI structure:

struct psci_operations {
	u32 (*get_version)(void);
	int (*cpu_suspend)(u32 state, unsigned long entry_point);
	int (*cpu_off)(u32 state);
	int (*cpu_on)(unsigned long cpuid, unsigned long entry_point);
	int (*migrate)(unsigned long cpuid);
	int (*affinity_info)(unsigned long target_affinity,
			unsigned long lowest_affinity_level);
	int (*migrate_info_type)(void);
	enum psci_conduit conduit;
	enum smccc_version smccc_version;
};

    struct psci_operations is the core data structure the kernel uses to abstract the PSCI interface; it collects the power management callbacks through which the OS talks to the hypervisor/ATF:

get_version: returns the PSCI version number (see the sketch after this list)

cpu_suspend: puts the CPU into the requested low-power state (state) and resumes at entry_point on wake-up

cpu_off: powers off the calling CPU

cpu_on: powers on the CPU identified by cpuid and sets its boot address to entry_point

migrate: asks the Trusted OS to migrate to the CPU identified by cpuid

affinity_info: queries the power state (on/off) of the CPU or cluster identified by target_affinity

migrate_info_type: returns the Trusted OS migration capability type

conduit: PSCI_CONDUIT_HVC or PSCI_CONDUIT_SMC

smccc_version: SMCCC_VERSION_1_0 or SMCCC_VERSION_1_1
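
        For example, the get_version callback is backed by psci_get_version, which simply issues PSCI_VERSION through the conduit selected earlier (roughly as in drivers/firmware/psci/psci.c):

static u32 psci_get_version(void)
{
	return invoke_psci_fn(PSCI_0_2_FN_PSCI_VERSION, 0, 0, 0);
}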

        Once get_set_conduit_method has set up the conduit, psci_probe is called to initialize the power management interface for the version reported by firmware:

/*
 * Probe function for PSCI firmware versions >= 0.2
 */
static int __init psci_probe(void)
{
	u32 ver = psci_get_version();

	pr_info("PSCIv%d.%d detected in firmware.\n",
			PSCI_VERSION_MAJOR(ver),
			PSCI_VERSION_MINOR(ver));

	if (PSCI_VERSION_MAJOR(ver) == 0 && PSCI_VERSION_MINOR(ver) < 2) {
		pr_err("Conflicting PSCI version detected.\n");
		return -EINVAL;
	}

	psci_0_2_set_functions();

	psci_init_migrate();

	if (PSCI_VERSION_MAJOR(ver) >= 1) {
		psci_init_smccc();
		psci_init_cpu_suspend();
		psci_init_system_suspend();
		psci_init_system_reset2();
	}

	return 0;
}

        The function first reads the PSCI version from firmware and checks that it is at least 0.2; if so, psci_0_2_set_functions is called:

static void __init psci_0_2_set_functions(void)
{
	pr_info("Using standard PSCI v0.2 function IDs\n");
	psci_ops.get_version = psci_get_version;

	psci_function_id[PSCI_FN_CPU_SUSPEND] =
					PSCI_FN_NATIVE(0_2, CPU_SUSPEND);
	psci_ops.cpu_suspend = psci_cpu_suspend;

	psci_function_id[PSCI_FN_CPU_OFF] = PSCI_0_2_FN_CPU_OFF;
	psci_ops.cpu_off = psci_cpu_off;

	psci_function_id[PSCI_FN_CPU_ON] = PSCI_FN_NATIVE(0_2, CPU_ON);
	psci_ops.cpu_on = psci_cpu_on;

	psci_function_id[PSCI_FN_MIGRATE] = PSCI_FN_NATIVE(0_2, MIGRATE);
	psci_ops.migrate = psci_migrate;

	psci_ops.affinity_info = psci_affinity_info;

	psci_ops.migrate_info_type = psci_migrate_info_type;

	arm_pm_restart = psci_sys_reset;

	pm_power_off = psci_sys_poweroff;
}

        psci_0_2_set_functions simply fills in psci_ops with the standard PSCI v0.2 function IDs and callbacks; for example, CPU hotplug eventually lands in psci_cpu_off/psci_cpu_on. At this point PSCI initialization is complete; the next section covers the per-CPU initialization that builds on it.

II. CPU PSCI Operations Initialization Flow

1. CPU Device Tree Nodes

cpus {
		#size-cells = <0x00>;
		#address-cells = <0x01>;
        ......

		cpu@0 {
			phandle = <0x8004>;
			reg = <0x00>;
			enable-method = "psci";
			compatible = "arm,cortex-a53";
			device_type = "cpu";
		};

		cpu@1 {
			phandle = <0x8003>;
			reg = <0x01>;
			enable-method = "psci";
			compatible = "arm,cortex-a53";
			device_type = "cpu";
		};

        ......
};

        Only the "enable-method" property matters here: it describes how each CPU is brought up, in this case "psci". The other method, "spin-table", is not covered in this article.

2. struct cpu_operations

/**
 * struct cpu_operations - Callback operations for hotplugging CPUs.
 *
 * @name:	Name of the property as appears in a devicetree cpu node's
 *		enable-method property. On systems booting with ACPI, @name
 *		identifies the struct cpu_operations entry corresponding to
 *		the boot protocol specified in the ACPI MADT table.
 * @cpu_init:	Reads any data necessary for a specific enable-method for a
 *		proposed logical id.
 * @cpu_prepare: Early one-time preparation step for a cpu. If there is a
 *		mechanism for doing so, tests whether it is possible to boot
 *		the given CPU.
 * @cpu_boot:	Boots a cpu into the kernel.
 * @cpu_postboot: Optionally, perform any post-boot cleanup or necesary
 *		synchronisation. Called from the cpu being booted.
 * @cpu_can_disable: Determines whether a CPU can be disabled based on
 *		mechanism-specific information.
 * @cpu_disable: Prepares a cpu to die. May fail for some mechanism-specific
 * 		reason, which will cause the hot unplug to be aborted. Called
 * 		from the cpu to be killed.
 * @cpu_die:	Makes a cpu leave the kernel. Must not fail. Called from the
 *		cpu being killed.
 * @cpu_kill:  Ensures a cpu has left the kernel. Called from another cpu.
 * @cpu_init_idle: Reads any data necessary to initialize CPU idle states for
 *		   a proposed logical id.
 * @cpu_suspend: Suspends a cpu and saves the required context. May fail owing
 *               to wrong parameters or error conditions. Called from the
 *               CPU being suspended. Must be called with IRQs disabled.
 */
struct cpu_operations {
	const char	*name;
	int		(*cpu_init)(unsigned int);
	int		(*cpu_prepare)(unsigned int);
	int		(*cpu_boot)(unsigned int);
	void		(*cpu_postboot)(void);
#ifdef CONFIG_HOTPLUG_CPU
	bool		(*cpu_can_disable)(unsigned int cpu);
	int		(*cpu_disable)(unsigned int cpu);
	void		(*cpu_die)(unsigned int cpu);
	int		(*cpu_kill)(unsigned int cpu);
#endif
#ifdef CONFIG_CPU_IDLE
	int		(*cpu_init_idle)(unsigned int);
	int		(*cpu_suspend)(unsigned long);
#endif
};

    struct cpu_operations is the arm64 kernel's core data structure for CPU boot, hotplug and power management operations. It presents a uniform interface on top of the different enable methods (PSCI, spin-table), which is the usual layering technique in Linux. Its callbacks are:

name: the name of the operation set, matching the cpu node's enable-method property

cpu_init: reads any per-CPU data the enable method needs for a proposed logical id (e.g. register configuration)

cpu_prepare: one-time early preparation; checks whether the given CPU can be booted

cpu_boot: boots the CPU into the kernel (for PSCI this ends up in the CPU_ON call; see the sketch after this list)

cpu_postboot: post-boot cleanup or synchronisation, run on the CPU that was just booted

cpu_can_disable: checks whether the CPU can be safely disabled

cpu_disable: prepares the CPU to go offline; may fail and abort the hot unplug

cpu_die: makes the CPU leave the kernel (e.g. CPU_OFF or WFI), called on the dying CPU

cpu_kill: confirms the CPU has actually left the kernel, called from another CPU

cpu_init_idle: initializes the CPU idle states

cpu_suspend: suspends the CPU and saves the required context
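
        To see how these callbacks are consumed, the arm64 secondary bring-up path dereferences the per-CPU operation set; a simplified sketch based on boot_secondary() in arch/arm64/kernel/smp.c (called from __cpu_up() in the generic hotplug path):

static int boot_secondary(unsigned int cpu, struct task_struct *idle)
{
	if (cpu_ops[cpu]->cpu_boot)
		return cpu_ops[cpu]->cpu_boot(cpu);	/* cpu_psci_cpu_boot for "psci" */

	return -EOPNOTSUPP;
}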

3. Kernel Flow

With cpu_operations introduced, return to setup_arch:

setup_arch

        cpu_read_bootcpu_ops    (initializes cpu_operations for CPU0)

        smp_init_cpus                   (initializes cpu_operations for CPUx, the secondary CPUs)

cpu_read_bootcpu_ops

// arch/arm64/kernel/cpu_ops.c

/*
 * Read a cpu's enable method and record it in cpu_ops.
 */
int __init cpu_read_ops(int cpu)
{
	const char *enable_method = cpu_read_enable_method(cpu);

	if (!enable_method)
		return -ENODEV;

	cpu_ops[cpu] = cpu_get_ops(enable_method);
	if (!cpu_ops[cpu]) {
		pr_warn("Unsupported enable-method: %s\n", enable_method);
		return -EOPNOTSUPP;
	}

	return 0;
}

static inline void __init cpu_read_bootcpu_ops(void)
{
	cpu_read_ops(0);
}

        As the comment says, cpu_read_ops reads a CPU's enable method and records it in cpu_ops. cpu_read_enable_method simply reads the "enable-method" property from the CPU's device tree node into the enable_method variable (from the CPU0 node shown above we know this value is "psci"); since the function is trivial it is not analyzed further here.

        Continuing on, note the global array of pointers:

const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init;

    cpu_ops is an array of NR_CPUS pointers, each of type const struct cpu_operations *, giving every CPU its own operation set. So the first step is to install the operation set for CPU0.

cpu_get_ops is shown below:

// arch/arm64/kernel/cpu_ops.c

extern const struct cpu_operations smp_spin_table_ops;
extern const struct cpu_operations cpu_psci_ops;

const struct cpu_operations *cpu_ops[NR_CPUS] __ro_after_init;

static const struct cpu_operations *const dt_supported_cpu_ops[] __initconst = {
	&smp_spin_table_ops,
	&cpu_psci_ops,
	NULL,
};

static const struct cpu_operations * __init cpu_get_ops(const char *name)
{
	const struct cpu_operations *const *ops;

	ops = acpi_disabled ? dt_supported_cpu_ops : acpi_supported_cpu_ops;

	while (*ops) {
		if (!strcmp(name, (*ops)->name))
			return *ops;

		ops++;
	}

	return NULL;
}

        cpu_get_ops selects the table of candidate operation sets based on acpi_disabled; since acpi_disabled defaults to 1 (include/linux/acpi.h), the device tree table dt_supported_cpu_ops is used. cpu_read_bootcpu_ops obtained enable-method = "psci" for CPU0 and passed it in, so the strcmp match resolves CPU0's cpu_operations to cpu_psci_ops:

// arch/arm64/kernel/psci.c
static int __init cpu_psci_cpu_init(unsigned int cpu)
{
	return 0;
}

static int __init cpu_psci_cpu_prepare(unsigned int cpu)
{
	if (!psci_ops.cpu_on) {
		pr_err("no cpu_on method, not booting CPU%d\n", cpu);
		return -ENODEV;
	}

	return 0;
}

static int cpu_psci_cpu_boot(unsigned int cpu)
{
	int err = psci_ops.cpu_on(cpu_logical_map(cpu), __pa_symbol(secondary_entry));
	if (err)
		pr_err("failed to boot CPU%d (%d)\n", cpu, err);

	return err;
}

#ifdef CONFIG_HOTPLUG_CPU
static bool cpu_psci_cpu_can_disable(unsigned int cpu)
{
	return !psci_tos_resident_on(cpu);
}

static int cpu_psci_cpu_disable(unsigned int cpu)
{
	/* Fail early if we don't have CPU_OFF support */
	if (!psci_ops.cpu_off)
		return -EOPNOTSUPP;

	/* Trusted OS will deny CPU_OFF */
	if (psci_tos_resident_on(cpu))
		return -EPERM;

	return 0;
}

static void cpu_psci_cpu_die(unsigned int cpu)
{
	/*
	 * There are no known implementations of PSCI actually using the
	 * power state field, pass a sensible default for now.
	 */
	u32 state = PSCI_POWER_STATE_TYPE_POWER_DOWN <<
		    PSCI_0_2_POWER_STATE_TYPE_SHIFT;

	psci_ops.cpu_off(state);
}

static int cpu_psci_cpu_kill(unsigned int cpu)
{
	int err;
	unsigned long start, end;

	if (!psci_ops.affinity_info)
		return 0;
	/*
	 * cpu_kill could race with cpu_die and we can
	 * potentially end up declaring this cpu undead
	 * while it is dying. So, try again a few times.
	 */

	start = jiffies;
	end = start + msecs_to_jiffies(100);
	do {
		err = psci_ops.affinity_info(cpu_logical_map(cpu), 0);
		if (err == PSCI_0_2_AFFINITY_LEVEL_OFF) {
			pr_info("CPU%d killed (polled %d ms)\n", cpu,
				jiffies_to_msecs(jiffies - start));
			return 0;
		}

		usleep_range(100, 1000);
	} while (time_before(jiffies, end));

	pr_warn("CPU%d may not have shut down cleanly (AFFINITY_INFO reports %d)\n",
			cpu, err);
	return -ETIMEDOUT;
}
#endif

const struct cpu_operations cpu_psci_ops = {
	.name		= "psci",
	.cpu_init	= cpu_psci_cpu_init,
	.cpu_prepare	= cpu_psci_cpu_prepare,
	.cpu_boot	= cpu_psci_cpu_boot,
#ifdef CONFIG_HOTPLUG_CPU
	.cpu_can_disable = cpu_psci_cpu_can_disable,
	.cpu_disable	= cpu_psci_cpu_disable,
	.cpu_die	= cpu_psci_cpu_die,
	.cpu_kill	= cpu_psci_cpu_kill,
#endif
};

        cpu_psci_ops is the cpu_operations instance installed for CPU0. Its function pointers call back into the PSCI operation set initialized earlier, which is the usual layering in Linux: a uniform interface upwards, different mechanisms (firmware or hardware) downwards. For example, when a CPU is brought up, cpu_psci_cpu_boot is called, which in turn invokes psci_cpu_on from the PSCI operation set.
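
Putting the two layers together, bringing up a secondary CPU later flows roughly like this (simplified call chain):

cpu_up(cpu)                                     // kernel/cpu.c

        __cpu_up(cpu, idle)                     // arch/arm64/kernel/smp.c

                boot_secondary(cpu, idle)

                        cpu_ops[cpu]->cpu_boot(cpu)              ->  cpu_psci_cpu_boot

                                psci_ops.cpu_on(mpidr, secondary_entry)  ->  psci_cpu_on

                                        invoke_psci_fn(CPU_ON, ...)      // hvc/smc into firmware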

smp_init_cpus

        Since most systems today are SMP (symmetric multi-processing) and therefore have more than one CPU, after covering the cpu_operations initialization for CPU0 we also need it for CPUx, the secondary CPUs:

// arch/arm64/kernel/smp.c
/*
 * Enumerate the possible CPU set from the device tree or ACPI and build the
 * cpu logical map array containing MPIDR values related to logical
 * cpus. Assumes that cpu_logical_map(0) has already been initialized.
 */
void __init smp_init_cpus(void)
{
	int i;

	if (acpi_disabled)
		of_parse_and_init_cpus();
	else
		acpi_parse_and_init_cpus();

	if (cpu_count > nr_cpu_ids)
		pr_warn("Number of cores (%d) exceeds configured maximum of %u - clipping\n",
			cpu_count, nr_cpu_ids);

	if (!bootcpu_valid) {
		pr_err("missing boot CPU MPIDR, not enabling secondaries\n");
		return;
	}

	/*
	 * We need to set the cpu_logical_map entries before enabling
	 * the cpus so that cpu processor description entries (DT cpu nodes
	 * and ACPI MADT entries) can be retrieved by matching the cpu hwid
	 * with entries in cpu_logical_map while initializing the cpus.
	 * If the cpu set-up fails, invalidate the cpu_logical_map entry.
	 */
	for (i = 1; i < nr_cpu_ids; i++) {
		if (cpu_logical_map(i) != INVALID_HWID) {
			if (smp_cpu_setup(i))
				set_cpu_logical_map(i, INVALID_HWID);
		}
	}
}

    smp_init_cpus is the kernel's entry point for enumerating the multi-core topology: based on the hardware description (device tree or ACPI) it walks the CPU entries, identifies all available CPUs, and builds the cpu_logical_map from logical CPU numbers to MPIDR values.

        Here we only look at how the secondary CPUs get their cpu_operations, i.e. the smp_cpu_setup function; the other details are not covered:

// arch/arm64/kernel/smp.c
/*
 * Initialize cpu operations for a logical cpu and
 * set it in the possible mask on success
 */
static int __init smp_cpu_setup(int cpu)
{
	if (cpu_read_ops(cpu))
		return -ENODEV;

	if (cpu_ops[cpu]->cpu_init(cpu))
		return -ENODEV;

	set_cpu_possible(cpu, true);

	return 0;
}

        Just like for CPU0, cpu_read_ops is called here to initialize cpu_operations, only with a different CPU id, so it is not repeated.

        After cpu_operations is set up, cpu_ops[cpu]->cpu_init(cpu) is called, i.e. cpu_psci_cpu_init, which in linux-5.4.239 is effectively a no-op:

// arch/arm64/kernel/psci.c
static int __init cpu_psci_cpu_init(unsigned int cpu)
{
	return 0;
}

 smp_cpu_setup finally calls set_cpu_possible:

// include/linux/cpumask.h

static inline void
set_cpu_possible(unsigned int cpu, bool possible)
{
	if (possible)
		cpumask_set_cpu(cpu, &__cpu_possible_mask);
	else
		cpumask_clear_cpu(cpu, &__cpu_possible_mask);
}

        This sets the given CPU's bit in the possible mask, telling the kernel that this CPU may be brought up; clearing the bit instead makes the kernel ignore the CPU and never schedule work on it.

There are several related helpers:

// Mark whether the CPU is physically present
static inline void
set_cpu_present(unsigned int cpu, bool present)
{
	if (present)
		cpumask_set_cpu(cpu, &__cpu_present_mask);
	else
		cpumask_clear_cpu(cpu, &__cpu_present_mask);
}

// Mark the CPU as online (booted and participating in scheduling); not an inline helper
void set_cpu_online(unsigned int cpu, bool online);

// Mark whether the CPU takes part in load balancing (task migration allowed)
static inline void
set_cpu_active(unsigned int cpu, bool active)
{
	if (active)
		cpumask_set_cpu(cpu, &__cpu_active_mask);
	else
		cpumask_clear_cpu(cpu, &__cpu_active_mask);
}

        All of these helpers maintain the per-state CPU masks that power management, hotplug and the scheduler rely on.
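
        These masks are usually consumed through the matching iterators from include/linux/cpumask.h; a small usage sketch (the function name dump_cpu_masks is hypothetical):

static void dump_cpu_masks(void)
{
	unsigned int cpu;

	for_each_possible_cpu(cpu)	/* CPUs the kernel may ever have to manage */
		pr_info("possible: CPU%u\n", cpu);

	for_each_present_cpu(cpu)	/* CPUs physically present right now */
		pr_info("present:  CPU%u\n", cpu);

	for_each_online_cpu(cpu)	/* CPUs currently booted and schedulable */
		pr_info("online:   CPU%u\n", cpu);
}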

III. CPU PSCI Multi-Core Boot Flow

        At this point both the PSCI initialization and the per-CPU cpu_operations initialization are complete. So when are these callbacks actually invoked? The rest of the article walks through the PSCI-based CPU boot flow (kernel side only).

1. Boot CPU Startup Flow

        Until the secondary CPUs are brought up, everything by default runs on the boot CPU, so its startup is only covered briefly.

start_kernel

        ........

        boot_cpu_init

        ........

        arch_call_rest_init

                rest_init

boot_cpu_init:

// kernel/cpu.c

/*
 * Activate the first processor.
 */
void __init boot_cpu_init(void)
{
	int cpu = smp_processor_id();

	/* Mark the boot cpu "present", "online" etc for SMP and UP case */
	set_cpu_online(cpu, true);
	set_cpu_active(cpu, true);
	set_cpu_present(cpu, true);
	set_cpu_possible(cpu, true);

#ifdef CONFIG_SMP
	__boot_cpu_id = cpu;
#endif
}

    boot_cpu_init is where the kernel registers the boot CPU (the first CPU to run), making sure its state masks are marked correctly for later SMP scheduling.

        The function first obtains the logical ID of the CPU executing this code; since no other CPU is running yet, this is the boot CPU, whose logical ID is 0.

It then sets the CPU state masks:

set_cpu_online(cpu, true);
set_cpu_active(cpu, true);
set_cpu_present(cpu, true);
set_cpu_possible(cpu, true);

set_cpu_present: marks the CPU as physically present

set_cpu_possible: marks the CPU as possible, i.e. one the kernel may ever manage (bounded by CONFIG_NR_CPUS)

set_cpu_online: marks the CPU as online, available to the scheduler

set_cpu_active: marks the CPU as active, participating in load balancing (task migration allowed)

Conceptually the masks form a hierarchy: possible ⊇ present ⊇ online ⊇ active.

        Finally, on an SMP kernel (CONFIG_SMP) the boot CPU id is saved in __boot_cpu_id so that later code can query which CPU booted the system.

rest_init is shown below:

// init/main.c
noinline void __ref rest_init(void)
{
	struct task_struct *tsk;
	int pid;

	rcu_scheduler_starting();
	/*
	 * We need to spawn init first so that it obtains pid 1, however
	 * the init task will end up wanting to create kthreads, which, if
	 * we schedule it before we create kthreadd, will OOPS.
	 */
	pid = kernel_thread(kernel_init, NULL, CLONE_FS);
	/*
	 * Pin init on the boot CPU. Task migration is not properly working
	 * until sched_init_smp() has been run. It will set the allowed
	 * CPUs for init to the non isolated CPUs.
	 */
	rcu_read_lock();
	tsk = find_task_by_pid_ns(pid, &init_pid_ns);
	set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));
	rcu_read_unlock();

	numa_default_policy();
	pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
	rcu_read_lock();
	kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);
	rcu_read_unlock();

	/*
	 * Enable might_sleep() and smp_processor_id() checks.
	 * They cannot be enabled earlier because with CONFIG_PREEMPTION=y
	 * kernel_thread() would trigger might_sleep() splats. With
	 * CONFIG_PREEMPT_VOLUNTARY=y the init task might have scheduled
	 * already, but it's stuck on the kthreadd_done completion.
	 */
	system_state = SYSTEM_SCHEDULING;

	complete(&kthreadd_done);

	/*
	 * The boot idle thread must execute schedule()
	 * at least once to get things moving:
	 */
	schedule_preempt_disabled();
	/* Call into cpu_idle with preempt disabled */
	cpu_startup_entry(CPUHP_ONLINE);
}

    rest_init is a core step of the boot sequence: it creates the first system processes and prepares the scheduling environment. Step by step:

Create the init process:

pid = kernel_thread(kernel_init, NULL, CLONE_FS);

    kernel_init is the kernel thread entry that eventually execs the user-space /sbin/init; it is spawned first so that it gets PID 1.

Pin the init process to the boot CPU:

tsk = find_task_by_pid_ns(pid, &init_pid_ns);
set_cpus_allowed_ptr(tsk, cpumask_of(smp_processor_id()));

        Task migration does not work properly until SMP scheduler initialization (sched_init_smp) has run, so the init task is pinned to the boot CPU (CPU 0) until the scheduler is ready.

Create the kthreadd daemon:

pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES);
kthreadd_task = find_task_by_pid_ns(pid, &init_pid_ns);

        This creates the kernel thread manager (PID 2), the parent of all kernel threads. Note that other kernel threads can only be created safely once kthreadd is ready, which is exactly what complete(&kthreadd_done) signals:

Synchronization and enabling scheduling:

system_state = SYSTEM_SCHEDULING;
complete(&kthreadd_done);

        system_state marks the system as having entered the scheduling phase, and complete(&kthreadd_done) tells the kernel_init task that kthreadd is ready, so it may continue.

Then a schedule is forced once to get the run queue moving, and the boot CPU enters the idle loop at the CPUHP_ONLINE state, waiting to be woken by interrupts or new tasks:

schedule_preempt_disabled();
cpu_startup_entry(CPUHP_ONLINE);

        There is a subtle ordering issue here: init is created first so that it gets PID 1, but kthreadd does not exist at that point. The kthreadd_done completion therefore makes kernel_init wait until kthreadd is ready before it creates any kernel threads; if it did so earlier, kthreadd_task would still be unset and the kernel would oops.
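
        The synchronization primitive used here is a completion. A generic sketch of the pattern (hypothetical names, not the actual init/main.c declarations):

#include <linux/completion.h>

static DECLARE_COMPLETION(setup_done);

static int consumer_thread(void *unused)
{
	wait_for_completion(&setup_done);	/* sleep until the producer is ready */
	/* ... now safe to rely on whatever the producer set up ... */
	return 0;
}

static void producer(void)
{
	/* ... one-time setup ... */
	complete(&setup_done);			/* wake up a waiter blocked above */
}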

2. Secondary CPU Startup Flow

        None of the flow analyzed so far actually starts the secondary CPUs, so where are they brought up? Go straight into kernel_init:

kernel_init

        kernel_init_freeable

            // Wait until kthreadd is all set-up.

            wait_for_completion(&kthreadd_done);

            smp_init();

        kernel_init_freeable first waits for kthreadd to be ready and then reaches smp_init:

// kernel/smp.c

/* Called by boot processor to activate the rest. */
void __init smp_init(void)
{
	int num_nodes, num_cpus;
	unsigned int cpu;

	idle_threads_init();
	cpuhp_threads_init();

	pr_info("Bringing up secondary CPUs ...\n");

	/* FIXME: This should be done in userspace --RR */
	for_each_present_cpu(cpu) {
		if (num_online_cpus() >= setup_max_cpus)
			break;
		if (!cpu_online(cpu))
			cpu_up(cpu);
	}

	num_nodes = num_online_nodes();
	num_cpus  = num_online_cpus();
	pr_info("Brought up %d node%s, %d CPU%s\n",
		num_nodes, (num_nodes > 1 ? "s" : ""),
		num_cpus,  (num_cpus  > 1 ? "s" : ""));

	/* Any cleanup work */
	smp_cpus_done(setup_max_cpus);
}

        smp_init is called by the boot CPU; its main job is to bring up all available secondary CPUs.

It first sets up two kinds of threads:

idle_threads_init: creates an idle thread for every CPU, used for low-power waiting when there is nothing to run.

cpuhp_threads_init: initializes the CPU hotplug threads used to bring CPUs up and down dynamically.

Then the secondary CPUs are brought up:

	pr_info("Bringing up secondary CPUs ...\n");

	/* FIXME: This should be done in userspace --RR */
	for_each_present_cpu(cpu) {
		if (num_online_cpus() >= setup_max_cpus)
			break;
		if (!cpu_online(cpu))
			cpu_up(cpu);
	}

Iterate CPUs: for_each_present_cpu walks every CPU marked in __cpu_present_mask, i.e. all physically present CPUs.

Cap the count: setup_max_cpus (limited by the maxcpus= kernel parameter) bounds how many CPUs are brought online.

Bring up each CPU: if the CPU is not yet online, cpu_up(cpu) kicks off the secondary CPU boot flow.

The cpu_up call chain is:

cpu_up                //  kernel/cpu.c

        do_cpu_up(cpu, CPUHP_ONLINE);

                _cpu_up(cpu, 0, target)

                        cpuhp_up_callbacks

                                ......

For the rest of the cpu_up path, see the companion article: Linux电源管理——CPU Hotplug 流程 (Linux Power Management: CPU Hotplug Flow).

The online nodes and CPUs are then counted:

num_nodes = num_online_nodes();  // number of online NUMA nodes
num_cpus  = num_online_cpus();   // number of CPUs brought online

Finally, any cleanup work is done and the kernel is informed that secondary CPU bring-up has finished:

smp_cpus_done(setup_max_cpus);

At this point the boot CPU and all secondary CPUs have been brought up and are sitting in their idle threads, waiting for work to run.