linux kernel panic之后重启

panic_timeout

//linux-xxx/kernel/panic.c
/* Binds the parameter name "panic" to the int variable panic_timeout (mode 0644). */
core_param(panic, panic_timeout, int, 0644);

/*
 * Excerpt of panic(): only the reboot-after-panic tail is shown. The bare
 * "..." lines mark code that was left out of this excerpt.
 */
void panic(const char *fmt, ...)
{
...
        /* Positive timeout: announce the reboot, then wait before restarting. */
        if (panic_timeout > 0) {
                /*
                 * Delay timeout seconds before rebooting the machine.
                 * We can't use the "normal" timers since we just panicked.
                 */
                printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
                /*
                 * i tracks the elapsed wait and is bounded by
                 * panic_timeout * 1000; each pass delays PANIC_TIMER_STEP.
                 */
                for (i = 0; i < panic_timeout * 1000; i += PANIC_TIMER_STEP) {
                        /* NOTE(review): presumably keeps the NMI watchdog quiet while we spin - confirm. */
                        touch_nmi_watchdog();
                        /*
                         * Next blink is due: toggle state, add whatever
                         * panic_blink() returns to i, and schedule the
                         * following blink.
                         */
                        if (i >= i_next) {
                                i += panic_blink(state ^= 1);
                                i_next = i + 3600 / PANIC_BLINK_SPD;
                        }
                        mdelay(PANIC_TIMER_STEP);
                }
        }
        /*
         * Any non-zero timeout restarts the machine. A negative value skips
         * the delay loop above and gets here immediately.
         */
        if (panic_timeout != 0) {
                /*
                 * This will not be a clean reboot, with everything
                 * shutting down.  But if there is a chance of
                 * rebooting the system it will be rebooted.
                 */
                emergency_restart();
        }
...
}

只要内核参数panic(即panic_timeout)不为0,panic时就会调用emergency_restart重启:大于0时先延时panic_timeout秒再重启,小于0时不延时直接重启
core_param(panic, panic_timeout, int, 0644);
通过core_param定义的参数在/sys/module/kernel/parameters目录下
通过module_param定义的参数在/sys/module/xxx/parameters目录下


emergency_restart

emergency_restart
	machine_emergency_restart  //include/asm-generic/emergency_restart.h
		machine_restart  //arch/rlx/kernel/reset.c
			_machine_restart <==> bsp_machine_restart  //bsp/setup.c
				reboot_by_wdt  //drivers/watchdog/rtsx_wdt.c

最终调用了watchdog中的函数来复位整个系统。


reboot命令

reboot来自busybox,看看reboot的一步步调用流程是怎样的
reboot时打印消息:

The system is going down NOW !!  
Sending SIGTERM to all processes.  
Sending SIGKILL to all processes.  
Please stand by while rebooting the system.  
Restarting system.  

用户空间调用流程

在busybox代码init/init.c中有这么一段,

/* Excerpt of init_main(): only the signal registration is shown; the "..."
 * lines mark code that was left out of this excerpt. */
int init_main(int argc UNUSED_PARAM, char **argv)
{
...
                /* One handler, halt_reboot_pwoff, is registered for the
                 * three signals whose bits are set in the mask below. */
                bb_signals(0
                        + (1 << SIGUSR1) /* halt */
                        + (1 << SIGTERM) /* reboot */
                        + (1 << SIGUSR2) /* poweroff */
                        , halt_reboot_pwoff);
                signal(SIGQUIT, restart_handler); /* re-exec another init */
...

单独拿出halt_reboot_pwoff和restart_handler

/* Signal handler: run the shutdown sequence, then ask for a reboot
 * (SIGTERM), a poweroff (SIGUSR2) or, for any other signal, a halt.
 * Does not return. */
static void halt_reboot_pwoff(int sig)
{
        const char *action;
        unsigned how;

        /* run() may get called below, and it unmasks signals - the one
         * masked inside this handler included. Without the reset,
         *  while true; do reboot; done
         * would start several reboot scripts at once.
         */
        reset_sighandlers_and_unblock_sigs();
        run_shutdown_and_kill_processes();

        switch (sig) {
        case SIGTERM:
                action = "reboot";
                how = RB_AUTOBOOT;
                break;
        case SIGUSR2:
                action = "poweroff";
                how = RB_POWER_OFF;
                break;
        default:
                action = "halt";
                how = RB_HALT_SYSTEM;
                break;
        }

        message(L_CONSOLE, "Requesting system %s", action);
        pause_and_low_level_reboot(how);
        /* not reached */
}

/* Handler for QUIT - exec "restart" action,
 * else (no such action defined) do nothing.
 *
 * Walks init_action_list; on the first entry flagged RESTART it runs the
 * shutdown sequence once, then tries to exec that entry's command on its
 * terminal. If opening the terminal or the exec fails, the system is halted.
 */
static void restart_handler(int sig UNUSED_PARAM)
{
        struct init_action *a;

        for (a = init_action_list; a; a = a->next) {
                if (!(a->action_type & RESTART))
                        continue;

                /* Starting from here, we won't return.
                 * Thus don't need to worry about preserving errno
                 * and such.
                 */

                reset_sighandlers_and_unblock_sigs();
                run_shutdown_and_kill_processes();

#ifdef RB_ENABLE_CAD
                /* Allow Ctrl-Alt-Del to reboot the system.
                 * This is how kernel sets it up for init, we follow suit.
                 */
                reboot(RB_ENABLE_CAD); /* misnomer */
#endif

                if (open_stdio_to_tty(a->terminal)) {
                        dbg_message(L_CONSOLE, "Trying to re-exec %s", a->command);
                        /* Theoretically should be safe.
                         * But in practice, kernel bugs may leave
                         * unkillable processes, and wait() may block forever.
                         * Oh well. Hoping "new" init won't be too surprised
                         * by having children it didn't create.
                         */
                        //while (wait(NULL) > 0)
                        //      continue;
                        init_exec(a->command);
                }
                /* Open or exec failed */
                pause_and_low_level_reboot(RB_HALT_SYSTEM);
                /* not reached */
        }
}

可以看到,它们都会调用这两个函数:reset_sighandlers_and_unblock_sigs() 和 run_shutdown_and_kill_processes()。下面重点关注后一个函数:

/* Run the "shutdown" actions, then signal every process except pid 1:
 * SIGTERM first, a one-second pause, then SIGKILL. */
static void run_shutdown_and_kill_processes(void)
{
        int final_pass;

        /* Shutdown actions go first, _before_ anything is killed, so that
         * scripts get a chance to stop things gracefully. */
        run_actions(SHUTDOWN);

        message(L_CONSOLE | L_LOG, "The system is going down NOW!");

        /* Pass 0 sends TERM, pass 1 sends KILL; kill(-1, ...) reaches
         * every process _except_ pid 1 */
        for (final_pass = 0; final_pass <= 1; final_pass++) {
                kill(-1, final_pass ? SIGKILL : SIGTERM);
                message(final_pass ? L_CONSOLE : (L_CONSOLE | L_LOG),
                        "Sent SIG%s to all processes",
                        final_pass ? "KILL" : "TERM");
                sync();
                /* No pause after KILL - callers take care of that one */
                if (!final_pass)
                        sleep(1);
        }
}

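终于看到了上面的打印信息:The system is going down NOW !! 以及Sending SIGTERM to all processes.(这里摘录的代码中对应的字符串是"The system is going down NOW!"和"Sent SIG%s to all processes",与上面日志的措辞略有出入,可能是busybox版本不同所致。)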
同时在上面的halt_reboot_pwoff和restart_handler中都会调用这样一个函数:

/* Pause briefly, then issue reboot(magic) from a vfork()ed child while
 * the caller sleeps forever. Does not return. */
static void pause_and_low_level_reboot(unsigned magic)
{
        /* Give the last message time to reach the serial console, etc */
        sleep(1);

        /* The reboot call is made from a child: the kernel calls
         * do_exit(EXIT_SUCCESS) in linux/kernel/sys.c, and the machine
         * can panic when the init process itself exits... */
        if (vfork() == 0) {
                /* child */
                reboot(magic);
                _exit(EXIT_SUCCESS);
        }

        for (;;)
                sleep(1);
}

这里最终调用了内核提供的reboot系统调用。


linux内核空间调用流程

// reboot system call (excerpt; the "..." line marks omitted code)
SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
                void __user *, arg)
{
        switch (cmd) {
        /* Restart request: hand off to kernel_restart() with a NULL argument. */
        case LINUX_REBOOT_CMD_RESTART:
                kernel_restart(NULL);
...
}

kernel_restart
    kernel_restart_prepare
        blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
    syscore_shutdown
        ops->shutdown()
    machine_restart  //arch/rlx/kernel/reset.c
        _machine_restart <==> bsp_machine_restart  //bsp/setup.c
            reboot_by_wdt  //drivers/watchdog/rtsx_wdt.c

可以在驱动中通过register_reboot_notifier向reboot_notifier_list注册回调函数,这样在系统reboot的时候回调函数就会被调用到了
也可以在驱动中实现syscore_ops函数,这样在系统reboot的时候也会被调用到
当调用到machine_restart就和上面发生panic时调用的流程一样了,最终通过watchdog提供的函数接口复位整个系统。


参考文章

  1. 基于Linux与Busybox的Reboot命令流程分析
Logo

旨在为数千万中国开发者提供一个无缝且高效的云端环境,以支持学习、使用和贡献开源项目。

更多推荐