摘自:http://blog.chinaunix.net/xmlrpc.php?r=blog/article&uid=14528823&id=4215888
softlockup(watchdog)用于检测系统调度是否正常,即软锁的情况,当发生softlockup时,内核不能调度,但还能响应中断,对用户的表现可能为:能ping通,但无法登录系统,无法进行正常操作。
其基本原理为:为每个CPU启动一个内核线程(watchdog/x),此线程为优先级最高的实时线程,在该线程得到调度时,会更新相应的计数(时间戳),同时会启动定时器,当定时器到期时检查相应的时间戳,如果超过指定时间,都没有更新,则说明这段时间内都没有发生调度(因为此线程优先级最高),则打印相应告警或根据配置可以进入panic流程。
基本代码分析(2.6.32)
rest_init->kernel_init->lockup_detector_init->cpu_callback->watchdog_prepare_cpu(初始化watchdog定时器):
- /*
- * Set up the per-CPU watchdog hrtimer for @cpu and attach
- * watchdog_timer_fn() as its handler. The timer is only initialised
- * here; nothing in this function starts it. Always returns 0.
- */
- static int watchdog_prepare_cpu(int cpu)
- {
- struct hrtimer *timer;
-
- /* warn if a watchdog thread pointer is already recorded for this CPU */
- WARN_ON(per_cpu(softlockup_watchdog, cpu));
-
- /* initialise the high-resolution timer and install its callback */
- timer = &per_cpu(watchdog_hrtimer, cpu);
- hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- timer->function = watchdog_timer_fn;
-
- return 0;
- }
看门狗定时器处理函数:
- static enum hrtimer_restart watchdog_timer_fn(struct hrtimer *hrtimer)
- {
- /*
- * Watchdog hrtimer callback. On every expiry it kicks the hardlockup
- * detector, wakes this CPU's watchdog thread, pushes the timer forward
- * by one sample period, and then checks whether this CPU's timestamp
- * has gone stale. Every path returns HRTIMER_RESTART.
- *
- * touch_ts is a snapshot of watchdog_touch_ts, the per-CPU timestamp
- * that the watchdog kernel thread refreshes whenever it gets scheduled.
- * A value of 0 means there is no baseline yet (watchdog_enable() zeroes
- * it when creating the thread): the timestamp is refreshed and the
- * lockup check is skipped for this period.
- */
- unsigned long touch_ts = __get_cpu_var(watchdog_touch_ts);
- struct pt_regs *regs = get_irq_regs();
- int duration;
-
- /* kick the hardlockup detector */
- // bump the interrupt count: evidence that no hard lockup (deadlock with interrupts disabled) has occurred
- watchdog_interrupt_count();
-
- /* kick the softlockup detector */
- // wake the watchdog kernel thread
- wake_up_process(__get_cpu_var(softlockup_watchdog));
-
- /* .. and repeat */
- // push the timer's expiry forward by one sample period so it fires again
- hrtimer_forward_now(hrtimer, ns_to_ktime(get_sample_period()));
- if (touch_ts == 0) {
- if (unlikely(__get_cpu_var(softlockup_touch_sync))) {
- /*
- * If the time stamp was touched atomically
- * make sure the scheduler tick is up to date.
- */
- __get_cpu_var(softlockup_touch_sync) = false;
- sched_clock_tick();
- }
- __touch_watchdog();
- return HRTIMER_RESTART;
- }
-
- /* check for a softlockup
- * This is done by making sure a high priority task is
- * being scheduled. The task touches the watchdog to
- * indicate it is getting cpu time. If it hasn't then
- * this is a good indication some task is hogging the cpu
- */
- // decide whether a soft lockup occurred: is_softlockup() checks whether touch_ts has gone without an update for longer than the threshold
- duration = is_softlockup(touch_ts);
- if (unlikely(duration)) {
- /* only warn once */
- if (__get_cpu_var(soft_watchdog_warn) == true)
- return HRTIMER_RESTART;
- // a soft lockup was detected: record a series of diagnostics and raise the warning
- printk(KERN_EMERG "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
- smp_processor_id(), duration,
- current->comm, task_pid_nr(current));
- print_modules();
- print_irqtrace_events(current);
- if (regs)
- show_regs(regs);
- else
- dump_stack();
- // panic if softlockup_panic is set (the original note says it is configured through proc)
- if (softlockup_panic)
- panic("softlockup: hung tasks");
- __get_cpu_var(soft_watchdog_warn) = true;
- } else
- __get_cpu_var(soft_watchdog_warn) = false;
-
- return HRTIMER_RESTART;
- }
启动看门狗,即创建watchdog内核线程。
- /*
- * Enable the lockup detectors for @cpu: turn on the NMI/perf side and,
- * if no watchdog thread is recorded for this CPU yet, create one, bind
- * it to @cpu and wake it.
- *
- * Returns the result of watchdog_nmi_enable(); if that was 0 and thread
- * creation failed, returns the kthread_create() error instead.
- */
- static int watchdog_enable(int cpu)
- {
- struct task_struct *thread = per_cpu(softlockup_watchdog, cpu);
- int ret;
-
- /* enable the perf event; a failure here does not stop the softlockup setup below */
- ret = watchdog_nmi_enable(cpu);
-
- /* a watchdog thread already exists for this CPU: nothing more to do */
- if (thread)
- return ret;
-
- /* create the watchdog kernel thread */
- thread = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
- if (IS_ERR(thread)) {
- printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
- /* keep an earlier hardlockup error if there was one */
- return ret ? ret : PTR_ERR(thread);
- }
-
- kthread_bind(thread, cpu);
- /* zero the timestamp so the timer callback re-initialises it before checking */
- per_cpu(watchdog_touch_ts, cpu) = 0;
- per_cpu(softlockup_watchdog, cpu) = thread;
- wake_up_process(thread);
-
- return ret;
- }
watchdog内核线程执行主函数,主要是要更新计数(时间戳)
- /*
- * Main function of the per-CPU watchdog kernel thread.
- *
- * Switches itself to SCHED_FIFO at priority MAX_RT_PRIO-1, initialises
- * the timestamp, starts the watchdog hrtimer on the current CPU, and then
- * loops: refresh the timestamp, sleep until woken (watchdog_timer_fn()
- * wakes it on every timer expiry), repeat until asked to stop.
- *
- * @unused: thread argument; not read by this function.
- * Returns 0 once kthread_should_stop() reports true.
- */
- static int watchdog(void *unused)
- {
- // run at the highest real-time priority
- struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
- struct hrtimer *hrtimer = &__raw_get_cpu_var(watchdog_hrtimer);
- // make this a real-time (SCHED_FIFO) thread
- sched_setscheduler(current, SCHED_FIFO, &param);
-
- /* initialize timestamp */
- __touch_watchdog();
-
- /* kick off the timer for the hardlockup detector */
- /* done here because hrtimer_start can only pin to smp_processor_id() */
- // this same timer is what later checks for a soft lockup
- hrtimer_start(hrtimer, ns_to_ktime(get_sample_period()),
- HRTIMER_MODE_REL_PINNED);
- // mark ourselves as sleeping; the schedule() call in the loop gives up the CPU
- set_current_state(TASK_INTERRUPTIBLE);
- /*
- * Run briefly once per second to reset the softlockup timestamp.
- * If this gets delayed for more than 60 seconds then the
- * debug-printout triggers in watchdog_timer_fn().
- */
- while (!kthread_should_stop()) {
- // refresh the timestamp
- __touch_watchdog();
- schedule();
-
- if (kthread_should_stop())
- break;
-
- set_current_state(TASK_INTERRUPTIBLE);
- }
- __set_current_state(TASK_RUNNING);
-
- return 0;
- }
判断是否发生软锁:is_softlockup
- /*
- * Check whether the watchdog timestamp @touch_ts has gone stale.
- *
- * Returns 0 while the current CPU's timestamp is not yet past
- * touch_ts + softlockup_thresh; otherwise returns how far "now" is past
- * @touch_ts. The original notes describe the threshold as 60 s; the
- * unit is whatever get_timestamp() uses.
- */
- static int is_softlockup(unsigned long touch_ts)
- {
- unsigned long now = get_timestamp(smp_processor_id());
-
- /* still within the allowed window: no lockup */
- if (!time_after(now, touch_ts + softlockup_thresh))
- return 0;
-
- /* unreasonable delay: report how long the timestamp has been stale */
- return now - touch_ts;
- }
本站仅提供存储服务,所有内容均由用户发布,如发现有害或侵权内容,请
点击举报。