OOM killer,即 out-of-memory killer,是 Linux 在内存耗尽时使用的一种处理机制。当内存不足时,OOM killer 会遍历整个进程链表,根据各进程的内存使用情况及其 oom score 值找出得分最高的进程,然后发送 kill 信号将其杀掉。
伙伴系统在分配内存时会做判断,当内存不足时,会调用核心函数 out_of_memory(),该函数位于内核源码的 mm/oom_kill.c 文件中。
- void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
- int order, nodemask_t *nodemask, bool force_kill)
- {
- const nodemask_t *mpol_mask;
- struct task_struct *p;
- unsigned long totalpages;
- unsigned long freed = 0;
- unsigned int points;
- enum oom_constraint constraint = CONSTRAINT_NONE;
- int killed = 0; /* set to 1 only when select_bad_process() yields a victim that is then killed */
-
- ~~snip
- /* If the current task already has a fatal signal pending, flag it with
- TIF_MEMDIE and return immediately. After all, what the OOM killer
- ultimately does is deliver a kill signal in order to free memory. */
- if (fatal_signal_pending(current)) {
- set_thread_flag(TIF_MEMDIE);
- return;
- }
-
- ~~snip
- /* User space can change the OOM behaviour through
- /proc/sys/vm/panic_on_oom: 1 means panic directly on OOM, 0 means
- only kill the "best" process and let the system keep running. */
- check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
-
- read_lock(&tasklist_lock);
-
- /* Likewise, when /proc/sys/vm/oom_kill_allocating_task is true, the
- task that is currently allocating is killed directly (provided it is
- killable and has an mm), and no scan for a victim is performed. */
- if (sysctl_oom_kill_allocating_task &&
- !oom_unkillable_task(current, NULL, nodemask) &&
- current->mm) {
- oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
- nodemask,
- "Out of memory (oom_kill_allocating_task)");
- goto out;
- }
- /* Pick the task with the highest points value, computed from each
- task's memory usage and its oom score information. */
- p = select_bad_process(&points, totalpages, NULL, mpol_mask,
- force_kill);
- /* Found nothing?!?! Either we hang forever, or we panic. */
- if (!p) {
- dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
- read_unlock(&tasklist_lock);
- panic("Out of memory and no killable processes...\n");
- }
- if (PTR_ERR(p) != -1UL) {
- /* A real victim was selected (not the ERR_PTR(-1UL) "abort" marker):
- tough luck, it gets killed. */
- oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
- nodemask, "Out of memory");
- killed = 1;
- }
- out:
- read_unlock(&tasklist_lock);
-
- /*
- * Give "p" a good chance of killing itself before we
- * retry to allocate memory unless "p" is current
- */
- if (killed && !test_thread_flag(TIF_MEMDIE))
- schedule_timeout_uninterruptible(1);
- }
select_bad_process():
- static struct task_struct *select_bad_process(unsigned int *ppoints,
- unsigned long totalpages, struct mem_cgroup *memcg,
- const nodemask_t *nodemask, bool force_kill)
- {
- struct task_struct *g, *p;
- struct task_struct *chosen = NULL;
- *ppoints = 0;
- /* Walk over every task and keep the one with the highest score.
- Returns that task (its score is stored in *ppoints), NULL when no
- candidate was found, or ERR_PTR(-1UL) when the scan is aborted. */
- do_each_thread(g, p) {
- unsigned int points;
- /* Tasks that are already in an exit state are ignored. */
- if (p->exit_state)
- continue;
- /* Some core threads must not be killed (per the original note:
- e.g. init and kernel threads). */
- if (oom_unkillable_task(p, memcg, nodemask))
- continue;
- /* A task already flagged TIF_MEMDIE is being OOM-killed: thaw it if
- it is frozen and, unless force_kill is set, abort the whole scan. */
- if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
- if (unlikely(frozen(p)))
- __thaw_task(p);
- if (!force_kill)
- return ERR_PTR(-1UL);
- }
- if (!p->mm)
- continue;
-
- if (p->flags & PF_EXITING) {
- if (p == current) {
- chosen = p;
- *ppoints = 1000;
- } else if (!force_kill) {
- /*
- * If this task is not being ptraced on exit,
- * then wait for it to finish before killing
- * some other task unnecessarily.
- */
- if (!(p->group_leader->ptrace & PT_TRACE_EXIT))
- return ERR_PTR(-1UL);
- }
- }
- /* Compute the points for this task. */
- points = oom_badness(p, memcg, nodemask, totalpages);
- /* If this task scores higher than the best seen so far, remember
- both the task and its points. */
- if (points > *ppoints) {
- chosen = p;
- *ppoints = points;
- }
- } while_each_thread(g, p);
-
- return chosen;
- }
oom_badness():
- unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
- const nodemask_t *nodemask, unsigned long totalpages)
- {
- long points; /* score being computed; returns 0 for ineligible tasks, otherwise a value in 1..1000 */
-
- if (oom_unkillable_task(p, memcg, nodemask))
- return 0;
-
- p = find_lock_task_mm(p);
- if (!p)
- return 0;
- /* A task whose oom_score_adj equals OOM_SCORE_ADJ_MIN (-1000 per the
- original note) is not scored. The value can be set through
- /proc/<pid>/oom_score_adj, in the range -1000 ~ 1000; the larger it
- is, the more likely the task is to be OOM-killed. */
- if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
- task_unlock(p);
- return 0;
- }
-
- /*
- * The memory controller may have a limit of 0 bytes, so avoid a divide
- * by zero, if necessary.
- */
- if (!totalpages)
- totalpages = 1;
-
- /* get_mm_rss gives the memory held by the task's user-space file and
- anonymous pages; nr_ptes is the memory used to hold its page tables. */
- points = get_mm_rss(p->mm) + p->mm->nr_ptes;
- /* Add the amount of swap the task is using. */
- points += get_mm_counter(p->mm, MM_SWAPENTS);
- /* Scale relative to totalpages; applied identically to every task,
- so it can be ignored when comparing tasks. */
- points *= 1000;
- points /= totalpages;
- task_unlock(p);
-
- /* When the task has the CAP_SYS_ADMIN capability its points are
- lowered, because tasks with admin privileges are assumed to be
- well behaved. */
- if (has_capability_noaudit(p, CAP_SYS_ADMIN))
- points -= 30;
-
- /* Add oom_score_adj, which ranges from -1000 to 1000. */
- points += p->signal->oom_score_adj;
-
- /*
- * Never return 0 for an eligible task that may be killed since it's
- * possible that no single user task uses more than 0.1% of memory and
- * no single admin tasks uses more than 3.0%.
- */
- if (points <= 0)
- return 1;
- /* Cap the result at 1000. */
- return (points < 1000) ? points : 1000;
- }
oom_kill_process():
- static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
- unsigned int points, unsigned long totalpages,
- struct mem_cgroup *memcg, nodemask_t *nodemask,
- const char *message)
- {
- struct task_struct *victim = p; /* task that will actually be killed; may be replaced by a child below */
- struct task_struct *child;
- struct task_struct *t = p;
- struct mm_struct *mm;
- unsigned int victim_points = 0;
- static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL,
- DEFAULT_RATELIMIT_BURST);
-
- /*
- * If the task is already exiting, don't alarm the sysadmin or kill
- * its children or threads, just set TIF_MEMDIE so it can die quickly
- */
- if (p->flags & PF_EXITING) {
- set_tsk_thread_flag(p, TIF_MEMDIE);
- return;
- }
-
- if (__ratelimit(&oom_rs))
- dump_header(p, gfp_mask, order, memcg, nodemask);
-
- task_lock(p);
- pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
- message, task_pid_nr(p), p->comm, points);
- task_unlock(p);
-
- /* Among the children whose mm differs from the selected process's mm,
- find the one with the highest points and kill it in place of the
- parent. So when a process has several children that use a lot of
- memory, a child may be killed while the parent stays alive. */
- do {
- list_for_each_entry(child, &t->children, sibling) {
- unsigned int child_points;
-
- if (child->mm == p->mm)
- continue;
- /*
- * oom_badness() returns 0 if the thread is unkillable
- */
- child_points = oom_badness(child, memcg, nodemask,
- totalpages);
- if (child_points > victim_points) {
- victim = child;
- victim_points = child_points;
- }
- }
- } while_each_thread(p, t);
-
- victim = find_lock_task_mm(victim);
- if (!victim)
- return;
-
- /* mm cannot safely be dereferenced after task_unlock(victim) */
- mm = victim->mm;
- pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
- task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
- K(get_mm_counter(victim->mm, MM_ANONPAGES)),
- K(get_mm_counter(victim->mm, MM_FILEPAGES)));
- task_unlock(victim);
-
- /*
- Every process that has the same mm, i.e. shares memory with the victim
- chosen above, is killed together with it (except kernel threads, the
- victim's own thread group, and tasks at OOM_SCORE_ADJ_MIN). */
- for_each_process(p)
- if (p->mm == mm && !same_thread_group(p, victim) &&
- !(p->flags & PF_KTHREAD)) {
- if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
- continue;
-
- task_lock(p); /* Protect ->comm from prctl() */
- pr_err("Kill process %d (%s) sharing same memory\n",
- task_pid_nr(p), p->comm);
- task_unlock(p);
- /* Send the SIGKILL signal. */
- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
- }
-
- set_tsk_thread_flag(victim, TIF_MEMDIE);
- do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
- }
所以,out_of_memory()做的任务就是遍历系统全部进程,然后根据内存使用情况以及oom_score_adj的值计算得到一个point, 最终将最高point的task给kill掉。
相关知识:
1. malloc 可能触发 OOM killer,可参考:
http://blog.dccmx.com/2011/04/oom-killer-on-linux
2. OOM killer 只针对 low memory 部分进行管理和计算,即使 high memory 还有很多空闲内存也是如此。
3. 进程rss的计算可参考此文:
http://filwmm1314.blog.163.com/blog/static/2182591920121016541582/
4. 影响到oom killer行为的文件有:
/proc/sys/vm/overcommit_memory
/proc/sys/vm/panic_on_oom
/proc/sys/vm/oom_kill_allocating_task
/proc/<pid>/oom_score_adj
2013/04/27