Android U Lmkd源码解析

发布于:2025-09-03 ⋅ 阅读:(16) ⋅ 点赞:(0)

Android 10 及更高版本支持新的 lmkd 模式,它使用内核压力失速信息 (PSI) 监视器来检测内存压力。上游内核中的 PSI 补丁程序集(已向后移植到 4.9 和 4.14 内核)可测量由于内存不足导致任务延迟的时间。由于这些延迟会直接影响用户体验,因此它们代表了确定内存压力严重性的便捷指标。上游内核还包括 PSI 监视器,这类监视器允许特权用户空间进程(例如 lmkd)指定这些延迟的阈值,并在突破阈值时从内核订阅事件。

接下来参考如下android U源码进行lmkd解读

基本介绍其相关分析可以参考:Android Lmkd 低内存终止守护程序-CSDN博客

1.lmkd进程启动和初始化

1.1 init.rc

lmkd由init进程启动,在系统中作为一个单独的进程存在。

// system/core/rootdir/init.rc
    // ...
    # Start lmkd before any other services run so that it can register them
    write /proc/sys/vm/watermark_boost_factor 0
    chown root system /sys/module/lowmemorykiller/parameters/adj
    chmod 0664 /sys/module/lowmemorykiller/parameters/adj
    chown root system /sys/module/lowmemorykiller/parameters/minfree
    chmod 0664 /sys/module/lowmemorykiller/parameters/minfree
    start lmkd // 启动lmkd
    // ...

1.2 lmkd.rc

// system/memory/lmkd/lmkd.rc
service lmkd /system/bin/lmkd
    class core
    user lmkd
    group lmkd system readproc
    capabilities DAC_OVERRIDE KILL IPC_LOCK SYS_NICE SYS_RESOURCE
    critical
    socket lmkd seqpacket+passcred 0660 system system
    task_profiles ServiceCapacityLow

on property:lmkd.reinit=1
    exec_background /system/bin/lmkd --reinit

# reinitialize lmkd after device finished booting if experiments set any flags during boot
on property:sys.boot_completed=1 && property:lmkd.reinit=0
    setprop lmkd.reinit 1

# properties most likely to be used in experiments
# setting persist.device_config.* property either triggers immediate lmkd re-initialization
# if the device finished booting or sets lmkd.reinit=0 to re-initialize lmkd after boot completes
on property:persist.device_config.lmkd_native.debug=*
    setprop lmkd.reinit ${sys.boot_completed:-0}

on property:persist.device_config.lmkd_native.kill_heaviest_task=*
    setprop lmkd.reinit ${sys.boot_completed:-0}

on property:persist.device_config.lmkd_native.kill_timeout_ms=*
    setprop lmkd.reinit ${sys.boot_completed:-0}

on property:persist.device_config.lmkd_native.swap_free_low_percentage=*
    setprop lmkd.reinit ${sys.boot_completed:-0}

on property:persist.device_config.lmkd_native.psi_partial_stall_ms=*
    setprop lmkd.reinit ${sys.boot_completed:-0}

on property:persist.device_config.lmkd_native.psi_complete_stall_ms=*
    setprop lmkd.reinit ${sys.boot_completed:-0}

on property:persist.device_config.lmkd_native.thrashing_limit=*
    setprop lmkd.reinit ${sys.boot_completed:-0}

on property:persist.device_config.lmkd_native.thrashing_limit_decay=*
    setprop lmkd.reinit ${sys.boot_completed:-0}

on property:persist.device_config.lmkd_native.thrashing_limit_critical=*
    setprop lmkd.reinit ${sys.boot_completed:-0}

on property:persist.device_config.lmkd_native.swap_util_max=*
    setprop lmkd.reinit ${sys.boot_completed:-0}

on property:persist.device_config.lmkd_native.filecache_min_kb=*
    setprop lmkd.reinit ${sys.boot_completed:-0}

1.3 lmkd.cpp

启动时直接运行lmkd.cpp中的main函数。main函数中,逻辑较清楚,更新参数,创建logger,之后在if中进行init,之后在mainloop()中循环等待。

int main(int argc, char **argv) {
    if ((argc > 1) && argv[1] && !strcmp(argv[1], "--reinit")) {
        if (property_set(LMKD_REINIT_PROP, "")) {
            ALOGE("Failed to reset " LMKD_REINIT_PROP " property");
        }
        return issue_reinit();
    }
    // 更新参数 见1.3.1
    if (!update_props()) {
        ALOGE("Failed to initialize props, exiting.");
        return -1;
    }
    // 创建logger
    ctx = create_android_logger(KILLINFO_LOG_TAG);

    if (!init()) {
        if (!use_inkernel_interface) {
            /*
             * MCL_ONFAULT pins pages as they fault instead of loading
             * everything immediately all at once. (Which would be bad,
             * because as of this writing, we have a lot of mapped pages we
             * never use.) Old kernels will see MCL_ONFAULT and fail with
             * EINVAL; we ignore this failure.
             *
             * N.B. read the man page for mlockall. MCL_CURRENT | MCL_ONFAULT
             * pins ⊆ MCL_CURRENT, converging to just MCL_CURRENT as we fault
             * in pages.
             */
            /* CAP_IPC_LOCK required */
            if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) && (errno != EINVAL)) {
                ALOGW("mlockall failed %s", strerror(errno));
            }

            /* CAP_NICE required */
            struct sched_param param = {
                    .sched_priority = 1,
            };
            if (sched_setscheduler(0, SCHED_FIFO, &param)) {
                ALOGW("set SCHED_FIFO failed %s", strerror(errno));
            }
        }

        if (init_reaper()) {
            ALOGI("Process reaper initialized with %d threads in the pool",
                reaper.thread_cnt());
        }

        if (!watchdog.init()) {
            ALOGE("Failed to initialize the watchdog");
        }
        //在mainloop()中循环等待,见1.3.3
        mainloop();
    }

1.3.1 update_props()

update_props函数中主要是使用GET_LMK_PROPERTY从属性中获取各个参数配置,参数更新,例如从参数中获取low 、medium、critical三种压力等级下,可以kill的adj等级。

static bool update_props() {
    /* By default disable low level vmpressure events */
    level_oomadj[VMPRESS_LEVEL_LOW] =
        GET_LMK_PROPERTY(int32, "low", OOM_SCORE_ADJ_MAX + 1);
    level_oomadj[VMPRESS_LEVEL_MEDIUM] =
        GET_LMK_PROPERTY(int32, "medium", 800);
    level_oomadj[VMPRESS_LEVEL_CRITICAL] =
        GET_LMK_PROPERTY(int32, "critical", 0);
    debug_process_killing = GET_LMK_PROPERTY(bool, "debug", false);

    /* By default disable upgrade/downgrade logic */
    enable_pressure_upgrade =
        GET_LMK_PROPERTY(bool, "critical_upgrade", false);
    upgrade_pressure =
        (int64_t)GET_LMK_PROPERTY(int32, "upgrade_pressure", 100);
    downgrade_pressure =
        (int64_t)GET_LMK_PROPERTY(int32, "downgrade_pressure", 100);
    kill_heaviest_task =
        GET_LMK_PROPERTY(bool, "kill_heaviest_task", false);
    low_ram_device = property_get_bool("ro.config.low_ram", false);
    kill_timeout_ms =
        (unsigned long)GET_LMK_PROPERTY(int32, "kill_timeout_ms", 100);
    use_minfree_levels =
        GET_LMK_PROPERTY(bool, "use_minfree_levels", false);
    per_app_memcg =
        property_get_bool("ro.config.per_app_memcg", low_ram_device);
    swap_free_low_percentage = clamp(0, 100, GET_LMK_PROPERTY(int32, "swap_free_low_percentage",
        DEF_LOW_SWAP));
    psi_partial_stall_ms = GET_LMK_PROPERTY(int32, "psi_partial_stall_ms",
        low_ram_device ? DEF_PARTIAL_STALL_LOWRAM : DEF_PARTIAL_STALL);
    psi_complete_stall_ms = GET_LMK_PROPERTY(int32, "psi_complete_stall_ms",
        DEF_COMPLETE_STALL);
    thrashing_limit_pct =
            std::max(0, GET_LMK_PROPERTY(int32, "thrashing_limit",
                                         low_ram_device ? DEF_THRASHING_LOWRAM : DEF_THRASHING));
    thrashing_limit_decay_pct = clamp(0, 100, GET_LMK_PROPERTY(int32, "thrashing_limit_decay",
        low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY));
    thrashing_critical_pct = std::max(
            0, GET_LMK_PROPERTY(int32, "thrashing_limit_critical", thrashing_limit_pct * 2));
    swap_util_max = clamp(0, 100, GET_LMK_PROPERTY(int32, "swap_util_max", 100));
    filecache_min_kb = GET_LMK_PROPERTY(int64, "filecache_min_kb", 0);
    stall_limit_critical = GET_LMK_PROPERTY(int64, "stall_limit_critical", 100);

    reaper.enable_debug(debug_process_killing);

    /* Call the update props hook */
    if (!lmkd_update_props_hook()) {
        ALOGE("Failed to update LMKD hook props.");
        return false;
    }

    return true;
}

GET_LMK_PROPERTY是一个宏定义,用来读取ro.lmk参数

#define GET_LMK_PROPERTY(type, name, def) \
    property_get_##type("persist.device_config.lmkd_native." name, \
        property_get_##type("ro.lmk." name, def))

1.3.2 init()初始化过程

1.3.2.1 创建epoll监听

init中比较重要的一步是创建epoll监听,这里有宏定义MAX_EPOLL_EVENTS是10,也就是epoll监听了10个event。

/* max supported number of data connections (AMS, init, tests) */
/* 支持的最大数据连接数(AMS、init、测试) */
#define MAX_DATA_CONN 3

/*
 * 1 ctrl listen socket, 3 ctrl data socket, 3 memory pressure levels,
 * 1 lmk events + 1 fd to wait for process death + 1 fd to receive kill failure notifications
 * 
 * 1个控制监听socket,3个控制数据通信socket,3个内存压力等级,1个lmk时间,1个监控进程死亡,1个接收kill失败通知
 */
#define MAX_EPOLL_EVENTS (1 + MAX_DATA_CONN + VMPRESS_LEVEL_COUNT + 1 + 1 + 1)

static int epollfd;

static int init(void) {
    static struct event_handler_info kernel_poll_hinfo = { 0, kernel_event_handler };
    struct reread_data file_data = {
        .filename = ZONEINFO_PATH,
        .fd = -1,
    };
    struct epoll_event epev;
    int pidfd;
    int i;
    int ret;

    page_k = sysconf(_SC_PAGESIZE);
    if (page_k == -1)
        page_k = PAGE_SIZE;
    page_k /= 1024;
    //
    epollfd = epoll_create(MAX_EPOLL_EVENTS);
    if (epollfd == -1) {
        ALOGE("epoll_create failed (errno=%d)", errno);
        return -1;
    }

    // mark data connections as not connected
    for (int i = 0; i < MAX_DATA_CONN; i++) {
        data_sock[i].sock = -1;
    }

    ctrl_sock.sock = android_get_control_socket("lmkd");
    if (ctrl_sock.sock < 0) {
        ALOGE("get lmkd control socket failed");
        return -1;
    }
    //监听lmkd socket
    ret = listen(ctrl_sock.sock, MAX_DATA_CONN);
    if (ret < 0) {
        ALOGE("lmkd control socket listen failed (errno=%d)", errno);
        return -1;
    }

    epev.events = EPOLLIN;
    //见1.3.3.2
    ctrl_sock.handler_info.handler = ctrl_connect_handler;
    epev.data.ptr = (void *)&(ctrl_sock.handler_info);
    if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ctrl_sock.sock, &epev) == -1) {
        ALOGE("epoll_ctl for lmkd control socket failed (errno=%d)", errno);
        return -1;
    }
    maxevents++;

例如ams会作为socket客户端,通过/dev/socket/lmkd与lmkd进行socket通信,将进程的adj通知到lmkd,并由lmkd写入"/proc/[pid]/oom_score_adj"路径。

1.3.2.2 初始化lmkd触发方式

接下来init函数需要决定lmkd的触发方式,早期的lmk使用内核驱动的方式,这里通过access确认旧的节点是否还存在(kernel 4.12已废弃)。不支持的话就是执行 init_monitors()。

    has_inkernel_module = !access(INKERNEL_MINFREE_PATH, W_OK);
    use_inkernel_interface = has_inkernel_module;

    if (use_inkernel_interface) 
         // 大多内核已不支持{
        ALOGI("Using in-kernel low memory killer interface");
        if (init_poll_kernel()) {
            epev.events = EPOLLIN;
            epev.data.ptr = (void*)&kernel_poll_hinfo;
            if (epoll_ctl(epollfd, EPOLL_CTL_ADD, kpoll_fd, &epev) != 0) {
                ALOGE("epoll_ctl for lmk events failed (errno=%d)", errno);
                close(kpoll_fd);
                kpoll_fd = -1;
            } else {
                maxevents++;
                /* let the others know it does support reporting kills */
                property_set("sys.lmk.reportkills", "1");
            }
        }
    } else {
        if (!init_monitors()) {
            return -1;
        }
        /* let the others know it does support reporting kills */
        property_set("sys.lmk.reportkills", "1");
    }

注意初始化监控器这里,有4个看起来很像的函数,分别是init_monitors()、init_psi_monitors()、init_mp_psi()、init_psi_monitor(),注意区分。

看代码,在init_monitors()函数中,要确认使用PSI触发还是vmpressure触发;在“ro.lmk. use_psi”属性为true的情况下,调用 init_psi_monitors 初始化PSI监控器,失败才会使用init_mp_common初始化vmpressure监控器,这里可以看出lmkd还是倾向于优先使用PSI触发。

static bool init_monitors() {
    /* 在内核支持的情况下,尽量使用PSI监控器 */
    use_psi_monitors = GET_LMK_PROPERTY(bool, "use_psi", true) &&
        init_psi_monitors();

    /* PSI监控器初始化失败,回退到vmpressure触发 */
    if (!use_psi_monitors &&
        (!init_mp_common(VMPRESS_LEVEL_LOW) ||
        !init_mp_common(VMPRESS_LEVEL_MEDIUM) ||
        !init_mp_common(VMPRESS_LEVEL_CRITICAL))) {
        ALOGE("Kernel does not support memory pressure events or in-kernel low memory killer");
        return false;
    }
    if (use_psi_monitors) {
        ALOGI("Using psi monitors for memory pressure detection");
    } else {
        ALOGI("Using vmpressure for memory pressure detection");
    }
    return true;
}

(1)PSI触发

接下来看调用init_psi_monitors() 初始化PSI监控器,在明确设置属性use_new_strategy为true的情况下,或低内存设备,或明确use_minfree_levels为false的情况下,都是倾向于使用“新的策略”。这里新的策略其实指的是在PSI触发之后,是根据free page的情况(水线)去查杀进程,还是根据不同PSI压力去查杀进程,前者就是旧策略,后者为新策略;个人认为这里用“新旧”去区分非常不优雅。

注意这里新旧的策略,是依据PSI压力杀进程还是依据水线杀进程,但都不影响这里是设置的是PSI监控器,即触发仍然还是用PSI触发的,是杀进程的方式存在不同。

static bool init_psi_monitors() {

    bool use_new_strategy =
        GET_LMK_PROPERTY(bool, "use_new_strategy", low_ram_device || !use_minfree_levels);

    /* 在默认 PSI模式下,使用系统属性覆盖 psi stall阈值 */
    if (use_new_strategy) {
        /* Do not use low pressure level */
        psi_thresholds[VMPRESS_LEVEL_LOW].threshold_ms = 0;
        psi_thresholds[VMPRESS_LEVEL_MEDIUM].threshold_ms = psi_partial_stall_ms;
        psi_thresholds[VMPRESS_LEVEL_CRITICAL].threshold_ms = psi_complete_stall_ms;
    }

    if (!init_mp_psi(VMPRESS_LEVEL_LOW, use_new_strategy)) {
        return false;
    }
    if (!init_mp_psi(VMPRESS_LEVEL_MEDIUM, use_new_strategy)) {
        destroy_mp_psi(VMPRESS_LEVEL_LOW);
        return false;
    }
    if (!init_mp_psi(VMPRESS_LEVEL_CRITICAL, use_new_strategy)) {
        destroy_mp_psi(VMPRESS_LEVEL_MEDIUM);
        destroy_mp_psi(VMPRESS_LEVEL_LOW);
        return false;
    }
    return true;
}

决定好新旧策略后,接下来调用init_mp_psi来初始化各个等级的PSI事件。

init_mp_psi有两个参数,第一个是压力等级,第二个新旧策略的标志位。注意第一个参数的命名是“vmpressure_level”,尽管是“vmpressure”,但实际这里用PSI触发,是根据PSI来判断内存压力等级的,和前面说的vmpressure判断内存压力等级并非同一个“vmpressure”,这是第二个我认为代码非常不优雅的地方,容易引起歧义。vmpressure全称是虚拟内存压力,难道设计者的想法中,PSI所产生的stall ms也是一种虚拟的内存压力?

static bool init_mp_psi(enum vmpressure_level level, bool use_new_strategy) {
    int fd;

    /* Do not register a handler if threshold_ms is not set */
    if (!psi_thresholds[level].threshold_ms) {
        return true;
    }

    fd = init_psi_monitor(psi_thresholds[level].stall_type,
        psi_thresholds[level].threshold_ms * US_PER_MS,
        PSI_WINDOW_SIZE_MS * US_PER_MS);

    if (fd < 0) {
        return false;
    }

    vmpressure_hinfo[level].handler = use_new_strategy ? mp_event_psi : mp_event_common;
    vmpressure_hinfo[level].data = level;
    if (register_psi_monitor(epollfd, fd, &vmpressure_hinfo[level]) < 0) {
        destroy_psi_monitor(fd);
        return false;
    }
    maxevents++;
    mpevfd[level] = fd;

    return true;
}

注意这里的init_psi_monitor和前面的init_psi_monitors做区分,init_psi_monitor是定义在system/memory/lmkd/libpsi/psi.cpp中的,它的作用是根据stall类型、阈值、窗口大小,获取epoll监听的句柄。

然后最重要的就是vmpressure_hinfo[level].handler,其根据是否使用新策略,决定了在这个压力等级事件发生时,要调用的是mp_event_psi还是mp_event_common。也就是使用新策略的情况下,当这个压力事件到来时,会调用mp_event_psi。mp_event_psi

后面register_psi_monitor则是epoll监听压力事件。

至此可以认为init_psi_monitors()也就是PSI监控器初始化完成,各个压力事件发生时,会调用mp_event_psi。

(2)vmpressure触发

  • 由于现在大部分Android机型均使用PSI触发,vmpressure触发这部分暂略过。
  • init中除了init_monitors()还有其他一些初始化过程,也先略过。use_inkernel_interface

1.3.3 mainloop



static void mainloop(void) {
    struct event_handler_info* handler_info;
    struct polling_params poll_params;
    struct timespec curr_tm;
    struct epoll_event *evt;
    long delay = -1;

    poll_params.poll_handler = NULL;
    poll_params.paused_handler = NULL;

    while (1) {
        struct epoll_event events[MAX_EPOLL_EVENTS];
        int nevents;
        int i;

        if (poll_params.poll_handler) {
            bool poll_now;

            clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
            if (poll_params.update == POLLING_RESUME) {
                /* Just transitioned into POLLING_RESUME, poll immediately. */
                poll_now = true;
                nevents = 0;
            } else {
                /* Calculate next timeout */
                计算下一次超时时间
                delay = get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm);
                delay = (delay < poll_params.polling_interval_ms) ?
                    poll_params.polling_interval_ms - delay : poll_params.polling_interval_ms;

                /* Wait for events until the next polling timeout */
                //等待epollfd上的事件
                nevents = epoll_wait(epollfd, events, maxevents, delay);

                /* Update current time after wait */
                clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
                poll_now = (get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm) >=
                    poll_params.polling_interval_ms);
            }
            if (poll_now) {
                call_handler(poll_params.poll_handler, &poll_params, 0);
            }
        } else {
            if (kill_timeout_ms && is_waiting_for_kill()) {
                clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
                delay = kill_timeout_ms - get_time_diff_ms(&last_kill_tm, &curr_tm);
                /* Wait for pidfds notification or kill timeout to expire */
                nevents = (delay > 0) ? epoll_wait(epollfd, events, maxevents, delay) : 0;
                if (nevents == 0) {
                    /* Kill notification timed out */
                    stop_wait_for_proc_kill(false);
                    if (polling_paused(&poll_params)) {
                        clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
                        poll_params.update = POLLING_RESUME;
                        resume_polling(&poll_params, curr_tm);
                    }
                }
            } else {
                /* Wait for events with no timeout */
                //等待没有超时的事件
                nevents = epoll_wait(epollfd, events, maxevents, -1);
            }
        }

        if (nevents == -1) {
            if (errno == EINTR)
                continue;
            ALOGE("epoll_wait failed (errno=%d)", errno);
            continue;
        }

        /*
         * First pass to see if any data socket connections were dropped.
         * Dropped connection should be handled before any other events
         * to deallocate data connection and correctly handle cases when
         * connection gets dropped and reestablished in the same epoll cycle.
         * In such cases it's essential to handle connection closures first.
         */
        for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
            // 如果是 EPOLLHUP 并且 data.ptr 存在
            if ((evt->events & EPOLLHUP) && evt->data.ptr) {
                ALOGI("lmkd data connection dropped");
                // 获取 event_handler_info 结构体
                handler_info = (struct event_handler_info*)evt->data.ptr;
                watchdog.start();
                // 处理连接关闭事件
                ctrl_data_close(handler_info->data);
                watchdog.stop();
            }
        }

        /* Second pass to handle all other events */
        //第二遍处理所有其他事件
        for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
            if (evt->events & EPOLLERR) {
                ALOGD("EPOLLERR on event #%d", i);
            }
            if (evt->events & EPOLLHUP) {
                /* This case was handled in the first pass */
                // 已在第一次循环处理
                continue;
            }
             // 如果 data.ptr 有效
            if (evt->data.ptr) {
                // 获取 event_handler_info 结构体
                handler_info = (struct event_handler_info*)evt->data.ptr;
                // 调用处理函数,见1.3.3.1
                call_handler(handler_info, &poll_params, evt->events);
            }
        }
    }
}
1.3.3.1call_handler
static void call_handler(struct event_handler_info* handler_info,
                         struct polling_params *poll_params, uint32_t events) {
    struct timespec curr_tm;

    watchdog.start();
    poll_params->update = POLLING_DO_NOT_CHANGE;
    //这里的 handler_info->handler 就是 ctrl_connect_handler,它在 init() 中被设置。
    handler_info->handler(handler_info->data, events, poll_params);
    clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
    if (poll_params->poll_handler == handler_info) {
        poll_params->last_poll_tm = curr_tm;
    }

    switch (poll_params->update) {
    case POLLING_START:
        /*
         * Poll for the duration of PSI_WINDOW_SIZE_MS after the
         * initial PSI event because psi events are rate-limited
         * at one per sec.
         */
        poll_params->poll_start_tm = curr_tm;
        poll_params->poll_handler = handler_info;
        break;
    case POLLING_PAUSE:
        poll_params->paused_handler = handler_info;
        poll_params->poll_handler = NULL;
        break;
    case POLLING_RESUME:
        resume_polling(poll_params, curr_tm);
        break;
    case POLLING_DO_NOT_CHANGE:
        if (poll_params->poll_handler &&
            get_time_diff_ms(&poll_params->poll_start_tm, &curr_tm) > PSI_WINDOW_SIZE_MS) {
            /* Polled for the duration of PSI window, time to stop */
            poll_params->poll_handler = NULL;
        }
        break;
    }
    watchdog.stop();
}
1.3.3.2 ctrl_connect_handler
static void ctrl_connect_handler(int data __unused, uint32_t events __unused,
                                 struct polling_params *poll_params __unused) {
    struct epoll_event epev;
    int free_dscock_idx = get_free_dsock();

    if (free_dscock_idx < 0) {
        /*
         * Number of data connections exceeded max supported. This should not
         * happen but if it does we drop all existing connections and accept
         * the new one. This prevents inactive connections from monopolizing
         * data socket and if we drop ActivityManager connection it will
         * immediately reconnect.
         */
        for (int i = 0; i < MAX_DATA_CONN; i++) {
            ctrl_data_close(i);
        }
        free_dscock_idx = 0;
    }

    data_sock[free_dscock_idx].sock = accept(ctrl_sock.sock, NULL, NULL);
    if (data_sock[free_dscock_idx].sock < 0) {
        ALOGE("lmkd control socket accept failed; errno=%d", errno);
        return;
    }

    ALOGI("lmkd data connection established");
    /* use data to store data connection idx */
    data_sock[free_dscock_idx].handler_info.data = free_dscock_idx;
    //当事件触发,则调用ctrl_data_handler
    data_sock[free_dscock_idx].handler_info.handler = ctrl_data_handler;
    data_sock[free_dscock_idx].async_event_mask = 0;
    epev.events = EPOLLIN;
    epev.data.ptr = (void *)&(data_sock[free_dscock_idx].handler_info);
    if (epoll_ctl(epollfd, EPOLL_CTL_ADD, data_sock[free_dscock_idx].sock, &epev) == -1) {
        ALOGE("epoll_ctl for data connection socket failed; errno=%d", errno);
        ctrl_data_close(free_dscock_idx);
        return;
    }
    maxevents++;
}
1.3.3.3 ctrl_data_handler
static void ctrl_data_handler(int data, uint32_t events,
                              struct polling_params *poll_params __unused) {
    if (events & EPOLLIN) {
        ctrl_command_handler(data);
    }
}
1.3.3.4 ctrl_command_handler
static void ctrl_command_handler(int dsock_idx) {
    LMKD_CTRL_PACKET packet;
    struct ucred cred;
    int len;
    enum lmk_cmd cmd;
    int nargs;
    int targets;
    int kill_cnt;
    int result;

    len = ctrl_data_read(dsock_idx, (char *)packet, CTRL_PACKET_MAX_SIZE, &cred);
    if (len <= 0)
        return;

    if (len < (int)sizeof(int)) {
        ALOGE("Wrong control socket read length len=%d", len);
        return;
    }

    cmd = lmkd_pack_get_cmd(packet);
    nargs = len / sizeof(int) - 1;
    if (nargs < 0)
        goto wronglen;

    switch(cmd) {
    case LMK_TARGET:
        targets = nargs / 2;
        if (nargs & 0x1 || targets > (int)lowmem_adj.size()) {
            goto wronglen;
        }
        cmd_target(targets, packet);
        break;
    case LMK_PROCPRIO:
        /* process type field is optional for backward compatibility */
        if (nargs < 3 || nargs > 4)
            goto wronglen;
        //见1.3.3.5 设置进程adj
        cmd_procprio(packet, nargs, &cred);
        break;
    case LMK_PROCREMOVE:
        if (nargs != 1)
            goto wronglen;
        cmd_procremove(packet, &cred);
        break;
    case LMK_PROCPURGE:
        if (nargs != 0)
            goto wronglen;
        cmd_procpurge(&cred);
        break;
    case LMK_GETKILLCNT:
        if (nargs != 2)
            goto wronglen;
        kill_cnt = cmd_getkillcnt(packet);
        len = lmkd_pack_set_getkillcnt_repl(packet, kill_cnt);
        if (ctrl_data_write(dsock_idx, (char *)packet, len) != len)
            return;
        break;
    case LMK_SUBSCRIBE:
        if (nargs != 1)
            goto wronglen;
        cmd_subscribe(dsock_idx, packet);
        break;
    case LMK_PROCKILL:
        /* This command code is NOT expected at all */
        ALOGE("Received unexpected command code %d", cmd);
        break;
    case LMK_UPDATE_PROPS:
        if (nargs != 0)
            goto wronglen;
        result = -1;
        if (update_props()) {
            if (!use_inkernel_interface) {
                /* Reinitialize monitors to apply new settings */
                destroy_monitors();
                if (init_monitors()) {
                    result = 0;
                }
            } else {
                result = 0;
            }
        }

        len = lmkd_pack_set_update_props_repl(packet, result);
        if (ctrl_data_write(dsock_idx, (char *)packet, len) != len) {
            ALOGE("Failed to report operation results");
        }
        if (!result) {
            ALOGI("Properties reinitilized");
        } else {
            /* New settings can't be supported, crash to be restarted */
            ALOGE("New configuration is not supported. Exiting...");
            exit(1);
        }
        break;
    default:
        ALOGE("Received unknown command code %d", cmd);
        return;
    }

    return;

wronglen:
    ALOGE("Wrong control socket read length cmd=%d len=%d", cmd, len);
}
1.3.3.5 cmd_procprio
static void cmd_procprio(LMKD_CTRL_PACKET packet, int field_count, struct ucred *cred) {
    struct proc *procp;
    char path[LINE_MAX];
    char val[20];
    int soft_limit_mult;
    struct lmk_procprio params;
    bool is_system_server;
    struct passwd *pwdrec;
    int64_t tgid;
    char buf[PAGE_SIZE];

    lmkd_pack_get_procprio(packet, field_count, &params);

    if (params.oomadj < OOM_SCORE_ADJ_MIN ||
        params.oomadj > OOM_SCORE_ADJ_MAX) {
        ALOGE("Invalid PROCPRIO oomadj argument %d", params.oomadj);
        return;
    }

    if (params.ptype < PROC_TYPE_FIRST || params.ptype >= PROC_TYPE_COUNT) {
        ALOGE("Invalid PROCPRIO process type argument %d", params.ptype);
        return;
    }

    /* Check if registered process is a thread group leader */
    if (read_proc_status(params.pid, buf, sizeof(buf))) {
        if (parse_status_tag(buf, PROC_STATUS_TGID_FIELD, &tgid) && tgid != params.pid) {
            ALOGE("Attempt to register a task that is not a thread group leader "
                  "(tid %d, tgid %" PRId64 ")", params.pid, tgid);
            return;
        }
    }

    /* gid containing AID_READPROC required */
    /* CAP_SYS_RESOURCE required */
    /* CAP_DAC_OVERRIDE required */
    向节点/proc//oom_score_adj`写入oomadj。
    snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", params.pid);
    snprintf(val, sizeof(val), "%d", params.oomadj);
    if (!writefilestring(path, val, false)) {
        ALOGW("Failed to open %s; errno=%d: process %d might have been killed",
              path, errno, params.pid);
        /* If this file does not exist the process is dead. */
        return;
    }

    if (use_inkernel_interface) {
        stats_store_taskname(params.pid, proc_get_name(params.pid, path, sizeof(path)));
        return;
    }

    /* lmkd should not change soft limits for services */
    if (params.ptype == PROC_TYPE_APP && per_app_memcg) {
        if (params.oomadj >= 900) {
            soft_limit_mult = 0;
        } else if (params.oomadj >= 800) {
            soft_limit_mult = 0;
        } else if (params.oomadj >= 700) {
            soft_limit_mult = 0;
        } else if (params.oomadj >= 600) {
            // Launcher should be perceptible, don't kill it.
            params.oomadj = 200;
            soft_limit_mult = 1;
        } else if (params.oomadj >= 500) {
            soft_limit_mult = 0;
        } else if (params.oomadj >= 400) {
            soft_limit_mult = 0;
        } else if (params.oomadj >= 300) {
            soft_limit_mult = 1;
        } else if (params.oomadj >= 200) {
            soft_limit_mult = 8;
        } else if (params.oomadj >= 100) {
            soft_limit_mult = 10;
        } else if (params.oomadj >=   0) {
            soft_limit_mult = 20;
        } else {
            // Persistent processes will have a large
            // soft limit 512MB.
            soft_limit_mult = 64;
        }

        std::string path;
        if (!CgroupGetAttributePathForTask("MemSoftLimit", params.pid, &path)) {
            ALOGE("Querying MemSoftLimit path failed");
            return;
        }

        snprintf(val, sizeof(val), "%d", soft_limit_mult * EIGHT_MEGA);

        /*
         * system_server process has no memcg under /dev/memcg/apps but should be
         * registered with lmkd. This is the best way so far to identify it.
         */
        is_system_server = (params.oomadj == SYSTEM_ADJ &&
                            (pwdrec = getpwnam("system")) != NULL &&
                            params.uid == pwdrec->pw_uid);
        writefilestring(path.c_str(), val, !is_system_server);
    }

    procp = pid_lookup(params.pid);
    if (!procp) {
        int pidfd = -1;

        if (pidfd_supported) {
            pidfd = TEMP_FAILURE_RETRY(pidfd_open(params.pid, 0));
            if (pidfd < 0) {
                ALOGE("pidfd_open for pid %d failed; errno=%d", params.pid, errno);
                return;
            }
        }

        procp = static_cast<struct proc*>(calloc(1, sizeof(struct proc)));
        if (!procp) {
            // Oh, the irony.  May need to rebuild our state.
            return;
        }

        procp->pid = params.pid;
        procp->pidfd = pidfd;
        procp->uid = params.uid;
        procp->reg_pid = cred->pid;
        procp->oomadj = params.oomadj;
        procp->valid = true;
        proc_insert(procp);
    } else {
        if (!claim_record(procp, cred->pid)) {
            char buf[LINE_MAX];
            char *taskname = proc_get_name(cred->pid, buf, sizeof(buf));
            /* Only registrant of the record can remove it */
            ALOGE("%s (%d, %d) attempts to modify a process registered by another client",
                taskname ? taskname : "A process ", cred->uid, cred->pid);
            return;
        }
        proc_unslot(procp);
        procp->oomadj = params.oomadj;
        proc_slot(procp);
    }
}

2.mp_event_psi

mp_event_psi是PSI触发后的新策略,它可以大致分为三个部分,第一部分做一些参数和状态的计算,第二部分根据得出的状态确定查杀原因(kill_reason),第三部分选择进程进行一轮查杀。

这部分代码较多,比较重要的是通过vmstat_parse和meminfo_parse读取信息,判断thrashing、水线、swap状态等,便于下一步确认查杀原因。

static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_params) {
    enum reclaim_state {
        NO_RECLAIM = 0,
        KSWAPD_RECLAIM,
        DIRECT_RECLAIM,
    };
    //static变量,在多次进入这个函数时,这些static变量持续记录状态
    static int64_t init_ws_refault; // 记录 杀进程后 初始的 workingset_refault
    static int64_t prev_workingset_refault; // 记录上一轮的 workingset_refault
    static int64_t base_file_lru;      // 记录初始时的 文件页缓存大小
    static int64_t init_pgscan_kswapd; // 记录初始时的 kswap回收量
    static int64_t init_pgscan_direct; // 记录初始时的 直接回收量
    static bool killing; // 如果有进程被杀会被置为true
    static int thrashing_limit = thrashing_limit_pct; // 抖动的阈值,一开始由参数中获取
    static struct zone_watermarks watermarks;
    static struct timespec wmark_update_tm;
    static struct wakeup_info wi;
    static struct timespec thrashing_reset_tm;
    static int64_t prev_thrash_growth = 0;
    static bool check_filecache = false;
    static int max_thrashing = 0;	
    //一些临时变量
    union meminfo mi;  // 从 /proc/meminfo 解析
    union vmstat vs;   // 从 /proc/vmstat 解析
    struct psi_data psi_data;
    struct timespec curr_tm; // 每轮开始时记录时间
    int64_t thrashing = 0;
    bool swap_is_low = false;
    enum vmpressure_level level = (enum vmpressure_level)data;
    enum kill_reasons kill_reason = NONE;
    bool cycle_after_kill = false; // 如果上一轮有进程被杀,这一轮会被置为true
    enum reclaim_state reclaim = NO_RECLAIM;
    enum zone_watermark wmark = WMARK_NONE;
    char kill_desc[LINE_MAX];
    bool cut_thrashing_limit = false;
    int min_score_adj = 0;
    int swap_util = 0;
    int64_t swap_low_threshold;
    long since_thrashing_reset_ms;
    int64_t workingset_refault_file;
    bool critical_stall = false;
    
if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
        ALOGE("Failed to get current time");
        return;
    }

    record_wakeup_time(&curr_tm, events ? Event : Polling, &wi);

    bool kill_pending = is_kill_pending();
    if (kill_pending && (kill_timeout_ms == 0 ||
        get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast<long>(kill_timeout_ms))) {
        /* Skip while still killing a process */
        wi.skipped_wakeups++;
        goto no_kill;
    }
    /*
     * Process is dead or kill timeout is over, stop waiting. This has no effect if pidfds are
     * supported and death notification already caused waiting to stop.
     * 进程死亡或者kill超时结束,停止等待。 如果支持pidfd并且死亡通知已导致等待停止,则此操作无效。
     */
    stop_wait_for_proc_kill(!kill_pending);

    // vmstat解析
    if (vmstat_parse(&vs) < 0) {
        ALOGE("Failed to parse vmstat!");
        return;
    }

    /* 从5.9开始内核workingset_refault vmstat字段被重命名为workingset_refault_file */
    workingset_refault_file = vs.field.workingset_refault ? : vs.field.workingset_refault_file;

    // meminfo解析
    if (meminfo_parse(&mi) < 0) {
        ALOGE("Failed to parse meminfo!");
        return;
    }

    /* Reset states after process got killed */
    /* 杀进程后重置一些状态 */
    if (killing) {
        killing = false;
        cycle_after_kill = true;
        /* Reset file-backed pagecache size and refault amounts after a kill */
        base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file; // 重置 文件页 缓存大小
        init_ws_refault = workingset_refault_file;                           // 重置 refault量
        thrashing_reset_tm = curr_tm; // thrashing重置时间设置为当前时间
        prev_thrash_growth = 0;       // thrashing重置为0
    }

    /* Check free swap levels */
    /* 确认swap状态: swap_is_low */
    if (swap_free_low_percentage) { // 由属性中获取
        swap_low_threshold = mi.field.total_swap * swap_free_low_percentage / 100;
        swap_is_low = get_free_swap(&mi) < swap_low_threshold;  // free swap低于 XX%,认为swap较低
    } else {
        swap_low_threshold = 0;
    }

    /* Identify reclaim state */
    /* 确认回收状态: reclaim */
    if (vs.field.pgscan_direct != init_pgscan_direct) { // pgscan_direct发生了变化,说明发生了【DIRECT_RECLAIM】
        init_pgscan_direct = vs.field.pgscan_direct;
        init_pgscan_kswapd = vs.field.pgscan_kswapd;
        reclaim = DIRECT_RECLAIM;
    } else if (vs.field.pgscan_kswapd != init_pgscan_kswapd) { // kswapd回收量发生变化,发生了【KSWAPD_RECLAIM】
        init_pgscan_kswapd = vs.field.pgscan_kswapd;
        reclaim = KSWAPD_RECLAIM;
    } else if (workingset_refault_file == prev_workingset_refault) {
        /*
         * Device is not thrashing and not reclaiming, bail out early until we see these stats
         * changing
         * 设备没有抖动也没有回收,该轮不查杀,直到我们看到这些统计数据发生变化
         */
        goto no_kill;
    }

    prev_workingset_refault = workingset_refault_file;

    /*
    * It's possible we fail to find an eligible process to kill (ex. no process is
    * above oom_adj_min). When this happens, we should retry to find a new process
    * for a kill whenever a new eligible process is available.
    * 有可能我们找不到合适的进程来终止(例如,没有进程高于 oom_adj_min)。
    * 这种情况下,只要有新的符合条件的进程可用,我们就应该重试寻找新的进程进行kill。
    * 
    * This is especially important for a slow growing refault case. 
    * 这对于增长缓慢的缺页场景尤为重要。
    * 
    * While retrying, we should keep monitoring new thrashing counter 
    * as someone could release the memory to mitigate the thrashing. 
    * 在重试时,我们应该继续监视新的抖动计数器(thrashing counter),因为有人可以释放内存来减轻抖动。
    * 
    * Thus, when thrashing reset window comes, 
    * we decay the prev thrashing counter by window counts. 
    * 因此,当抖动重置窗口到来时,我们通过窗口计数衰减前一个抖动计数器。
    * 
    * If the counter is still greater than thrashing limit,
    * we preserve the current prev_thrash counter so we will retry kill again. 
    * 如果计数器仍然大于抖动限制,我们将保留当前的 prev_thrash 计数器,以便我们再次重试 kill。
    * 
    * Otherwise, we reset the prev_thrash counter so we will stop retrying.
    * 否则,我们重置 prev_thrash 计数器以停止重试。
    */

    // 从thrashing重置 到 现在 的时间差,注意如果上一轮查杀过,时间会被重置,时间差=0
    since_thrashing_reset_ms = get_time_diff_ms(&thrashing_reset_tm, &curr_tm);
    if (since_thrashing_reset_ms > THRASHING_RESET_INTERVAL_MS) {
        long windows_passed;
        /* Calculate prev_thrash_growth if we crossed THRASHING_RESET_INTERVAL_MS */
        /* 在超过thrashing reset间隔时间的情况下,计算上一次thrash增长 */
        prev_thrash_growth = (workingset_refault_file - init_ws_refault) * 100
                            / (base_file_lru + 1);          // 新增缺页数量 / 总文件页数量 * 100
        // 代表超过了多少个thrashing reset窗口
        windows_passed = (since_thrashing_reset_ms / THRASHING_RESET_INTERVAL_MS);

        /*
         * Decay prev_thrashing unless over-the-limit thrashing was registered in the window we
         * just crossed, which means there were no eligible processes to kill. We preserve the
         * counter in that case to ensure a kill if a new eligible process appears.
         * 
         * 减少 prev_thrashing 除非 在我们刚刚越过的窗口中 注册了超过限制的抖动,这意味着没有符合条件的进程可以杀死。 
         * 在这种情况下,我们保留计数器以确保在出现新的合格进程时终止。
         */
        if (windows_passed > 1 || prev_thrash_growth < thrashing_limit) {
            prev_thrash_growth >>= windows_passed;
        }

        /* Record file-backed pagecache size when crossing THRASHING_RESET_INTERVAL_MS */
        /* 超过THRASHING_RESET_INTERVAL_MS时,记录 文件页数量 */
        // 实际看这里是重置了 文件页大小、refault数量、抖动重置时间、抖动阈值
        base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
        init_ws_refault = workingset_refault_file;
        thrashing_reset_tm = curr_tm;        // thrashing重置
        thrashing_limit = thrashing_limit_pct;
    } else {
        /* Calculate what % of the file-backed pagecache refaulted so far */
        // 上一轮发生过查杀,或thrashing刚重置没多久,就计算到目前为止,文件页缓存发生缺页的百分比
        thrashing = (workingset_refault_file - init_ws_refault) * 100 / (base_file_lru + 1);
    }
    /* Add previous cycle's decayed thrashing amount */
    // 累加上一轮的thrashing衰减量
    thrashing += prev_thrash_growth;
    if (max_thrashing < thrashing) {
        max_thrashing = thrashing;
    }

    /*
     * Refresh watermarks once per min in case user updated one of the margins.
     * 每 60s 刷新一次水线
     *
     * TODO: b/140521024 replace this periodic update with an API for AMS to notify LMKD
     * that zone watermarks were changed by the system software.
     * TODO: 使用 AMS的API 替换 此定时更新,以通知 LMKD 水线已被系统软件更改。
     */
    if (watermarks.high_wmark == 0 || get_time_diff_ms(&wmark_update_tm, &curr_tm) > 60000) {
        struct zoneinfo zi;

        // 进行一次zoninfo解析
        if (zoneinfo_parse(&zi) < 0) {
            ALOGE("Failed to parse zoneinfo!");
            return;
        }

        // 计算zone水线,看这个函数把各个zone的水线都加了起来,存到watermarks里,见2.1
        calc_zone_watermarks(&zi, &watermarks);
        wmark_update_tm = curr_tm;
    }

    /* Find out which watermark is breached if any */
    // 确认到了哪个水线等级
    wmark = get_lowest_watermark(&mi, &watermarks);

    /* 从/proc/pressure/memory解析 psi 数据,确认是否达到 critical 等级 */
    if (!psi_parse_mem(&psi_data)) {
        critical_stall = psi_data.mem_stats[PSI_FULL].avg10 > (float)stall_limit_critical;
    }

2.1 calc_zone_watermarks

void calc_zone_watermarks(struct zoneinfo *zi, struct zone_watermarks *watermarks) {
    memset(watermarks, 0, sizeof(struct zone_watermarks));

    for (int node_idx = 0; node_idx < zi->node_count; node_idx++) {
        struct zoneinfo_node *node = &zi->nodes[node_idx];
        for (int zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
            struct zoneinfo_zone *zone = &node->zones[zone_idx];

            if (!zone->fields.field.present) {
                continue;
            }

            watermarks->high_wmark += zone->max_protection + zone->fields.field.high;
            watermarks->low_wmark += zone->max_protection + zone->fields.field.low;
            watermarks->min_wmark += zone->max_protection + zone->fields.field.min;
        }
    }
}

2.2 get_lowest_watermark

static enum zone_watermark get_lowest_watermark(union meminfo *mi,
                                                struct zone_watermarks *watermarks)
{
    int64_t nr_free_pages = mi->field.nr_free_pages - mi->field.cma_free;

    if (nr_free_pages < watermarks->min_wmark) {
        return WMARK_MIN;
    }
    if (nr_free_pages < watermarks->low_wmark) {
        return WMARK_LOW;
    }
    if (nr_free_pages < watermarks->high_wmark) {
        return WMARK_HIGH;
    }
    return WMARK_NONE;
}

2.2 确认查杀原因和最低adj

该部分主要是根据上一部分得出的状态,确认要进行查杀的原因,以及对最低可查杀adj等级(min_score_adj)做出修改,这部分源码基本上全是if else if注释比较详细,kill_reason和kill_desc的赋值也比较直观,芯片厂商也会对这部分代码做较大的改动。如有lmkd异常查杀等情况发生,可以根据lmkd日志中打印的kill reason,在这一部分找到对应的源码。

  • 代码示例:
if (!psi_parse_mem(&psi_data)) {
        critical_stall = psi_data.mem_stats[PSI_FULL].avg10 > (float)stall_limit_critical;
    }
    /*
     * TODO: move this logic into a separate function
     * Decide if killing a process is necessary and record the reason
     */
    if (cycle_after_kill && wmark < WMARK_LOW) {
        /*
         * Prevent kills not freeing enough memory which might lead to OOM kill.
         * This might happen when a process is consuming memory faster than reclaim can
         * free even after a kill. Mostly happens when running memory stress tests.
         */
        kill_reason = PRESSURE_AFTER_KILL;
        strncpy(kill_desc, "min watermark is breached even after kill", sizeof(kill_desc));
    } else if (level == VMPRESS_LEVEL_CRITICAL && events != 0) {
        /*
         * Device is too busy reclaiming memory which might lead to ANR.
         * Critical level is triggered when PSI complete stall (all tasks are blocked because
         * of the memory congestion) breaches the configured threshold.
         */
        kill_reason = NOT_RESPONDING;
        strncpy(kill_desc, "device is not responding", sizeof(kill_desc));
    } else if (swap_is_low && thrashing > thrashing_limit_pct) {
        /* Page cache is thrashing while swap is low */
        kill_reason = LOW_SWAP_AND_THRASHING;
        snprintf(kill_desc, sizeof(kill_desc), "device is low on swap (%" PRId64
            "kB < %" PRId64 "kB) and thrashing (%" PRId64 "%%)",
            get_free_swap(&mi) * page_k, swap_low_threshold * page_k, thrashing);
        /* Do not kill perceptible apps unless below min watermark or heavily thrashing */
        if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
            min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
        }
        check_filecache = true;
    } else if (swap_is_low && wmark < WMARK_HIGH) {
        /* Both free memory and swap are low */
        kill_reason = LOW_MEM_AND_SWAP;
        snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap is low (%"
            PRId64 "kB < %" PRId64 "kB)", wmark < WMARK_LOW ? "min" : "low",
            get_free_swap(&mi) * page_k, swap_low_threshold * page_k);
        /* Do not kill perceptible apps unless below min watermark or heavily thrashing */
        if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
            min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
        }
    } else if (wmark < WMARK_HIGH && swap_util_max < 100 &&
               (swap_util = calc_swap_utilization(&mi)) > swap_util_max) {
        /*
         * Too much anon memory is swapped out but swap is not low.
         * Non-swappable allocations created memory pressure.
         */
        kill_reason = LOW_MEM_AND_SWAP_UTIL;
        snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap utilization"
            " is high (%d%% > %d%%)", wmark < WMARK_LOW ? "min" : "low",
            swap_util, swap_util_max);
    } else if (wmark < WMARK_HIGH && thrashing > thrashing_limit) {
        /* Page cache is thrashing while memory is low */
        kill_reason = LOW_MEM_AND_THRASHING;
        snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and thrashing (%"
            PRId64 "%%)", wmark < WMARK_LOW ? "min" : "low", thrashing);
        cut_thrashing_limit = true;
        /* Do not kill perceptible apps unless thrashing at critical levels */
        if (thrashing < thrashing_critical_pct) {
            min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
        }
        check_filecache = true;
    } else if (reclaim == DIRECT_RECLAIM && thrashing > thrashing_limit) {
        /* Page cache is thrashing while in direct reclaim (mostly happens on lowram devices) */
        kill_reason = DIRECT_RECL_AND_THRASHING;
        snprintf(kill_desc, sizeof(kill_desc), "device is in direct reclaim and thrashing (%"
            PRId64 "%%)", thrashing);
        cut_thrashing_limit = true;
        /* Do not kill perceptible apps unless thrashing at critical levels */
        if (thrashing < thrashing_critical_pct) {
            min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
        }
        check_filecache = true;
    } else if (check_filecache) {
        int64_t file_lru_kb = (vs.field.nr_inactive_file + vs.field.nr_active_file) * page_k;

        if (file_lru_kb < filecache_min_kb) {
            /* File cache is too low after thrashing, keep killing background processes */
            kill_reason = LOW_FILECACHE_AFTER_THRASHING;
            snprintf(kill_desc, sizeof(kill_desc),
                "filecache is low (%" PRId64 "kB < %" PRId64 "kB) after thrashing",
                file_lru_kb, filecache_min_kb);
            min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
        } else {
            /* File cache is big enough, stop checking */
            check_filecache = false;
        }
    }

2.3 进程查杀

至此已经确定了查杀原因和最低允许查杀的adj,调用find_and_kill_process函数进行查杀。

if (kill_reason != NONE) {
        struct kill_info ki = {
            .kill_reason = kill_reason,
            .kill_desc = kill_desc,
            .thrashing = (int)thrashing,
            .max_thrashing = max_thrashing,
        };
    // ...
        int pages_freed = find_and_kill_process(min_score_adj, &ki, &mi, &wi, &curr_tm, &psi_data);
        if (pages_freed > 0) {
            killing = true;
            // ...
        }
    }

find_and_kill_process函数的作用是在大于等于min_score_adj的范围内,选择合适的进程进行查杀。

static int find_and_kill_process(int min_score_adj, struct kill_info *ki, union meminfo *mi,
                                 struct wakeup_info *wi, struct timespec *tm,
                                 struct psi_data *pd) {
    int i;
    int killed_size = 0;
    bool lmk_state_change_start = false;
    bool choose_heaviest_task = kill_heaviest_task;

    // 从 1000 开始循环
    for (i = OOM_SCORE_ADJ_MAX; i >= min_score_adj; i--) {
        struct proc *procp;

        if (!choose_heaviest_task && i <= PERCEPTIBLE_APP_ADJ) {
            /*
             * If we have to choose a perceptible process, choose the heaviest one to
             * hopefully minimize the number of victims.
             * 如果我们必须选择一个可感知的进程(adj<=200),就选择最严重的一个,来尽量避免查杀过多的进程。
             */
            choose_heaviest_task = true;
        }

        while (true) {
            procp = choose_heaviest_task ?
            // 在adj==i的进程中找"最严重的"或末尾的
                proc_get_heaviest(i) : proc_adj_tail(i); 

            if (!procp)
                break;
            
            killed_size = kill_one_process(procp, min_score_adj, ki, mi, wi, tm, pd);
            if (killed_size >= 0) {
                break;
            }
        }

        // 有进程查杀发生时,不再继续查杀更低adj的进程
        if (killed_size) {
            break;
        }
    }

    return killed_size;
}

从find_and_kill_process可以得知lmkd每次触发查杀时,都是从adj1000的进程开始逐个筛选合适的进程查杀,发生查杀后退出。在“选择合适的进程”的策略中,可以通过kill_heaviest_task系统属性控制lmkd是用proc_get_heaviest还是proc_adj_tail做筛选,不过注意在查杀到adj小于等于200时,已经到了必须查杀用户可感知进程的地步,此时强制筛选“最严重”的进程。

所谓“最严重的进程”,可以在proc_get_heaviest函数中看到是对进程读取"/proc/[pid]/statm"路径,获取进程rss内存占用,排序找出rss最大的进程;

Android 11 之后一般定制lmkd机制是使用一下两个配置项。这些功能在高性能设备和低内存设备上都可使用

ro.lmk.psi_partial_stall_ms

部分 PSI 失速阈值(以毫秒为单位),用于触发内存不足通知。如果设备收到内存压力通知的时间太晚,可以降低此值以在较早的时间触发通知。如果在不必要的情况下触发了内存压力通知,请提高此值以降低设备对噪声的敏感度。默认值:70(高性能机器) 200(低内存机器)

ro.lmk.psi_complete_stall_ms

完全 PSI 失速阈值(以毫秒为单位),用于触发关键内存通知。如果设备收到关键内存压力通知的时间太晚,可以降低该值以在较早的时间触发通知。如果在不必要的情况下触发了关键内存压力通知,可以提高该值以降低设备对噪声的敏感度。默认值:700


网站公告

今日签到

点亮在社区的每一天
去签到