Android 10 and later support a new lmkd mode that uses kernel pressure stall information (PSI) monitors to detect memory pressure. The PSI patch set in the upstream kernel (backported to the 4.9 and 4.14 kernels) measures the time tasks are delayed because of memory shortages. Since these delays directly affect the user experience, they are a convenient indicator of memory-pressure severity. The upstream kernel also includes PSI monitors, which allow privileged userspace processes (such as lmkd) to specify thresholds for these delays and subscribe to events from the kernel when a threshold is breached.
The following walkthrough of lmkd is based on the Android U source code.
For a general introduction and related analysis, see: Android Lmkd 低内存终止守护程序 - CSDN blog.
1. lmkd process startup and initialization
1.1 init.rc
lmkd is started by the init process and runs as a standalone process in the system.
// system/core/rootdir/init.rc
// ...
# Start lmkd before any other services run so that it can register them
write /proc/sys/vm/watermark_boost_factor 0
chown root system /sys/module/lowmemorykiller/parameters/adj
chmod 0664 /sys/module/lowmemorykiller/parameters/adj
chown root system /sys/module/lowmemorykiller/parameters/minfree
chmod 0664 /sys/module/lowmemorykiller/parameters/minfree
start lmkd // start lmkd
// ...
1.2 lmkd.rc
// system/memory/lmkd/lmkd.rc
service lmkd /system/bin/lmkd
class core
user lmkd
group lmkd system readproc
capabilities DAC_OVERRIDE KILL IPC_LOCK SYS_NICE SYS_RESOURCE
critical
socket lmkd seqpacket+passcred 0660 system system
task_profiles ServiceCapacityLow
on property:lmkd.reinit=1
exec_background /system/bin/lmkd --reinit
# reinitialize lmkd after device finished booting if experiments set any flags during boot
on property:sys.boot_completed=1 && property:lmkd.reinit=0
setprop lmkd.reinit 1
# properties most likely to be used in experiments
# setting persist.device_config.* property either triggers immediate lmkd re-initialization
# if the device finished booting or sets lmkd.reinit=0 to re-initialize lmkd after boot completes
on property:persist.device_config.lmkd_native.debug=*
setprop lmkd.reinit ${sys.boot_completed:-0}
on property:persist.device_config.lmkd_native.kill_heaviest_task=*
setprop lmkd.reinit ${sys.boot_completed:-0}
on property:persist.device_config.lmkd_native.kill_timeout_ms=*
setprop lmkd.reinit ${sys.boot_completed:-0}
on property:persist.device_config.lmkd_native.swap_free_low_percentage=*
setprop lmkd.reinit ${sys.boot_completed:-0}
on property:persist.device_config.lmkd_native.psi_partial_stall_ms=*
setprop lmkd.reinit ${sys.boot_completed:-0}
on property:persist.device_config.lmkd_native.psi_complete_stall_ms=*
setprop lmkd.reinit ${sys.boot_completed:-0}
on property:persist.device_config.lmkd_native.thrashing_limit=*
setprop lmkd.reinit ${sys.boot_completed:-0}
on property:persist.device_config.lmkd_native.thrashing_limit_decay=*
setprop lmkd.reinit ${sys.boot_completed:-0}
on property:persist.device_config.lmkd_native.thrashing_limit_critical=*
setprop lmkd.reinit ${sys.boot_completed:-0}
on property:persist.device_config.lmkd_native.swap_util_max=*
setprop lmkd.reinit ${sys.boot_completed:-0}
on property:persist.device_config.lmkd_native.filecache_min_kb=*
setprop lmkd.reinit ${sys.boot_completed:-0}
1.3 lmkd.cpp
At startup, main() in lmkd.cpp runs directly. Its logic is straightforward: update the properties, create the logger, run init() inside the if condition, and then loop waiting for events in mainloop().
int main(int argc, char **argv) {
if ((argc > 1) && argv[1] && !strcmp(argv[1], "--reinit")) {
if (property_set(LMKD_REINIT_PROP, "")) {
ALOGE("Failed to reset " LMKD_REINIT_PROP " property");
}
return issue_reinit();
}
// update properties, see 1.3.1
if (!update_props()) {
ALOGE("Failed to initialize props, exiting.");
return -1;
}
// create the logger
ctx = create_android_logger(KILLINFO_LOG_TAG);
if (!init()) {
if (!use_inkernel_interface) {
/*
* MCL_ONFAULT pins pages as they fault instead of loading
* everything immediately all at once. (Which would be bad,
* because as of this writing, we have a lot of mapped pages we
* never use.) Old kernels will see MCL_ONFAULT and fail with
* EINVAL; we ignore this failure.
*
* N.B. read the man page for mlockall. MCL_CURRENT | MCL_ONFAULT
* pins ⊆ MCL_CURRENT, converging to just MCL_CURRENT as we fault
* in pages.
*/
/* CAP_IPC_LOCK required */
if (mlockall(MCL_CURRENT | MCL_FUTURE | MCL_ONFAULT) && (errno != EINVAL)) {
ALOGW("mlockall failed %s", strerror(errno));
}
/* CAP_NICE required */
struct sched_param param = {
.sched_priority = 1,
};
if (sched_setscheduler(0, SCHED_FIFO, &param)) {
ALOGW("set SCHED_FIFO failed %s", strerror(errno));
}
}
if (init_reaper()) {
ALOGI("Process reaper initialized with %d threads in the pool",
reaper.thread_cnt());
}
if (!watchdog.init()) {
ALOGE("Failed to initialize the watchdog");
}
// loop and wait for events in mainloop(), see 1.3.3
mainloop();
}
1.3.1 update_props()
update_props() mainly uses GET_LMK_PROPERTY to read each configuration value from system properties, for example the lowest adj that may be killed at each of the low, medium, and critical pressure levels.
static bool update_props() {
/* By default disable low level vmpressure events */
level_oomadj[VMPRESS_LEVEL_LOW] =
GET_LMK_PROPERTY(int32, "low", OOM_SCORE_ADJ_MAX + 1);
level_oomadj[VMPRESS_LEVEL_MEDIUM] =
GET_LMK_PROPERTY(int32, "medium", 800);
level_oomadj[VMPRESS_LEVEL_CRITICAL] =
GET_LMK_PROPERTY(int32, "critical", 0);
debug_process_killing = GET_LMK_PROPERTY(bool, "debug", false);
/* By default disable upgrade/downgrade logic */
enable_pressure_upgrade =
GET_LMK_PROPERTY(bool, "critical_upgrade", false);
upgrade_pressure =
(int64_t)GET_LMK_PROPERTY(int32, "upgrade_pressure", 100);
downgrade_pressure =
(int64_t)GET_LMK_PROPERTY(int32, "downgrade_pressure", 100);
kill_heaviest_task =
GET_LMK_PROPERTY(bool, "kill_heaviest_task", false);
low_ram_device = property_get_bool("ro.config.low_ram", false);
kill_timeout_ms =
(unsigned long)GET_LMK_PROPERTY(int32, "kill_timeout_ms", 100);
use_minfree_levels =
GET_LMK_PROPERTY(bool, "use_minfree_levels", false);
per_app_memcg =
property_get_bool("ro.config.per_app_memcg", low_ram_device);
swap_free_low_percentage = clamp(0, 100, GET_LMK_PROPERTY(int32, "swap_free_low_percentage",
DEF_LOW_SWAP));
psi_partial_stall_ms = GET_LMK_PROPERTY(int32, "psi_partial_stall_ms",
low_ram_device ? DEF_PARTIAL_STALL_LOWRAM : DEF_PARTIAL_STALL);
psi_complete_stall_ms = GET_LMK_PROPERTY(int32, "psi_complete_stall_ms",
DEF_COMPLETE_STALL);
thrashing_limit_pct =
std::max(0, GET_LMK_PROPERTY(int32, "thrashing_limit",
low_ram_device ? DEF_THRASHING_LOWRAM : DEF_THRASHING));
thrashing_limit_decay_pct = clamp(0, 100, GET_LMK_PROPERTY(int32, "thrashing_limit_decay",
low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY));
thrashing_critical_pct = std::max(
0, GET_LMK_PROPERTY(int32, "thrashing_limit_critical", thrashing_limit_pct * 2));
swap_util_max = clamp(0, 100, GET_LMK_PROPERTY(int32, "swap_util_max", 100));
filecache_min_kb = GET_LMK_PROPERTY(int64, "filecache_min_kb", 0);
stall_limit_critical = GET_LMK_PROPERTY(int64, "stall_limit_critical", 100);
reaper.enable_debug(debug_process_killing);
/* Call the update props hook */
if (!lmkd_update_props_hook()) {
ALOGE("Failed to update LMKD hook props.");
return false;
}
return true;
}
GET_LMK_PROPERTY is a macro that reads a persist.device_config.lmkd_native.* property first and falls back to the corresponding ro.lmk.* property:
#define GET_LMK_PROPERTY(type, name, def) \
property_get_##type("persist.device_config.lmkd_native." name, \
property_get_##type("ro.lmk." name, def))
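As an illustration of the fallback, a call such as GET_LMK_PROPERTY(int32, "thrashing_limit", 100) (one of the reads shown above; the default value here is arbitrary) expands to:
// expansion of GET_LMK_PROPERTY(int32, "thrashing_limit", 100):
property_get_int32("persist.device_config.lmkd_native.thrashing_limit",
                   property_get_int32("ro.lmk.thrashing_limit", 100))
So an experiment flag set through persist.device_config.lmkd_native.* always takes precedence over the ro.lmk.* build-time default.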
1.3.2 init() initialization
1.3.2.1 Creating the epoll listener
An important step in init() is creating the epoll instance. The macro MAX_EPOLL_EVENTS evaluates to 10, i.e. epoll monitors up to 10 event sources.
/* max supported number of data connections (AMS, init, tests) */
#define MAX_DATA_CONN 3
/*
* 1 ctrl listen socket, 3 ctrl data socket, 3 memory pressure levels,
* 1 lmk events + 1 fd to wait for process death + 1 fd to receive kill failure notifications
*/
#define MAX_EPOLL_EVENTS (1 + MAX_DATA_CONN + VMPRESS_LEVEL_COUNT + 1 + 1 + 1)
static int epollfd;
static int init(void) {
static struct event_handler_info kernel_poll_hinfo = { 0, kernel_event_handler };
struct reread_data file_data = {
.filename = ZONEINFO_PATH,
.fd = -1,
};
struct epoll_event epev;
int pidfd;
int i;
int ret;
page_k = sysconf(_SC_PAGESIZE);
if (page_k == -1)
page_k = PAGE_SIZE;
page_k /= 1024;
// create the epoll instance used to monitor all of lmkd's event sources
epollfd = epoll_create(MAX_EPOLL_EVENTS);
if (epollfd == -1) {
ALOGE("epoll_create failed (errno=%d)", errno);
return -1;
}
// mark data connections as not connected
for (int i = 0; i < MAX_DATA_CONN; i++) {
data_sock[i].sock = -1;
}
ctrl_sock.sock = android_get_control_socket("lmkd");
if (ctrl_sock.sock < 0) {
ALOGE("get lmkd control socket failed");
return -1;
}
// listen on the lmkd control socket
ret = listen(ctrl_sock.sock, MAX_DATA_CONN);
if (ret < 0) {
ALOGE("lmkd control socket listen failed (errno=%d)", errno);
return -1;
}
epev.events = EPOLLIN;
// see 1.3.3.2
ctrl_sock.handler_info.handler = ctrl_connect_handler;
epev.data.ptr = (void *)&(ctrl_sock.handler_info);
if (epoll_ctl(epollfd, EPOLL_CTL_ADD, ctrl_sock.sock, &epev) == -1) {
ALOGE("epoll_ctl for lmkd control socket failed (errno=%d)", errno);
return -1;
}
maxevents++;
For example, AMS acts as a socket client and communicates with lmkd over /dev/socket/lmkd, notifying lmkd of a process's adj, which lmkd then writes to the "/proc/[pid]/oom_score_adj" node.
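To make that protocol concrete, here is a minimal, hypothetical client sketch (the send_procprio helper is mine, not AOSP code; the packet layout and the LMK_PROCPRIO value are assumed to match enum lmk_cmd / lmkd_pack_set_procprio in system/memory/lmkd/include/lmkd.h, and connecting requires system privileges per the lmkd.rc socket line above):
#include <arpa/inet.h>       // htonl
#include <cutils/sockets.h>  // socket_local_client
#include <stdint.h>
#include <sys/socket.h>
#include <unistd.h>

// Hypothetical helper: register pid/uid with lmkd and set its oom_score_adj.
static int send_procprio(int pid, int uid, int oomadj, int ptype) {
    // connects to /dev/socket/lmkd (SOCK_SEQPACKET, see lmkd.rc above)
    int sock = socket_local_client("lmkd", ANDROID_SOCKET_NAMESPACE_RESERVED, SOCK_SEQPACKET);
    if (sock < 0) return -1;

    // assumed packet layout: [cmd, pid, uid, oomadj, ptype] in network byte order
    int32_t packet[5];
    packet[0] = htonl(1 /* LMK_PROCPRIO, assumed enum value */);
    packet[1] = htonl(pid);
    packet[2] = htonl(uid);
    packet[3] = htonl(oomadj);
    packet[4] = htonl(ptype);  // PROC_TYPE_APP or PROC_TYPE_SERVICE

    ssize_t ret = write(sock, packet, sizeof(packet));
    close(sock);
    return ret == (ssize_t)sizeof(packet) ? 0 : -1;
}
On the lmkd side such a packet travels through ctrl_data_handler -> ctrl_command_handler -> cmd_procprio (sections 1.3.3.3 to 1.3.3.5).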
1.3.2.2 Initializing the lmkd trigger mechanism
Next, init() decides how lmkd will be triggered. Early LMK used an in-kernel driver; here access() checks whether the old node still exists (the in-kernel lowmemorykiller was removed in kernel 4.12). If it is not available, init_monitors() is executed instead.
has_inkernel_module = !access(INKERNEL_MINFREE_PATH, W_OK);
use_inkernel_interface = has_inkernel_module;
if (use_inkernel_interface) {
// most modern kernels no longer support this path
ALOGI("Using in-kernel low memory killer interface");
if (init_poll_kernel()) {
epev.events = EPOLLIN;
epev.data.ptr = (void*)&kernel_poll_hinfo;
if (epoll_ctl(epollfd, EPOLL_CTL_ADD, kpoll_fd, &epev) != 0) {
ALOGE("epoll_ctl for lmk events failed (errno=%d)", errno);
close(kpoll_fd);
kpoll_fd = -1;
} else {
maxevents++;
/* let the others know it does support reporting kills */
property_set("sys.lmk.reportkills", "1");
}
}
} else {
if (!init_monitors()) {
return -1;
}
/* let the others know it does support reporting kills */
property_set("sys.lmk.reportkills", "1");
}
Note that monitor initialization involves four similarly named functions: init_monitors(), init_psi_monitors(), init_mp_psi(), and init_psi_monitor(); keep them apart.
In init_monitors(), lmkd decides whether to use PSI or vmpressure as the trigger: when the "ro.lmk.use_psi" property is true (the default), init_psi_monitors() is called to initialize the PSI monitors, and only if that fails does lmkd fall back to init_mp_common() to initialize the vmpressure monitors. Clearly lmkd prefers the PSI trigger.
static bool init_monitors() {
/* Use PSI monitors if the kernel supports them */
use_psi_monitors = GET_LMK_PROPERTY(bool, "use_psi", true) &&
init_psi_monitors();
/* Fall back to vmpressure if PSI monitor initialization failed */
if (!use_psi_monitors &&
(!init_mp_common(VMPRESS_LEVEL_LOW) ||
!init_mp_common(VMPRESS_LEVEL_MEDIUM) ||
!init_mp_common(VMPRESS_LEVEL_CRITICAL))) {
ALOGE("Kernel does not support memory pressure events or in-kernel low memory killer");
return false;
}
if (use_psi_monitors) {
ALOGI("Using psi monitors for memory pressure detection");
} else {
ALOGI("Using vmpressure for memory pressure detection");
}
return true;
}
(1) PSI trigger
Next, init_psi_monitors() initializes the PSI monitors. The "new strategy" is used when the use_new_strategy property is explicitly set to true, or by default on low-RAM devices or when use_minfree_levels is false. "New" versus "old" here refers to what happens after a PSI event fires: the old strategy decides kills based on the free-page situation (the watermarks), while the new strategy decides kills based on the different PSI pressure conditions. Personally I find distinguishing these as "new/old" quite inelegant.
Note that the new/old strategy only changes whether kills are driven by PSI pressure or by watermarks; either way, what is being set up here are PSI monitors, i.e. the trigger is still PSI, only the killing policy differs.
static bool init_psi_monitors() {
bool use_new_strategy =
GET_LMK_PROPERTY(bool, "use_new_strategy", low_ram_device || !use_minfree_levels);
/* In the default PSI mode, override the PSI stall thresholds with system properties */
if (use_new_strategy) {
/* Do not use low pressure level */
psi_thresholds[VMPRESS_LEVEL_LOW].threshold_ms = 0;
psi_thresholds[VMPRESS_LEVEL_MEDIUM].threshold_ms = psi_partial_stall_ms;
psi_thresholds[VMPRESS_LEVEL_CRITICAL].threshold_ms = psi_complete_stall_ms;
}
if (!init_mp_psi(VMPRESS_LEVEL_LOW, use_new_strategy)) {
return false;
}
if (!init_mp_psi(VMPRESS_LEVEL_MEDIUM, use_new_strategy)) {
destroy_mp_psi(VMPRESS_LEVEL_LOW);
return false;
}
if (!init_mp_psi(VMPRESS_LEVEL_CRITICAL, use_new_strategy)) {
destroy_mp_psi(VMPRESS_LEVEL_MEDIUM);
destroy_mp_psi(VMPRESS_LEVEL_LOW);
return false;
}
return true;
}
Once the strategy is decided, init_mp_psi() is called to set up the PSI event for each pressure level.
init_mp_psi() takes two parameters: the pressure level and the new/old-strategy flag. Note that the first parameter's type is named "vmpressure_level"; despite the "vmpressure" in the name, the level here is derived from PSI, not from the vmpressure mechanism mentioned earlier. This is the second place where I find the code inelegant and prone to ambiguity: vmpressure means "virtual memory pressure", so perhaps in the designers' view the stall milliseconds produced by PSI are also a kind of virtual memory pressure?
static bool init_mp_psi(enum vmpressure_level level, bool use_new_strategy) {
int fd;
/* Do not register a handler if threshold_ms is not set */
if (!psi_thresholds[level].threshold_ms) {
return true;
}
fd = init_psi_monitor(psi_thresholds[level].stall_type,
psi_thresholds[level].threshold_ms * US_PER_MS,
PSI_WINDOW_SIZE_MS * US_PER_MS);
if (fd < 0) {
return false;
}
vmpressure_hinfo[level].handler = use_new_strategy ? mp_event_psi : mp_event_common;
vmpressure_hinfo[level].data = level;
if (register_psi_monitor(epollfd, fd, &vmpressure_hinfo[level]) < 0) {
destroy_psi_monitor(fd);
return false;
}
maxevents++;
mpevfd[level] = fd;
return true;
}
Distinguish init_psi_monitor() here from init_psi_monitors() above: init_psi_monitor() is defined in system/memory/lmkd/libpsi/psi.cpp, and its job is to obtain, from the stall type, threshold, and window size, a file descriptor that epoll can monitor.
The most important assignment is vmpressure_hinfo[level].handler: depending on whether the new strategy is in use, it is set to either mp_event_psi or mp_event_common, so under the new strategy mp_event_psi is invoked whenever a pressure event of this level arrives.
register_psi_monitor() then adds the fd to epoll so pressure events are monitored.
At this point init_psi_monitors(), i.e. PSI monitor initialization, is complete: when a pressure event of any level fires, mp_event_psi will be called.
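As a reference for what init_psi_monitor() does underneath, here is a minimal sketch of the kernel PSI trigger interface (following the kernel's PSI documentation; the function name and error handling are mine, and the exact open flags used by libpsi may differ):
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

// Register a PSI trigger on /proc/pressure/memory and return the fd to poll.
// stall_type is "some" (partial stall) or "full" (complete stall);
// threshold_us/window_us correspond to threshold_ms * US_PER_MS and
// PSI_WINDOW_SIZE_MS * US_PER_MS in init_mp_psi().
static int psi_open_trigger(const char* stall_type, int threshold_us, int window_us) {
    int fd = open("/proc/pressure/memory", O_RDWR | O_NONBLOCK | O_CLOEXEC);
    if (fd < 0) return -1;

    char trig[64];
    snprintf(trig, sizeof(trig), "%s %d %d", stall_type, threshold_us, window_us);
    if (write(fd, trig, strlen(trig) + 1) < 0) {  // kernel parses this trigger string
        close(fd);
        return -1;
    }
    // The caller adds this fd to epoll (register_psi_monitor uses EPOLLPRI);
    // the kernel signals it when stall time within the window exceeds the threshold.
    return fd;
}
With the default properties and the new strategy, the medium level therefore registers something like "some 70000 1000000" (70 ms of partial stall within a 1 s window, PSI_WINDOW_SIZE_MS being 1000 ms) and the critical level "full 700000 1000000".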
(2) vmpressure trigger
- Since most Android devices now use the PSI trigger, the vmpressure path is skipped here.
- init() also performs some other initialization besides init_monitors(), which is likewise skipped for now.
1.3.3 mainloop
static void mainloop(void) {
struct event_handler_info* handler_info;
struct polling_params poll_params;
struct timespec curr_tm;
struct epoll_event *evt;
long delay = -1;
poll_params.poll_handler = NULL;
poll_params.paused_handler = NULL;
while (1) {
struct epoll_event events[MAX_EPOLL_EVENTS];
int nevents;
int i;
if (poll_params.poll_handler) {
bool poll_now;
clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
if (poll_params.update == POLLING_RESUME) {
/* Just transitioned into POLLING_RESUME, poll immediately. */
poll_now = true;
nevents = 0;
} else {
/* Calculate next timeout */
delay = get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm);
delay = (delay < poll_params.polling_interval_ms) ?
poll_params.polling_interval_ms - delay : poll_params.polling_interval_ms;
/* Wait for events until the next polling timeout */
nevents = epoll_wait(epollfd, events, maxevents, delay);
/* Update current time after wait */
clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
poll_now = (get_time_diff_ms(&poll_params.last_poll_tm, &curr_tm) >=
poll_params.polling_interval_ms);
}
if (poll_now) {
call_handler(poll_params.poll_handler, &poll_params, 0);
}
} else {
if (kill_timeout_ms && is_waiting_for_kill()) {
clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
delay = kill_timeout_ms - get_time_diff_ms(&last_kill_tm, &curr_tm);
/* Wait for pidfds notification or kill timeout to expire */
nevents = (delay > 0) ? epoll_wait(epollfd, events, maxevents, delay) : 0;
if (nevents == 0) {
/* Kill notification timed out */
stop_wait_for_proc_kill(false);
if (polling_paused(&poll_params)) {
clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
poll_params.update = POLLING_RESUME;
resume_polling(&poll_params, curr_tm);
}
}
} else {
/* Wait for events with no timeout */
nevents = epoll_wait(epollfd, events, maxevents, -1);
}
}
if (nevents == -1) {
if (errno == EINTR)
continue;
ALOGE("epoll_wait failed (errno=%d)", errno);
continue;
}
/*
* First pass to see if any data socket connections were dropped.
* Dropped connection should be handled before any other events
* to deallocate data connection and correctly handle cases when
* connection gets dropped and reestablished in the same epoll cycle.
* In such cases it's essential to handle connection closures first.
*/
for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
// EPOLLHUP with a valid data.ptr means a data connection was dropped
if ((evt->events & EPOLLHUP) && evt->data.ptr) {
ALOGI("lmkd data connection dropped");
// retrieve the event_handler_info
handler_info = (struct event_handler_info*)evt->data.ptr;
watchdog.start();
// handle the closed connection
ctrl_data_close(handler_info->data);
watchdog.stop();
}
}
/* Second pass to handle all other events */
for (i = 0, evt = &events[0]; i < nevents; ++i, evt++) {
if (evt->events & EPOLLERR) {
ALOGD("EPOLLERR on event #%d", i);
}
if (evt->events & EPOLLHUP) {
/* This case was handled in the first pass */
continue;
}
// if data.ptr is valid
if (evt->data.ptr) {
// retrieve the event_handler_info
handler_info = (struct event_handler_info*)evt->data.ptr;
// call the handler, see 1.3.3.1
call_handler(handler_info, &poll_params, evt->events);
}
}
}
}
1.3.3.1 call_handler
static void call_handler(struct event_handler_info* handler_info,
struct polling_params *poll_params, uint32_t events) {
struct timespec curr_tm;
watchdog.start();
poll_params->update = POLLING_DO_NOT_CHANGE;
// handler_info->handler is whichever handler was registered for this fd in init(),
// e.g. ctrl_connect_handler for the control socket, ctrl_data_handler for data sockets, or mp_event_psi for PSI events
handler_info->handler(handler_info->data, events, poll_params);
clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm);
if (poll_params->poll_handler == handler_info) {
poll_params->last_poll_tm = curr_tm;
}
switch (poll_params->update) {
case POLLING_START:
/*
* Poll for the duration of PSI_WINDOW_SIZE_MS after the
* initial PSI event because psi events are rate-limited
* at one per sec.
*/
poll_params->poll_start_tm = curr_tm;
poll_params->poll_handler = handler_info;
break;
case POLLING_PAUSE:
poll_params->paused_handler = handler_info;
poll_params->poll_handler = NULL;
break;
case POLLING_RESUME:
resume_polling(poll_params, curr_tm);
break;
case POLLING_DO_NOT_CHANGE:
if (poll_params->poll_handler &&
get_time_diff_ms(&poll_params->poll_start_tm, &curr_tm) > PSI_WINDOW_SIZE_MS) {
/* Polled for the duration of PSI window, time to stop */
poll_params->poll_handler = NULL;
}
break;
}
watchdog.stop();
}
1.3.3.2 ctrl_connect_handler
static void ctrl_connect_handler(int data __unused, uint32_t events __unused,
struct polling_params *poll_params __unused) {
struct epoll_event epev;
int free_dscock_idx = get_free_dsock();
if (free_dscock_idx < 0) {
/*
* Number of data connections exceeded max supported. This should not
* happen but if it does we drop all existing connections and accept
* the new one. This prevents inactive connections from monopolizing
* data socket and if we drop ActivityManager connection it will
* immediately reconnect.
*/
for (int i = 0; i < MAX_DATA_CONN; i++) {
ctrl_data_close(i);
}
free_dscock_idx = 0;
}
data_sock[free_dscock_idx].sock = accept(ctrl_sock.sock, NULL, NULL);
if (data_sock[free_dscock_idx].sock < 0) {
ALOGE("lmkd control socket accept failed; errno=%d", errno);
return;
}
ALOGI("lmkd data connection established");
/* use data to store data connection idx */
data_sock[free_dscock_idx].handler_info.data = free_dscock_idx;
// when an event fires on this data socket, ctrl_data_handler is called
data_sock[free_dscock_idx].handler_info.handler = ctrl_data_handler;
data_sock[free_dscock_idx].async_event_mask = 0;
epev.events = EPOLLIN;
epev.data.ptr = (void *)&(data_sock[free_dscock_idx].handler_info);
if (epoll_ctl(epollfd, EPOLL_CTL_ADD, data_sock[free_dscock_idx].sock, &epev) == -1) {
ALOGE("epoll_ctl for data connection socket failed; errno=%d", errno);
ctrl_data_close(free_dscock_idx);
return;
}
maxevents++;
}
1.3.3.3 ctrl_data_handler
static void ctrl_data_handler(int data, uint32_t events,
struct polling_params *poll_params __unused) {
if (events & EPOLLIN) {
ctrl_command_handler(data);
}
}
1.3.3.4 ctrl_command_handler
static void ctrl_command_handler(int dsock_idx) {
LMKD_CTRL_PACKET packet;
struct ucred cred;
int len;
enum lmk_cmd cmd;
int nargs;
int targets;
int kill_cnt;
int result;
len = ctrl_data_read(dsock_idx, (char *)packet, CTRL_PACKET_MAX_SIZE, &cred);
if (len <= 0)
return;
if (len < (int)sizeof(int)) {
ALOGE("Wrong control socket read length len=%d", len);
return;
}
cmd = lmkd_pack_get_cmd(packet);
nargs = len / sizeof(int) - 1;
if (nargs < 0)
goto wronglen;
switch(cmd) {
case LMK_TARGET:
targets = nargs / 2;
if (nargs & 0x1 || targets > (int)lowmem_adj.size()) {
goto wronglen;
}
cmd_target(targets, packet);
break;
case LMK_PROCPRIO:
/* process type field is optional for backward compatibility */
if (nargs < 3 || nargs > 4)
goto wronglen;
// see 1.3.3.5, sets the process adj
cmd_procprio(packet, nargs, &cred);
break;
case LMK_PROCREMOVE:
if (nargs != 1)
goto wronglen;
cmd_procremove(packet, &cred);
break;
case LMK_PROCPURGE:
if (nargs != 0)
goto wronglen;
cmd_procpurge(&cred);
break;
case LMK_GETKILLCNT:
if (nargs != 2)
goto wronglen;
kill_cnt = cmd_getkillcnt(packet);
len = lmkd_pack_set_getkillcnt_repl(packet, kill_cnt);
if (ctrl_data_write(dsock_idx, (char *)packet, len) != len)
return;
break;
case LMK_SUBSCRIBE:
if (nargs != 1)
goto wronglen;
cmd_subscribe(dsock_idx, packet);
break;
case LMK_PROCKILL:
/* This command code is NOT expected at all */
ALOGE("Received unexpected command code %d", cmd);
break;
case LMK_UPDATE_PROPS:
if (nargs != 0)
goto wronglen;
result = -1;
if (update_props()) {
if (!use_inkernel_interface) {
/* Reinitialize monitors to apply new settings */
destroy_monitors();
if (init_monitors()) {
result = 0;
}
} else {
result = 0;
}
}
len = lmkd_pack_set_update_props_repl(packet, result);
if (ctrl_data_write(dsock_idx, (char *)packet, len) != len) {
ALOGE("Failed to report operation results");
}
if (!result) {
ALOGI("Properties reinitilized");
} else {
/* New settings can't be supported, crash to be restarted */
ALOGE("New configuration is not supported. Exiting...");
exit(1);
}
break;
default:
ALOGE("Received unknown command code %d", cmd);
return;
}
return;
wronglen:
ALOGE("Wrong control socket read length cmd=%d len=%d", cmd, len);
}
1.3.3.5 cmd_procprio
static void cmd_procprio(LMKD_CTRL_PACKET packet, int field_count, struct ucred *cred) {
struct proc *procp;
char path[LINE_MAX];
char val[20];
int soft_limit_mult;
struct lmk_procprio params;
bool is_system_server;
struct passwd *pwdrec;
int64_t tgid;
char buf[PAGE_SIZE];
lmkd_pack_get_procprio(packet, field_count, &params);
if (params.oomadj < OOM_SCORE_ADJ_MIN ||
params.oomadj > OOM_SCORE_ADJ_MAX) {
ALOGE("Invalid PROCPRIO oomadj argument %d", params.oomadj);
return;
}
if (params.ptype < PROC_TYPE_FIRST || params.ptype >= PROC_TYPE_COUNT) {
ALOGE("Invalid PROCPRIO process type argument %d", params.ptype);
return;
}
/* Check if registered process is a thread group leader */
if (read_proc_status(params.pid, buf, sizeof(buf))) {
if (parse_status_tag(buf, PROC_STATUS_TGID_FIELD, &tgid) && tgid != params.pid) {
ALOGE("Attempt to register a task that is not a thread group leader "
"(tid %d, tgid %" PRId64 ")", params.pid, tgid);
return;
}
}
/* gid containing AID_READPROC required */
/* CAP_SYS_RESOURCE required */
/* CAP_DAC_OVERRIDE required */
// write oomadj to the /proc/<pid>/oom_score_adj node
snprintf(path, sizeof(path), "/proc/%d/oom_score_adj", params.pid);
snprintf(val, sizeof(val), "%d", params.oomadj);
if (!writefilestring(path, val, false)) {
ALOGW("Failed to open %s; errno=%d: process %d might have been killed",
path, errno, params.pid);
/* If this file does not exist the process is dead. */
return;
}
if (use_inkernel_interface) {
stats_store_taskname(params.pid, proc_get_name(params.pid, path, sizeof(path)));
return;
}
/* lmkd should not change soft limits for services */
if (params.ptype == PROC_TYPE_APP && per_app_memcg) {
if (params.oomadj >= 900) {
soft_limit_mult = 0;
} else if (params.oomadj >= 800) {
soft_limit_mult = 0;
} else if (params.oomadj >= 700) {
soft_limit_mult = 0;
} else if (params.oomadj >= 600) {
// Launcher should be perceptible, don't kill it.
params.oomadj = 200;
soft_limit_mult = 1;
} else if (params.oomadj >= 500) {
soft_limit_mult = 0;
} else if (params.oomadj >= 400) {
soft_limit_mult = 0;
} else if (params.oomadj >= 300) {
soft_limit_mult = 1;
} else if (params.oomadj >= 200) {
soft_limit_mult = 8;
} else if (params.oomadj >= 100) {
soft_limit_mult = 10;
} else if (params.oomadj >= 0) {
soft_limit_mult = 20;
} else {
// Persistent processes will have a large
// soft limit 512MB.
soft_limit_mult = 64;
}
std::string path;
if (!CgroupGetAttributePathForTask("MemSoftLimit", params.pid, &path)) {
ALOGE("Querying MemSoftLimit path failed");
return;
}
snprintf(val, sizeof(val), "%d", soft_limit_mult * EIGHT_MEGA);
/*
* system_server process has no memcg under /dev/memcg/apps but should be
* registered with lmkd. This is the best way so far to identify it.
*/
is_system_server = (params.oomadj == SYSTEM_ADJ &&
(pwdrec = getpwnam("system")) != NULL &&
params.uid == pwdrec->pw_uid);
writefilestring(path.c_str(), val, !is_system_server);
}
procp = pid_lookup(params.pid);
if (!procp) {
int pidfd = -1;
if (pidfd_supported) {
pidfd = TEMP_FAILURE_RETRY(pidfd_open(params.pid, 0));
if (pidfd < 0) {
ALOGE("pidfd_open for pid %d failed; errno=%d", params.pid, errno);
return;
}
}
procp = static_cast<struct proc*>(calloc(1, sizeof(struct proc)));
if (!procp) {
// Oh, the irony. May need to rebuild our state.
return;
}
procp->pid = params.pid;
procp->pidfd = pidfd;
procp->uid = params.uid;
procp->reg_pid = cred->pid;
procp->oomadj = params.oomadj;
procp->valid = true;
proc_insert(procp);
} else {
if (!claim_record(procp, cred->pid)) {
char buf[LINE_MAX];
char *taskname = proc_get_name(cred->pid, buf, sizeof(buf));
/* Only registrant of the record can remove it */
ALOGE("%s (%d, %d) attempts to modify a process registered by another client",
taskname ? taskname : "A process ", cred->uid, cred->pid);
return;
}
proc_unslot(procp);
procp->oomadj = params.oomadj;
proc_slot(procp);
}
}
2. mp_event_psi
mp_event_psi implements the new strategy invoked on PSI events. It can be roughly divided into three parts: the first computes parameters and state, the second uses that state to determine the kill reason (kill_reason), and the third selects a process and performs one round of killing.
There is a lot of code here; the key steps are reading statistics via vmstat_parse and meminfo_parse and determining the thrashing level, watermark state, swap state, and so on, which feed the kill-reason decision in the next step.
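To make the thrashing metric concrete before diving into the code, the function computes (formula taken from the code below; the numbers here are purely illustrative):
thrashing = (workingset_refault_file - init_ws_refault) * 100 / (base_file_lru + 1)
For example, if the file-backed LRU held about 200,000 pages at the last reset (base_file_lru) and 60,000 refaults have accumulated since then, thrashing works out to roughly 30, i.e. about 30% of the file cache had to be re-read within the window; this value is later compared against thrashing_limit.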
static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_params) {
enum reclaim_state {
NO_RECLAIM = 0,
KSWAPD_RECLAIM,
DIRECT_RECLAIM,
};
// static variables: they persist across invocations of this function and track state
static int64_t init_ws_refault;    // workingset_refault baseline recorded after a kill/reset
static int64_t prev_workingset_refault; // workingset_refault seen in the previous cycle
static int64_t base_file_lru;      // file-backed page cache size at the baseline
static int64_t init_pgscan_kswapd; // baseline kswapd scan count
static int64_t init_pgscan_direct; // baseline direct-reclaim scan count
static bool killing; // set to true when a process gets killed
static int thrashing_limit = thrashing_limit_pct; // thrashing threshold, initialized from the property
static struct zone_watermarks watermarks;
static struct timespec wmark_update_tm;
static struct wakeup_info wi;
static struct timespec thrashing_reset_tm;
static int64_t prev_thrash_growth = 0;
static bool check_filecache = false;
static int max_thrashing = 0;
// per-invocation variables
union meminfo mi; // parsed from /proc/meminfo
union vmstat vs; // parsed from /proc/vmstat
struct psi_data psi_data;
struct timespec curr_tm; // time recorded at the start of each cycle
int64_t thrashing = 0;
bool swap_is_low = false;
enum vmpressure_level level = (enum vmpressure_level)data;
enum kill_reasons kill_reason = NONE;
bool cycle_after_kill = false; // true in the cycle right after a process was killed
enum reclaim_state reclaim = NO_RECLAIM;
enum zone_watermark wmark = WMARK_NONE;
char kill_desc[LINE_MAX];
bool cut_thrashing_limit = false;
int min_score_adj = 0;
int swap_util = 0;
int64_t swap_low_threshold;
long since_thrashing_reset_ms;
int64_t workingset_refault_file;
bool critical_stall = false;
if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
ALOGE("Failed to get current time");
return;
}
record_wakeup_time(&curr_tm, events ? Event : Polling, &wi);
bool kill_pending = is_kill_pending();
if (kill_pending && (kill_timeout_ms == 0 ||
get_time_diff_ms(&last_kill_tm, &curr_tm) < static_cast<long>(kill_timeout_ms))) {
/* Skip while still killing a process */
wi.skipped_wakeups++;
goto no_kill;
}
/*
* Process is dead or kill timeout is over, stop waiting. This has no effect if pidfds are
* supported and death notification already caused waiting to stop.
*/
stop_wait_for_proc_kill(!kill_pending);
// parse /proc/vmstat
if (vmstat_parse(&vs) < 0) {
ALOGE("Failed to parse vmstat!");
return;
}
/* Starting with kernel 5.9, the workingset_refault vmstat field was renamed to workingset_refault_file */
workingset_refault_file = vs.field.workingset_refault ? : vs.field.workingset_refault_file;
// parse /proc/meminfo
if (meminfo_parse(&mi) < 0) {
ALOGE("Failed to parse meminfo!");
return;
}
/* Reset states after process got killed */
if (killing) {
killing = false;
cycle_after_kill = true;
/* Reset file-backed pagecache size and refault amounts after a kill */
base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file; // reset the file-backed page cache baseline
init_ws_refault = workingset_refault_file; // reset the refault baseline
thrashing_reset_tm = curr_tm; // set the thrashing reset time to now
prev_thrash_growth = 0; // reset the carried-over thrashing to 0
}
/* Check free swap levels */
/* determines swap_is_low */
if (swap_free_low_percentage) { // taken from the property
swap_low_threshold = mi.field.total_swap * swap_free_low_percentage / 100;
swap_is_low = get_free_swap(&mi) < swap_low_threshold; // swap is considered low when free swap falls below this percentage
} else {
swap_low_threshold = 0;
}
/* Identify reclaim state */
/* determines reclaim */
if (vs.field.pgscan_direct != init_pgscan_direct) { // pgscan_direct changed, so DIRECT_RECLAIM occurred
init_pgscan_direct = vs.field.pgscan_direct;
init_pgscan_kswapd = vs.field.pgscan_kswapd;
reclaim = DIRECT_RECLAIM;
} else if (vs.field.pgscan_kswapd != init_pgscan_kswapd) { // kswapd scan count changed, so KSWAPD_RECLAIM occurred
init_pgscan_kswapd = vs.field.pgscan_kswapd;
reclaim = KSWAPD_RECLAIM;
} else if (workingset_refault_file == prev_workingset_refault) {
/*
* Device is not thrashing and not reclaiming, bail out early until we see these stats
* changing
*/
goto no_kill;
}
prev_workingset_refault = workingset_refault_file;
/*
 * It's possible we fail to find an eligible process to kill (ex. no process is
 * above oom_adj_min). When this happens, we should retry to find a new process
 * for a kill whenever a new eligible process is available.
 *
 * This is especially important for a slow growing refault case.
 *
 * While retrying, we should keep monitoring new thrashing counter
 * as someone could release the memory to mitigate the thrashing.
 *
 * Thus, when thrashing reset window comes,
 * we decay the prev thrashing counter by window counts.
 *
 * If the counter is still greater than thrashing limit,
 * we preserve the current prev_thrash counter so we will retry kill again.
 *
 * Otherwise, we reset the prev_thrash counter so we will stop retrying.
 */
// time since the last thrashing reset; note that after a kill the reset time was just updated, so the difference is ~0
since_thrashing_reset_ms = get_time_diff_ms(&thrashing_reset_tm, &curr_tm);
if (since_thrashing_reset_ms > THRASHING_RESET_INTERVAL_MS) {
long windows_passed;
/* Calculate prev_thrash_growth if we crossed THRASHING_RESET_INTERVAL_MS */
prev_thrash_growth = (workingset_refault_file - init_ws_refault) * 100
/ (base_file_lru + 1); // new refaults / total file-backed pages * 100
// how many thrashing reset windows have passed
windows_passed = (since_thrashing_reset_ms / THRASHING_RESET_INTERVAL_MS);
/*
* Decay prev_thrashing unless over-the-limit thrashing was registered in the window we
* just crossed, which means there were no eligible processes to kill. We preserve the
* counter in that case to ensure a kill if a new eligible process appears.
*/
if (windows_passed > 1 || prev_thrash_growth < thrashing_limit) {
prev_thrash_growth >>= windows_passed;
}
/* Record file-backed pagecache size when crossing THRASHING_RESET_INTERVAL_MS */
// in effect this resets the file LRU baseline, the refault baseline, the thrashing reset time, and the thrashing limit
base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
init_ws_refault = workingset_refault_file;
thrashing_reset_tm = curr_tm; // thrashing reset
thrashing_limit = thrashing_limit_pct;
} else {
/* Calculate what % of the file-backed pagecache refaulted so far */
// a kill happened last cycle, or the thrashing window was reset recently: compute what % of the file-backed page cache has refaulted so far
thrashing = (workingset_refault_file - init_ws_refault) * 100 / (base_file_lru + 1);
}
/* Add previous cycle's decayed thrashing amount */
thrashing += prev_thrash_growth;
if (max_thrashing < thrashing) {
max_thrashing = thrashing;
}
/*
* Refresh watermarks once per min in case user updated one of the margins.
*
* TODO: b/140521024 replace this periodic update with an API for AMS to notify LMKD
* that zone watermarks were changed by the system software.
*/
if (watermarks.high_wmark == 0 || get_time_diff_ms(&wmark_update_tm, &curr_tm) > 60000) {
struct zoneinfo zi;
// parse /proc/zoneinfo once
if (zoneinfo_parse(&zi) < 0) {
ALOGE("Failed to parse zoneinfo!");
return;
}
// compute the zone watermarks; this function sums every zone's watermarks into `watermarks`, see 2.1
calc_zone_watermarks(&zi, &watermarks);
wmark_update_tm = curr_tm;
}
/* Find out which watermark is breached if any */
wmark = get_lowest_watermark(&mi, &watermarks);
/* parse PSI data from /proc/pressure/memory and check whether the critical stall limit is exceeded */
if (!psi_parse_mem(&psi_data)) {
critical_stall = psi_data.mem_stats[PSI_FULL].avg10 > (float)stall_limit_critical;
}
2.1 calc_zone_watermarks
void calc_zone_watermarks(struct zoneinfo *zi, struct zone_watermarks *watermarks) {
memset(watermarks, 0, sizeof(struct zone_watermarks));
for (int node_idx = 0; node_idx < zi->node_count; node_idx++) {
struct zoneinfo_node *node = &zi->nodes[node_idx];
for (int zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
struct zoneinfo_zone *zone = &node->zones[zone_idx];
if (!zone->fields.field.present) {
continue;
}
watermarks->high_wmark += zone->max_protection + zone->fields.field.high;
watermarks->low_wmark += zone->max_protection + zone->fields.field.low;
watermarks->min_wmark += zone->max_protection + zone->fields.field.min;
}
}
}
2.2 get_lowest_watermark
static enum zone_watermark get_lowest_watermark(union meminfo *mi,
struct zone_watermarks *watermarks)
{
int64_t nr_free_pages = mi->field.nr_free_pages - mi->field.cma_free;
if (nr_free_pages < watermarks->min_wmark) {
return WMARK_MIN;
}
if (nr_free_pages < watermarks->low_wmark) {
return WMARK_LOW;
}
if (nr_free_pages < watermarks->high_wmark) {
return WMARK_HIGH;
}
return WMARK_NONE;
}
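A quick worked example with made-up numbers: suppose calc_zone_watermarks summed the zones to min_wmark = 15,000, low_wmark = 30,000 and high_wmark = 45,000 pages, and meminfo currently shows nr_free_pages - cma_free = 22,000 pages. get_lowest_watermark then returns WMARK_LOW (above min but below low), so in the kill-reason logic below any check of the form wmark < WMARK_HIGH holds and kill_desc reports the "low" watermark as breached.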
2.3 Determining the kill reason and the minimum adj
This part uses the state derived above to determine the reason for killing and to adjust the minimum killable adj (min_score_adj). The source is essentially one long if/else-if chain with fairly detailed comments, and the kill_reason / kill_desc assignments are straightforward; SoC vendors also tend to modify this code heavily. If lmkd kills something unexpectedly, the kill reason printed in the lmkd log can be traced back to the corresponding branch here.
- Code excerpt:
if (!psi_parse_mem(&psi_data)) {
critical_stall = psi_data.mem_stats[PSI_FULL].avg10 > (float)stall_limit_critical;
}
/*
* TODO: move this logic into a separate function
* Decide if killing a process is necessary and record the reason
*/
if (cycle_after_kill && wmark < WMARK_LOW) {
/*
* Prevent kills not freeing enough memory which might lead to OOM kill.
* This might happen when a process is consuming memory faster than reclaim can
* free even after a kill. Mostly happens when running memory stress tests.
*/
kill_reason = PRESSURE_AFTER_KILL;
strncpy(kill_desc, "min watermark is breached even after kill", sizeof(kill_desc));
} else if (level == VMPRESS_LEVEL_CRITICAL && events != 0) {
/*
* Device is too busy reclaiming memory which might lead to ANR.
* Critical level is triggered when PSI complete stall (all tasks are blocked because
* of the memory congestion) breaches the configured threshold.
*/
kill_reason = NOT_RESPONDING;
strncpy(kill_desc, "device is not responding", sizeof(kill_desc));
} else if (swap_is_low && thrashing > thrashing_limit_pct) {
/* Page cache is thrashing while swap is low */
kill_reason = LOW_SWAP_AND_THRASHING;
snprintf(kill_desc, sizeof(kill_desc), "device is low on swap (%" PRId64
"kB < %" PRId64 "kB) and thrashing (%" PRId64 "%%)",
get_free_swap(&mi) * page_k, swap_low_threshold * page_k, thrashing);
/* Do not kill perceptible apps unless below min watermark or heavily thrashing */
if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
}
check_filecache = true;
} else if (swap_is_low && wmark < WMARK_HIGH) {
/* Both free memory and swap are low */
kill_reason = LOW_MEM_AND_SWAP;
snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap is low (%"
PRId64 "kB < %" PRId64 "kB)", wmark < WMARK_LOW ? "min" : "low",
get_free_swap(&mi) * page_k, swap_low_threshold * page_k);
/* Do not kill perceptible apps unless below min watermark or heavily thrashing */
if (wmark > WMARK_MIN && thrashing < thrashing_critical_pct) {
min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
}
} else if (wmark < WMARK_HIGH && swap_util_max < 100 &&
(swap_util = calc_swap_utilization(&mi)) > swap_util_max) {
/*
* Too much anon memory is swapped out but swap is not low.
* Non-swappable allocations created memory pressure.
*/
kill_reason = LOW_MEM_AND_SWAP_UTIL;
snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and swap utilization"
" is high (%d%% > %d%%)", wmark < WMARK_LOW ? "min" : "low",
swap_util, swap_util_max);
} else if (wmark < WMARK_HIGH && thrashing > thrashing_limit) {
/* Page cache is thrashing while memory is low */
kill_reason = LOW_MEM_AND_THRASHING;
snprintf(kill_desc, sizeof(kill_desc), "%s watermark is breached and thrashing (%"
PRId64 "%%)", wmark < WMARK_LOW ? "min" : "low", thrashing);
cut_thrashing_limit = true;
/* Do not kill perceptible apps unless thrashing at critical levels */
if (thrashing < thrashing_critical_pct) {
min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
}
check_filecache = true;
} else if (reclaim == DIRECT_RECLAIM && thrashing > thrashing_limit) {
/* Page cache is thrashing while in direct reclaim (mostly happens on lowram devices) */
kill_reason = DIRECT_RECL_AND_THRASHING;
snprintf(kill_desc, sizeof(kill_desc), "device is in direct reclaim and thrashing (%"
PRId64 "%%)", thrashing);
cut_thrashing_limit = true;
/* Do not kill perceptible apps unless thrashing at critical levels */
if (thrashing < thrashing_critical_pct) {
min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
}
check_filecache = true;
} else if (check_filecache) {
int64_t file_lru_kb = (vs.field.nr_inactive_file + vs.field.nr_active_file) * page_k;
if (file_lru_kb < filecache_min_kb) {
/* File cache is too low after thrashing, keep killing background processes */
kill_reason = LOW_FILECACHE_AFTER_THRASHING;
snprintf(kill_desc, sizeof(kill_desc),
"filecache is low (%" PRId64 "kB < %" PRId64 "kB) after thrashing",
file_lru_kb, filecache_min_kb);
min_score_adj = PERCEPTIBLE_APP_ADJ + 1;
} else {
/* File cache is big enough, stop checking */
check_filecache = false;
}
}
2.4 Selecting and killing a process
At this point the kill reason and the minimum allowed adj are known, and find_and_kill_process() is called to perform the kill.
if (kill_reason != NONE) {
struct kill_info ki = {
.kill_reason = kill_reason,
.kill_desc = kill_desc,
.thrashing = (int)thrashing,
.max_thrashing = max_thrashing,
};
// ...
int pages_freed = find_and_kill_process(min_score_adj, &ki, &mi, &wi, &curr_tm, &psi_data);
if (pages_freed > 0) {
killing = true;
// ...
}
}
find_and_kill_process() selects a suitable process whose adj is greater than or equal to min_score_adj and kills it.
static int find_and_kill_process(int min_score_adj, struct kill_info *ki, union meminfo *mi,
struct wakeup_info *wi, struct timespec *tm,
struct psi_data *pd) {
int i;
int killed_size = 0;
bool lmk_state_change_start = false;
bool choose_heaviest_task = kill_heaviest_task;
// iterate downward from adj 1000 (OOM_SCORE_ADJ_MAX)
for (i = OOM_SCORE_ADJ_MAX; i >= min_score_adj; i--) {
struct proc *procp;
if (!choose_heaviest_task && i <= PERCEPTIBLE_APP_ADJ) {
/*
* If we have to choose a perceptible process, choose the heaviest one to
* hopefully minimize the number of victims.
*/
choose_heaviest_task = true;
}
while (true) {
procp = choose_heaviest_task ?
// among processes with adj == i, pick the "heaviest" one or the tail of the bucket
proc_get_heaviest(i) : proc_adj_tail(i);
if (!procp)
break;
killed_size = kill_one_process(procp, min_score_adj, ki, mi, wi, tm, pd);
if (killed_size >= 0) {
break;
}
}
// once a kill has happened, do not continue to lower adj levels
if (killed_size) {
break;
}
}
return killed_size;
}
find_and_kill_process shows that every time lmkd triggers a kill, it scans candidates starting from adj 1000 downward and exits after a single kill. Within the "pick a suitable process" policy, the kill_heaviest_task property controls whether lmkd uses proc_get_heaviest or proc_adj_tail for selection; note, however, that once the scan reaches adj <= 200, killing user-perceptible processes has become unavoidable, and the "heaviest" selection is forced.
The "heaviest process" is determined in proc_get_heaviest by reading each candidate's "/proc/[pid]/statm" to obtain its RSS and picking the process with the largest RSS.
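A minimal sketch of that RSS lookup (hypothetical helper, not the actual proc_get_heaviest/proc_get_size code; the statm field order is the one documented in proc(5)):
#include <stdio.h>

// Return a process's resident set size in pages, or -1 on error.
// /proc/<pid>/statm fields: size resident shared text lib data dt.
static long read_rss_pages(int pid) {
    char path[64];
    snprintf(path, sizeof(path), "/proc/%d/statm", pid);

    FILE* f = fopen(path, "r");
    if (!f) return -1;            // process may already be gone

    long size = 0, resident = 0;
    int n = fscanf(f, "%ld %ld", &size, &resident);
    fclose(f);
    return n == 2 ? resident : -1;  // multiply by the page size for bytes
}
Roughly speaking, proc_get_heaviest walks the processes registered at the given adj, compares this RSS value, and returns the one with the largest RSS.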
From Android 11 onward, lmkd is typically tuned with the following two properties. They can be used on both high-performance and low-RAM devices.
ro.lmk.psi_partial_stall_ms
Partial PSI stall threshold, in milliseconds, for triggering a low-memory notification. If the device receives memory-pressure notifications too late, lower this value to trigger them earlier; if notifications are triggered unnecessarily, raise it to make the device less sensitive to noise. Default: 70 (high-performance devices), 200 (low-RAM devices).
ro.lmk.psi_complete_stall_ms
Complete PSI stall threshold, in milliseconds, for triggering a critical memory notification. If the device receives critical memory-pressure notifications too late, lower this value to trigger them earlier; if they are triggered unnecessarily, raise it to make the device less sensitive to noise. Default: 700.