本文基于linux-5.0内核源码分析
include/linux/mmzone.h
include/linux/pagevec.h
include/linux/mm_inline.h
include/linux/pagemap.h
include/linux/vmstat.h
mm/swap.c
mm/vmscan.c
mm/util.c
mm/rmap.c
1. lru_list
#define LRU_BASE 0
#define LRU_ACTIVE 1
#define LRU_FILE 2
// lru是双向链表: 内核根据页面类型(匿名页和文件页)与活跃性(活跃和不活跃), 分成5种类型lru链表
enum lru_list {
// 0: inactive anonymous page lru list
LRU_INACTIVE_ANON = LRU_BASE,
// 1: active anonymous page lru list
LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE,
// 2: inactive page cache lru list
LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE,
// 3: active page cache lru list
LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE,
// 4: unevictable page lru list
LRU_UNEVICTABLE,
NR_LRU_LISTS
};
2. lruvec
struct lruvec {
// 每个lruvec都包含5个lru链表
struct list_head lists[NR_LRU_LISTS];
struct zone_reclaim_stat reclaim_stat;
/* Evictions & activations on the inactive file list */
atomic_long_t inactive_age;
/* Refaults at the time of last reclaim cycle */
unsigned long refaults;
#ifdef CONFIG_MEMCG
// 每个node都包含1个lruvec: pgdat标识lruvec所属的node
struct pglist_data *pgdat;
#endif
};
3. pagevec
/* 15 pointers + header align the pagevec structure to a power of two */
// 对比4.14.186的内核: PAGEVEC_SIZE为14
#define PAGEVEC_SIZE 15
// pagevec用于批量处理
struct pagevec {
unsigned long nr;
bool percpu_pvec_drained;
// 每个pagevec都有1个15个page大小的数组
struct page *pages[PAGEVEC_SIZE];
};
4. lru_cache_add
// 将page添加到指定的lru链表
void lru_cache_add(struct page *page)
{
// 活跃且不可回收的页面不能加入lru链表
VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
// 已经添加到lru链表的不能再重复添加
VM_BUG_ON_PAGE(PageLRU(page), page);
__lru_cache_add(page);
}
/*
*每个cpu定义1个pagevec
*/
// lru_add_pvec用于存放添加到lru链表的页面
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
#ifdef CONFIG_SMP
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
#endif
static void __lru_cache_add(struct page *page)
{
// 获取当前cpu的pagevec
struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
get_page(page);
// 1.首先尝试通过pagevec_add将page添加到pagevec的pages数组
// 2.如果添加失败代表当前pagevec已满, 需要将pagevec的15个page批量提交到lru链表
// 3.如果是复合页也直接批量提交
if (!pagevec_add(pvec, page) || PageCompound(page))
__pagevec_lru_add(pvec);
// 更新lru_add_pvec
put_cpu_var(lru_add_pvec);
}
4.1 pagevec_add
// 将page添加到pagevec, 并返回剩余可用的空间
static inline unsigned pagevec_add(struct pagevec *pvec, struct page *page)
{
// 将page保存到pagevec的pages数组, 并将page数量加1
pvec->pages[pvec->nr++] = page;
// 返回剩余空间: 为0代表空间已满添加失败
return pagevec_space(pvec);
}
4.2 pagevec_space
// pagevec最多保存15个page, nr保存pagevec当前存储的page数: 两者之差等于pagevec剩余可用空间
static inline unsigned pagevec_space(struct pagevec *pvec)
{
return PAGEVEC_SIZE - pvec->nr;
}
4.3 __pagevec_lru_add
void __pagevec_lru_add(struct pagevec *pvec)
{
// 批量处理pagevec的所有page: 针对每个page调用__pagevec_lru_add_fn方法
pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
}
4.4 pagevec_lru_move_fn
static void pagevec_lru_move_fn(struct pagevec *pvec,
void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
void *arg)
{
int i;
struct pglist_data *pgdat = NULL;
struct lruvec *lruvec;
unsigned long flags = 0;
// 遍历pagevec中的每个page
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
// page所属的节点
struct pglist_data *pagepgdat = page_pgdat(page);
if (pagepgdat != pgdat) {
if (pgdat)
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
pgdat = pagepgdat;
spin_lock_irqsave(&pgdat->lru_lock, flags);
}
// 1.如果mem_cgroup_disabled: 则返回pglist_data的lruvec
// 2.否则返回mem_cgroup_per_node的lruvec
lruvec = mem_cgroup_page_lruvec(page, pgdat);
// 回调__pagevec_lru_add种定义的move_fn函数: __pagevec_lru_add_fn
(*move_fn)(page, lruvec, arg);
}
if (pgdat)
spin_unlock_irqrestore(&pgdat->lru_lock, flags);
// 释放并重新初始化pagevec
release_pages(pvec->pages, pvec->nr, pvec->cold);
pagevec_reinit(pvec);
}
4.5 __pagevec_lru_add_fn
static inline int page_is_file_cache(struct page *page)
{
// anonymous page通过磁盘上的swap分区或者在RAM开辟swap分区(zram)实现回收
// page cache通过drop或者writeback回收
// PG_swapbacked为0, 即page cache
return !PageSwapBacked(page);
}
// inactive list:包括inactive page cache和inactive anonymous page
static inline enum lru_list page_lru_base_type(struct page *page)
{
if (page_is_file_cache(page))
return LRU_INACTIVE_FILE;
return LRU_INACTIVE_ANON;
}
static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
// 4.14.186内核实现
// 判断是否文件缓存: 不需要swap分区支持的就是文件缓存
// int file = page_is_file_cache(page);
// 判断是否活跃
// int active = PageActive(page);
// 计算page的lru类型
// enum lru_list lru = page_lru(page);
// 将page添加到lruvec类型为lru的链表上, 然后更新node和zone的统计信息
// add_page_to_lru_list(page, lruvec, lru);
// 更新lruvec的zone_reclaim_stat成员信息
// update_page_reclaim_stat(lruvec, file, active);
// trace_mm_lru_insertion(page, lru);
enum lru_list lru;
// 判断page曾经是否不可回收, 同时清除其PG_unevictable标志位
int was_unevictable = TestClearPageUnevictable(page);
// 不能重复添加到lru链表
VM_BUG_ON_PAGE(PageLRU(page), page);
// 设置PG_lru标志位
SetPageLRU(page);
smp_mb();
// 判断page是否可回收
if (page_evictable(page)) {
// 获取page的lru链表类型
lru = page_lru(page);
update_page_reclaim_stat(lruvec, page_is_file_cache(page),
PageActive(page));
if (was_unevictable)
count_vm_event(UNEVICTABLE_PGRESCUED);
} else {
// page属于不可回收的lru链表
lru = LRU_UNEVICTABLE;
// 清除PG_active标志位
ClearPageActive(page);
// 设置PG_unevictable标志位
SetPageUnevictable(page);
if (!was_unevictable)
count_vm_event(UNEVICTABLE_PGCULLED);
}
// 将page添加到lruvec类型为lru的链表上, 然后更新node和zone的统计信息
add_page_to_lru_list(page, lruvec, lru);
trace_mm_lru_insertion(page, lru);
}
4.5.1 page_evictable
// 两种不可回收的情况
// 1.page->mapping被标记为不可回收
// 2.page属于1个被锁住的vma
int page_evictable(struct page *page)
{
int ret;
/* Prevent address_space of inode and swap cache from being freed */
rcu_read_lock();
// 首先判断page是否可以回收, 其次判断page是否设置PG_mlocked标志位
ret = !mapping_unevictable(page_mapping(page)) && !PageMlocked(page);
rcu_read_unlock();
return ret;
}
4.5.2 page_mapping
struct address_space *page_mapping(struct page *page)
{
struct address_space *mapping;
page = compound_head(page);
/* This happens if someone calls flush_dcache_page on slab page */
if (unlikely(PageSlab(page)))
return NULL;
// swap缓存
if (unlikely(PageSwapCache(page))) {
swp_entry_t entry;
entry.val = page_private(page);
// 返回swapper_spaces数组的address_space元素
return swap_address_space(entry);
}
mapping = page->mapping;
// 如果是匿名映射则返回NULL
if ((unsigned long)mapping & PAGE_MAPPING_ANON)
return NULL;
// 返回page映射的address_space
return (void *)((unsigned long)mapping & ~PAGE_MAPPING_FLAGS);
}
4.5.3 mapping_unevictable
/*
* Bits in mapping->flags.
*/
enum mapping_flags {
AS_EIO = 0, /* IO error on async write */
AS_ENOSPC = 1, /* ENOSPC on async write */
AS_MM_ALL_LOCKS = 2, /* under mm_take_all_locks() */
AS_UNEVICTABLE = 3, /* e.g., ramdisk, SHM_LOCK */
AS_EXITING = 4, /* final truncate in progress */
/* writeback related tags are not used */
AS_NO_WRITEBACK_TAGS = 5,
};
static inline int mapping_unevictable(struct address_space *mapping)
{
// 判断address_space->flags是否含有AS_UNEVICTABLE标志位
if (mapping)
return test_bit(AS_UNEVICTABLE, &mapping->flags);
return !!mapping;
}
4.6 add_page_to_lru_list
static __always_inline void add_page_to_lru_list(struct page *page,
struct lruvec *lruvec, enum lru_list lru)
{
// 更新node和zone中的lru链表大小: page_zonenum返回page对应的zone索引
update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page));
// 将page插入到lruvec对应的链表末尾
list_add(&page->lru, &lruvec->lists[lru]);
}
4.6.1 update_lru_size
static __always_inline void update_lru_size(struct lruvec *lruvec,
enum lru_list lru, enum zone_type zid,
int nr_pages)
{
// 继续调用__update_lru_size
__update_lru_size(lruvec, lru, zid, nr_pages);
#ifdef CONFIG_MEMCG
// memory cgroup使能时更新mem_cgroup_per_node
mem_cgroup_update_lru_size(lruvec, lru, zid, nr_pages);
#endif
}
4.6.2 __update_lru_size
static __always_inline void __update_lru_size(struct lruvec *lruvec,
enum lru_list lru, enum zone_type zid,
int nr_pages)
{
// lruvec对应的节点
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
// 更新node统计信息
__mod_node_page_state(pgdat, NR_LRU_BASE + lru, nr_pages);
// 更新zone统计信息
__mod_zone_page_state(&pgdat->node_zones[zid],
NR_ZONE_LRU_BASE + lru, nr_pages);
}
4.6.3 __mod_node_page_state
enum node_stat_item {
NR_LRU_BASE,
NR_INACTIVE_ANON = NR_LRU_BASE, /* must match order of LRU_[IN]ACTIVE */
NR_ACTIVE_ANON, /* " " " " " */
NR_INACTIVE_FILE, /* " " " " " */
NR_ACTIVE_FILE, /* " " " " " */
NR_UNEVICTABLE, /* " " " " " */
...
NR_VM_NODE_STAT_ITEMS
};
static inline void __mod_node_page_state(struct pglist_data *pgdat,
enum node_stat_item item, int delta)
{
// delta代表新增的page数量
node_page_state_add(delta, pgdat, item);
}
static inline void node_page_state_add(long x, struct pglist_data *pgdat,
enum node_stat_item item)
{
// 更新node的vm_stat统计
atomic_long_add(x, &pgdat->vm_stat[item]);
// 更新全局的vm_node_stat统计
atomic_long_add(x, &vm_node_stat[item]);
}
4.6.4 __mod_zone_page_state
enum zone_stat_item {
/* First 128 byte cacheline (assuming 64 bit words) */
NR_FREE_PAGES,
NR_ZONE_LRU_BASE, /* Used only for compaction and reclaim retry */
NR_ZONE_INACTIVE_ANON = NR_ZONE_LRU_BASE,
NR_ZONE_ACTIVE_ANON,
NR_ZONE_INACTIVE_FILE,
NR_ZONE_ACTIVE_FILE,
NR_ZONE_UNEVICTABLE,
...
NR_VM_ZONE_STAT_ITEMS };
static inline void __mod_zone_page_state(struct zone *zone,
enum zone_stat_item item, long delta)
{
// delta代表新增的page数量
zone_page_state_add(delta, zone, item);
}
static inline void zone_page_state_add(long x, struct zone *zone,
enum zone_stat_item item)
{
// 更新zone的vm_stat统计
atomic_long_add(x, &zone->vm_stat[item]);
// 更新全局的vm_zone_stat统计
atomic_long_add(x, &vm_zone_stat[item]);
}
5. mark_page_accessed(二次机会法)
// 当page被访问时会有以下三种PG_active和PG_referenced的组合
// 一.不活跃且未被引用 -> 转换为不活跃且被引用
// 二.不活跃且被引用 -> 转换为活跃且未被引用
// 三.活跃且未被引用 -> 转换为活跃且被引用
void mark_page_accessed(struct page *page)
{
page = compound_head(page);
// 1. PG_active为0, 即inactive page
// 2. PG_unevictable为0, 即可回收的page
// 3. PG_referenced为1, 即已经被使用的page
// 对应第二种组合: inactive,referenced -> active,unreferenced
if (!PageActive(page) && !PageUnevictable(page) &&
PageReferenced(page)) {
// PG_lru为1, 即在lru链表中
if (PageLRU(page))
// 激活page: 将page从inactive list迁移到active list
activate_page(page);
else
// 激活page: 将PG_active标志位设置为1
__lru_cache_activate_page(page);
// 清除PG_referenced标志位
ClearPageReferenced(page);
if (page_is_file_cache(page))
workingset_activation(page);
} else if (!PageReferenced(page)) {
// 对应第一种和第三种组合
// inactive,unreferenced -> inactive,referenced
// active,unreferenced -> active,referenced
// 只需设置PG_referenced标志位
SetPageReferenced(page);
}
if (page_is_idle(page))
clear_page_idle(page);
}
5.1 activate_page
// 支持对称多处理器
#ifdef CONFIG_SMP
// 每个cpu都有1个pagevec用于保存active page
static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
void activate_page(struct page *page)
{
page = compound_head(page);
// page需要满足在lru链表, inactive和evictable三个条件
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
// 获取当前cpu的activate_page_pvecs
struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
get_page(page);
// 同前面介绍过的__lru_cache_add类似
// 1.首先尝试调用pagevec_add将page添加到pagevec
// 2.如果添加失败代表pagevec已满, 则将pagevec批量激活
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, __activate_page, NULL);
// 更新activate_page_pvecs
put_cpu_var(activate_page_pvecs);
}
}
#else
void activate_page(struct page *page)
{
// 获取page对用的zone
struct zone *zone = page_zone(page);
page = compound_head(page);
spin_lock_irq(zone_lru_lock(zone));
// mem_cgroup_page_lruvec返回值
// 1.如果支持memory cgroup, 返回mem_cgroup_per_node->lruvec
// 2.否则返回pglist_data->lruvec
__activate_page(page, mem_cgroup_page_lruvec(page, zone->zone_pgdat), NULL);
spin_unlock_irq(zone_lru_lock(zone));
}
#endif
5.2 __activate_page
// page: 即将被激活的page
// lruvec: page对应的lruvec
static void __activate_page(struct page *page, struct lruvec *lruvec,
void *arg)
{
// 这三个条件已经在mark_page_accessed中判断过
// 对应为: inactive,referenced -> active,unreferenced
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
// 判断是否page cache
int file = page_is_file_cache(page);
// 判断lru类型
int lru = page_lru_base_type(page);
// 将page从lruvec原有的inactive list中删除
del_page_from_lru_list(page, lruvec, lru);
// 将PG_active置为1, page状态由inactive变为active
SetPageActive(page);
// lru链表类型变为LRU_ACTIVE_ANON或者LRU_ACTIVE_FILE
lru += LRU_ACTIVE;
// 将page插入到lruvec的lists[LRU_ACTIVE_ANON]或者lists[LRU_ACTIVE_FILE]链表尾部
add_page_to_lru_list(page, lruvec, lru);
trace_mm_lru_activate(page);
__count_vm_event(PGACTIVATE);
update_page_reclaim_stat(lruvec, file, 1);
}
}
5.2.1 del_page_from_lru_list
static __always_inline void del_page_from_lru_list(struct page *page,
struct lruvec *lruvec, enum lru_list lru)
{
// 将page从原有lru链表删除
list_del(&page->lru);
// 更新node和zone的数据统计: 这里是删除所以是取page的负数
update_lru_size(lruvec, lru, page_zonenum(page), -hpage_nr_pages(page));
}
5.2.2 add_page_to_lru_list
static __always_inline void add_page_to_lru_list(struct page *page,
struct lruvec *lruvec, enum lru_list lru)
{
// 更新node和zone的数据统计: 这里是添加所以是加上page数
update_lru_size(lruvec, lru, page_zonenum(page), hpage_nr_pages(page));
// 将page添加到lru类型的链表上
list_add(&page->lru, &lruvec->lists[lru]);
}
5.3 __lru_cache_activate_page
// lru_cache_add一节中介绍过lru_add_pvec用于存放待批量操作的page
static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
static void __lru_cache_activate_page(struct page *page)
{
// 获取当前cpu的pagevec
struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
int i;
// 遍历pagevec中的每个page
for (i = pagevec_count(pvec) - 1; i >= 0; i--) {
struct page *pagevec_page = pvec->pages[i];
// 如果传入的page已经在pagevec中, 则设置其PG_active为1, page状态由inactive改变为active
if (pagevec_page == page) {
SetPageActive(page);
break;
}
}
// 更新pagevec
put_cpu_var(lru_add_pvec);
}
6. page_check_references
static enum page_references page_check_references(struct page *page,
struct scan_control *sc)
{
int referenced_ptes, referenced_page;
unsigned long vm_flags;
// 判断page被pte引用的次数
referenced_ptes = page_referenced(page, 1, sc->target_mem_cgroup,
&vm_flags);
// 返回并清除PG_referenced标志位
referenced_page = TestClearPageReferenced(page);
/*
* Mlock lost the isolation race with us. Let try_to_unmap()
* move the page to the unevictable list.
*/
if (vm_flags & VM_LOCKED)
return PAGEREF_RECLAIM;
if (referenced_ptes) {
if (PageSwapBacked(page))
return PAGEREF_ACTIVATE;
SetPageReferenced(page);
if (referenced_page || referenced_ptes > 1)
return PAGEREF_ACTIVATE;
/*
* Activate file-backed executable pages after first usage.
*/
if (vm_flags & VM_EXEC)
return PAGEREF_ACTIVATE;
return PAGEREF_KEEP;
}
/* Reclaim if clean, defer dirty pages to writeback */
if (referenced_page && !PageSwapBacked(page))
return PAGEREF_RECLAIM_CLEAN;
return PAGEREF_RECLAIM;
}
6.1 page_referenced
// 1.利用rmap系统遍历所有映射该页面的pte
// 2.对每个pte: 如果L_PTE_YOUNG比特位置位说明之前被访问过,referenced计数加1;
// 然后清空L_PTE_YOUNG.对ARM32来说会清空硬件页表项内容, 人为制造一个缺页中断
// 当再次访问该pte时,在缺页中断中设置L_PTE_YOUNG比特位
// 2.返回referenced计数, 表示该页有多少个访问引用pte
int page_referenced(struct page *page,
int is_locked,
struct mem_cgroup *memcg,
unsigned long *vm_flags)
{
int we_locked = 0;
struct page_referenced_arg pra = {
.mapcount = total_mapcount(page),
.memcg = memcg,
};
struct rmap_walk_control rwc = {
.rmap_one = page_referenced_one,
.arg = (void *)&pra,
.anon_lock = page_lock_anon_vma_read,
};
*vm_flags = 0;
if (!page_mapped(page))
return 0;
if (!page_rmapping(page))
return 0;
if (!is_locked && (!PageAnon(page) || PageKsm(page))) {
we_locked = trylock_page(page);
if (!we_locked)
return 1;
}
/*
* If we are reclaiming on behalf of a cgroup, skip
* counting on behalf of references from different
* cgroups
*/
if (memcg) {
rwc.invalid_vma = invalid_page_referenced_vma;
}
rmap_walk(page, &rwc);
*vm_flags = pra.vm_flags;
if (we_locked)
unlock_page(page);
return pra.referenced;
}
6.2 page_referenced_one
static bool page_referenced_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, void *arg)
{
struct page_referenced_arg *pra = arg;
struct page_vma_mapped_walk pvmw = {
.page = page,
.vma = vma,
.address = address,
};
int referenced = 0;
while (page_vma_mapped_walk(&pvmw)) {
address = pvmw.address;
if (vma->vm_flags & VM_LOCKED) {
page_vma_mapped_walk_done(&pvmw);
pra->vm_flags |= VM_LOCKED;
return false; /* To break the loop */
}
if (pvmw.pte) {
if (ptep_clear_flush_young_notify(vma, address,
pvmw.pte)) {
if (likely(!(vma->vm_flags & VM_SEQ_READ)))
referenced++;
}
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
if (pmdp_clear_flush_young_notify(vma, address,
pvmw.pmd))
referenced++;
} else {
/* unexpected pmd-mapped page? */
WARN_ON_ONCE(1);
}
pra->mapcount--;
}
if (referenced)
clear_page_idle(page);
if (test_and_clear_page_young(page))
referenced++;
if (referenced) {
pra->referenced++;
pra->vm_flags |= vma->vm_flags;
}
if (!pra->mapcount)
return false; /* To break the loop */
return true;
}
本文含有隐藏内容,请 开通VIP 后查看