【Linux内核学习笔记三】内存管理-页面(page)
1.页面
系统的内存划分成大小确定的许多块,这些块也称为页面帧,页帧是系统内存的最小单位。内核需要保持该结构尽可能小,因为对于现代计算机,物理内存包含大量的页帧,即便是增加page一点点空间,都会导致保存所有页帧page结构多占用大量物理内存。例如当页长度为4KB,主内存384MB时大约需要100000页。
每个物理页面帧由一个 struct page 描述。它在<kernel/include/linux/mm_types.h>中定义如下:
/*
* Each physical page in the system has a struct page associated with
* it to keep track of whatever it is we are using the page for at the
* moment. Note that we have no way to track which tasks are using
* a page, though if it is a pagecache page, rmap structures can tell us
* who is mapping it.
*
* The objects in struct page are organized in double word blocks in
* order to allows us to use atomic double word operations on portions
* of struct page. That is currently only used by slub but the arrangement
* allows the use of atomic double word operations on the flags/mapping
* and lru list pointers also.
*/
struct page {
/* First double word block */
unsigned long flags; /* Atomic flags, some possibly
* updated asynchronously */
union {
struct address_space *mapping; /* If low bit clear, points to
* inode address_space, or NULL.
* If page mapped as anonymous
* memory, low bit is set, and
* it points to anon_vma object:
* see PAGE_MAPPING_ANON below.
*/
void *s_mem; /* slab first object */
};
/* Second double word */
struct {
union {
pgoff_t index; /* Our offset within mapping. */
void *freelist; /* sl[aou]b first free object */
};
union {
#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \
defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE)
/* Used for cmpxchg_double in slub */
unsigned long counters;
#else
/*
* Keep _count separate from slub cmpxchg_double data.
* As the rest of the double word is protected by
* slab_lock but _count is not.
*/
unsigned counters;
#endif
struct {
union {
/*
* Count of ptes mapped in
* mms, to show when page is
* mapped & limit reverse map
* searches.
*
* Used also for tail pages
* refcounting instead of
* _count. Tail pages cannot
* be mapped and keeping the
* tail page _count zero at
* all times guarantees
* get_page_unless_zero() will
* never succeed on tail
* pages.
*/
atomic_t _mapcount;
struct { /* SLUB */
unsigned inuse:16;
unsigned objects:15;
unsigned frozen:1;
};
int units; /* SLOB */
};
atomic_t _count; /* Usage count, see below. */
};
unsigned int active; /* SLAB */
};
};
/* Third double word block */
union {
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone->lru_lock !
* Can be used as a generic list
* by the page owner.
*/
struct { /* slub per cpu partial pages */
struct page *next; /* Next partial slab */
#ifdef CONFIG_64BIT
int pages; /* Nr of partial slabs left */
int pobjects; /* Approximate # of objects */
#else
short int pages;
short int pobjects;
#endif
};
struct slab *slab_page; /* slab fields */
struct rcu_head rcu_head; /* Used by SLAB
* when destroying via RCU
*/
/* First tail page of compound page */
struct {
compound_page_dtor *compound_dtor;
unsigned long compound_order;
};
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS
pgtable_t pmd_huge_pte; /* protected by page->ptl */
#endif
};
/* Remainder is not double word aligned */
union {
unsigned long private; /* Mapping-private opaque data:
* usually used for buffer_heads
* if PagePrivate set; used for
* swp_entry_t if PageSwapCache;
* indicates order in the buddy
* system if PG_buddy is set.
*/
#if USE_SPLIT_PTE_PTLOCKS
#if ALLOC_SPLIT_PTLOCKS
spinlock_t *ptl;
#else
spinlock_t ptl;
#endif
#endif
struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */
struct page *first_page; /* Compound tail pages */
};
#ifdef CONFIG_MEMCG
struct mem_cgroup *mem_cgroup;
#endif
/*
* On machines where all RAM is mapped into kernel address space,
* we can simply calculate the virtual address. On machines with
* highmem some memory is mapped into kernel virtual memory
* dynamically, so we need a place to store that address.
* Note that this field could be 16 bits on x86 ... ;)
*
* Architectures with slow multiplication can define
* WANT_PAGE_VIRTUAL in asm/page.h
*/
#if defined(WANT_PAGE_VIRTUAL)
void *virtual; /* Kernel virtual address (NULL if
not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_KMEMCHECK
/*
* kmemcheck wants to track the status of each byte in a page; this
* is a pointer to such a status block. NULL if not tracked.
*/
void *shadow;
#endif
#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
int _last_cpupid;
#endif
}
下面对page的主要字段进行描述:
- flags:用来描述页面状态的标志,这些标志在<kernel/include/linux/page-flags.h>中定义如下:
enum pageflags {
PG_locked, /* Page is locked. Don't touch. */
PG_error,
PG_referenced,
PG_uptodate,
PG_dirty,
PG_lru,
PG_active,
PG_slab,
PG_owner_priv_1, /* Owner use. If pagecache, fs may use*/
PG_arch_1,
PG_reserved,
PG_private, /* If pagecache, has fs-private data */
PG_private_2, /* If pagecache, has fs aux data */
PG_writeback, /* Page is under writeback */
#ifdef CONFIG_PAGEFLAGS_EXTENDED
PG_head, /* A head page */
PG_tail, /* A tail page */
#else
PG_compound, /* A compound page */
#endif
PG_swapcache, /* Swap page: swp_entry_t in private */
PG_mappedtodisk, /* Has blocks allocated on-disk */
PG_reclaim, /* To be reclaimed asap */
PG_swapbacked, /* Page is backed by RAM/swap */
PG_unevictable, /* Page is "unevictable" */
#ifdef CONFIG_MMU
PG_mlocked, /* Page is vma mlocked */
#endif
#ifdef CONFIG_ARCH_USES_PG_UNCACHED
PG_uncached, /* Page has been mapped as uncached */
#endif
#ifdef CONFIG_MEMORY_FAILURE
PG_hwpoison, /* hardware poisoned page. Don't touch */
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
PG_compound_lock,
#endif
#if defined(CONFIG_IDLE_PAGE_TRACKING) && defined(CONFIG_64BIT)
PG_young,
PG_idle,
#endif
__NR_PAGEFLAGS,
/* Filesystems */
PG_checked = PG_owner_priv_1,
/* Two page bits are conscripted by FS-Cache to maintain local caching
* state. These bits are set on pages belonging to the netfs's inodes
* when those inodes are being locally cached.
*/
PG_fscache = PG_private_2, /* page backed by cache */
/* XEN */
/* Pinned in Xen as a read-only pagetable page. */
PG_pinned = PG_owner_priv_1,
/* Pinned as part of domain save (see xen_mm_pin_all()). */
PG_savepinned = PG_dirty,
/* Has a grant mapping of another (foreign) domain's page. */
PG_foreign = PG_owner_priv_1,
/* SLOB */
PG_slob_free = PG_private,
};
下表列出了一些常用的标志及其含义所示:
标志名 | 含义 |
---|---|
PG_locked | 页被锁定,例如,在磁盘I/O操作中涉及的页 |
PG_error | 在传输页时发生I/O错误 |
PG_referenced | 刚刚访问过的页 |
PG_uptodate | 在完成读操作后置位,除非发生磁盘I/O错误 |
PG_dirty | 页已经被修改 |
PG_lru | 页在活动或者非活动链表中 |
PG_active | 页在活动页链表中 |
PG_slab | 包含在slab中的页框 |
PG_arch_1 | 特定于体系结构的页面状态位。一般的代码保证当页面首次进入页面缓存时,该位将被清除。 |
PG_reserved | 保留给内核或没有使用的页 |
PG_private | 如果page中的private成员非空,则需要设置该标志。参考下面对private的解释。 |
PG_writeback | 正在将页写到磁盘上 |
PG_compound | 通过扩展分页机制处理页框 |
PG_swapcache | 页属于对换高速缓存 |
PG_mappedtodisk | 页框中的所有数据对应于磁盘上分配的块 |
PG_reclaim | 为回收内存对页已经做了写入磁盘的标记 |
PG_swapbacked | 该page的后备存储器是swap |
PG_unevictable | 该page被锁住,不能交换,并会出现在LRU_UNEVICTABLE链表中,它包括的几种page:ramdisk或ramfs使用的页、shm_locked、mlock锁定的页。 |
PG_mlocked | 该page在vma中被锁定,一般是通过系统调用mlock()锁定了一段内存 |
下面列出了测试,设置和清除这些标志的宏,他们都定义在<kernel/include/linux/page-flags.h>中:
/*
* Macros to create function definitions for page flags
*/
#define TESTPAGEFLAG(uname, lname) \
static inline int Page##uname(const struct page *page) \
{ return test_bit(PG_##lname, &page->flags); }
#define SETPAGEFLAG(uname, lname) \
static inline void SetPage##uname(struct page *page) \
{ set_bit(PG_##lname, &page->flags); }
#define CLEARPAGEFLAG(uname, lname) \
static inline void ClearPage##uname(struct page *page) \
{ clear_bit(PG_##lname, &page->flags); }
#define __SETPAGEFLAG(uname, lname) \
static inline void __SetPage##uname(struct page *page) \
{ __set_bit(PG_##lname, &page->flags); }
#define __CLEARPAGEFLAG(uname, lname) \
static inline void __ClearPage##uname(struct page *page) \
{ __clear_bit(PG_##lname, &page->flags); }
#define TESTSETFLAG(uname, lname) \
static inline int TestSetPage##uname(struct page *page) \
{ return test_and_set_bit(PG_##lname, &page->flags); }
#define TESTCLEARFLAG(uname, lname) \
static inline int TestClearPage##uname(struct page *page) \
{ return test_and_clear_bit(PG_##lname, &page->flags); }
#define __TESTCLEARFLAG(uname, lname) \
static inline int __TestClearPage##uname(struct page *page) \
{ return __test_and_clear_bit(PG_##lname, &page->flags); }
#define PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \
SETPAGEFLAG(uname, lname) CLEARPAGEFLAG(uname, lname)
#define __PAGEFLAG(uname, lname) TESTPAGEFLAG(uname, lname) \
__SETPAGEFLAG(uname, lname) __CLEARPAGEFLAG(uname, lname)
#define TESTSCFLAG(uname, lname) \
TESTSETFLAG(uname, lname) TESTCLEARFLAG(uname, lname)
#define TESTPAGEFLAG_FALSE(uname) \
static inline int Page##uname(const struct page *page) { return 0; }
#define SETPAGEFLAG_NOOP(uname) \
static inline void SetPage##uname(struct page *page) { }
#define CLEARPAGEFLAG_NOOP(uname) \
static inline void ClearPage##uname(struct page *page) { }
#define __CLEARPAGEFLAG_NOOP(uname) \
static inline void __ClearPage##uname(struct page *page) { }
#define TESTSETFLAG_FALSE(uname) \
static inline int TestSetPage##uname(struct page *page) { return 0; }
#define TESTCLEARFLAG_FALSE(uname) \
static inline int TestClearPage##uname(struct page *page) { return 0; }
#define __TESTCLEARFLAG_FALSE(uname) \
static inline int __TestClearPage##uname(struct page *page) { return 0; }
#define PAGEFLAG_FALSE(uname) TESTPAGEFLAG_FALSE(uname) \
SETPAGEFLAG_NOOP(uname) CLEARPAGEFLAG_NOOP(uname)
#define TESTSCFLAG_FALSE(uname) \
TESTSETFLAG_FALSE(uname) TESTCLEARFLAG_FALSE(uname)
- mapping:
- 当page->mapping == NULL,该page属于交换高速缓存页(swap cache),当需要使用地址空间时会指定交换分区的地址空间swapper_space。
- 当page->mapping != NULL,并且bit[0] == 0,该page属于页缓存或文件映射,mapping指向inode address_space。
- 当page->mapping != NULL,并且bit[0] != 0,该page属于匿名映射,且设置了PAGE_MAPPING_ANON位(bit[0] )被,page->mapping指向struct anon_vma对象。
- 当page->mapping != NULL,并且bit[1] != 0,bit[0] != 0,该page映射在VM_MERGEABLE区域的匿名页面上,如果启用了CONFIG_KSM,那么PAGE_MAPPING_KSM(bit[1]) 位与PAGE_MAPPING_ANON位一起被设置;然后page->mapping,不是指向anon_vma, 而是指向KSM与合并页相关联的private数据结构。
内核定义了一些API来取得mapping指针的状态,如下所示:
/*
* On an anonymous page mapped into a user virtual memory area,
* page->mapping points to its anon_vma, not to a struct address_space;
* with the PAGE_MAPPING_ANON bit set to distinguish it. See rmap.h.
*
* On an anonymous page in a VM_MERGEABLE area, if CONFIG_KSM is enabled,
* the PAGE_MAPPING_KSM bit may be set along with the PAGE_MAPPING_ANON bit;
* and then page->mapping points, not to an anon_vma, but to a private
* structure which KSM associates with that merged page. See ksm.h.
*
* PAGE_MAPPING_KSM without PAGE_MAPPING_ANON is currently never used.
*
* Please note that, confusingly, "page_mapping" refers to the inode
* address_space which maps the page from disk; whereas "page_mapped"
* refers to user virtual address space into which the page is mapped.
*/
#define PAGE_MAPPING_ANON 1
#define PAGE_MAPPING_KSM 2
#define PAGE_MAPPING_FLAGS (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM)
/* 获取page->mapping anon_vma位状态,如果该位被设置,则page->mapping指向该page
* 的anon_vma私有数据
*/
static inline int PageAnon(struct page *page)
{
return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0;
}
#ifdef CONFIG_KSM
/*
* A KSM page is one of those write-protected "shared pages" or "merged pages"
* which KSM maps into multiple mms, wherever identical anonymous page content
* is found in VM_MERGEABLE vmas. It's a PageAnon page, pointing not to any
* anon_vma, but to that page's node of the stable tree.
*/
/* 获取page->mapping anon_vma位和ksm位状态,如果都被设置,则page->mapping指向该page
* 的KSM私有数据
*/
static inline int PageKsm(struct page *page)
{
return ((unsigned long)page->mapping & PAGE_MAPPING_FLAGS) ==
(PAGE_MAPPING_ANON | PAGE_MAPPING_KSM);
}
#else
TESTPAGEFLAG_FALSE(Ksm)
#endif
- s_mem:第一个slab对象。
- index:这个字段有两个用途,它的意义与该页面的状态有关。如果页面是文件映射的一部分,它就是页面在文件中的偏移。如果页面是交换高速缓存的一部分,它就是在交换地址空间中(swapper_space)address_space的偏移量。此外,如果包含页面的块被释放以提供给一个特殊的进程,那么被释放的块的顺序存放在index中。这在函数__free_pages_ok()中设置。
- freelist:所在slab的第一个free对象。
- _mapcount:页映射计数器, 被页表映射的次数,也就是说该page同时被多少个进程共享。 初始值为-1,如果只被一个进程的页表映射了,该值为0. 如果该page处于伙伴系统中,该值为PAGE_BUDDY_MAPCOUNT_VALUE(-128),内核通过判断该值是否为PAGE_BUDDY_MAPCOUNT_VALUE来确定该page是否属于伙伴系统。
- inuse:16:在当前slab中,已经分配的slab数量。
- objects:15:在当前slab中,总slab数量。
- frozen:1:如果为1,表示只能由活动CPU分配SLAB其他CPU只能向其释放SLAB。
- _count:引用计数,表示内核中引用该page的次数, 如果要操作该page, 引用计数会+1, 操作完成-1. 当该值为0时, 表示没有引用该page的位置,所以该page可以被解除映射,这往往在内存回收时是有用的
- active:SLAB用于管理页面中可用的SLAB对象。
- lru:对SLAB来说,通过lru将其链接到SLAB的partial、free等链表中;在伙伴系统中,即当页空闲时,lru指向伙伴系统链表中相邻元素。
- private:是一个指向"私有"数据的指针,虚拟内存管理会忽略该数据。根据页的用途,可以用不同的方式指向该指针。大多数情况下它用于将页与数据缓冲区关联起来。private私有数据指针, 由应用场景确定其具体的含义:
a.如果设置了PG_private标志,则private字段指向struct buffer_head
b.如果设置了PG_compound,则指向struct page
c.如果设置了PG_swapcache标志,private存储了该page在交换分区中对应的位置信息swp_entry_t。
d.如果_mapcount = PAGE_BUDDY_MAPCOUNT_VALUE,说明该page位于伙伴系统,private存储该伙伴的阶 - slab_cache:指向本页关联的SLAB描述符。
2.页面初始化
系统在初始化管理区(zone)后,紧接着就会建立和初始化zone相关的page。在【Linux内核学习笔记二】内存管理-管理区(zone)中已经讲述了free_area_init_core()用于向每个zone填充相关信息 。 free_area_init_core()向每个zone填充相关信息的过程中调用memmap_init()建立和初始化page,并设置所有page的PG_reserved位。
memmap_init()通过memmap_init_zone()建立和初始化某个zone中的page,代码如下:
/*
* Initially all pages are reserved - free ones are freed
* up by free_all_bootmem() once the early boot process is
* done. Non-atomic initialization, single-pass.
*/
/**
* 将zone中的页框PG_reserved置位。表示该页不可用。
* 同时初始化页框中其他值。
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
unsigned long start_pfn, enum memmap_context context)
{
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
struct zone *z;
unsigned long nr_initialised = 0;
if (highest_memmap_pfn < end_pfn - 1)
highest_memmap_pfn = end_pfn - 1;
z = &pgdat->node_zones[zone];
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
/*
* There can be holes in boot-time mem_map[]s
* handed to this function. They do not
* exist on hotplugged memory.
*/
if (context == MEMMAP_EARLY) {
if (!early_pfn_valid(pfn))
continue;
if (!early_pfn_in_nid(pfn, nid))
continue;
if (!update_defer_init(pgdat, pfn, end_pfn,
&nr_initialised))
break;
}
/*
* Mark the block movable so that blocks are reserved for
* movable at startup. This will force kernel allocations
* to reserve their blocks rather than leaking throughout
* the address space during boot when many long-lived
* kernel allocations are made. Later some blocks near
* the start are marked MIGRATE_RESERVE by
* setup_zone_migrate_reserve()
*
* bitmap is created for zone's valid pfn range. but memmap
* can be created for invalid pages (for alignment)
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
*/
if (!(pfn & (pageblock_nr_pages - 1))) {
//创建page
struct page *page = pfn_to_page(pfn);
//初始化page
__init_single_page(page, pfn, zone, nid);
set_pageblock_migratetype(page, MIGRATE_MOVABLE);
} else {
__init_single_pfn(pfn, zone, nid);
}
}
}
memmap_init_zone()中,在一个循环中建立和初始化了该zone地址范围内的所有page,函数pfn_to_page()用于创建page,
__init_single_page()用于初始化page,__init_single_page()代码如下所示:
static void __meminit __init_single_page(struct page *page, unsigned long pfn,
unsigned long zone, int nid)
{
//建立page到zone和node的映射
set_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
page_cpupid_reset_last(page);
INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
/* The shift won't overflow because ZONE_NORMAL is below 4G. */
if (!is_highmem_idx(zone))
set_page_address(page, __va(pfn << PAGE_SHIFT));
#endif
}
3.页面和zone,node之间的映射
在上一节中内核已经通过set_page_links()将页面映射到了zone和node。映射过程中,内核使用struct page的flags中的字段来保存页所属的zone以及node。通过page_zone(),可以获取某个page所在的zone。
以下是页面和zone,node之间的映射的一些api:
static inline struct zone *page_zone(const struct page *page)
{
return &NODE_DATA(page_to_nid(page))->node_zones[page_zonenum(page)];
}
static inline void set_page_zone(struct page *page, enum zone_type zone)
{
page->flags &= ~(ZONES_MASK << ZONES_PGSHIFT);
page->flags |= (zone & ZONES_MASK) << ZONES_PGSHIFT;
}
static inline void set_page_node(struct page *page, unsigned long node)
{
page->flags &= ~(NODES_MASK << NODES_PGSHIFT);
page->flags |= (node & NODES_MASK) << NODES_PGSHIFT;
}
其中NODE_DATA为全局的node表。在UMA结构中, 只有一个node结点即contig_page_data, 此时NODE_DATA直接指向了全局的contig_page_data, 而与node的编号nid无关,NODE_DATA在<kernel/include/linux/mmzone.h>中定义如下:
#ifndef CONFIG_NEED_MULTIPLE_NODES
extern struct pglist_data contig_page_data;
#define NODE_DATA(nid) (&contig_page_data)
在NUMA结构的系统中, 所有的node都存储在node_data数组中,NODE_DATA直接通过node编号索引即可,以x86-
32系统为例,NODE_DATA的在<kernel/arch/x86/include/asm/mmzone_32.h>中定义如下:
#ifdef CONFIG_NUMA
extern struct pglist_data *node_data[];
#define NODE_DATA(nid) (node_data[nid])
#endif /* CONFIG_NUMA */
更多推荐
所有评论(0)