Linux kernel学习-进程地址空间

Uranus Zhou — Fri, 06 Jul 2012 19:12:14 +0000

本文同步自（如浏览不正常请点击跳转）：https://zohead.com/archives/linux-kernel-learning-process-address-space/

看完 Linux kernel block I/O 层之后来到进程地址空间管理部分，本文中的很多知识和之前的 [进程基本]、[进程调度]、[内存管理] 等章节的知识相关。

1、基础知识：

Linux kernel 给每个进程提供的进程地址空间一般是 32 位或 64 位（硬件相关）的平坦地址空间，但进程是没有权限访问这段地址空间中的所有地址的，能访问的一般是很多的内存地址区间。这种内存地址区间被称为内存区域，进程可以动态添加和删除内存区域到它的地址空间中。内存区域可以有不同的权限，相关进程必须遵守这些权限，例如可读、可写、可执行等。如果进程访问的地址不在一个有效的内存区域中，或者访问时的权限不正确，kernel 将会杀掉进程并给出常见的 "Segmentation Fault" 段错误日志。

内存区域通常包括：

可执行文件的代码段，称为 text 段；
可执行文件的已初始化全局变量段，称为 data 段；
未初始化全局变量段（通常以 0 page 填充），称为 bss 段；
进程的用户空间栈（通常以 0 page 填充）；
每个共享库文件的额外 text、data、bss 段，也被装入进程的地址空间；
内存映射文件；
共享内存区域；
匿名内存映射（新版本的 malloc 函数就除了 brk 之外也通过 mmap 实现）；
应用程序中的堆

2、内存描述符：

kernel 使用 mm_struct 内存描述符结构来表示进程的地址空间信息，它定义在头文件中，这也是一个非常大的结构。

struct vm_area_struct {
	struct mm_struct * vm_mm;	/* The address space we belong to. */
	unsigned long vm_start;		/* Our start address within vm_mm. */
	unsigned long vm_end;		/* The first byte after our end address
					   within vm_mm. */

	/* linked list of VM areas per task, sorted by address */
	struct vm_area_struct *vm_next;

	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
	unsigned long vm_flags;		/* Flags, see mm.h. */

	struct rb_node vm_rb;

	/*
	 * For areas with an address space and backing store,
	 * linkage into the address_space->i_mmap prio tree, or
	 * linkage to the list of like vmas hanging off its node, or
	 * linkage of vma in the address_space->i_mmap_nonlinear list.
	 */
	union {
		struct {
			struct list_head list;
			void *parent;	/* aligns with prio_tree_node parent */
			struct vm_area_struct *head;
		} vm_set;

		struct raw_prio_tree_node prio_tree_node;
	} shared;

	/*
	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
	 * list, after a COW of one of the file pages.	A MAP_SHARED vma
	 * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
	 * or brk vma (with NULL file) can only be in an anon_vma list.
	 */
	struct list_head anon_vma_chain; /* Serialized by mmap_sem &
					  * page_table_lock */
	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */

	/* Function pointers to deal with this struct. */
	const struct vm_operations_struct *vm_ops;

	/* Information about our backing store: */
	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
					   units, *not* PAGE_CACHE_SIZE */
	struct file * vm_file;		/* File we map to (can be NULL). */
	void * vm_private_data;		/* was vm_pte (shared mem) */
	unsigned long vm_truncate_count;/* truncate_count or restart_addr */

#ifndef CONFIG_MMU
	struct vm_region *vm_region;	/* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
#endif
};

struct mm_struct {
	struct vm_area_struct * mmap;		/* list of VMAs */
	struct rb_root mm_rb;
	struct vm_area_struct * mmap_cache;	/* last find_vma result */
#ifdef CONFIG_MMU
	unsigned long (*get_unmapped_area) (struct file *filp,
				unsigned long addr, unsigned long len,
				unsigned long pgoff, unsigned long flags);
	void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
#endif
	unsigned long mmap_base;		/* base of mmap area */
	unsigned long task_size;		/* size of task vm space */
	unsigned long cached_hole_size; 	/* if non-zero, the largest hole below free_area_cache */
	unsigned long free_area_cache;		/* first hole of size cached_hole_size or larger */
	pgd_t * pgd;
	atomic_t mm_users;			/* How many users with user space? */
	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
	int map_count;				/* number of VMAs */
	struct rw_semaphore mmap_sem;
	spinlock_t page_table_lock;		/* Protects page tables and some counters */

	struct list_head mmlist;		/* List of maybe swapped mm's.	These are globally strung
						 * together off init_mm.mmlist, and are protected
						 * by mmlist_lock
						 */

	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
	unsigned long hiwater_vm;	/* High-water virtual memory usage */

	unsigned long total_vm, locked_vm, shared_vm, exec_vm;
	unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
	unsigned long start_code, end_code, start_data, end_data;
	unsigned long start_brk, brk, start_stack;
	unsigned long arg_start, arg_end, env_start, env_end;

	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

	/*
	 * Special counters, in some configurations protected by the
	 * page_table_lock, in other configurations by being atomic.
	 */
	struct mm_rss_stat rss_stat;

	struct linux_binfmt *binfmt;

	cpumask_t cpu_vm_mask;

	/* Architecture-specific MM context */
	mm_context_t context;

	/* Swap token stuff */
	/*
	 * Last value of global fault stamp as seen by this process.
	 * In other words, this value gives an indication of how long
	 * it has been since this task got the token.
	 * Look at mm/thrash.c
	 */
	unsigned int faultstamp;
	unsigned int token_priority;
	unsigned int last_interval;

	unsigned long flags; /* Must use atomic bitops to access the bits */

	struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_AIO
	spinlock_t		ioctx_lock;
	struct hlist_head	ioctx_list;
#endif
#ifdef CONFIG_MM_OWNER
	/*
	 * "owner" points to a task that is regarded as the canonical
	 * user/owner of this mm. All of the following must be true in
	 * order for it to be changed:
	 *
	 * current == mm->owner
	 * current->mm != mm
	 * new_owner->mm == mm
	 * new_owner->alloc_lock is held
	 */
	struct task_struct *owner;
#endif

#ifdef CONFIG_PROC_FS
	/* store ref to file /proc//exe symlink points to */
	struct file *exe_file;
	unsigned long num_exe_file_vmas;
#endif
#ifdef CONFIG_MMU_NOTIFIER
	struct mmu_notifier_mm *mmu_notifier_mm;
#endif
};

结构的注释中已经包含比较多的注解了哦。mmap 为地址空间的内存区域（用 vm_area_struct 结构来表示啦，也是上面的代码中）链表，mm_rb 则将其以红黑树的形式进行存储，链表形式方便遍历，红黑树形式方便查找。mm_users 为以原子变量形式保护的使用此地址空间的进程数量值（例如：如果有 4 个线程共享此地址空间，则 mm_users 值为 4），mm_count 为引用计数（所有 mm_users 等于一个引用计数），当 mm_count 值为 0 时表示没有再被使用，可以被释放。total_vm 成员表示所有内存区域的数量。

所有的 mm_struct 结构以链表的形式存在 mm_struct 的 mmlist 成员中，该链表的第一个成员就是 init 进程的 mm_struct ：init_mm，该链表被 mmlist_lock 锁保护。

进程的内存描述符是在 task_struct 的 mm 成员中的。fork() 进行创建进程时调用 copy_mm 函数将父进程的内存描述符拷贝给子进程，调用 clone() 函数时如果指定 CLONE_VM 参数将使父进程和子进程地址空间共享（实际上将 mm_users 计数加 1），这种子进程就被称为线程。mm_struct 结构一般是通过 alloc_mm 宏从名为 mm_cachep 的 Slab cache 中分配。

进程退出时调用 exit_mm 函数，该函数再调用 mmput() 函数，此函数中减小地址空间的 mm_users 计数，如果 mm_users 变为 0，调用 mmdrop() 函数减小 mm_count 计数，如果 mm_count 变为 0，则最终调用 free_mm() 宏来释放内存描述符（回归到 Slab cache 中）。

另外需要说明的是 kernel 线程是没有地址空间，也就没有对应的 mm_struct（值为 NULL），kernel 线程使用之前运行的进程的内存描述符，有关 kernel 线程请参考之前的 [进程基本] 文章。

3、VMA 概念：

vm_area_struct 结构即内存区域常被称为虚拟内存区域（简写为 VMA），表示的是在一个地址空间中的一个连续内存地址区间，每个内存区域是一个惟一的对象。vm_area_struct 中的 vm_mm 成员指向关联的内存描述符，vm_ops 成员为非常重要的关联的操作函数结构，vm_start 为起始地址，vm_end 为结束地址之后第一个字节的地址，即地址范围为：[vm_start, vm_end)。每个 VMA 对于它关联的内存描述符来说是惟一的，因此如果两个单独的进程映射相同的文件到各自的地址空间，它们的 VMA 也是不同的。

VMA 中的 vm_flags 表示内存区域中的页的行为状态，常见的状态有：VM_READ（页可读）、VM_WRITE（页可写）、VM_EXEC（页可被执行）、VM_SHARED（页被共享，被设置了称为共享映射，未设置称为私有映射）、VM_SHM（此区域被用作共享内存）、VM_LOCKED（页被锁）、VM_IO（此区域用于映射设备 I/O 空间）、VM_RESERVED（表示内存区域不可被交换出去）、VM_SEQ_READ（连续读，增强 readahead）、VM_RAND_READ（随机读，减弱 readahead）等。VM_SEQ_READ 和 VM_RAND_READ 标志可以通过 madvise() 系统调用来设置。

看看 vm_ops 操作函数结构的 vm_operations_struct 的定义，它在头文件中：

struct vm_operations_struct {
	void (*open)(struct vm_area_struct * area);
	void (*close)(struct vm_area_struct * area);
	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);

	/* notification that a previously read-only page is about to become
	 * writable, if an error is returned it will cause a SIGBUS */
	int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);

	/* called by access_process_vm when get_user_pages() fails, typically
	 * for use by special VMAs that can switch between memory and hardware
	 */
	int (*access)(struct vm_area_struct *vma, unsigned long addr,
		      void *buf, int len, int write);
#ifdef CONFIG_NUMA
	/*
	 * set_policy() op must add a reference to any non-NULL @new mempolicy
	 * to hold the policy upon return.  Caller should pass NULL @new to
	 * remove a policy and fall back to surrounding context--i.e. do not
	 * install a MPOL_DEFAULT policy, nor the task or system default
	 * mempolicy.
	 */
	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);

	/*
	 * get_policy() op must add reference [mpol_get()] to any policy at
	 * (vma,addr) marked as MPOL_SHARED.  The shared policy infrastructure
	 * in mm/mempolicy.c will do this automatically.
	 * get_policy() must NOT add a ref if the policy at (vma,addr) is not
	 * marked as MPOL_SHARED. vma policies are protected by the mmap_sem.
	 * If no [shared/vma] mempolicy exists at the addr, get_policy() op
	 * must return NULL--i.e., do not "fallback" to task or system default
	 * policy.
	 */
	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
					unsigned long addr);
	int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from,
		const nodemask_t *to, unsigned long flags);
#endif
};

当指定的内存区域被添加到地址空间时，open 函数被调用，反之移除时 close 函数被调用。如果一个不在内存中的页被访问，将触发缺页异常， fault 函数被缺页异常处理函数调用。当一个只读的页变为可写的时候，page_mkwrite 函数也被缺页异常处理函数调用。

mm_struct 中的 mmap 为内存区域链表，通过 VMA 的 vm_next 成员指向下一个内存区域，而且链表中的内存区域是按地址上升排序的，链表中最后一个 VMA 值为 NULL。而对于 mm_struct 的 mm_rb 红黑树，mm_rb 为红黑树的根，每个 VMA 通过其 vm_rb 红黑树节点类型链到红黑树中。

在应用层中可以通过 cat /proc//maps 或者 pmap 程序等方法查看应用程序的内存区域列表。

操作 VMA：

kernel 提供 find_vma() 函数用于查找指定的内存地址在哪个 VMA 上，它的实现在 mm/mmap.c 文件中，输入参数为内存描述符和内存地址：

struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma = NULL;

	if (mm) {
		/* Check the cache first. */
		/* (Cache hit rate is typically around 35%.) */
		vma = mm->mmap_cache;
		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
			struct rb_node * rb_node;

			rb_node = mm->mm_rb.rb_node;
			vma = NULL;

			while (rb_node) {
				struct vm_area_struct * vma_tmp;

				vma_tmp = rb_entry(rb_node,
						struct vm_area_struct, vm_rb);

				if (vma_tmp->vm_end > addr) {
					vma = vma_tmp;
					if (vma_tmp->vm_start <= addr)
						break;
					rb_node = rb_node->rb_left;
				} else
					rb_node = rb_node->rb_right;
			}
			if (vma)
				mm->mmap_cache = vma;
		}
	}
	return vma;
}

如果找不到对应的 VMA 则返回 NULL。需要注意的是返回的 VMA 的开始地址可能比指定的内存地址大。find_vma() 函数返回的结果会被缓存到内存描述符的 mmap_cache 成员中用于提高之后的查找性能，因为后续的操作很可能还是在同样的 VMA 上。如果在 mmap_cache 中找不到则通过红黑树进行查找。

find_vma_prev() 函数与 find_vma() 函数类似，不过它也会返回指定地址之前的最后一个 VMA：

struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)

kernel 另外还提供了 find_vma_intersection() 函数返回符合 find_vma() 的条件并且其开始地址不在指定内存结束地址之后的 VMA。

4、mmap 和 munmap：

kernel 提供 do_mmap() 函数创建新的线性地址区间，这是用户层 mmap() 函数的底层实现，它用于将一段地址区间添加到进程的地址空间中。

unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flag, unsigned long offset)

do_mmap 映射 file 参数指定的文件，并最终返回新创建的地址区间的初始地址。

offset 和 len 指定偏移量和长度。如果 file 为 NULL 并且 offset 为 0 则表示该映射后端不是基于文件的，这种映射被称为匿名映射，否则被称为基于文件的映射。prot 参数指定内存区域中页的访问权限，值可以为：PROT_READ（对应 VM_READ）、PROT_WRITE、PROT_EXEC、PROT_NONE 等。flag 指定 VMA 的其它标志，常用的有：MAP_SHARED（此映射可被共享）、MAP_PRIVATE（私有不可共享）、MAP_ANONYMOUS（指定匿名映射）、MAP_LOCKED 等。

如果可能的话，do_mmap 返回的内存区间会尽量和已有邻近的 VMA 合并（调整 VMA 大小），否则就创建一个新的 VMA。新的 VMA 从名为 vm_area_cachep 的 Slab cache 中分配，并通过 vma_link() 函数被加入到进程地址空间的链表和红黑树中，对应的 mm_struct 的 total_vm 成员也被更新。

do_mmap 是调用 do_mmap_pgoff() 函数完成真正的映射操作的。现在用户层使用的 mmap() 函数实际上是在用户层调用 mmap2() 系统调用并最终通过 do_mmap 来实现的。

do_munmap 用于从地址空间移除指定的地址区间：

int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)

do_munmap 导出给用户层就是 munmap() 函数了。

5、页表及应用程序 VMA：

Linux kernel 使用页式内存管理，应用程序给出的内存地址是虚拟地址，它需要经过若干级页表一级一级的变换，才变成真正的物理地址。有关 Linux 的分级页表结构等相关的知识请参考之前的 [内存寻址] 文章。

每个进程有自己的 task_struct，task_struct 中的 mm 指向其内存描述符，每个 mm 又有自己单独的页表（进程中的线程会进行共享），本文最上面介绍的内存描述符 mm_struct 中的 pgd_t * pgd 就指向进程的 PGD，对页表的操作和遍历等操作也需要用到 mm_struct 中的 page_table_lock 自旋锁成员。

应用程序中对内存的操作例如 malloc 分配内存等一般是改变了某个 VMA，不会直接改变页表。假设用户分配了内存，然后访问这块内存，由于页表里面并没有记录相关的映射，CPU 产生一次缺页异常，内核捕捉到异常，检查产生异常的地址是不是存在于一个合法的 VMA 中，如果不是，则给进程一个 "Segmentation Fault" 段错误，使其崩溃；如果是，则分配一个物理页，并为之建立映射。

应用程序中的堆是一个一端固定、一端可伸缩的 VMA，其大小可以通过 brk 系统调用进行调整，libc 的 malloc 函数就是基于 brk 来实现的（如果需要分配的内存很大时，libc 会通过 mmap 系统调用映射一个新的 VMA 以节省对堆 VMA 的一系列调整操作）。应用程序的栈也是一个 VMA，只是它是一端固定、一端可伸不能缩的，而且它是自动伸展的。另外需要说明的是线程的栈 VMA 明显不是和其它线程共享的，一般是在线程创建时通过 mmap 创建新的 VMA 并以此作为线程的栈。

本文只是对 Linux kernel 的进程地址空间的基础涉及，其中有任何问题，欢迎提出指正哦，玩的开心~~~ ^_^

Soul Of Free Loop » brk

Linux kernel学习-进程地址空间