Soul Of Free Loop » 代码分析

Linux kernel学习-进程地址空间

Uranus Zhou — Fri, 06 Jul 2012 19:12:14 +0000

本文同步自（如浏览不正常请点击跳转）：https://zohead.com/archives/linux-kernel-learning-process-address-space/

看完 Linux kernel block I/O 层之后来到进程地址空间管理部分，本文中的很多知识和之前的 [进程基本]、[进程调度]、[内存管理] 等章节的知识相关。

1、基础知识：

Linux kernel 给每个进程提供的进程地址空间一般是 32 位或 64 位（硬件相关）的平坦地址空间，但进程是没有权限访问这段地址空间中的所有地址的，能访问的一般是很多的内存地址区间。这种内存地址区间被称为内存区域，进程可以动态添加和删除内存区域到它的地址空间中。内存区域可以有不同的权限，相关进程必须遵守这些权限，例如可读、可写、可执行等。如果进程访问的地址不在一个有效的内存区域中，或者访问时的权限不正确，kernel 将会杀掉进程并给出常见的 "Segmentation Fault" 段错误日志。

内存区域通常包括：

可执行文件的代码段，称为 text 段；
可执行文件的已初始化全局变量段，称为 data 段；
未初始化全局变量段（通常以 0 page 填充），称为 bss 段；
进程的用户空间栈（通常以 0 page 填充）；
每个共享库文件的额外 text、data、bss 段，也被装入进程的地址空间；
内存映射文件；
共享内存区域；
匿名内存映射（新版本的 malloc 函数就除了 brk 之外也通过 mmap 实现）；
应用程序中的堆

2、内存描述符：

kernel 使用 mm_struct 内存描述符结构来表示进程的地址空间信息，它定义在 <linux/mm_types.h> 头文件中，这也是一个非常大的结构。

/*
 * One virtual memory area (VMA): the address range [vm_start, vm_end)
 * within the address space vm_mm. A VMA is linked into a per-task list
 * sorted by address (vm_next) and carries an rb-tree node (vm_rb).
 */
struct vm_area_struct {
	struct mm_struct * vm_mm;	/* The address space we belong to. */
	unsigned long vm_start;		/* Our start address within vm_mm. */
	unsigned long vm_end;		/* The first byte after our end address
					   within vm_mm. */

	/* linked list of VM areas per task, sorted by address */
	struct vm_area_struct *vm_next;

	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
	unsigned long vm_flags;		/* Flags, see mm.h. */

	struct rb_node vm_rb;		/* our rb-tree node; find_vma() reaches
					   the VMA from mm->mm_rb through it */

	/*
	 * For areas with an address space and backing store,
	 * linkage into the address_space->i_mmap prio tree, or
	 * linkage to the list of like vmas hanging off its node, or
	 * linkage of vma in the address_space->i_mmap_nonlinear list.
	 */
	union {
		struct {
			struct list_head list;
			void *parent;	/* aligns with prio_tree_node parent */
			struct vm_area_struct *head;
		} vm_set;

		struct raw_prio_tree_node prio_tree_node;
	} shared;

	/*
	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
	 * list, after a COW of one of the file pages.	A MAP_SHARED vma
	 * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
	 * or brk vma (with NULL file) can only be in an anon_vma list.
	 */
	struct list_head anon_vma_chain; /* Serialized by mmap_sem &
					  * page_table_lock */
	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */

	/* Function pointers to deal with this struct. */
	const struct vm_operations_struct *vm_ops;

	/* Information about our backing store: */
	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
					   units, *not* PAGE_CACHE_SIZE */
	struct file * vm_file;		/* File we map to (can be NULL). */
	void * vm_private_data;		/* was vm_pte (shared mem) */
	unsigned long vm_truncate_count;/* truncate_count or restart_addr */

#ifndef CONFIG_MMU
	struct vm_region *vm_region;	/* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
#endif
};

/*
 * Memory descriptor: the state of one address space. It holds the VMAs
 * both as a list (mmap) and under an rb-tree root (mm_rb), the page
 * directory pointer (pgd), the two reference counts (mm_users,
 * mm_count) and the locks that protect the mappings and page tables.
 */
struct mm_struct {
	struct vm_area_struct * mmap;		/* list of VMAs */
	struct rb_root mm_rb;			/* root of the VMA rb-tree walked by find_vma() */
	struct vm_area_struct * mmap_cache;	/* last find_vma result */
#ifdef CONFIG_MMU
	unsigned long (*get_unmapped_area) (struct file *filp,
				unsigned long addr, unsigned long len,
				unsigned long pgoff, unsigned long flags);
	void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
#endif
	unsigned long mmap_base;		/* base of mmap area */
	unsigned long task_size;		/* size of task vm space */
	unsigned long cached_hole_size; 	/* if non-zero, the largest hole below free_area_cache */
	unsigned long free_area_cache;		/* first hole of size cached_hole_size or larger */
	pgd_t * pgd;
	atomic_t mm_users;			/* How many users with user space? */
	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
	int map_count;				/* number of VMAs */
	struct rw_semaphore mmap_sem;
	spinlock_t page_table_lock;		/* Protects page tables and some counters */

	struct list_head mmlist;		/* List of maybe swapped mm's.	These are globally strung
						 * together off init_mm.mmlist, and are protected
						 * by mmlist_lock
						 */

	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
	unsigned long hiwater_vm;	/* High-water virtual memory usage */

	unsigned long total_vm, locked_vm, shared_vm, exec_vm;
	unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
	unsigned long start_code, end_code, start_data, end_data;
	unsigned long start_brk, brk, start_stack;
	unsigned long arg_start, arg_end, env_start, env_end;

	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

	/*
	 * Special counters, in some configurations protected by the
	 * page_table_lock, in other configurations by being atomic.
	 */
	struct mm_rss_stat rss_stat;

	struct linux_binfmt *binfmt;

	cpumask_t cpu_vm_mask;

	/* Architecture-specific MM context */
	mm_context_t context;

	/* Swap token stuff */
	/*
	 * Last value of global fault stamp as seen by this process.
	 * In other words, this value gives an indication of how long
	 * it has been since this task got the token.
	 * Look at mm/thrash.c
	 */
	unsigned int faultstamp;
	unsigned int token_priority;
	unsigned int last_interval;

	unsigned long flags; /* Must use atomic bitops to access the bits */

	struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_AIO
	spinlock_t		ioctx_lock;
	struct hlist_head	ioctx_list;
#endif
#ifdef CONFIG_MM_OWNER
	/*
	 * "owner" points to a task that is regarded as the canonical
	 * user/owner of this mm. All of the following must be true in
	 * order for it to be changed:
	 *
	 * current == mm->owner
	 * current->mm != mm
	 * new_owner->mm == mm
	 * new_owner->alloc_lock is held
	 */
	struct task_struct *owner;
#endif

#ifdef CONFIG_PROC_FS
	/* store ref to file /proc/<pid>/exe symlink points to */
	struct file *exe_file;
	unsigned long num_exe_file_vmas;
#endif
#ifdef CONFIG_MMU_NOTIFIER
	struct mmu_notifier_mm *mmu_notifier_mm;
#endif
};

结构的注释中已经包含比较多的注解了哦。mmap 为地址空间的内存区域（用 vm_area_struct 结构来表示啦，也是上面的代码中）链表，mm_rb 则将其以红黑树的形式进行存储，链表形式方便遍历，红黑树形式方便查找。mm_users 为以原子变量形式保护的使用此地址空间的进程数量值（例如：如果有 4 个线程共享此地址空间，则 mm_users 值为 4），mm_count 为引用计数（所有 mm_users 等于一个引用计数），当 mm_count 值为 0 时表示没有再被使用，可以被释放。map_count 成员表示内存区域（VMA）的数量，total_vm 成员则表示所有内存区域映射的总页数。

所有的 mm_struct 结构以链表的形式存在 mm_struct 的 mmlist 成员中，该链表的第一个成员就是 init 进程的 mm_struct ：init_mm，该链表被 mmlist_lock 锁保护。

进程的内存描述符是在 task_struct 的 mm 成员中的。fork() 进行创建进程时调用 copy_mm 函数将父进程的内存描述符拷贝给子进程，调用 clone() 函数时如果指定 CLONE_VM 参数将使父进程和子进程地址空间共享（实际上将 mm_users 计数加 1），这种子进程就被称为线程。mm_struct 结构一般是通过 allocate_mm 宏从名为 mm_cachep 的 Slab cache 中分配。

进程退出时调用 exit_mm 函数，该函数再调用 mmput() 函数，此函数中减小地址空间的 mm_users 计数，如果 mm_users 变为 0，调用 mmdrop() 函数减小 mm_count 计数，如果 mm_count 变为 0，则最终调用 free_mm() 宏来释放内存描述符（回归到 Slab cache 中）。

另外需要说明的是 kernel 线程是没有地址空间，也就没有对应的 mm_struct（值为 NULL），kernel 线程使用之前运行的进程的内存描述符，有关 kernel 线程请参考之前的 [进程基本] 文章。

3、VMA 概念：

vm_area_struct 结构即内存区域常被称为虚拟内存区域（简写为 VMA），表示的是在一个地址空间中的一个连续内存地址区间，每个内存区域是一个惟一的对象。vm_area_struct 中的 vm_mm 成员指向关联的内存描述符，vm_ops 成员为非常重要的关联的操作函数结构，vm_start 为起始地址，vm_end 为结束地址之后第一个字节的地址，即地址范围为：[vm_start, vm_end)。每个 VMA 对于它关联的内存描述符来说是惟一的，因此如果两个单独的进程映射相同的文件到各自的地址空间，它们的 VMA 也是不同的。

VMA 中的 vm_flags 表示内存区域中的页的行为状态，常见的状态有：VM_READ（页可读）、VM_WRITE（页可写）、VM_EXEC（页可被执行）、VM_SHARED（页被共享，被设置了称为共享映射，未设置称为私有映射）、VM_SHM（此区域被用作共享内存）、VM_LOCKED（页被锁）、VM_IO（此区域用于映射设备 I/O 空间）、VM_RESERVED（表示内存区域不可被交换出去）、VM_SEQ_READ（连续读，增强 readahead）、VM_RAND_READ（随机读，减弱 readahead）等。VM_SEQ_READ 和 VM_RAND_READ 标志可以通过 madvise() 系统调用来设置。

看看 vm_ops 操作函数结构的 vm_operations_struct 的定义，它在 <linux/mm.h> 头文件中：

/*
 * Callback table attached to a VMA through vm_area_struct->vm_ops.
 */
struct vm_operations_struct {
	/* called when the area is added to / removed from an address space */
	void (*open)(struct vm_area_struct * area);
	void (*close)(struct vm_area_struct * area);
	/* called by the page-fault handler for a page that is not in memory */
	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);

	/* notification that a previously read-only page is about to become
	 * writable, if an error is returned it will cause a SIGBUS */
	int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);

	/* called by access_process_vm when get_user_pages() fails, typically
	 * for use by special VMAs that can switch between memory and hardware
	 */
	int (*access)(struct vm_area_struct *vma, unsigned long addr,
		      void *buf, int len, int write);
#ifdef CONFIG_NUMA
	/*
	 * set_policy() op must add a reference to any non-NULL @new mempolicy
	 * to hold the policy upon return.  Caller should pass NULL @new to
	 * remove a policy and fall back to surrounding context--i.e. do not
	 * install a MPOL_DEFAULT policy, nor the task or system default
	 * mempolicy.
	 */
	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);

	/*
	 * get_policy() op must add reference [mpol_get()] to any policy at
	 * (vma,addr) marked as MPOL_SHARED.  The shared policy infrastructure
	 * in mm/mempolicy.c will do this automatically.
	 * get_policy() must NOT add a ref if the policy at (vma,addr) is not
	 * marked as MPOL_SHARED. vma policies are protected by the mmap_sem.
	 * If no [shared/vma] mempolicy exists at the addr, get_policy() op
	 * must return NULL--i.e., do not "fallback" to task or system default
	 * policy.
	 */
	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
					unsigned long addr);
	int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from,
		const nodemask_t *to, unsigned long flags);
#endif
};

当指定的内存区域被添加到地址空间时，open 函数被调用，反之移除时 close 函数被调用。如果一个不在内存中的页被访问，将触发缺页异常， fault 函数被缺页异常处理函数调用。当一个只读的页变为可写的时候，page_mkwrite 函数也被缺页异常处理函数调用。

mm_struct 中的 mmap 为内存区域链表，通过 VMA 的 vm_next 成员指向下一个内存区域，而且链表中的内存区域是按地址上升排序的，链表中最后一个 VMA 的 vm_next 值为 NULL。而对于 mm_struct 的 mm_rb 红黑树，mm_rb 为红黑树的根，每个 VMA 通过其 vm_rb 红黑树节点类型链到红黑树中。

在应用层中可以通过 cat /proc/<pid>/maps 或者 pmap 程序等方法查看应用程序的内存区域列表。

操作 VMA：

kernel 提供 find_vma() 函数用于查找指定的内存地址在哪个 VMA 上，它的实现在 mm/mmap.c 文件中，输入参数为内存描述符和内存地址：

/*
 * find_vma - look up the first VMA that satisfies addr < vm_end
 * @mm:   address space to search; a NULL @mm yields NULL
 * @addr: address to look up
 *
 * Returns the matching VMA, or NULL if no VMA ends above @addr. The
 * returned VMA does not necessarily contain @addr: its vm_start may be
 * greater than @addr. A VMA found by the tree walk is stored in
 * mm->mmap_cache so that a following lookup can be answered without
 * touching the tree.
 */
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma = NULL;

	if (mm) {
		/* Check the cache first. */
		/* (Cache hit rate is typically around 35%.) */
		vma = mm->mmap_cache;
		/* The cached VMA is only used when it actually contains addr. */
		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
			struct rb_node * rb_node;

			rb_node = mm->mm_rb.rb_node;
			vma = NULL;

			while (rb_node) {
				struct vm_area_struct * vma_tmp;

				vma_tmp = rb_entry(rb_node,
						struct vm_area_struct, vm_rb);

				if (vma_tmp->vm_end > addr) {
					/* Candidate: remember it, stop if it
					 * contains addr, else keep looking
					 * in the left subtree. */
					vma = vma_tmp;
					if (vma_tmp->vm_start <= addr)
						break;
					rb_node = rb_node->rb_left;
				} else
					rb_node = rb_node->rb_right;
			}
			if (vma)
				mm->mmap_cache = vma;
		}
	}
	return vma;
}

如果找不到对应的 VMA 则返回 NULL。需要注意的是返回的 VMA 的开始地址可能比指定的内存地址大。find_vma() 函数返回的结果会被缓存到内存描述符的 mmap_cache 成员中用于提高之后的查找性能，因为后续的操作很可能还是在同样的 VMA 上。如果在 mmap_cache 中找不到则通过红黑树进行查找。

find_vma_prev() 函数与 find_vma() 函数类似，不过它也会返回指定地址之前的最后一个 VMA：

struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)

kernel 另外还提供了 find_vma_intersection() 函数返回符合 find_vma() 的条件并且其开始地址不在指定内存结束地址之后的 VMA。

4、mmap 和 munmap：

kernel 提供 do_mmap() 函数创建新的线性地址区间，这是用户层 mmap() 函数的底层实现，它用于将一段地址区间添加到进程的地址空间中。

unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flag, unsigned long offset)

do_mmap 映射 file 参数指定的文件，并最终返回新创建的地址区间的初始地址。

offset 和 len 指定偏移量和长度。如果 file 为 NULL 并且 offset 为 0 则表示该映射后端不是基于文件的，这种映射被称为匿名映射，否则被称为基于文件的映射。prot 参数指定内存区域中页的访问权限，值可以为：PROT_READ（对应 VM_READ）、PROT_WRITE、PROT_EXEC、PROT_NONE 等。flag 指定 VMA 的其它标志，常用的有：MAP_SHARED（此映射可被共享）、MAP_PRIVATE（私有不可共享）、MAP_ANONYMOUS（指定匿名映射）、MAP_LOCKED 等。

如果可能的话，do_mmap 返回的内存区间会尽量和已有邻近的 VMA 合并（调整 VMA 大小），否则就创建一个新的 VMA。新的 VMA 从名为 vm_area_cachep 的 Slab cache 中分配，并通过 vma_link() 函数被加入到进程地址空间的链表和红黑树中，对应的 mm_struct 的 total_vm 成员也被更新。

do_mmap 是调用 do_mmap_pgoff() 函数完成真正的映射操作的。现在用户层使用的 mmap() 函数实际上是在用户层调用 mmap2() 系统调用并最终通过 do_mmap 来实现的。

do_munmap 用于从地址空间移除指定的地址区间：

int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)

do_munmap 导出给用户层就是 munmap() 函数了。

5、页表及应用程序 VMA：

Linux kernel 使用页式内存管理，应用程序给出的内存地址是虚拟地址，它需要经过若干级页表一级一级的变换，才变成真正的物理地址。有关 Linux 的分级页表结构等相关的知识请参考之前的 [内存寻址] 文章。

每个进程有自己的 task_struct，task_struct 中的 mm 指向其内存描述符，每个 mm 又有自己单独的页表（进程中的线程会进行共享），本文最上面介绍的内存描述符 mm_struct 中的 pgd_t * pgd 就指向进程的 PGD，对页表的操作和遍历等操作也需要用到 mm_struct 中的 page_table_lock 自旋锁成员。

应用程序中对内存的操作例如 malloc 分配内存等一般是改变了某个 VMA，不会直接改变页表。假设用户分配了内存，然后访问这块内存，由于页表里面并没有记录相关的映射，CPU 产生一次缺页异常，内核捕捉到异常，检查产生异常的地址是不是存在于一个合法的 VMA 中，如果不是，则给进程一个 "Segmentation Fault" 段错误，使其崩溃；如果是，则分配一个物理页，并为之建立映射。

应用程序中的堆是一个一端固定、一端可伸缩的 VMA，其大小可以通过 brk 系统调用进行调整，libc 的 malloc 函数就是基于 brk 来实现的（如果需要分配的内存很大时，libc 会通过 mmap 系统调用映射一个新的 VMA 以节省对堆 VMA 的一系列调整操作）。应用程序的栈也是一个 VMA，只是它是一端固定、一端可伸不能缩的，而且它是自动伸展的。另外需要说明的是线程的栈 VMA 明显不是和其它线程共享的，一般是在线程创建时通过 mmap 创建新的 VMA 并以此作为线程的栈。

本文只是对 Linux kernel 的进程地址空间的基础涉及，其中有任何问题，欢迎提出指正哦，玩的开心~~~ ^_^

Linux kernel学习-block层

Uranus Zhou — Fri, 06 Jul 2012 01:14:11 +0000

本文同步自（如浏览不正常请点击跳转）：https://zohead.com/archives/linux-kernel-learning-block-layer/

Linux 内核中的 block I/O 层又是非常重要的一个概念，它相对字符设备的实现来说复杂很多，而且在现今应用中，block 层可以说是随处可见，下面分别介绍 kernel block I/O 层的一些知识，你需要对块设备、字符设备的区别清楚，而且对 kernel 基础有一些了解哦。

1、buffer_head 的概念：

buffer_head 是 block 层中一个常见的数据结构（当然和下面的 bio 之类的结构相比就差多了哦，HOHO）。

当块设备中的一个块（一般为扇区大小的整数倍，并不超过一个内存 page 的大小）通过读写等方式存放在内存中，一般被称为存在 buffer 中，每个 buffer 和一个块相关联，它就表示在内存中的磁盘块。kernel 因此需要有相关的控制信息来表示块数据，每个块与一个描述符相关联，这个描述符就被称为 buffer head，并用 struct buffer_head 来表示，其定义在 <linux/buffer_head.h> 头文件中。

/*
 * Bit numbers for the buffer_head b_state bitmap. BH_PrivateStart is
 * not a state itself; it marks the first bit left for private use.
 */
enum bh_state_bits {
	BH_Uptodate,	/* Contains valid data */
	BH_Dirty,	/* Is dirty */
	BH_Lock,	/* Is locked */
	BH_Req,		/* Has been submitted for I/O */
	BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise
			  * IO completion of other buffers in the page
			  */

	BH_Mapped,	/* Has a disk mapping */
	BH_New,		/* Disk mapping was newly created by get_block */
	BH_Async_Read,	/* Is under end_buffer_async_read I/O */
	BH_Async_Write,	/* Is under end_buffer_async_write I/O */
	BH_Delay,	/* Buffer is not yet allocated on disk */
	BH_Boundary,	/* Block is followed by a discontiguity */
	BH_Write_EIO,	/* I/O error on write */
	BH_Ordered,	/* ordered write */
	BH_Eopnotsupp,	/* operation not supported (barrier) */
	BH_Unwritten,	/* Buffer is allocated on disk but not written */
	BH_Quiet,	/* Buffer error printks to be quiet */

	BH_PrivateStart,/* not a state bit, but the first bit available
			 * for private allocation by other entities
			 */
};

/*
 * Descriptor for one block held in memory: it ties a block number on a
 * block device (b_bdev, b_blocknr) to b_size bytes at b_data inside
 * b_page, together with the state bitmap and a usage count.
 */
struct buffer_head {
	unsigned long b_state;		/* buffer state bitmap (see above) */
	struct buffer_head *b_this_page;/* circular list of page's buffers */
	struct page *b_page;		/* the page this bh is mapped to */

	sector_t b_blocknr;		/* start block number */
	size_t b_size;			/* size of mapping */
	char *b_data;			/* pointer to data within the page */

	struct block_device *b_bdev;	/* block device the block belongs to */
	bh_end_io_t *b_end_io;		/* I/O completion */
 	void *b_private;		/* reserved for b_end_io */
	struct list_head b_assoc_buffers; /* associated with another mapping */
	struct address_space *b_assoc_map;	/* mapping this buffer is
						   associated with */
	atomic_t b_count;		/* users using this buffer_head */
};

b_state 字段说明这段 buffer 的状态，它是一个位图，可以同时设置 bh_state_bits 枚举（也在上面的代码中，注释说明状态，应该比较好明白哦）中的一个或多个状态位。b_count 为 buffer 的引用计数，它通过 get_bh、put_bh 函数进行原子性的增加和减小，需要操作 buffer_head 时调用 get_bh，完成之后调用 put_bh。b_bdev 表示关联的块设备，下面会单独介绍 block_device 结构，b_blocknr 表示在 b_bdev 块设备上 buffer 所关联的块的起始块号。b_page 指向的内存页即为 buffer 所映射的页。b_data 为指向块的指针（在 b_page 中），并且长度为 b_size。

在 Linux 2.6 版本以前，buffer_head 是 kernel 中非常重要的数据结构，它曾经是 kernel 中 I/O 的基本单位（现在已经是 bio 结构），它曾被用于为一个块映射一个页，它被用于描述磁盘块到物理页的映射关系，所有的 block I/O 操作也包含在 buffer_head 中。但是这样也会引起比较大的问题：buffer_head 结构过大（现在已经缩减了很多），用 buffer head 来操作 I/O 数据太复杂，kernel 更喜欢根据 page 来工作（这样性能也更好）；另一个问题是一个大的 buffer_head 常被用来描述单独的 buffer，而且 buffer 还很可能比一个页还小，这样就会造成效率低下；第三个问题是 buffer_head 只能描述一个 buffer，这样大块的 I/O 操作常被分散为很多个 buffer_head，这样会增加额外占用的空间。因此 2.6 开始的 kernel （实际 2.5 测试版的 kernel 中已经开始引入）使用 bio 结构直接处理 page 和地址空间，而不是 buffer。

2、bio：

说了一堆 buffer_head 的坏话，现在来看看它的替代者：bio，它倾向于为 I/O 请求提供一个轻量级的表示方法，它定义在 <linux/bio.h> 头文件中。

/*
 * Describes one block I/O operation: an array of segments (bi_io_vec,
 * bi_vcnt entries, bi_idx marking the current one) directed at sector
 * bi_sector of bi_bdev, plus a pin count and a completion hook.
 */
struct bio {
	sector_t		bi_sector;	/* device address in 512 byte
						   sectors */
	struct bio		*bi_next;	/* request queue link */
	struct block_device	*bi_bdev;	/* block device the I/O is for */
	unsigned long		bi_flags;	/* status, command, etc */
	unsigned long		bi_rw;		/* bottom bits READ/WRITE,
						 * top bits priority
						 */

	unsigned short		bi_vcnt;	/* how many bio_vec's */
	unsigned short		bi_idx;		/* current index into bvl_vec */

	/* Number of segments in this BIO after
	 * physical address coalescing is performed.
	 */
	unsigned int		bi_phys_segments;

	unsigned int		bi_size;	/* residual I/O count */

	/*
	 * To keep track of the max segment size, we account for the
	 * sizes of the first and last mergeable segments in this bio.
	 */
	unsigned int		bi_seg_front_size;
	unsigned int		bi_seg_back_size;

	unsigned int		bi_max_vecs;	/* max bvl_vecs we can hold */

	unsigned int		bi_comp_cpu;	/* completion CPU */

	atomic_t		bi_cnt;		/* pin count */

	struct bio_vec		*bi_io_vec;	/* the actual vec list */

	bio_end_io_t		*bi_end_io;	/* completion callback */

	void			*bi_private;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
	struct bio_integrity_payload *bi_integrity;  /* data integrity */
#endif

	bio_destructor_t	*bi_destructor;	/* destructor */

	/*
	 * We can inline a number of vecs at the end of the bio, to avoid
	 * double allocations for a small number of bio_vecs. This member
	 * MUST obviously be kept at the very end of the bio.
	 */
	struct bio_vec		bi_inline_vecs[0];
};

/* One segment of a bio: a span of bv_len starting at bv_offset in bv_page. */
struct bio_vec {
	struct page	*bv_page;	/* page that holds the segment */
	unsigned int	bv_len;		/* length of the segment */
	unsigned int	bv_offset;	/* offset of the segment within bv_page */
};

该定义中已经有详细的注释了哦。bi_sector 为以 512 字节为单位的扇区地址（即使物理设备的扇区大小不是 512 字节，bi_sector 也以 512 字节为单位）。bi_bdev 为关联的块设备。bi_rw 表示为读请求还是写请求。bi_cnt 为引用计数，通过 bio_get、bio_put 宏可以对 bi_cnt 进行增加和减小操作。当 bi_cnt 值为 0 时，bio 结构就被销毁并且后端的内存也被释放。

I/O 向量：

bio 结构中最重要的是 bi_vcnt、bi_idx、bi_io_vec 等成员，bi_vcnt 为 bi_io_vec 所指向的 bio_vec 类型列表个数，bi_io_vec 表示指定的 block I/O 操作中的单独的段（如果你用过 readv 和 writev 函数那应该对这个比较熟悉），bi_idx 为当前在 bi_io_vec 数组中的索引，随着 block I/O 操作的进行，bi_idx 值被不断更新，kernel 提供 bio_for_each_segment 宏用于遍历 bio 中的 bio_vec。另外 kernel 中的 MD 软件 RAID 驱动也会使用 bi_idx 值来将一个 bio 请求分发到不同的磁盘设备上进行处理。

bio_vec 的定义也在上面的代码中，同样在 <linux/bio.h> 头文件中，每个 bio_vec 类型指向对应的 page，bv_page 表示它所在的页，bv_offset 为块相对于 page 的偏移量，bv_len 即为块的长度。

buffer_head 和 bio 总结：

因此也可以看出 block I/O 请求是以 I/O 向量的形式进行提交和处理的。

bio 相对 buffer_head 的好处有：bio 可以更方便的使用高端内存，因为它只与 page 打交道，并不直接使用地址。bio 可以表示 direct I/O（不经过 page cache，后面再详细描述）。对向量形式的 I/O（包括 sg I/O）支持更好，防止 I/O 被打散。但是 buffer_head 还是需要的，它用于映射磁盘块到内存，因为 bio 中并没有包含 kernel 需要的 buffer 状态的成员以及一些其它信息。

3、请求队列：

块设备使用请求队列来保存等待中的 block I/O 请求，其使用 request_queue 结构来表示，定义在 <linux/blkdev.h> 头文件中，此头文件中还包含了非常重要的 request 结构：

/*
 * One entry on a request_queue. A request carries a chain of bios
 * (bio through biotail, linked by bi_next) together with the state
 * used by the I/O scheduler and by the low-level driver.
 */
struct request {
	struct list_head queuelist;
	struct call_single_data csd;

	struct request_queue *q;

	unsigned int cmd_flags;
	enum rq_cmd_type_bits cmd_type;
	unsigned long atomic_flags;

	int cpu;

	/* the following two fields are internal, NEVER access directly */
	unsigned int __data_len;	/* total data len */
	sector_t __sector;		/* sector cursor */

	struct bio *bio;	/* head of the bio chain; front merges prepend here */
	struct bio *biotail;	/* tail of the bio chain; back merges append here */

	struct hlist_node hash;	/* merge hash */
	/*
	 * The rb_node is only used inside the io scheduler, requests
	 * are pruned when moved to the dispatch queue. So let the
	 * completion_data share space with the rb_node.
	 */
	union {
		struct rb_node rb_node;	/* sort/lookup */
		void *completion_data;
	};

	/*
	 * two pointers are available for the IO schedulers, if they need
	 * more they have to dynamically allocate it.
	 */
	void *elevator_private;
	void *elevator_private2;

	struct gendisk *rq_disk;
	unsigned long start_time;

	/* Number of scatter-gather DMA addr+len pairs after
	 * physical address coalescing is performed.
	 */
	unsigned short nr_phys_segments;

	unsigned short ioprio;

	int ref_count;

	void *special;		/* opaque pointer available for LLD use */
	char *buffer;		/* kaddr of the current segment if available */

	int tag;
	int errors;

	/*
	 * when request is used as a packet command carrier
	 */
	unsigned char __cmd[BLK_MAX_CDB];
	unsigned char *cmd;
	unsigned short cmd_len;

	unsigned int extra_len;	/* length of alignment and padding */
	unsigned int sense_len;
	unsigned int resid_len;	/* residual count */
	void *sense;

	unsigned long deadline;
	struct list_head timeout_list;
	unsigned int timeout;
	int retries;

	/*
	 * completion callback.
	 */
	rq_end_io_fn *end_io;
	void *end_io_data;

	/* for bidi */
	struct request *next_rq;
};

/*
 * Queue of pending requests for a block device, together with the
 * elevator (I/O scheduler) attached to it, the callbacks used to
 * submit and process I/O, and the plug/unplug and limit settings.
 */
struct request_queue
{
	/*
	 * Together with queue_head for cacheline sharing
	 */
	struct list_head	queue_head;
	struct request		*last_merge;
	struct elevator_queue	*elevator;	/* elevator attached to this queue */

	/*
	 * the queue request freelist, one for reads and one for writes
	 */
	struct request_list	rq;

	request_fn_proc		*request_fn;
	make_request_fn		*make_request_fn;	/* called per bio by __generic_make_request() */
	prep_rq_fn		*prep_rq_fn;
	unplug_fn		*unplug_fn;
	merge_bvec_fn		*merge_bvec_fn;
	prepare_flush_fn	*prepare_flush_fn;
	softirq_done_fn		*softirq_done_fn;
	rq_timed_out_fn		*rq_timed_out_fn;
	dma_drain_needed_fn	*dma_drain_needed;
	lld_busy_fn		*lld_busy_fn;

	/*
	 * Dispatch queue sorting
	 */
	sector_t		end_sector;
	struct request		*boundary_rq;

	/*
	 * Auto-unplugging state
	 */
	struct timer_list	unplug_timer;
	int			unplug_thresh;	/* After this many requests */
	unsigned long		unplug_delay;	/* After this many jiffies */
	struct work_struct	unplug_work;

	struct backing_dev_info	backing_dev_info;

	/*
	 * The queue owner gets to use this for whatever they like.
	 * ll_rw_blk doesn't touch it.
	 */
	void			*queuedata;

	/*
	 * queue needs bounce pages for pages above this limit
	 */
	gfp_t			bounce_gfp;

	/*
	 * various queue flags, see QUEUE_* below
	 */
	unsigned long		queue_flags;

	/*
	 * protects queue structures from reentrancy. ->__queue_lock should
	 * _never_ be used directly, it is queue private. always use
	 * ->queue_lock.
	 */
	spinlock_t		__queue_lock;
	spinlock_t		*queue_lock;

	/*
	 * queue kobject
	 */
	struct kobject kobj;

	/*
	 * queue settings
	 */
	unsigned long		nr_requests;	/* Max # of requests */
	unsigned int		nr_congestion_on;
	unsigned int		nr_congestion_off;
	unsigned int		nr_batching;

	void			*dma_drain_buffer;
	unsigned int		dma_drain_size;
	unsigned int		dma_pad_mask;
	unsigned int		dma_alignment;

	struct blk_queue_tag	*queue_tags;
	struct list_head	tag_busy_list;

	unsigned int		nr_sorted;
	unsigned int		in_flight[2];

	unsigned int		rq_timeout;
	struct timer_list	timeout;
	struct list_head	timeout_list;

	struct queue_limits	limits;

	/*
	 * sg stuff
	 */
	unsigned int		sg_timeout;
	unsigned int		sg_reserved_size;
	int			node;
#ifdef CONFIG_BLK_DEV_IO_TRACE
	struct blk_trace	*blk_trace;
#endif
	/*
	 * reserved for flush operations
	 */
	unsigned int		ordered, next_ordered, ordseq;
	int			orderr, ordcolor;
	struct request		pre_flush_rq, bar_rq, post_flush_rq;
	struct request		*orig_bar_rq;

	struct mutex		sysfs_lock;

#if defined(CONFIG_BLK_DEV_BSG)
	struct bsg_class_device bsg_dev;
#endif
};

request_queue 中的很多成员和 I/O 调度器、request、bio 等息息相关。request_queue 中的 queue_head 成员为请求的双向链表。nr_requests 为队列允许的最大请求数量。I/O 请求被文件系统等上层的代码加入到队列中（需要经过 I/O 调度器，下面会介绍），只要队列不为空，block 设备驱动程序就需要从队列中抓取请求并提交到对应的块设备中。这个队列中的就是单独的请求，以 request 结构来表示。

每个 request 结构又可以由多个 bio 组成，一个 request 中放着顺序排列的 bio（请求在多个连续的磁盘块上）。

实际上在 request_queue 中，只有当请求队列有一定数目的请求时，I/O 调度算法才能发挥作用，否则极端情况下它将退化成 “先来先服务算法”，这就悲催了。通过对 request_queue 进行 plug 操作相当于停用，unplug 相当于恢复。请求少时将request_queue 停用，当请求达到一定数目，或者 request_queue 里最 “老” 的请求已经等待一段时间了才将 request_queue 恢复，这些见 request_queue 中的 unplug_fn、unplug_timer、unplug_thresh、unplug_delay 等成员。

4、I/O 调度器：

I/O 调度器也是 block 层的大头，它肩负着非常重要的使命。由于现在的机械硬盘设备的寻道是非常慢的（常常是毫秒级），因此尽可能的减少寻道操作是提高性能的关键所在。一般 I/O 调度器要做的事情就是在完成现有请求的前提下，让磁头尽可能少移动，从而提高磁盘的读写效率。最有名的就是 “电梯算法” 了。

由于 I/O 调度器的存在，kernel 并不会按实际收到的顺序将请求发到底层设备上，而是经过了合并（减少请求数量和寻道，如果无法合并将请求放在队列尾部）和排序处理（类似电梯的减少往返寻道的处理，也是 I/O 调度器被称为 elevators 的原因）。I/O 调度器就是来管理块设备的请求队列的，它来决定队列中请求的顺序，以及每个请求什么时候到派遣到块设备上。

现在的 Linux kernel 中已经有几种好用的 I/O 调度器，常见的包括 Linus（2.4 版本中的调度器）、cfq（很多发行版中的默认调度器）、deadline、noop、anticipatory（相对 deadline 的优化）等。

Linus 调度器同时实现了合并和排序处理，而且是 front merging（新请求在当前的前面）和 back merging（新请求在当前的后面，当然比 front merging 常见）都支持的，并且有一定的请求时限处理。

deadline 调度器主要解决 Linus 调度器导致的请求饥饿问题（不能及时有效的被处理），deadline 调度器保证请求的开始服务时间。另外 deadline 解决了写请求（一般为异步处理）使读请求（一般为同步处理）不能被及时处理的问题，也就是解决读延迟。

noop 调度器几乎保持原始请求顺序不变（仍然有合并），而 cfq 则提供类似完全公平的调度策略。

总之不同的 I/O 调度器通常是对于特定类型的请求进行优化的，有关这些调度器的具体实现，之后将专门写文章来介绍它们，这里就不再赘述咯。

I/O 调度的一些数据结构声明在 <linux/elevator.h> 头文件中，比较重要的包括 elevator_ops、elevator_type 以及 elevator_queue 等。elevator_ops 中定义了 I/O 调度算法的各种操作函数接口。

/*
 * Table of callbacks implemented by an I/O scheduler: merge decisions,
 * adding/dispatching requests, per-request setup/teardown and
 * scheduler init/exit.
 */
struct elevator_ops
{
	/* merging a bio or a request into an existing request */
	elevator_merge_fn *elevator_merge_fn;
	elevator_merged_fn *elevator_merged_fn;
	elevator_merge_req_fn *elevator_merge_req_fn;
	elevator_allow_merge_fn *elevator_allow_merge_fn;

	elevator_dispatch_fn *elevator_dispatch_fn;
	elevator_add_req_fn *elevator_add_req_fn;
	elevator_activate_req_fn *elevator_activate_req_fn;
	elevator_deactivate_req_fn *elevator_deactivate_req_fn;

	elevator_queue_empty_fn *elevator_queue_empty_fn;
	elevator_completed_req_fn *elevator_completed_req_fn;

	elevator_request_list_fn *elevator_former_req_fn;
	elevator_request_list_fn *elevator_latter_req_fn;

	elevator_set_req_fn *elevator_set_req_fn;
	elevator_put_req_fn *elevator_put_req_fn;

	elevator_may_queue_fn *elevator_may_queue_fn;

	elevator_init_fn *elevator_init_fn;
	elevator_exit_fn *elevator_exit_fn;
	void (*trim)(struct io_context *);
};

/*
 * Describes one I/O scheduler: its callback table (ops), attributes,
 * name and owning module. Instances can be chained through list.
 */
struct elevator_type
{
	struct list_head list;
	struct elevator_ops ops;
	struct elv_fs_entry *elevator_attrs;
	char elevator_name[ELV_NAME_MAX];
	struct module *elevator_owner;
};

/*
 * Elevator state referenced from request_queue->elevator: the ops in
 * use, the elevator_type they come from, and an opaque elevator_data
 * pointer.
 */
struct elevator_queue
{
	struct elevator_ops *ops;
	void *elevator_data;
	struct kobject kobj;
	struct elevator_type *elevator_type;
	struct mutex sysfs_lock;
	struct hlist_head *hash;
};

elevator_type 用来描述不同的 I/O 调度器，你可以在 request_queue 的声明中看到 elevator_queue 的身影。

5、块设备请求处理：

当需要发起块设备读写请求时，kernel 首先根据需求构造 bio 结构（毕竟是 I/O 请求单位哦），其中包含了读写的地址、长度、设备、回调函数等信息，然后 kernel 通过 submit_bio 函数将请求转发给块设备，看看 submit_bio 的实现（在 block/blk-core.c 中，下面几个非常重要的函数也在这个关键的 block 层实现文件中）：

/*
 * submit_bio - hand a bio to the block layer
 * @rw:  flags OR-ed into bio->bi_rw; tested here against WRITE
 * @bio: the I/O to submit
 *
 * For a bio that carries data this updates the PGPGOUT/PGPGIN VM event
 * counters (and the per-task read accounting for reads), logs the
 * access with printk when block_dump is set, and then passes the bio
 * on to generic_make_request().
 */
void submit_bio(int rw, struct bio *bio)
{
	int count = bio_sectors(bio);

	bio->bi_rw |= rw;

	/*
	 * If it's a regular read/write or a barrier with data attached,
	 * go through the normal accounting stuff before submission.
	 */
	if (bio_has_data(bio)) {
		if (rw & WRITE) {
			count_vm_events(PGPGOUT, count);
		} else {
			task_io_account_read(bio->bi_size);
			count_vm_events(PGPGIN, count);
		}

		/* Optional debug trace: who accessed which block on which device. */
		if (unlikely(block_dump)) {
			char b[BDEVNAME_SIZE];
			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
			current->comm, task_pid_nr(current),
				(rw & WRITE) ? "WRITE" : "READ",
				(unsigned long long)bio->bi_sector,
				bdevname(bio->bi_bdev, b));
		}
	}

	generic_make_request(bio);
}

submit_bio 的输入参数为 bio 结构，submit_bio 最终会调用 generic_make_request 函数不断转发 bio 请求：

/*
 * Validate @bio and pass it to the make_request_fn of the queue behind
 * bio->bi_bdev. The loop repeats while make_request_fn returns
 * non-zero, looking up the queue from bio->bi_bdev again on every
 * pass. Any failed check ends the bio through bio_endio() with err,
 * which is -EIO unless a check sets another value.
 */
static inline void __generic_make_request(struct bio *bio)
{
	struct request_queue *q;
	sector_t old_sector;
	int ret, nr_sectors = bio_sectors(bio);
	dev_t old_dev;
	int err = -EIO;

	might_sleep();

	if (bio_check_eod(bio, nr_sectors))
		goto end_io;

	/*
	 * Resolve the mapping until finished. (drivers are
	 * still free to implement/resolve their own stacking
	 * by explicitly returning 0)
	 *
	 * NOTE: we don't repeat the blk_size check for each new device.
	 * Stacking drivers are expected to know what they are doing.
	 */
	old_sector = -1;
	old_dev = 0;
	do {
		char b[BDEVNAME_SIZE];

		q = bdev_get_queue(bio->bi_bdev);
		if (unlikely(!q)) {
			printk(KERN_ERR
			       "generic_make_request: Trying to access "
				"nonexistent block-device %s (%Lu)\n",
				bdevname(bio->bi_bdev, b),
				(long long) bio->bi_sector);
			goto end_io;
		}

		/* Non-discard bios must fit the queue's max_hw_sectors limit. */
		if (unlikely(!bio_rw_flagged(bio, BIO_RW_DISCARD) &&
			     nr_sectors > queue_max_hw_sectors(q))) {
			printk(KERN_ERR "bio too big device %s (%u > %u)\n",
			       bdevname(bio->bi_bdev, b),
			       bio_sectors(bio),
			       queue_max_hw_sectors(q));
			goto end_io;
		}

		if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
			goto end_io;

		if (should_fail_request(bio))
			goto end_io;

		/*
		 * If this device has partitions, remap block n
		 * of partition p to block n+start(p) of the disk.
		 */
		blk_partition_remap(bio);

		if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
			goto end_io;

		/* old_sector is -1 only on the first pass, so the remap
		 * trace fires from the second pass onwards. */
		if (old_sector != -1)
			trace_block_remap(q, bio, old_dev, old_sector);

		old_sector = bio->bi_sector;
		old_dev = bio->bi_bdev->bd_dev;

		if (bio_check_eod(bio, nr_sectors))
			goto end_io;

		if (bio_rw_flagged(bio, BIO_RW_DISCARD) &&
		    !blk_queue_discard(q)) {
			err = -EOPNOTSUPP;
			goto end_io;
		}

		trace_block_bio_queue(q, bio);

		ret = q->make_request_fn(q, bio);
	} while (ret);

	return;

end_io:
	bio_endio(bio, err);
}

/*
 * generic_make_request - submit a bio, turning nested submissions into
 * iteration
 * @bio: the bio to submit; bio->bi_next must be NULL (BUG_ON otherwise
 *       when this is the outermost call)
 *
 * If current->bio_list is already set, a submission is in progress on
 * this task, so the bio is only appended to that list and the function
 * returns. Otherwise an on-stack list is installed in
 * current->bio_list and bios are processed one at a time through
 * __generic_make_request() until the list is empty.
 */
void generic_make_request(struct bio *bio)
{
	struct bio_list bio_list_on_stack;

	if (current->bio_list) {
		/* make_request is active */
		bio_list_add(current->bio_list, bio);
		return;
	}
	/* following loop may be a bit non-obvious, and so deserves some
	 * explanation.
	 * Before entering the loop, bio->bi_next is NULL (as all callers
	 * ensure that) so we have a list with a single bio.
	 * We pretend that we have just taken it off a longer list, so
	 * we assign bio_list to a pointer to the bio_list_on_stack,
	 * thus initialising the bio_list of new bios to be
	 * added.  __generic_make_request may indeed add some more bios
	 * through a recursive call to generic_make_request.  If it
	 * did, we find a non-NULL value in bio_list and re-enter the loop
	 * from the top.  In this case we really did just take the bio
	 * of the top of the list (no pretending) and so remove it from
	 * bio_list, and call into __generic_make_request again.
	 *
	 * The loop was structured like this to make only one call to
	 * __generic_make_request (which is important as it is large and
	 * inlined) and to keep the structure simple.
	 */
	BUG_ON(bio->bi_next);
	bio_list_init(&bio_list_on_stack);
	current->bio_list = &bio_list_on_stack;
	do {
		__generic_make_request(bio);
		bio = bio_list_pop(current->bio_list);
	} while (bio);
	current->bio_list = NULL; /* deactivate */
}

generic_make_request 中获取 bio 指向的块设备的请求队列，并循环通过 __generic_make_request 调用请求队列的 make_request_fn 方法（见 request_queue 的声明，里面定义了一系列的函数指针）来下发 bio。

普通的块设备处理中一般会将 __make_request 函数注册到请求队列的 make_request_fn 函数指针上。另外设备驱动程序也可以注册自己的 I/O 提交等函数，这样可以绕过 Linux 默认提供的 I/O 协议栈，不走标准的 I/O 请求队列，由驱动程序自己来处理，有很多 nvram、SSD 卡等的驱动程序会为了提高性能而做这样的处理。

来看看 __make_request 的处理：

/*
 * __make_request - make_request_fn that queues a bio on @q
 * @q:   request queue to add to
 * @bio: bio to merge or to wrap in a new request
 *
 * First asks the elevator whether @bio can be merged at the back or
 * the front of an existing request. If not (or if the bio is a barrier
 * or the elevator queue is empty), a new request is obtained with
 * get_request_wait(), initialised from the bio and added to the queue.
 * Returns 0 on every path; an unsupported barrier is completed with
 * -EOPNOTSUPP before returning.
 */
static int __make_request(struct request_queue *q, struct bio *bio)
{
	struct request *req;
	int el_ret;
	unsigned int bytes = bio->bi_size;
	const unsigned short prio = bio_prio(bio);
	const bool sync = bio_rw_flagged(bio, BIO_RW_SYNCIO);
	const bool unplug = bio_rw_flagged(bio, BIO_RW_UNPLUG);
	const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
	int rw_flags;

	if (bio_rw_flagged(bio, BIO_RW_BARRIER) &&
	    (q->next_ordered == QUEUE_ORDERED_NONE)) {
		bio_endio(bio, -EOPNOTSUPP);
		return 0;
	}
	/*
	 * low level driver can indicate that it wants pages above a
	 * certain limit bounced to low memory (ie for highmem, or even
	 * ISA dma in theory)
	 */
	blk_queue_bounce(q, &bio);

	spin_lock_irq(q->queue_lock);

	/* Barrier bios and an empty elevator queue skip the merge attempt. */
	if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER)) || elv_queue_empty(q))
		goto get_rq;

	el_ret = elv_merge(q, &req, bio);
	switch (el_ret) {
	case ELEVATOR_BACK_MERGE:
		BUG_ON(!rq_mergeable(req));

		/* Merge refused: leave the switch and allocate a new request. */
		if (!ll_back_merge_fn(q, req, bio))
			break;

		trace_block_bio_backmerge(q, bio);

		if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
			blk_rq_set_mixed_merge(req);

		/* Append the bio to the tail of the request's bio chain. */
		req->biotail->bi_next = bio;
		req->biotail = bio;
		req->__data_len += bytes;
		req->ioprio = ioprio_best(req->ioprio, prio);
		if (!blk_rq_cpu_valid(req))
			req->cpu = bio->bi_comp_cpu;
		drive_stat_acct(req, 0);
		if (!attempt_back_merge(q, req))
			elv_merged_request(q, req, el_ret);
		goto out;

	case ELEVATOR_FRONT_MERGE:
		BUG_ON(!rq_mergeable(req));

		/* Merge refused: leave the switch and allocate a new request. */
		if (!ll_front_merge_fn(q, req, bio))
			break;

		trace_block_bio_frontmerge(q, bio);

		if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) {
			blk_rq_set_mixed_merge(req);
			req->cmd_flags &= ~REQ_FAILFAST_MASK;
			req->cmd_flags |= ff;
		}

		/* Prepend the bio to the head of the request's bio chain. */
		bio->bi_next = req->bio;
		req->bio = bio;

		/*
		 * may not be valid. if the low level driver said
		 * it didn't need a bounce buffer then it better
		 * not touch req->buffer either...
		 */
		req->buffer = bio_data(bio);
		req->__sector = bio->bi_sector;
		req->__data_len += bytes;
		req->ioprio = ioprio_best(req->ioprio, prio);
		if (!blk_rq_cpu_valid(req))
			req->cpu = bio->bi_comp_cpu;
		drive_stat_acct(req, 0);
		if (!attempt_front_merge(q, req))
			elv_merged_request(q, req, el_ret);
		goto out;

	/* ELV_NO_MERGE: elevator says don't/can't merge. */
	default:
		;
	}

get_rq:
	/*
	 * This sync check and mask will be re-done in init_request_from_bio(),
	 * but we need to set it earlier to expose the sync flag to the
	 * rq allocator and io schedulers.
	 */
	rw_flags = bio_data_dir(bio);
	if (sync)
		rw_flags |= REQ_RW_SYNC;

	/*
	 * Grab a free request. This might sleep but can not fail.
	 * Returns with the queue unlocked.
	 */
	req = get_request_wait(q, rw_flags, bio);

	/*
	 * After dropping the lock and possibly sleeping here, our request
	 * may now be mergeable after it had proven unmergeable (above).
	 * We don't worry about that case for efficiency. It won't happen
	 * often, and the elevators are able to handle it.
	 */
	init_request_from_bio(req, bio);

	/* Retake the queue lock before touching the queue again. */
	spin_lock_irq(q->queue_lock);
	if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
	    bio_flagged(bio, BIO_CPU_AFFINE))
		req->cpu = blk_cpu_to_group(smp_processor_id());
	if (queue_should_plug(q) && elv_queue_empty(q))
		blk_plug_device(q);
	add_request(q, req);
out:
	if (unplug || !queue_should_plug(q))
		__generic_unplug_device(q);
	spin_unlock_irq(q->queue_lock);
	return 0;
}

I/O 调度器的合并处理就在 __make_request 中通过调用相应调度器的函数来完成。__make_request 调用 elv_merge 通过调度器判断是否可以合并，如果可以则根据 front merging 或者 back merging 分别由调度器做处理。如果不能合并则调用 get_request_wait 和 init_request_from_bio 根据 bio 请求创建并初始化新的 request，然后调用 add_request 将 request 加入请求队列。

/*
 * add_request - account a new request and hand it to the elevator.
 * @q:   request queue the request is added to
 * @req: request to add
 *
 * Calls drive_stat_acct() for the request and then inserts it through
 * __elv_add_request() using ELEVATOR_INSERT_SORT.
 */
static inline void add_request(struct request_queue *q, struct request *req)
{
	drive_stat_acct(req, 1);

	/*
	 * elevator indicated where it wants this request to be
	 * inserted at elevator_merge time
	 */
	__elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
}

add_request 还是会通过调度器将 request 插入请求队列中合适的位置。

最终 __make_request 返回 0 表示 bio 转发结束。后续 request 的处理方法和设备驱动的实现有关，一般通过注册到 request_queue 的 request_fn 函数指针进行处理，例如常见的 SCSI 设备就会将 scsi_request_fn 注册到 request_fn 上。驱动程序中请求发送以及自己的队列等处理完毕后调用 blk_complete_request 结束请求，而在结束请求过程中会调用 bio 的回调函数结束 bio。

本文只是对 Linux block 层做了基本的介绍，类似 buffer_head 处理、同步异步 I/O 处理等很多都没有涉及，以后再专门来研究了，文章有任何问题欢迎指正哦 ^_^

Linux kernel学习-内存管理

Uranus Zhou — Sat, 30 Jun 2012 19:03:50 +0000

本文同步自（如浏览不正常请点击跳转）：https://zohead.com/archives/linux-kernel-learning-memory-management/

接着之前的 Linux kernel 学习步伐，来到极其重要的内存管理部分，继续本文内容，需要先了解内存寻址的基础知识，见之前的 [内存寻址] 博文。

1、内存页及内存区域：

正如之前所说，Linux kernel 使用物理页作为内存管理的基本单位，其中重要的线性地址和物理地址的转换操作由页单元 MMU 来完成，系统的页表也由 MMU 来维护。kernel 使用 struct page 来表示一个物理页，它的定义在 include/linux/mm_types.h 头文件中：

/*
 * struct page - descriptor the kernel keeps for one physical page.
 *
 * Several members are overlaid in unions, so which member is meaningful
 * depends on how the page is currently used (see the per-field comments).
 */
struct page {
	unsigned long flags;		/* Atomic flags, some possibly
					 * updated asynchronously */
	atomic_t _count;		/* Usage count, see below. */
	union {
		atomic_t _mapcount;	/* Count of ptes mapped in mms,
					 * to show when page is mapped
					 * & limit reverse map searches.
					 */
		struct {		/* SLUB */
			u16 inuse;
			u16 objects;
		};
	};
	union {
	    struct {
		unsigned long private;		/* Mapping-private opaque data:
					 	 * usually used for buffer_heads
						 * if PagePrivate set; used for
						 * swp_entry_t if PageSwapCache;
						 * indicates order in the buddy
						 * system if PG_buddy is set.
						 */
		struct address_space *mapping;	/* If low bit clear, points to
						 * inode address_space, or NULL.
						 * If page mapped as anonymous
						 * memory, low bit is set, and
						 * it points to anon_vma object:
						 * see PAGE_MAPPING_ANON below.
						 */
	    };
#if USE_SPLIT_PTLOCKS
	    spinlock_t ptl;
#endif
	    struct kmem_cache *slab;	/* SLUB: Pointer to slab */
	    struct page *first_page;	/* Compound tail pages */
	};
	union {
		pgoff_t index;		/* Our offset within mapping. */
		void *freelist;		/* SLUB: freelist req. slab lock */
	};
	struct list_head lru;		/* Pageout list, eg. active_list
					 * protected by zone->lru_lock !
					 */
	/*
	 * On machines where all RAM is mapped into kernel address space,
	 * we can simply calculate the virtual address. On machines with
	 * highmem some memory is mapped into kernel virtual memory
	 * dynamically, so we need a place to store that address.
	 * Note that this field could be 16 bits on x86 ...  
	 *
	 * Architectures with slow multiplication can define
	 * WANT_PAGE_VIRTUAL in asm/page.h
	 */
#if defined(WANT_PAGE_VIRTUAL)
	void *virtual;			/* Kernel virtual address (NULL if
					   not kmapped, ie. highmem) */
#endif /* WANT_PAGE_VIRTUAL */
#ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS
	unsigned long debug_flags;	/* Use atomic bitops on this */
#endif

#ifdef CONFIG_KMEMCHECK
	/*
	 * kmemcheck wants to track the status of each byte in a page; this
	 * is a pointer to such a status block. NULL if not tracked.
	 */
	void *shadow;
#endif
};

其中的 flags 用于表示页的状态（是否为脏或者被锁定等），_count 即为页的引用计数，kernel 一般使用 page_count 宏调用 atomic_read 函数原子的读取此值，page_count 返回 0 表示此页可用。如果一个页被作为 page cache 使用，则 page 的 mapping 字段指向映射的 inode 的 address_space 对象，如果页被作为私有数据（作为 buffer_heads 缓冲、buddy 系统等），则 private 常包含对应的信息。注意其中的 virtual 字段为页的虚拟地址，结合之前的知识，对于高端内存来说，其并没有被固定映射到 kernel 地址空间中，因此如果 virtual 字段为 NULL，则表示此页必须被动态映射。

kernel 使用 page 结构记录系统中的所有页，因此 struct page 的大小应该要尽量小以减少内存占用，另外 kernel 必须知道页是否空闲，如果不空闲则拥有者是谁。

由于实际硬件限制，Linux kernel 不可能使用全部的物理内存，kernel 为此将内存划分为不同的区域，一个区域中的内存属性应该也相同。kernel 中常见的内存区域有 ZONE_DMA（可用于 DMA 的页）、ZONE_DMA32（与 ZONE_DMA 类似，但只对 32 位设备可用）、ZONE_NORMAL、ZONE_HIGHMEM（并没有被固定映射的高端内存区域），这些内存区域一般都是硬件相关的，例如在 x86 架构下，ZONE_DMA 的范围为 0MB - 16MB，ZONE_HIGHMEM 为高于 896MB 的物理内存，而在 x86_64 架构下 ZONE_HIGHMEM 则为空。需要注意的是内存的分配不会跨越这些不同的内存区域。内存区域在 kernel 中由 struct zone 结构来表示，其中的 name 字段即为内存区域名称。

2、获取页：

分配和释放内存是 Linux kernel 中极其重要又用的极多的接口。先看看 kernel 提供的直接获取以内存页面为单位的 alloc_pages 函数：

struct page * alloc_pages(gfp_t gfp_mask, unsigned int order)

此函数是最基本的用于分配大小为 2^order 并且连续的物理页的函数，其返回分配到的第一个页面的 page 指针。

来看看比较重要的 gfp_t 类型的 gfp_mask 值：

gfp_t 实际上就是 unsigned int 类型，gfp_mask 常用于指定行为方式、区域方式、类型等信息。常见的行为方式标志有：__GFP_WAIT（标志分配器可以睡眠，明显不适用于中断上下文中）、__GFP_IO（分配器可以启动磁盘 I/O）等。区域方式指定内存从哪里分配，对应的就有：__GFP_DMA、__GFP_DMA32、__GFP_HIGHMEM（从高端内存或普通内存中分配）。类型标志则用于简化分配时的指定操作，常见的有：GFP_ATOMIC（高优先级并不可睡眠，常用于中断、中断下半部、持有自旋锁等环境中）、GFP_NOIO（表示分配可阻塞但不可以发起 I/O 操作）、GFP_NOFS（分配时不可发起文件 I/O 操作）、GFP_KERNEL（最常见的分配标志，常用于可以睡眠的进程上下文中）、GFP_USER（用于分配内存给用户进程）、GFP_DMA 等。

需要注意的是对 __get_free_pages 和 kmalloc 函数（下面会分别说明）不能指定 __GFP_HIGHMEM 标志，因为它们都是直接返回的虚拟地址，而非 page 结构指针，如果指定了 __GFP_HIGHMEM，则他们可能分配到的内存并没有被映射到 kernel 地址空间，因此这样得不到虚拟地址。只有 alloc_page 函数可以分配高端内存，这个限制在下面的 __get_free_pages 函数的实现中可以看到。

使用 page_address 函数可以将 page 指针转换为虚拟地址（非物理地址）。实际使用中经常会用到 __get_free_pages 函数在分配页时直接得到虚拟地址，其参数与 alloc_pages 完全一样，看看它的实现就一目了然了：

/*
 * __get_free_pages - allocate pages and return a virtual address.
 * @gfp_mask: allocation flags; must not contain __GFP_HIGHMEM
 * @order:    order passed on to alloc_pages()
 *
 * Thin wrapper around alloc_pages() that converts the resulting page
 * pointer with page_address().
 *
 * Returns the address as an unsigned long, or 0 if alloc_pages() fails.
 */
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
	struct page *page;

	/*
	 * __get_free_pages() returns a 32-bit address, which cannot represent
	 * a highmem page
	 */
	VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);

	page = alloc_pages(gfp_mask, order);
	if (!page)
		return 0;
	return (unsigned long) page_address(page);
}

另外 kernel 还 “好心” 的提供了两个只分配一个页的函数：alloc_page 和 __get_free_page，可以想象只是把 order 参数设为 0 而已。你可以使用 get_zeroed_page 函数分配一个页并自动清零（gfp_mask 指定 __GFP_ZERO）。

对应的释放页可以用 __free_pages（page 指针为参数）、free_pages（虚拟地址为参数）、free_page（只释放一个页）这些函数。

下面是常用的分配非整数倍页大小的内存的函数。首先是最常用的 kmalloc 函数：

void *kmalloc(size_t size, gfp_t flags)

kmalloc 用于分配最少指定的 size 字节大小的内存（实际分配的可能比 size 多），这与用户空间的 malloc 函数很相似，但需要注意的是 kmalloc 分配的内存物理地址是连续的，这非常重要。

相应的释放内存函数是 kfree：

void kfree(const void *objp)

kfree 用于释放 kmalloc 分配的内存，注意如果将 kfree 用在不是由 kmalloc 分配的内存地址或者已经 kfree 过的地址上，都可能导致 kernel 出错。

紧接着就是大名鼎鼎的 vmalloc 函数了。它与 kmalloc 类似，但它分配的内存只是虚拟连续的而物理地址却不一定连续，这也类似于用户空间的 malloc 函数的效果。vmalloc 由于需要做页表转换之类的操作，性能比 kmalloc 差，而且 vmalloc 得到的页还必须由单独的页来做映射，对 TLB 缓存的效率也会有影响（有关 TLB 缓存参考之前的文章 [内存寻址]），由于这些原因，vmalloc 在 kernel 中用到的机会并不是很多，其常用于分配大量的内存，常见的一个例子就是内核模块的代码就是通过 vmalloc 加载到 kernel 中的。vmalloc 的原型为：

void * vmalloc(unsigned long size)

与之对应的，使用 vfree 释放分配的内存。另外 vmalloc 和 vfree 都是可以睡眠的，因此它们对中断上下文是不适用的。

3、Slab分配器：

Slab 也是 Linux kernel 中非常重要的组成部分，它用于简化内存的分配和释放，它相当于一个可用内存列表，里面包含一堆已经分配好的数据结构，当 kernel 需要分配一个数据结构时，可以直接从这个可用内存列表中取出而节省分配的时间，不需要的时候又可以还给这个列表而不需要释放，因此这个列表用于缓存经常访问的某种类型的数据。为了统一管理和释放，Linux kernel 引入 Slab 分配器作为通用的数据结构缓存层给经常访问的数据结构使用。需要说明的是 kmalloc 就是在 Slab 分配器基础上实现的。

这里简单对 Slab 分配器做个介绍，有关其细节请参考这篇 PDF 文档：

The Slab Allocator: An Object-Caching Kernel Memory Allocator

Slab 层将不同的对象划分到名为 cache 的不同组中，每个组存储不同类型的数据，也就是每种数据类型都有一个 cache。每个 cache 然后被划分为多个 slab，slab 由一个或多个连续的物理页组成（通常只有一个页），每个 slab 又包含一些数量的对象，也就是实际缓存的数据。每个 slab 的状态可以是这三个中的一个：满、部分满、空。当 kernel 请求一个新对象时，优先从状态为部分满的 slab 中取，如果没有则从状态为空的 slab 中分配，如果没有状态为空的 slab 了就创建一个，可以看到这种策略可以相对的减少内存碎片。

kernel 中常用到的 struct inode 结构就是一个典型的例子，它在 VFS 等地方被用到的非常多，因此 kernel 中增加一个名为 inode_cachep 的 cache 用于缓存 inode 结构。

每个 cache 由 kmem_cache 结构来表示，它的 struct kmem_list3 *nodelists[MAX_NUMNODES] 类型字段即为该 cache 包含的所有 slab。每个 slab 由 struct slab 结构来表示，看看 kmem_list3 和 slab 结构的定义：

/*
 * struct slab - descriptor of a single slab belonging to a cache.
 */
struct slab {
	struct list_head list;	/* presumably links the slab into one of the
				 * kmem_list3 lists (full/partial/free) */
	unsigned long colouroff;
	void *s_mem;		/* including colour offset */
	unsigned int inuse;	/* num of objs active in slab */
	kmem_bufctl_t free;
	unsigned short nodeid;
};

/*
 * struct kmem_list3 - the slab lists of a cache.
 *
 * Holds three list heads (partial, full, free) plus bookkeeping fields.
 * A kmem_cache refers to these through its nodelists[] array.
 */
struct kmem_list3 {
	struct list_head slabs_partial;	/* partial list first, better asm code */
	struct list_head slabs_full;
	struct list_head slabs_free;
	unsigned long free_objects;
	unsigned int free_limit;
	unsigned int colour_next;	/* Per-node cache coloring */
	spinlock_t list_lock;
	struct array_cache *shared;	/* shared per node */
	struct array_cache **alien;	/* on other nodes */
	unsigned long next_reap;	/* updated without locking */
	int free_touched;		/* updated without locking */
};

多个 slab 可以分别链接到 kmem_list3 的满（slabs_full）、部分满（slabs_partial）、空（slabs_free）3 个链表中。

Slab 分配器调用 kmem_getpages 函数分配新的 slab（关于 cache 的创建下面会提到），kmem_getpages 会调用页分配函数（下面的实现中是 alloc_pages_exact_node）分配所需的内存用于存放 cache 的对象，因此 kmem_getpages 一般在既没有部分满（partial）也没有空（free）slab 的情况下调用，来看看它的实现：

/*
 * kmem_getpages - allocate the pages backing a new slab of a cache.
 * @cachep: cache the pages are allocated for; supplies gfpflags, flags,
 *          gfporder and ctor
 * @flags:  caller's GFP flags, extended below with the cache's own flags
 * @nodeid: node passed to alloc_pages_exact_node()
 *
 * Returns page_address() of the first allocated page, or NULL if the
 * page allocation fails.
 */
static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
	struct page *page;
	int nr_pages;
	int i;

#ifndef CONFIG_MMU
	/*
	 * Nommu uses slab's for process anonymous memory allocations, and thus
	 * requires __GFP_COMP to properly refcount higher order allocations
	 */
	flags |= __GFP_COMP;
#endif

	/* Merge in the GFP flags configured on the cache itself. */
	flags |= cachep->gfpflags;
	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
		flags |= __GFP_RECLAIMABLE;

	/* Allocate 2^gfporder pages on the requested node. */
	page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
	if (!page)
		return NULL;

	/* Add the pages to the matching per-zone slab counter. */
	nr_pages = (1 << cachep->gfporder);
	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
		add_zone_page_state(page_zone(page),
			NR_SLAB_RECLAIMABLE, nr_pages);
	else
		add_zone_page_state(page_zone(page),
			NR_SLAB_UNRECLAIMABLE, nr_pages);
	/* Flag every page of the allocation as a slab page. */
	for (i = 0; i < nr_pages; i++)
		__SetPageSlab(page + i);

	/*
	 * With kmemcheck enabled and tracking not disabled for this cache,
	 * allocate shadow state; the initial marking depends on whether the
	 * cache has a constructor.
	 */
	if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
		kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);

		if (cachep->ctor)
			kmemcheck_mark_uninitialized_pages(page, nr_pages);
		else
			kmemcheck_mark_unallocated_pages(page, nr_pages);
	}

	return page_address(page);
}

第一个参数 cachep 为需要分配页的 cache，cachep->gfporder 指定要分配的大小，上面的代码中对于 NUMA 架构做了必要的处理。

kmem_getpages 分配的内存通过 kmem_freepages 释放，它调用 free_pages 释放页，kmem_freepages 一般在系统检测到内存不足时调用或者在销毁 cache 时显式调用。

下面重点来看看 Slab 分配器如何使用。

使用 kmem_cache_create 函数创建新的 cache，其定义为：

struct kmem_cache * kmem_cache_create (const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *))

第一个 name 参数指定 cache 的名称，size 为 cache 中对象的大小，align 为对象的对齐（一般为 0），flags 控制 cache 的行为，最后一个参数 ctor 为对象的构造函数，cache 分配新页时会调用此构造函数，现在一般将 ctor 值设为 NULL。

cache 的标志可以是下面常用几种标志的 OR 值：

SLAB_HWCACHE_ALIGN：对 cache 中的每个对象做对齐处理，对齐之后可以提高 cache line 的访问性能，但由于要浪费内存空间，因此一般只在对性能有很高要求的场合使用；
SLAB_POISON：以固定的值填充 slab（默认 0xa5a5a5a5）；
SLAB_PANIC：如果分配失败，kernel 直接 panic；
SLAB_CACHE_DMA ：指定 Slab 层在 ZONE_DMA 上分配每个 slab。

kmem_cache_create 如果成功返回 struct kmem_cache 结构指针，注意由于 kmem_cache_create 函数可能会睡眠，因此不能在中断上下文中使用。

使用 kmem_cache_destroy 函数销毁 kmem_cache_create 返回的 cache，此函数一般在模块退出时调用，你也可以在很多模块的初始化中找到 kmem_cache_create。同样由于会睡眠，kmem_cache_destroy 也不能在中断上下文中使用。

cache 被创建之后，就可以调用 kmem_cache_alloc 函数从 cache 中取得对象，其定义为：

void * kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)

此函数直接返回对象的指针，如果 cache 中所有 slab 都没有空闲的对象了，Slab 层就需要调用 kmem_getpages 获取新的页。

如果一个对象不再需要使用了，可以调用 kmem_cache_free 将其回收到 slab 中：

void kmem_cache_free(struct kmem_cache *cachep, void *objp)

需要注意的就是 kmem_cache_free 和 kmem_cache_destroy 不能混淆。

4、高端内存映射：

由于高端内存不是被固定映射到 kernel 地址空间中，因此 alloc_pages 函数使用时如果指定了 __GFP_HIGHMEM 标志，则它返回的 page 很可能没有有效的虚拟地址。

使用 kmap 函数可以将一个 page 固定的映射到 kernel 地址空间中：

void *kmap(struct page *page)

注意此函数对高端内存和低端内存都是适用的，如果 page 在低端内存，则直接返回页的虚拟地址，否则需要创建内存映射，由于 kmap 可能会睡眠，因此不能在中断上下文中使用。

被映射的高端内存不需要时应使用 kunmap 函数删除映射。

另外对于不能睡眠的进程环境，Linux kernel 又提供了临时的高端内存映射方法。kernel 可以原子地映射一个高端内存页到 kernel 中的保留映射集中的一个，此保留映射集也是专门用于中断上下文等不能睡眠的地方映射高端内存页的需要。临时高端内存映射函数为 kmap_atomic，看看它在 x86 下的实现：

/*
 * kmap_atomic_prot - temporarily map a page with the given protection.
 * @page: page to map
 * @type: kmap slot type; selects the fixmap slot together with the CPU id
 * @prot: page protection used to build the PTE
 *
 * Page-fault handling is disabled first in every case. Pages that are not
 * highmem are returned via page_address() without creating a mapping.
 *
 * Returns the virtual address through which the page can be accessed.
 */
void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
{
	enum fixed_addresses idx;
	unsigned long vaddr;

	/* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
	pagefault_disable();

	/* Non-highmem page: no temporary mapping is created. */
	if (!PageHighMem(page))
		return page_address(page);

	debug_kmap_atomic(type);

	/* Each CPU owns KM_TYPE_NR consecutive slots; pick the one for @type. */
	idx = type + KM_TYPE_NR*smp_processor_id();
	vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
	/* The slot must be empty before the new PTE is installed. */
	BUG_ON(!pte_none(*(kmap_pte-idx)));
	set_pte(kmap_pte-idx, mk_pte(page, prot));

	return (void *)vaddr;
}

/*
 * kmap_atomic - temporarily map a page using the default protection.
 * @page: page to map
 * @type: kmap slot type
 *
 * Forwards to kmap_atomic_prot() with kmap_prot and returns its result.
 */
void *kmap_atomic(struct page *page, enum km_type type)
{
	return kmap_atomic_prot(page, type, kmap_prot);
}

kmap_atomic 实际调用 kmap_atomic_prot 实现临时映射，kmap_atomic_prot 中同样会先做判断，如果要映射的页不在高端内存则直接返回虚拟地址，然后根据 type 和当前处理器 ID 计算得到 fixmap 的索引，并调用 __fix_to_virt 将 fixmap 索引转换为虚拟地址，有关 fixmap 机制见之前的 [内存寻址] 博文。

kmap_atomic 函数的 type 参数用于指定临时映射的用途，此函数会禁用内核抢占，因为临时映射是和每个处理器相关的，它是直接调用 pagefault_disable 函数禁止 page fault handler，其中会自动禁用内核抢占，看看 pagefault_disable 的实现：

/*
 * pagefault_disable - increment the preempt count, then issue barrier()
 * so the increment is in place before any access that could fault.
 */
static inline void pagefault_disable(void)
{
	inc_preempt_count();
	/*
	 * make sure to have issued the store before a pagefault
	 * can hit.
	 */
	barrier();
}

临时高端内存映射可以使用 kunmap_atomic 函数删除，它会启用内核抢占，同时它也不会睡眠，需要注意的是此次的临时高端内存映射在下一次临时映射高端内存时就会无效。

本文中如果有任何问题，欢迎提出指正哦，玩的开心~~~ ^_^

Linux kernel percpu变量解析

Uranus Zhou — Sat, 16 Jun 2012 11:27:20 +0000

本文同步自（如浏览不正常请点击跳转）：https://zohead.com/archives/linux-kernel-percpu-variable/

Linux 2.6 kernel 中的 percpu 变量是经常用到的东西，因为现在很多计算机都已经支持多处理器了，而且 kernel 默认都会被编译成 SMP 的，相对于原来多个处理器共享数据并进行处理的方式，用 percpu 变量在 SMP、NUMA 等架构下可以提高性能，而且很多情况下必须用 percpu 来对不同的处理器做出数据区分。

本文以 kernel 中的 softirq 为例简单说下 percpu 变量，我们先来看看 kernel 中唤醒 ksoftirqd 的实现，ksoftirqd 在 ps 命令看到的进程列表中很容易找到，是每个处理器都有一个（如果有 4 个处理器，则有 4 个 kernel 线程名称分别从 ksoftirqd/0 到 ksoftirqd/3），关于 softirq 本身的实现不在本文讨论范围内，唤醒 ksoftirqd 的实现在 kernel/softirq.c 文件中：

/* Per-CPU pointer to the ksoftirqd task of each processor. */
static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);

/*
 * wakeup_softirqd - wake the current CPU's ksoftirqd task.
 *
 * Does nothing if the per-CPU pointer is NULL or the task is already in
 * TASK_RUNNING state.
 */
void wakeup_softirqd(void)
{
	/* Interrupts are disabled: no need to stop preemption */
	struct task_struct *tsk = __get_cpu_var(ksoftirqd);

	if (tsk && tsk->state != TASK_RUNNING)
		wake_up_process(tsk);
}

这里就用到了 percpu 变量 ksoftirqd，它是通过 DEFINE_PER_CPU 宏来进行定义的 percpu task_struct 列表，通过 __get_cpu_var 宏来得到相应处理器的 ksoftirqd/n 的 task_struct，然后调用 wake_up_process 函数唤醒进程（也就是 ksoftirqd/n kernel 线程），关于 wake_up_process 等进程调度的相关实现在之前的日志中有介绍的，请参考 [这里]。

__get_cpu_var、DEFINE_PER_CPU 等 percpu 宏的实现在 include/linux/percpu.h、include/asm-generic/percpu.h 等头文件中。先看看 include/asm-generic/percpu.h 中的一些定义：

#ifdef CONFIG_SMP

/*
 * per_cpu_offset() is the offset that has to be added to a
 * percpu variable to get to the instance for a certain processor.
 *
 * Most arches use the __per_cpu_offset array for those offsets but
 * some arches have their own ways of determining the offset (x86_64, s390).
 */
#ifndef __per_cpu_offset
extern unsigned long __per_cpu_offset[NR_CPUS];

#define per_cpu_offset(x) (__per_cpu_offset[x])
#endif

/*
 * Determine the offset for the currently active processor.
 * An arch may define __my_cpu_offset to provide a more effective
 * means of obtaining the offset to the per cpu variables of the
 * current processor.
 *
 * my_cpu_offset looks the CPU up via smp_processor_id() when
 * CONFIG_DEBUG_PREEMPT is set and is otherwise the same as
 * __my_cpu_offset.
 */
#ifndef __my_cpu_offset
#define __my_cpu_offset per_cpu_offset(raw_smp_processor_id())
#endif
#ifdef CONFIG_DEBUG_PREEMPT
#define my_cpu_offset per_cpu_offset(smp_processor_id())
#else
#define my_cpu_offset __my_cpu_offset
#endif

/*
 * Add a offset to a pointer but keep the pointer as is.
 *
 * Only S390 provides its own means of moving the pointer.
 */
#ifndef SHIFT_PERCPU_PTR
/* Weird cast keeps both GCC and sparse happy. */
#define SHIFT_PERCPU_PTR(__p, __offset)	({				\
	__verify_pcpu_ptr((__p));					\
	RELOC_HIDE((typeof(*(__p)) __kernel __force *)(__p), (__offset)); \
})
#endif

/*
 * A percpu variable may point to a discarded regions. The following are
 * established ways to produce a usable pointer from the percpu variable
 * offset.
 *
 * per_cpu(var, cpu):      instance of var for the given cpu
 * __get_cpu_var(var):     instance for the current cpu, via my_cpu_offset
 * __raw_get_cpu_var(var): instance for the current cpu, via __my_cpu_offset
 */
#define per_cpu(var, cpu) \
	(*SHIFT_PERCPU_PTR(&(var), per_cpu_offset(cpu)))
#define __get_cpu_var(var) \
	(*SHIFT_PERCPU_PTR(&(var), my_cpu_offset))
#define __raw_get_cpu_var(var) \
	(*SHIFT_PERCPU_PTR(&(var), __my_cpu_offset))

/* Pointer variants: shift a percpu pointer to the current cpu's instance. */
#define this_cpu_ptr(ptr) SHIFT_PERCPU_PTR(ptr, my_cpu_offset)
#define __this_cpu_ptr(ptr) SHIFT_PERCPU_PTR(ptr, __my_cpu_offset)

#ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
extern void setup_per_cpu_areas(void);
#endif

#else /* ! SMP */

/*
 * Without SMP no offset is applied: the accessors resolve to the
 * variable itself (per_cpu() still evaluates its cpu argument).
 */
#define per_cpu(var, cpu)			(*((void)(cpu), &(var)))
#define __get_cpu_var(var)			(var)
#define __raw_get_cpu_var(var)			(var)
#define this_cpu_ptr(ptr) per_cpu_ptr(ptr, 0)
#define __this_cpu_ptr(ptr) this_cpu_ptr(ptr)

#endif	/* SMP */

/*
 * Name of the base section that percpu variables are placed in:
 * ".data.percpu" on SMP, plain ".data" otherwise. May be pre-defined
 * elsewhere, in which case this default is skipped.
 */
#ifndef PER_CPU_BASE_SECTION
#ifdef CONFIG_SMP
#define PER_CPU_BASE_SECTION ".data.percpu"
#else
#define PER_CPU_BASE_SECTION ".data"
#endif
#endif

/*
 * Section-name suffixes for the shared-aligned, aligned and "first"
 * percpu variants. On SMP, modules get empty suffixes for the two
 * aligned variants.
 */
#ifdef CONFIG_SMP

#ifdef MODULE
#define PER_CPU_SHARED_ALIGNED_SECTION ""
#define PER_CPU_ALIGNED_SECTION ""
#else
#define PER_CPU_SHARED_ALIGNED_SECTION ".shared_aligned"
#define PER_CPU_ALIGNED_SECTION ".shared_aligned"
#endif
#define PER_CPU_FIRST_SECTION ".first"

#else

#define PER_CPU_SHARED_ALIGNED_SECTION ""
#define PER_CPU_ALIGNED_SECTION ".shared_aligned"
#define PER_CPU_FIRST_SECTION ""

#endif

通常所有的 percpu 变量是一起存放在特定的 section 里的，像上面头文件中的 .data.percpu 基础 section（当然非 SMP 系统下就是 .data 了）、.shared_aligned、.first section。使用 objdump 可以看到编译 kernel 时的 vmlinux 文件的 section（结果没有完全显示）：

objdump -h vmlinux

vmlinux:     file format elf64-x86-64

  0 .text         0037a127  ffffffff81000000  0000000001000000  00200000  2**12
                  CONTENTS, ALLOC, LOAD, READONLY, CODE
  3 .rodata       0013c8ec  ffffffff8137f000  000000000137f000  0057f000  2**6
                  CONTENTS, ALLOC, LOAD, READONLY, DATA
 11 .data         0004d920  ffffffff814ec000  00000000014ec000  006ec000  2**12
                  CONTENTS, ALLOC, LOAD, DATA
 19 .data.percpu  00012880  0000000000000000  000000000153b000  00a00000  2**12
                  CONTENTS, ALLOC, LOAD, DATA

可以看到 vmlinux 文件中的 .data 和 .data.percpu section。

percpu 变量的地址实际上就是其在上面说到的 section 里的偏移量，这个偏移量还要加上特定处理器的偏移量（也就是上面头文件中的 per_cpu_offset、my_cpu_offset 等）得到最终的变量地址，并最终以指针引用的方式得到值，这样访问的效果就有点类似于访问全局变量了。percpu 变量通常用于更新非常频繁而访问机会又相对比较少的场合，这样的处理方式可以避免多处理器环境下的频繁加锁等操作。

从上面的注释也可以看到 per_cpu_offset 是在一个 percpu 变量上增加的偏移量，大多数系统架构下使用 __per_cpu_offset 数组来作为偏移量，而 x86_64 等架构下处理方式则不同。my_cpu_offset 是在调用 per_cpu_offset 时使用 smp_processor_id() 得到当前处理器 ID 作为参数，__my_cpu_offset 则是用 raw_smp_processor_id() 的值作为 per_cpu_offset 的参数（smp_processor_id() 在抢占被关闭时是安全的）。SHIFT_PERCPU_PTR 宏用于给指针增加偏移量，它使用的 RELOC_HIDE 宏在不同的编译器下实现不同，在 include/linux/compiler.h 头文件中，看看 gcc 编译下的处理：

/*
 * RELOC_HIDE - add the byte offset @off to @ptr and return the result
 * with the type of @ptr.
 *
 * The pointer value is first passed through an empty asm statement into
 * an unsigned long, and the addition is done on that integer.
 */
#define RELOC_HIDE(ptr, off)					\
  ({ unsigned long __ptr;					\
    __asm__ ("" : "=r"(__ptr) : "0"(ptr));		\
    (typeof(ptr)) (__ptr + (off)); })

可以看到 gcc 中使用内嵌汇编先将 ptr 值赋给 __ptr（unsigned long 类型），然后在 __ptr 基础上增加偏移量，这样可以避免编译报错，ptr 值不变而且最终以 ptr 指定的类型来返回。

include/asm-generic/percpu.h 头文件中定义了 per_cpu、__get_cpu_var、__raw_get_cpu_var、this_cpu_ptr、__this_cpu_ptr 等几个常用的宏。per_cpu 就用于得到某个指定处理器的变量，__get_cpu_var 用于得到当前处理器的 percpu 变量值。

再来看看 DEFINE_PER_CPU 的实现，它在 include/linux/percpu-defs.h 头文件中：

/*
 * Attributes for a percpu variable: __percpu, placement in section
 * PER_CPU_BASE_SECTION with the suffix @sec appended, and
 * PER_CPU_ATTRIBUTES.
 */
#define __PCPU_ATTRS(sec)						\
	__percpu __attribute__((section(PER_CPU_BASE_SECTION sec)))	\
	PER_CPU_ATTRIBUTES

/* Define percpu variable @name of @type in the section with suffix @sec. */
#define DEFINE_PER_CPU_SECTION(type, name, sec)				\
	__PCPU_ATTRS(sec) PER_CPU_DEF_ATTRIBUTES			\
	__typeof__(type) name

/* Define percpu variable @name of @type in the base percpu section. */
#define DEFINE_PER_CPU(type, name)					\
	DEFINE_PER_CPU_SECTION(type, name, "")

使用 DEFINE_PER_CPU 宏可以静态的定义 percpu 变量。__PCPU_ATTRS 指定输入的 section 类型，DEFINE_PER_CPU_SECTION 用于在特定的 section 上定义特定类型的变量。__typeof__ 和上面见到的 typeof 是一样的，都用于获取 type 的数据类型。__attribute__((section(xxx))) 表示把定义的变量存储在指定的 section 上。DEFINE_PER_CPU 就用于定义在 PER_CPU_BASE_SECTION section 上（从最开始的代码中也可以看出非 SMP 时用 .data 段，SMP 时用 .data.percpu 段）。

然后是 get_cpu_var 宏的实现，它在 include/linux/percpu.h 头文件中：

/*
 * get_cpu_var - disable preemption and yield the current cpu's instance
 * of @var. Each use is expected to be paired with put_cpu_var(), which
 * re-enables preemption.
 *
 * Must be an lvalue. Since @var must be a simple identifier,
 * we force a syntax error here if it isn't.
 */
#define get_cpu_var(var) (*({				\
	preempt_disable();				\
	&__get_cpu_var(var); }))

/*
 * put_cpu_var - re-enable preemption after get_cpu_var().
 *
 * The weird & is necessary because sparse considers (void)(var) to be
 * a direct dereference of percpu variable (var).
 */
#define put_cpu_var(var) do {				\
	(void)&(var);					\
	preempt_enable();				\
} while (0)

/*
 * alloc_percpu - dynamically allocate a percpu object of @type through
 * __alloc_percpu(), using the size and alignment of @type; the result is
 * cast to a __percpu pointer to @type.
 */
#define alloc_percpu(type)	\
	(typeof(type) __percpu *)__alloc_percpu(sizeof(type), __alignof__(type))

get_cpu_var 会先禁止抢占然后调用 __get_cpu_var 得到 percpu 变量值。put_cpu_var 则重新启用抢占。

另外在 include/linux/percpu.h 等文件中还定义了 alloc_percpu 和 free_percpu 宏来动态定义和释放 percpu 变量，他们都是通过 percpu memory allocator 来实现的，在 mm/percpu.c 中，动态分配的 percpu 变量可以通过 per_cpu_ptr 宏来得到，为此 kernel 还引入了 this_cpu_ptr、this_cpu_read 等一系列相关机制用寄存器替代内存提高对 percpu 变量的访问速度，关于 percpu memory allocator 等信息以后再来详细分析了。

以上为个人分析结果，有任何问题欢迎指正咯 ^_^

Linux kernel kfifo分析

Uranus Zhou — Mon, 11 Jun 2012 17:32:15 +0000

本文同步自（如浏览不正常请点击跳转）：https://zohead.com/archives/linux-kernel-kfifo/

kfifo 是 Linux kernel 中的一个通用队列实现，对于 kernel 中常见的 FIFO 队列应用还是很有用的，本文主要简单介绍分析下 Linux kernel kfifo。实际上 ChinaUnix 上有个 kfifo 的分析文章，但已经比较老（基于 Linux 2.6.10），而且我现在用的 2.6.34 版本 kernel 中 kfifo 实现有很多改动，故自己简单写下，ChinaUnix 上的 kfifo 介绍帖子在这里：

http://bbs.chinaunix.net/thread-1994832-1-1.html

kfifo 定义在 include/linux/kfifo.h 头文件中，我们经常使用的就是 kfifo 结构，看看它的定义：

/*
 * struct kfifo - a FIFO queue over a byte buffer.
 *
 * in and out are reduced modulo size when used as buffer offsets, as the
 * field comments state.
 */
struct kfifo {
	unsigned char *buffer;	/* the buffer holding the data */
	unsigned int size;	/* the size of the allocated buffer */
	unsigned int in;	/* data is added at offset (in % size) */
	unsigned int out;	/* data is extracted from off. (out % size) */
};

kfifo 也像其它队列那样提供了两个主要操作：入队列（in）和出队列（out），对应于上面结构中的 in 和 out 两个偏移量，in 偏移量为下次入队列的位置，out 为下次出队列的位置，很容易也能想到 out 值必须小于等于 in 值，当 out 值等于 in 值时表示队列为空，kfifo 中 buffer 为队列的空间，size 为空间大小，必须为 2 的 k 次幂值（原因在下面说明）。当然如果 in 与 out 的差值等于队列空间大小了，就表示队列已经满了。

先看看 kfifo 最简单的一些操作实现，在 kernel/kfifo.c 文件中：

/*
 * _kfifo_init - attach a buffer to a fifo and reset it.
 * @fifo:   fifo to initialize
 * @buffer: buffer to use (callers in this file also pass NULL)
 * @size:   size of @buffer; not validated here
 */
static void _kfifo_init(struct kfifo *fifo, void *buffer,
		unsigned int size)
{
	fifo->buffer = buffer;
	fifo->size = size;

	kfifo_reset(fifo);
}

/**
 * kfifo_init - initialize a FIFO using a preallocated buffer
 * @fifo: the fifo to assign the buffer
 * @buffer: the preallocated buffer to be used.
 * @size: the size of the internal buffer, this has to be a power of 2.
 *
 * Hits BUG_ON() if @size is not a power of 2; unlike kfifo_alloc() the
 * size is not rounded up.
 */
void kfifo_init(struct kfifo *fifo, void *buffer, unsigned int size)
{
	/* size must be a power of 2 */
	BUG_ON(!is_power_of_2(size));

	_kfifo_init(fifo, buffer, size);
}
EXPORT_SYMBOL(kfifo_init);

/**
 * kfifo_alloc - allocates a new FIFO internal buffer
 * @fifo: the fifo to assign the new buffer to
 * @size: the requested size of the buffer; rounded up to the next power
 *        of 2 if it is not one already (values above 0x80000000 that are
 *        not a power of 2 hit BUG_ON())
 * @gfp_mask: get_free_pages mask, passed to kmalloc()
 *
 * This function dynamically allocates a new fifo internal buffer.
 *
 * The buffer will be released with kfifo_free().
 * Returns 0 on success. If kmalloc() fails, @fifo is initialized with a
 * NULL buffer and size 0 and -ENOMEM is returned.
 */
int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask)
{
	unsigned char *buffer;

	/*
	 * round up to the next power of 2, since our 'let the indices
	 * wrap' technique works only in this case.
	 */
	if (!is_power_of_2(size)) {
		BUG_ON(size > 0x80000000);
		size = roundup_pow_of_two(size);
	}

	buffer = kmalloc(size, gfp_mask);
	if (!buffer) {
		_kfifo_init(fifo, NULL, 0);
		return -ENOMEM;
	}

	_kfifo_init(fifo, buffer, size);

	return 0;
}
EXPORT_SYMBOL(kfifo_alloc);

/**
 * kfifo_free - frees the FIFO internal buffer
 * @fifo: the fifo to be freed.
 *
 * The buffer is passed to kfree() and @fifo is then re-initialized with
 * a NULL buffer and size 0.
 */
void kfifo_free(struct kfifo *fifo)
{
	kfree(fifo->buffer);
	_kfifo_init(fifo, NULL, 0);
}
EXPORT_SYMBOL(kfifo_free);

调用 kfifo_alloc 可以自动分配空间并初始化，你也可以调用 kfifo_init 函数使用自己的空间来初始化队列，可以看到这两个函数中都用 is_power_of_2 做了检查队列空间的操作。kfifo_free 释放队列，它会调用 _kfifo_init 函数（参数为 NULL 和 0 清空队列），调用 kfifo_reset 可以重置队列（将 in 和 out 都设为 0）。你也可以用 DECLARE_KFIFO 和 INIT_KFIFO 静态定义一个 kfifo 队列，尽管这不太会被用到。

调用 kfifo_in 函数将数据加入队列，kfifo_out 将数据从队列中取出并从队列中删除（增加 out 值），Linux 还提供了 kfifo_out_peek 函数从队列中取数据但并不删除（不增加 out 值）。kfifo_in 中会先调用 __kfifo_in_data 将输入加入队列，然后调用 __kfifo_add_in 增加 in 的值，kfifo_out 相反则调用 __kfifo_out_data 和 __kfifo_add_out 函数取出数据并删除。

kfifo 中同时提供了 kfifo_from_user 函数用于将用户空间的数据加入到队列中，它会先调用 __kfifo_from_user_data 将用户空间的数据加入队列，再调用 __kfifo_add_in 增加 in 的值。看看 __kfifo_from_user_data 的实现：

/*
 * __kfifo_from_user_data - copy user-space data into the fifo buffer.
 * @fifo:   destination fifo
 * @from:   user-space source pointer
 * @len:    number of bytes to copy
 * @off:    extra offset added to fifo->in to find the write position
 * @lenout: output length; see below for the value written
 *
 * The copy is done in up to two pieces: from the write position to the
 * end of the buffer, then the remainder at the start of the buffer.
 * fifo->in is not modified here.
 *
 * Returns 0 if both copy_from_user() calls return 0, otherwise -EFAULT.
 * *lenout is set to the first call's nonzero return value if that call
 * fails; otherwise to the first piece's length plus either the second
 * call's nonzero return value or the second piece's length.
 */
static inline int __kfifo_from_user_data(struct kfifo *fifo,
	 const void __user *from, unsigned int len, unsigned int off,
	 unsigned *lenout)
{
	unsigned int l;
	int ret;

	/*
	 * Ensure that we sample the fifo->out index -before- we
	 * start putting bytes into the kfifo.
	 */

	smp_mb();

	/* Turn the logical position into an index inside the buffer. */
	off = __kfifo_off(fifo, fifo->in + off);

	/* first put the data starting from fifo->in to buffer end */
	l = min(len, fifo->size - off);
	ret = copy_from_user(fifo->buffer + off, from, l);
	if (unlikely(ret)) {
		*lenout = ret;
		return -EFAULT;
	}
	*lenout = l;

	/* then put the rest (if any) at the beginning of the buffer */
	ret = copy_from_user(fifo->buffer, from + l, len - l);
	*lenout += ret ? ret : len - l;
	return ret ? -EFAULT : 0;
}

可以看到 __kfifo_from_user_data 中是直接调用 copy_from_user 将用户空间的数据拷贝到 kfifo 队列的空间中。相应的也有 kfifo_to_user 函数将队列中的数据取出到用户空间的地址，他就调用 copy_to_user 将队列中数据拷贝到用户空间。

需要注意的是 __kfifo_from_user_data 中用到的 __kfifo_off 函数：

/*
 * __kfifo_off - reduce an offset to an index inside the fifo buffer.
 *
 * Masks @off with (size - 1), which equals off % size only when
 * fifo->size is a power of 2.
 */
static inline unsigned int __kfifo_off(struct kfifo *fifo, unsigned int off)
{
	return off & (fifo->size - 1);
}

__kfifo_off 是根据指定的偏移量得到索引值，由这里也可以看出队列的大小为什么必须是 2 的 k 次幂值，否则无法得到正确的值。而且从代码中可以看到 __kfifo_from_user_data、__kfifo_in_n、__kfifo_in_rec 等函数中都用到了 __kfifo_off 函数指定加入队列时的偏移量。

另外从 include/linux/kfifo.h 中你也可以看到新的 kfifo 实现中默认 EXPORT 了非常多的 API 函数给 kernel 开发者使用。

以上为个人分析结果，有任何问题欢迎指正哦 ^_^

Linux kernel学习-内存寻址

Uranus Zhou — Fri, 25 May 2012 20:03:02 +0000

近日在看 Understanding the Linux kernel（慢慢啃E文原版，以下简称 ULK），这本书虽然已经是第三版了，但它基于的 Linux kernel 版本却不是很新，现在 Linux kernel 都已经出到 3.4 版本了，这本书还是基于 2.6.11 的 kernel，不得不说 Linux kernel 的更迭速度太快了。

下面准备以我正在用的 2.6.34 版本的 kernel 为基础进行学习，这本书中不对应的地方我会尽量找到新 kernel 中的实现，并尽量自己做个了解，日后的相同日志如无意外也基于 2.6.34 版本 Linux kernel。

首先已完成第一章：Introduction（这一章没有 Linux kernel 代码），来到第二章 Memory Addressing，开始是介绍逻辑地址、线性地址、物理地址的对应关系，虽然之前用汇编写过 Linux 的 bootloader，用到过实模式和保护模式，但对 GDT、LDT 的概念并没有深入了解过。这一章开篇就介绍了 Intel 80X86 硬件上内存分段的实现，包括段选择子，段寄存器，段描述符。

段式内存管理

每个内存段由 8 个字节的段描述符来表示段的特征。段描述符被存储在 GDT 或者 LDT 中。内存中 GDT 的地址和大小包含在 gdtr 控制寄存器中，LDT 的地址和大小包含在 ldtr 控制寄存器中。段寄存器的高 13 位为段描述符在 GDT 或者 LDT 中的索引，GDT 或者 LDT 结构中包含基地址、段长度等信息。通过检查指令地址和段长度并确定没有越界以及权限是否正确之后，由于线性地址 = 段基址 + 偏移地址，GDT 或者 LDT 中的基地址加上指令中的偏移量就可以得到需要的线性地址。

备注：由于每个进程都可以有 LDT，而 GDT 只有一个，为满足需求 Intel 的做法是将 LDT 嵌套在 GDT 表中。

Linux kernel 中的内存分段

Linux中所有进程使用相同的段寄存器值，因此它们的线性地址集也是相同的，不管在用户模式还是内核模式，都可以使用相同的逻辑地址，32位 kernel下为 4G 的地址空间。

ULK 中介绍的 user code、user data、kernel code、kernel data 这四个段对应的段选择子的宏为：__USER_CS、__USER_DS、__KERNEL_CS、__KERNEL_DS，2.6.11 中这4个宏定义在 include/asm-i386/segment.h 头文件中，2.6.34 中已经挪到 arch/x86/include/asm/segment.h 里，因为 2.6.34 中 i386 和 x86_64 的代码已经尽可能的合并到 x86 目录中，而不像老版本的代码那样弄成两个目录。定义如下：

/*
 * Segment selector values for the kernel/user code and data segments.
 * Each is the GDT entry index multiplied by 8; the user selectors add 3,
 * which sets the two lowest bits of the selector.
 */
#define __KERNEL_CS	(GDT_ENTRY_KERNEL_CS*8)
#define __KERNEL_DS	(GDT_ENTRY_KERNEL_DS*8)
#define __USER_DS	(GDT_ENTRY_DEFAULT_USER_DS*8+3)
#define __USER_CS	(GDT_ENTRY_DEFAULT_USER_CS*8+3)

下面是 Linux kernel GDT 的实现：

由于 kernel 中每个 CPU 都需要有一个 GDT，因此就有一个 GDT table，ULK 中说的是存在 cpu_gdt_table 中，GDT 的地址和大小存在 cpu_gdt_descr 中，2.6.11 kernel 里都是放在 arch/i386/kernel/head.S，使用的地方：

/* GDT table declarations (2.6.11): a global array and a per-CPU one. */
extern struct desc_struct cpu_gdt_table[GDT_ENTRIES];
DECLARE_PER_CPU(struct desc_struct, cpu_gdt_table[GDT_ENTRIES]);

/*
 * Packed size/address record for a descriptor table, followed by a
 * 16-bit pad field.
 */
struct Xgt_desc_struct {
	unsigned short size;
	unsigned long address __attribute__((packed));
	unsigned short pad;
} __attribute__ ((packed));

/* IDT record plus one GDT record per possible CPU. */
extern struct Xgt_desc_struct idt_descr, cpu_gdt_descr[NR_CPUS];

到了 2.6.34 中已经改为：

/*
 * struct gdt_page - page-aligned container for one GDT of GDT_ENTRIES
 * descriptors; declared per-CPU below.
 */
struct gdt_page {
	struct desc_struct gdt[GDT_ENTRIES];
} __attribute__((aligned(PAGE_SIZE)));
DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);

/*
 * get_cpu_gdt_table - return the GDT of the given CPU.
 * @cpu: CPU number used to select the per-CPU gdt_page instance
 *
 * Returns a pointer to the first descriptor of that CPU's gdt array.
 */
static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
{
	return per_cpu(gdt_page, cpu).gdt;
}

可以看到 2.6.34 中去掉了原来的 cpu_gdt_table 变量（详见 kernel commit bf50467204b435421d8de33ad080fa46c6f3d50b），新增了一个 gdt_page 结构存放 GDT table，而且提供 get_cpu_gdt_table 函数取得某个 CPU 的 GDT。cpu_gdt_descr 也已去掉，新增了 desc_ptr 结构存放每个 CPU 的 GDT 信息，cpu_gdt_descr 也改为 early_gdt_descr。

/*
 * struct desc_ptr - packed size/address pair describing a descriptor
 * table, e.g. the GDT.
 */
struct desc_ptr {
	unsigned short size;
	unsigned long address;
} __attribute__((packed)) ;

简单看下新的切换 GDT 的实现：

/*
 * switch_to_new_gdt - load the GDT of the given CPU.
 * @cpu: CPU whose GDT and percpu segment are loaded
 *
 * Current gdt points %fs at the "master" per-cpu area: after this,
 * it's on the real one.
 */
void switch_to_new_gdt(int cpu)
{
	struct desc_ptr gdt_descr;

	/* Describe this CPU's GDT: its address and its size minus one. */
	gdt_descr.address = (long)get_cpu_gdt_table(cpu);
	gdt_descr.size = GDT_SIZE - 1;
	load_gdt(&gdt_descr);
	/* Reload the per-cpu base */

	load_percpu_segment(cpu);
}

load_gdt 最终调用 lgdt 汇编指令。

页式内存管理

Intel 从 80386 开始支持页式内存管理，页单元将线性地址翻译为物理地址。当 CR0 控制寄存器中的 PG 位置为 1 时，启动分页管理功能，为 0 时，禁止分页管理功能，并且把线性地址作物理地址使用。

32 位线性地址的高 10 位为页表目录的下标（指向页表），中间 10 位为页表的下标（指向页面），低 12 位为该地址在页面（通常大小为 4 KB）中的偏移量，这样的二层寻址设计主要为了减少页表本身所占用的内存，由于页表目录和页表都为 10 位，因此都最多包含 1024 个项。正在使用的页表目录的物理地址存在 cr3 控制寄存器中。

在 32 位大小的页表目录（页表）的结构中，其高 20 位为页表（页面）基地址的高 20 位，其它的 flag 中包含一个 Present 标志，如果该值为 1，表示指向的页面或者页表在内存中，如果为 0，页单元会将线性地址存在 cr2 控制寄存器中，并产生异常号 14： page fault。

页表目录结构中另外有一个 Page Size 标志（页表结构没有此标志），如果设为 1，则页面大小可以为 2MB 或者 4MB，这样可以跳过页表转换，将 cr4 寄存器的 PSE 标志启用即可启用大页面支持，此时 32 位线性地址由高 10 位页表目录下标和低 22 位的偏移量组成。

为满足寻址超过 4GB 的需求，Intel 从 Pentium Pro 处理器开始，将处理器的地址引脚数量由原来的 32 个提升为 36 个，处理器的寻址空间也从 4GB 增到 64GB，并增加 PAE 页面机制（设置 cr4 寄存器的 PAE 标志启用）：64G内存可以划分为 2^24 个页面，页表中的基地址由 20 位增为 24 位，页表结构的大小由 32 位增为 64 位，增加 PDPT 表从而使用三层寻址设计来解释 32 位的线性地址等等。PAE 机制稍显复杂，而且由于仍然使用 32 位线性地址，因此对于应用程序来说，仍然无法使用超过 4GB 的地址空间，64GB 只是对于 kernel 而言的。

顺带说下不同的 64 位架构下的页面寻址级别，见下表，可以看到常用的 x86_64 架构只用了 48 位的线性地址空间，但也达到了 256TB 咯 ^_^

硬件 cache

由于现在 CPU 速度太快，频率已经动辄多少 GHz，而相对的 DRAM 内存频率就慢很多，而且 DRAM 由于设计上电容存在不可避免的漏电原因，DRAM 的数据只能保持很短的时间，必须隔一段时间就刷新一次，不刷新的话会造成存储的信息丢失；而 SRAM 在加电之后不需要刷新，数据也不会丢失，由于 SRAM 的内部结构明显比 DRAM 复杂，而且由于价格原因不能将容量做的很大，DRAM 常用于 PC 机的内存，而 SRAM 常用于 CPU 的 L1 和 L2、L3 缓存，这时位于 SRAM 和 DRAM 之间的处理器 cache 控制器就应运而生了。

首先 CPU 从 cache 里读取的数据是以数据总线宽度为单位的，而新引入的 cache line 则是cache 和 memory 之间数据传输的最小单元，一般的 cache line size 有 32个字节、64个字节等。cache memory 的大小一般以 cache line size 为单位，可以包含多个 cache line，假设 cache line size 是 32 字节，数据总线宽度是 128 位，一个 cache line 就需要多次的总线操作，为此 x86 可以使用锁总线来保证一个操作序列是原子的。

CPU 访问 RAM 地址时，首先会根据地址判断是否在 cache 中，假设 cache 命中，如果是读操作，cache 控制器从 cache memory 中取得数据传给 CPU 寄存器，RAM 就不被访问以提高性能，如果是写操作，cache 控制器一般都需要实现 write-through 和 write-back 两种缓存策略。对于 L1 cache，大多是 write-through 的（同时写 RAM 和 cache line）；L2 cache 则是 write-back 的，只更新 cache line，不会立即写回 memory，只在需要时再更新，而且 cache 控制器一般只在 CPU 得到需要刷新 cache line 的指令时才刷新。反之 cache 未命中时，cache line 中的数据需要写回 memory，如果需要的话，将正确的 cache line 的数据从 RAM 中取出并更新。

如果能提高 CPU 的 cache 命中率，减少 cache 和 memory 之间的数据传输，将会提高系统的性能。

在多处理器环境中，每个处理器都有独立的硬件 cache。因此存在 cache 一致性问题，需要额外的硬件电路同步 cache 内容。

Linux kernel 中默认对所有页面启用 cache，并且都使用 write-back 策略。

TLB（Translation Lookaside Buffers）的作用：

除了通用的硬件 cache 之外，80X86 处理器包含 TLB（Translation Lookaside Buffers）cache 用于提高线性地址转换的速度。某个地址第一次使用时，MMU 将它对应的物理地址填入 TLB 中，下次使用同一地址时就可以从 TLB cache 里取出。TLB 中的内容需要保持与页表的一致，页面目录的物理地址变化时（更新 cr3 寄存器的值），TLB 中的所有内容也会被更新。

另外在多处理器环境中，每个处理器都有自己的 TLB，不过多处理器下的 TLB 是不需要像 CPU cache 那样做同步，因为某个进程对于不同的处理器的线性地址是相同的。

Linux 内存分页管理

2.6.11 之后的新 Linux kernel 中为了兼容 x86_64 等硬件架构已经将原先的两层页结构改为四层页结构：页全局目录（PGD）、页上级目录（PUD）、页中间目录（PMD）、页表（PT）。这样一个线性地址就被分成了 5 个部分，为了适应不同架构的考虑，这 5 个部分的位长度并没有固定，有的 64 位硬件架构只使用了三层页结构。

对于 32 位又没有 PAE 的架构，两层页结构就已经够了，此时 Linux 将 PUD、PMD 的长度设为 0 位，为了使同样的代码既可以在 32 位又能在 64 位上运行，Linux 会将 PUD、PMD 的条目数设为 1，并将 PUD、PMD 映射放到 PGD 的合适条目上，PUD 的惟一的条目指向下一级的 PMD，PMD 的惟一的条目指向下一级的 PT，这样可以做到对于 OS 来说还是使用四层页结构。对于 32 位并启用了 PAE 的架构，PGD 对应于原来的 PDPT，PUD 被移除（长度为 0 位），PMD 对应于原来的页目录，PT 还是对应于原来的页表。

下面这张摘来的图很好的说明了段式内存管理和页式内存管理的关系（还算简单，没有画出 PGD、PUD、PMD 这种东西）：

每个进程有自己的 PGD 和页表，当进程发生切换时，当前进程的描述符中就保存了 cr3 寄存器的值。

页表、PMD、PUD、PGD 在 Linux kernel 中分别用 pte_t、pmd_t、pud_t 和 pgd_t 来表示，PAE 启用时这些值为 64 位，否则为 32 位。Linux kernel 提供 pte_clear、set_pte、pte_none、pte_same 等宏（函数）来操作或判断页表的值。

pmd_present 宏根据指定的页或页表是否在内存中而返回 1 或 0。而 pud_present、pgd_present 始终返回 1。需要注意的是 pte_present 宏，当页表结构中 Present 或者 Page Size 标志位为 1 时，pte_present 都返回 1，否则返回 0。由于 Page Size 标志位对于页表没有意义，因此对于一些虽然在内存中但并没有读/写/执行权限的 page，kernel 会将其 Present 标志位置为 0 及 Page Size 标志位置为 1，由于 Present 被置为 0，访问这些 page 时将触发 page fault 异常，但这时 kernel 就会根据 Page Size 为 1 而判断出这个异常不是真的由缺页引起的。

pgd_index、pmd_index、pte_index 分别返回指定的线性地址在 PGD、PMD、PT 中映射该线性地址所在项的索引，其它还有一些例如 pte_page、pmd_page、pud_page、pgd_page 这种操作不同种类的 page descriptor 的函数（宏）。

Linux kernel 内存布局

现在的 2.6 bzImage kernel 在启动时一般装载在 0x100000 即 1MB 的内存地址上（2.4 zImage 默认装载在 0x10000 内存地址上，具体请参考 Linux boot protocol - Documentation/x86/boot.txt），因为 1MB 之前的内存被 BIOS 和一些设备使用，这些可以找 BIOS 内存图来参考学习。

kernel 中有 min_low_pfn 变量表示在内存中 kernel 映像之后第一个可用页框的页号，max_pfn 表示最后一个可用页框的页号，max_low_pfn 表示最后一个由 kernel 直接映射的页框的页号（low memory），totalhigh_pages 表示 kernel 不能直接映射的页框数（high memory），highstart_pfn 和 highend_pfn 就比较好理解了。

在 32 位 Linux 中地址空间为4G，0~3G 为用户空间，物理地址的 0~896M 是直接写死的内核空间（即 low memory），大于 896M 的物理地址必须建立映射才能访问，可以通过 alloc_page() 等函数获得对应的 page，大于 896M 的就称为高端内存（high memory），另外剩下 896M~1G 的 128M 空间就用来映射 high memory 的，这段空间包括：

noncontiguous memory area 映射

从 VMALLOC_OFFSET 到 VMALLOC_END，其中 VMALLOC_OFFSET 距 high memory 8MB，每个被映射的 noncontiguous memory area 中间还要间隔一个页面（4KB）；
映射 persistent kernel mapping

从 PKMAP_BASE 开始；
最后用于 fix-mapped linear address

见下面的固定映射的说明。

所有进程从 3GB 到 4GB 的虚拟空间都是一样的，Linux 以此方式让内核态进程共享代码段和数据段。

3GB（0xC0000000）就是物理地址与虚拟地址之间的位移量，在 Linux kernel 代码中就叫做 PAGE_OFFSET。Linux kernel 提供 __pa 宏将从 PAGE_OFFSET 开始的线性地址转换为对应的物理地址，__va 则做相反的操作。

备注：Linux kernel 中有配置选项可以将用户/内核空间划分为分别 2G，64位 Linux 由于不存在 4G 地址空间限制不存在高端内存。

下面以一个编译 32 位 Linux kernel 时产生的 System.map 符号表文件说明下 kernel 在内存中的使用情况，在 System.map 里可以看到下面几个符号：

c1000000 T _text
c131bab6 T _etext
c131d000 R __start_rodata
c143c000 R __end_rodata
c143c000 D _sdata
c1469f40 D _edata
c151a000 B __bss_start
c1585154 B __bss_stop
c16ab000 A _end

_text 表示 kernel code 开始处，这个在符号表就被链接到 0xc0000000 + 0x100000，这样所有的符号地址 = 0xC0000000 + 符号，_etext 表示 kernel code 结束处，之后是 rodata 只读数据段的开始 __start_rodata 和结束位置 __end_rodata；已初始化的 kernel data 在 _sdata 开始并结束于 _edata 位置，紧接着是未初始化的 kernel data（BSS），开始于 __bss_start 结束于 __bss_stop，通常 System.map 的最后都是 _end。（注意：这里看到的 kernel ELF 段分布和 ULK 说的并不完全一样，ULK 说的相对比较笼统）

Linux kernel 内存映射

PGD 被分成了两部分，第一部分表项映射的线性地址小于 0xc0000000 （PGD 共 1024 项，在 PAE 未启用时是前 768 项，PAE 启用时是前 3 项），具体大小依赖特定进程。相反，剩余的表项对所有进程来说都应该是相同的，它们等于 master kernel PGD 的相应表项。

系统在初始化时，kernel 会维护一个 master kernel PGD，初始化之后，这个 master kernel PGD 将不会再被任何进程或者 kernel 线程直接使用，而对于系统中的常规进程的 PGD，最开始的一些 PGD 条目（PAE 禁用时为最开始 768 条，启用时为最开始 3 条）是进程相关的，其它的 PGD 条目则和其它进程一样统一指向对应的 master kernel PGD 中的最高的一些 PGD 条目，master kernel PGD 只相当于参考模型。

kernel 刚被装载到内存时，CPU 还处于实模式，分页功能还未启用，首先 kernel 会创建一个有限的 kernel code 和 data 地址空间、初始页表、以及一些动态数据；然后 kernel 利用这个最小的地址空间完成使用所有 RAM 并正确设置页表。

看看 arch/x86/kernel/head_32.S 中 kernel 临时页表的 AT&T 汇编代码：

/*
 * Byte offset, within the page directory, of the entry that maps
 * __PAGE_OFFSET (entry index = address >> 22, 4 bytes per entry).
 */
page_pde_offset = (__PAGE_OFFSET >> 20);

	/*
	 * %edi: physical address where page tables are written (__brk_base)
	 * %edx: physical address of the current swapper_pg_dir entry
	 * %eax: next page table entry value (frame address + attributes)
	 */
	movl $pa(__brk_base), %edi
	movl $pa(swapper_pg_dir), %edx
	movl $PTE_IDENT_ATTR, %eax
10:
	leal PDE_IDENT_ATTR(%edi),%ecx		/* Create PDE entry */
	movl %ecx,(%edx)			/* Store identity PDE entry */
	movl %ecx,page_pde_offset(%edx)		/* Store kernel PDE entry */
	addl $4,%edx
	/* Fill one page table: 1024 entries, each 0x1000 further on. */
	movl $1024, %ecx
11:
	stosl
	addl $0x1000,%eax
	loop 11b
	/*
	 * End condition: we must map up to the end + MAPPING_BEYOND_END.
	 */
	movl $pa(_end) + MAPPING_BEYOND_END + PTE_IDENT_ATTR, %ebp
	cmpl %ebp,%eax
	jb 10b
	/* Record the virtual address just past the tables in _brk_end. */
	addl $__PAGE_OFFSET, %edi
	movl %edi, pa(_brk_end)
	/* %eax >> 12 is stored as max_pfn_mapped. */
	shrl $12, %eax
	movl %eax, pa(max_pfn_mapped)

	/* Do early initialization of the fixmap area */
	/* 0xffc is the byte offset of the last (1024th) directory entry. */
	movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax
	movl %eax,pa(swapper_pg_dir+0xffc)

// ... 中间代码省略 ... //

/*
 * Enable paging
 */
	movl $pa(swapper_pg_dir),%eax
	movl %eax,%cr3		/* set the page table pointer.. */
	movl %cr0,%eax
	orl  $X86_CR0_PG,%eax
	movl %eax,%cr0		/* ..and set paging (PG) bit */
	ljmp $__BOOT_CS,$1f	/* Clear prefetch and normalize %eip */
1:
	/* Set up the stack pointer */
	lss stack_start,%esp

kernel 临时页表就包含在 swapper_pg_dir 中，最后先把 swapper_pg_dir 的物理地址写入 cr3 寄存器，再设置 cr0 寄存器的 PG 位来启用内存分页管理。master kernel PGD 在 paging_init 中初始化，其中调用 pagetable_init：

#ifdef CONFIG_HIGHMEM
/*
 * Initialise the page tables for the PKMAP range and record where the
 * PTE for its first address lives.
 *
 * @pgd_base: page directory passed on to page_table_range_init() for the
 *            range [PKMAP_BASE, PKMAP_BASE + PAGE_SIZE * LAST_PKMAP).
 *
 * Once the range has been initialised, the entry for PKMAP_BASE is looked
 * up level by level (pgd -> pud -> pmd -> pte), starting from
 * swapper_pg_dir, and the resulting PTE pointer is stored in
 * pkmap_page_table.
 */
static void __init permanent_kmaps_init(pgd_t *pgd_base)
{
	unsigned long kmap_start = PKMAP_BASE;
	pgd_t *pgd_entry;
	pud_t *pud_entry;
	pmd_t *pmd_entry;

	page_table_range_init(kmap_start, kmap_start + PAGE_SIZE*LAST_PKMAP,
			      pgd_base);

	/* Walk down from the top-level directory to the PTE of kmap_start. */
	pgd_entry = swapper_pg_dir + pgd_index(kmap_start);
	pud_entry = pud_offset(pgd_entry, kmap_start);
	pmd_entry = pmd_offset(pud_entry, kmap_start);
	pkmap_page_table = pte_offset_kernel(pmd_entry, kmap_start);
}

#else
/* Built without CONFIG_HIGHMEM: nothing to set up, so this is a no-op. */
static inline void permanent_kmaps_init(pgd_t *pgd_base)
{
}
#endif /* CONFIG_HIGHMEM */

/*
 * Hand swapper_pg_dir to permanent_kmaps_init(); that call is the only
 * work done here.
 */
static void __init pagetable_init(void)
{
	permanent_kmaps_init(swapper_pg_dir);
}

/*
 * paging_init() sets up the page tables - note that the first 8MB are
 * already mapped by head.S.
 *
 * This routine also unmaps the page at virtual kernel address 0, so
 * that we can trap those pesky NULL-reference errors in the kernel.
 *
 * NOTE(review): the two statements above come from the inherited header
 * comment; neither the 8MB mapping nor the unmapping of address 0 is
 * visible in this body - confirm against head.S and the callees.
 *
 * Calls made here, in order: pagetable_init(), __flush_tlb_all(),
 * kmap_init(), sparse_init(), zone_sizes_init().  Takes no arguments and
 * returns nothing.
 */
void __init paging_init(void)
{
	pagetable_init();

	/* Runs right after pagetable_init() - presumably to drop stale TLB entries. */
	__flush_tlb_all();

	kmap_init();

	/*
	 * NOTE: at this point the bootmem allocator is fully available.
	 */
	sparse_init();
	zone_sizes_init();
}

如果计算机内存少于 896M，32 位地址就已经足够寻址所有 RAM，就不必要开启 PAE 了。如果内存多于 4GB 且 CPU 支持 PAE，kernel 也已经启用 PAE，则使用三层页结构，并使用大页面以减少页表数。

有关固定映射

kernel 线性地址的 896M 映射系统物理内存，然而至少 128MB 的线性地址总是留作他用，因为内核使用这些线性地址实现非连续内存分配和固定映射的线性地址。

Linux 内核中提供了一段虚拟地址用于固定映射，也就是 fixmap。fixmap 是这样一种机制：提供一些线性地址，在编译时就确定下来，等到 Linux 引导时再为之建立起和物理地址的映射（用 set_fixmap(idx, phys)、set_fixmap_nocache 函数）。fixmap 地址比指针更加好用，dereference 也比普通的指针速度要快，因为普通的指针 dereference 时比 fixmap 地址多一个内存访问，而且 fixmap 在 dereference 时也不需要做检查是否有效的操作。fixmap 地址可以在编译时作为一个常量，只是这个常量在 kernel 启动时被映射。

kernel 能确保在发生上下文切换时 fixmap 的页表项不会从 TLB 中被 flush，这样对它的访问可以始终通过高速缓存。

固定映射的线性地址（fix-mapped linear address）是一个固定的线性地址，它所对应的物理地址不是通过简单的线性转换得到的，而是人为强制指定的。每个固定的线性地址都映射到一块物理内存页。固定映射线性地址能够映射到任何一页物理内存。

固定映射线性地址是从整个线性地址空间的最后 4KB 即线性地址 0xfffff000 向低地址进行分配的。在最后 4KB 空间与固定映射线性地址空间的顶端空留一页（未知原因），固定映射线性地址空间前面的地址空间就是 vmalloc 分配的区域，他们之间也空有一页。

固定映射的线性地址基本上是一种类似于 0xffffc000 这样的常量线性地址，其对应的物理地址不必等于线性地址减去 0xc0000000，而是通过页表以任意方式建立。因此每个固定映射的线性地址都映射一个物理内存的页框。

每个 fixmap 地址在 kernel 里是放在 enum fixed_addresses 数据结构中的，fix_to_virt 函数用于将 fixmap 在 fixed_addresses 中的索引转换为虚拟地址。fix_to_virt 还是一个 inline 函数，编译时不会产生函数调用代码。

处理硬件 cache 和 TLB

以下措施用于优化硬件cache（L1、L2 cache 等）的命中率：

1) 一个数据结构中使用最频繁的字段被放在数据结构的低偏移量，这样可以在同一个 cache line 里被缓存；
2) 分配很多块数据结构时，kernel 会尝试将它们分别存在 memory 中以使所有 cache line 都能被均匀使用。

80x86 处理器会自动处理 cache 同步，Linux kernel 只对其它没有同步 cache 操作的处理器单独做 cache flushing。

Linux 提供了一些在页表变化时 flush TLB 的函数，例如 flush_tlb（常用于进程切换时）、flush_tlb_all（常用于更新 kernel 页表项时）等。

到此 ULK 的内存寻址部分结束，本文只是我个人看 ULK 时的认识和查找到的一些总结，这算是我所写的日志里花的时间最长的一篇（从 5月22日写到 5月26日），花的心思也很多。本文基于 32 位 Linux kernel 而言，主要着重于 Linux 中最重要的部分之一：内存管理，本文中像 CPU L1、L2 cache 之类的一些信息都是笔者在看 ULK 不太了解时通过在网上查其它的文章而记录下的，因此里面有任何不正确之处欢迎指正 ^_^

page cache诊断控制工具 vmtouch 源代码分析

Uranus Zhou — Thu, 17 May 2012 16:29:51 +0000

本文同步自（如浏览不正常请点击跳转）：https://zohead.com/archives/vmtouch-code-analysis/

vmtouch 是一个 portable 的 page cache 诊断和控制工具，可以查看文件或者设备中有多少在 page cache 中，知道之后对这些在 page cache 中的内存引用可以避免 page fault，支持将文件的内容从 page cache 逐出，同时还可以将文件手工 touch 到 page cache 中，支持 lock 文件部分区域到 memory 中防止被交换出去从而提高性能。

vmtouch 可以在 Linux、BSD 等系统上使用，在这下载编译：

http://hoytech.com/vmtouch/

今天简单看了下 vmtouch 的代码，发现还比较简单，自己写个类似的程序验证之后，将代码分析结果写下。vmtouch 的代码比较少，我只贴出最关键的一个函数 vmtouch_file（关键部分已经高亮显示），这个函数做分析 page cache 使用、touch、lock 的操作，其它部分只是加了目录的遍历处理之类的。

/*
 * Convert a byte count into a page count, using the file-level `pagesize`.
 * Adding pagesize-1 before dividing means a partial trailing page still
 * counts as one page (for non-negative `bytes`).
 */
int64_t bytes2pages(int64_t bytes) {
	return (pagesize - 1 + bytes) / pagesize;
}

/*
 * Return 1 when none of the bits selected by (pagesize-1) are set in the
 * address `p`, 0 otherwise.
 * NOTE(review): this is a page-alignment test only if pagesize is a power
 * of two - pagesize is defined elsewhere, confirm there.
 */
int aligned_p(void *p) {
	return !((long)p & (pagesize-1));
}

/*
 * Test the least significant bit of one status byte.
 * Returns 1 if that bit is set and 0 if it is clear; all other bits of
 * `p` are ignored.
 */
int is_mincore_page_resident(char p) {
	return (p & 1) != 0;
}

void vmtouch_file(char *path) {
	int fd;
	void *mem;
	struct stat sb;
	int64_t len_of_file;
	int64_t pages_in_file;
	int i;
	int res;

	res = o_followsymlinks ? stat(path, &sb) : lstat(path, &sb);

	if (res) {
		warning("unable to stat %s (%s), skipping", path, strerror(errno));
		return;
	}

	if (S_ISLNK(sb.st_mode)) {
		warning("not following symbolic link %s", path);
		return;
	}

	if (sb.st_size == 0) return;

	if (sb.st_size > o_max_file_size) {
		warning("file %s too large, skipping", path);
		return;
	}

	len_of_file = sb.st_size;

	retry_open:

	fd = open(path, O_RDONLY, 0);

	if (fd == -1) {
		if (errno == ENFILE || errno == EMFILE) {
			increment_nofile_rlimit();
			goto retry_open;
		}

		warning("unable to open %s (%s), skipping", path, strerror(errno));
		return;
	}

	mem = mmap(NULL, len_of_file, PROT_READ, MAP_SHARED, fd, 0);

	if (mem == MAP_FAILED) {
		warning("unable to mmap file %s (%s), skipping", path, strerror(errno));
		close(fd);
		return;
	}

	if (!aligned_p(mem)) fatal("mmap(%s) wasn't page aligned", path);

	pages_in_file = bytes2pages(len_of_file);

	total_pages += pages_in_file;

	if (o_evict) {
		if (o_verbose) printf("Evicting %s\n", path);

#if defined(__linux__)
		if (posix_fadvise(fd, 0, len_of_file, POSIX_FADV_DONTNEED))
			warning("unable to posix_fadvise file %s (%s)", path, strerror(errno));
#elif defined(__FreeBSD__) || defined(__sun__)
		if (msync(mem, len_of_file, MS_INVALIDATE))
			warning("unable to msync invalidate file %s (%s)", path, strerror(errno));
#else
		fatal("cache eviction not (yet?) supported on this platform");
#endif
	} else {
		char mincore_array[pages_in_file];
		int64_t pages_in_core=0;
		double last_chart_print_time=0.0, temp_time;

		// 3rd arg to mincore is char* on BSD and unsigned char* on linux
		if (mincore(mem, len_of_file, (void*)mincore_array)) fatal("mincore %s (%s)", path, strerror(errno));
		for (i=0; i (last_chart_print_time+CHART_UPDATE_INTERVAL)) {
						last_chart_print_time = temp_time;
						print_page_residency_chart(stdout, mincore_array, pages_in_file);
					}
				}
			}
		}

		if (o_verbose) {
			print_page_residency_chart(stdout, mincore_array, pages_in_file);
			printf("\n");
		}
	}

	if (o_lock) {
		if (mlock(mem, len_of_file))
			fatal("mlock: %s (%s)", path, strerror(errno));
	}

	if (!o_lock && !o_lockall) {
		if (munmap(mem, len_of_file)) warning("unable to munmap file %s (%s)", path, strerror(errno));
		close(fd);
	}
}

稍微有点基础就可以看明白了，先 mmap 映射文件到当前进程，按 page size 对齐之后，调用 mincore 函数就可以得到文件中每一个 page 是否在 page cache 中，结果保存在 mincore_array 数组中，该数组中每个字节的最低位即表示是否在 page cache 中。

将文件内容逐出 page cache（指定 o_evict）是通过 posix_fadvise 函数调用 fadvise 系统调用来实现的（BSD通过 msync 实现，这个在 Linux 上没有效果）。fadvise 系统调用可以告诉 kernel 要操作的文件在接下来要干什么，kernel 可以提前做一些操作而提高性能，Linux kernel 里实现了以下几种控制方式：

POSIX_FADV_NORMAL - 正常操作，对文件使用底层设备的默认 readahead 值；
POSIX_FADV_SEQUENTIAL - 顺序I/O，对文件使用两倍的 readahead 值；
POSIX_FADV_RANDOM - 随机I/O，禁用文件上的 readahead；
POSIX_FADV_NOREUSE - 只使用一次
POSIX_FADV_WILLNEED - 很快需要使用，对文件使用非阻塞读到 page cache
POSIX_FADV_DONTNEED - 不再需要使用文件，从 page cache 中逐出

posix_fadvise 加 POSIX_FADV_DONTNEED 参数就可以将文件从 page cache 中逐出，需要注意的是如果需要确保文件从 page cache 中逐出，还需要在调用 fadvise 之前用 fsync/fdatasync/sync_file_range 之类的函数将 dirty page 清理。

下面是我在 Linux 下用 posix_fadvise 的一个测试程序测试的结果：

[root@localhost ~]# echo 3 > /proc/sys/vm/drop_caches
[root@localhost ~]# free
total       used       free     shared    buffers     cached
Mem:        374092      61832     312260          0        136       5060
-/+ buffers/cache:      56636     317456
Swap:       707576        436     707140
[root@localhost ~]# dd if=/dev/zero of=test bs=1024k count=100
记录了100+0 的读入
记录了100+0 的写出
104857600字节(105 MB)已复制，22.5514 秒，4.6 MB/秒
[root@localhost ~]# free
total       used       free     shared    buffers     cached
Mem:        374092     168960     205132          0        564     109816
-/+ buffers/cache:      58580     315512
Swap:       707576        436     707140
[root@localhost ~]# ./fadvise test POSIX_FADV_DONTNEED
OK
[root@localhost ~]# free
total       used       free     shared    buffers     cached
Mem:        374092      63932     310160          0        580       7424
-/+ buffers/cache:      55928     318164
Swap:       707576        436     707140

从 free 命令的结果可以很明显的看到，dd 之后基本文件都在 page cache 中，fadvise 之后从 page cache 中正确逐出。

接着是 vmtouch 中的 touch 操作（指定 o_touch）就更简单了，对 mmap 到的地址直接遍历引用，不在 page cache 的内容会自动产生 page fault 到 page cache 中。

lock 内存（指定 o_lock）则也是直接使用 mlock 函数来实现，mlock 对于对安全性和实时性有很高要求的程序非常有用，可以保证指定的文件区域在内存中，不被 swap 出去。

以上为个人分析结果，有任何问题欢迎指正咯 ^_^

Thunderbird（Firefox）病毒扫描原理分析及禁用方法

Uranus Zhou — Fri, 30 Mar 2012 02:42:23 +0000

本文博客链接：https://zohead.com/archives/thunderbird-virus-scan-analyse-disable/

笔者现在浏览网页和电子邮件全部都用 Mozilla 系的 Firefox 和 Thunderbird 了，这俩软件算是我的 Windows 系统中启动之后运行时间最长的两个应用程序，都非常好用，Thunderbird 虽然没有 Firefox 那么受欢迎，也算比 Outlook 好用很多（仅仅个人意见 ^_^），同时都支持扩展和附加组件。

最近发现有一点需要吐槽的是 Thunderbird 和 Firefox 默认下载附件或者下载东西完成之后都需要运行杀毒软件进行病毒扫描，而且附件比较大时还相当缓慢，对于我这种不想装杀毒软件的人或者受不了速度影响的人会有点感官影响，HOHO，寻办法禁用之。

1、Firefox

先说说比较简单的 Firefox 下禁用病毒扫描的方法（可以直接百度、GG 到）：

地址栏中输入“about:config”，忽略坑爹的 “保修提示”，在过滤器中输入“browser.download.manager.scanWhenDone”，通常找到的第一个就是要改的配置，双击将值改为 false，随便下个软件，马上就可以看到效果。

2、老版本 Thunderbird

接下来 Thunderbird 的禁用方法就没有直接的了，老版本的 Thunderbird 似乎也可以在 “about:config” 设置同样的值，但新版本的 “about:config” 中已经没有这个配置，注意 Thunderbird 的 “about:config” 不是直接输入了，需要打开首选项，进入 “高级” 选项卡，点 “常规”，然后打开 “配置编辑器” 就是想要的效果。

如果只想要新版本的修改方法，可以忽略分析部分直接跳到最后。

3、Thunderbird 扫描原理分析

无奈新版本的 Thunderbird 在搜索了 N 次之后找不到解决办法，祭出查源代码大法（一般都比较好使，哈哈），不想下那么大的源码包，直接在 Mozilla 官方网站上找 Thunderbird 的在线版本库，找到了 Mozilla Cross-Reference 做参考：
http://mxr.mozilla.org/

进入 Comm. Central 仓库：
http://mxr.mozilla.org/comm-central/

搜索 virus scan 之类的就能看到下面几个代码段，稍加分析，大略知道 Thunderbird 和 Firefox 的病毒扫描原理，简单分析下。

首先在 nsDownloadScanner.cpp 下找到扫描的初始化部分：

nsresult
nsDownloadScanner::Init()
{
  // This CoInitialize/CoUninitialize pattern seems to be common in the Mozilla
  // codebase. All other COM calls/objects are made on different threads.
  nsresult rv = NS_OK;
  CoInitialize(NULL);

  if (!IsAESAvailable()) {
    CoUninitialize();
    return NS_ERROR_NOT_AVAILABLE;
  }

  mAESExists = true;

  // Initialize scanning
  mWatchdog = new nsDownloadScannerWatchdog();

下面是同一文件中的比较重要的 IsAESAvailable 的实现：

bool
nsDownloadScanner::IsAESAvailable()
{
  // Try to instantiate IAE to see if it's available.
  nsRefPtr ae;
  HRESULT hr;
  hr = CoCreateInstance(CLSID_AttachmentServices, NULL, CLSCTX_INPROC,
                        IID_IAttachmentExecute, getter_AddRefs(ae));
  if (FAILED(hr)) {
    NS_WARNING("Could not instantiate attachment execution service\n");
    return false;
  }
  return true;
}

对是否支持扫描的判断很简单，判断 IAttachmentExecute COM组件是否可用，此组件在 Windows XP SP2 之后的系统中就自带，另外如果在更老的系统中则使用 IOfficeAntiVirus COM组件。

下面是下载管理器 nsDownloadManager.cpp 的处理：

nsresult
nsDownloadManager::Init()
{

  ... 中间省略的代码 ...

  rv = bundleService->CreateBundle(DOWNLOAD_MANAGER_BUNDLE,
                                   getter_AddRefs(mBundle));
  NS_ENSURE_SUCCESS(rv, rv);

#ifdef DOWNLOAD_SCANNER
  mScanner = new nsDownloadScanner();
  if (!mScanner)
    return NS_ERROR_OUT_OF_MEMORY;
  rv = mScanner->Init();
  if (NS_FAILED(rv)) {
    delete mScanner;
    mScanner = nsnull;
  }
#endif

可以看到第12行处会启动 nsDownloadScanner 进行下载附件的扫描处理。

4、组策略处理方法（不完美）

经过粗略了解之后便是解决方法咯，首先看能否对 Thunderbird 禁用 IAttachmentExecute COM组件。

有关 IAttachmentExecute COM组件的配置见这篇文章通过组策略设置“附件管理器”，可以参考这篇文章对整个系统的附件扫描机制做设置：
http://edu.kafan.cn/html/hips/11386.html

简单流程：
开始 - 运行，输入：gpedit.msc，打开组策略编辑器，定位到用户配置 - 管理模板 - Windows组件 - 附件管理器，截图如下，看下应该就能明白

可以看到这个设置是全局的，算是一种解决方法，但对其它需要扫描的程序（例如：Outlook）也会造成影响。

5、完美禁用 Thunderbird 病毒扫描

再经过搜寻源代码之后终于找到只改变 Thunderbird 扫描病毒配置的方法，原来 Thunderbird 可能出于安全的考虑将 “browser.download.manager.scanWhenDone” 配置从 “about:config” 配置编辑器中移除掉了，但这个配置功能并没有真正去掉，我们还有直接修改 Thunderbird 安装目录的配置脚本的方法，进入 Thunderbird 安装目录的 defaults\pref 下，打开 channel-prefs.js 文件编辑，增加扫描的配置。

例如我的配置脚本的路径为：C:\Program Files\Mozilla Thunderbird\defaults\pref\channel-prefs.js，打开该文件，增加一行：
pref("browser.download.manager.scanWhenDone", false);

即可禁用下载附件时的病毒扫描，需要再启用时恢复即可，贴图：

oops，写累了，以上只是粗略的分析，如果有任何错误或者更好的修改方法请PM或评论。 ^_^