Soul Of Free Loop » 线程

Linux kernel学习-进程地址空间

Uranus Zhou — Fri, 06 Jul 2012 19:12:14 +0000

本文同步自（如浏览不正常请点击跳转）：https://zohead.com/archives/linux-kernel-learning-process-address-space/

看完 Linux kernel block I/O 层之后来到进程地址空间管理部分，本文中的很多知识和之前的 [进程基本]、[进程调度]、[内存管理] 等章节的知识相关。

1、基础知识：

Linux kernel 给每个进程提供的进程地址空间一般是 32 位或 64 位（硬件相关）的平坦地址空间，但进程是没有权限访问这段地址空间中的所有地址的，能访问的一般是很多的内存地址区间。这种内存地址区间被称为内存区域，进程可以动态添加和删除内存区域到它的地址空间中。内存区域可以有不同的权限，相关进程必须遵守这些权限，例如可读、可写、可执行等。如果进程访问的地址不在一个有效的内存区域中，或者访问时的权限不正确，kernel 将会杀掉进程并给出常见的 "Segmentation Fault" 段错误日志。

内存区域通常包括：

可执行文件的代码段，称为 text 段；
可执行文件的已初始化全局变量段，称为 data 段；
未初始化全局变量段（通常以 0 page 填充），称为 bss 段；
进程的用户空间栈（通常以 0 page 填充）；
每个共享库文件的额外 text、data、bss 段，也被装入进程的地址空间；
内存映射文件；
共享内存区域；
匿名内存映射（新版本的 malloc 函数就除了 brk 之外也通过 mmap 实现）；
应用程序中的堆

2、内存描述符：

kernel 使用 mm_struct 内存描述符结构来表示进程的地址空间信息，它定义在头文件中，这也是一个非常大的结构。

struct vm_area_struct {
	struct mm_struct * vm_mm;	/* The address space we belong to. */
	unsigned long vm_start;		/* Our start address within vm_mm. */
	unsigned long vm_end;		/* The first byte after our end address
					   within vm_mm. */

	/* linked list of VM areas per task, sorted by address */
	struct vm_area_struct *vm_next;

	pgprot_t vm_page_prot;		/* Access permissions of this VMA. */
	unsigned long vm_flags;		/* Flags, see mm.h. */

	struct rb_node vm_rb;

	/*
	 * For areas with an address space and backing store,
	 * linkage into the address_space->i_mmap prio tree, or
	 * linkage to the list of like vmas hanging off its node, or
	 * linkage of vma in the address_space->i_mmap_nonlinear list.
	 */
	union {
		struct {
			struct list_head list;
			void *parent;	/* aligns with prio_tree_node parent */
			struct vm_area_struct *head;
		} vm_set;

		struct raw_prio_tree_node prio_tree_node;
	} shared;

	/*
	 * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
	 * list, after a COW of one of the file pages.	A MAP_SHARED vma
	 * can only be in the i_mmap tree.  An anonymous MAP_PRIVATE, stack
	 * or brk vma (with NULL file) can only be in an anon_vma list.
	 */
	struct list_head anon_vma_chain; /* Serialized by mmap_sem &
					  * page_table_lock */
	struct anon_vma *anon_vma;	/* Serialized by page_table_lock */

	/* Function pointers to deal with this struct. */
	const struct vm_operations_struct *vm_ops;

	/* Information about our backing store: */
	unsigned long vm_pgoff;		/* Offset (within vm_file) in PAGE_SIZE
					   units, *not* PAGE_CACHE_SIZE */
	struct file * vm_file;		/* File we map to (can be NULL). */
	void * vm_private_data;		/* was vm_pte (shared mem) */
	unsigned long vm_truncate_count;/* truncate_count or restart_addr */

#ifndef CONFIG_MMU
	struct vm_region *vm_region;	/* NOMMU mapping region */
#endif
#ifdef CONFIG_NUMA
	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
#endif
};

struct mm_struct {
	struct vm_area_struct * mmap;		/* list of VMAs */
	struct rb_root mm_rb;
	struct vm_area_struct * mmap_cache;	/* last find_vma result */
#ifdef CONFIG_MMU
	unsigned long (*get_unmapped_area) (struct file *filp,
				unsigned long addr, unsigned long len,
				unsigned long pgoff, unsigned long flags);
	void (*unmap_area) (struct mm_struct *mm, unsigned long addr);
#endif
	unsigned long mmap_base;		/* base of mmap area */
	unsigned long task_size;		/* size of task vm space */
	unsigned long cached_hole_size; 	/* if non-zero, the largest hole below free_area_cache */
	unsigned long free_area_cache;		/* first hole of size cached_hole_size or larger */
	pgd_t * pgd;
	atomic_t mm_users;			/* How many users with user space? */
	atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
	int map_count;				/* number of VMAs */
	struct rw_semaphore mmap_sem;
	spinlock_t page_table_lock;		/* Protects page tables and some counters */

	struct list_head mmlist;		/* List of maybe swapped mm's.	These are globally strung
						 * together off init_mm.mmlist, and are protected
						 * by mmlist_lock
						 */

	unsigned long hiwater_rss;	/* High-watermark of RSS usage */
	unsigned long hiwater_vm;	/* High-water virtual memory usage */

	unsigned long total_vm, locked_vm, shared_vm, exec_vm;
	unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
	unsigned long start_code, end_code, start_data, end_data;
	unsigned long start_brk, brk, start_stack;
	unsigned long arg_start, arg_end, env_start, env_end;

	unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

	/*
	 * Special counters, in some configurations protected by the
	 * page_table_lock, in other configurations by being atomic.
	 */
	struct mm_rss_stat rss_stat;

	struct linux_binfmt *binfmt;

	cpumask_t cpu_vm_mask;

	/* Architecture-specific MM context */
	mm_context_t context;

	/* Swap token stuff */
	/*
	 * Last value of global fault stamp as seen by this process.
	 * In other words, this value gives an indication of how long
	 * it has been since this task got the token.
	 * Look at mm/thrash.c
	 */
	unsigned int faultstamp;
	unsigned int token_priority;
	unsigned int last_interval;

	unsigned long flags; /* Must use atomic bitops to access the bits */

	struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_AIO
	spinlock_t		ioctx_lock;
	struct hlist_head	ioctx_list;
#endif
#ifdef CONFIG_MM_OWNER
	/*
	 * "owner" points to a task that is regarded as the canonical
	 * user/owner of this mm. All of the following must be true in
	 * order for it to be changed:
	 *
	 * current == mm->owner
	 * current->mm != mm
	 * new_owner->mm == mm
	 * new_owner->alloc_lock is held
	 */
	struct task_struct *owner;
#endif

#ifdef CONFIG_PROC_FS
	/* store ref to file /proc//exe symlink points to */
	struct file *exe_file;
	unsigned long num_exe_file_vmas;
#endif
#ifdef CONFIG_MMU_NOTIFIER
	struct mmu_notifier_mm *mmu_notifier_mm;
#endif
};

结构的注释中已经包含比较多的注解了哦。mmap 为地址空间的内存区域（用 vm_area_struct 结构来表示啦，也是上面的代码中）链表，mm_rb 则将其以红黑树的形式进行存储，链表形式方便遍历，红黑树形式方便查找。mm_users 为以原子变量形式保护的使用此地址空间的进程数量值（例如：如果有 4 个线程共享此地址空间，则 mm_users 值为 4），mm_count 为引用计数（所有 mm_users 等于一个引用计数），当 mm_count 值为 0 时表示没有再被使用，可以被释放。total_vm 成员表示所有内存区域的数量。

所有的 mm_struct 结构以链表的形式存在 mm_struct 的 mmlist 成员中，该链表的第一个成员就是 init 进程的 mm_struct ：init_mm，该链表被 mmlist_lock 锁保护。

进程的内存描述符是在 task_struct 的 mm 成员中的。fork() 进行创建进程时调用 copy_mm 函数将父进程的内存描述符拷贝给子进程，调用 clone() 函数时如果指定 CLONE_VM 参数将使父进程和子进程地址空间共享（实际上将 mm_users 计数加 1），这种子进程就被称为线程。mm_struct 结构一般是通过 alloc_mm 宏从名为 mm_cachep 的 Slab cache 中分配。

进程退出时调用 exit_mm 函数，该函数再调用 mmput() 函数，此函数中减小地址空间的 mm_users 计数，如果 mm_users 变为 0，调用 mmdrop() 函数减小 mm_count 计数，如果 mm_count 变为 0，则最终调用 free_mm() 宏来释放内存描述符（回归到 Slab cache 中）。

另外需要说明的是 kernel 线程是没有地址空间，也就没有对应的 mm_struct（值为 NULL），kernel 线程使用之前运行的进程的内存描述符，有关 kernel 线程请参考之前的 [进程基本] 文章。

3、VMA 概念：

vm_area_struct 结构即内存区域常被称为虚拟内存区域（简写为 VMA），表示的是在一个地址空间中的一个连续内存地址区间，每个内存区域是一个惟一的对象。vm_area_struct 中的 vm_mm 成员指向关联的内存描述符，vm_ops 成员为非常重要的关联的操作函数结构，vm_start 为起始地址，vm_end 为结束地址之后第一个字节的地址，即地址范围为：[vm_start, vm_end)。每个 VMA 对于它关联的内存描述符来说是惟一的，因此如果两个单独的进程映射相同的文件到各自的地址空间，它们的 VMA 也是不同的。

VMA 中的 vm_flags 表示内存区域中的页的行为状态，常见的状态有：VM_READ（页可读）、VM_WRITE（页可写）、VM_EXEC（页可被执行）、VM_SHARED（页被共享，被设置了称为共享映射，未设置称为私有映射）、VM_SHM（此区域被用作共享内存）、VM_LOCKED（页被锁）、VM_IO（此区域用于映射设备 I/O 空间）、VM_RESERVED（表示内存区域不可被交换出去）、VM_SEQ_READ（连续读，增强 readahead）、VM_RAND_READ（随机读，减弱 readahead）等。VM_SEQ_READ 和 VM_RAND_READ 标志可以通过 madvise() 系统调用来设置。

看看 vm_ops 操作函数结构的 vm_operations_struct 的定义，它在头文件中：

struct vm_operations_struct {
	void (*open)(struct vm_area_struct * area);
	void (*close)(struct vm_area_struct * area);
	int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf);

	/* notification that a previously read-only page is about to become
	 * writable, if an error is returned it will cause a SIGBUS */
	int (*page_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf);

	/* called by access_process_vm when get_user_pages() fails, typically
	 * for use by special VMAs that can switch between memory and hardware
	 */
	int (*access)(struct vm_area_struct *vma, unsigned long addr,
		      void *buf, int len, int write);
#ifdef CONFIG_NUMA
	/*
	 * set_policy() op must add a reference to any non-NULL @new mempolicy
	 * to hold the policy upon return.  Caller should pass NULL @new to
	 * remove a policy and fall back to surrounding context--i.e. do not
	 * install a MPOL_DEFAULT policy, nor the task or system default
	 * mempolicy.
	 */
	int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);

	/*
	 * get_policy() op must add reference [mpol_get()] to any policy at
	 * (vma,addr) marked as MPOL_SHARED.  The shared policy infrastructure
	 * in mm/mempolicy.c will do this automatically.
	 * get_policy() must NOT add a ref if the policy at (vma,addr) is not
	 * marked as MPOL_SHARED. vma policies are protected by the mmap_sem.
	 * If no [shared/vma] mempolicy exists at the addr, get_policy() op
	 * must return NULL--i.e., do not "fallback" to task or system default
	 * policy.
	 */
	struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
					unsigned long addr);
	int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from,
		const nodemask_t *to, unsigned long flags);
#endif
};

当指定的内存区域被添加到地址空间时，open 函数被调用，反之移除时 close 函数被调用。如果一个不在内存中的页被访问，将触发缺页异常， fault 函数被缺页异常处理函数调用。当一个只读的页变为可写的时候，page_mkwrite 函数也被缺页异常处理函数调用。

mm_struct 中的 mmap 为内存区域链表，通过 VMA 的 vm_next 成员指向下一个内存区域，而且链表中的内存区域是按地址上升排序的，链表中最后一个 VMA 值为 NULL。而对于 mm_struct 的 mm_rb 红黑树，mm_rb 为红黑树的根，每个 VMA 通过其 vm_rb 红黑树节点类型链到红黑树中。

在应用层中可以通过 cat /proc//maps 或者 pmap 程序等方法查看应用程序的内存区域列表。

操作 VMA：

kernel 提供 find_vma() 函数用于查找指定的内存地址在哪个 VMA 上，它的实现在 mm/mmap.c 文件中，输入参数为内存描述符和内存地址：

struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
	struct vm_area_struct *vma = NULL;

	if (mm) {
		/* Check the cache first. */
		/* (Cache hit rate is typically around 35%.) */
		vma = mm->mmap_cache;
		if (!(vma && vma->vm_end > addr && vma->vm_start <= addr)) {
			struct rb_node * rb_node;

			rb_node = mm->mm_rb.rb_node;
			vma = NULL;

			while (rb_node) {
				struct vm_area_struct * vma_tmp;

				vma_tmp = rb_entry(rb_node,
						struct vm_area_struct, vm_rb);

				if (vma_tmp->vm_end > addr) {
					vma = vma_tmp;
					if (vma_tmp->vm_start <= addr)
						break;
					rb_node = rb_node->rb_left;
				} else
					rb_node = rb_node->rb_right;
			}
			if (vma)
				mm->mmap_cache = vma;
		}
	}
	return vma;
}

如果找不到对应的 VMA 则返回 NULL。需要注意的是返回的 VMA 的开始地址可能比指定的内存地址大。find_vma() 函数返回的结果会被缓存到内存描述符的 mmap_cache 成员中用于提高之后的查找性能，因为后续的操作很可能还是在同样的 VMA 上。如果在 mmap_cache 中找不到则通过红黑树进行查找。

find_vma_prev() 函数与 find_vma() 函数类似，不过它也会返回指定地址之前的最后一个 VMA：

struct vm_area_struct * find_vma_prev(struct mm_struct *mm, unsigned long addr,
struct vm_area_struct **pprev)

kernel 另外还提供了 find_vma_intersection() 函数返回符合 find_vma() 的条件并且其开始地址不在指定内存结束地址之后的 VMA。

4、mmap 和 munmap：

kernel 提供 do_mmap() 函数创建新的线性地址区间，这是用户层 mmap() 函数的底层实现，它用于将一段地址区间添加到进程的地址空间中。

unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len,
unsigned long prot, unsigned long flag, unsigned long offset)

do_mmap 映射 file 参数指定的文件，并最终返回新创建的地址区间的初始地址。

offset 和 len 指定偏移量和长度。如果 file 为 NULL 并且 offset 为 0 则表示该映射后端不是基于文件的，这种映射被称为匿名映射，否则被称为基于文件的映射。prot 参数指定内存区域中页的访问权限，值可以为：PROT_READ（对应 VM_READ）、PROT_WRITE、PROT_EXEC、PROT_NONE 等。flag 指定 VMA 的其它标志，常用的有：MAP_SHARED（此映射可被共享）、MAP_PRIVATE（私有不可共享）、MAP_ANONYMOUS（指定匿名映射）、MAP_LOCKED 等。

如果可能的话，do_mmap 返回的内存区间会尽量和已有邻近的 VMA 合并（调整 VMA 大小），否则就创建一个新的 VMA。新的 VMA 从名为 vm_area_cachep 的 Slab cache 中分配，并通过 vma_link() 函数被加入到进程地址空间的链表和红黑树中，对应的 mm_struct 的 total_vm 成员也被更新。

do_mmap 是调用 do_mmap_pgoff() 函数完成真正的映射操作的。现在用户层使用的 mmap() 函数实际上是在用户层调用 mmap2() 系统调用并最终通过 do_mmap 来实现的。

do_munmap 用于从地址空间移除指定的地址区间：

int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)

do_munmap 导出给用户层就是 munmap() 函数了。

5、页表及应用程序 VMA：

Linux kernel 使用页式内存管理，应用程序给出的内存地址是虚拟地址，它需要经过若干级页表一级一级的变换，才变成真正的物理地址。有关 Linux 的分级页表结构等相关的知识请参考之前的 [内存寻址] 文章。

每个进程有自己的 task_struct，task_struct 中的 mm 指向其内存描述符，每个 mm 又有自己单独的页表（进程中的线程会进行共享），本文最上面介绍的内存描述符 mm_struct 中的 pgd_t * pgd 就指向进程的 PGD，对页表的操作和遍历等操作也需要用到 mm_struct 中的 page_table_lock 自旋锁成员。

应用程序中对内存的操作例如 malloc 分配内存等一般是改变了某个 VMA，不会直接改变页表。假设用户分配了内存，然后访问这块内存，由于页表里面并没有记录相关的映射，CPU 产生一次缺页异常，内核捕捉到异常，检查产生异常的地址是不是存在于一个合法的 VMA 中，如果不是，则给进程一个 "Segmentation Fault" 段错误，使其崩溃；如果是，则分配一个物理页，并为之建立映射。

应用程序中的堆是一个一端固定、一端可伸缩的 VMA，其大小可以通过 brk 系统调用进行调整，libc 的 malloc 函数就是基于 brk 来实现的（如果需要分配的内存很大时，libc 会通过 mmap 系统调用映射一个新的 VMA 以节省对堆 VMA 的一系列调整操作）。应用程序的栈也是一个 VMA，只是它是一端固定、一端可伸不能缩的，而且它是自动伸展的。另外需要说明的是线程的栈 VMA 明显不是和其它线程共享的，一般是在线程创建时通过 mmap 创建新的 VMA 并以此作为线程的栈。

本文只是对 Linux kernel 的进程地址空间的基础涉及，其中有任何问题，欢迎提出指正哦，玩的开心~~~ ^_^

Linux kernel学习-进程基本

Uranus Zhou — Sun, 03 Jun 2012 08:18:32 +0000

本文同步自（如浏览不正常请点击跳转）：https://zohead.com/archives/linux-kernel-learning-process/

Linux 中进程通过 fork() 被创建时，它差不多是和父进程一样的，它得到父进程的地址空间拷贝，运行和父进程一样的代码，从 fork() 的后面开始执行，父进程和子进程共享代码页，但子进程的 data 页是独立的（包括 stack 和 heap）。

早期的 Linux kernel 并不支持多线程的程序，从 kernel 来看，一个多线程的程序只是一个普通的进程，它的多个执行流应该完全在 user mode 来完成创建、处理、调度等操作，例如使用 POSIX pthread 库。当然这样的实现是无法让人满意的，Linux 为此使用轻量级进程为多线程程序提供更好的支持，两个轻量级进程可以共享资源（例如：地址空间、打开的文件等等），一个比较简单的方法是将为每个线程关联一个轻量级进程，这样每个线程可以被 kernel 单独调度，使用 Linux 轻量级进程的库有：LinuxThreads、NPTL、NGPT 等。Linux kernel 同时也支持线程组（可以理解为轻量级进程组）的概念。

1、进程描述符：

进程描述符由 task_struct 结构来表示，一般来说，每个可以被独立调度的执行上下文都必须有自己的进程描述符，因此尽管轻量级进程共享了很大一部分 kernel 数据结构，它也必须有自己的 task_struct。task_struct 中包含关于一个进程的差不多所有信息，它定义在 include/linux/sched.h 文件中，你会看到这是非常大的结构，其中还包含指向其它结构的指针。访问进程自身的 task_struct 结构，使用宏操作 current。

task_struct 中的 struct mm_struct *mm 即指向进程的地址空间。task_struct 的 state 字段表示进程的运行状态，取值有 TASK_RUNNING（正在运行或正在队列中等待运行，进程如果在用户空间只能为此状态）、TASK_INTERRUPTIBLE（可响应信号）、TASK_UNINTERRUPTIBLE（不响应信号）、TASK_STOPED 等，另外 state 还有特殊的两个值是 EXIT_ZOMBIE（僵尸进程）和 EXIT_DEAD（进程将被系统移除）。kernel 提供 set_task_state 宏修改进程状态，set_task_state 最终调用 set_mb，set_current_state 用于当前进程的状态。task_struct 的 pid 字段就是咱们喜闻乐见的进程 ID 了。

这是一个典型的 Linux 进程状态机图：

POSIX 1003.1c 标准规定一个多线程程序的每个线程都应该有相同的 PID，这样的好处是例如发一个信号给一个 PID，一个线程组里的所有线程都能收到。同一线程组中的线程有相同的线程组号（Thread Group ID），线程组组号放在 task_struct 的 tgid 成员变量中，一般是线程组里的第一个轻量级进程的 PID。特别需要注意 getpid() 系统调用返回的就是 tgid 的值，而不是 pid 值，这样一个多线程程序的所有线程可以共享一个 PID。

对每个进程，kernel 在通过 slab 分配器分配 task_struct 时，通常是实际分配了两个连续的物理页面（8KB），以 thread_union 联合表示，其中包括一个 thread_info 结构（其 task 成员是指向 task_struct 的指针）以及 kernel 模式的进程堆栈。esp CPU 堆栈指针即表示此进程堆栈的栈顶地址，进程从用户模式切换到 kernel 模式时，kernel 堆栈会被清空。为了效率考虑，kernel 会将这两个连续的物理页面的第一个页面按 2^13（也就是 8KB）对齐，为了避免内存较少时产生问题，kernel 提供配置选项（就是下面的 THREAD_SIZE 了）可以将 thread_info 和堆栈包含在一个页面也就是 4KB 的内存区域里。一般来说，8KB 的堆栈对于内核程序已经够用。

看看 Linux 2.6.34 中 thread_union 的定义：

union thread_union {
	struct thread_info thread_info;
	unsigned long stack[THREAD_SIZE/sizeof(long)];
};

由于 thread_info 和内核堆栈是合并在连续的页面里的，kernel 就可以从 esp 指针得到 thread_info 结构地址，这是通过 current_thread_info 函数来实现的。

/* how to get the current stack pointer from C */
register unsigned long current_stack_pointer asm("esp") __used;

/* how to get the thread information struct from C */
static inline struct thread_info *current_thread_info(void)
{
	return (struct thread_info *)
		(current_stack_pointer & ~(THREAD_SIZE - 1));
}

假设 thread_union 是 8KB 大小，也即 2^13，将 esp 的最低 13 位屏蔽掉即可得到 thread_info 的地址，如果是 4KB 的栈大小，屏蔽掉最低 12 位即可（和上面的代码一致），这样通过 current_thread_info()->task 就能得到当前的 task_struct，这就是 current 宏的实现了。

#define get_current() (current_thread_info()->task)
#define current get_current()

系统中进程的列表保存在 init_task 所在的双向链表中，task_struct 的 tasks 字段就是 list_head，init_task 表示的就是 PID 为 0 的 swapper 进程（或者叫 idle 进程），其 tasks 会依次指向下一个 task_struct，PID 为 1 的进程就是 init 进程，这两个进程都由 kernel 来创建。

而关于可以运行的进程的调度，Linux 2.6.34 和 ULK 上说的已经有很大的不同了。2.6.34 上加上了 struct sched_class 结构体表示不同类型的调度算法类，目前 2.6.34 上实现了三种：Completely Fair Scheduling (CFS) Class（完全公平算法，见 kernel/sched_fair.c）、Real-Time Scheduling Class（实时算法，见 kernel/sched_rt.c）和 idle-task scheduling class（见 kernel/sched_idletask.c），这三个源文件都被 include 在 kernel/sched.c 中进行编译了。CFS Class 使用 sched_entity 结构作为调度实体，其中包含权重、运行时间等信息，比 RT Class 复杂，其中还有专门的红黑树。RT Class 使用 sched_rt_entity 作为调度实体。

每个 task_struct 中都包含了 sched_entity 和 sched_rt_entity 这两个字段，sched_class 中则有 enqueue_task、dequeue_task 等函数指针指向对应调度算法中的实现函数，enqueue_task 将进程加入运行队列，dequeue_task 将进程从队列中移除，由于这段变化较大而且比较复杂，有关这三种调度算法的具体实现以后再来介绍了。

task_struct 的 real_parent 字段指向创建该进程的进程（如果父进程已不存在则为 init 进程），parent 指向当前进程的父进程，children 为该进程子进程列表，sibling 为该进程的兄弟进程列表，group_leader 字段指向该进程的线程组长。与 ULK 不同的是，ULK 中 ptrace_children 为被调试器 trace 的该进程的子进程列表，2.6.34 中 ptraced 字段包含该进程原本的子进程和 ptrace attach 的目标进程，ptrace_list 改为 ptrace_entry。另外 2.6.34 kernel 中已经引入 namespace 的概念，获得进程组 ID 和会话期 ID 的方式也于 ULK 中的有不少区别。

kernel 中进程的 PID 散列表存在 pid_hash 中以加快根据 PID 搜索 task_struct 的速度，pidhash_init 函数初始化此 PID 散列表，由于 2.6.34 中已有 namespace，pid_hashfn 也由原来的一个参数变为两个参数（增加一个 ns 参数表示哪个 namespace）。Linux kernel 也增加了 pid 和 upid 两个结构体，pid 是内核对进程 PID 的内部表示（惟一的），upid 是进程在特定的 namespace 中看到的 PID。

2、进程创建：

Linux 中使用 fork() 函数创建新进程，父进程的地址空间会复制给子进程，为了效率考虑，这个复制通过 COW（Copy-on-write）来实现，真正有写操作时才会复制。fork()、vfork()、__clone() 函数都是通过 clone() 系统调用来实现的，clone() 系统调用最终调用 do_fork()。需要注意的是 vfork() 的结果是子进程完全运行在父进程的地址空间上，父进程的页表项并不会被拷贝，而且子进程优先运行，父进程会一直阻塞直至子进程结束（调用 exec 执行新程序或者 _exit 退出，不可调用 exit 退出）。do_fork 函数定义在 kernel/fork.c 文件中，do_fork() 会再调用 copy_process() 函数。

可以看到 copy_process 是相当长的一个函数：

static struct task_struct *copy_process(unsigned long clone_flags,
					unsigned long stack_start,
					struct pt_regs *regs,
					unsigned long stack_size,
					int __user *child_tidptr,
					struct pid *pid,
					int trace)
{
	int retval;
	struct task_struct *p;
	int cgroup_callbacks_done = 0;

	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
		return ERR_PTR(-EINVAL);

	/*
	 * Thread groups must share signals as well, and detached threads
	 * can only be started up within the thread group.
	 */
	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
		return ERR_PTR(-EINVAL);

	/*
	 * Shared signal handlers imply shared VM. By way of the above,
	 * thread groups also imply shared VM. Blocking this case allows
	 * for various simplifications in other code.
	 */
	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
		return ERR_PTR(-EINVAL);

	/*
	 * Siblings of global init remain as zombies on exit since they are
	 * not reaped by their parent (swapper). To solve this and to avoid
	 * multi-rooted process trees, prevent global and container-inits
	 * from creating siblings.
	 */
	if ((clone_flags & CLONE_PARENT) &&
				current->signal->flags & SIGNAL_UNKILLABLE)
		return ERR_PTR(-EINVAL);

	retval = security_task_create(clone_flags);
	if (retval)
		goto fork_out;

	retval = -ENOMEM;
	p = dup_task_struct(current);
	if (!p)
		goto fork_out;

	ftrace_graph_init_task(p);

	rt_mutex_init_task(p);

#ifdef CONFIG_PROVE_LOCKING
	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
	retval = -EAGAIN;
	if (atomic_read(&p->real_cred->user->processes) >=
			task_rlimit(p, RLIMIT_NPROC)) {
		if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE) &&
		    p->real_cred->user != INIT_USER)
			goto bad_fork_free;
	}

	retval = copy_creds(p, clone_flags);
	if (retval < 0)
		goto bad_fork_free;

	/*
	 * If multiple threads are within copy_process(), then this check
	 * triggers too late. This doesn't hurt, the check is only there
	 * to stop root fork bombs.
	 */
	retval = -EAGAIN;
	if (nr_threads >= max_threads)
		goto bad_fork_cleanup_count;

	if (!try_module_get(task_thread_info(p)->exec_domain->module))
		goto bad_fork_cleanup_count;

	p->did_exec = 0;
	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
	copy_flags(clone_flags, p);
	INIT_LIST_HEAD(&p->children);
	INIT_LIST_HEAD(&p->sibling);
	rcu_copy_process(p);
	p->vfork_done = NULL;
	spin_lock_init(&p->alloc_lock);

	init_sigpending(&p->pending);

	p->utime = cputime_zero;
	p->stime = cputime_zero;
	p->gtime = cputime_zero;
	p->utimescaled = cputime_zero;
	p->stimescaled = cputime_zero;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
	p->prev_utime = cputime_zero;
	p->prev_stime = cputime_zero;
#endif
#if defined(SPLIT_RSS_COUNTING)
	memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif

	p->default_timer_slack_ns = current->timer_slack_ns;

	task_io_accounting_init(&p->ioac);
	acct_clear_integrals(p);

	posix_cpu_timers_init(p);

	p->lock_depth = -1;		/* -1 = no lock */
	do_posix_clock_monotonic_gettime(&p->start_time);
	p->real_start_time = p->start_time;
	monotonic_to_bootbased(&p->real_start_time);
	p->io_context = NULL;
	p->audit_context = NULL;
	cgroup_fork(p);
#ifdef CONFIG_NUMA
	p->mempolicy = mpol_dup(p->mempolicy);
 	if (IS_ERR(p->mempolicy)) {
 		retval = PTR_ERR(p->mempolicy);
 		p->mempolicy = NULL;
 		goto bad_fork_cleanup_cgroup;
 	}
	mpol_fix_fork_child_flag(p);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
	p->irq_events = 0;
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
	p->hardirqs_enabled = 1;
#else
	p->hardirqs_enabled = 0;
#endif
	p->hardirq_enable_ip = 0;
	p->hardirq_enable_event = 0;
	p->hardirq_disable_ip = _THIS_IP_;
	p->hardirq_disable_event = 0;
	p->softirqs_enabled = 1;
	p->softirq_enable_ip = _THIS_IP_;
	p->softirq_enable_event = 0;
	p->softirq_disable_ip = 0;
	p->softirq_disable_event = 0;
	p->hardirq_context = 0;
	p->softirq_context = 0;
#endif
#ifdef CONFIG_LOCKDEP
	p->lockdep_depth = 0; /* no locks held yet */
	p->curr_chain_key = 0;
	p->lockdep_recursion = 0;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
	p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
	p->memcg_batch.do_batch = 0;
	p->memcg_batch.memcg = NULL;
#endif

	p->bts = NULL;

	/* Perform scheduler related setup. Assign this task to a CPU. */
	sched_fork(p, clone_flags);

	retval = perf_event_init_task(p);
	if (retval)
		goto bad_fork_cleanup_policy;

	if ((retval = audit_alloc(p)))
		goto bad_fork_cleanup_policy;
	/* copy all the process information */
	if ((retval = copy_semundo(clone_flags, p)))
		goto bad_fork_cleanup_audit;
	if ((retval = copy_files(clone_flags, p)))
		goto bad_fork_cleanup_semundo;
	if ((retval = copy_fs(clone_flags, p)))
		goto bad_fork_cleanup_files;
	if ((retval = copy_sighand(clone_flags, p)))
		goto bad_fork_cleanup_fs;
	if ((retval = copy_signal(clone_flags, p)))
		goto bad_fork_cleanup_sighand;
	if ((retval = copy_mm(clone_flags, p)))
		goto bad_fork_cleanup_signal;
	if ((retval = copy_namespaces(clone_flags, p)))
		goto bad_fork_cleanup_mm;
	if ((retval = copy_io(clone_flags, p)))
		goto bad_fork_cleanup_namespaces;
	retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
	if (retval)
		goto bad_fork_cleanup_io;

	if (pid != &init_struct_pid) {
		retval = -ENOMEM;
		pid = alloc_pid(p->nsproxy->pid_ns);
		if (!pid)
			goto bad_fork_cleanup_io;

		if (clone_flags & CLONE_NEWPID) {
			retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
			if (retval < 0)
				goto bad_fork_free_pid;
		}
	}

	p->pid = pid_nr(pid);
	p->tgid = p->pid;
	if (clone_flags & CLONE_THREAD)
		p->tgid = current->tgid;

	if (current->nsproxy != p->nsproxy) {
		retval = ns_cgroup_clone(p, pid);
		if (retval)
			goto bad_fork_free_pid;
	}

	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
	/*
	 * Clear TID on mm_release()?
	 */
	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
#ifdef CONFIG_FUTEX
	p->robust_list = NULL;
#ifdef CONFIG_COMPAT
	p->compat_robust_list = NULL;
#endif
	INIT_LIST_HEAD(&p->pi_state_list);
	p->pi_state_cache = NULL;
#endif
	/*
	 * sigaltstack should be cleared when sharing the same VM
	 */
	if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
		p->sas_ss_sp = p->sas_ss_size = 0;

	/*
	 * Syscall tracing and stepping should be turned off in the
	 * child regardless of CLONE_PTRACE.
	 */
	user_disable_single_step(p);
	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
	clear_all_latency_tracing(p);

	/* ok, now we should be set up.. */
	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
	p->pdeath_signal = 0;
	p->exit_state = 0;

	/*
	 * Ok, make it visible to the rest of the system.
	 * We dont wake it up yet.
	 */
	p->group_leader = p;
	INIT_LIST_HEAD(&p->thread_group);

	/* Now that the task is set up, run cgroup callbacks if
	 * necessary. We need to run them before the task is visible
	 * on the tasklist. */
	cgroup_fork_callbacks(p);
	cgroup_callbacks_done = 1;

	/* Need tasklist lock for parent etc handling! */
	write_lock_irq(&tasklist_lock);

	/* CLONE_PARENT re-uses the old parent */
	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
		p->real_parent = current->real_parent;
		p->parent_exec_id = current->parent_exec_id;
	} else {
		p->real_parent = current;
		p->parent_exec_id = current->self_exec_id;
	}

	spin_lock(¤t->sighand->siglock);

	/*
	 * Process group and session signals need to be delivered to just the
	 * parent before the fork or both the parent and the child after the
	 * fork. Restart if a signal comes in before we add the new process to
	 * it's process group.
	 * A fatal signal pending means that current will exit, so the new
	 * thread can't slip out of an OOM kill (or normal SIGKILL).
 	 */
	recalc_sigpending();
	if (signal_pending(current)) {
		spin_unlock(¤t->sighand->siglock);
		write_unlock_irq(&tasklist_lock);
		retval = -ERESTARTNOINTR;
		goto bad_fork_free_pid;
	}

	if (clone_flags & CLONE_THREAD) {
		atomic_inc(¤t->signal->count);
		atomic_inc(¤t->signal->live);
		p->group_leader = current->group_leader;
		list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
	}

	if (likely(p->pid)) {
		tracehook_finish_clone(p, clone_flags, trace);

		if (thread_group_leader(p)) {
			if (clone_flags & CLONE_NEWPID)
				p->nsproxy->pid_ns->child_reaper = p;

			p->signal->leader_pid = pid;
			tty_kref_put(p->signal->tty);
			p->signal->tty = tty_kref_get(current->signal->tty);
			attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
			attach_pid(p, PIDTYPE_SID, task_session(current));
			list_add_tail(&p->sibling, &p->real_parent->children);
			list_add_tail_rcu(&p->tasks, &init_task.tasks);
			__get_cpu_var(process_counts)++;
		}
		attach_pid(p, PIDTYPE_PID, pid);
		nr_threads++;
	}

	total_forks++;
	spin_unlock(¤t->sighand->siglock);
	write_unlock_irq(&tasklist_lock);
	proc_fork_connector(p);
	cgroup_post_fork(p);
	perf_event_fork(p);
	return p;

bad_fork_free_pid:
	if (pid != &init_struct_pid)
		free_pid(pid);
bad_fork_cleanup_io:
	if (p->io_context)
		exit_io_context(p);
bad_fork_cleanup_namespaces:
	exit_task_namespaces(p);
bad_fork_cleanup_mm:
	if (p->mm)
		mmput(p->mm);
bad_fork_cleanup_signal:
	if (!(clone_flags & CLONE_THREAD))
		__cleanup_signal(p->signal);
bad_fork_cleanup_sighand:
	__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
	exit_fs(p); /* blocking */
bad_fork_cleanup_files:
	exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
	exit_sem(p);
bad_fork_cleanup_audit:
	audit_free(p);
bad_fork_cleanup_policy:
	perf_event_free_task(p);
#ifdef CONFIG_NUMA
	mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
	cgroup_exit(p, cgroup_callbacks_done);
	delayacct_tsk_free(p);
	module_put(task_thread_info(p)->exec_domain->module);
bad_fork_cleanup_count:
	atomic_dec(&p->cred->user->processes);
	exit_creds(p);
bad_fork_free:
	free_task(p);
fork_out:
	return ERR_PTR(retval);
}

/*
 *  Ok, this is the main fork-routine.
 *
 * It copies the process, and if successful kick-starts
 * it and waits for it to finish using the VM if required.
 */
long do_fork(unsigned long clone_flags,
	      unsigned long stack_start,
	      struct pt_regs *regs,
	      unsigned long stack_size,
	      int __user *parent_tidptr,
	      int __user *child_tidptr)
{
	struct task_struct *p;
	int trace = 0;
	long nr;

	/*
	 * Do some preliminary argument and permissions checking before we
	 * actually start allocating stuff
	 */
	if (clone_flags & CLONE_NEWUSER) {
		if (clone_flags & CLONE_THREAD)
			return -EINVAL;
		/* hopefully this check will go away when userns support is
		 * complete
		 */
		if (!capable(CAP_SYS_ADMIN) || !capable(CAP_SETUID) ||
				!capable(CAP_SETGID))
			return -EPERM;
	}

	/*
	 * We hope to recycle these flags after 2.6.26
	 */
	if (unlikely(clone_flags & CLONE_STOPPED)) {
		static int __read_mostly count = 100;

		if (count > 0 && printk_ratelimit()) {
			char comm[TASK_COMM_LEN];

			count--;
			printk(KERN_INFO "fork(): process `%s' used deprecated "
					"clone flags 0x%lx\n",
				get_task_comm(comm, current),
				clone_flags & CLONE_STOPPED);
		}
	}

	/*
	 * When called from kernel_thread, don't do user tracing stuff.
	 */
	if (likely(user_mode(regs)))
		trace = tracehook_prepare_clone(clone_flags);

	p = copy_process(clone_flags, stack_start, regs, stack_size,
			 child_tidptr, NULL, trace);
	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	if (!IS_ERR(p)) {
		struct completion vfork;

		trace_sched_process_fork(current, p);

		nr = task_pid_vnr(p);

		if (clone_flags & CLONE_PARENT_SETTID)
			put_user(nr, parent_tidptr);

		if (clone_flags & CLONE_VFORK) {
			p->vfork_done = &vfork;
			init_completion(&vfork);
		}

		audit_finish_fork(p);
		tracehook_report_clone(regs, clone_flags, nr, p);

		/*
		 * We set PF_STARTING at creation in case tracing wants to
		 * use this to distinguish a fully live task from one that
		 * hasn't gotten to tracehook_report_clone() yet.  Now we
		 * clear it and set the child going.
		 */
		p->flags &= ~PF_STARTING;

		if (unlikely(clone_flags & CLONE_STOPPED)) {
			/*
			 * We'll start up with an immediate SIGSTOP.
			 */
			sigaddset(&p->pending.signal, SIGSTOP);
			set_tsk_thread_flag(p, TIF_SIGPENDING);
			__set_task_state(p, TASK_STOPPED);
		} else {
			wake_up_new_task(p, clone_flags);
		}

		tracehook_report_clone_complete(trace, regs,
						clone_flags, nr, p);

		if (clone_flags & CLONE_VFORK) {
			freezer_do_not_count();
			wait_for_completion(&vfork);
			freezer_count();
			tracehook_report_vfork_done(p, nr);
		}
	} else {
		nr = PTR_ERR(p);
	}
	return nr;
}

copy_process 中会调用 dup_task_struct 先复制 task_struct 进程描述符，并做一些必要的改动，默认会去掉 flags 字段也即描述符标志中的PF_SUPERPRIV 值，表示进程没有使用超级用户权限，默认设置了PF_FORKNOEXEC 表示还没有执行 exec，设置了 PF_STARTING 标志表示进程正在被创建。copy_process 还会调用 copy_creds 复制进程凭证，这个以后再来专门研究，调用 task_io_accounting_init 初始化 I/O 统计，调用 cgroup_fork 将此进程加到父 cgroups 中，cgroups 是新 kernel 加入的一个比较重要的分组控制的机制，也是依赖 namespace 来实现的，有关 cgroups 以后将专门写一篇文章来介绍。

copy_process 然后会调用 sched_fork 为新创建的进程配置调度器，其中会将进程状态设为 TASK_WAKING 保证没人能运行它或者用信号之类将此新进程加入运行队列，并会调用 set_task_cpu 为进程选择一个空闲的 CPU 来运行，sched_fork 相关函数在 kernel/sched.c 中。

copy_process 然后会根据 clone_flags 来调用 copy_files、copy_fs 等来复制文件描述符、文件系统信息、信号处理函数等，copy_process 中调用 alloc_pid 给新进程分配 PID，copy_process 最终返回一个新的 task_struct。do_fork 最终会通过 task_pid_vnr 来返回子进程的 PID。

fork() 之后 Linux kernel 主观上会调用 wake_up_new_task 让子进程先运行，这样避免父进程有写地址空间的改动而可以减少 COW 的次数，但实际运行中并不一定完全会这样。

当使用 vfork() 函数创建进程时，task_struct 的 vfork_done 指向一个特定的地址，kernel 通过 init_completion 初始化等待，从上面的代码也可以看到父进程会一直调用 wait_for_completion 等待直到子进程通过 vfork_done 指针通知它，每个进程退出地址空间时（exec 或者 exit） mm_release() 函数中会检查 vfork_done 如果不为 NULL 父进程就能收到信号，这样就可以保证 vfork() 时必须是子进程先运行。

实际上由于 COW 的存在，vfork() 相对 fork() 惟一的好处只在于少了页表项的拷贝，现在也已经有了测试版的 COW 页表项 patch，这个就以后再抽空测试了。

3、Linux线程实现：

Linux kernel 的线程实现是比较特别的，它没有专门的线程概念，所有线程只是标准的进程，线程只是一个与其它进程共享资源的进程，这与 Windows、Solaris 等操作系统有明显的不同。

Linux 中线程的创建也是通过 clone() 系统调用来实现，只是增加了特别的参数：

clone(CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND, 0);

上面的结果是地址空间、文件系统资源、文件描述符、信号处理都被共享了。

普通的 fork() 的调用方式为：

clone(SIGCHLD, 0);

vfork() 函数的调用方式为：

clone(CLONE_VFORK | CLONE_VM | SIGCHLD, 0);

这样明显就能看到区别了。

另外来专门说下 kernel thread，kernel thread 是一种只在内核空间存在的进程，它不会发生上下文切换到用户空间，它也是可被调度和可被抢占的，它与普通进程的区别在于它没有地址空间，它的 task_struct 的 mm 指针（即进程地址空间）为 NULL。常见的 kernel thread 有 flush、ksoftirqd 等。

可以使用 kthread_create 函数创建 kernel thread，此函数定义在 kernel/kthread.c 中，此函数即返回 task_struct 指针，kernel thread 创建之后默认为 TASK_INTERRUPTIBLE 状态，需要使用 wake_up_process 来唤醒它，因此可以使用 kthread_run 直接创建并运行一个 kernel thread。kthread_stop 用于停止 kernel thread 执行。

看看 kthread_create 的代码：

struct task_struct *kthread_create(int (*threadfn)(void *data),
				   void *data,
				   const char namefmt[],
				   ...)
{
	struct kthread_create_info create;

	create.threadfn = threadfn;
	create.data = data;
	init_completion(&create.done);

	spin_lock(&kthread_create_lock);
	list_add_tail(&create.list, &kthread_create_list);
	spin_unlock(&kthread_create_lock);

	wake_up_process(kthreadd_task);
	wait_for_completion(&create.done);

	if (!IS_ERR(create.result)) {
		struct sched_param param = { .sched_priority = 0 };
		va_list args;

		va_start(args, namefmt);
		vsnprintf(create.result->comm, sizeof(create.result->comm),
			  namefmt, args);
		va_end(args);
		/*
		 * root may have changed our (kthreadd's) priority or CPU mask.
		 * The kernel thread should not inherit these properties.
		 */
		sched_setscheduler_nocheck(create.result, SCHED_NORMAL, ¶m);
		set_cpus_allowed_ptr(create.result, cpu_all_mask);
	}
	return create.result;
}

这时就要请出 PID 为 2 的 kthreadd 进程出场了，kernel thread 的创建是由 kthreadd 完成的。kthread_create 中会先把创建的信息 kthread_create_info 加到 kthread_create_list 链接中，然后唤醒 kthreadd 进程（其进程描述符保存在 kthreadd_task 全局变量中），并使用 wait_for_completion 等待 kthreadd 创建过程完成。

再看看 kthreadd 的实现：

int kthreadd(void *unused)
{
	struct task_struct *tsk = current;

	/* Setup a clean context for our children to inherit. */
	set_task_comm(tsk, "kthreadd");
	ignore_signals(tsk);
	set_cpus_allowed_ptr(tsk, cpu_all_mask);
	set_mems_allowed(node_states[N_HIGH_MEMORY]);

	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);
		if (list_empty(&kthread_create_list))
			schedule();
		__set_current_state(TASK_RUNNING);

		spin_lock(&kthread_create_lock);
		while (!list_empty(&kthread_create_list)) {
			struct kthread_create_info *create;

			create = list_entry(kthread_create_list.next,
					    struct kthread_create_info, list);
			list_del_init(&create->list);
			spin_unlock(&kthread_create_lock);

			create_kthread(create);

			spin_lock(&kthread_create_lock);
		}
		spin_unlock(&kthread_create_lock);
	}

	return 0;
}

kthreadd 在循环中检查 kthread_create_list 链表，会找到刚才的 kthread_create_info 结构，并将之从 kthread_create_list 链表中删除，然后调用 create_kthread 最终完成创建。

OK，既然都分析这么多了就再来 create_kthread：

static int kthread(void *_create)
{
	/* Copy data: it's on kthread's stack */
	struct kthread_create_info *create = _create;
	int (*threadfn)(void *data) = create->threadfn;
	void *data = create->data;
	struct kthread self;
	int ret;

	self.should_stop = 0;
	init_completion(&self.exited);
	current->vfork_done = &self.exited;

	/* OK, tell user we're spawned, wait for stop or wakeup */
	__set_current_state(TASK_UNINTERRUPTIBLE);
	create->result = current;
	complete(&create->done);
	schedule();

	ret = -EINTR;
	if (!self.should_stop)
		ret = threadfn(data);

	/* we can't just return, we must preserve "self" on stack */
	do_exit(ret);
}

static void create_kthread(struct kthread_create_info *create)
{
	int pid;

	/* We want our own signal handler (we take no signals by default). */
	pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
	if (pid < 0) {
		create->result = ERR_PTR(pid);
		complete(&create->done);
	}
}

create_kthread 会调用 kernel_thread 创建新的进程，而且它的入口函数是 kthread 函数，参数为 create。kthread 中把自己设为 TASK_UNINTERRUPTIBLE 状态，并用 complete 告诉 kthread_create 创建好了，接着它调用 schedule() 函数使其所在进程进入睡眠状态，如果被唤醒（可以使用 wait_up_process 之类的了）就执行创建 kthread thread 时指定的入口函数。

而 kernel_thread 的实现则是平台相关的，它会调用 do_fork 创建新的进程，看看 x86 上的实现：

int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
	struct pt_regs regs;

	memset(®s, 0, sizeof(regs));

	regs.si = (unsigned long) fn;
	regs.di = (unsigned long) arg;

#ifdef CONFIG_X86_32
	regs.ds = __USER_DS;
	regs.es = __USER_DS;
	regs.fs = __KERNEL_PERCPU;
	regs.gs = __KERNEL_STACK_CANARY;
#else
	regs.ss = __KERNEL_DS;
#endif

	regs.orig_ax = -1;
	regs.ip = (unsigned long) kernel_thread_helper;
	regs.cs = __KERNEL_CS | get_kernel_rpl();
	regs.flags = X86_EFLAGS_IF | 0x2;

	/* Ok, create the new process.. */
	return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL);
}

由此也可以看到在 kernel 中创建 kernel thread 有两种方法：kthread_create 和 kernel_thread，它们的区别是：kthread_create 创建的 kernel thread 有完整的上下文环境，其父进程一定为 kthreadd，而 kernel_thread 的父进程可以为 init 或其它 kernel thread。

4、进程结束：

进程可以使用 exit 函数主动结束自己，也可以在收到信号或异常时非主动结束。不管怎样结束，最终都由 do_exit() 函数来完成，此函数定义在 kernel/exit.c 中，看看它的实现：

static void exit_notify(struct task_struct *tsk, int group_dead)
{
	int signal;
	void *cookie;

	/*
	 * This does two things:
	 *
  	 * A.  Make init inherit all the child processes
	 * B.  Check to see if any process groups have become orphaned
	 *	as a result of our exiting, and if they have any stopped
	 *	jobs, send them a SIGHUP and then a SIGCONT.  (POSIX 3.2.2.2)
	 */
	forget_original_parent(tsk);
	exit_task_namespaces(tsk);

	write_lock_irq(&tasklist_lock);
	if (group_dead)
		kill_orphaned_pgrp(tsk->group_leader, NULL);

	/* Let father know we died
	 *
	 * Thread signals are configurable, but you aren't going to use
	 * that to send signals to arbitary processes.
	 * That stops right now.
	 *
	 * If the parent exec id doesn't match the exec id we saved
	 * when we started then we know the parent has changed security
	 * domain.
	 *
	 * If our self_exec id doesn't match our parent_exec_id then
	 * we have changed execution domain as these two values started
	 * the same after a fork.
	 */
	if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
	    (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
	     tsk->self_exec_id != tsk->parent_exec_id))
		tsk->exit_signal = SIGCHLD;

	signal = tracehook_notify_death(tsk, &cookie, group_dead);
	if (signal >= 0)
		signal = do_notify_parent(tsk, signal);

	tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;

	/* mt-exec, de_thread() is waiting for us */
	if (thread_group_leader(tsk) &&
	    tsk->signal->group_exit_task &&
	    tsk->signal->notify_count < 0)
		wake_up_process(tsk->signal->group_exit_task);

	write_unlock_irq(&tasklist_lock);

	tracehook_report_death(tsk, signal, cookie, group_dead);

	/* If the process is dead, release it - nobody will wait for it */
	if (signal == DEATH_REAP)
		release_task(tsk);
}

NORET_TYPE void do_exit(long code)
{
	struct task_struct *tsk = current;
	int group_dead;

	profile_task_exit(tsk);

	WARN_ON(atomic_read(&tsk->fs_excl));

	if (unlikely(in_interrupt()))
		panic("Aiee, killing interrupt handler!");
	if (unlikely(!tsk->pid))
		panic("Attempted to kill the idle task!");

	tracehook_report_exit(&code);

	validate_creds_for_do_exit(tsk);

	/*
	 * We're taking recursive faults here in do_exit. Safest is to just
	 * leave this task alone and wait for reboot.
	 */
	if (unlikely(tsk->flags & PF_EXITING)) {
		printk(KERN_ALERT
			"Fixing recursive fault but reboot is needed!\n");
		/*
		 * We can do this unlocked here. The futex code uses
		 * this flag just to verify whether the pi state
		 * cleanup has been done or not. In the worst case it
		 * loops once more. We pretend that the cleanup was
		 * done as there is no way to return. Either the
		 * OWNER_DIED bit is set by now or we push the blocked
		 * task into the wait for ever nirwana as well.
		 */
		tsk->flags |= PF_EXITPIDONE;
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule();
	}

	exit_irq_thread();

	exit_signals(tsk);  /* sets PF_EXITING */
	/*
	 * tsk->flags are checked in the futex code to protect against
	 * an exiting task cleaning up the robust pi futexes.
	 */
	smp_mb();
	raw_spin_unlock_wait(&tsk->pi_lock);

	if (unlikely(in_atomic()))
		printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
				current->comm, task_pid_nr(current),
				preempt_count());

	acct_update_integrals(tsk);
	/* sync mm's RSS info before statistics gathering */
	if (tsk->mm)
		sync_mm_rss(tsk, tsk->mm);
	group_dead = atomic_dec_and_test(&tsk->signal->live);
	if (group_dead) {
		hrtimer_cancel(&tsk->signal->real_timer);
		exit_itimers(tsk->signal);
		if (tsk->mm)
			setmax_mm_hiwater_rss(&tsk->signal->maxrss, tsk->mm);
	}
	acct_collect(code, group_dead);
	if (group_dead)
		tty_audit_exit();
	if (unlikely(tsk->audit_context))
		audit_free(tsk);

	tsk->exit_code = code;
	taskstats_exit(tsk, group_dead);

	exit_mm(tsk);

	if (group_dead)
		acct_process();
	trace_sched_process_exit(tsk);

	exit_sem(tsk);
	exit_files(tsk);
	exit_fs(tsk);
	check_stack_usage();
	exit_thread();
	cgroup_exit(tsk, 1);

	if (group_dead)
		disassociate_ctty(1);

	module_put(task_thread_info(tsk)->exec_domain->module);

	proc_exit_connector(tsk);

	/*
	 * FIXME: do that only when needed, using sched_exit tracepoint
	 */
	flush_ptrace_hw_breakpoint(tsk);
	/*
	 * Flush inherited counters to the parent - before the parent
	 * gets woken up by child-exit notifications.
	 */
	perf_event_exit_task(tsk);

	exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
	mpol_put(tsk->mempolicy);
	tsk->mempolicy = NULL;
#endif
#ifdef CONFIG_FUTEX
	if (unlikely(current->pi_state_cache))
		kfree(current->pi_state_cache);
#endif
	/*
	 * Make sure we are holding no locks:
	 */
	debug_check_no_locks_held(tsk);
	/*
	 * We can do this unlocked here. The futex code uses this flag
	 * just to verify whether the pi state cleanup has been done
	 * or not. In the worst case it loops once more.
	 */
	tsk->flags |= PF_EXITPIDONE;

	if (tsk->io_context)
		exit_io_context(tsk);

	if (tsk->splice_pipe)
		__free_pipe_info(tsk->splice_pipe);

	validate_creds_for_do_exit(tsk);

	preempt_disable();
	exit_rcu();
	/* causes final put_task_struct in finish_task_switch(). */
	tsk->state = TASK_DEAD;
	schedule();
	BUG();
	/* Avoid "noreturn function does return".  */
	for (;;)
		cpu_relax();	/* For when BUG is null */
}

do_exit 会将 task_struct 的 flags 字段设为 PF_EXITING（在 exit_signals 中设置），调用 acct_update_integrals 更新进程统计信息，调用 exit_mm 释放进程使用的地址空间（mm_struct），调用 exit_sem、exit_files、exit_fs 释放等待的信号量，递减文件描述符和文件系统的引用计数，task_struct 的 exit_code 被设置为对应的退出值，调用 exit_notify 通知进程的父进程，而在 exit_notify 函数中将此进程的子进程的父进程改为线程组中的另一个线程或者 init 进程，并将 exit_state 标记为 EXIT_ZOMBIE（需要父进程得到退出状态了），最终进程 state 状态被设置为 TASK_DEAD，然后调用 schedule() 以切换到新的进程上。

do_exit 完成之后，进程的描述符和 thread_info 都还是存在的，只是进程是 EXIT_ZOMBIE 状态而且不可运行，如果父进程通过 wait4 等系统调用处理完此进程的状态，此进程的 task_struct 和 thread_info 就会通过调用 release_task() 被最终释放，看看它的实现：

void release_task(struct task_struct * p)
{
	struct task_struct *leader;
	int zap_leader;
repeat:
	tracehook_prepare_release_task(p);
	/* don't need to get the RCU readlock here - the process is dead and
	 * can't be modifying its own credentials. But shut RCU-lockdep up */
	rcu_read_lock();
	atomic_dec(&__task_cred(p)->user->processes);
	rcu_read_unlock();

	proc_flush_task(p);

	write_lock_irq(&tasklist_lock);
	tracehook_finish_release_task(p);
	__exit_signal(p);

	/*
	 * If we are the last non-leader member of the thread
	 * group, and the leader is zombie, then notify the
	 * group leader's parent process. (if it wants notification.)
	 */
	zap_leader = 0;
	leader = p->group_leader;
	if (leader != p && thread_group_empty(leader) && leader->exit_state == EXIT_ZOMBIE) {
		BUG_ON(task_detached(leader));
		do_notify_parent(leader, leader->exit_signal);
		/*
		 * If we were the last child thread and the leader has
		 * exited already, and the leader's parent ignores SIGCHLD,
		 * then we are the one who should release the leader.
		 *
		 * do_notify_parent() will have marked it self-reaping in
		 * that case.
		 */
		zap_leader = task_detached(leader);

		/*
		 * This maintains the invariant that release_task()
		 * only runs on a task in EXIT_DEAD, just for sanity.
		 */
		if (zap_leader)
			leader->exit_state = EXIT_DEAD;
	}

	write_unlock_irq(&tasklist_lock);
	release_thread(p);
	call_rcu(&p->rcu, delayed_put_task_struct);

	p = leader;
	if (unlikely(zap_leader))
		goto repeat;
}

它先调用 __exit_signal，其中会调用 __unhash_process，__unhash_process 调用 detach_pid 将 PID 从上面说的进程 PID hash 表（pid_hash）中删除。release_task 最终会调用 delayed_put_task_struct，其中再调用 put_task_struct，put_task_struct 再分别调用 free_thread_info 和 free_task_struct 来释放 thread_info 和 task_struct。

需要注意的是如果父进程在子进程之前退出，这时需要一个机制设置子进程的新父进程，不然子进程退出时将一直处于僵尸状态。因此进程退出时需要调用 exit_notify 进行通知处理，exit_notify 调用 forget_original_parent，其中再调用 find_new_reaper（意思不错，找到新收割者 ^_^）来设置新父进程，看看它的实现：

static struct task_struct *find_new_reaper(struct task_struct *father)
{
	struct pid_namespace *pid_ns = task_active_pid_ns(father);
	struct task_struct *thread;

	thread = father;
	while_each_thread(father, thread) {
		if (thread->flags & PF_EXITING)
			continue;
		if (unlikely(pid_ns->child_reaper == father))
			pid_ns->child_reaper = thread;
		return thread;
	}

	if (unlikely(pid_ns->child_reaper == father)) {
		write_unlock_irq(&tasklist_lock);
		if (unlikely(pid_ns == &init_pid_ns))
			panic("Attempted to kill init!");

		zap_pid_ns_processes(pid_ns);
		write_lock_irq(&tasklist_lock);
		/*
		 * We can not clear ->child_reaper or leave it alone.
		 * There may by stealth EXIT_DEAD tasks on ->children,
		 * forget_original_parent() must move them somewhere.
		 */
		pid_ns->child_reaper = init_pid_ns.child_reaper;
	}

	return pid_ns->child_reaper;
}

它先用 while_each_thread 在线程组中找一个进程，找到就直接返回，找不到就返回 init 进程了，init 进程会自动调用 wait() 来等待它的子进程。

至此 Linux kernel 中的进程基本实现大概了解了，将了进程和线程概念、进程创建和退出等做了代码上的分析介绍，下面就是专门的进程调度了，有任何问题欢迎指正哦~~~ ^_^