嵌入式软件开发之------浅析 fork（）函数（十二）

最新推荐文章于 2024-05-26 16:17:31 发布

原创最新推荐文章于 2024-05-26 16:17:31 发布 · 892 阅读

本内容遵循CC 4.0 BY-SA版权协议

本文详细探讨了Linux系统中fork与vfork函数的内部实现原理，从源代码层面分析了这两个函数如何创建子进程，包括复制父进程的task_struct、资源分配、权限检查等关键步骤，特别关注了vfork在子进程执行前保持父进程状态不变的特性。

linux代码版本：linux4.4

glibc代码版本：glibc-2.26

导读：在linux内核态也搞了好几年了，公司的新平台也都转向了用户态，发展趋势也是linux的工作量越来越少，更多的工作将聚焦在业务上。其实无论是在内核态还是用户态编程，对于嵌入式编程来说，没有本质区别，嵌入式产品上也不太会去做数据库之类的应用，大多还是基于设备文件操作、进程通信、socket等实现业务逻辑。在内核态面临的踩内存死机问题，用户态一样会面临。内核态用kdb，用户态用gdb，当熟悉内核的一些知识后，再做用户态编程，感觉要容易一些，当你操作一个文件时，看到 open、read、write、select 就会想到内核的实现。尤其是gdb功能比kdb强大太多了，有种将刀剑换成冲锋枪的感觉。再发生踩内存也只是该进程挂掉，不再是整机死机，调试起来效率高得多。用户态的接口也就那么些，很快看完了。相对于一直搞用户态编程的同事，搞过内核就总想看一下用户态接口的实现。下面从fork（）函数开始。

一、fork 函数在哪？

正要看 fork 函数的实现，就遇到了大麻烦，在内核代码里怎么也找不到 fork 函数的实体。原以为能够直接搜索到 fork 函数，然后里面再调用系统调用进入内核态，全局搜索后也没有找到 fork 函数定义，只在 fork.c 里面找到了

/*
 * fork() system call entry point (the macro turns this into sys_fork).
 * With an MMU it forwards to _do_fork() with SIGCHLD as the only flag
 * and every other argument zero/NULL; without an MMU it returns -EINVAL.
 */
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
#else
	/* can not support in nommu mode */
	return -EINVAL;
#endif
}

再看 SYSCALL_DEFINE0 定义

/*
 * Define a system call that takes no arguments: emits the syscall
 * metadata for `sname` and then opens the definition of
 * `asmlinkage long sys_<sname>(void)`. The function body is whatever
 * follows the macro invocation.
 */
#define SYSCALL_DEFINE0(sname)					\
	SYSCALL_METADATA(_##sname, 0);				\
	asmlinkage long sys_##sname(void)

展开说就是

asmlinkage long sys_fork(void);

然后搜索 sys_fork

#define __NR_fork 2
__SYSCALL(__NR_fork, sys_fork)

直接注册成了系统调用，然后就再也找不到 fork 究竟在哪里定义了。代码搜不到，那就只能写一段代码，用 gdb 跟踪一下，

Breakpoint 1, main (argc=1, argv=0x7fffffffe4f8) at test.c:30
30              pid = fork();
(gdb) s
__libc_fork () at ../sysdeps/nptl/fork.c:49
49      ../sysdeps/nptl/fork.c: No such file or directory.
(gdb) bt
#0  __libc_fork () at ../sysdeps/nptl/fork.c:49
#1  0x0000555555554b54 in main (argc=1, argv=0x7fffffffe4f8) at test.c:30
(gdb) s
61      in ../sysdeps/nptl/fork.c
(gdb) s
66      in ../sysdeps/nptl/fork.c

很明显，调用到了 __libc_fork ，也就是在 glibc 里面，然后发现下面的调用

#ifdef ARCH_FORK
  pid = ARCH_FORK ();
#else
# error "ARCH_FORK must be defined so that the CLONE_SETTID flag is used"
  pid = INLINE_SYSCALL (fork, 0);
#endif

看下 INLINE_SYSCALL 的定义（各架构实现不同，这里以 ia64 下的定义为例）

/*
 * Issue system call number __NR_<name> through DO_INLINE_SYSCALL_NCS.
 * If _r10 is -1 afterwards, _retval is stored into errno and the whole
 * expression evaluates to -1; otherwise it evaluates to _retval.
 * NOTE(review): _r10 and _retval are not declared here - presumably
 * they are introduced by DO_INLINE_SYSCALL_NCS; confirm in sysdep.h.
 */
#define INLINE_SYSCALL(name, nr, args...)		\
  ({							\
    DO_INLINE_SYSCALL_NCS (__NR_##name, nr, args)	\
    if (_r10 == -1)					\
      {							\
	__set_errno (_retval);				\
	_retval = -1;					\
      }							\
    _retval; })

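INLINE_SYSCALL (fork, 0) 展开后发起的是 __NR_fork 系统调用，对应内核里的 sys_fork 函数。不过要注意：上面 #else 分支里有 # error，说明 glibc-2.26 编译时必须定义 ARCH_FORK，真正执行的是 ARCH_FORK ()。以 gdb 跟踪所在的 x86_64 为例，ARCH_FORK 定义为 INLINE_SYSCALL (clone, 4, CLONE_CHILD_SETTID | CLONE_CHILD_CLEARTID | SIGCHLD, 0, NULL, &THREAD_SELF->tid)，也就是说用户态的 fork() 实际进入的是 clone 系统调用（sys_clone），它和 sys_fork 一样最终调用 _do_fork。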

其实无论 do_fork 、sys_fork、sys_vfork 最终调用的都是 _do_fork ，只是传递参数不同而已，

/*
 * Wrapper around _do_fork() for callers that have no TLS argument:
 * forwards all five parameters unchanged and passes 0 as the sixth
 * (tls) argument.
 */
long do_fork(unsigned long clone_flags,
	      unsigned long stack_start,
	      unsigned long stack_size,
	      int __user *parent_tidptr,
	      int __user *child_tidptr)
{
	return _do_fork(clone_flags, stack_start, stack_size,
			parent_tidptr, child_tidptr, 0);
}
#endif

/*
 * Create a kernel thread.
 *
 * Calls _do_fork() with CLONE_VM and CLONE_UNTRACED added to the
 * caller's flags. The thread function `fn` is passed in the
 * stack_start slot and its argument `arg` in the stack_size slot;
 * both tid pointers are NULL and tls is 0.
 */
pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
{
	return _do_fork(flags|CLONE_VM|CLONE_UNTRACED, (unsigned long)fn,
		(unsigned long)arg, NULL, NULL, 0);
}

#ifdef __ARCH_WANT_SYS_FORK
/*
 * fork() system call, built only when the architecture defines
 * __ARCH_WANT_SYS_FORK. With an MMU it calls _do_fork() with SIGCHLD
 * as the only flag; without an MMU it returns -EINVAL.
 */
SYSCALL_DEFINE0(fork)
{
#ifdef CONFIG_MMU
	return _do_fork(SIGCHLD, 0, 0, NULL, NULL, 0);
#else
	/* can not support in nommu mode */
	return -EINVAL;
#endif
}
#endif

#ifdef __ARCH_WANT_SYS_VFORK
/*
 * vfork() system call, built only when the architecture defines
 * __ARCH_WANT_SYS_VFORK. Same call as fork() but with CLONE_VFORK and
 * CLONE_VM added to SIGCHLD.
 */
SYSCALL_DEFINE0(vfork)
{
	return _do_fork(CLONE_VFORK | CLONE_VM | SIGCHLD, 0,
			0, NULL, NULL, 0);
}
#endif

上面可以看出，vfork 相比 fork 多了个CLONE_VFORK 和 CLONE_VM 的传参

下面看 _do_fork ：

/*
 * Common implementation behind fork, vfork, do_fork and kernel_thread.
 *
 * Picks the ptrace event to report (if any), creates the child with
 * copy_process(), wakes it up, and for CLONE_VFORK waits on a
 * completion before returning.
 *
 * Returns the child's pid number as obtained from pid_vnr(), or the
 * negative error extracted from copy_process()'s ERR_PTR result.
 */
long _do_fork(unsigned long clone_flags,
	      unsigned long stack_start,
	      unsigned long stack_size,
	      int __user *parent_tidptr,
	      int __user *child_tidptr,
	      unsigned long tls)
{
	struct task_struct *p;
	int trace = 0;
	long nr;

	/*
	 * Determine whether and which event to report to ptracer.  When
	 * called from kernel_thread or CLONE_UNTRACED is explicitly
	 * requested, no event is reported; otherwise, report if the event
	 * for the type of forking is enabled.
	 */
	/*
	 * kernel_thread() always sets CLONE_UNTRACED and so skips this
	 * branch; the fork and vfork syscalls do not set it and enter it.
	 */
	if (!(clone_flags & CLONE_UNTRACED)) {
		/* vfork sets CLONE_VFORK */
		if (clone_flags & CLONE_VFORK)
			trace = PTRACE_EVENT_VFORK;
		/*
		 * fork and vfork both pass SIGCHLD in the CSIGNAL bits; any
		 * other exit signal is reported as a clone event instead.
		 */
		else if ((clone_flags & CSIGNAL) != SIGCHLD)
			trace = PTRACE_EVENT_CLONE;
		else
			trace = PTRACE_EVENT_FORK;
		/* Drop the event unless it is enabled for the current task */
		if (likely(!ptrace_event_enabled(current, trace)))
			trace = 0;
	}
	/* Create the child task from the current (parent) task */
	p = copy_process(clone_flags, stack_start, stack_size,
			 child_tidptr, NULL, trace, tls, NUMA_NO_NODE);
	/*
	 * Do this prior waking up the new thread - the thread pointer
	 * might get invalid after that point, if the thread exits quickly.
	 */
	if (!IS_ERR(p)) {
		struct completion vfork;
		struct pid *pid;

		cpufreq_task_times_alloc(p);

		trace_sched_process_fork(current, p);

		pid = get_task_pid(p, PIDTYPE_PID);
		nr = pid_vnr(pid);
		/* Write the child's pid through the parent's user pointer */
		if (clone_flags & CLONE_PARENT_SETTID)
			put_user(nr, parent_tidptr);
		/*
		 * vfork: install the completion the parent will wait on and
		 * take a reference on the child's task_struct.
		 */
		if (clone_flags & CLONE_VFORK) {
			p->vfork_done = &vfork;
			init_completion(&vfork);
			get_task_struct(p);
		}
		/* Wake up the newly created task */
		wake_up_new_task(p);

		/* forking complete and child started to run, tell ptracer */
		if (unlikely(trace))
			ptrace_event_pid(trace, pid);
		/*
		 * vfork: wait for the vfork completion, i.e. let the newly
		 * created task run before the parent continues.
		 */
		if (clone_flags & CLONE_VFORK) {
			if (!wait_for_vfork_done(p, &vfork))
				ptrace_event_pid(PTRACE_EVENT_VFORK_DONE, pid);
		}

		put_pid(pid);
	} else {
		nr = PTR_ERR(p);
	}
	return nr;
}

很明显，通过 copy_process 复制父进程，复制完成后会唤醒子进程；如果是 vfork，父进程还会通过 wait_for_vfork_done 阻塞，直到子进程调用 exec 或退出（不再使用父进程的地址空间）后才继续运行，这也就是 vfork 创建的子进程先于父进程执行的原因。

copy_process 函数完成了主要工作，还是比较复杂的。下表是传递的 flags 的一些解释：

Flag name	Description
CLONE_VM	Shares the memory descriptor and all Page Tables .
CLONE_FS	Shares the table that identifies the root directory and the current working directory, as well as the value of the bitmask used to mask the initial file permissions of a new file (the so-called file umask ).
CLONE_FILES	Shares the table that identifies the open files .
CLONE_SIGHAND	Shares the tables that identify the signal handlers and the blocked and pending signals . If this flag is true, the CLONE_VM flag must also be set.
CLONE_PTRACE	If traced, the parent wants the child to be traced too. Furthermore, the debugger may want to trace the child on its own; in this case, the kernel forces the flag to 1.
CLONE_VFORK	Set when the system call issued is a vfork( ) .
CLONE_PARENT	Sets the parent of the child (parent and real_parent fields in the process descriptor) to the parent of the calling process.
CLONE_THREAD	Inserts the child into the same thread group of the parent, and forces the child to share the signal descriptor of the parent. The child's tgid and group_leader fields are set accordingly. If this flag is true, the CLONE_SIGHAND flag must also be set.
CLONE_NEWNS	Set if the clone needs its own namespace, that is, its own view of the mounted filesystems ; it is not possible to specify both CLONE_NEWNS and CLONE_FS .
CLONE_SYSVSEM	Shares the System V IPC undoable semaphore operations .
CLONE_SETTLS	Creates a new Thread Local Storage (TLS) segment for the lightweight process; the segment is described in the structure pointed to by the tls parameter.
CLONE_PARENT_SETTID	Writes the PID of the child into the User Mode variable of the parent pointed to by the ptid parameter.
CLONE_CHILD_CLEARTID	When set, the kernel sets up a mechanism to be triggered when the child process will exit or when it will start executing a new program. In these cases, the kernel will clear the User Mode variable pointed to by the ctid parameter and will awaken any process waiting for this event.
CLONE_DETACHED	A legacy flag ignored by the kernel.
CLONE_UNTRACED	Set by the kernel to override the value of the CLONE_PTRACE flag (used for disabling tracing of kernel threads ).
CLONE_CHILD_SETTID	Writes the PID of the child into the User Mode variable of the child pointed to by the ctid parameter.
CLONE_STOPPED	Forces the child to start in the TASK_STOPPED state.

/*
 * Create a new task as a copy of the current one, without starting it.
 *
 * Validates the clone_flags combination, duplicates the task_struct,
 * checks process limits, (re)initialises per-task state, copies or
 * shares each resource according to clone_flags, allocates a pid and
 * links the new task into the task lists.
 *
 * Returns the new task_struct on success or ERR_PTR(-errno) on
 * failure; on failure the bad_fork_* labels undo the completed steps
 * in reverse order.
 */
static struct task_struct *copy_process(unsigned long clone_flags,
					unsigned long stack_start,
					unsigned long stack_size,
					int __user *child_tidptr,
					struct pid *pid,
					int trace,
					unsigned long tls,
					int node)
{
	int retval;
	struct task_struct *p;
	void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {};
	/*
	 * Flag sanity checks. CLONE_NEWNS asks for a new mount namespace
	 * while CLONE_FS asks to share the parent's filesystem info
	 * (root, cwd, umask); the two are mutually exclusive.
	 */
	if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
		return ERR_PTR(-EINVAL);
	/* A new user namespace likewise conflicts with sharing fs info */
	if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
		return ERR_PTR(-EINVAL);

	/*
	 * Thread groups must share signals as well, and detached threads
	 * can only be started up within the thread group.
	 */
	/* CLONE_THREAD (same thread group) requires CLONE_SIGHAND */
	if ((clone_flags & CLONE_THREAD) && !(clone_flags & CLONE_SIGHAND))
		return ERR_PTR(-EINVAL);

	/*
	 * Shared signal handlers imply shared VM. By way of the above,
	 * thread groups also imply shared VM. Blocking this case allows
	 * for various simplifications in other code.
	 */
	/* CLONE_SIGHAND (shared signal handlers) requires CLONE_VM */
	if ((clone_flags & CLONE_SIGHAND) && !(clone_flags & CLONE_VM))
		return ERR_PTR(-EINVAL);

	/*
	 * Siblings of global init remain as zombies on exit since they are
	 * not reaped by their parent (swapper). To solve this and to avoid
	 * multi-rooted process trees, prevent global and container-inits
	 * from creating siblings.
	 */
	/* A SIGNAL_UNKILLABLE caller may not create a sibling (CLONE_PARENT) */
	if ((clone_flags & CLONE_PARENT) &&
				current->signal->flags & SIGNAL_UNKILLABLE)
		return ERR_PTR(-EINVAL);

	/*
	 * If the new process will be in a different pid or user namespace
	 * do not allow it to share a thread group with the forking task.
	 */
	/*
	 * A thread in the caller's group may not request a new user or pid
	 * namespace, and the caller's active pid namespace must be the one
	 * its children are created in.
	 */
	if (clone_flags & CLONE_THREAD) {
		if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
		    (task_active_pid_ns(current) !=
				current->nsproxy->pid_ns_for_children))
			return ERR_PTR(-EINVAL);
	}
	/*
	 * Security hook: ask the Linux Security Module layer (e.g. SELinux)
	 * whether the current task may create a new task.
	 */
	retval = security_task_create(clone_flags);
	if (retval)
		goto fork_out;

	retval = -ENOMEM;
	p = dup_task_struct(current, node);
	/*
	 * NOTE(review): the braced block below is not part of
	 * copy_process(); it appears to be the body of
	 * dup_task_struct(orig, node) pasted inline for explanation
	 * (orig == current here). Its return statements and
	 * free_stack/free_tsk labels belong to dup_task_struct().
	 */
	    {
			struct task_struct *tsk;
			unsigned long *stack;
			int err;
			/*
			 * _do_fork() passes NUMA_NO_NODE, so for fork and vfork
			 * the node is chosen here by tsk_fork_get_node().
			 */
			if (node == NUMA_NO_NODE)
				node = tsk_fork_get_node(orig);
			/* Allocate the task_struct */
			tsk = alloc_task_struct_node(node);
			if (!tsk)
				return NULL;
			/* Allocate the stack */
			stack = alloc_thread_stack_node(tsk, node);
			if (!stack)
				goto free_tsk;
			/* Copy the parent's task_struct into the new one */
			err = arch_dup_task_struct(tsk, orig);
			if (err)
				goto free_stack;
			/* Point the new task_struct at its own stack */
			tsk->stack = stack;

			err = kaiser_map_thread_stack(tsk->stack);
			if (err)
				goto free_stack;
#ifdef CONFIG_SECCOMP
			/*
			 * We must handle setting up seccomp filters once we're under
			 * the sighand lock in case orig has changed between now and
			 * then. Until then, filter must be NULL to avoid messing up
			 * the usage counts on the error path calling free_task.
			 */
			tsk->seccomp.filter = NULL;
#endif
			/*
			 * Per the original annotation: copy the parent's
			 * thread_info and point its task field at the new
			 * task_struct.
			 */
			setup_thread_stack(tsk, orig);
			/*
			 * Clear the user-return-notifier flag on the child.
			 * NOTE(review): exact purpose not established here.
			 */
			clear_user_return_notifier(tsk);
			/* Clear the need-resched flag on the child */
			clear_tsk_need_resched(tsk);
			/* Place the magic value used for stack-overflow detection */
			set_task_stack_end_magic(tsk);

#ifdef CONFIG_CC_STACKPROTECTOR
			tsk->stack_canary = get_random_long();
#endif

			/*
			 * One for us, one for whoever does the "release_task()" (usually
			 * parent)
			 */
			/* Hence the usage count starts at 2 */
			atomic_set(&tsk->usage, 2);
#ifdef CONFIG_BLK_DEV_IO_TRACE
			tsk->btrace_seq = 0;
#endif
			tsk->splice_pipe = NULL;
			tsk->task_frag.page = NULL;
			tsk->wake_q.next = NULL;

			account_kernel_stack(stack, 1);

			kcov_task_init(tsk);

			return tsk;

		free_stack:
			free_thread_stack(stack);
		free_tsk:
			free_task_struct(tsk);
			return NULL;
		}
	if (!p)
		goto fork_out;

	cpufreq_task_times_init(p);

	/*
	 * This _must_ happen before we call free_task(), i.e. before we jump
	 * to any of the bad_fork_* labels. This is to avoid freeing
	 * p->set_child_tid which is (ab)used as a kthread's data pointer for
	 * kernel threads (PF_KTHREAD).
	 */
	/* Remember where to store the child's TID, if requested */
	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
	/*
	 * Clear TID on mm_release()?
	 */
	p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr : NULL;

	ftrace_graph_init_task(p);
	/* Initialise the task's rt-mutex related state */
	rt_mutex_init_task(p);

#ifdef CONFIG_PROVE_LOCKING
	DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
	DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif
	retval = -EAGAIN;
	/* Has this user reached its RLIMIT_NPROC process limit? */
	if (atomic_read(&p->real_cred->user->processes) >=
			task_rlimit(p, RLIMIT_NPROC)) {
		/*
		 * Over the limit: only INIT_USER, or a caller holding
		 * CAP_SYS_RESOURCE or CAP_SYS_ADMIN, may proceed.
		 */
		if (p->real_cred->user != INIT_USER &&
		    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
			goto bad_fork_free;
	}
	current->flags &= ~PF_NPROC_EXCEEDED;
	/* Copy the credentials for the child */
	retval = copy_creds(p, clone_flags);
	if (retval < 0)
		goto bad_fork_free;

	/*
	 * If multiple threads are within copy_process(), then this check
	 * triggers too late. This doesn't hurt, the check is only there
	 * to stop root fork bombs.
	 */
	retval = -EAGAIN;
	/* Has the system-wide thread count reached max_threads? */
	if (nr_threads >= max_threads)
		goto bad_fork_cleanup_count;

	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
	p->flags &= ~(PF_SUPERPRIV | PF_WQ_WORKER);
	p->flags |= PF_FORKNOEXEC;
	INIT_LIST_HEAD(&p->children);
	INIT_LIST_HEAD(&p->sibling);
	rcu_copy_process(p);
	p->vfork_done = NULL;
	spin_lock_init(&p->alloc_lock);

	init_sigpending(&p->pending);

	p->utime = p->stime = p->gtime = 0;
	p->utimescaled = p->stimescaled = 0;
	prev_cputime_init(&p->prev_cputime);

#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
	seqlock_init(&p->vtime_seqlock);
	p->vtime_snap = 0;
	p->vtime_snap_whence = VTIME_SLEEPING;
#endif

#if defined(SPLIT_RSS_COUNTING)
	memset(&p->rss_stat, 0, sizeof(p->rss_stat));
#endif

	p->default_timer_slack_ns = current->timer_slack_ns;

	task_io_accounting_init(&p->ioac);
	acct_clear_integrals(p);

	posix_cpu_timers_init(p);

	p->io_context = NULL;
	p->audit_context = NULL;
	cgroup_fork(p);
#ifdef CONFIG_NUMA
	p->mempolicy = mpol_dup(p->mempolicy);
	if (IS_ERR(p->mempolicy)) {
		retval = PTR_ERR(p->mempolicy);
		p->mempolicy = NULL;
		goto bad_fork_cleanup_threadgroup_lock;
	}
#endif
#ifdef CONFIG_CPUSETS
	p->cpuset_mem_spread_rotor = NUMA_NO_NODE;
	p->cpuset_slab_spread_rotor = NUMA_NO_NODE;
	seqcount_init(&p->mems_allowed_seq);
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
	p->irq_events = 0;
	p->hardirqs_enabled = 0;
	p->hardirq_enable_ip = 0;
	p->hardirq_enable_event = 0;
	p->hardirq_disable_ip = _THIS_IP_;
	p->hardirq_disable_event = 0;
	p->softirqs_enabled = 1;
	p->softirq_enable_ip = _THIS_IP_;
	p->softirq_enable_event = 0;
	p->softirq_disable_ip = 0;
	p->softirq_disable_event = 0;
	p->hardirq_context = 0;
	p->softirq_context = 0;
#endif

	p->pagefault_disabled = 0;

#ifdef CONFIG_LOCKDEP
	p->lockdep_depth = 0; /* no locks held yet */
	p->curr_chain_key = 0;
	p->lockdep_recursion = 0;
#endif

#ifdef CONFIG_DEBUG_MUTEXES
	p->blocked_on = NULL; /* not blocked yet */
#endif
#ifdef CONFIG_BCACHE
	p->sequential_io	= 0;
	p->sequential_io_avg	= 0;
#endif

	/* Perform scheduler related setup. Assign this task to a CPU. */
	retval = sched_fork(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_policy;

	retval = perf_event_init_task(p);
	if (retval)
		goto bad_fork_cleanup_policy;
	retval = audit_alloc(p);
	if (retval)
		goto bad_fork_cleanup_perf;
	/* copy all the process information */
	shm_init_task(p);
	retval = copy_semundo(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_audit;
	retval = copy_files(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_semundo;
	retval = copy_fs(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_files;
	retval = copy_sighand(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_fs;
	retval = copy_signal(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_sighand;
	retval = copy_mm(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_signal;
	retval = copy_namespaces(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_mm;
	retval = copy_io(clone_flags, p);
	if (retval)
		goto bad_fork_cleanup_namespaces;
	retval = copy_thread_tls(clone_flags, stack_start, stack_size, p, tls);
	if (retval)
		goto bad_fork_cleanup_io;

	if (pid != &init_struct_pid) {
		pid = alloc_pid(p->nsproxy->pid_ns_for_children);
		if (IS_ERR(pid)) {
			retval = PTR_ERR(pid);
			goto bad_fork_cleanup_io;
		}
	}

#ifdef CONFIG_BLOCK
	p->plug = NULL;
#endif
#ifdef CONFIG_FUTEX
	p->robust_list = NULL;
#ifdef CONFIG_COMPAT
	p->compat_robust_list = NULL;
#endif
	INIT_LIST_HEAD(&p->pi_state_list);
	p->pi_state_cache = NULL;
#endif
	/*
	 * sigaltstack should be cleared when sharing the same VM
	 */
	if ((clone_flags & (CLONE_VM|CLONE_VFORK)) == CLONE_VM)
		p->sas_ss_sp = p->sas_ss_size = 0;

	/*
	 * Syscall tracing and stepping should be turned off in the
	 * child regardless of CLONE_PTRACE.
	 */
	user_disable_single_step(p);
	clear_tsk_thread_flag(p, TIF_SYSCALL_TRACE);
#ifdef TIF_SYSCALL_EMU
	clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
#endif
	clear_all_latency_tracing(p);

	/* ok, now we should be set up.. */
	p->pid = pid_nr(pid);
	if (clone_flags & CLONE_THREAD) {
		p->exit_signal = -1;
		p->group_leader = current->group_leader;
		p->tgid = current->tgid;
	} else {
		if (clone_flags & CLONE_PARENT)
			p->exit_signal = current->group_leader->exit_signal;
		else
			p->exit_signal = (clone_flags & CSIGNAL);
		p->group_leader = p;
		p->tgid = p->pid;
	}

	p->nr_dirtied = 0;
	p->nr_dirtied_pause = 128 >> (PAGE_SHIFT - 10);
	p->dirty_paused_when = 0;

	p->pdeath_signal = 0;
	INIT_LIST_HEAD(&p->thread_group);
	p->task_works = NULL;

	threadgroup_change_begin(current);
	/*
	 * Ensure that the cgroup subsystem policies allow the new process to be
	 * forked. It should be noted the the new process's css_set can be changed
	 * between here and cgroup_post_fork() if an organisation operation is in
	 * progress.
	 */
	retval = cgroup_can_fork(p, cgrp_ss_priv);
	if (retval)
		goto bad_fork_free_pid;

	/*
	 * From this point on we must avoid any synchronous user-space
	 * communication until we take the tasklist-lock. In particular, we do
	 * not want user-space to be able to predict the process start-time by
	 * stalling fork(2) after we recorded the start_time but before it is
	 * visible to the system.
	 */

	p->start_time = ktime_get_ns();
	p->real_start_time = ktime_get_boot_ns();

	/*
	 * Make it visible to the rest of the system, but dont wake it up yet.
	 * Need tasklist lock for parent etc handling!
	 */
	write_lock_irq(&tasklist_lock);

	/* CLONE_PARENT re-uses the old parent */
	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
		p->real_parent = current->real_parent;
		p->parent_exec_id = current->parent_exec_id;
	} else {
		p->real_parent = current;
		p->parent_exec_id = current->self_exec_id;
	}

	spin_lock(&current->sighand->siglock);

	/*
	 * Copy seccomp details explicitly here, in case they were changed
	 * before holding sighand lock.
	 */
	copy_seccomp(p);

	/*
	 * Process group and session signals need to be delivered to just the
	 * parent before the fork or both the parent and the child after the
	 * fork. Restart if a signal comes in before we add the new process to
	 * it's process group.
	 * A fatal signal pending means that current will exit, so the new
	 * thread can't slip out of an OOM kill (or normal SIGKILL).
	*/
	recalc_sigpending();
	if (signal_pending(current)) {
		retval = -ERESTARTNOINTR;
		goto bad_fork_cancel_cgroup;
	}
	if (unlikely(!(ns_of_pid(pid)->nr_hashed & PIDNS_HASH_ADDING))) {
		retval = -ENOMEM;
		goto bad_fork_cancel_cgroup;
	}

	if (likely(p->pid)) {
		ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);

		init_task_pid(p, PIDTYPE_PID, pid);
		if (thread_group_leader(p)) {
			init_task_pid(p, PIDTYPE_PGID, task_pgrp(current));
			init_task_pid(p, PIDTYPE_SID, task_session(current));

			if (is_child_reaper(pid)) {
				ns_of_pid(pid)->child_reaper = p;
				p->signal->flags |= SIGNAL_UNKILLABLE;
			}

			p->signal->leader_pid = pid;
			p->signal->tty = tty_kref_get(current->signal->tty);
			list_add_tail(&p->sibling, &p->real_parent->children);
			list_add_tail_rcu(&p->tasks, &init_task.tasks);
			attach_pid(p, PIDTYPE_PGID);
			attach_pid(p, PIDTYPE_SID);
			__this_cpu_inc(process_counts);
		} else {
			current->signal->nr_threads++;
			atomic_inc(&current->signal->live);
			atomic_inc(&current->signal->sigcnt);
			list_add_tail_rcu(&p->thread_group,
					  &p->group_leader->thread_group);
			list_add_tail_rcu(&p->thread_node,
					  &p->signal->thread_head);
		}
		attach_pid(p, PIDTYPE_PID);
		nr_threads++;
	}

	total_forks++;
	spin_unlock(&current->sighand->siglock);
	syscall_tracepoint_update(p);
	write_unlock_irq(&tasklist_lock);

	proc_fork_connector(p);
	cgroup_post_fork(p, cgrp_ss_priv);
	threadgroup_change_end(current);
	perf_event_fork(p);

	trace_task_newtask(p, clone_flags);
	uprobe_copy_process(p, clone_flags);

	return p;

bad_fork_cancel_cgroup:
	spin_unlock(&current->sighand->siglock);
	write_unlock_irq(&tasklist_lock);
	cgroup_cancel_fork(p, cgrp_ss_priv);
bad_fork_free_pid:
	threadgroup_change_end(current);
	if (pid != &init_struct_pid)
		free_pid(pid);
bad_fork_cleanup_io:
	if (p->io_context)
		exit_io_context(p);
bad_fork_cleanup_namespaces:
	exit_task_namespaces(p);
bad_fork_cleanup_mm:
	if (p->mm)
		mmput(p->mm);
bad_fork_cleanup_signal:
	if (!(clone_flags & CLONE_THREAD))
		free_signal_struct(p->signal);
bad_fork_cleanup_sighand:
	__cleanup_sighand(p->sighand);
bad_fork_cleanup_fs:
	exit_fs(p); /* blocking */
bad_fork_cleanup_files:
	exit_files(p); /* blocking */
bad_fork_cleanup_semundo:
	exit_sem(p);
bad_fork_cleanup_audit:
	audit_free(p);
bad_fork_cleanup_perf:
	perf_event_free_task(p);
bad_fork_cleanup_policy:
#ifdef CONFIG_NUMA
	mpol_put(p->mempolicy);
bad_fork_cleanup_threadgroup_lock:
#endif
	delayacct_tsk_free(p);
bad_fork_cleanup_count:
	atomic_dec(&p->cred->user->processes);
	exit_creds(p);
bad_fork_free:
	free_task(p);
fork_out:
	return ERR_PTR(retval);
}

进来先判断了几种不合理的情况然后创建子进程：

1. 子进程独立的命名空间和共享的文件系统（以及根目录、当前目录）是冲突的

2. 子进程独立的用户命名空间和共享的文件系统（以及根目录、当前目录）是冲突的

3. 子进程设置了和父进程归属相同的线程组，就必须共享信号处理表

4. 子进程共享父进程的信号处理表，那么就必须共享虚拟内存区域和页表

5. 当前进程带有 SIGNAL_UNKILLABLE 标志（不接受致命信号），就不能传递 CLONE_PARENT（变父子为兄弟关系）

6. 不能既设置新的用户命名空间或 PID 命名空间，又设置属于同一线程组

7. 接下来进行 LSM（SELinux）判断，看当前任务是否能创建新任务

8. dup_task_struct 复制父进程的 task_struct （申请子进程task_struct 、栈，并复制父进程的 task_struct ）

9. 判断是否超过用户进程限制，以及是否超过系统进程数限制

10. 接下来初始化子进程自己的一些资源，并根据传递的 CLONE_* 来 copy_* ，目前没有对 task_struct 结构体做研究，成员的作用也不太清楚，日后研究后再补充

copy的好多个资源，特别摘出来 copy_mm

/*
 * Set up the memory descriptor of the new task `tsk`.
 *
 * Resets tsk's fault and context-switch counters and clears its
 * mm/active_mm. If the current task has no mm, nothing more is done.
 * With CLONE_VM the current mm is shared (mm_users is incremented);
 * otherwise a new mm is obtained from dup_mm().
 *
 * Returns 0 on success or -ENOMEM if dup_mm() fails.
 */
static int copy_mm(unsigned long clone_flags, struct task_struct *tsk)
{
	struct mm_struct *mm, *oldmm;
	int retval;

	tsk->min_flt = tsk->maj_flt = 0;
	tsk->nvcsw = tsk->nivcsw = 0;
#ifdef CONFIG_DETECT_HUNG_TASK
	tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
#endif

	tsk->mm = NULL;
	tsk->active_mm = NULL;

	/*
	 * Are we cloning a kernel thread?
	 *
	 * We need to steal a active VM for that..
	 */
	oldmm = current->mm;
	/* No mm on the current task: leave tsk->mm NULL and succeed */
	if (!oldmm)
		return 0;

	/* initialize the new vmacache entries */
	vmacache_flush(tsk);

	/* CLONE_VM: share the current mm, just take another user reference */
	if (clone_flags & CLONE_VM) {
		atomic_inc(&oldmm->mm_users);
		mm = oldmm;
		goto good_mm;
	}

	/* Otherwise give the new task its own mm via dup_mm() */
	retval = -ENOMEM;
	mm = dup_mm(tsk);
	if (!mm)
		goto fail_nomem;

good_mm:
	tsk->mm = mm;
	tsk->active_mm = mm;
	return 0;

fail_nomem:
	return retval;
}

当设置了 CLONE_VM ，也就是共享内存空间，直接将父进程的 mm 赋给子进程（只增加 mm_users 引用计数），vfork 及 kernel_thread 都设置了该标志。kernel_thread 创建内核线程，当然没有独立的内存空间（若创建者本身就是内核线程，current->mm 为 NULL，copy_mm 直接返回，子线程的 mm 也为 NULL）；而 vfork 是为了 exec ，不需要独立的内存空间，和父进程共享。而 fork 没有该标志，通过 dup_mm 来创建自己的内存空间，其实还是复制父进程的，但是多了重要步骤：将可写的私有页对应的页表项改为只读，这样一旦要写入，就会触发权限异常，

通过 do_page_fault 中的 do_wp_page 完成 copy on write 功能！

标签