系统调用：2. select

最新推荐文章于 2025-08-24 22:36:34 发布

原创最新推荐文章于 2025-08-24 22:36:34 发布 · 910 阅读

24 ·

本内容遵循CC 4.0 BY-SA版权协议

GEO检测

收录于

Kernel

Kernel源码笔记目录

系统调用：1. splice

Select系统调用

源码基于4.19.90

最近因为工作需要看了select系统调用的源码，整理成笔记和大家分享。

1. sys_select

/*
 * select() system call entry point. The five arguments are:
 *   n:    upper bound on the number of fds to scan
 *   inp:  fd set to watch for reading; on return it holds the ready fds
 *   outp: fd set to watch for writing; on return it holds the ready fds
 *   exp:  fd set to watch for exceptions; on return it holds the ready fds
 *   tvp:  timeout; also used to report the unused time back to user space
 *
 * All the work is delegated to kern_select().
 */
SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp,
		fd_set __user *, exp, struct timeval __user *, tvp)
{
	return kern_select(n, inp, outp, exp, tvp);
}

/*
 * kern_select - common body of the select() system call.
 *
 * Copies the optional user timeout into the kernel, converts it into an
 * end time via poll_select_set_timeout(), runs core_sys_select() and lets
 * poll_select_finish() post-process the timeout and the return value.
 *
 * Returns -EFAULT if the timeout cannot be copied in, -EINVAL if
 * poll_select_set_timeout() rejects it, otherwise whatever
 * poll_select_finish() returns.
 */
static int kern_select(int n, fd_set __user *inp, fd_set __user *outp,
		       fd_set __user *exp, struct timeval __user *tvp)
{
	struct timespec64 end_time;
	struct timespec64 *to = NULL;	/* stays NULL when no timeout was given */
	struct timeval tv;
	int ret;

	if (tvp != NULL) {
		/* Bring the user-supplied timeout into kernel memory. */
		if (copy_from_user(&tv, tvp, sizeof(tv)) != 0)
			return -EFAULT;

		to = &end_time;

		/*
		 * Fold whole seconds hidden in tv_usec into the seconds
		 * argument and pass the remainder as nanoseconds.
		 */
		if (poll_select_set_timeout(to,
				tv.tv_sec + (tv.tv_usec / USEC_PER_SEC),
				(tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC) != 0)
			return -EINVAL;
	}

	/* Run the core of select. */
	ret = core_sys_select(n, inp, outp, exp, to);

	/* Post-process the timeout, then hand back the final result. */
	return poll_select_finish(&end_time, tvp, PT_TIMEVAL, ret);
}

1.1 poll_select_set_timeout

/*
 * poll_select_set_timeout - turn a relative timeout into an end time.
 * @to:   receives the result
 * @sec:  seconds of the relative timeout
 * @nsec: nanoseconds of the relative timeout
 *
 * A timeout of 0/0 is stored as 0/0 in @to (meaning "do not wait");
 * otherwise @to becomes "current time + timeout".
 *
 * Returns 0 on success, -EINVAL if the timeout fails timespec64_valid().
 */
int poll_select_set_timeout(struct timespec64 *to, time64_t sec, long nsec)
{
	struct timespec64 rel = {.tv_sec = sec, .tv_nsec = nsec};

	/* Reject malformed timeouts up front. */
	if (!timespec64_valid(&rel))
		return -EINVAL;

	/* Zero timeout: no waiting, so store an all-zero end time. */
	if (sec == 0 && nsec == 0) {
		to->tv_sec = to->tv_nsec = 0;
		return 0;
	}

	/* Read the current time, then add the relative timeout to it. */
	ktime_get_ts64(to);
	*to = timespec64_add_safe(*to, rel);
	return 0;
}

/*
 * timespec64_valid - sanity-check a timespec64.
 *
 * Valid means: tv_sec is not negative, and tv_nsec, viewed as an
 * unsigned long, is below NSEC_PER_SEC (the unsigned cast also rejects
 * negative nanosecond values).
 */
static inline bool timespec64_valid(const struct timespec64 *ts)
{
	return ts->tv_sec >= 0 &&
	       (unsigned long)ts->tv_nsec < NSEC_PER_SEC;
}

1.2 poll_select_finish

/*
 * poll_select_finish - post-process the timeout after select/poll ran.
 * @end_time: end time that was used for the wait
 * @p:        user pointer to the timeout structure, or NULL if none was given
 * @pt_type:  layout of the structure @p points to
 * @ret:      result of the core select/poll call
 *
 * When @p is set and the end time is not 0/0, the time remaining until
 * @end_time (clamped to zero) is written back to @p in the requested
 * layout. Returns @ret, except that -ERESTARTNOHAND is turned into
 * -EINTR when the remaining time was not (or could not be) written back.
 */
static int poll_select_finish(struct timespec64 *end_time,
			      void __user *p,
			      enum poll_time_type pt_type, int ret)
{
	struct timespec64 rts;

	// Restore the saved signal mask unless ret is -ERESTARTNOHAND
	restore_saved_sigmask_unless(ret == -ERESTARTNOHAND);

	// Everything below deals with the timeout; without a user timeout
	// there is nothing to do
	if (!p)
		return ret;

	// With the STICKY_TIMEOUTS personality the user's timeout is left
	// untouched: skip the write-back entirely
	if (current->personality & STICKY_TIMEOUTS)
		goto sticky;

	// An all-zero end time is what poll_select_set_timeout() stores for a
	// zero timeout; nothing to report back
	if (!end_time->tv_sec && !end_time->tv_nsec)
		return ret;

	// Remaining time = end time - now, never negative
	ktime_get_ts64(&rts);
	rts = timespec64_sub(*end_time, rts);
	if (rts.tv_sec < 0)
		rts.tv_sec = rts.tv_nsec = 0;


	// Copy the remaining time to user space in the layout the caller uses
	switch (pt_type) {
	case PT_TIMEVAL:
		{
			struct timeval rtv;

			// If the struct is larger than its two fields (padding),
			// clear it first so no uninitialized bytes are copied out
			if (sizeof(rtv) > sizeof(rtv.tv_sec) + sizeof(rtv.tv_usec))
				memset(&rtv, 0, sizeof(rtv));
			rtv.tv_sec = rts.tv_sec;
			rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
			if (!copy_to_user(p, &rtv, sizeof(rtv)))
				return ret;
		}
		break;
	case PT_OLD_TIMEVAL:
		{
			struct compat_timeval rtv;

			rtv.tv_sec = rts.tv_sec;
			rtv.tv_usec = rts.tv_nsec / NSEC_PER_USEC;
			if (!copy_to_user(p, &rtv, sizeof(rtv)))
				return ret;
		}
		break;
	case PT_TIMESPEC:
		if (!put_timespec64(&rts, p))
			return ret;
		break;
	case PT_OLD_TIMESPEC:
		if (!compat_put_timespec64(&rts, p))
			return ret;
		break;
	default:
		BUG();
	}
	/*
	 * If an application puts its timeval in read-only memory, we
	 * don't want the Linux-specific update to the timeval to
	 * cause a fault after the select has completed
	 * successfully. However, because we're not updating the
	 * timeval, we can't restart the system call.
	 */

sticky:
	// Reached when the timeout was not updated: a restartable result is
	// reported as an interrupted call instead
	if (ret == -ERESTARTNOHAND)
		ret = -EINTR;
	// Otherwise hand back ret unchanged (normally the number of ready fds)
	return ret;
}

sys_select 主体流程如下：

先复制、转换用户传下来的超时时间，如果有的话；
调用select核心进行处理；
返回准备好的fd数目，同时把剩余的超时时间写回用户层，如果有的话。

2. core_sys_select

/*
 * Bundle of the six bitmaps select works on: three input sets supplied
 * by the caller and three result sets filled in by the kernel.
 */
typedef struct {
	/* Input bitmaps: fds to check for read / write / exception. */
	unsigned long *in;
	unsigned long *out;
	unsigned long *ex;

	/* Result bitmaps: fds found ready for read / write / exception. */
	unsigned long *res_in;
	unsigned long *res_out;
	unsigned long *res_ex;
} fd_set_bits;

/*
 * core_sys_select - set up the bitmaps, run do_select(), copy results out.
 * @n:        upper bound on the fds to scan (clamped to the fd table size)
 * @inp:      user fd set for reading, may be NULL
 * @outp:     user fd set for writing, may be NULL
 * @exp:      user fd set for exceptions, may be NULL
 * @end_time: end time of the wait, or NULL for no timeout
 *
 * Returns the value of do_select() (number of ready fds, 0 on timeout),
 * -EINVAL for negative @n, -ENOMEM if the bitmaps cannot be allocated,
 * -EFAULT if a set cannot be copied to or from user space, or
 * -ERESTARTNOHAND when nothing was ready and a signal is pending.
 */
int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
			   fd_set __user *exp, struct timespec64 *end_time)
{
	fd_set_bits fds;
	void *bits;
	int ret, max_fds;
	size_t size, alloc_size;
	struct fdtable *fdt;
	/* Start with some space on the stack: saves memory and is faster */
	// Per the original notes SELECT_STACK_ALLOC is 256, so with an
	// 8-byte long this array has 32 elements
	long stack_fds[SELECT_STACK_ALLOC/sizeof(long)];

	ret = -EINVAL;
	// A negative n is invalid
	if (n < 0)
		goto out_nofds;

	// Look at the current process's file descriptor table
	rcu_read_lock();
	fdt = files_fdtable(current->files);
	// Size of the fd table: no fd can be at or above this value
	max_fds = fdt->max_fds;
	rcu_read_unlock();
	// n must not exceed max_fds
	if (n > max_fds)
		n = max_fds;

	/*
	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
	 * since we used fdset we need to allocate memory in units of
	 * long-words.
	 *
	 * Helper macros used for the sizing:
	 *
	 *	// number of bits one long can hold
	 *	#define FDS_BITPERLONG	(8*sizeof(long))
	 *	// number of longs needed for nr bits, rounded up
	 *	#define FDS_LONGS(nr)	(((nr)+FDS_BITPERLONG-1)/FDS_BITPERLONG)
	 *	// number of bytes needed for nr fds
	 *	#define FDS_BYTES(nr)	(FDS_LONGS(nr)*sizeof(long))
	 */
	// Bytes needed for one bitmap of n bits
	size = FDS_BYTES(n);
	// Use the stack buffer by default
	bits = stack_fds;

	if (size > sizeof(stack_fds) / 6) {
		// Six bitmaps of this size do not fit into the stack buffer,
		// so allocate dynamically
		ret = -ENOMEM;
		// Guard the 6 * size multiplication against overflow
		if (size > (SIZE_MAX / 6))
			goto out_nofds;

		// Room for all six bitmaps
		alloc_size = 6 * size;
		bits = kvmalloc(alloc_size, GFP_KERNEL);
		if (!bits)
			goto out_nofds;
	}
	// Carve the buffer up: first the three input bitmaps ...
	fds.in      = bits;
	fds.out     = bits +   size;
	fds.ex      = bits + 2*size;

	// ... then the three result bitmaps
	fds.res_in  = bits + 3*size;
	fds.res_out = bits + 4*size;
	fds.res_ex  = bits + 5*size;

	// Copy the three fd sets in from user space (a NULL set is zeroed)
	if ((ret = get_fd_set(n, inp, fds.in)) ||
	    (ret = get_fd_set(n, outp, fds.out)) ||
	    (ret = get_fd_set(n, exp, fds.ex)))
		goto out;

	// Clear the result bitmaps for n fds
	zero_fd_set(n, fds.res_in);
	zero_fd_set(n, fds.res_out);
	zero_fd_set(n, fds.res_ex);

	// Run the real select loop
	ret = do_select(n, &fds, end_time);

	// A negative value is an error
	if (ret < 0)
		goto out;

	// Zero means no fd became ready
	if (!ret) {
		ret = -ERESTARTNOHAND;

		// The wait ended because a signal is pending
		if (signal_pending(current))
			goto out;
		// No signal: this is a genuine timeout, report 0
		ret = 0;
	}

	// Copy the result bitmaps back into the user's fd sets
	if (set_fd_set(n, inp, fds.res_in) ||
	    set_fd_set(n, outp, fds.res_out) ||
	    set_fd_set(n, exp, fds.res_ex))
		ret = -EFAULT;

out:
	// Free the buffer only if it was dynamically allocated
	if (bits != stack_fds)
		kvfree(bits);
out_nofds:
	return ret;
}

2.1 get/set_fd_set

/*
 * get_fd_set - load one input bitmap for @nr fds into @fdset.
 *
 * With a user set, FDS_BYTES(nr) bytes are copied in and -EFAULT is
 * returned if the copy fails; without one (@ufdset is NULL) the kernel
 * bitmap is simply zeroed. Returns 0 on success.
 */
static inline
int get_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
	/* Number of bytes that hold nr bits, in whole longs. */
	unsigned long bytes = FDS_BYTES(nr);

	if (!ufdset) {
		memset(fdset, 0, bytes);
		return 0;
	}

	if (copy_from_user(fdset, ufdset, bytes))
		return -EFAULT;
	return 0;
}

/*
 * set_fd_set - copy a result bitmap for @nr fds back to user space.
 *
 * Does nothing and returns 0 when the caller passed no set; otherwise
 * returns the value of __copy_to_user() for FDS_BYTES(nr) bytes.
 */
static inline unsigned long __must_check
set_fd_set(unsigned long nr, void __user *ufdset, unsigned long *fdset)
{
	if (!ufdset)
		return 0;

	return __copy_to_user(ufdset, fdset, FDS_BYTES(nr));
}

core_sys_select的主要流程如下：

计算遍历n个fd所需的内存大小，优先使用栈上的数组，放不下时再用kvmalloc动态分配；
调用 do_select 进行真正的select操作；
将 do_select 得到的结果复制到用户层相应的fd集里。

3. do_select

3.1 数据结构

/*
 * Callback type stored in poll_table._qproc; poll_wait() invokes it with
 * the file, the wait-queue head to wait on and the poll table itself.
 */
typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_table_struct *);

/* Type of poll event masks and keys. */
typedef unsigned __bitwise __poll_t;

/* Handed to a file's poll method: the queueing callback plus the event key. */
typedef struct poll_table_struct {
	poll_queue_proc _qproc; // queueing callback called by poll_wait(); may be NULL
	__poll_t _key; // event mask the caller is interested in
} poll_table;

/* One page worth of poll_table_entry slots; pages are chained in a list. */
struct poll_table_page {
	struct poll_table_page * next; // next page in the list
	struct poll_table_entry * entry; // next free slot in entries[]
	struct poll_table_entry entries[0]; // slots, filling the rest of the page
};

/* One registration of the polling task on one wait queue. */
struct poll_table_entry {
	struct file *filp; // the file being polled (a reference is held)
	__poll_t key; // events this entry is interested in
	wait_queue_entry_t wait; // the entry queued on the wait queue
	wait_queue_head_t *wait_address; // head of the wait queue it is queued on
};

/* Per-call state of one select/poll invocation. */
struct poll_wqueues {
	poll_table pt; // poll table passed to the files' poll methods
	struct poll_table_page *table; // list of dynamically allocated entry pages
	struct task_struct *polling_task; // the task doing the poll
	int triggered; // set to 1 by __pollwake(), reset by poll_schedule_timeout()
	int error; // error recorded while queueing (-ENOMEM from poll_get_entry())
	int inline_index; // number of inline_entries already handed out

	#define N_INLINE_POLL_ENTRIES	(576 / sizeof(struct poll_table_entry))

	// Embedded entries, used before any page is allocated
	struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};

3.2 do_select

/*
 * do_select - the polling loop behind select.
 * @n:        upper bound on the fds to scan
 * @fds:      the three input bitmaps and the three result bitmaps
 * @end_time: end time of the wait, NULL for no timeout, 0/0 for "do not wait"
 *
 * Polls every requested fd, records the ready ones in the result bitmaps
 * and, if none is ready, sleeps until woken up, timed out or signalled,
 * then polls again.
 *
 * Returns the number of ready (fd, event class) hits, 0 if nothing became
 * ready, or a negative error (-EBADF from max_select_fd(), or the error
 * recorded in the poll table).
 */
static noinline_for_stack int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time)
{
	ktime_t expire, *to = NULL;
	struct poll_wqueues table;
	poll_table *wait;
	int retval, i, timed_out = 0;
	u64 slack = 0;
	// Busy polling is requested only while net_busy_loop_on() says so
	// (per the original notes it reflects sysctl_net_busy_poll)
	__poll_t busy_flag = net_busy_loop_on() ? POLL_BUSY_LOOP : 0;
	unsigned long busy_start = 0;

	rcu_read_lock();
	// Highest requested fd + 1, or a negative error
	retval = max_select_fd(n, fds);
	rcu_read_unlock();

	// Negative means error: return it as is
	if (retval < 0)
		return retval;
	// Scan no further than the highest requested fd
	n = retval;

	// Initialise the poll wait queues
	poll_initwait(&table);

	// The poll table handed to the files is the one embedded in table
	wait = &table.pt;

	// A non-NULL end_time with both fields zero means "do not wait"
	if (end_time && !end_time->tv_sec && !end_time->tv_nsec) {
		// No need to queue on any wait queue
		wait->_qproc = NULL;
		// Mark as timed out so the loop below runs exactly once
		timed_out = 1;
	}

	// With a real timeout, estimate how much timer slack is acceptable
	if (end_time && !timed_out)
		slack = select_estimate_accuracy(end_time);

	retval = 0;
	for (;;) {
		unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp;
		bool can_busy_loop = false;

		// Start each pass at the beginning of all six bitmaps
		inp = fds->in; outp = fds->out; exp = fds->ex;
		rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex;

		// Walk the fds one long at a time
		for (i = 0; i < n; ++rinp, ++routp, ++rexp) {
			unsigned long in, out, ex, all_bits, bit = 1, j;
			unsigned long res_in = 0, res_out = 0, res_ex = 0;
			__poll_t mask;

			// Fetch the current long of each input set and advance
			in = *inp++; out = *outp++; ex = *exp++;

			// Every fd requested in any of the three sets
			all_bits = in | out | ex;
			// Nothing requested in this long
			if (all_bits == 0) {
				// Skip the whole long
				i += BITS_PER_LONG;
				continue;
			}

			// bit is the mask of fd i within the current long
			for (j = 0; j < BITS_PER_LONG; ++j, ++i, bit <<= 1) {
				struct fd f;
				// Past the last fd to scan
				if (i >= n)
					break;
				// fd i was not requested
				if (!(bit & all_bits))
					continue;
				// Default result if the fd has no file: invalid
				mask = EPOLLNVAL;

				// Look up the file behind fd i
				f = fdget(i);
				// The fd refers to an open file
				if (f.file) {
					// Set the events of interest for this fd
					wait_key_set(wait, in, out, bit,
						     busy_flag);
					// Call the file's poll method
					mask = vfs_poll(f.file, wait);
					// Drop the file reference
					fdput(f);
				}
				/*
				#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\
							EPOLLNVAL)
				#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\
							EPOLLNVAL)
				#define POLLEX_SET (EPOLLPRI | EPOLLNVAL)
				*/
				// Requested for reading and the mask reports a
				// POLLIN_SET event: mark it readable
				if ((mask & POLLIN_SET) && (in & bit)) {
					// Set the bit in the result
					res_in |= bit;
					// One more ready hit
					retval++;
					// Something is ready: stop queueing
					wait->_qproc = NULL;
				}
				// Same for writing
				if ((mask & POLLOUT_SET) && (out & bit)) {
					res_out |= bit;
					retval++;
					wait->_qproc = NULL;
				}
				// Same for exceptions
				if ((mask & POLLEX_SET) && (ex & bit)) {
					res_ex |= bit;
					retval++;
					wait->_qproc = NULL;
				}
				/* got something, stop busy polling */
				if (retval) {
					// No more busy looping
					can_busy_loop = false;
					busy_flag = 0;

				/*
				 * only remember a returned
				 * POLL_BUSY_LOOP if we asked for it
				 */
				} else if (busy_flag & mask)
					// Nothing ready yet, but this file
					// supports busy polling
					can_busy_loop = true;
			}
			// Store the results for this long
			if (res_in)
				*rinp = res_in;
			if (res_out)
				*routp = res_out;
			if (res_ex)
				*rexp = res_ex;
			cond_resched();
		}
		// After the first full pass nothing is queued again:
		// poll_wait() only calls _qproc when it is non-NULL
		wait->_qproc = NULL;

		// Leave the loop if something is ready, the timeout expired
		// or a signal is pending
		if (retval || timed_out || signal_pending(current))
			break;

		// Also leave if queueing recorded an error
		if (table.error) {
			retval = table.error;
			break;
		}

		/* only if found POLL_BUSY_LOOP sockets && not out of time */
		// Busy polling: rescan without sleeping while allowed
		if (can_busy_loop && !need_resched()) {
			// First time: record the start time and rescan
			if (!busy_start) {
				busy_start = busy_loop_current_time();
				continue;
			}

			// Keep rescanning until the busy-poll budget is used up
			if (!busy_loop_timeout(busy_start))
				continue;
		}
		// Busy polling is over for this call
		busy_flag = 0;

		/*
		 * If this is the first loop and we have a timeout
		 * given, then we convert to ktime_t and set the to
		 * pointer to the expiry value.
		 */
		// Convert the end time to ktime_t once and keep it in to
		if (end_time && !to) {
			expire = timespec64_to_ktime(*end_time);
			to = &expire;
		}

		// Sleep interruptibly; a return value of 0 is treated as
		// "timed out", so the next pass is the last one
		if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE,
					   to, slack))
			timed_out = 1;
	}

	poll_freewait(&table);

	return retval;
}

3.2.1 max_select_fd

/*
 * max_select_fd - validate the requested fds and find the highest one.
 * @n:   upper bound on the fds to look at
 * @fds: the input bitmaps
 *
 * Returns the highest requested fd plus one (0 if none is requested), or
 * -EBADF if any requested fd below @n is not marked open in the current
 * process's open_fds bitmap.
 */
static int max_select_fd(unsigned long n, fd_set_bits *fds)
{
	unsigned long *open_fds;
	unsigned long set;
	int max;
	struct fdtable *fdt;

	// Mask with the low (n % BITS_PER_LONG) bits set: the valid bits of
	// the last, partially used long (0 if n is a multiple of the width)
	set = ~(~0UL << (n & (BITS_PER_LONG-1)));
	// Number of complete longs
	n /= BITS_PER_LONG;
	// fd table of the current process
	fdt = files_fdtable(current->files);
	// Long number n of open_fds, the bitmap of the process's open fds
	open_fds = fdt->open_fds + n;
	max = 0;

	if (set) {
		/*
		#define FDS_IN(fds, n)		(fds->in + n)
		#define FDS_OUT(fds, n)		(fds->out + n)
		#define FDS_EX(fds, n)		(fds->ex + n)

		#define BITS(fds, n)	(*FDS_IN(fds, n)|*FDS_OUT(fds, n)|*FDS_EX(fds, n))
		*/
		// Requested fds (union of the three sets) in the partial long
		set &= BITS(fds, n);
		// Some fd in the partial long is requested
		if (set) {
			// All of them are open: the highest fd is in this long
			if (!(set & ~*open_fds))
				goto get_max;
			// At least one requested fd is not open
			return -EBADF;
		}
	}
	// Walk the complete longs from the highest down to the lowest
	while (n) {
		open_fds--;
		n--;
		// Union of the three sets in long number n
		set = BITS(fds, n);
		// Nothing requested in this long
		if (!set)
			continue;
		// A requested fd is not open
		if (set & ~*open_fds)
			return -EBADF;
		// The maximum is already known; keep going only to validate
		// the remaining longs
		if (max)
			continue;
get_max:
		// Position of the highest set bit in set, plus one
		do {
			max++;
			set >>= 1;
		} while (set);
		// Add the bits of the n longs below this one
		max += n * BITS_PER_LONG;
	}

	// Highest requested fd + 1
	return max;
}

3.2.2 poll_initwait

/*
 * poll_initwait - prepare a poll_wqueues for a new select/poll call.
 *
 * Resets all bookkeeping fields, records the current task as the poller
 * and installs __pollwait as the queueing callback.
 */
void poll_initwait(struct poll_wqueues *pwq)
{
	/* No entries in use yet, neither inline nor page-based. */
	pwq->table = NULL;
	pwq->inline_index = 0;

	/* No error and no wake-up seen so far. */
	pwq->error = 0;
	pwq->triggered = 0;

	/* The current task is the one that polls. */
	pwq->polling_task = current;

	/* __pollwait is the default queueing callback. */
	init_poll_funcptr(&pwq->pt, __pollwait);
}

/*
 * init_poll_funcptr - install a queueing callback in a poll table.
 *
 * The key is set to all ones, i.e. every event is enabled.
 */
static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc)
{
	pt->_key   = ~(__poll_t)0; /* all events enabled */
	pt->_qproc = qproc;
}

3.2.3 select_estimate_accuracy

u64 select_estimate_accuracy(struct timespec64 *tv)
{
	u64 ret;
	struct timespec64 now;

	// 实时任务不允许有延迟
	if (rt_task(current))
		return 0;

	// 当前时间
	ktime_get_ts64(&now);
	// 减去当前时间
	now = timespec64_sub(*tv, now);
	// TODO： WHAT ？
	ret = __estimate_accuracy(&now);
	// 小于当前进程的延迟时间，则直接返回
	if (ret < current->timer_slack_ns)
		return current->timer_slack_ns;
	return ret;
}

3.2.4 wait_key_set

/*
 * wait_key_set - set the events of interest for the fd selected by @bit.
 *
 * #define POLLEX_SET (EPOLLPRI | EPOLLNVAL)
 *
 * The key always contains POLLEX_SET and @ll_flag; POLLIN_SET is added
 * if @bit is set in @in, POLLOUT_SET if it is set in @out.
 */
static inline void wait_key_set(poll_table *wait, unsigned long in,
				unsigned long out, unsigned long bit,
				__poll_t ll_flag)
{
	__poll_t key = POLLEX_SET | ll_flag;

	if (in & bit)
		key |= POLLIN_SET;
	if (out & bit)
		key |= POLLOUT_SET;

	wait->_key = key;
}

/*
 * vfs_poll - poll a file through its f_op->poll method.
 *
 * #define DEFAULT_POLLMASK (EPOLLIN | EPOLLOUT | EPOLLRDNORM | EPOLLWRNORM)
 *
 * A file without a poll method reports DEFAULT_POLLMASK, i.e. it is
 * always considered readable and writable.
 */
static inline __poll_t vfs_poll(struct file *file, struct poll_table_struct *pt)
{
	__poll_t mask;

	if (unlikely(!file->f_op->poll))
		mask = DEFAULT_POLLMASK;
	else
		mask = file->f_op->poll(file, pt);

	return mask;
}

3.2.5 busy_loop_current_time

/*
 * busy_loop_current_time - timestamp used for the busy-poll budget.
 *
 * With CONFIG_NET_RX_BUSY_POLL this is ktime_get_ns() shifted right by
 * 10 bits (nanoseconds divided by 1024); without it the result is 0.
 */
static inline unsigned long busy_loop_current_time(void)
{
	unsigned long now = 0;

#ifdef CONFIG_NET_RX_BUSY_POLL
	now = (unsigned long)(ktime_get_ns() >> 10);
#endif
	return now;
}

/*
 * busy_loop_timeout - has the busy-poll budget been used up?
 * @start_time: value of busy_loop_current_time() when busy polling began
 *
 * With CONFIG_NET_RX_BUSY_POLL and a non-zero sysctl_net_busy_poll,
 * returns whether the current time is after @start_time plus that budget.
 * In every other case returns true.
 */
static inline bool busy_loop_timeout(unsigned long start_time)
{
#ifdef CONFIG_NET_RX_BUSY_POLL
	unsigned long budget = READ_ONCE(sysctl_net_busy_poll);

	/* A budget of zero falls through to "timed out". */
	if (budget != 0) {
		unsigned long deadline = start_time + budget;
		unsigned long now = busy_loop_current_time();

		return time_after(now, deadline);
	}
#endif
	return true;
}

3.2.6 poll_schedule_timeout

/*
 * poll_schedule_timeout - put the polling task to sleep until woken up.
 * @pwq:     poll state of the calling task
 * @state:   task state to sleep in
 * @expires: absolute expiry time, passed on to schedule_hrtimeout_range()
 * @slack:   acceptable timer slack
 *
 * Returns -EINTR if a wake-up had already been triggered (no sleep takes
 * place), otherwise the value of schedule_hrtimeout_range(). do_select()
 * treats a return value of 0 as "timed out".
 */
static int poll_schedule_timeout(struct poll_wqueues *pwq, int state,
			  ktime_t *expires, unsigned long slack)
{
	int rc = -EINTR;

	// Set the task state first, before checking triggered
	set_current_state(state);
	// Sleep only if no wake-up has been triggered yet
	if (!pwq->triggered)
		// Schedule away with a timeout; HRTIMER_MODE_ABS means
		// expires is an absolute time. The task sleeps in here.
		rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS);

	// Execution continues here once the task runs again

	// Mark the task as running
	__set_current_state(TASK_RUNNING);

	/*
	 * Prepare for the next iteration.
	 *
	 * The following smp_store_mb() serves two purposes.  First, it's
	 * the counterpart rmb of the wmb in pollwake() such that data
	 * written before wake up is always visible after wake up.
	 * Second, the full barrier guarantees that triggered clearing
	 * doesn't pass event check of the next iteration.  Note that
	 * this problem doesn't exist for the first iteration as
	 * add_wait_queue() has full barrier semantics.
	 */
	// Reset triggered to 0 for the next pass of the caller's loop
	smp_store_mb(pwq->triggered, 0);

	return rc;
}

do_select的核心流程如下：

计算要遍历的最大fd号；
初始化poll等待队列（poll_initwait），并计算超时相关的参数；
在各fd对应的文件上调用poll函数；
根据poll的返回结果，设置对应的结果集；
如果有准备好的fd、已经超时（包括超时时间为0的情况）或者有待处理的信号，则退出循环并返回；
否则睡眠，直到被唤醒或超时，然后重新轮询一遍。

3.2.7 poll_freewait

/*
 * poll_freewait - undo every wait-queue registration of a poll call.
 *
 * Releases the inline entries that were handed out, then walks the page
 * list, releasing each page's entries before freeing the page itself.
 */
void poll_freewait(struct poll_wqueues *pwq)
{
	struct poll_table_page * p = pwq->table;
	int i;
	// Release the inline (embedded) entries that are in use
	for (i = 0; i < pwq->inline_index; i++)
		free_poll_entry(pwq->inline_entries + i);

	// Then release the entries kept in the page list
	while (p) {
		struct poll_table_entry * entry;
		struct poll_table_page *old;

		// entry points one past the last used slot; walk backwards
		// down to entries[0]
		entry = p->entry;
		do {
			entry--;
			free_poll_entry(entry);
		} while (entry > p->entries);
		// Advance before freeing the page just processed
		old = p;
		p = p->next;
		free_page((unsigned long) old);
	}
}

/* Release one poll entry: unqueue it and drop its file reference. */
static void free_poll_entry(struct poll_table_entry *entry)
{
	// Remove the entry from the wait queue it was added to
	remove_wait_queue(entry->wait_address, &entry->wait);
	// Drop the file reference held by the entry
	fput(entry->filp);
}

4. pipe_poll

实现poll的文件系统并不多，常规磁盘文件系统都没有实现poll，下面以pipe为例。

/*
 * pipe_poll - poll method of pipes.
 *
 * Registers the caller on the pipe's wait queue (through poll_wait(),
 * which does not block) and reports the current state:
 *   - read side:  EPOLLIN | EPOLLRDNORM if there is a buffer to read,
 *                 EPOLLHUP if there are no writers and the file's
 *                 f_version differs from the pipe's w_counter;
 *   - write side: EPOLLOUT | EPOLLWRNORM if a buffer is still free,
 *                 EPOLLERR if there are no readers.
 */
static __poll_t
pipe_poll(struct file *filp, poll_table *wait)
{
	struct pipe_inode_info *pipe = filp->private_data;
	__poll_t mask = 0;
	int nrbufs;

	/* Queue on pipe->wait first; this only registers, it never sleeps. */
	poll_wait(filp, &pipe->wait, wait);

	/* Number of buffers currently in use. */
	nrbufs = pipe->nrbufs;

	/* The file is open for reading. */
	if (filp->f_mode & FMODE_READ) {
		/* At least one buffer holds data: readable. */
		if (nrbufs > 0)
			mask |= EPOLLIN | EPOLLRDNORM;
		/* No writers, and the recorded version is stale: hang-up. */
		if (!pipe->writers && filp->f_version != pipe->w_counter)
			mask |= EPOLLHUP;
	}

	/* The file is open for writing. */
	if (filp->f_mode & FMODE_WRITE) {
		/* At least one buffer is free: writable. */
		if (nrbufs < pipe->buffers)
			mask |= EPOLLOUT | EPOLLWRNORM;
		/*
		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
		 * behave exactly like pipes for poll().
		 */
		if (!pipe->readers)
			mask |= EPOLLERR;
	}

	return mask;
}

pipe的poll流程比较简单：

初始化等待相关的数据结构，主要是分配一个entry，然后挂在pipe的等待头上；
再根据文件是以读还是写方式打开，检查pipe的缓冲区里有无已就绪（可读）或者空闲（可写）的buffer，如果有的话就或上相应的可读/可写标志，否则标志为0。

4.1 poll_wait

/*
 * poll_wait - call the poll table's queueing callback, if there is one.
 *
 * Nothing happens when the table, its callback or the wait-queue head is
 * missing.
 */
static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p)
{
	if (!p || !p->_qproc || !wait_address)
		return;

	p->_qproc(filp, wait_address, p);
}

4.1.1 __pollwait

/* Handed to a file's poll method: the queueing callback plus the event key. */
typedef struct poll_table_struct {
	poll_queue_proc _qproc; // queueing callback called by poll_wait(); may be NULL
	__poll_t _key; // event mask the caller is interested in
} poll_table;

// Related data structures
/* One page worth of poll_table_entry slots; pages are chained in a list. */
struct poll_table_page {
	struct poll_table_page * next; // next page in the list
	struct poll_table_entry * entry; // next free slot in entries[]
	struct poll_table_entry entries[0]; // slots, filling the rest of the page
};

/* One registration of the polling task on one wait queue. */
struct poll_table_entry {
	struct file *filp; // the file being polled (a reference is held)
	__poll_t key; // events this entry is interested in
	wait_queue_entry_t wait; // the entry queued on the wait queue
	wait_queue_head_t *wait_address; // head of the wait queue it is queued on
};

/* Per-call state of one select/poll invocation. */
struct poll_wqueues {
	poll_table pt; // poll table passed to the files' poll methods
	struct poll_table_page *table; // list of dynamically allocated entry pages
	struct task_struct *polling_task; // the task doing the poll
	int triggered; // set to 1 by __pollwake(), reset by poll_schedule_timeout()
	int error; // error recorded while queueing (-ENOMEM from poll_get_entry())
	int inline_index; // number of inline_entries already handed out

	#define N_INLINE_POLL_ENTRIES	(576 / sizeof(struct poll_table_entry))

	// Embedded entries, used before any page is allocated
	struct poll_table_entry inline_entries[N_INLINE_POLL_ENTRIES];
};

/*
 * __pollwait - default _qproc callback, installed by poll_initwait().
 * @filp:         the file being polled
 * @wait_address: head of the wait queue to register on
 * @p:            poll table embedded in the caller's poll_wqueues
 *
 * Takes a poll entry, fills it in and adds it to @wait_address. It only
 * registers the entry; it does not sleep. If no entry is available the
 * function returns without registering anything.
 */
static void __pollwait(struct file *filp, wait_queue_head_t *wait_address,
				poll_table *p)
{
	// Recover the poll_wqueues that embeds p
	struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt);
	// Get an entry to describe this registration
	struct poll_table_entry *entry = poll_get_entry(pwq);

	// No entry available (allocation failed)
	if (!entry)
		return;

	// Take a reference on the file
	entry->filp = get_file(filp);
	// Remember which wait-queue head the entry is queued on
	entry->wait_address = wait_address;
	// Events this registration is interested in
	entry->key = p->_key;
	// Initialise the wait-queue entry; pollwake is its wake-up function
	init_waitqueue_func_entry(&entry->wait, pollwake);
	// Private data is the poll_wqueues; set after the init call above,
	// which resets private to NULL
	entry->wait.private = pwq;
	// Add the entry to the wait queue. Note: this does not block
	add_wait_queue(wait_address, &entry->wait);
}

/*
 * poll_get_entry - hand out the next free poll_table_entry.
 *
 * The inline entries embedded in @p are used first. After that, entries
 * come from pages chained in p->table; a new page is allocated when there
 * is none yet or the current one is full.
 *
 * Returns the entry, or NULL (with p->error set to -ENOMEM) if a page
 * cannot be allocated.
 */
static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p)
{
	struct poll_table_page *table = p->table;

	// Use the embedded inline_entries as long as some are left
	if (p->inline_index < N_INLINE_POLL_ENTRIES)
		return p->inline_entries + p->inline_index++;

	/*
	 * The table is full when the end of the next entry would lie beyond
	 * the page:
	 *
	 * #define POLL_TABLE_FULL(table) \
	 *	((unsigned long)((table)->entry+1) > PAGE_SIZE + (unsigned long)(table))
	 */

	// No page yet, or the current page is full: allocate a new one
	if (!table || POLL_TABLE_FULL(table)) {
		struct poll_table_page *new_table;

		// Allocate one page
		new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL);
		// Page allocation failed
		if (!new_table) {
			p->error = -ENOMEM;
			return NULL;
		}
		// The first free slot is entries[0]
		new_table->entry = new_table->entries;
		// Put the new page at the front of the list
		new_table->next = table;
		// The poll state now points at the new page
		p->table = new_table;
		table = new_table;
	}

	// Return the current free slot and advance to the next one
	return table->entry++;
}

/*
 * init_waitqueue_func_entry - initialise a wait-queue entry with a custom
 * wake-up function. Flags are cleared and private is reset to NULL.
 */
static inline void
init_waitqueue_func_entry(struct wait_queue_entry *wq_entry, wait_queue_func_t func)
{
	/* Install the wake-up function. */
	wq_entry->func		= func;
	/* Start without private data and without flags. */
	wq_entry->private	= NULL;
	wq_entry->flags		= 0;
}

4.1.1.1 pollwake


/*
 * pollwake - wake-up function of the entries queued by __pollwait().
 *
 * #define key_to_poll(m) ((__force __poll_t)(uintptr_t)(void *)(m))
 *
 * The wake-up is forwarded to __pollwake() when no key is given or when
 * the key shares at least one event with the entry's key; otherwise the
 * event is of no interest and 0 is returned.
 */
static int pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	/* Recover the poll entry that embeds this wait-queue entry. */
	struct poll_table_entry *entry =
		container_of(wait, struct poll_table_entry, wait);

	if (!key || (key_to_poll(key) & entry->key))
		return __pollwake(wait, mode, sync, key);

	return 0;
}

/*
 * __pollwake - mark the poll as triggered and wake up the polling task.
 *
 * Returns the value of default_wake_function().
 */
static int __pollwake(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
	// The poll state stored as private data by __pollwait()
	struct poll_wqueues *pwq = wait->private;
	// Temporary wait-queue entry that refers to the polling task
	DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task);

	/*
	 * Although this function is called under waitqueue lock, LOCK
	 * doesn't imply write barrier and the users expect write
	 * barrier semantics on wakeup functions.  The following
	 * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
	 * and is paired with smp_store_mb() in poll_schedule_timeout.
	 */
	smp_wmb();
	// Record that a wake-up happened
	pwq->triggered = 1;

	/*
	 * Perform the default wake up operation using a dummy
	 * waitqueue.
	 *
	 * TODO: This is hacky but there currently is no interface to
	 * pass in @sync.  @sync is scheduled to be removed and once
	 * that happens, wake_up_process() can be used directly.
	 */
	// Per the original notes default_wake_function() calls
	// try_to_wake_up() on the target task; this ends the sleep in
	// poll_schedule_timeout()
	return default_wake_function(&dummy_wait, mode, sync, key);
}

5. pipe_read/write

pipe的队列头由pipe_read/write来唤醒。

下面仅分析和select相关的代码。

/*
 * pipe_read - read from a pipe into @to.
 *
 * Copies data out of the pipe's buffers, releasing each buffer once it is
 * empty. Whenever a buffer was released, waiters on pipe->wait are woken
 * with EPOLLOUT | EPOLLWRNORM, which is what wakes tasks sleeping in
 * select on the write side.
 *
 * Returns the number of bytes read, 0 for a zero-length read or when
 * there is no data and no writer, or a negative error (-EFAULT, -EAGAIN,
 * -ERESTARTSYS, or the error from pipe_buf_confirm()).
 */
static ssize_t
pipe_read(struct kiocb *iocb, struct iov_iter *to)
{
	// Number of bytes requested
	size_t total_len = iov_iter_count(to);
	struct file *filp = iocb->ki_filp;
	struct pipe_inode_info *pipe = filp->private_data;
	int do_wakeup;
	ssize_t ret;

	// Nothing to read: return right away
	if (unlikely(total_len == 0))
		return 0;

	do_wakeup = 0;
	ret = 0;
	__pipe_lock(pipe);
	for (;;) {
		// Number of buffers currently holding data
		int bufs = pipe->nrbufs;
		if (bufs) {
			// Index of the current buffer
			int curbuf = pipe->curbuf;
			// The current buffer
			struct pipe_buffer *buf = pipe->bufs + curbuf;
			// Bytes available in this buffer
			size_t chars = buf->len;
			size_t written;
			int error;

			// Never read more than was requested
			if (chars > total_len)
				chars = total_len;
			// Per the original notes this calls buf->ops->confirm,
			// which is not set for ordinary page buffers
			error = pipe_buf_confirm(pipe, buf);
			if (error) {
				if (!ret)
					ret = error;
				break;
			}
			// Copy the data to the caller
			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
			if (unlikely(written < chars)) {
				if (!ret)
					ret = -EFAULT;
				break;
			}
			// Bytes copied so far
			ret += chars;
			// Advance within the buffer
			buf->offset += chars;
			// Less data left in the buffer
			buf->len -= chars;

			/* Was it a packet buffer? Clean up and exit */
			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
				total_len = chars;
				buf->len = 0;
			}

			// The buffer is empty: release it
			if (!buf->len) {
				// Per the original notes this calls
				// buf->ops->release directly
				pipe_buf_release(pipe, buf);
				// Index of the next buffer (wraps around)
				curbuf = (curbuf + 1) & (pipe->buffers - 1);
				// Make it the pipe's current buffer
				pipe->curbuf = curbuf;
				// One buffer fewer in use
				pipe->nrbufs = --bufs;
				// A buffer was freed: writers must be woken
				do_wakeup = 1;
			}
			total_len -= chars;
			if (!total_len)
				break;	/* common path: read succeeded */
		}
		if (bufs)	/* More to do? */
			continue;
		if (!pipe->writers)
			break;
		if (!pipe->waiting_writers) {
			/* syscall merging: Usually we must not sleep
			 * if O_NONBLOCK is set, or if we got some data.
			 * But if a writer sleeps in kernel space, then
			 * we can wait for that data without violating POSIX.
			 */
			if (ret)
				break;
			if (filp->f_flags & O_NONBLOCK) {
				ret = -EAGAIN;
				break;
			}
		}
		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}
		if (do_wakeup) {
			// Wake the waiters on the pipe's wait-queue head
			wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
 			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
		}
		pipe_wait(pipe);
	}
	__pipe_unlock(pipe);

	/* Signal writers asynchronously that there is more room. */
	if (do_wakeup) {
		wake_up_interruptible_sync_poll(&pipe->wait, EPOLLOUT | EPOLLWRNORM);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}
	if (ret > 0)
		file_accessed(filp);
	return ret;
}