从源码层面解析linux调度的原理，实现及演进

原创已于 2023-06-04 16:20:33 修改 · 1.3k 阅读

lylhw13_

关注

标签

#linux #调度 #源码

分类操作系统

于 2023-06-04 16:16:02 首次发布

Linux 面试问题解析专栏收录该内容

2 篇文章

订阅专栏

本文从源码角度解析Linux调度器的演变，涵盖2.4.22的O(n)调度，2.6.22的O(1)调度，以及自2.6.23起引入、本文以2.6.24源码为例进行分析的CFS（Completely Fair Scheduler）。CFS基于红黑树，以虚拟运行时间衡量进程优先级，确保公平的CPU资源分配。

调度策略是操作系统设计比较重要的一环。它直接影响了系统的性能和响应能力。不同的调度策略会对不同类型的工作负载产生不同的影响。例如，实时调度策略可以确保实时任务及时响应，但可能会对系统的吞吐量产生负面影响。相反，非实时调度策略可以提高系统的吞吐量，但可能会导致实时任务响应延迟。因此，选择适当的调度策略对于满足系统的性能和响应需求至关重要。

这里选取Linux 内核中所采用的三种调度方式，通过分析源码来揭开其神秘面纱。

先放结论

操作系统	调度名称	基本原理	复杂度
Linux 2.4.22	基于优先级调度	遍历就绪进程列表	O(n)
Linux 2.6.22	基于多级反馈队列	bitmap + 活跃优先级数组 + 过期优先级数组	O(1)
Linux 2.6.24	完全公平调度算法	vruntime + 红黑树	O(logn)

Linux 2.4.22

在这个版本采用 O(n) 调度算法。O(n)调度器是Linux内核最早采用的基于优先级的一种调度算法。

Linux 2.4 内核及更早的内核，都通过一个 runqueue 记录所有的就绪进程。在每次需要调度时，就会遍历runqueue，找到具有最高优先级的进程，所耗费的时间和进程数目成正比，所以称该调度器为O(n)调度器。当所有进程时间片都用完之后，才会重新计算时间片。

在每个周期内，进程都会被给定一个优先级：

实时进程具有最高优先级。
交互进程根据剩余的时间片，具有动态的优先级。
批处理具有最低的优先级。

// kernel/sched.c

/*
 * Scheduler entry point (abridged excerpt).
 *
 * Scans the whole run queue for the task with the largest goodness()
 * value, refills every task's counter when the best value found is 0,
 * and then switches from the current task to the chosen one.
 *
 * Note: the declarations of the locals used below (prev, next, p, tmp, c,
 * this_cpu, sched_data) are not part of this excerpt.
 */
asmlinkage void schedule(void)
{
need_resched_back:
	prev = current;
	this_cpu = prev->processor;
	release_kernel_lock(prev, this_cpu);

repeat_schedule:
	// Walk every task on the run queue and keep the one with the highest
	// weight; start from the idle task with a sentinel weight of -1000.
	next = idle_task(this_cpu);
	c = -1000;
	list_for_each(tmp, &runqueue_head) {
		p = list_entry(tmp, struct task_struct, run_list);
		if (can_schedule(p, this_cpu)) {
			// Compute the task's weight; goodness() returns 0 for a
			// SCHED_OTHER task whose counter (time slice) is used up.
			int weight = goodness(p, this_cpu, prev->active_mm);
			if (weight > c)
				c = weight, next = p;
		}
	}
	
	// Best weight is 0: the time slices are exhausted, so recompute the
	// counter of every task (half of what is left plus a nice-based
	// refill) and scan the run queue again.
	if (unlikely(!c)) {
		struct task_struct *p;
		for_each_task(p)
			p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
		goto repeat_schedule;
	}
	
	// Commit the decision: record the chosen task as current on this CPU.
	sched_data->curr = next;
	task_set_cpu(next, this_cpu);

	// The chosen task may be the one that is already running (a task can
	// be picked several times within one time slice); clear its
	// SCHED_YIELD flag and skip the context switch.
	if (unlikely(prev == next)) {
		prev->policy &= ~SCHED_YIELD;
		goto same_process;
	}
	
	prepare_to_switch();
	// Save the registers and the current stack, then switch to next.
	switch_to(prev, next, prev);
	__schedule_tail(prev);

same_process:
	reacquire_kernel_lock(current);
	// A new reschedule request arrived in the meantime: start over.
	if (current->need_resched)
		goto need_resched_back;
	return;
}

/*
 * Compute how desirable it is to run task p next; schedule() picks the
 * task with the largest value.
 *
 * Return values:
 *	    -1: p has SCHED_YIELD set (it gave up the CPU voluntarily)
 *	     0: SCHED_OTHER task whose counter (time slice) is used up
 *	   +ve: SCHED_OTHER task: remaining counter plus small bonuses
 *	>=1000: any other policy (real-time): 1000 + rt_priority
 *
 * this_cpu is not used in this excerpt.
 */
static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm)
{
	int score;

	/* A task that yielded voluntarily gets the fixed value -1. */
	if (p->policy & SCHED_YIELD)
		return -1;

	/* Real-time task: always ranks above every SCHED_OTHER task. */
	if (p->policy != SCHED_OTHER)
		return 1000 + p->rt_priority;

	/* Normal task: the priority is derived from the remaining time slice. */
	score = p->counter;
	if (score == 0)
		return 0;

	/* Small bonus when p has no mm of its own or shares this_mm. */
	if (!p->mm || p->mm == this_mm)
		score += 1;

	/* A lower nice value yields a larger bonus. */
	return score + (20 - p->nice);
}

Linux 2.6.22

在Linux 2.6 内核中采用了O(1)调度算法，并且 Linux 2.6.22是最后一个采用该算法的版本。
该算法的核心思想是多级反馈队列算法，即每个CPU都维护一个runqueue结构体，该结构体包含两个优先级数组，即活跃（active）优先级数组和过期（expired）优先级数组。数组的每一项为一个具有同样优先级的队列，同时，优先级数组还有一个bitmap，用于标记数组的每个队列是否为空。因此，在选择下一个进程时，只用查询活跃优先级数组的位图，并选择不为空的队列，然后使用队列首即可。于是，选择下一个被调度进程的时间变成了查询位图操作，而且和系统中就绪的进程数目无关，时间复杂度是O(1)，因此这种调度器称为O(1)调度器。

优点：

每个CPU维护一个runqueue，减少锁竞争
查询时间复杂度为 O(1)

代码实现如下：

/*
 * One priority array of the O(1) scheduler: one task list per priority
 * level plus a bitmap describing which of those lists are non-empty.
 */
struct prio_array {
	unsigned int nr_active;	// schedule() treats the array as empty when this is 0
	DECLARE_BITMAP(bitmap, MAX_PRIO+1); // bitmap marking whether the corresponding queue is empty
	struct list_head queue[MAX_PRIO];	// each entry is a queue (list) of tasks
};

/*
 * Run queue (abridged): active and expired point to priority arrays;
 * presumably they point into arrays[2] -- the initialization is not part
 * of this excerpt. schedule() swaps the two pointers once the active
 * array has no tasks left.
 */
struct rq {
	struct prio_array *active, *expired, arrays[2];
	...
};

/*
 * O(1) scheduler entry point (abridged; "..." marks omitted code).
 *
 * Picks the first task of the first non-empty queue in the active
 * priority array, swapping the active and expired arrays first when the
 * active one has no tasks left.
 */
asmlinkage void __sched schedule(void)
{
...
	array = rq->active;
	if (unlikely(!array->nr_active)) {
		 // The active array is empty: swap the two arrays so that the
		 // expired array becomes the active one.
		schedstat_inc(rq, sched_switch);
		rq->active = rq->expired;
		rq->expired = array;
		array = rq->active;
		rq->expired_timestamp = 0;
		rq->best_expired_prio = MAX_PRIO;
	}

	// Search the bitmap of the active array for the index of the first set
	// bit; the cost of this lookup is fixed, independent of the number of
	// runnable tasks. The first task of that queue runs next.
	idx = sched_find_first_bit(array->bitmap);
	queue = array->queue + idx;
	next = list_entry(queue->next, struct task_struct, run_list);

switch_tasks:
	if (next == rq->idle)
		schedstat_inc(rq, sched_goidle);
	prefetch(next);
	update_cpu_clock(prev, rq, now);

	// Switch context only when a different task was chosen; otherwise
	// just release the run queue lock.
	if (likely(prev != next)) {
		prev = context_switch(rq, prev, next);
		barrier();
	} else
		spin_unlock_irq(&rq->lock);
...
}

Linux 2.6.24

Linux从2.6.23版本开始使用CFS（Completely Fair Scheduler）调度策略。CFS是一种基于红黑树的调度器，它旨在提供公平的CPU时间分配，以确保所有进程都能够公平地使用CPU资源。CFS调度器使用进程的虚拟运行时间（virtual runtime）来衡量进程的优先级，虚拟运行时间越短的进程，优先级越高，越容易获得执行机会。CFS调度器还支持实时进程和优先级反转等特性，以提高系统的响应性和稳定性。CFS调度器是Linux内核中的一个重要组件，它被广泛用于桌面系统、服务器系统和嵌入式系统等不同的场景中。

CFS 调度的入口如下：

/*
 * Scheduler entry point with scheduling classes (abridged; "..." marks
 * omitted code): hand the previous task back to its scheduling class,
 * pick the next task, and switch to it if it is a different task.
 */
asmlinkage void __sched schedule(void)
{
...
	// Update the accounting of the current task and put it back
	// (for CFS: into the red-black tree).
	prev->sched_class->put_prev_task(rq, prev);
	// Pick the next task (for CFS: the one with the smallest vruntime).
	next = pick_next_task(rq, prev);

	if (likely(prev != next)) {
		rq->nr_switches++;
		rq->curr = next;
		++*switch_count;

		context_switch(rq, prev, next); /* unlocks the rq */
	} 
...
}

可以看出整个调度入口的逻辑非常简单：

更新当前任务的时间，并放入红黑树
选取vruntime最小的任务

所以CFS调度算法的核心就是选取最小vruntime的任务，以达到调度的效果。

第一步: 更新当前任务的时间，并放入红黑树

调用链如下:

put_prev_task
	put_prev_task_fair
		put_prev_entity
			update_curr			// 更新时间	
			__enqueue_entity	// 将entity压入红黑树

更新时间的代码实现

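这里的时间使用的是 vruntime（虚拟运行时间）。vruntime 大致的计算逻辑如下：

$$\text{vruntime} \mathrel{+}= \text{delta\_exec} \times \frac{\text{NICE\_0\_LOAD}}{\text{weight}}$$

其中 delta_exec 为本次实际运行的时间，weight 为该调度实体的权重。权重越大，同样的实际运行时间所折算出的 vruntime 增量就越小，vruntime 增长得越慢，也就有更多的机会被调度运行，即优先级越高。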
调用链如下：

update_curr
	__update_curr
		calc_delta_fair
			calc_delta_mine

/*
 * Account delta_exec of execution time to the running entity curr.
 *
 * The raw time is added to curr->sum_exec_runtime and to the exec_clock
 * statistic of cfs_rq. The amount added to curr->vruntime is delta_exec
 * itself when the entity's load weight equals NICE_0_LOAD, otherwise the
 * value returned by calc_delta_fair() for that load weight.
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
	      unsigned long delta_exec)
{
	unsigned long delta_exec_weighted;

	curr->sum_exec_runtime += delta_exec;
	schedstat_add(cfs_rq, exec_clock, delta_exec);

	/* Only entities whose weight differs from NICE_0_LOAD need scaling. */
	delta_exec_weighted = delta_exec;
	if (unlikely(curr->load.weight != NICE_0_LOAD)) {
		delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
							&curr->load);
	}
	curr->vruntime += delta_exec_weighted;
}

// 时间计算方式
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
		struct load_weight *lw)
{
	u64 tmp;

	tmp = (u64)delta_exec * weight;
	tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

	return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}

将调度实体插入红黑树的代码实现

/*
 * Insert the scheduling entity se into the red-black tree of cfs_rq
 * (cfs_rq->tasks_timeline), ordered by the value of entity_key().
 *
 * Note: this is an abridged excerpt; the declarations and initial values
 * of link, parent, entry and key are not shown here.
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
	int leftmost = 1;

	 // Find the right position in the red-black tree: go left when key is
	 // smaller than the key of the visited entity, right otherwise.
	while (*link) {
		parent = *link;
		entry = rb_entry(parent, struct sched_entity, run_node);
		if (key < entity_key(cfs_rq, entry)) {
			link = &parent->rb_left;
		} else {
			link = &parent->rb_right;
			// Went right at least once: se is not the leftmost node.
			leftmost = 0;
		}
	}

	// Cache the leftmost node so that picking the next entity does not
	// have to walk the tree.
	if (leftmost)
		cfs_rq->rb_leftmost = &se->run_node;

	// Link the node at the position found, then rebalance/recolor.
	rb_link_node(&se->run_node, parent, link);
	rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

第二步：选取vruntime最小的任务

调用链如下

pick_next_task
	pick_next_task_fair
		pick_next_entity
			__pick_next_entity

注意：__pick_next_entity 函数本身不会遍历红黑树去查找最左节点，因为该节点已经缓存在 rb_leftmost 字段中。

/* Return the leftmost tree node cached in cfs_rq->rb_leftmost. */
static inline struct rb_node *first_fair(struct cfs_rq *cfs_rq)
{
	struct rb_node *leftmost = cfs_rq->rb_leftmost;

	return leftmost;
}

/*
 * Return the scheduling entity that embeds the leftmost tree node of
 * cfs_rq, as reported by first_fair(). No tree walk happens here.
 */
static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
{
	struct rb_node *leftmost = first_fair(cfs_rq);

	return rb_entry(leftmost, struct sched_entity, run_node);
}