1. 使用kthread_create创建线程:
struct task_struct *kthread_create(int (*threadfn)(void *data),
void *data,
const char *namefmt, ...);
这个函数可以像printk一样传入某种格式的线程名
线程创建后,不会马上运行,而是需要将kthread_create() 返回的task_struct指针传给wake_up_process(),然后通过此函数运行线程。
2. 当然,还有一个创建并启动线程的函数:kthread_run
struct task_struct *kthread_run(int (*threadfn)(void *data),
void *data,
const char *namefmt, ...);
3. 线程一旦启动起来后,会一直运行,除非该线程主动调用do_exit函数,或者其他的进程调用kthread_stop函数,结束线程的运行。
int kthread_stop(struct task_struct *thread);
kthread_stop() 通过给线程发送信号来工作。(注:在较新的内核实现中,kthread_stop() 是设置停止标志并唤醒线程,然后等待线程退出;线程函数需要调用kthread_should_stop()来检查是否应该退出。)
如果线程函数正在处理一个非常重要的任务,它不会被中断的。当然如果线程函数永远不返回并且不检查信号,它将永远都不会停止。
参考:Kernel threads made easy
简单地说,就是linux内核为每个下发的scsi磁盘命令加个定时器,如果超时了底层驱动还没有处理完这个scsi命令,就会触发从scsi_unjam_host开始的错误恢复动作。这个函数会进行一系列逐步提高等级的恢复操作,然后看这个底层的scsi host是不是可以恢复到正常状态了。
默认的策略是这样的,不过好像也可以在自己的scsi host里面自定义恢复策略。
scsi_eh_abort_cmds //先是取消这个超时命令,调用scsi host的接口通知下层取消
if (!scsi_eh_stu(shost, work_q, done_q)) //START_STOP_UNIT command 让这个磁盘重新就绪?
if (!scsi_eh_bus_device_reset(shost, work_q, done_q)) //如果还是不行,重启整个磁盘设备
if (!scsi_eh_bus_reset(shost, work_q, done_q)) //如果还是不行,重启整个总线 ,可能影响很多磁盘了
if (!scsi_eh_host_reset(work_q, done_q)) //重启整个host,整个host上面的所有磁盘估计都要重启了。
scsi_eh_offline_sdevs(work_q, done_q); //////////
这里的scsi错误指的不是 scsi命令返回的错误结果,而是scsi命令被分发给下层驱动之后,驱动超时了也没有返回一个结果回来,就是scsi命令在底层挂住了停止响应了。
以前linux内核会为每个scsi磁盘命令建一个timer,是使用scsi_add_timer这个函数来做的。不过后来好像从2.6.28开始改了做法,因为他们发现这样可能导致创建的timer太多,影响性能。做磁盘测试的时候,可以达到每秒3000个磁盘操作,如果每个命令的超时设为100秒,就会同时存在非常多的timer。现在应该是把这个timer计时放到block层来做了:每个队列(queue)里面的所有磁盘指令共用一个timer,每次下发一个磁盘指令的时候,把这个timer的超时时间修改为所有指令中最早的到期时间,而不是为每个指令都建立一个新的timer。
这个新的机制应该是在lwn的这篇文章里面有描述的,不过不知道怎么回事今天lwn在我这里打不开。
Block layer: solid-state storage, timeouts, affinity, and more http://lwn.net/Articles/303270/
timer的超时处理函数最后调到这个scsi_eh_scmd_add函数,就会唤醒scsi的错误恢复进程
82/**
83 * scsi_eh_scmd_add - add scsi cmd to error handling.
84 * @scmd: scmd to run eh on.
85 * @eh_flag: optional SCSI_EH flag.
86 *
87 * Return value:
88 * 0 on failure.
89 */
90int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag)
91{
92 struct Scsi_Host *shost = scmd->device->host;
93 unsigned long flags;
94 int ret = 0;
95
96 if (!shost->ehandler)
97 return 0;
98
99 spin_lock_irqsave(shost->host_lock, flags);
100 if (scsi_host_set_state(shost, SHOST_RECOVERY))
101 if (scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY))
102 goto out_unlock;
103
104 ret = 1;
105 scmd->eh_eflags |= eh_flag;
106 list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q);
107 shost->host_failed++;
108 scsi_eh_wakeup(shost); ////唤醒超时处理线程
109 out_unlock:
110 spin_unlock_irqrestore(shost->host_lock, flags);
111 return ret;
112}
//////scsi把这个scsi_times_out函数注册为block层的超时处理函数了。
114/**
115 * scsi_times_out - Timeout function for normal scsi commands.
116 * @req: request that is timing out.
117 *
118 * Notes:
119 * We do not need to lock this. There is the potential for a race
120 * only in that the normal completion handling might run, but if the
121 * normal completion function determines that the timer has already
122 * fired, then it mustn't do anything.
123 */
124enum blk_eh_timer_return scsi_times_out(struct request *req)
125{
126 struct scsi_cmnd *scmd = req->special;
127 enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *);
128 enum blk_eh_timer_return rtn = BLK_EH_NOT_HANDLED;
129
130 scsi_log_completion(scmd, TIMEOUT_ERROR);
131
132 if (scmd->device->host->transportt->eh_timed_out)
133 eh_timed_out = scmd->device->host->transportt->eh_timed_out;
134 else if (scmd->device->host->hostt->eh_timed_out)
135 eh_timed_out = scmd->device->host->hostt->eh_timed_out;
136 else
137 eh_timed_out = NULL;
138
139 if (eh_timed_out)
140 rtn = eh_timed_out(scmd);
141 switch (rtn) {
142 case BLK_EH_NOT_HANDLED:
143 break;
144 default:
145 return rtn;
146 }
147
148 if (unlikely(!scsi_eh_scmd_add(scmd, SCSI_EH_CANCEL_CMD))) { /////////////唤醒进程
149 scmd->result |= DID_TIME_OUT << 16;
150 return BLK_EH_HANDLED;
151 }
152
153 return BLK_EH_NOT_HANDLED;
154}
155
1789struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
1790{
1791 struct request_queue *q;
1792
1793 q = __scsi_alloc_queue(sdev->host, scsi_request_fn);
1794 if (!q)
1795 return NULL;
1796
1797 blk_queue_prep_rq(q, scsi_prep_fn);
1798 blk_queue_softirq_done(q, scsi_softirq_done);
1799 blk_queue_rq_timed_out(q, scsi_times_out); ////////////////////////////////队列初始化的时候注册的超时处理函数
1800 blk_queue_lld_busy(q, scsi_lld_busy);
1801 return q;
1802}
86void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn)
87{
88 q->rq_timed_out_fn = fn;
89}
block层检测到超时后,从下面这个函数调用各层事先注册的超时处理函数:
83static void blk_rq_timed_out(struct request *req)
84{
85 struct request_queue *q = req->q;
86 enum blk_eh_timer_return ret;
87
88 ret = q->rq_timed_out_fn(req); ////////////////////调用scsi层事先注册的超时处理函数
89 switch (ret) {
90 case BLK_EH_HANDLED:
91 __blk_complete_request(req);
92 break;
93 case BLK_EH_RESET_TIMER:
94 blk_clear_rq_complete(req);
95 blk_add_timer(req);
96 break;
97 case BLK_EH_NOT_HANDLED:
98 /*
99 * LLD handles this for now but in the future
100 * we can send a request msg to abort the command
101 * and we can move more of the generic scsi eh code to
102 * the blk layer.
103 */
104 break;
105 default:
106 printk(KERN_ERR "block: bad eh return: %d\n", ret);
107 break;
108 }
109}
110
111void blk_rq_timed_out_timer(unsigned long data)
112{
113 struct request_queue *q = (struct request_queue *) data;
114 unsigned long flags, uninitialized_var(next), next_set = 0;
115 struct request *rq, *tmp;
116
117 spin_lock_irqsave(q->queue_lock, flags);
118
119 list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) {
120 if (time_after_eq(jiffies, rq->deadline)) {
121 list_del_init(&rq->timeout_list);
122
123 /*
124 * Check if we raced with end io completion
125 */
126 if (blk_mark_rq_complete(rq))
127 continue;
128 blk_rq_timed_out(rq); /////////////////////block层判断是不是有磁盘命令超时了,然后调用相应的超时处理函数///////////////////////
129 }
130 if (!next_set) {
131 next = rq->deadline;
132 next_set = 1;
133 } else if (time_after(next, rq->deadline))
134 next = rq->deadline;
135 }
136
137 if (next_set && !list_empty(&q->timeout_list))
138 mod_timer(&q->timeout, round_jiffies_up(next));
139
140 spin_unlock_irqrestore(q->queue_lock, flags);
141}
161/**
162 * blk_add_timer - Start timeout timer for a single request
163 * @req: request that is about to start running.
164 *
165 * Notes:
166 * Each request has its own timer, and as it is added to the queue, we
167 * set up the timer. When the request completes, we cancel the timer.
168 */
169void blk_add_timer(struct request *req)
170{
171 struct request_queue *q = req->q;
172 unsigned long expiry;
173
174 if (!q->rq_timed_out_fn)
175 return;
176
177 BUG_ON(!list_empty(&req->timeout_list));
178 BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
179
180 if (req->timeout)
181 req->deadline = jiffies + req->timeout;
182 else {
183 req->deadline = jiffies + q->rq_timeout;
184 /*
185 * Some LLDs, like scsi, peek at the timeout to prevent
186 * a command from being retried forever.
187 */
188 req->timeout = q->rq_timeout;
189 }
190 list_add_tail(&req->timeout_list, &q->timeout_list);
191
192 /*
193 * If the timer isn't already pending or this timeout is earlier
194 * than an existing one, modify the timer. Round up to next nearest
195 * second.
196 */
197 expiry = round_jiffies_up(req->deadline);
198
199 if (!timer_pending(&q->timeout) ||
200 time_before(expiry, q->timeout.expires))
201 mod_timer(&q->timeout, expiry); //修改 timer的超时时间
202}
1639/**
1640 * blkdev_dequeue_request - dequeue request and start timeout timer
1641 * @req: request to dequeue
1642 *
1643 * Dequeue @req and start timeout timer on it. This hands off the
1644 * request to the driver.
1645 *
1646 * Block internal functions which don't want to start timer should
1647 * call elv_dequeue_request().
1648 */
1649void blkdev_dequeue_request(struct request *req)
1650{
1651 elv_dequeue_request(req->q, req);
1652
1653 /*
1654 * We are now handing the request to the hardware, add the
1655 * timeout handler.
1656 */
1657 blk_add_timer(req); ///每次从queue中取出一个磁盘命令以便传给下层驱动的时候,都重新计算timer的超时时间。
1658}
1659EXPORT_SYMBOL(blkdev_dequeue_request);
就scsi来说,是这样的,
1590static void scsi_request_fn(struct request_queue *q)
req = elv_next_request(q); ////从队列里面取出磁盘命令,这里会 调整定时器的超时
rtn = scsi_dispatch_cmd(cmd);
host->queuecommand() /// 下发给底层驱动
scsi层真正开始错误恢复处理的函数。
1544/**
1545 * scsi_unjam_host - Attempt to fix a host which has a cmd that failed.
1546 * @shost: Host to unjam.
1547 *
1548 * Notes:
1549 * When we come in here, we *know* that all commands on the bus have
1550 * either completed, failed or timed out. we also know that no further
1551 * commands are being sent to the host, so things are relatively quiet
1552 * and we have freedom to fiddle with things as we wish.
1553 *
1554 * This is only the *default* implementation. it is possible for
1555 * individual drivers to supply their own version of this function, and
1556 * if the maintainer wishes to do this, it is strongly suggested that
1557 * this function be taken as a template and modified. this function
1558 * was designed to correctly handle problems for about 95% of the
1559 * different cases out there, and it should always provide at least a
1560 * reasonable amount of error recovery.
1561 *
1562 * Any command marked 'failed' or 'timeout' must eventually have
1563 * scsi_finish_cmd() called for it. we do all of the retry stuff
1564 * here, so when we restart the host after we return it should have an
1565 * empty queue.
1566 **/
1567static void scsi_unjam_host(struct Scsi_Host *shost)
1568{
1569 unsigned long flags;
1570 LIST_HEAD(eh_work_q);
1571 LIST_HEAD(eh_done_q);
1572
1573 spin_lock_irqsave(shost->host_lock, flags);
1574 list_splice_init(&shost->eh_cmd_q, &eh_work_q);
1575 spin_unlock_irqrestore(shost->host_lock, flags);
1576
1577 SCSI_LOG_ERROR_RECOVERY(1, scsi_eh_prt_fail_stats(shost, &eh_work_q));
1578
1579 if (!scsi_eh_get_sense(&eh_work_q, &eh_done_q))
1580 if (!scsi_eh_abort_cmds(&eh_work_q, &eh_done_q))
1581 scsi_eh_ready_devs(shost, &eh_work_q, &eh_done_q);
1582
1583 scsi_eh_flush_done_q(&eh_done_q);
1584}
///scsi的错误恢复处理进程,就是上面超时被唤醒的那个
/**
1587 * scsi_error_handler - Handle errors/timeouts of SCSI cmds.
1588 * @data: Host for which we are running.
1589 *
1590 * Notes:
1591 * This is always run in the context of a kernel thread. The idea is
1592 * that we start this thing up when the kernel starts up (one per host
1593 * that we detect), and it immediately goes to sleep and waits for some
1594 * event (i.e. failure). When this takes place, we have the job of
1595 * trying to unjam the bus and restarting things.
1596 **/
1597int scsi_error_handler(void *data)
1598{
1599 struct Scsi_Host *shost = (struct Scsi_Host *) data;
1600 int rtn;
1601 DECLARE_MUTEX_LOCKED(sem);
1602
1603 /*
1604 * Flush resources
1605 */
1606
1607 daemonize("scsi_eh_%d", shost->host_no);
1608
1609 current->flags |= PF_NOFREEZE;
1610
1611 shost->eh_wait = &sem;
1612 shost->ehandler = current;
1613
1614 /*
1615 * Wake up the thread that created us.
1616 */
1617 SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent of"
1618 " scsi_eh_%d\n",shost->host_no));
1619
1620 complete(shost->eh_notify);
1621
1622 while (1) {
1623 /*
1624 * If we get a signal, it means we are supposed to go
1625 * away and die. This typically happens if the user is
1626 * trying to unload a module.
1627 */
1628 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler"
1629 " scsi_eh_%d"
1630 " sleeping\n",shost->host_no));
1631
1632 /*
1633 * Note - we always use down_interruptible with the semaphore
1634 * even if the module was loaded as part of the kernel. The
1635 * reason is that down() will cause this thread to be counted
1636 * in the load average as a running process, and down
1637 * interruptible doesn't. Given that we need to allow this
1638 * thread to die if the driver was loaded as a module, using
1639 * semaphores isn't unreasonable.
1640 */
1641 down_interruptible(&sem);
1642 if (shost->eh_kill)
1643 break;
1644
1645 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler"
1646 " scsi_eh_%d waking"
1647 " up\n",shost->host_no));
1648
1649 shost->eh_active = 1; ///////这个很重要?????
1650
1651 /*
1652 * We have a host that is failing for some reason. Figure out
1653 * what we need to do to get it up and online again (if we can).
1654 * If we fail, we end up taking the thing offline.
1655 */
1656 if (shost->hostt->eh_strategy_handler)
1657 rtn = shost->hostt->eh_strategy_handler(shost);
1658 else
1659 scsi_unjam_host(shost); ///////在这个线程里面处理错误的超时的指令
1660
1661 shost->eh_active = 0;
1662
1663 /*
1664 * Note - if the above fails completely, the action is to take
1665 * individual devices offline and flush the queue of any
1666 * outstanding requests that may have been pending. When we
1667 * restart, we restart any I/O to any other devices on the bus
1668 * which are still online.
1669 */
1670 scsi_restart_operations(shost);
1671
1672 }
1673
1674 SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d"
1675 " exiting\n",shost->host_no));
1676
1677 /*
1678 * Make sure that nobody tries to wake us up again.
1679 */
1680 shost->eh_wait = NULL;
1681
1682 /*
1683 * Knock this down too. From this point on, the host is flying
1684 * without a pilot. If this is because the module is being unloaded,
1685 * that's fine. If the user sent a signal to this thing, we are
1686 * potentially in real danger.
1687 */
1688 shost->eh_active = 0;
1689 shost->ehandler = NULL;
1690
1691 /*
1692 * If anyone is waiting for us to exit (i.e. someone trying to unload
1693 * a driver), then wake up that process to let them know we are on
1694 * the way out the door.
1695 */
1696 complete_and_exit(shost->eh_notify, 0);
1697 return 0;
1698}
一步一步地提高等级进行恢复:
1521/**
1522 * scsi_eh_ready_devs - check device ready state and recover if not.
1523 * @shost: host to be recovered.
1524 * @eh_done_q: list_head for processed commands.
1525 *
1526 **/
1527static void scsi_eh_ready_devs(struct Scsi_Host *shost,
1528 struct list_head *work_q,
1529 struct list_head *done_q)
1530{
1531 if (!scsi_eh_stu(shost, work_q, done_q))
1532 if (!scsi_eh_bus_device_reset(shost, work_q, done_q))
1533 if (!scsi_eh_bus_reset(shost, work_q, done_q))
1534 if (!scsi_eh_host_reset(work_q, done_q))
1535 scsi_eh_offline_sdevs(work_q, done_q); //////////
1536}
1537
本文详细介绍了Linux内核中针对SCSI磁盘错误的恢复机制,包括如何创建和管理错误处理线程,以及逐步升级的恢复策略。当SCSI命令超时时,内核会启动错误恢复进程,通过取消命令、设备重新就绪、设备或总线重置等手段尝试恢复。此外,文章还探讨了定时器管理的变化,以避免过多定时器影响性能。
4410

被折叠的 条评论
为什么被折叠?



