Linux 内核scsi磁盘的错误恢复机制与线程

最新推荐文章于 2026-06-22 17:38:29 发布

原创最新推荐文章于 2026-06-22 17:38:29 发布 · 2.9k 阅读

1 ·

本内容遵循CC 4.0 BY-SA版权协议

标签

#磁盘 #linux #timer #struct #list

linux 专栏收录该内容

6 篇文章

订阅专栏

本文详细介绍了Linux内核中针对SCSI磁盘错误的恢复机制，包括如何创建和管理错误处理线程，以及逐步升级的恢复策略。当SCSI命令超时时，内核会启动错误恢复进程，通过取消命令、设备重新就绪、设备或总线重置等手段尝试恢复。此外，文章还探讨了定时器管理的变化，以避免过多定时器影响性能。

1. 使用kthread_create创建线程：
struct task_struct *kthread_create(int (*threadfn)(void *data),
                                    void *data,
   const char *namefmt, ...);
这个函数可以像printk一样传入某种格式的线程名
线程创建后，不会马上运行，而是需要将kthread_create() 返回的task_struct指针传给wake_up_process()，然后通过此函数运行线程。
2. 当然，还有一个创建并启动线程的函数：kthread_run
struct task_struct *kthread_run(int (*threadfn)(void *data),
                                 void *data,
      const char *namefmt, ...);
3. 线程一旦启动起来后，会一直运行，除非该线程主动调用do_exit函数，或者其他的进程调用kthread_stop函数，结束线程的运行。
int kthread_stop(struct task_struct *thread);
kthread_stop() 并不会强行杀掉线程：它设置线程的停止标志并唤醒线程，然后一直等到线程函数返回为止（线程函数里需要用 kthread_should_stop() 来检查这个标志）。
因此，如果线程函数正在处理一个非常重要的任务，它不会被中途打断。当然，如果线程函数永远不返回并且不检查停止标志，它将永远都不会停止。
参考：Kernel threads made easy

简单地说，就是linux内核为每个下发的scsi磁盘命令加一个定时器，如果超时了底层驱动还没有处理完这个scsi命令，就会触发从scsi_unjam_host开始的一系列恢复动作。这个函数会进行一些逐渐提高等级的恢复操作，每一步之后都检查底层的scsi host是不是已经恢复到正常状态了。
默认的策略是这样的，不过底层驱动也可以在自己的scsi host模板里通过eh_strategy_handler自定义恢复策略（见后面scsi_error_handler的代码）。
scsi_eh_abort_cmds        //先是取消这个超时命令，调用scsi host的接口通知下层取消
      if (!scsi_eh_stu(shost, work_q, done_q))    //START_STOP_UNIT command 让这个磁盘重新就绪？
                 if (!scsi_eh_bus_device_reset(shost, work_q, done_q)) //如果还是不行，重启整个磁盘设备
                        if (!scsi_eh_bus_reset(shost, work_q, done_q)) //如果还是不行，重启整个总线，可能影响很多磁盘了
                                if (!scsi_eh_host_reset(work_q, done_q)) //重启整个host，整个host上面的所有磁盘估计都要重启了。
                                        scsi_eh_offline_sdevs(work_q, done_q); //////////

这里的scsi错误指的不是 scsi命令返回的错误结果，而是scsi命令被分发给下层驱动之后，驱动超时了也没有返回一个结果回来，就是scsi命令在底层挂住了停止响应了。

以前linux内核会为每个scsi磁盘命令建一个timer，是使用scsi_add_timer这个函数来做的。不过后来好像从2.6.28开始改了办法，因为他们发现这样可能导致创建的timer太多，影响性能。做磁盘测试的时候，可以达到每秒3000个磁盘操作，每个超时设为100秒的话，就会同时存在很多timer。现在应该是把这个timer计时放到block层来做了：每个队列（queue）里面所有磁盘指令共用一个timer，每次执行一个磁盘指令的时候，把这个timer的超时修改为所有指令中最早到期的那个时间，而不是为每个指令都建立一个新的timer。
这个新的机制应该是在lwn的这篇文章里面有描述的，不过不知道怎么回事今天lwn在我这里打不开。
Block layer: solid-state storage, timeouts, affinity, and more http://lwn.net/Articles/303270/

timer的超时处理函数最后调到这个scsi_eh_scmd_add函数，就会唤醒scsi的错误恢复进程
82/**
83 * scsi_eh_scmd_add - add scsi cmd to error handling.
84 * @scmd:       scmd to run eh on.
85 * @eh_flag:    optional SCSI_EH flag.
86 *
87 * Return value:
88 *      0 on failure.
89 */
90int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag)
91{
92        struct Scsi_Host *shost = scmd->device->host;
93        unsigned long flags;
94        int ret = 0;
95
96        if (!shost->ehandler)
97                return 0;
98
99        spin_lock_irqsave(shost->host_lock, flags);
100        if (scsi_host_set_state(shost, SHOST_RECOVERY))
101                if (scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY))
102                        goto out_unlock;
103
104        ret = 1;
105        scmd->eh_eflags |= eh_flag;
106        list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q);
107        shost->host_failed++;
108        scsi_eh_wakeup(shost);     ////唤醒超时处理线程
109 out_unlock:
110        spin_unlock_irqrestore(shost->host_lock, flags);
111        return ret;
112}

//////scsi把这个scsi_times_out函数注册为block层的超时处理函数了。
114/**
115 * scsi_times_out - Timeout function for normal scsi commands.
116 * @req:        request that is timing out.
117 *
118 * Notes:
119 *     We do not need to lock this. There is the potential for a race
120 *     only in that the normal completion handling might run, but if the
121 *     normal completion function determines that the timer has already
122 *     fired, then it mustn't do anything.
123 */
124enum blk_eh_timer_return scsi_times_out(struct request *req)
125{
126        struct scsi_cmnd *scmd = req->special;
127        enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *);
128        enum blk_eh_timer_return rtn = BLK_EH_NOT_HANDLED;
129
130        scsi_log_completion(scmd, TIMEOUT_ERROR);
131
132        if (scmd->device->host->transportt->eh_timed_out)
133                eh_timed_out = scmd->device->host->transportt->eh_timed_out;
134        else if (scmd->device->host->hostt->eh_timed_out)
135                eh_timed_out = scmd->device->host->hostt->eh_timed_out;
136        else
137                eh_timed_out = NULL;
138
139        if (eh_timed_out)
140                rtn = eh_timed_out(scmd);
141                switch (rtn) {
142                case BLK_EH_NOT_HANDLED:
143                        break;
144                default:
145                        return rtn;
146                }
147
148        if (unlikely(!scsi_eh_scmd_add(scmd, SCSI_EH_CANCEL_CMD))) { /////////////唤醒进程
149                scmd->result |= DID_TIME_OUT << 16;
150                return BLK_EH_HANDLED;
151        }
152
153        return BLK_EH_NOT_HANDLED;
154}
155

1789struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
1790{
1791        struct request_queue *q;
1792
1793        q = __scsi_alloc_queue(sdev->host, scsi_request_fn);
1794        if (!q)
1795                return NULL;
1796
1797        blk_queue_prep_rq(q, scsi_prep_fn);
1798        blk_queue_softirq_done(q, scsi_softirq_done);
1799        blk_queue_rq_timed_out(q, scsi_times_out); ////////////////////////////////队列初始化的时候注册的超时处理函数
1800        blk_queue_lld_busy(q, scsi_lld_busy);
1801        return q;
1802}

86void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn)
87{
88        q->rq_timed_out_fn = fn;
89}

block层发现请求超时后，从下面这个函数调用下层事先注册的超时处理函数：
83static void blk_rq_timed_out(struct request *req)
84{
85        struct request_queue *q = req->q;
86        enum blk_eh_timer_return ret;
87
88        ret = q->rq_timed_out_fn(req);    ////////////////////调用scsi层事先注册的超时处理函数
89        switch (ret) {
90        case BLK_EH_HANDLED:
91                __blk_complete_request(req);
92                break;
93        case BLK_EH_RESET_TIMER:
94                blk_clear_rq_complete(req);
95                blk_add_timer(req);
96                break;
97        case BLK_EH_NOT_HANDLED:
98                /*
99                 * LLD handles this for now but in the future
100                 * we can send a request msg to abort the command
101                 * and we can move more of the generic scsi eh code to
102                 * the blk layer.
103                 */
104                break;
105        default:
106                printk(KERN_ERR "block: bad eh return: %d\n", ret);
107                break;
108        }
109}
110
111void blk_rq_timed_out_timer(unsigned long data)
112{
113        struct request_queue *q = (struct request_queue *) data;
114        unsigned long flags, uninitialized_var(next), next_set = 0;
115        struct request *rq, *tmp;
116
117        spin_lock_irqsave(q->queue_lock, flags);
118
119        list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) {
120                if (time_after_eq(jiffies, rq->deadline)) {
121                        list_del_init(&rq->timeout_list);
122
123                        /*
124                         * Check if we raced with end io completion
125                         */
126                        if (blk_mark_rq_complete(rq))
127                                continue;
128                        blk_rq_timed_out(rq); ///// block层判断是不是有磁盘命令超时了，然后调用相应的超时处理函数
129                }
130                if (!next_set) {
131                        next = rq->deadline;
132                        next_set = 1;
133                } else if (time_after(next, rq->deadline))
134                        next = rq->deadline;
135        }
136
137        if (next_set && !list_empty(&q->timeout_list))
138                mod_timer(&q->timeout, round_jiffies_up(next));
139
140        spin_unlock_irqrestore(q->queue_lock, flags);
141}

   161/**
162 * blk_add_timer - Start timeout timer for a single request
163 * @req:        request that is about to start running.
164 *
165 * Notes:
166 *    Each request has its own timer, and as it is added to the queue, we
167 *    set up the timer. When the request completes, we cancel the timer.
168 */
169void blk_add_timer(struct request *req)
170{
171        struct request_queue *q = req->q;
172        unsigned long expiry;
173
174        if (!q->rq_timed_out_fn)
175                return;
176
177        BUG_ON(!list_empty(&req->timeout_list));
178        BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
179
180        if (req->timeout)
181                req->deadline = jiffies + req->timeout;
182        else {
183                req->deadline = jiffies + q->rq_timeout;
184                /*
185                 * Some LLDs, like scsi, peek at the timeout to prevent
186                 * a command from being retried forever.
187                 */
188                req->timeout = q->rq_timeout;
189        }
190        list_add_tail(&req->timeout_list, &q->timeout_list);
191
192        /*
193         * If the timer isn't already pending or this timeout is earlier
194         * than an existing one, modify the timer. Round up to next nearest
195         * second.
196         */
197        expiry = round_jiffies_up(req->deadline);
198
199        if (!timer_pending(&q->timeout) ||
200            time_before(expiry, q->timeout.expires))
201                mod_timer(&q->timeout, expiry);                 //修改 timer的超时时间
202}

1639/**
1640 * blkdev_dequeue_request - dequeue request and start timeout timer
1641 * @req: request to dequeue
1642 *
1643 * Dequeue @req and start timeout timer on it. This hands off the
1644 * request to the driver.
1645 *
1646 * Block internal functions which don't want to start timer should
1647 * call elv_dequeue_request().
1648 */
1649void blkdev_dequeue_request(struct request *req)
1650{
1651        elv_dequeue_request(req->q, req);
1652
1653        /*
1654         * We are now handing the request to the hardware, add the
1655         * timeout handler.
1656         */
1657        blk_add_timer(req);        ///每次从queue中取出一个磁盘命令以便传给下层驱动的时候，都重新计算timer的超时时间。
1658}
1659EXPORT_SYMBOL(blkdev_dequeue_request);

就scsi来说，是这样的，

1590static void scsi_request_fn(struct request_queue *q)

                   req = elv_next_request(q);   ////从队列里面取出磁盘命令；随后出队（blkdev_dequeue_request）时才会调用blk_add_timer调整定时器的超时
                   rtn = scsi_dispatch_cmd(cmd);
                                 host->queuecommand()   /// 下发给底层驱动

scsi层真正开始错误恢复处理的函数。

1544/**
1545 * scsi_unjam_host - Attempt to fix a host which has a cmd that failed.
1546 * @shost:      Host to unjam.
1547 *
1548 * Notes:
1549 *    When we come in here, we *know* that all commands on the bus have
1550 *    either completed, failed or timed out. we also know that no further
1551 *    commands are being sent to the host, so things are relatively quiet
1552 *    and we have freedom to fiddle with things as we wish.
1553 *
1554 *    This is only the *default* implementation. it is possible for
1555 *    individual drivers to supply their own version of this function, and
1556 *    if the maintainer wishes to do this, it is strongly suggested that
1557 *    this function be taken as a template and modified. this function
1558 *    was designed to correctly handle problems for about 95% of the
1559 *    different cases out there, and it should always provide at least a
1560 *    reasonable amount of error recovery.
1561 *
1562 *    Any command marked 'failed' or 'timeout' must eventually have
1563 *    scsi_finish_cmd() called for it. we do all of the retry stuff
1564 *    here, so when we restart the host after we return it should have an
1565 *    empty queue.
1566 **/
1567static void scsi_unjam_host(struct Scsi_Host *shost)
1568{
1569        unsigned long flags;
1570        LIST_HEAD(eh_work_q);
1571        LIST_HEAD(eh_done_q);
1572
1573        spin_lock_irqsave(shost->host_lock, flags);
1574        list_splice_init(&shost->eh_cmd_q, &eh_work_q);
1575        spin_unlock_irqrestore(shost->host_lock, flags);
1576
1577        SCSI_LOG_ERROR_RECOVERY(1, scsi_eh_prt_fail_stats(shost, &eh_work_q));
1578
1579        if (!scsi_eh_get_sense(&eh_work_q, &eh_done_q))
1580                if (!scsi_eh_abort_cmds(&eh_work_q, &eh_done_q))
1581                        scsi_eh_ready_devs(shost, &eh_work_q, &eh_done_q);
1582
1583        scsi_eh_flush_done_q(&eh_done_q);
1584}

///scsi的错误恢复处理进程，就是上面超时被唤醒的那个

/**
1587 * scsi_error_handler - Handle errors/timeouts of SCSI cmds.
1588 * @data:       Host for which we are running.
1589 *
1590 * Notes:
1591 *    This is always run in the context of a kernel thread. The idea is
1592 *    that we start this thing up when the kernel starts up (one per host
1593 *    that we detect), and it immediately goes to sleep and waits for some
1594 *    event (i.e. failure). When this takes place, we have the job of
1595 *    trying to unjam the bus and restarting things.
1596 **/
1597int scsi_error_handler(void *data)
1598{
1599        struct Scsi_Host *shost = (struct Scsi_Host *) data;
1600        int rtn;
1601        DECLARE_MUTEX_LOCKED(sem);
1602
1603        /*
1604         *    Flush resources
1605         */
1606
1607        daemonize("scsi_eh_%d", shost->host_no);
1608
1609        current->flags |= PF_NOFREEZE;
1610
1611        shost->eh_wait = &sem;
1612        shost->ehandler = current;
1613
1614        /*
1615         * Wake up the thread that created us.
1616         */
1617        SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent of"
1618                                          " scsi_eh_%d\n",shost->host_no));
1619
1620        complete(shost->eh_notify);
1621
1622        while (1) {
1623                /*
1624                 * If we get a signal, it means we are supposed to go
1625                 * away and die. This typically happens if the user is
1626                 * trying to unload a module.
1627                 */
1628                SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler"
1629                                                  " scsi_eh_%d"
1630                                                  " sleeping\n",shost->host_no));
1631
1632                /*
1633                 * Note - we always use down_interruptible with the semaphore
1634                 * even if the module was loaded as part of the kernel. The
1635                 * reason is that down() will cause this thread to be counted
1636                 * in the load average as a running process, and down
1637                 * interruptible doesn't. Given that we need to allow this
1638                 * thread to die if the driver was loaded as a module, using
1639                 * semaphores isn't unreasonable.
1640                 */
1641                down_interruptible(&sem);
1642                if (shost->eh_kill)
1643                        break;
1644
1645                SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler"
1646                                                  " scsi_eh_%d waking"
1647                                                  " up\n",shost->host_no));
1648
1649                shost->eh_active = 1;               ///////这个很重要？？？？？
1650
1651                /*
1652                 * We have a host that is failing for some reason. Figure out
1653                 * what we need to do to get it up and online again (if we can).
1654                 * If we fail, we end up taking the thing offline.
1655                 */
1656                if (shost->hostt->eh_strategy_handler)
1657                        rtn = shost->hostt->eh_strategy_handler(shost);
1658                else
1659                        scsi_unjam_host(shost);            ///////在这个线程里面处理错误的超时的指令
1660
1661                shost->eh_active = 0;
1662
1663                /*
1664                 * Note - if the above fails completely, the action is to take
1665                 * individual devices offline and flush the queue of any
1666                 * outstanding requests that may have been pending. When we
1667                 * restart, we restart any I/O to any other devices on the bus
1668                 * which are still online.
1669                 */
1670                scsi_restart_operations(shost);
1671
1672        }
1673
1674        SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d"
1675                                          " exiting\n",shost->host_no));
1676
1677        /*
1678         * Make sure that nobody tries to wake us up again.
1679         */
1680        shost->eh_wait = NULL;
1681
1682        /*
1683         * Knock this down too. From this point on, the host is flying
1684         * without a pilot. If this is because the module is being unloaded,
1685         * that's fine. If the user sent a signal to this thing, we are
1686         * potentially in real danger.
1687         */
1688        shost->eh_active = 0;
1689        shost->ehandler = NULL;
1690
1691        /*
1692         * If anyone is waiting for us to exit (i.e. someone trying to unload
1693         * a driver), then wake up that process to let them know we are on
1694         * the way out the door.
1695         */
1696        complete_and_exit(shost->eh_notify, 0);
1697        return 0;
1698}

一步一步的加强等级进行恢复

1521/**
1522 * scsi_eh_ready_devs - check device ready state and recover if not.
1523 * @shost:      host to be recovered.
1524 * @eh_done_q: list_head for processed commands.
1525 *
1526 **/
1527static void scsi_eh_ready_devs(struct Scsi_Host *shost,
1528                               struct list_head *work_q,
1529                               struct list_head *done_q)
1530{
1531        if (!scsi_eh_stu(shost, work_q, done_q))
1532                if (!scsi_eh_bus_device_reset(shost, work_q, done_q))
1533                        if (!scsi_eh_bus_reset(shost, work_q, done_q))
1534                                if (!scsi_eh_host_reset(work_q, done_q))
1535                                        scsi_eh_offline_sdevs(work_q, done_q); //////////
1536}
1537