Linux 内核scsi磁盘的错误恢复机制 与线程

本文详细介绍了Linux内核中针对SCSI磁盘错误的恢复机制,包括如何创建和管理错误处理线程,以及逐步升级的恢复策略。当SCSI命令超时时,内核会启动错误恢复进程,通过取消命令、设备重新就绪、设备或总线重置等手段尝试恢复。此外,文章还探讨了定时器管理的变化,以避免过多定时器影响性能。

 1. 使用kthread_create创建线程:
    struct task_struct *kthread_create(int (*threadfn)(void *data),
                                       void *data,
       const char *namefmt, ...);
这个函数可以像printk一样传入某种格式的线程名
线程创建后,不会马上运行,而是需要将kthread_create() 返回的task_struct指针传给wake_up_process(),然后通过此函数运行线程。
2. 当然,还有一个创建并启动线程的函数:kthread_run
   struct task_struct *kthread_run(int (*threadfn)(void *data),
                                    void *data,
                                    const char *namefmt, ...);
3. 线程一旦启动起来后,会一直运行,除非该线程主动调用do_exit函数,或者其他的进程调用kthread_stop函数,结束线程的运行。
    int kthread_stop(struct task_struct *thread);
kthread_stop() 并不是靠打断线程来工作的:它设置该线程的停止标志(线程函数通过 kthread_should_stop() 检查),唤醒线程,然后等待线程函数返回。
因此,如果线程函数正在处理一个非常重要的任务,它不会被中途打断。当然,如果线程函数永远不返回并且从不检查停止标志,它将永远都不会停止。
参考:Kernel threads made easy

简单的说,就是linux内核为每个下发的scsi磁盘命令加个定时器,如果超时了下面底层驱动还没有处理完这个scsi命令的话,就开始触发scsi_unjam_host开始的动作。这个函数会进行一些逐渐提高等级的恢复操作,然后看你这个底层的scsi host是不是可以恢复到正常状态了。
 默认的策略如下;底层驱动也可以通过 hostt->eh_strategy_handler 提供自己的恢复策略(见后面的 scsi_error_handler)。
  scsi_eh_abort_cmds        //先是取消这个超时命令,调用scsi host的接口通知下层取消
      if (!scsi_eh_stu(shost, work_q, done_q))    //START_STOP_UNIT command  让这个磁盘重新就绪?
                 if (!scsi_eh_bus_device_reset(shost, work_q, done_q))  //如果还是不行,重启整个磁盘设备
                        if (!scsi_eh_bus_reset(shost, work_q, done_q))  //如果还是不行,重启整个总线 ,可能影响很多磁盘了
                                if (!scsi_eh_host_reset(work_q, done_q))  //重启整个host,整个host上面的所有磁盘估计都要重启了。
                                        scsi_eh_offline_sdevs(work_q, done_q);  //////////
 
 
 
 
  这里的scsi错误指的不是 scsi命令返回的错误结果,而是scsi命令被分发给下层驱动之后,驱动超时了也没有返回一个结果回来,就是scsi命令在底层挂住了停止响应了。
 
  以前linux内核会为每个scsi磁盘命令建一个timer,是用scsi_add_timer这个函数来做的。不过大约从2.6.28开始改了办法,因为这样可能导致创建的timer太多,影响性能:做磁盘测试的时候,可以达到每秒3000个磁盘操作,每个超时设为100秒的话,就会同时存在很多timer。新的做法是把timer计时放到block层来做,每个队列(queue)里面所有磁盘指令共用一个timer;每次下发一个磁盘指令的时候,如果它的到期时间比timer当前的到期时间更早(或者timer尚未启动),就把timer改到这个更早的时间,而不是为每个指令都建立一个新的timer。
 这个新的机制应该是在lwn的这篇文章里面有描述的,不过不知道怎么回事今天lwn在我这里打不开。
 Block layer: solid-state storage, timeouts, affinity, and more  http://lwn.net/Articles/303270/
 
timer的超时处理函数最后调到这个scsi_eh_scmd_add函数,就会唤醒scsi的错误恢复进程
  82/**
  83 * scsi_eh_scmd_add - add scsi cmd to error handling.
  84 * @scmd:       scmd to run eh on.
  85 * @eh_flag:    optional SCSI_EH flag.
  86 *
  87 * Return value:
  88 *      0 on failure.
     *      That is: 0 when the host has no error-handler thread, or when
     *      both scsi_host_set_state() calls below return nonzero; 1 once
     *      @scmd has been queued on shost->eh_cmd_q.
  89 */
  90int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag)
  91{
  92        struct Scsi_Host *shost = scmd->device->host;
  93        unsigned long flags;
  94        int ret = 0;
  95
  96        if (!shost->ehandler)      /* no error-handler thread recorded for this host */
  97                return 0;
  98
  99        spin_lock_irqsave(shost->host_lock, flags);
 100        if (scsi_host_set_state(shost, SHOST_RECOVERY))      /* try RECOVERY first ... */
 101                if (scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY))      /* ... then CANCEL_RECOVERY */
 102                        goto out_unlock;      /* both returned nonzero: ret stays 0 */
 103
 104        ret = 1;
 105        scmd->eh_eflags |= eh_flag;
 106        list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q);      /* queue the command for the eh thread */
 107        shost->host_failed++;
 108        scsi_eh_wakeup(shost);     //// wake the timeout/error-handling thread
 109 out_unlock:
 110        spin_unlock_irqrestore(shost->host_lock, flags);
 111        return ret;
 112}
 

 
 
 //////scsi把这个scsi_times_out函数注册为block层的超时处理函数了。
  114/**
 115 * scsi_times_out - Timeout function for normal scsi commands.
 116 * @req:        request that is timing out.
 117 *
 118 * Notes:
 119 *     We do not need to lock this.  There is the potential for a race
 120 *     only in that the normal completion handling might run, but if the
 121 *     normal completion function determines that the timer has already
 122 *     fired, then it mustn't do anything.
 123 */
 124enum blk_eh_timer_return scsi_times_out(struct request *req)
 125{
 126        struct scsi_cmnd *scmd = req->special;
 127        enum blk_eh_timer_return (*eh_timed_out)(struct scsi_cmnd *);
 128        enum blk_eh_timer_return rtn = BLK_EH_NOT_HANDLED;
 129
 130        scsi_log_completion(scmd, TIMEOUT_ERROR);
 131
 132        if (scmd->device->host->transportt->eh_timed_out)
 133                eh_timed_out = scmd->device->host->transportt->eh_timed_out;
 134        else if (scmd->device->host->hostt->eh_timed_out)
 135                eh_timed_out = scmd->device->host->hostt->eh_timed_out;
 136        else
 137                eh_timed_out = NULL;
 138
 139        if (eh_timed_out)
 140                rtn = eh_timed_out(scmd);
 141                switch (rtn) {
 142                case BLK_EH_NOT_HANDLED:
 143                        break;
 144                default:
 145                        return rtn;
 146                }
 147
 148        if (unlikely(!scsi_eh_scmd_add(scmd, SCSI_EH_CANCEL_CMD))) { /////////////唤醒进程
 149                scmd->result |= DID_TIME_OUT << 16;
 150                return BLK_EH_HANDLED;
 151        }
 152
 153        return BLK_EH_NOT_HANDLED;
 154}
 155
 
 
 
 
 
 
 
  /*
   * scsi_alloc_queue - allocate a request queue for @sdev and register the
   * SCSI callbacks on it.  Returns the queue, or NULL if
   * __scsi_alloc_queue() failed.
   */
  1789struct request_queue *scsi_alloc_queue(struct scsi_device *sdev)
1790{
1791        struct request_queue *q;
1792
1793        q = __scsi_alloc_queue(sdev->host, scsi_request_fn);
1794        if (!q)
1795                return NULL;
1796
1797        blk_queue_prep_rq(q, scsi_prep_fn);
1798        blk_queue_softirq_done(q, scsi_softirq_done);
1799        blk_queue_rq_timed_out(q, scsi_times_out); //////////////////////////////// the timeout handler is registered here, when the queue is initialised
1800        blk_queue_lld_busy(q, scsi_lld_busy);
1801        return q;
1802}






  /*
   * blk_queue_rq_timed_out - record @fn as the queue's request-timeout
   * callback.  The callback is stored in q->rq_timed_out_fn.
   */
  86void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn)
  87{
  88        q->rq_timed_out_fn = fn;
  89}
 
 
 
   
  block层的超时处理从下面这个函数开始,它会调用下层(这里是scsi层)注册的超时函数:
  /*
   * blk_rq_timed_out - run the queue's timeout callback for @req and act on
   * its verdict: complete the request, re-arm its timer, or leave it alone.
   */
  83static void blk_rq_timed_out(struct request *req)
  84{
  85        struct request_queue *q = req->q;
  86        enum blk_eh_timer_return ret;
  87
  88        ret = q->rq_timed_out_fn(req);    //////////////////// call the timeout handler registered on the queue (scsi_times_out for SCSI queues)
  89        switch (ret) {
  90        case BLK_EH_HANDLED:      /* callback is done with it: complete the request */
  91                __blk_complete_request(req);
  92                break;
  93        case BLK_EH_RESET_TIMER:      /* callback wants more time: clear the complete mark and re-add the timer */
  94                blk_clear_rq_complete(req);
  95                blk_add_timer(req);
  96                break;
  97        case BLK_EH_NOT_HANDLED:
  98                /*
  99                 * LLD handles this for now but in the future
 100                 * we can send a request msg to abort the command
 101                 * and we can move more of the generic scsi eh code to
 102                 * the blk layer.
 103                 */
 104                break;
 105        default:
 106                printk(KERN_ERR "block: bad eh return: %d\n", ret);
 107                break;
 108        }
 109}
 110
 /*
  * blk_rq_timed_out_timer - timer callback for a queue's timeout timer.
  * @data: the request_queue pointer, cast to unsigned long.
  *
  * Under q->queue_lock, walks q->timeout_list, hands expired requests to
  * blk_rq_timed_out(), and re-arms q->timeout if the list is not empty.
  */
 111void blk_rq_timed_out_timer(unsigned long data)
 112{
 113        struct request_queue *q = (struct request_queue *) data;
 114        unsigned long flags, uninitialized_var(next), next_set = 0;
 115        struct request *rq, *tmp;
 116
 117        spin_lock_irqsave(q->queue_lock, flags);
 118
 119        list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) {
 120                if (time_after_eq(jiffies, rq->deadline)) {      /* deadline reached or passed */
 121                        list_del_init(&rq->timeout_list);
 122
 123                        /*
 124                         * Check if we raced with end io completion
 125                         */
 126                        if (blk_mark_rq_complete(rq))
 127                                continue;
 128                        blk_rq_timed_out(rq);  ///////////////////// the block layer has found a timed-out request and calls the corresponding timeout handler ///////////////////////
 129                }
 130                if (!next_set) {      /* first deadline seen becomes the candidate */
 131                        next = rq->deadline;
 132                        next_set = 1;
 133                } else if (time_after(next, rq->deadline))      /* keep the earliest deadline */
 134                        next = rq->deadline;
 135        }
 136
 137        if (next_set && !list_empty(&q->timeout_list))      /* requests remain: re-arm the queue timer */
 138                mod_timer(&q->timeout, round_jiffies_up(next));
 139
 140        spin_unlock_irqrestore(q->queue_lock, flags);
 141}
 
 
 
 
 
 
 
 
   161/**
 162 * blk_add_timer - Start timeout timer for a single request
 163 * @req:        request that is about to start running.
 164 *
 165 * Notes:
 166 *    Each request has its own timer, and as it is added to the queue, we
 167 *    set up the timer. When the request completes, we cancel the timer.
     *
     *    NOTE(review): the code below does not create a per-request timer.
     *    It records a deadline in @req, links @req on q->timeout_list and
     *    (re)arms the single q->timeout timer when needed.
 168 */
 169void blk_add_timer(struct request *req)
 170{
 171        struct request_queue *q = req->q;
 172        unsigned long expiry;
 173
 174        if (!q->rq_timed_out_fn)      /* no timeout callback on this queue: nothing to arm */
 175                return;
 176
 177        BUG_ON(!list_empty(&req->timeout_list));
 178        BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags));
 179
 180        if (req->timeout)      /* a nonzero per-request timeout wins over the queue default */
 181                req->deadline = jiffies + req->timeout;
 182        else {
 183                req->deadline = jiffies + q->rq_timeout;
 184                /*
 185                 * Some LLDs, like scsi, peek at the timeout to prevent
 186                 * a command from being retried forever.
 187                 */
 188                req->timeout = q->rq_timeout;
 189        }
 190        list_add_tail(&req->timeout_list, &q->timeout_list);
 191
 192        /*
 193         * If the timer isn't already pending or this timeout is earlier
 194         * than an existing one, modify the timer. Round up to next nearest
 195         * second.
 196         */
 197        expiry = round_jiffies_up(req->deadline);
 198
 199        if (!timer_pending(&q->timeout) ||
 200            time_before(expiry, q->timeout.expires))
 201                mod_timer(&q->timeout, expiry);                 // move the queue timer's expiry to this earlier time
 202}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 1639/**
1640 * blkdev_dequeue_request - dequeue request and start timeout timer
1641 * @req: request to dequeue
1642 *
1643 * Dequeue @req and start timeout timer on it.  This hands off the
1644 * request to the driver.
1645 *
1646 * Block internal functions which don't want to start timer should
1647 * call elv_dequeue_request().
1648 */
1649void blkdev_dequeue_request(struct request *req)
1650{
1651        elv_dequeue_request(req->q, req);
1652
1653        /*
1654         * We are now handing the request to the hardware, add the
1655         * timeout handler.
1656         */
1657        blk_add_timer(req);        /// every time a command is taken off the queue to be passed to the lower-level driver, the timer's expiry is recomputed
1658}
1659EXPORT_SYMBOL(blkdev_dequeue_request);









就scsi来说,是这样的,

1590static void scsi_request_fn(struct request_queue *q)

                   req = elv_next_request(q);   ////从队列里面取出下一个磁盘命令;随后出队(blkdev_dequeue_request)时会通过blk_add_timer调整定时器的超时
                   rtn = scsi_dispatch_cmd(cmd);
                                 host->queuecommand()   /// 下发给底层驱动




scsi层真正开始错误恢复处理的函数。

1544/**
1545 * scsi_unjam_host - Attempt to fix a host which has a cmd that failed.
1546 * @shost:      Host to unjam.
1547 *
1548 * Notes:
1549 *    When we come in here, we *know* that all commands on the bus have
1550 *    either completed, failed or timed out.  we also know that no further
1551 *    commands are being sent to the host, so things are relatively quiet
1552 *    and we have freedom to fiddle with things as we wish.
1553 *
1554 *    This is only the *default* implementation.  it is possible for
1555 *    individual drivers to supply their own version of this function, and
1556 *    if the maintainer wishes to do this, it is strongly suggested that
1557 *    this function be taken as a template and modified.  this function
1558 *    was designed to correctly handle problems for about 95% of the
1559 *    different cases out there, and it should always provide at least a
1560 *    reasonable amount of error recovery.
1561 *
1562 *    Any command marked 'failed' or 'timeout' must eventually have
1563 *    scsi_finish_cmd() called for it.  we do all of the retry stuff
1564 *    here, so when we restart the host after we return it should have an
1565 *    empty queue.
1566 **/
1567static void scsi_unjam_host(struct Scsi_Host *shost)
1568{
1569        unsigned long flags;
1570        LIST_HEAD(eh_work_q);
1571        LIST_HEAD(eh_done_q);
1572
1573        spin_lock_irqsave(shost->host_lock, flags);
1574        list_splice_init(&shost->eh_cmd_q, &eh_work_q);      /* move the host's pending eh commands onto the local work list */
1575        spin_unlock_irqrestore(shost->host_lock, flags);
1576
1577        SCSI_LOG_ERROR_RECOVERY(1, scsi_eh_prt_fail_stats(shost, &eh_work_q));
1578
        /* Escalate: each later step runs only when the previous one returned 0. */
1579        if (!scsi_eh_get_sense(&eh_work_q, &eh_done_q))
1580                if (!scsi_eh_abort_cmds(&eh_work_q, &eh_done_q))
1581                        scsi_eh_ready_devs(shost, &eh_work_q, &eh_done_q);
1582
1583        scsi_eh_flush_done_q(&eh_done_q);      /* always flush the done list, whichever step stopped the chain */
1584}













///scsi的错误恢复处理进程,就是上面超时被唤醒的那个

/**
1587 * scsi_error_handler - Handle errors/timeouts of SCSI cmds.
1588 * @data:       Host for which we are running.
1589 *
1590 * Notes:
1591 *    This is always run in the context of a kernel thread.  The idea is
1592 *    that we start this thing up when the kernel starts up (one per host
1593 *    that we detect), and it immediately goes to sleep and waits for some
1594 *    event (i.e. failure).  When this takes place, we have the job of
1595 *    trying to unjam the bus and restarting things.
     *
     *    NOTE(review): in this excerpt 'rtn' is assigned from the host's
     *    eh_strategy_handler but never read, and the return value of
     *    down_interruptible() is ignored.
1596 **/
1597int scsi_error_handler(void *data)
1598{
1599        struct Scsi_Host *shost = (struct Scsi_Host *) data;
1600        int rtn;
1601        DECLARE_MUTEX_LOCKED(sem);
1602
1603        /*
1604         *    Flush resources
1605         */
1606
1607        daemonize("scsi_eh_%d", shost->host_no);
1608
1609        current->flags |= PF_NOFREEZE;
1610
1611        shost->eh_wait = &sem;      /* publish the semaphore this thread sleeps on */
1612        shost->ehandler = current;      /* record this task as the host's error handler */
1613
1614        /*
1615         * Wake up the thread that created us.
1616         */
1617        SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent of"
1618                                          " scsi_eh_%d\n",shost->host_no));
1619
1620        complete(shost->eh_notify);
1621
1622        while (1) {
1623                /*
1624                 * If we get a signal, it means we are supposed to go
1625                 * away and die.  This typically happens if the user is
1626                 * trying to unload a module.
1627                 */
1628                SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler"
1629                                                  " scsi_eh_%d"
1630                                                  " sleeping\n",shost->host_no));
1631
1632                /*
1633                 * Note - we always use down_interruptible with the semaphore
1634                 * even if the module was loaded as part of the kernel.  The
1635                 * reason is that down() will cause this thread to be counted
1636                 * in the load average as a running process, and down
1637                 * interruptible doesn't.  Given that we need to allow this
1638                 * thread to die if the driver was loaded as a module, using
1639                 * semaphores isn't unreasonable.
1640                 */
1641                down_interruptible(&sem);
1642                if (shost->eh_kill)      /* asked to exit: leave the loop */
1643                        break;
1644
1645                SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler"
1646                                                  " scsi_eh_%d waking"
1647                                                  " up\n",shost->host_no));
1648
1649                shost->eh_active = 1;               /////// set while recovery runs; cleared again right after the handler returns
1650
1651                /*
1652                 * We have a host that is failing for some reason.  Figure out
1653                 * what we need to do to get it up and online again (if we can).
1654                 * If we fail, we end up taking the thing offline.
1655                 */
1656                if (shost->hostt->eh_strategy_handler)
1657                        rtn = shost->hostt->eh_strategy_handler(shost);
1658                else
1659                        scsi_unjam_host(shost);            /////// failed / timed-out commands are handled here, inside this thread
1660
1661                shost->eh_active = 0;
1662
1663                /*
1664                 * Note - if the above fails completely, the action is to take
1665                 * individual devices offline and flush the queue of any
1666                 * outstanding requests that may have been pending.  When we
1667                 * restart, we restart any I/O to any other devices on the bus
1668                 * which are still online.
1669                 */
1670                scsi_restart_operations(shost);
1671
1672        }
1673
1674        SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d"
1675                                          " exiting\n",shost->host_no));
1676
1677        /*
1678         * Make sure that nobody tries to wake us up again.
1679         */
1680        shost->eh_wait = NULL;
1681
1682        /*
1683         * Knock this down too.  From this point on, the host is flying
1684         * without a pilot.  If this is because the module is being unloaded,
1685         * that's fine.  If the user sent a signal to this thing, we are
1686         * potentially in real danger.
1687         */
1688        shost->eh_active = 0;
1689        shost->ehandler = NULL;
1690
1691        /*
1692         * If anyone is waiting for us to exit (i.e. someone trying to unload
1693         * a driver), then wake up that process to let them know we are on
1694         * the way out the door.
1695         */
1696        complete_and_exit(shost->eh_notify, 0);
1697        return 0;
1698}








一步一步的加强等级进行恢复

1521/**
1522 * scsi_eh_ready_devs - check device ready state and recover if not.
1523 * @shost:      host to be recovered.
1524 * @eh_done_q:  list_head for processed commands.
     *
     * NOTE(review): the parameters are actually named @work_q and @done_q;
     * both are passed unchanged to every recovery step below.
     *
     * Each step runs only if the previous one returned 0, in this order:
     * scsi_eh_stu, scsi_eh_bus_device_reset, scsi_eh_bus_reset,
     * scsi_eh_host_reset, and finally scsi_eh_offline_sdevs.
1525 *
1526 **/
1527static void scsi_eh_ready_devs(struct Scsi_Host *shost,
1528                               struct list_head *work_q,
1529                               struct list_head *done_q)
1530{
1531        if (!scsi_eh_stu(shost, work_q, done_q))
1532                if (!scsi_eh_bus_device_reset(shost, work_q, done_q))
1533                        if (!scsi_eh_bus_reset(shost, work_q, done_q))
1534                                if (!scsi_eh_host_reset(work_q, done_q))
1535                                        scsi_eh_offline_sdevs(work_q, done_q);  ////////// last step of the chain, reached only when every step above returned 0
1536}
1537

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值