1 /*
2  *  scsi_error.c Copyright (C) 1997 Eric Youngdale
3  *
4  *  SCSI error/timeout handling
5  *      Initial versions: Eric Youngdale.  Based upon conversations with
6  *                        Leonard Zubkoff and David Miller at Linux Expo,
7  *                        ideas originating from all over the place.
8  *
9  */
10 
11 #define __NO_VERSION__
12 #include <linux/module.h>
13 
14 #include <linux/sched.h>
15 #include <linux/timer.h>
16 #include <linux/string.h>
17 #include <linux/slab.h>
18 #include <linux/ioport.h>
19 #include <linux/kernel.h>
20 #include <linux/stat.h>
21 #include <linux/blk.h>
22 #include <linux/interrupt.h>
23 #include <linux/delay.h>
24 #include <linux/smp_lock.h>
25 
26 #define __KERNEL_SYSCALLS__
27 
28 #include <linux/unistd.h>
29 
30 #include <asm/system.h>
31 #include <asm/irq.h>
32 #include <asm/dma.h>
33 
34 #include "scsi.h"
35 #include "hosts.h"
36 #include "constants.h"
37 
38 /*
39  * We must always allow SHUTDOWN_SIGS.  Even if we are not a module,
40  * the host drivers that we are using may be loaded as modules, and
41  * when we unload these,  we need to ensure that the error handler thread
42  * can be shut down.
43  *
44  * Note - when we unload a module, we send a SIGHUP.  We mustn't
45  * enable SIGTERM, as this is how the init shuts things down when you
46  * go to single-user mode.  For that matter, init also sends SIGKILL,
47  * so we mustn't enable that one either.  We use SIGHUP instead.  Other
48  * options would be SIGPWR, I suppose.
49  */
50 #define SHUTDOWN_SIGS	(sigmask(SIGHUP))
51 
52 #ifdef DEBUG
53 #define SENSE_TIMEOUT SCSI_TIMEOUT
54 #define ABORT_TIMEOUT SCSI_TIMEOUT
55 #define RESET_TIMEOUT SCSI_TIMEOUT
56 #else
57 #define SENSE_TIMEOUT (10*HZ)
58 #define RESET_TIMEOUT (2*HZ)
59 #define ABORT_TIMEOUT (15*HZ)
60 #endif
61 
62 #define STATIC
63 
/*
 * Delays handed to scsi_sleep() after a bus reset and after a host reset.
 *
 * These should *probably* be handled by the host itself.
 * Since it is allowed to sleep, it probably should.
 *
 * The expansions are parenthesized so that the macros stay correct when
 * they are used inside a larger expression (e.g. as a divisor).
 */
#define BUS_RESET_SETTLE_TIME   (5*HZ)
#define HOST_RESET_SETTLE_TIME  (10*HZ)
70 
71 
72 static const char RCSid[] = "$Header: /mnt/ide/home/eric/CVSROOT/linux/drivers/scsi/scsi_error.c,v 1.10 1997/12/08 04:50:35 eric Exp $";
73 
74 STATIC int scsi_check_sense(Scsi_Cmnd * SCpnt);
75 STATIC int scsi_request_sense(Scsi_Cmnd *);
76 STATIC void scsi_send_eh_cmnd(Scsi_Cmnd * SCpnt, int timeout);
77 STATIC int scsi_try_to_abort_command(Scsi_Cmnd *, int);
78 STATIC int scsi_test_unit_ready(Scsi_Cmnd *);
79 STATIC int scsi_try_bus_device_reset(Scsi_Cmnd *, int timeout);
80 STATIC int scsi_try_bus_reset(Scsi_Cmnd *);
81 STATIC int scsi_try_host_reset(Scsi_Cmnd *);
82 STATIC int scsi_unit_is_ready(Scsi_Cmnd *);
83 STATIC void scsi_eh_action_done(Scsi_Cmnd *, int);
84 STATIC int scsi_eh_retry_command(Scsi_Cmnd *);
85 STATIC int scsi_eh_completed_normally(Scsi_Cmnd * SCpnt);
86 STATIC void scsi_restart_operations(struct Scsi_Host *);
87 STATIC void scsi_eh_finish_command(Scsi_Cmnd ** SClist, Scsi_Cmnd * SCpnt);
88 
89 
90 /*
91  * Function:    scsi_add_timer()
92  *
93  * Purpose:     Start timeout timer for a single scsi command.
94  *
95  * Arguments:   SCset   - command that is about to start running.
96  *              timeout - amount of time to allow this command to run.
97  *              complete - timeout function to call if timer isn't
98  *                      canceled.
99  *
100  * Returns:     Nothing
101  *
102  * Notes:       This should be turned into an inline function.
103  *
104  * More Notes:  Each scsi command has it's own timer, and as it is added to
105  *              the queue, we set up the timer.  When the command completes,
106  *              we cancel the timer.  Pretty simple, really, especially
107  *              compared to the old way of handling this crap.
108  */
scsi_add_timer(Scsi_Cmnd * SCset,int timeout,void (* complete)(Scsi_Cmnd *))109 void scsi_add_timer(Scsi_Cmnd * SCset,
110 		    int timeout,
111 		    void (*complete) (Scsi_Cmnd *))
112 {
113 	SCset->eh_timeout.data = (unsigned long) SCset;
114 	SCset->eh_timeout.function = (void (*)(unsigned long)) complete;
115 	mod_timer(&SCset->eh_timeout, jiffies + timeout);
116 
117 	SCset->done_late = 0;
118 
119 	SCSI_LOG_ERROR_RECOVERY(5, printk("Adding timer for command %p at %d (%p)\n", SCset, timeout, complete));
120 }
121 
122 /*
123  * Function:    scsi_delete_timer()
124  *
125  * Purpose:     Delete/cancel timer for a given function.
126  *
127  * Arguments:   SCset   - command that we are canceling timer for.
128  *
129  * Returns:     1 if we were able to detach the timer.  0 if we
130  *              blew it, and the timer function has already started
131  *              to run.
132  *
133  * Notes:       This should be turned into an inline function.
134  */
scsi_delete_timer(Scsi_Cmnd * SCset)135 int scsi_delete_timer(Scsi_Cmnd * SCset)
136 {
137 	int rtn;
138 
139 	rtn = del_timer(&SCset->eh_timeout);
140 
141 	SCSI_LOG_ERROR_RECOVERY(5, printk("Clearing timer for command %p %d\n", SCset, rtn));
142 
143 	SCset->eh_timeout.data = (unsigned long) NULL;
144 	SCset->eh_timeout.function = NULL;
145 
146 	return rtn;
147 }
148 
149 /*
150  * Function:    scsi_times_out()
151  *
152  * Purpose:     Timeout function for normal scsi commands..
153  *
154  * Arguments:   SCpnt   - command that is timing out.
155  *
156  * Returns:     Nothing.
157  *
158  * Notes:       We do not need to lock this.  There is the potential for
159  *              a race only in that the normal completion handling might
160  *              run, but if the normal completion function determines
161  *              that the timer has already fired, then it mustn't do
162  *              anything.
163  */
scsi_times_out(Scsi_Cmnd * SCpnt)164 void scsi_times_out(Scsi_Cmnd * SCpnt)
165 {
166 	/*
167 	 * Notify the low-level code that this operation failed and we are
168 	 * reposessing the command.
169 	 */
170 #ifdef ERIC_neverdef
171 	/*
172 	 * FIXME(eric)
173 	 * Allow the host adapter to push a queue ordering tag
174 	 * out to the bus to force the command in question to complete.
175 	 * If the host wants to do this, then we just restart the timer
176 	 * for the command.  Before we really do this, some real thought
177 	 * as to the optimum way to handle this should be done.  We *do*
178 	 * need to force ordering every so often to ensure that all requests
179 	 * do eventually complete, but I am not sure if this is the best way
180 	 * to actually go about it.
181 	 *
182 	 * Better yet, force a sync here, but don't block since we are in an
183 	 * interrupt.
184 	 */
185 	if (SCpnt->host->hostt->eh_ordered_queue_tag) {
186 		if ((*SCpnt->host->hostt->eh_ordered_queue_tag) (SCpnt)) {
187 			scsi_add_timer(SCpnt, SCpnt->internal_timeout,
188 				       scsi_times_out);
189 			return;
190 		}
191 	}
192 	/*
193 	 * FIXME(eric) - add a second special interface to handle this
194 	 * case.  Ideally that interface can also be used to request
195 	 * a queu
196 	 */
197 	if (SCpnt->host->can_queue) {
198 		SCpnt->host->hostt->queuecommand(SCpnt, NULL);
199 	}
200 #endif
201 
202 	/* Set the serial_number_at_timeout to the current serial_number */
203 	SCpnt->serial_number_at_timeout = SCpnt->serial_number;
204 
205 	SCpnt->eh_state = FAILED;
206 	SCpnt->state = SCSI_STATE_TIMEOUT;
207 	SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
208 
209 	SCpnt->host->in_recovery = 1;
210 	SCpnt->host->host_failed++;
211 
212 	SCSI_LOG_TIMEOUT(3, printk("Command timed out active=%d busy=%d failed=%d\n",
213 				   atomic_read(&SCpnt->host->host_active),
214 				   SCpnt->host->host_busy,
215 				   SCpnt->host->host_failed));
216 
217 	/*
218 	 * If the host is having troubles, then look to see if this was the last
219 	 * command that might have failed.  If so, wake up the error handler.
220 	 */
221 	if( SCpnt->host->eh_wait == NULL ) {
222 		panic("Error handler thread not present at %p %p %s %d",
223 		      SCpnt, SCpnt->host, __FILE__, __LINE__);
224 	}
225 	if (SCpnt->host->host_busy == SCpnt->host->host_failed) {
226 		up(SCpnt->host->eh_wait);
227 	}
228 }
229 
230 /*
231  * Function     scsi_block_when_processing_errors
232  *
233  * Purpose:     Prevent more commands from being queued while error recovery
234  *              is taking place.
235  *
236  * Arguments:   SDpnt - device on which we are performing recovery.
237  *
238  * Returns:     FALSE   The device was taken offline by error recovery.
239  *              TRUE    OK to proceed.
240  *
241  * Notes:       We block until the host is out of error recovery, and then
242  *              check to see whether the host or the device is offline.
243  */
scsi_block_when_processing_errors(Scsi_Device * SDpnt)244 int scsi_block_when_processing_errors(Scsi_Device * SDpnt)
245 {
246 
247 	SCSI_SLEEP(&SDpnt->host->host_wait, SDpnt->host->in_recovery);
248 
249 	SCSI_LOG_ERROR_RECOVERY(5, printk("Open returning %d\n", SDpnt->online));
250 
251 	return SDpnt->online;
252 }
253 
254 /*
255  * Function:    scsi_eh_times_out()
256  *
257  * Purpose:     Timeout function for error handling.
258  *
259  * Arguments:   SCpnt   - command that is timing out.
260  *
261  * Returns:     Nothing.
262  *
263  * Notes:       During error handling, the kernel thread will be sleeping
264  *              waiting for some action to complete on the device.  Our only
265  *              job is to record that it timed out, and to wake up the
266  *              thread.
267  */
268 STATIC
scsi_eh_times_out(Scsi_Cmnd * SCpnt)269 void scsi_eh_times_out(Scsi_Cmnd * SCpnt)
270 {
271 	SCpnt->eh_state = SCSI_STATE_TIMEOUT;
272 	SCSI_LOG_ERROR_RECOVERY(5, printk("In scsi_eh_times_out %p\n", SCpnt));
273 
274 	if (SCpnt->host->eh_action != NULL)
275 		up(SCpnt->host->eh_action);
276 	else
277 		printk("Missing scsi error handler thread\n");
278 }
279 
280 
281 /*
282  * Function:    scsi_eh_done()
283  *
284  * Purpose:     Completion function for error handling.
285  *
286  * Arguments:   SCpnt   - command that is timing out.
287  *
288  * Returns:     Nothing.
289  *
290  * Notes:       During error handling, the kernel thread will be sleeping
291  *              waiting for some action to complete on the device.  Our only
292  *              job is to record that the action completed, and to wake up the
293  *              thread.
294  */
295 STATIC
scsi_eh_done(Scsi_Cmnd * SCpnt)296 void scsi_eh_done(Scsi_Cmnd * SCpnt)
297 {
298 	int     rtn;
299 
300 	/*
301 	 * If the timeout handler is already running, then just set the
302 	 * flag which says we finished late, and return.  We have no
303 	 * way of stopping the timeout handler from running, so we must
304 	 * always defer to it.
305 	 */
306 	rtn = del_timer(&SCpnt->eh_timeout);
307 	if (!rtn) {
308 		SCpnt->done_late = 1;
309 		return;
310 	}
311 
312 	SCpnt->request.rq_status = RQ_SCSI_DONE;
313 
314 	SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
315 	SCpnt->eh_state = SUCCESS;
316 
317 	SCSI_LOG_ERROR_RECOVERY(5, printk("In eh_done %p result:%x\n", SCpnt,
318 					  SCpnt->result));
319 
320 	if (SCpnt->host->eh_action != NULL)
321 		up(SCpnt->host->eh_action);
322 }
323 
324 /*
325  * Function:    scsi_eh_action_done()
326  *
327  * Purpose:     Completion function for error handling.
328  *
329  * Arguments:   SCpnt   - command that is timing out.
330  *              answer  - boolean that indicates whether operation succeeded.
331  *
332  * Returns:     Nothing.
333  *
334  * Notes:       This callback is only used for abort and reset operations.
335  */
336 STATIC
scsi_eh_action_done(Scsi_Cmnd * SCpnt,int answer)337 void scsi_eh_action_done(Scsi_Cmnd * SCpnt, int answer)
338 {
339 	SCpnt->request.rq_status = RQ_SCSI_DONE;
340 
341 	SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
342 	SCpnt->eh_state = (answer ? SUCCESS : FAILED);
343 
344 	if (SCpnt->host->eh_action != NULL)
345 		up(SCpnt->host->eh_action);
346 }
347 
348 /*
349  * Function:  scsi_sense_valid()
350  *
351  * Purpose:     Determine whether a host has automatically obtained sense
352  *              information or not.  If we have it, then give a recommendation
353  *              as to what we should do next.
354  */
scsi_sense_valid(Scsi_Cmnd * SCpnt)355 int scsi_sense_valid(Scsi_Cmnd * SCpnt)
356 {
357 	if (((SCpnt->sense_buffer[0] & 0x70) >> 4) != 7) {
358 		return FALSE;
359 	}
360 	return TRUE;
361 }
362 
363 /*
364  * Function:  scsi_eh_retry_command()
365  *
366  * Purpose:     Retry the original command
367  *
368  * Returns:     SUCCESS - we were able to get the sense data.
369  *              FAILED  - we were not able to get the sense data.
370  *
371  * Notes:       This function will *NOT* return until the command either
372  *              times out, or it completes.
373  */
scsi_eh_retry_command(Scsi_Cmnd * SCpnt)374 STATIC int scsi_eh_retry_command(Scsi_Cmnd * SCpnt)
375 {
376 	memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
377 	       sizeof(SCpnt->data_cmnd));
378 	SCpnt->request_buffer = SCpnt->buffer;
379 	SCpnt->request_bufflen = SCpnt->bufflen;
380 	SCpnt->use_sg = SCpnt->old_use_sg;
381 	SCpnt->cmd_len = SCpnt->old_cmd_len;
382 	SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
383 	SCpnt->underflow = SCpnt->old_underflow;
384 
385 	scsi_send_eh_cmnd(SCpnt, SCpnt->timeout_per_command);
386 
387 	/*
388 	 * Hey, we are done.  Let's look to see what happened.
389 	 */
390 	return SCpnt->eh_state;
391 }
392 
393 /*
394  * Function:  scsi_request_sense()
395  *
396  * Purpose:     Request sense data from a particular target.
397  *
398  * Returns:     SUCCESS - we were able to get the sense data.
399  *              FAILED  - we were not able to get the sense data.
400  *
401  * Notes:       Some hosts automatically obtain this information, others
402  *              require that we obtain it on our own.
403  *
404  *              This function will *NOT* return until the command either
405  *              times out, or it completes.
406  */
scsi_request_sense(Scsi_Cmnd * SCpnt)407 STATIC int scsi_request_sense(Scsi_Cmnd * SCpnt)
408 {
409 	static unsigned char generic_sense[6] =
410 	{REQUEST_SENSE, 0, 0, 0, 255, 0};
411 	unsigned char scsi_result0[256], *scsi_result = NULL;
412 	int saved_result;
413 	int saved_resid;
414 
415 	ASSERT_LOCK(&io_request_lock, 0);
416 
417 	memcpy((void *) SCpnt->cmnd, (void *) generic_sense,
418 	       sizeof(generic_sense));
419 
420 	if (SCpnt->device->scsi_level <= SCSI_2)
421 		SCpnt->cmnd[1] = SCpnt->lun << 5;
422 
423 	scsi_result = (!SCpnt->host->hostt->unchecked_isa_dma)
424 	    ? &scsi_result0[0] : kmalloc(512, GFP_ATOMIC | GFP_DMA);
425 
426 	if (scsi_result == NULL) {
427 		printk("cannot allocate scsi_result in scsi_request_sense.\n");
428 		return FAILED;
429 	}
430 	/*
431 	 * Zero the sense buffer.  Some host adapters automatically always request
432 	 * sense, so it is not a good idea that SCpnt->request_buffer and
433 	 * SCpnt->sense_buffer point to the same address (DB).
434 	 * 0 is not a valid sense code.
435 	 */
436 	memset((void *) SCpnt->sense_buffer, 0, sizeof(SCpnt->sense_buffer));
437 	memset((void *) scsi_result, 0, 256);
438 
439 	saved_result = SCpnt->result;
440 	saved_resid = SCpnt->resid;
441 	SCpnt->request_buffer = scsi_result;
442 	SCpnt->request_bufflen = 256;
443 	SCpnt->use_sg = 0;
444 	SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
445 	SCpnt->sc_data_direction = SCSI_DATA_READ;
446 	SCpnt->underflow = 0;
447 
448 	scsi_send_eh_cmnd(SCpnt, SENSE_TIMEOUT);
449 
450 	/* Last chance to have valid sense data */
451 	if (!scsi_sense_valid(SCpnt))
452 		memcpy((void *) SCpnt->sense_buffer,
453 		       SCpnt->request_buffer,
454 		       sizeof(SCpnt->sense_buffer));
455 
456 	if (scsi_result != &scsi_result0[0] && scsi_result != NULL)
457 		kfree(scsi_result);
458 
459 	/*
460 	 * When we eventually call scsi_finish, we really wish to complete
461 	 * the original request, so let's restore the original data. (DB)
462 	 */
463 	memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
464 	       sizeof(SCpnt->data_cmnd));
465 	SCpnt->result = saved_result;
466 	SCpnt->resid = saved_resid;
467 	SCpnt->request_buffer = SCpnt->buffer;
468 	SCpnt->request_bufflen = SCpnt->bufflen;
469 	SCpnt->use_sg = SCpnt->old_use_sg;
470 	SCpnt->cmd_len = SCpnt->old_cmd_len;
471 	SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
472 	SCpnt->underflow = SCpnt->old_underflow;
473 
474 	/*
475 	 * Hey, we are done.  Let's look to see what happened.
476 	 */
477 	return SCpnt->eh_state;
478 }
479 
480 /*
481  * Function:  scsi_test_unit_ready()
482  *
483  * Purpose:     Run test unit ready command to see if the device is talking to us or not.
484  *
485  */
scsi_test_unit_ready(Scsi_Cmnd * SCpnt)486 STATIC int scsi_test_unit_ready(Scsi_Cmnd * SCpnt)
487 {
488 	static unsigned char tur_command[6] =
489 	{TEST_UNIT_READY, 0, 0, 0, 0, 0};
490 	int saved_resid;
491 
492 	memcpy((void *) SCpnt->cmnd, (void *) tur_command,
493 	       sizeof(tur_command));
494 
495 	if (SCpnt->device->scsi_level <= SCSI_2)
496 		SCpnt->cmnd[1] = SCpnt->lun << 5;
497 
498 	/*
499 	 * Zero the sense buffer.  The SCSI spec mandates that any
500 	 * untransferred sense data should be interpreted as being zero.
501 	 */
502 	memset((void *) SCpnt->sense_buffer, 0, sizeof(SCpnt->sense_buffer));
503 
504 	saved_resid = SCpnt->resid;
505 	SCpnt->request_buffer = NULL;
506 	SCpnt->request_bufflen = 0;
507 	SCpnt->use_sg = 0;
508 	SCpnt->cmd_len = COMMAND_SIZE(SCpnt->cmnd[0]);
509 	SCpnt->underflow = 0;
510 	SCpnt->sc_data_direction = SCSI_DATA_NONE;
511 
512 	scsi_send_eh_cmnd(SCpnt, SENSE_TIMEOUT);
513 
514 	/*
515 	 * When we eventually call scsi_finish, we really wish to complete
516 	 * the original request, so let's restore the original data. (DB)
517 	 */
518 	memcpy((void *) SCpnt->cmnd, (void *) SCpnt->data_cmnd,
519 	       sizeof(SCpnt->data_cmnd));
520 	SCpnt->resid = saved_resid;
521 	SCpnt->request_buffer = SCpnt->buffer;
522 	SCpnt->request_bufflen = SCpnt->bufflen;
523 	SCpnt->use_sg = SCpnt->old_use_sg;
524 	SCpnt->cmd_len = SCpnt->old_cmd_len;
525 	SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
526 	SCpnt->underflow = SCpnt->old_underflow;
527 
528 	/*
529 	 * Hey, we are done.  Let's look to see what happened.
530 	 */
531 	SCSI_LOG_ERROR_RECOVERY(3,
532 		printk("scsi_test_unit_ready: SCpnt %p eh_state %x\n",
533 		SCpnt, SCpnt->eh_state));
534 	return SCpnt->eh_state;
535 }
536 
537 /*
538  * This would normally need to get the IO request lock,
539  * but as it doesn't actually touch anything that needs
540  * to be locked we can avoid the lock here..
541  */
542 STATIC
scsi_sleep_done(struct semaphore * sem)543 void scsi_sleep_done(struct semaphore *sem)
544 {
545 	if (sem != NULL) {
546 		up(sem);
547 	}
548 }
549 
scsi_sleep(int timeout)550 void scsi_sleep(int timeout)
551 {
552 	DECLARE_MUTEX_LOCKED(sem);
553 	struct timer_list timer;
554 
555 	init_timer(&timer);
556 	timer.data = (unsigned long) &sem;
557 	timer.expires = jiffies + timeout;
558 	timer.function = (void (*)(unsigned long)) scsi_sleep_done;
559 
560 	SCSI_LOG_ERROR_RECOVERY(5, printk("Sleeping for timer tics %d\n", timeout));
561 
562 	add_timer(&timer);
563 
564 	down(&sem);
565 	del_timer(&timer);
566 }
567 
568 /*
569  * Function:  scsi_send_eh_cmnd
570  *
571  * Purpose:     Send a command out to a device as part of error recovery.
572  *
573  * Notes:       The initialization of the structures is quite a bit different
574  *              in this case, and furthermore, there is a different completion
575  *              handler.
576  */
scsi_send_eh_cmnd(Scsi_Cmnd * SCpnt,int timeout)577 STATIC void scsi_send_eh_cmnd(Scsi_Cmnd * SCpnt, int timeout)
578 {
579 	unsigned long flags;
580 	struct Scsi_Host *host;
581 
582 	ASSERT_LOCK(&io_request_lock, 0);
583 
584 	host = SCpnt->host;
585 
586       retry:
587 	/*
588 	 * We will use a queued command if possible, otherwise we will emulate the
589 	 * queuing and calling of completion function ourselves.
590 	 */
591 	SCpnt->owner = SCSI_OWNER_LOWLEVEL;
592 
593 	if (host->can_queue) {
594 		DECLARE_MUTEX_LOCKED(sem);
595 
596 		SCpnt->eh_state = SCSI_STATE_QUEUED;
597 
598 		scsi_add_timer(SCpnt, timeout, scsi_eh_times_out);
599 
600 		/*
601 		 * Set up the semaphore so we wait for the command to complete.
602 		 */
603 		SCpnt->host->eh_action = &sem;
604 		SCpnt->request.rq_status = RQ_SCSI_BUSY;
605 
606 		spin_lock_irqsave(&io_request_lock, flags);
607 		host->hostt->queuecommand(SCpnt, scsi_eh_done);
608 		spin_unlock_irqrestore(&io_request_lock, flags);
609 
610 		down(&sem);
611 
612 		SCpnt->host->eh_action = NULL;
613 
614 		/*
615 		 * See if timeout.  If so, tell the host to forget about it.
616 		 * In other words, we don't want a callback any more.
617 		 */
618 		if (SCpnt->eh_state == SCSI_STATE_TIMEOUT) {
619                         SCpnt->owner = SCSI_OWNER_LOWLEVEL;
620 
621 			/*
622 			 * As far as the low level driver is
623 			 * concerned, this command is still active, so
624 			 * we must give the low level driver a chance
625 			 * to abort it. (DB)
626 			 *
627 			 * FIXME(eric) - we are not tracking whether we could
628 			 * abort a timed out command or not.  Not sure how
629 			 * we should treat them differently anyways.
630 			 */
631 			spin_lock_irqsave(&io_request_lock, flags);
632 			if (SCpnt->host->hostt->eh_abort_handler)
633 				SCpnt->host->hostt->eh_abort_handler(SCpnt);
634 			spin_unlock_irqrestore(&io_request_lock, flags);
635 
636 			SCpnt->request.rq_status = RQ_SCSI_DONE;
637 			SCpnt->owner = SCSI_OWNER_ERROR_HANDLER;
638 
639 			SCpnt->eh_state = FAILED;
640 		}
641 		SCSI_LOG_ERROR_RECOVERY(5, printk("send_eh_cmnd: %p eh_state:%x\n",
642 						SCpnt, SCpnt->eh_state));
643 	} else {
644 		int temp;
645 
646 		/*
647 		 * We damn well had better never use this code.  There is no timeout
648 		 * protection here, since we would end up waiting in the actual low
649 		 * level driver, we don't know how to wake it up.
650 		 */
651 		spin_lock_irqsave(&io_request_lock, flags);
652 		temp = host->hostt->command(SCpnt);
653 		spin_unlock_irqrestore(&io_request_lock, flags);
654 
655 		SCpnt->result = temp;
656 		/* Fall through to code below to examine status. */
657 		SCpnt->eh_state = SUCCESS;
658 	}
659 
660 	/*
661 	 * Now examine the actual status codes to see whether the command actually
662 	 * did complete normally.
663 	 */
664 	if (SCpnt->eh_state == SUCCESS) {
665 		int ret = scsi_eh_completed_normally(SCpnt);
666 		SCSI_LOG_ERROR_RECOVERY(3,
667 			printk("scsi_send_eh_cmnd: scsi_eh_completed_normally %x\n", ret));
668 		switch (ret) {
669 		case SUCCESS:
670 			SCpnt->eh_state = SUCCESS;
671 			break;
672 		case NEEDS_RETRY:
673 			if ((++SCpnt->retries) < SCpnt->allowed)
674 				goto retry;
675 			SCpnt->eh_state = SUCCESS;
676 			break;
677 		case FAILED:
678 		default:
679 			SCpnt->eh_state = FAILED;
680 			break;
681 		}
682 	} else {
683 		SCpnt->eh_state = FAILED;
684 	}
685 }
686 
687 /*
688  * Function:  scsi_unit_is_ready()
689  *
690  * Purpose:     Called after TEST_UNIT_READY is run, to test to see if
691  *              the unit responded in a way that indicates it is ready.
692  */
scsi_unit_is_ready(Scsi_Cmnd * SCpnt)693 STATIC int scsi_unit_is_ready(Scsi_Cmnd * SCpnt)
694 {
695 	if (SCpnt->result) {
696 		if (((driver_byte(SCpnt->result) & DRIVER_SENSE) ||
697 		     (status_byte(SCpnt->result) & CHECK_CONDITION)) &&
698 		    ((SCpnt->sense_buffer[0] & 0x70) >> 4) == 7) {
699 			if (((SCpnt->sense_buffer[2] & 0xf) != NOT_READY) &&
700 			    ((SCpnt->sense_buffer[2] & 0xf) != UNIT_ATTENTION) &&
701 			    ((SCpnt->sense_buffer[2] & 0xf) != ILLEGAL_REQUEST)) {
702 				return 0;
703 			}
704 		}
705 	}
706 	return 1;
707 }
708 
709 /*
710  * Function:    scsi_eh_finish_command
711  *
712  * Purpose:     Handle a command that we are finished with WRT error handling.
713  *
714  * Arguments:   SClist - pointer to list into which we are putting completed commands.
715  *              SCpnt  - command that is completing
716  *
717  * Notes:       We don't want to use the normal command completion while we are
718  *              are still handling errors - it may cause other commands to be queued,
719  *              and that would disturb what we are doing.  Thus we really want to keep
720  *              a list of pending commands for final completion, and once we
721  *              are ready to leave error handling we handle completion for real.
722  */
scsi_eh_finish_command(Scsi_Cmnd ** SClist,Scsi_Cmnd * SCpnt)723 STATIC void scsi_eh_finish_command(Scsi_Cmnd ** SClist, Scsi_Cmnd * SCpnt)
724 {
725 	SCpnt->state = SCSI_STATE_BHQUEUE;
726 	SCpnt->bh_next = *SClist;
727 	/*
728 	 * Set this back so that the upper level can correctly free up
729 	 * things.
730 	 */
731 	SCpnt->use_sg = SCpnt->old_use_sg;
732 	SCpnt->sc_data_direction = SCpnt->sc_old_data_direction;
733 	SCpnt->underflow = SCpnt->old_underflow;
734 	*SClist = SCpnt;
735 }
736 
737 /*
738  * Function:  scsi_try_to_abort_command
739  *
740  * Purpose:     Ask host adapter to abort a running command.
741  *
742  * Returns:     FAILED          Operation failed or not supported.
743  *              SUCCESS         Succeeded.
744  *
745  * Notes:       This function will not return until the user's completion
746  *              function has been called.  There is no timeout on this
747  *              operation.  If the author of the low-level driver wishes
748  *              this operation to be timed, they can provide this facility
749  *              themselves.  Helper functions in scsi_error.c can be supplied
750  *              to make this easier to do.
751  *
752  * Notes:       It may be possible to combine this with all of the reset
753  *              handling to eliminate a lot of code duplication.  I don't
754  *              know what makes more sense at the moment - this is just a
755  *              prototype.
756  */
scsi_try_to_abort_command(Scsi_Cmnd * SCpnt,int timeout)757 STATIC int scsi_try_to_abort_command(Scsi_Cmnd * SCpnt, int timeout)
758 {
759 	int rtn;
760 	unsigned long flags;
761 
762 	SCpnt->eh_state = FAILED;	/* Until we come up with something better */
763 
764 	if (SCpnt->host->hostt->eh_abort_handler == NULL) {
765 		return FAILED;
766 	}
767 	/*
768 	 * scsi_done was called just after the command timed out and before
769 	 * we had a chance to process it. (DB)
770 	 */
771 	if (SCpnt->serial_number == 0)
772 		return SUCCESS;
773 
774 	SCpnt->owner = SCSI_OWNER_LOWLEVEL;
775 
776 	spin_lock_irqsave(&io_request_lock, flags);
777 	rtn = SCpnt->host->hostt->eh_abort_handler(SCpnt);
778 	spin_unlock_irqrestore(&io_request_lock, flags);
779 	return rtn;
780 }
781 
782 /*
783  * Function:  scsi_try_bus_device_reset
784  *
785  * Purpose:     Ask host adapter to perform a bus device reset for a given
786  *              device.
787  *
788  * Returns:     FAILED          Operation failed or not supported.
789  *              SUCCESS         Succeeded.
790  *
791  * Notes:       There is no timeout for this operation.  If this operation is
792  *              unreliable for a given host, then the host itself needs to put a
793  *              timer on it, and set the host back to a consistent state prior
794  *              to returning.
795  */
scsi_try_bus_device_reset(Scsi_Cmnd * SCpnt,int timeout)796 STATIC int scsi_try_bus_device_reset(Scsi_Cmnd * SCpnt, int timeout)
797 {
798 	unsigned long flags;
799 	int rtn;
800 
801 	SCpnt->eh_state = FAILED;	/* Until we come up with something better */
802 
803 	if (SCpnt->host->hostt->eh_device_reset_handler == NULL) {
804 		return FAILED;
805 	}
806 	SCpnt->owner = SCSI_OWNER_LOWLEVEL;
807 
808 	spin_lock_irqsave(&io_request_lock, flags);
809 	rtn = SCpnt->host->hostt->eh_device_reset_handler(SCpnt);
810 	spin_unlock_irqrestore(&io_request_lock, flags);
811 
812 	if (rtn == SUCCESS)
813 		SCpnt->eh_state = SUCCESS;
814 
815 	return SCpnt->eh_state;
816 }
817 
818 /*
819  * Function:  scsi_try_bus_reset
820  *
821  * Purpose:     Ask host adapter to perform a bus reset for a host.
822  *
823  * Returns:     FAILED          Operation failed or not supported.
824  *              SUCCESS         Succeeded.
825  *
826  * Notes:
827  */
scsi_try_bus_reset(Scsi_Cmnd * SCpnt)828 STATIC int scsi_try_bus_reset(Scsi_Cmnd * SCpnt)
829 {
830 	unsigned long flags;
831 	int rtn;
832 
833 	SCpnt->eh_state = FAILED;	/* Until we come up with something better */
834 	SCpnt->owner = SCSI_OWNER_LOWLEVEL;
835 	SCpnt->serial_number_at_timeout = SCpnt->serial_number;
836 
837 	if (SCpnt->host->hostt->eh_bus_reset_handler == NULL) {
838 		return FAILED;
839 	}
840 
841 	spin_lock_irqsave(&io_request_lock, flags);
842 	rtn = SCpnt->host->hostt->eh_bus_reset_handler(SCpnt);
843 	spin_unlock_irqrestore(&io_request_lock, flags);
844 
845 	if (rtn == SUCCESS)
846 		SCpnt->eh_state = SUCCESS;
847 
848 	/*
849 	 * If we had a successful bus reset, mark the command blocks to expect
850 	 * a condition code of unit attention.
851 	 */
852 	scsi_sleep(BUS_RESET_SETTLE_TIME);
853 	if (SCpnt->eh_state == SUCCESS) {
854 		Scsi_Device *SDloop;
855 		for (SDloop = SCpnt->host->host_queue; SDloop; SDloop = SDloop->next) {
856 			if (SCpnt->channel == SDloop->channel) {
857 				SDloop->was_reset = 1;
858 				SDloop->expecting_cc_ua = 1;
859 			}
860 		}
861 	}
862 	return SCpnt->eh_state;
863 }
864 
865 /*
866  * Function:  scsi_try_host_reset
867  *
868  * Purpose:     Ask host adapter to reset itself, and the bus.
869  *
870  * Returns:     FAILED          Operation failed or not supported.
871  *              SUCCESS         Succeeded.
872  *
873  * Notes:
874  */
scsi_try_host_reset(Scsi_Cmnd * SCpnt)875 STATIC int scsi_try_host_reset(Scsi_Cmnd * SCpnt)
876 {
877 	unsigned long flags;
878 	int rtn;
879 
880 	SCpnt->eh_state = FAILED;	/* Until we come up with something better */
881 	SCpnt->owner = SCSI_OWNER_LOWLEVEL;
882 	SCpnt->serial_number_at_timeout = SCpnt->serial_number;
883 
884 	if (SCpnt->host->hostt->eh_host_reset_handler == NULL) {
885 		return FAILED;
886 	}
887 	spin_lock_irqsave(&io_request_lock, flags);
888 	rtn = SCpnt->host->hostt->eh_host_reset_handler(SCpnt);
889 	spin_unlock_irqrestore(&io_request_lock, flags);
890 
891 	if (rtn == SUCCESS)
892 		SCpnt->eh_state = SUCCESS;
893 
894 	/*
895 	 * If we had a successful host reset, mark the command blocks to expect
896 	 * a condition code of unit attention.
897 	 */
898 	scsi_sleep(HOST_RESET_SETTLE_TIME);
899 	if (SCpnt->eh_state == SUCCESS) {
900 		Scsi_Device *SDloop;
901 		for (SDloop = SCpnt->host->host_queue; SDloop; SDloop = SDloop->next) {
902 			SDloop->was_reset = 1;
903 			SDloop->expecting_cc_ua = 1;
904 		}
905 	}
906 	return SCpnt->eh_state;
907 }
908 
909 /*
910  * Function:  scsi_decide_disposition
911  *
912  * Purpose:     Examine a command block that has come back from the low-level
913  *              and figure out what to do next.
914  *
915  * Returns:     SUCCESS         - pass on to upper level.
916  *              FAILED          - pass on to error handler thread.
917  *              RETRY           - command should be retried.
918  *              SOFTERR         - command succeeded, but we need to log
919  *                                a soft error.
920  *
921  * Notes:       This is *ONLY* called when we are examining the status
922  *              after sending out the actual data command.  Any commands
923  *              that are queued for error recovery (i.e. TEST_UNIT_READY)
924  *              do *NOT* come through here.
925  *
926  *              NOTE - When this routine returns FAILED, it means the error
927  *              handler thread is woken.  In cases where the error code
928  *              indicates an error that doesn't require the error handler
929  *              thread (i.e. we don't need to abort/reset), then this function
930  *              should return SUCCESS.
931  */
scsi_decide_disposition(Scsi_Cmnd * SCpnt)932 int scsi_decide_disposition(Scsi_Cmnd * SCpnt)
933 {
934 	int rtn;
935 
936 	/*
937 	 * If the device is offline, then we clearly just pass the result back
938 	 * up to the top level.
939 	 */
940 	if (SCpnt->device->online == FALSE) {
941 		SCSI_LOG_ERROR_RECOVERY(5, printk("scsi_error.c: device offline - report as SUCCESS\n"));
942 		return SUCCESS;
943 	}
944 	/*
945 	 * First check the host byte, to see if there is anything in there
946 	 * that would indicate what we need to do.
947 	 */
948 
949 	switch (host_byte(SCpnt->result)) {
950 	case DID_PASSTHROUGH:
951 		/*
952 		 * No matter what, pass this through to the upper layer.
953 		 * Nuke this special code so that it looks like we are saying
954 		 * DID_OK.
955 		 */
956 		SCpnt->result &= 0xff00ffff;
957 		return SUCCESS;
958 	case DID_OK:
959 		/*
960 		 * Looks good.  Drop through, and check the next byte.
961 		 */
962 		break;
963 	case DID_NO_CONNECT:
964 	case DID_BAD_TARGET:
965 	case DID_ABORT:
966 		/*
967 		 * Note - this means that we just report the status back to the
968 		 * top level driver, not that we actually think that it indicates
969 		 * success.
970 		 */
971 		return SUCCESS;
972 		/*
973 		 * When the low level driver returns DID_SOFT_ERROR,
974 		 * it is responsible for keeping an internal retry counter
975 		 * in order to avoid endless loops (DB)
976 		 *
977 		 * Actually this is a bug in this function here.  We should
978 		 * be mindful of the maximum number of retries specified
979 		 * and not get stuck in a loop.
980 		 */
981 	case DID_SOFT_ERROR:
982 		goto maybe_retry;
983 
984 	case DID_ERROR:
985 		if (msg_byte(SCpnt->result) == COMMAND_COMPLETE &&
986 		    status_byte(SCpnt->result) == RESERVATION_CONFLICT)
987 			/*
988 			 * execute reservation conflict processing code
989 			 * lower down
990 			 */
991 			break;
992 		/* FALLTHROUGH */
993 
994 	case DID_BUS_BUSY:
995 	case DID_PARITY:
996 		goto maybe_retry;
997 	case DID_TIME_OUT:
998 		/*
999 		 * When we scan the bus, we get timeout messages for
1000 		 * these commands if there is no device available.
1001 		 * Other hosts report DID_NO_CONNECT for the same thing.
1002 		 */
1003 		if ((SCpnt->cmnd[0] == TEST_UNIT_READY ||
1004 		     SCpnt->cmnd[0] == INQUIRY)) {
1005 			return SUCCESS;
1006 		} else {
1007 			return FAILED;
1008 		}
1009 	case DID_RESET:
1010 		/*
1011 		 * In the normal case where we haven't initiated a reset, this is
1012 		 * a failure.
1013 		 */
1014 		if (SCpnt->flags & IS_RESETTING) {
1015 			SCpnt->flags &= ~IS_RESETTING;
1016 			goto maybe_retry;
1017 		}
1018 		return SUCCESS;
1019 	default:
1020 		return FAILED;
1021 	}
1022 
1023 	/*
1024 	 * Next, check the message byte.
1025 	 */
1026 	if (msg_byte(SCpnt->result) != COMMAND_COMPLETE) {
1027 		return FAILED;
1028 	}
1029 	/*
1030 	 * Now, check the status byte to see if this indicates anything special.
1031 	 */
1032 	switch (status_byte(SCpnt->result)) {
1033 	case QUEUE_FULL:
1034 		/*
1035 		 * The case of trying to send too many commands to a tagged queueing
1036 		 * device.
1037 		 */
1038 		return ADD_TO_MLQUEUE;
1039 	case GOOD:
1040 	case COMMAND_TERMINATED:
1041 		return SUCCESS;
1042 	case CHECK_CONDITION:
1043 		rtn = scsi_check_sense(SCpnt);
1044 		if (rtn == NEEDS_RETRY) {
1045 			goto maybe_retry;
1046 		}
1047 		return rtn;
1048 	case CONDITION_GOOD:
1049 	case INTERMEDIATE_GOOD:
1050 	case INTERMEDIATE_C_GOOD:
1051 		/*
1052 		 * Who knows?  FIXME(eric)
1053 		 */
1054 		return SUCCESS;
1055 	case BUSY:
1056 		goto maybe_retry;
1057 
1058 	case RESERVATION_CONFLICT:
1059 		printk("scsi%d (%d,%d,%d) : RESERVATION CONFLICT\n",
1060 		       SCpnt->host->host_no, SCpnt->channel,
1061 		       SCpnt->device->id, SCpnt->device->lun);
1062 		return SUCCESS; /* causes immediate I/O error */
1063 	default:
1064 		return FAILED;
1065 	}
1066 	return FAILED;
1067 
1068       maybe_retry:
1069 
1070 	if ((++SCpnt->retries) < SCpnt->allowed) {
1071 		return NEEDS_RETRY;
1072 	} else {
1073                 /*
1074                  * No more retries - report this one back to upper level.
1075                  */
1076 		return SUCCESS;
1077 	}
1078 }
1079 
1080 /*
1081  * Function:  scsi_eh_completed_normally
1082  *
1083  * Purpose:     Examine a command block that has come back from the low-level
1084  *              and figure out what to do next.
1085  *
1086  * Returns:     SUCCESS         - pass on to upper level.
1087  *              FAILED          - pass on to error handler thread.
1088  *              RETRY           - command should be retried.
1089  *              SOFTERR         - command succeeded, but we need to log
1090  *                                a soft error.
1091  *
1092  * Notes:       This is *ONLY* called when we are examining the status
1093  *              of commands queued during error recovery.  The main
1094  *              difference here is that we don't allow for the possibility
1095  *              of retries here, and we are a lot more restrictive about what
1096  *              we consider acceptable.
1097  */
scsi_eh_completed_normally(Scsi_Cmnd * SCpnt)1098 STATIC int scsi_eh_completed_normally(Scsi_Cmnd * SCpnt)
1099 {
1100 	/*
1101 	 * First check the host byte, to see if there is anything in there
1102 	 * that would indicate what we need to do.
1103 	 */
1104 	if (host_byte(SCpnt->result) == DID_RESET) {
1105 		if (SCpnt->flags & IS_RESETTING) {
1106 			/*
1107 			 * OK, this is normal.  We don't know whether in fact the
1108 			 * command in question really needs to be rerun or not -
1109 			 * if this was the original data command then the answer is yes,
1110 			 * otherwise we just flag it as success.
1111 			 */
1112 			SCpnt->flags &= ~IS_RESETTING;
1113 			return NEEDS_RETRY;
1114 		}
1115 		/*
1116 		 * Rats.  We are already in the error handler, so we now get to try
1117 		 * and figure out what to do next.  If the sense is valid, we have
1118 		 * a pretty good idea of what to do.  If not, we mark it as failed.
1119 		 */
1120 		return scsi_check_sense(SCpnt);
1121 	}
1122 	if (host_byte(SCpnt->result) != DID_OK) {
1123 		return FAILED;
1124 	}
1125 	/*
1126 	 * Next, check the message byte.
1127 	 */
1128 	if (msg_byte(SCpnt->result) != COMMAND_COMPLETE) {
1129 		return FAILED;
1130 	}
1131 	/*
1132 	 * Now, check the status byte to see if this indicates anything special.
1133 	 */
1134 	switch (status_byte(SCpnt->result)) {
1135 	case GOOD:
1136 	case COMMAND_TERMINATED:
1137 		return SUCCESS;
1138 	case CHECK_CONDITION:
1139 		return scsi_check_sense(SCpnt);
1140 	case CONDITION_GOOD:
1141 	case INTERMEDIATE_GOOD:
1142 	case INTERMEDIATE_C_GOOD:
1143 		/*
1144 		 * Who knows?  FIXME(eric)
1145 		 */
1146 		return SUCCESS;
1147 	case BUSY:
1148 	case QUEUE_FULL:
1149 	case RESERVATION_CONFLICT:
1150 	default:
1151 		return FAILED;
1152 	}
1153 	return FAILED;
1154 }
1155 
1156 /*
1157  * Function:  scsi_check_sense
1158  *
1159  * Purpose:     Examine sense information - give suggestion as to what
1160  *              we should do with it.
1161  */
scsi_check_sense(Scsi_Cmnd * SCpnt)1162 STATIC int scsi_check_sense(Scsi_Cmnd * SCpnt)
1163 {
1164 	if (!scsi_sense_valid(SCpnt)) {
1165 		return FAILED;
1166 	}
1167 	if (SCpnt->sense_buffer[2] & 0xe0)
1168 		return SUCCESS;
1169 
1170 	switch (SCpnt->sense_buffer[2] & 0xf) {
1171 	case NO_SENSE:
1172 		return SUCCESS;
1173 	case RECOVERED_ERROR:
1174 		return /* SOFT_ERROR */ SUCCESS;
1175 
1176 	case ABORTED_COMMAND:
1177 		return NEEDS_RETRY;
1178 	case NOT_READY:
1179 	case UNIT_ATTENTION:
1180 		/*
1181 		 * If we are expecting a CC/UA because of a bus reset that we
1182 		 * performed, treat this just as a retry.  Otherwise this is
1183 		 * information that we should pass up to the upper-level driver
1184 		 * so that we can deal with it there.
1185 		 */
1186 		if (SCpnt->device->expecting_cc_ua) {
1187 			SCpnt->device->expecting_cc_ua = 0;
1188 			return NEEDS_RETRY;
1189 		}
1190 		/*
1191 		 * If the device is in the process of becoming ready, we
1192 		 * should retry.
1193 		 */
1194 		if ((SCpnt->sense_buffer[12] == 0x04) &&
1195 			(SCpnt->sense_buffer[13] == 0x01)) {
1196 			return NEEDS_RETRY;
1197 		}
1198 		return SUCCESS;
1199 
1200 		/* these three are not supported */
1201 	case COPY_ABORTED:
1202 	case VOLUME_OVERFLOW:
1203 	case MISCOMPARE:
1204 		return SUCCESS;
1205 
1206 	case MEDIUM_ERROR:
1207 		return NEEDS_RETRY;
1208 
1209 	case ILLEGAL_REQUEST:
1210 	case BLANK_CHECK:
1211 	case DATA_PROTECT:
1212 	case HARDWARE_ERROR:
1213 	default:
1214 		return SUCCESS;
1215 	}
1216 }
1217 
1218 
1219 /*
1220  * Function:  scsi_restart_operations
1221  *
1222  * Purpose:     Restart IO operations to the specified host.
1223  *
1224  * Arguments:   host  - host that we are restarting
1225  *
1226  * Lock status: Assumed that locks are not held upon entry.
1227  *
1228  * Returns:     Nothing
1229  *
1230  * Notes:       When we entered the error handler, we blocked all further
1231  *              I/O to this device.  We need to 'reverse' this process.
1232  */
scsi_restart_operations(struct Scsi_Host * host)1233 STATIC void scsi_restart_operations(struct Scsi_Host *host)
1234 {
1235 	Scsi_Device *SDpnt;
1236 	unsigned long flags;
1237 
1238 	ASSERT_LOCK(&io_request_lock, 0);
1239 
1240 	/*
1241 	 * Next free up anything directly waiting upon the host.  This will be
1242 	 * requests for character device operations, and also for ioctls to queued
1243 	 * block devices.
1244 	 */
1245 	SCSI_LOG_ERROR_RECOVERY(5, printk("scsi_error.c: Waking up host to restart\n"));
1246 
1247 	wake_up(&host->host_wait);
1248 
1249 	/*
1250 	 * Finally we need to re-initiate requests that may be pending.  We will
1251 	 * have had everything blocked while error handling is taking place, and
1252 	 * now that error recovery is done, we will need to ensure that these
1253 	 * requests are started.
1254 	 */
1255 	spin_lock_irqsave(&io_request_lock, flags);
1256 	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1257 		request_queue_t *q;
1258 		if ((host->can_queue > 0 && (host->host_busy >= host->can_queue))
1259 		    || (host->host_blocked)
1260 		    || (host->host_self_blocked)
1261 		    || (SDpnt->device_blocked)) {
1262 			break;
1263 		}
1264 		q = &SDpnt->request_queue;
1265 		q->request_fn(q);
1266 	}
1267 	spin_unlock_irqrestore(&io_request_lock, flags);
1268 }
1269 
1270 /*
1271  * Function:  scsi_unjam_host
1272  *
1273  * Purpose:     Attempt to fix a host which has a command that failed for
1274  *              some reason.
1275  *
1276  * Arguments:   host    - host that needs unjamming.
1277  *
1278  * Returns:     Nothing
1279  *
1280  * Notes:       When we come in here, we *know* that all commands on the
1281  *              bus have either completed, failed or timed out.  We also
1282  *              know that no further commands are being sent to the host,
1283  *              so things are relatively quiet and we have freedom to
1284  *              fiddle with things as we wish.
1285  *
1286  * Additional note:  This is only the *default* implementation.  It is possible
1287  *              for individual drivers to supply their own version of this
1288  *              function, and if the maintainer wishes to do this, it is
1289  *              strongly suggested that this function be taken as a template
1290  *              and modified.  This function was designed to correctly handle
1291  *              problems for about 95% of the different cases out there, and
1292  *              it should always provide at least a reasonable amount of error
1293  *              recovery.
1294  *
1295  * Note3:       Any command marked 'FAILED' or 'TIMEOUT' must eventually
1296  *              have scsi_finish_command() called for it.  We do all of
1297  *              the retry stuff here, so when we restart the host after we
1298  *              return it should have an empty queue.
1299  */
scsi_unjam_host(struct Scsi_Host * host)1300 STATIC int scsi_unjam_host(struct Scsi_Host *host)
1301 {
1302 	int devices_failed;
1303 	int numfailed;
1304 	int ourrtn;
1305 	int rtn = FALSE;
1306 	int result;
1307 	Scsi_Cmnd *SCloop;
1308 	Scsi_Cmnd *SCpnt;
1309 	Scsi_Device *SDpnt;
1310 	Scsi_Device *SDloop;
1311 	Scsi_Cmnd *SCdone;
1312 	int timed_out;
1313 
1314 	ASSERT_LOCK(&io_request_lock, 0);
1315 
1316 	SCdone = NULL;
1317 
1318 	/*
1319 	 * First, protect against any sort of race condition.  If any of the outstanding
1320 	 * commands are in states that indicate that we are not yet blocked (i.e. we are
1321 	 * not in a quiet state) then we got woken up in error.  If we ever end up here,
1322 	 * we need to re-examine some of the assumptions.
1323 	 */
1324 	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1325 		for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1326 			if (SCpnt->state == SCSI_STATE_FAILED
1327 			    || SCpnt->state == SCSI_STATE_TIMEOUT
1328 			    || SCpnt->state == SCSI_STATE_INITIALIZING
1329 			    || SCpnt->state == SCSI_STATE_UNUSED) {
1330 				continue;
1331 			}
1332 			/*
1333 			 * Rats.  Something is still floating around out there.  This could
1334 			 * be the result of the fact that the upper level drivers are still frobbing
1335 			 * commands that might have succeeded.  There are two outcomes.  One is that
1336 			 * the command block will eventually be freed, and the other one is that
1337 			 * the command will be queued and will be finished along the way.
1338 			 */
1339 			SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler prematurely woken - commands still active (%p %x %d)\n", SCpnt, SCpnt->state, SCpnt->target));
1340 
1341 /*
1342  *        panic("SCSI Error handler woken too early\n");
1343  *
1344  * This is no longer a problem, since now the code cares only about
1345  * SCSI_STATE_TIMEOUT and SCSI_STATE_FAILED.
1346  * Other states are useful only to release active commands when devices are
1347  * set offline. If (host->host_active == host->host_busy) we can safely assume
1348  * that there are no commands in state other then TIMEOUT od FAILED. (DB)
1349  *
1350  * FIXME:
1351  * It is not easy to release correctly commands according to their state when
1352  * devices are set offline, when the state is neither TIMEOUT nor FAILED.
1353  * When a device is set offline, we can have some command with
1354  * rq_status=RQ_SCSY_BUSY, owner=SCSI_STATE_HIGHLEVEL,
1355  * state=SCSI_STATE_INITIALIZING and the driver module cannot be released.
1356  * (DB, 17 May 1998)
1357  */
1358 		}
1359 	}
1360 
1361 	/*
1362 	 * Next, see if we need to request sense information.  if so,
1363 	 * then get it now, so we have a better idea of what to do.
1364 	 * FIXME(eric) this has the unfortunate side effect that if a host
1365 	 * adapter does not automatically request sense information, that we end
1366 	 * up shutting it down before we request it.  All hosts should be doing this
1367 	 * anyways, so for now all I have to say is tough noogies if you end up in here.
1368 	 * On second thought, this is probably a good idea.  We *really* want to give
1369 	 * authors an incentive to automatically request this.
1370 	 */
1371 	SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we need to request sense\n"));
1372 
1373 	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1374 		for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1375 		      recheck_sense_valid:
1376 			if (SCpnt->state != SCSI_STATE_FAILED || scsi_sense_valid(SCpnt)) {
1377 				continue;
1378 			}
1379 			SCSI_LOG_ERROR_RECOVERY(2, printk("scsi_unjam_host: Requesting sense for %d\n",
1380 							  SCpnt->target));
1381 			rtn = scsi_request_sense(SCpnt);
1382 			if (rtn != SUCCESS) {
1383 				continue;
1384 			}
1385 			SCSI_LOG_ERROR_RECOVERY(3, printk("Sense requested for %p - result %x\n",
1386 						  SCpnt, SCpnt->result));
1387 			SCSI_LOG_ERROR_RECOVERY(3, print_sense("bh", SCpnt));
1388 
1389 			result = scsi_decide_disposition(SCpnt);
1390 
1391 			/*
1392 			 * If the result was normal, then just pass it along to the
1393 			 * upper level.
1394 			 */
1395 			if (result == SUCCESS) {
1396 				SCpnt->host->host_failed--;
1397 				scsi_eh_finish_command(&SCdone, SCpnt);
1398 			}
1399 			if (result != NEEDS_RETRY) {
1400 				continue;
1401 			}
1402 			/*
1403 			 * We only come in here if we want to retry a
1404 			 * command.  The test to see whether the command
1405 			 * should be retried should be keeping track of the
1406 			 * number of tries, so we don't end up looping, of
1407 			 * course.
1408 			 */
1409 			SCpnt->state = NEEDS_RETRY;
1410 			rtn = scsi_eh_retry_command(SCpnt);
1411 			if (rtn != SUCCESS) {
1412 				SCpnt->state = SCSI_STATE_FAILED;
1413 				goto recheck_sense_valid;
1414 			}
1415 			/*
1416 			 * We eventually hand this one back to the top level.
1417 			 */
1418 			SCpnt->host->host_failed--;
1419 			scsi_eh_finish_command(&SCdone, SCpnt);
1420 		}
1421 	}
1422 
1423 	/*
1424 	 * Go through the list of commands and figure out where we stand and how bad things
1425 	 * really are.
1426 	 */
1427 	numfailed = 0;
1428 	timed_out = 0;
1429 	devices_failed = 0;
1430 	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1431 		unsigned int device_error = 0;
1432 
1433 		for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1434 			if (SCpnt->state == SCSI_STATE_FAILED) {
1435 				SCSI_LOG_ERROR_RECOVERY(5, printk("Command to ID %d failed\n",
1436 							 SCpnt->target));
1437 				numfailed++;
1438 				device_error++;
1439 			}
1440 			if (SCpnt->state == SCSI_STATE_TIMEOUT) {
1441 				SCSI_LOG_ERROR_RECOVERY(5, printk("Command to ID %d timedout\n",
1442 							 SCpnt->target));
1443 				timed_out++;
1444 				device_error++;
1445 			}
1446 		}
1447 		if (device_error > 0) {
1448 			devices_failed++;
1449 		}
1450 	}
1451 
1452 	SCSI_LOG_ERROR_RECOVERY(2, printk("Total of %d+%d commands on %d devices require eh work\n",
1453 				  numfailed, timed_out, devices_failed));
1454 
1455 	if (host->host_failed == 0) {
1456 		ourrtn = TRUE;
1457 		goto leave;
1458 	}
1459 	/*
1460 	 * Next, try and see whether or not it makes sense to try and abort
1461 	 * the running command.  This only works out to be the case if we have
1462 	 * one command that has timed out.  If the command simply failed, it
1463 	 * makes no sense to try and abort the command, since as far as the
1464 	 * host adapter is concerned, it isn't running.
1465 	 */
1466 
1467 	SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we want to try abort\n"));
1468 
1469 	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1470 		for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
1471 			if (SCloop->state != SCSI_STATE_TIMEOUT) {
1472 				continue;
1473 			}
1474 			rtn = scsi_try_to_abort_command(SCloop, ABORT_TIMEOUT);
1475 			if (rtn == SUCCESS) {
1476 				rtn = scsi_test_unit_ready(SCloop);
1477 
1478 				if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1479 					rtn = scsi_eh_retry_command(SCloop);
1480 
1481 					if (rtn == SUCCESS) {
1482 						SCloop->host->host_failed--;
1483 						scsi_eh_finish_command(&SCdone, SCloop);
1484 					}
1485 				}
1486 			}
1487 		}
1488 	}
1489 
1490 	/*
1491 	 * If we have corrected all of the problems, then we are done.
1492 	 */
1493 	if (host->host_failed == 0) {
1494 		ourrtn = TRUE;
1495 		goto leave;
1496 	}
1497 	/*
1498 	 * Either the abort wasn't appropriate, or it didn't succeed.
1499 	 * Now try a bus device reset.  Still, look to see whether we have
1500 	 * multiple devices that are jammed or not - if we have multiple devices,
1501 	 * it makes no sense to try BUS_DEVICE_RESET - we really would need
1502 	 * to try a BUS_RESET instead.
1503 	 *
1504 	 * Does this make sense - should we try BDR on each device individually?
1505 	 * Yes, definitely.
1506 	 */
1507 	SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Checking to see if we want to try BDR\n"));
1508 
1509 	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1510 		for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
1511 			if (SCloop->state == SCSI_STATE_FAILED
1512 			    || SCloop->state == SCSI_STATE_TIMEOUT) {
1513 				break;
1514 			}
1515 		}
1516 
1517 		if (SCloop == NULL) {
1518 			continue;
1519 		}
1520 		/*
1521 		 * OK, we have a device that is having problems.  Try and send
1522 		 * a bus device reset to it.
1523 		 *
1524 		 * FIXME(eric) - make sure we handle the case where multiple
1525 		 * commands to the same device have failed. They all must
1526 		 * get properly restarted.
1527 		 */
1528 		rtn = scsi_try_bus_device_reset(SCloop, RESET_TIMEOUT);
1529 
1530 		if (rtn == SUCCESS) {
1531 			rtn = scsi_test_unit_ready(SCloop);
1532 
1533 			if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1534 				rtn = scsi_eh_retry_command(SCloop);
1535 
1536 				if (rtn == SUCCESS) {
1537 					SCloop->host->host_failed--;
1538 					scsi_eh_finish_command(&SCdone, SCloop);
1539 				}
1540 			}
1541 		}
1542 	}
1543 
1544 	if (host->host_failed == 0) {
1545 		ourrtn = TRUE;
1546 		goto leave;
1547 	}
1548 	/*
1549 	 * If we ended up here, we have serious problems.  The only thing left
1550 	 * to try is a full bus reset.  If someone has grabbed the bus and isn't
1551 	 * letting go, then perhaps this will help.
1552 	 */
1553 	SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard bus reset\n"));
1554 
1555 	/*
1556 	 * We really want to loop over the various channels, and do this on
1557 	 * a channel by channel basis.  We should also check to see if any
1558 	 * of the failed commands are on soft_reset devices, and if so, skip
1559 	 * the reset.
1560 	 */
1561 	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1562 	      next_device:
1563 		for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1564 			if (SCpnt->state != SCSI_STATE_FAILED
1565 			    && SCpnt->state != SCSI_STATE_TIMEOUT) {
1566 				continue;
1567 			}
1568 			/*
1569 			 * We have a failed command.  Make sure there are no other failed
1570 			 * commands on the same channel that are timed out and implement a
1571 			 * soft reset.
1572 			 */
1573 			for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
1574 				for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
1575 					if (SCloop->channel != SCpnt->channel) {
1576 						continue;
1577 					}
1578 					if (SCloop->state != SCSI_STATE_FAILED
1579 					    && SCloop->state != SCSI_STATE_TIMEOUT) {
1580 						continue;
1581 					}
1582 					if (SDloop->soft_reset && SCloop->state == SCSI_STATE_TIMEOUT) {
1583 						/*
1584 						 * If this device uses the soft reset option, and this
1585 						 * is one of the devices acting up, then our only
1586 						 * option is to wait a bit, since the command is
1587 						 * supposedly still running.
1588 						 *
1589 						 * FIXME(eric) - right now we will just end up falling
1590 						 * through to the 'take device offline' case.
1591 						 *
1592 						 * FIXME(eric) - It is possible that the command completed
1593 						 * *after* the error recovery procedure started, and if this
1594 						 * is the case, we are worrying about nothing here.
1595 						 */
1596 
1597 						scsi_sleep(1 * HZ);
1598 						goto next_device;
1599 					}
1600 				}
1601 			}
1602 
1603 			/*
1604 			 * We now know that we are able to perform a reset for the
1605 			 * bus that SCpnt points to.  There are no soft-reset devices
1606 			 * with outstanding timed out commands.
1607 			 */
1608 			rtn = scsi_try_bus_reset(SCpnt);
1609 			if (rtn == SUCCESS) {
1610 				for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
1611 					for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
1612 						if (SCloop->channel != SCpnt->channel) {
1613 							continue;
1614 						}
1615 						if (SCloop->state != SCSI_STATE_FAILED
1616 						    && SCloop->state != SCSI_STATE_TIMEOUT) {
1617 							continue;
1618 						}
1619 						rtn = scsi_test_unit_ready(SCloop);
1620 
1621 						if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1622 							rtn = scsi_eh_retry_command(SCloop);
1623 
1624 							if (rtn == SUCCESS) {
1625 								SCpnt->host->host_failed--;
1626 								scsi_eh_finish_command(&SCdone, SCloop);
1627 							}
1628 						}
1629 						/*
1630 						 * If the bus reset worked, but we are still unable to
1631 						 * talk to the device, take it offline.
1632 						 * FIXME(eric) - is this really the correct thing to do?
1633 						 */
1634 						if (rtn != SUCCESS) {
1635 							printk(KERN_INFO "scsi: device set offline - not ready or command retry failed after bus reset: host %d channel %d id %d lun %d\n", SDloop->host->host_no, SDloop->channel, SDloop->id, SDloop->lun);
1636 
1637 							SDloop->online = FALSE;
1638 							SDloop->host->host_failed--;
1639 							scsi_eh_finish_command(&SCdone, SCloop);
1640 						}
1641 					}
1642 				}
1643 			}
1644 		}
1645 	}
1646 
1647 	if (host->host_failed == 0) {
1648 		ourrtn = TRUE;
1649 		goto leave;
1650 	}
1651 	/*
1652 	 * If we ended up here, we have serious problems.  The only thing left
1653 	 * to try is a full host reset - perhaps the firmware on the device
1654 	 * crashed, or something like that.
1655 	 *
1656 	 * It is assumed that a succesful host reset will cause *all* information
1657 	 * about the command to be flushed from both the host adapter *and* the
1658 	 * device.
1659 	 *
1660 	 * FIXME(eric) - it isn't clear that devices that implement the soft reset
1661 	 * option can ever be cleared except via cycling the power.  The problem is
1662 	 * that sending the host reset command will cause the host to forget
1663 	 * about the pending command, but the device won't forget.  For now, we
1664 	 * skip the host reset option if any of the failed devices are configured
1665 	 * to use the soft reset option.
1666 	 */
1667 	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1668 	      next_device2:
1669 		for (SCpnt = SDpnt->device_queue; SCpnt; SCpnt = SCpnt->next) {
1670 			if (SCpnt->state != SCSI_STATE_FAILED
1671 			    && SCpnt->state != SCSI_STATE_TIMEOUT) {
1672 				continue;
1673 			}
1674 			if (SDpnt->soft_reset && SCpnt->state == SCSI_STATE_TIMEOUT) {
1675 				/*
1676 				 * If this device uses the soft reset option, and this
1677 				 * is one of the devices acting up, then our only
1678 				 * option is to wait a bit, since the command is
1679 				 * supposedly still running.
1680 				 *
1681 				 * FIXME(eric) - right now we will just end up falling
1682 				 * through to the 'take device offline' case.
1683 				 */
1684 				SCSI_LOG_ERROR_RECOVERY(3,
1685 							printk("scsi_unjam_host: Unable to try hard host reset\n"));
1686 
1687 				/*
1688 				 * Due to the spinlock, we will never get out of this
1689 				 * loop without a proper wait. (DB)
1690 				 */
1691 				scsi_sleep(1 * HZ);
1692 
1693 				goto next_device2;
1694 			}
1695 			SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Try hard host reset\n"));
1696 
1697 			/*
1698 			 * FIXME(eric) - we need to obtain a valid SCpnt to perform this call.
1699 			 */
1700 			rtn = scsi_try_host_reset(SCpnt);
1701 			if (rtn == SUCCESS) {
1702 				/*
1703 				 * FIXME(eric) we assume that all commands are flushed from the
1704 				 * controller.  We should get a DID_RESET for all of the commands
1705 				 * that were pending.  We should ignore these so that we can
1706 				 * guarantee that we are in a consistent state.
1707 				 *
1708 				 * I believe this to be the case right now, but this needs to be
1709 				 * tested.
1710 				 */
1711 				for (SDloop = host->host_queue; SDloop; SDloop = SDloop->next) {
1712 					for (SCloop = SDloop->device_queue; SCloop; SCloop = SCloop->next) {
1713 						if (SCloop->state != SCSI_STATE_FAILED
1714 						    && SCloop->state != SCSI_STATE_TIMEOUT) {
1715 							continue;
1716 						}
1717 						rtn = scsi_test_unit_ready(SCloop);
1718 
1719 						if (rtn == SUCCESS && scsi_unit_is_ready(SCloop)) {
1720 							rtn = scsi_eh_retry_command(SCloop);
1721 
1722 							if (rtn == SUCCESS) {
1723 								SCpnt->host->host_failed--;
1724 								scsi_eh_finish_command(&SCdone, SCloop);
1725 							}
1726 						}
1727 						if (rtn != SUCCESS) {
1728 							printk(KERN_INFO "scsi: device set offline - not ready or command retry failed after host reset: host %d channel %d id %d lun %d\n", SDloop->host->host_no, SDloop->channel, SDloop->id, SDloop->lun);
1729 							SDloop->online = FALSE;
1730 							SDloop->host->host_failed--;
1731 							scsi_eh_finish_command(&SCdone, SCloop);
1732 						}
1733 					}
1734 				}
1735 			}
1736 		}
1737 	}
1738 
1739 	/*
1740 	 * If we solved all of the problems, then let's rev up the engines again.
1741 	 */
1742 	if (host->host_failed == 0) {
1743 		ourrtn = TRUE;
1744 		goto leave;
1745 	}
1746 	/*
1747 	 * If the HOST RESET failed, then for now we assume that the entire host
1748 	 * adapter is too hosed to be of any use.  For our purposes, however, it is
1749 	 * easier to simply take the devices offline that correspond to commands
1750 	 * that failed.
1751 	 */
1752 	SCSI_LOG_ERROR_RECOVERY(1, printk("scsi_unjam_host: Take device offline\n"));
1753 
1754 	for (SDpnt = host->host_queue; SDpnt; SDpnt = SDpnt->next) {
1755 		for (SCloop = SDpnt->device_queue; SCloop; SCloop = SCloop->next) {
1756 			if (SCloop->state == SCSI_STATE_FAILED || SCloop->state == SCSI_STATE_TIMEOUT) {
1757 				SDloop = SCloop->device;
1758 				if (SDloop->online == TRUE) {
1759 					printk(KERN_INFO "scsi: device set offline - command error recover failed: host %d channel %d id %d lun %d\n", SDloop->host->host_no, SDloop->channel, SDloop->id, SDloop->lun);
1760 					SDloop->online = FALSE;
1761 				}
1762 
1763 				/*
1764 				 * This should pass the failure up to the top level driver, and
1765 				 * it will have to try and do something intelligent with it.
1766 				 */
1767 				SCloop->host->host_failed--;
1768 
1769 				if (SCloop->state == SCSI_STATE_TIMEOUT) {
1770 					SCloop->result |= (DRIVER_TIMEOUT << 24);
1771 				}
1772 				SCSI_LOG_ERROR_RECOVERY(3, printk("Finishing command for device %d %x\n",
1773 				    SDloop->id, SCloop->result));
1774 
1775 				scsi_eh_finish_command(&SCdone, SCloop);
1776 			}
1777 		}
1778 	}
1779 
1780 	if (host->host_failed != 0) {
1781 		panic("scsi_unjam_host: Miscount of number of failed commands.\n");
1782 	}
1783 	SCSI_LOG_ERROR_RECOVERY(3, printk("scsi_unjam_host: Returning\n"));
1784 
1785 	ourrtn = FALSE;
1786 
1787       leave:
1788 
1789 	/*
1790 	 * We should have a list of commands that we 'finished' during the course of
1791 	 * error recovery.  This should be the same as the list of commands that timed out
1792 	 * or failed.  We are currently holding these things in a linked list - we didn't
1793 	 * put them in the bottom half queue because we wanted to keep things quiet while
1794 	 * we were working on recovery, and passing them up to the top level could easily
1795 	 * cause the top level to try and queue something else again.
1796 	 *
1797 	 * Start by marking that the host is no longer in error recovery.
1798 	 */
1799 	host->in_recovery = 0;
1800 
1801 	/*
1802 	 * Take the list of commands, and stick them in the bottom half queue.
1803 	 * The current implementation of scsi_done will do this for us - if need
1804 	 * be we can create a special version of this function to do the
1805 	 * same job for us.
1806 	 */
1807 	for (SCpnt = SCdone; SCpnt != NULL; SCpnt = SCdone) {
1808 		SCdone = SCpnt->bh_next;
1809 		SCpnt->bh_next = NULL;
1810                 /*
1811                  * Oh, this is a vile hack.  scsi_done() expects a timer
1812                  * to be running on the command.  If there isn't, it assumes
1813                  * that the command has actually timed out, and a timer
1814                  * handler is running.  That may well be how we got into
1815                  * this fix, but right now things are stable.  We add
1816                  * a timer back again so that we can report completion.
1817                  * scsi_done() will immediately remove said timer from
1818                  * the command, and then process it.
1819                  */
1820 		scsi_add_timer(SCpnt, 100, scsi_eh_times_out);
1821 		scsi_done(SCpnt);
1822 	}
1823 
1824 	return (ourrtn);
1825 }
1826 
1827 
1828 /*
1829  * Function:  scsi_error_handler
1830  *
1831  * Purpose:     Handle errors/timeouts of scsi commands, try and clean up
1832  *              and unjam the bus, and restart things.
1833  *
1834  * Arguments:   host    - host for which we are running.
1835  *
1836  * Returns:     Never returns.
1837  *
1838  * Notes:       This is always run in the context of a kernel thread.  The
1839  *              idea is that we start this thing up when the kernel starts
1840  *              up (one per host that we detect), and it immediately goes to
1841  *              sleep and waits for some event (i.e. failure).  When this
1842  *              takes place, we have the job of trying to unjam the bus
1843  *              and restarting things.
1844  *
1845  */
scsi_error_handler(void * data)1846 void scsi_error_handler(void *data)
1847 {
1848 	struct Scsi_Host *host = (struct Scsi_Host *) data;
1849 	int rtn;
1850 	DECLARE_MUTEX_LOCKED(sem);
1851 
1852         /*
1853          * We only listen to signals if the HA was loaded as a module.
1854          * If the HA was compiled into the kernel, then we don't listen
1855          * to any signals.
1856          */
1857         if( host->loaded_as_module ) {
1858 	siginitsetinv(&current->blocked, SHUTDOWN_SIGS);
1859 	} else {
1860 	siginitsetinv(&current->blocked, 0);
1861         }
1862 
1863 	lock_kernel();
1864 
1865 	/*
1866 	 *    Flush resources
1867 	 */
1868 
1869 	daemonize();
1870 	reparent_to_init();
1871 
1872 	/*
1873 	 * Set the name of this process.
1874 	 */
1875 
1876 	sprintf(current->comm, "scsi_eh_%d", host->host_no);
1877 
1878 	host->eh_wait = &sem;
1879 	host->ehandler = current;
1880 
1881 	unlock_kernel();
1882 
1883 	/*
1884 	 * Wake up the thread that created us.
1885 	 */
1886 	SCSI_LOG_ERROR_RECOVERY(3, printk("Wake up parent %d\n", sem_getcount(host->eh_notify)));
1887 
1888 	up(host->eh_notify);
1889 
1890 	while (1) {
1891 		/*
1892 		 * If we get a signal, it means we are supposed to go
1893 		 * away and die.  This typically happens if the user is
1894 		 * trying to unload a module.
1895 		 */
1896 		SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler sleeping\n"));
1897 
1898 		/*
1899 		 * Note - we always use down_interruptible with the semaphore
1900 		 * even if the module was loaded as part of the kernel.  The
1901 		 * reason is that down() will cause this thread to be counted
1902 		 * in the load average as a running process, and down
1903 		 * interruptible doesn't.  Given that we need to allow this
1904 		 * thread to die if the driver was loaded as a module, using
1905 		 * semaphores isn't unreasonable.
1906 		 */
1907 		down_interruptible(&sem);
1908 		if( host->loaded_as_module ) {
1909 			if (signal_pending(current))
1910 				break;
1911                 }
1912 
1913 		SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler waking up\n"));
1914 
1915 		host->eh_active = 1;
1916 
1917 		/*
1918 		 * We have a host that is failing for some reason.  Figure out
1919 		 * what we need to do to get it up and online again (if we can).
1920 		 * If we fail, we end up taking the thing offline.
1921 		 */
1922 		if (host->hostt->eh_strategy_handler != NULL) {
1923 			rtn = host->hostt->eh_strategy_handler(host);
1924 		} else {
1925 			rtn = scsi_unjam_host(host);
1926 		}
1927 
1928 		host->eh_active = 0;
1929 
1930 		/*
1931 		 * Note - if the above fails completely, the action is to take
1932 		 * individual devices offline and flush the queue of any
1933 		 * outstanding requests that may have been pending.  When we
1934 		 * restart, we restart any I/O to any other devices on the bus
1935 		 * which are still online.
1936 		 */
1937 		scsi_restart_operations(host);
1938 
1939 	}
1940 
1941 	SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler exiting\n"));
1942 
1943 	/*
1944 	 * Make sure that nobody tries to wake us up again.
1945 	 */
1946 	host->eh_wait = NULL;
1947 
1948 	/*
1949 	 * Knock this down too.  From this point on, the host is flying
1950 	 * without a pilot.  If this is because the module is being unloaded,
1951 	 * that's fine.  If the user sent a signal to this thing, we are
1952 	 * potentially in real danger.
1953 	 */
1954 	host->in_recovery = 0;
1955 	host->eh_active = 0;
1956 	host->ehandler = NULL;
1957 
1958 	/*
1959 	 * If anyone is waiting for us to exit (i.e. someone trying to unload
1960 	 * a driver), then wake up that process to let them know we are on
1961 	 * the way out the door.  This may be overkill - I *think* that we
1962 	 * could probably just unload the driver and send the signal, and when
1963 	 * the error handling thread wakes up that it would just exit without
1964 	 * needing to touch any memory associated with the driver itself.
1965 	 */
1966 	if (host->eh_notify != NULL)
1967 		up(host->eh_notify);
1968 }
1969 
1970 /*
1971  * Function:	scsi_new_reset
1972  *
1973  * Purpose:	Send requested reset to a bus or device at any phase.
1974  *
1975  * Arguments:	SCpnt	- command ptr to send reset with (usually a dummy)
1976  *		flag - reset type (see scsi.h)
1977  *
1978  * Returns:	SUCCESS/FAILURE.
1979  *
1980  * Notes:	This is used by the SCSI Generic driver to provide
1981  *		Bus/Device reset capability.
1982  */
1983 int
scsi_new_reset(Scsi_Cmnd * SCpnt,int flag)1984 scsi_new_reset(Scsi_Cmnd *SCpnt, int flag)
1985 {
1986 	int rtn;
1987 
1988 	switch(flag) {
1989 	case SCSI_TRY_RESET_DEVICE:
1990 		rtn = scsi_try_bus_device_reset(SCpnt, 0);
1991 		if (rtn == SUCCESS)
1992 			break;
1993 		/* FALLTHROUGH */
1994 	case SCSI_TRY_RESET_BUS:
1995 		rtn = scsi_try_bus_reset(SCpnt);
1996 		if (rtn == SUCCESS)
1997 			break;
1998 		/* FALLTHROUGH */
1999 	case SCSI_TRY_RESET_HOST:
2000 		rtn = scsi_try_host_reset(SCpnt);
2001 		break;
2002 	default:
2003 		rtn = FAILED;
2004 	}
2005 
2006 	return rtn;
2007 }
2008 
2009 /*
2010  * Overrides for Emacs so that we follow Linus's tabbing style.
2011  * Emacs will notice this stuff at the end of the file and automatically
2012  * adjust the settings for this buffer only.  This must remain at the end
2013  * of the file.
2014  * ---------------------------------------------------------------------------
2015  * Local variables:
2016  * c-indent-level: 4
2017  * c-brace-imaginary-offset: 0
2018  * c-brace-offset: -4
2019  * c-argdecl-indent: 4
2020  * c-label-offset: -4
2021  * c-continued-statement-offset: 4
2022  * c-continued-brace-offset: 0
2023  * indent-tabs-mode: nil
2024  * tab-width: 8
2025  * End:
2026  */
2027