1 /*
2  * raid1.c : Multiple Devices driver for Linux
3  *
4  * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
5  *
6  * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
7  *
8  * RAID-1 management functions.
9  *
10  * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
11  *
12  * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
13  * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
14  *
15  * This program is free software; you can redistribute it and/or modify
16  * it under the terms of the GNU General Public License as published by
17  * the Free Software Foundation; either version 2, or (at your option)
18  * any later version.
19  *
20  * You should have received a copy of the GNU General Public License
21  * (for example /usr/src/linux/COPYING); if not, write to the Free
22  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23  */
24 
25 #include <linux/module.h>
26 #include <linux/config.h>
27 #include <linux/slab.h>
28 #include <linux/raid/raid1.h>
29 #include <asm/atomic.h>
30 
31 #define MAJOR_NR MD_MAJOR
32 #define MD_DRIVER
33 #define MD_PERSONALITY
34 
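/*
 * How much consecutive read work (in sectors) one mirror is given before
 * the read balancer deliberately switches to another operational mirror;
 * raid1_run() copies this into each mirror's sect_limit.
 */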
35 #define MAX_WORK_PER_DISK 128
36 
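/*
 * Number of raid1_bh structures (plus raid_disks buffer_heads each) that
 * raid1_run() pre-allocates, so at least this many requests can keep
 * making progress even when kmalloc() starts failing.
 */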
37 #define	NR_RESERVED_BUFS	32
38 
39 
40 /*
41  * The following can be used to debug the driver
42  */
43 #define RAID1_DEBUG	0
44 
45 #if RAID1_DEBUG
46 #define PRINTK(x...)   printk(x)
47 #define inline
48 #define __inline__
49 #else
50 #define PRINTK(x...)  do { } while (0)
51 #endif
52 
53 
54 static mdk_personality_t raid1_personality;
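/*
 * List of requests that need attention from the raid1d thread: failed
 * reads to be retried on another mirror, and resync reads (cmd == SPECIAL)
 * that are now ready to be written out to the other mirrors.
 * raid1_reschedule_retry() appends at raid1_retry_tail and raid1d()
 * consumes from the head, both under retry_list_lock.
 */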
55 static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
56 struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
57 
58 static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
59 {
60 	/* return a linked list of "cnt" struct buffer_heads.
61 	 * don't take any off the free list unless we know we can
62 	 * get all we need, otherwise we could deadlock
63 	 */
64 	struct buffer_head *bh=NULL;
65 
66 	while(cnt) {
67 		struct buffer_head *t;
68 		md_spin_lock_irq(&conf->device_lock);
69 		if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
70 			while (cnt) {
71 				t = conf->freebh;
72 				conf->freebh = t->b_next;
73 				t->b_next = bh;
74 				bh = t;
75 				t->b_state = 0;
76 				conf->freebh_cnt--;
77 				cnt--;
78 			}
79 		md_spin_unlock_irq(&conf->device_lock);
80 		if (cnt == 0)
81 			break;
82 		t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
83 		if (t) {
84 			t->b_next = bh;
85 			bh = t;
86 			cnt--;
87 		} else {
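			/*
			 * Allocation failed: claim the free list.  While
			 * freebh_blocked is set other callers leave the free
			 * list alone, so buffer_heads returned through
			 * raid1_free_bh() accumulate for us until the reserve
			 * has refilled far enough.
			 */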
88 			PRINTK("raid1: waiting for %d bh\n", cnt);
89 			conf->freebh_blocked = 1;
90 			wait_disk_event(conf->wait_buffer,
91 					!conf->freebh_blocked ||
92 					conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
93 			conf->freebh_blocked = 0;
94 		}
95 	}
96 	return bh;
97 }
98 
99 static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
100 {
101 	unsigned long flags;
102 	spin_lock_irqsave(&conf->device_lock, flags);
103 	while (bh) {
104 		struct buffer_head *t = bh;
105 		bh=bh->b_next;
106 		if (t->b_pprev == NULL)
107 			kmem_cache_free(bh_cachep, t);
108 		else {
109 			t->b_next= conf->freebh;
110 			conf->freebh = t;
111 			conf->freebh_cnt++;
112 		}
113 	}
114 	spin_unlock_irqrestore(&conf->device_lock, flags);
115 	wake_up(&conf->wait_buffer);
116 }
117 
118 static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
119 {
120 	/* allocate cnt buffer_heads, possibly less if kmalloc fails */
121 	int i = 0;
122 
123 	while (i < cnt) {
124 		struct buffer_head *bh;
125 		bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
126 		if (!bh) break;
127 
128 		md_spin_lock_irq(&conf->device_lock);
129 		bh->b_pprev = &conf->freebh;
130 		bh->b_next = conf->freebh;
131 		conf->freebh = bh;
132 		conf->freebh_cnt++;
133 		md_spin_unlock_irq(&conf->device_lock);
134 
135 		i++;
136 	}
137 	return i;
138 }
139 
140 static void raid1_shrink_bh(raid1_conf_t *conf)
141 {
142 	/* discard all buffer_heads */
143 
144 	md_spin_lock_irq(&conf->device_lock);
145 	while (conf->freebh) {
146 		struct buffer_head *bh = conf->freebh;
147 		conf->freebh = bh->b_next;
148 		kmem_cache_free(bh_cachep, bh);
149 		conf->freebh_cnt--;
150 	}
151 	md_spin_unlock_irq(&conf->device_lock);
152 }
153 
154 
155 static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
156 {
157 	struct raid1_bh *r1_bh = NULL;
158 
159 	do {
160 		md_spin_lock_irq(&conf->device_lock);
161 		if (!conf->freer1_blocked && conf->freer1) {
162 			r1_bh = conf->freer1;
163 			conf->freer1 = r1_bh->next_r1;
164 			conf->freer1_cnt--;
165 			r1_bh->next_r1 = NULL;
166 			r1_bh->state = (1 << R1BH_PreAlloc);
167 			r1_bh->bh_req.b_state = 0;
168 		}
169 		md_spin_unlock_irq(&conf->device_lock);
170 		if (r1_bh)
171 			return r1_bh;
172 		r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
173 		if (r1_bh) {
174 			memset(r1_bh, 0, sizeof(*r1_bh));
175 			return r1_bh;
176 		}
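		/*
		 * Nothing free and kmalloc() failed: block other allocators
		 * from the freer1 list and wait for raid1_free_r1bh() to
		 * return part of the pre-allocated reserve.
		 */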
177 		conf->freer1_blocked = 1;
178 		wait_disk_event(conf->wait_buffer,
179 				!conf->freer1_blocked ||
180 				conf->freer1_cnt > NR_RESERVED_BUFS/2
181 			);
182 		conf->freer1_blocked = 0;
183 	} while (1);
184 }
185 
186 static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
187 {
188 	struct buffer_head *bh = r1_bh->mirror_bh_list;
189 	raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
190 
191 	r1_bh->mirror_bh_list = NULL;
192 
193 	if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
194 		unsigned long flags;
195 		spin_lock_irqsave(&conf->device_lock, flags);
196 		r1_bh->next_r1 = conf->freer1;
197 		conf->freer1 = r1_bh;
198 		conf->freer1_cnt++;
199 		spin_unlock_irqrestore(&conf->device_lock, flags);
200 		/* don't need to wakeup wait_buffer because
201 		 *  raid1_free_bh below will do that
202 		 */
203 	} else {
204 		kfree(r1_bh);
205 	}
206 	raid1_free_bh(conf, bh);
207 }
208 
209 static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
210 {
211 	int i = 0;
212 
213 	while (i < cnt) {
214 		struct raid1_bh *r1_bh;
215 		r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
216 		if (!r1_bh)
217 			break;
218 		memset(r1_bh, 0, sizeof(*r1_bh));
219 		set_bit(R1BH_PreAlloc, &r1_bh->state);
220 		r1_bh->mddev = conf->mddev;
221 
222 		raid1_free_r1bh(r1_bh);
223 		i++;
224 	}
225 	return i;
226 }
227 
228 static void raid1_shrink_r1bh(raid1_conf_t *conf)
229 {
230 	md_spin_lock_irq(&conf->device_lock);
231 	while (conf->freer1) {
232 		struct raid1_bh *r1_bh = conf->freer1;
233 		conf->freer1 = r1_bh->next_r1;
234 		conf->freer1_cnt--;
235 		kfree(r1_bh);
236 	}
237 	md_spin_unlock_irq(&conf->device_lock);
238 }
239 
240 
241 
242 static inline void raid1_free_buf(struct raid1_bh *r1_bh)
243 {
244 	unsigned long flags;
245 	struct buffer_head *bh = r1_bh->mirror_bh_list;
246 	raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
247 	r1_bh->mirror_bh_list = NULL;
248 
249 	spin_lock_irqsave(&conf->device_lock, flags);
250 	r1_bh->next_r1 = conf->freebuf;
251 	conf->freebuf = r1_bh;
252 	spin_unlock_irqrestore(&conf->device_lock, flags);
253 	raid1_free_bh(conf, bh);
254 }
255 
256 static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
257 {
258 	struct raid1_bh *r1_bh;
259 
260 	md_spin_lock_irq(&conf->device_lock);
261 	wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
262 	r1_bh = conf->freebuf;
263 	conf->freebuf = r1_bh->next_r1;
264 	r1_bh->next_r1= NULL;
265 	md_spin_unlock_irq(&conf->device_lock);
266 
267 	return r1_bh;
268 }
269 
270 static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
271 {
272 	int i = 0;
273 	struct raid1_bh *head = NULL, **tail;
274 	tail = &head;
275 
276 	while (i < cnt) {
277 		struct raid1_bh *r1_bh;
278 		struct page *page;
279 
280 		page = alloc_page(GFP_KERNEL);
281 		if (!page)
282 			break;
283 
284 		r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
285 		if (!r1_bh) {
286 			__free_page(page);
287 			break;
288 		}
289 		memset(r1_bh, 0, sizeof(*r1_bh));
290 		r1_bh->bh_req.b_page = page;
291 		r1_bh->bh_req.b_data = page_address(page);
292 		*tail = r1_bh;
293 		r1_bh->next_r1 = NULL;
294 		tail = & r1_bh->next_r1;
295 		i++;
296 	}
297 	/* this lock probably isn't needed, as at the time when
298 	 * we are allocating buffers, nobody else will be touching the
299 	 * freebuf list.  But it doesn't hurt....
300 	 */
301 	md_spin_lock_irq(&conf->device_lock);
302 	*tail = conf->freebuf;
303 	conf->freebuf = head;
304 	md_spin_unlock_irq(&conf->device_lock);
305 	return i;
306 }
307 
308 static void raid1_shrink_buffers (raid1_conf_t *conf)
309 {
310 	struct raid1_bh *head;
311 	md_spin_lock_irq(&conf->device_lock);
312 	head = conf->freebuf;
313 	conf->freebuf = NULL;
314 	md_spin_unlock_irq(&conf->device_lock);
315 
316 	while (head) {
317 		struct raid1_bh *r1_bh = head;
318 		head = r1_bh->next_r1;
319 		__free_page(r1_bh->bh_req.b_page);
320 		kfree(r1_bh);
321 	}
322 }
323 
324 static int raid1_map (mddev_t *mddev, kdev_t *rdev)
325 {
326 	raid1_conf_t *conf = mddev_to_conf(mddev);
327 	int i, disks = MD_SB_DISKS;
328 	unsigned long flags;
329 
330 	/*
331 	 * Later we do read balancing on the read side;
332 	 * for now we use the first available disk.
333 	 */
334 
335 	md_spin_lock_irqsave(&conf->device_lock, flags);
336 	for (i = 0; i < disks; i++) {
337 		if (conf->mirrors[i].operational) {
338 			*rdev = conf->mirrors[i].dev;
339 			md_spin_unlock_irqrestore(&conf->device_lock, flags);
340 			return (0);
341 		}
342 	}
343 	md_spin_unlock_irqrestore(&conf->device_lock, flags);
344 
345 	printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
346 	return (-1);
347 }
348 
349 static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
350 {
351 	unsigned long flags;
352 	mddev_t *mddev = r1_bh->mddev;
353 	raid1_conf_t *conf = mddev_to_conf(mddev);
354 
355 	md_spin_lock_irqsave(&retry_list_lock, flags);
356 	if (raid1_retry_list == NULL)
357 		raid1_retry_tail = &raid1_retry_list;
358 	*raid1_retry_tail = r1_bh;
359 	raid1_retry_tail = &r1_bh->next_r1;
360 	r1_bh->next_r1 = NULL;
361 	md_spin_unlock_irqrestore(&retry_list_lock, flags);
362 	md_wakeup_thread(conf->thread);
363 }
364 
365 
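/*
 * Balance the accounting done in raid1_make_request(): depending on where
 * the finished request falls relative to the resync window it is counted
 * out of cnt_done, cnt_future (same phase) or cnt_pending, and draining
 * the pending count wakes the resync code on wait_ready.
 */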
366 static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
367 {
368 	unsigned long flags;
369 	spin_lock_irqsave(&conf->segment_lock, flags);
370 	if (sector < conf->start_active)
371 		conf->cnt_done--;
372 	else if (sector >= conf->start_future && conf->phase == phase)
373 		conf->cnt_future--;
374 	else if (!--conf->cnt_pending)
375 		wake_up(&conf->wait_ready);
376 
377 	spin_unlock_irqrestore(&conf->segment_lock, flags);
378 }
379 
380 static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
381 {
382 	unsigned long flags;
383 	spin_lock_irqsave(&conf->segment_lock, flags);
384 	if (sector >= conf->start_ready)
385 		--conf->cnt_ready;
386 	else if (sector >= conf->start_active) {
387 		if (!--conf->cnt_active) {
388 			conf->start_active = conf->start_ready;
389 			wake_up(&conf->wait_done);
390 		}
391 	}
392 	spin_unlock_irqrestore(&conf->segment_lock, flags);
393 }
394 
395 /*
396  * raid1_end_bh_io() is called when we have finished servicing a mirrored
397  * operation and are ready to return a success/failure code to the buffer
398  * cache layer.
399  */
400 static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
401 {
402 	struct buffer_head *bh = r1_bh->master_bh;
403 
404 	io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
405 			test_bit(R1BH_SyncPhase, &r1_bh->state));
406 
407 	bh->b_end_io(bh, uptodate);
408 	raid1_free_r1bh(r1_bh);
409 }
410 void raid1_end_request (struct buffer_head *bh, int uptodate)
411 {
412 	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
413 
414 	/*
415 	 * this branch is our 'one mirror IO has finished' event handler:
416 	 */
417 	if (!uptodate)
418 		md_error (r1_bh->mddev, bh->b_dev);
419 	else
420 		/*
421 		 * Set R1BH_Uptodate in our master buffer_head, so that
422 		 * we will return a good error code to the higher
423 		 * levels even if IO on some other mirrored buffer fails.
424 		 *
425 		 * The 'master' represents the complex operation to
426 		 * user-side. So if something waits for IO, then it will
427 		 * wait for the 'master' buffer_head.
428 		 */
429 		set_bit (R1BH_Uptodate, &r1_bh->state);
430 
431 	/*
432 	 * We split up the read and write side, imho they are
433 	 * conceptually different.
434 	 */
435 
436 	if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
437 		/*
438 		 * we have only one buffer_head on the read side
439 		 */
440 
441 		if (uptodate) {
442 			raid1_end_bh_io(r1_bh, uptodate);
443 			return;
444 		}
445 		/*
446 		 * oops, read error:
447 		 */
448 		printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
449 			 partition_name(bh->b_dev), bh->b_blocknr);
450 		raid1_reschedule_retry(r1_bh);
451 		return;
452 	}
453 
454 	/*
455 	 * WRITE:
456 	 *
457 	 * Let's see if all mirrored write operations have finished
458 	 * already.
459 	 */
460 
461 	if (atomic_dec_and_test(&r1_bh->remaining))
462 		raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
463 }
464 
465 /*
466  * This routine returns the disk from which the requested read should
467  * be done. It keeps track of the last read position for every disk
468  * in the array, and when a new read request arrives, the disk whose
469  * last position is nearest to the request is chosen.
470  *
471  * TODO: now if there are 2 mirrors in the same 2 devices, performance
472  * degrades dramatically because position is mirror, not device based.
473  * This should be changed to be device based. Also atomic sequential
474  * reads should be somehow balanced.
475  */
476 
477 static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
478 {
479 	int new_disk = conf->last_used;
480 	const int sectors = bh->b_size >> 9;
481 	const unsigned long this_sector = bh->b_rsector;
482 	int disk = new_disk;
483 	unsigned long new_distance;
484 	unsigned long current_distance;
485 
486 	/*
487 	 * Check if it is sane at all to balance
488 	 */
489 
490 	if (conf->resync_mirrors)
491 		goto rb_out;
492 
493 
494 #if defined(CONFIG_ALPHA) && ((__GNUC__ < 3) || \
495 			      ((__GNUC__ == 3) && (__GNUC_MINOR__ < 3)))
496 	/* Work around a compiler bug in older gcc */
497 	new_disk = *(volatile int *)&new_disk;
498 #endif
499 
500 	/* make sure that disk is operational */
501 	while( !conf->mirrors[new_disk].operational) {
502 		if (new_disk <= 0) new_disk = conf->raid_disks;
503 		new_disk--;
504 		if (new_disk == disk) {
505 			/*
506 			 * This means no working disk was found.
507 			 * Nothing much to do, let's not change anything
508 			 * and hope for the best...
509 			 */
510 
511 			new_disk = conf->last_used;
512 
513 			goto rb_out;
514 		}
515 	}
516 	disk = new_disk;
517 	/* now disk == new_disk == starting point for search */
518 
519 	/*
520 	 * Don't touch anything for sequential reads.
521 	 */
522 
523 	if (this_sector == conf->mirrors[new_disk].head_position)
524 		goto rb_out;
525 
526 	/*
527 	 * If reads have been done only on a single disk
528 	 * for a time, let's give another disk a chance.
529 	 * This is for kicking those idling disks so that
530 	 * they would find work near some hotspot.
531 	 */
532 
533 	if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
534 		conf->sect_count = 0;
535 
536 #if defined(CONFIG_SPARC64) && (__GNUC__ == 2) && (__GNUC_MINOR__ == 92)
537 		/* Work around a compiler bug in egcs-2.92.11 19980921 */
538 		new_disk = *(volatile int *)&new_disk;
539 #endif
540 		do {
541 			if (new_disk<=0)
542 				new_disk = conf->raid_disks;
543 			new_disk--;
544 			if (new_disk == disk)
545 				break;
546 		} while ((conf->mirrors[new_disk].write_only) ||
547 			 (!conf->mirrors[new_disk].operational));
548 
549 		goto rb_out;
550 	}
551 
552 	current_distance = abs(this_sector -
553 				conf->mirrors[disk].head_position);
554 
555 	/* Find the disk which is closest */
556 
557 #if defined(CONFIG_ALPHA) && ((__GNUC__ < 3) || \
558 			      ((__GNUC__ == 3) && (__GNUC_MINOR__ < 3)))
559 	/* Work around a compiler bug in older gcc */
560 	disk = *(volatile int *)&disk;
561 #endif
562 	do {
563 		if (disk <= 0)
564 			disk = conf->raid_disks;
565 		disk--;
566 
567 		if ((conf->mirrors[disk].write_only) ||
568 				(!conf->mirrors[disk].operational))
569 			continue;
570 
571 		new_distance = abs(this_sector -
572 					conf->mirrors[disk].head_position);
573 
574 		if (new_distance < current_distance) {
575 			conf->sect_count = 0;
576 			current_distance = new_distance;
577 			new_disk = disk;
578 		}
579 	} while (disk != conf->last_used);
580 
581 rb_out:
582 	conf->mirrors[new_disk].head_position = this_sector + sectors;
583 
584 	conf->last_used = new_disk;
585 	conf->sect_count += sectors;
586 
587 	return new_disk;
588 }
589 
590 static int raid1_make_request (mddev_t *mddev, int rw,
591 			       struct buffer_head * bh)
592 {
593 	raid1_conf_t *conf = mddev_to_conf(mddev);
594 	struct buffer_head *bh_req, *bhl;
595 	struct raid1_bh * r1_bh;
596 	int disks = MD_SB_DISKS;
597 	int i, sum_bhs = 0;
598 	struct mirror_info *mirror;
599 	kdev_t dev;
600 
601 	if (!buffer_locked(bh))
602 		BUG();
603 
604 /*
605  * make_request() can abort the operation when READA is being
606  * used and no empty request is available.
607  *
608  * Currently, just replace the command with READ/WRITE.
609  */
610 	if (rw == READA)
611 		rw = READ;
612 
613 	r1_bh = raid1_alloc_r1bh (conf);
614 
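	/*
	 * Keep normal I/O out of the ACTIVE/READY/PENDING part of the resync
	 * window: stall until the request falls in the DONE or FUTURE region,
	 * then account for it there (see the segment description above
	 * raid1_sync_request()).
	 */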
615 	spin_lock_irq(&conf->segment_lock);
616 	wait_event_lock_irq(conf->wait_done,
617 			bh->b_rsector < conf->start_active ||
618 			bh->b_rsector >= conf->start_future,
619 			conf->segment_lock);
620 	if (bh->b_rsector < conf->start_active)
621 		conf->cnt_done++;
622 	else {
623 		conf->cnt_future++;
624 		if (conf->phase)
625 			set_bit(R1BH_SyncPhase, &r1_bh->state);
626 	}
627 	spin_unlock_irq(&conf->segment_lock);
628 
629 	/*
630 	 * i think the read and write branch should be separated completely,
631 	 * since we want to do read balancing on the read side for example.
632 	 * Alternative implementations? :) --mingo
633 	 */
634 
635 	r1_bh->master_bh = bh;
636 	r1_bh->mddev = mddev;
637 	r1_bh->cmd = rw;
638 
639 	if (rw == READ) {
640 		/*
641 		 * read balancing logic:
642 		 */
643 		spin_lock_irq(&conf->device_lock);
644 		mirror = conf->mirrors + raid1_read_balance(conf, bh);
645 		dev = mirror->dev;
646 		spin_unlock_irq(&conf->device_lock);
647 
648 		bh_req = &r1_bh->bh_req;
649 		memcpy(bh_req, bh, sizeof(*bh));
650 		bh_req->b_blocknr = bh->b_rsector;
651 		bh_req->b_dev = dev;
652 		bh_req->b_rdev = dev;
653 	/*	bh_req->b_rsector = bh->n_rsector; */
654 		bh_req->b_end_io = raid1_end_request;
655 		bh_req->b_private = r1_bh;
656 		generic_make_request (rw, bh_req);
657 		return 0;
658 	}
659 
660 	/*
661 	 * WRITE:
662 	 */
663 
664 	bhl = raid1_alloc_bh(conf, conf->raid_disks);
665 	spin_lock_irq(&conf->device_lock);
666 	for (i = 0; i < disks; i++) {
667 		struct buffer_head *mbh;
668 		if (!conf->mirrors[i].operational)
669 			continue;
670 
671 	/*
672 	 * We should use a private pool (size depending on NR_REQUEST),
673 	 * to avoid writes filling up the memory with bhs
674 	 *
675  	 * Such pools are much faster than kmalloc anyways (so we waste
676  	 * almost nothing by not using the master bh when writing and
677  	 * win a lot of cleanness) but for now we are cool enough. --mingo
678  	 *
679 	 * It's safe to sleep here, buffer heads cannot be used in a shared
680  	 * manner in the write branch. Look how we lock the buffer at the
681  	 * beginning of this function to grok the difference ;)
682 	 */
683  		mbh = bhl;
684 		if (mbh == NULL) {
685 			MD_BUG();
686 			break;
687 		}
688 		bhl = mbh->b_next;
689 		mbh->b_next = NULL;
690 		mbh->b_this_page = (struct buffer_head *)1;
691 
692  	/*
693  	 * prepare mirrored mbh (fields ordered for max mem throughput):
694  	 */
695 		mbh->b_blocknr    = bh->b_rsector;
696 		mbh->b_dev        = conf->mirrors[i].dev;
697 		mbh->b_rdev	  = conf->mirrors[i].dev;
698 		mbh->b_rsector	  = bh->b_rsector;
699 		mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
700 						(1<<BH_Mapped) | (1<<BH_Lock);
701 
702 		atomic_set(&mbh->b_count, 1);
703  		mbh->b_size       = bh->b_size;
704  		mbh->b_page	  = bh->b_page;
705  		mbh->b_data	  = bh->b_data;
706  		mbh->b_list       = BUF_LOCKED;
707  		mbh->b_end_io     = raid1_end_request;
708  		mbh->b_private    = r1_bh;
709 
710 		mbh->b_next = r1_bh->mirror_bh_list;
711 		r1_bh->mirror_bh_list = mbh;
712 		sum_bhs++;
713 	}
714 	spin_unlock_irq(&conf->device_lock);
715 	if (bhl) raid1_free_bh(conf,bhl);
716 	if (!sum_bhs) {
717 		/* Gag - all mirrors non-operational.. */
718 		raid1_end_bh_io(r1_bh, 0);
719 		return 0;
720 	}
721 	md_atomic_set(&r1_bh->remaining, sum_bhs);
722 
723 	/*
724 	 * We have to be a bit careful about the semaphore above, that's
725 	 * why we start the requests separately. Since kmalloc() could
726 	 * fail, sleep and make_request() can sleep too, this is the
727 	 * safer solution. Imagine, end_request decreasing the semaphore
728 	 * before we could have set it up ... We could play tricks with
729 	 * the semaphore (presetting it and correcting at the end if
730 	 * sum_bhs is not 'n' but we have to do end_request by hand if
731 	 * all requests finish before we have had a chance to set up the
732 	 * semaphore correctly ... lots of races).
733 	 */
734 	bh = r1_bh->mirror_bh_list;
735 	while(bh) {
736 		struct buffer_head *bh2 = bh;
737 		bh = bh->b_next;
738 		generic_make_request(rw, bh2);
739 	}
740 	return (0);
741 }
742 
743 static void raid1_status(struct seq_file *seq, mddev_t *mddev)
744 {
745 	raid1_conf_t *conf = mddev_to_conf(mddev);
746 	int i;
747 
748 	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
749 						 conf->working_disks);
750 	for (i = 0; i < conf->raid_disks; i++)
751 		seq_printf(seq, "%s",
752 			conf->mirrors[i].operational ? "U" : "_");
753 	seq_printf(seq, "]");
754 }
755 
756 #define LAST_DISK KERN_ALERT \
757 "raid1: only one disk left and IO error.\n"
758 
759 #define NO_SPARE_DISK KERN_ALERT \
760 "raid1: no spare disk left, degrading mirror level by one.\n"
761 
762 #define DISK_FAILED KERN_ALERT \
763 "raid1: Disk failure on %s, disabling device. \n" \
764 "	Operation continuing on %d devices\n"
765 
766 #define START_SYNCING KERN_ALERT \
767 "raid1: start syncing spare disk.\n"
768 
769 #define ALREADY_SYNCING KERN_INFO \
770 "raid1: syncing already in progress.\n"
771 
772 static void mark_disk_bad (mddev_t *mddev, int failed)
773 {
774 	raid1_conf_t *conf = mddev_to_conf(mddev);
775 	struct mirror_info *mirror = conf->mirrors+failed;
776 	mdp_super_t *sb = mddev->sb;
777 
778 	mirror->operational = 0;
779 	mark_disk_faulty(sb->disks+mirror->number);
780 	mark_disk_nonsync(sb->disks+mirror->number);
781 	mark_disk_inactive(sb->disks+mirror->number);
782 	if (!mirror->write_only)
783 		sb->active_disks--;
784 	else
785 		sb->spare_disks--;
786 	sb->working_disks--;
787 	sb->failed_disks++;
788 	mddev->sb_dirty = 1;
789 	md_wakeup_thread(conf->thread);
790 	if (!mirror->write_only)
791 		conf->working_disks--;
792 	printk (DISK_FAILED, partition_name (mirror->dev),
793 				 conf->working_disks);
794 }
795 
796 static int raid1_error (mddev_t *mddev, kdev_t dev)
797 {
798 	raid1_conf_t *conf = mddev_to_conf(mddev);
799 	struct mirror_info * mirrors = conf->mirrors;
800 	int disks = MD_SB_DISKS;
801 	int i;
802 	unsigned long flags;
803 
804 	/* Find the drive.
805 	 * If it is not operational, then we have already marked it as dead
806 	 * else if it is the last working disks, ignore the error, let the
807 	 * next level up know.
808 	 * else mark the drive as failed
809 	 */
810 
811 	for (i = 0; i < disks; i++)
812 		if (mirrors[i].dev==dev && mirrors[i].operational)
813 			break;
814 	if (i == disks)
815 		return 0;
816 
817 	if (i < conf->raid_disks && conf->working_disks == 1) {
818 		/* Don't fail the drive, act as though we were just a
819 		 * normal single drive
820 		 */
821 
822 		return 1;
823 	}
824 	md_spin_lock_irqsave(&conf->device_lock, flags);
825 	mark_disk_bad(mddev, i);
826 	md_spin_unlock_irqrestore(&conf->device_lock, flags);
827 	return 0;
828 }
829 
830 #undef LAST_DISK
831 #undef NO_SPARE_DISK
832 #undef DISK_FAILED
833 #undef START_SYNCING
834 
835 
836 static void print_raid1_conf (raid1_conf_t *conf)
837 {
838 	int i;
839 	struct mirror_info *tmp;
840 
841 	printk("RAID1 conf printout:\n");
842 	if (!conf) {
843 		printk("(conf==NULL)\n");
844 		return;
845 	}
846 	printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
847 			 conf->raid_disks, conf->nr_disks);
848 
849 	for (i = 0; i < MD_SB_DISKS; i++) {
850 		tmp = conf->mirrors + i;
851 		printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
852 			i, tmp->spare,tmp->operational,
853 			tmp->number,tmp->raid_disk,tmp->used_slot,
854 			partition_name(tmp->dev));
855 	}
856 }
857 
858 static void close_sync(raid1_conf_t *conf)
859 {
860 	mddev_t *mddev = conf->mddev;
861 	/* If reconstruction was interrupted, we need to close the "active" and "pending"
862 	 * holes.
863 	 * we know that there are no active rebuild requests, so cnt_active == cnt_ready == 0
864 	 */
865 	/* this is really needed when recovery stops too... */
866 	spin_lock_irq(&conf->segment_lock);
867 	conf->start_active = conf->start_pending;
868 	conf->start_ready = conf->start_pending;
869 	wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
870 	conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
871 	conf->start_future = (mddev->sb->size<<1)+1;
872 	conf->cnt_pending = conf->cnt_future;
873 	conf->cnt_future = 0;
874 	conf->phase = conf->phase ^1;
875 	wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
876 	conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
877 	conf->phase = 0;
878 	conf->cnt_future = conf->cnt_done;
879 	conf->cnt_done = 0;
880 	spin_unlock_irq(&conf->segment_lock);
881 	wake_up(&conf->wait_done);
882 }
883 
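/*
 * Handle the md-core disk operations (spare activate/deactivate/write,
 * hot-add, hot-remove) under conf->device_lock.  All of them are refused
 * while a resync of the mirrors is in progress.
 */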
884 static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
885 {
886 	int err = 0;
887 	int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
888 	raid1_conf_t *conf = mddev->private;
889 	struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
890 	mdp_super_t *sb = mddev->sb;
891 	mdp_disk_t *failed_desc, *spare_desc, *added_desc;
892 	mdk_rdev_t *spare_rdev, *failed_rdev;
893 
894 	if (conf->resync_mirrors)
895 		return 1; /* Cannot do any diskops during a resync */
896 
897 	switch (state) {
898 	case DISKOP_SPARE_ACTIVE:
899 	case DISKOP_SPARE_INACTIVE:
900 		/* need to wait for pending sync io before locking device */
901 		close_sync(conf);
902 	}
903 
904 	md_spin_lock_irq(&conf->device_lock);
905 	/*
906 	 * Need the conf lock when printing out state else we get BUG()s
907 	 */
908 	print_raid1_conf(conf);
909 	/*
910 	 * find the disk ...
911 	 */
912 	switch (state) {
913 
914 	case DISKOP_SPARE_ACTIVE:
915 
916 		/*
917 		 * Find the failed disk within the RAID1 configuration ...
918 		 * (this can only be in the first conf->working_disks part)
919 		 */
920 		for (i = 0; i < conf->raid_disks; i++) {
921 			tmp = conf->mirrors + i;
922 			if ((!tmp->operational && !tmp->spare) ||
923 					!tmp->used_slot) {
924 				failed_disk = i;
925 				break;
926 			}
927 		}
928 		/*
929 		 * When we activate a spare disk we _must_ have a disk in
930 		 * the lower (active) part of the array to replace.
931 		 */
932 		if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
933 			MD_BUG();
934 			err = 1;
935 			goto abort;
936 		}
937 		/* fall through */
938 
939 	case DISKOP_SPARE_WRITE:
940 	case DISKOP_SPARE_INACTIVE:
941 
942 		/*
943 		 * Find the spare disk ... (can only be in the 'high'
944 		 * area of the array)
945 		 */
946 		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
947 			tmp = conf->mirrors + i;
948 			if (tmp->spare && tmp->number == (*d)->number) {
949 				spare_disk = i;
950 				break;
951 			}
952 		}
953 		if (spare_disk == -1) {
954 			MD_BUG();
955 			err = 1;
956 			goto abort;
957 		}
958 		break;
959 
960 	case DISKOP_HOT_REMOVE_DISK:
961 
962 		for (i = 0; i < MD_SB_DISKS; i++) {
963 			tmp = conf->mirrors + i;
964 			if (tmp->used_slot && (tmp->number == (*d)->number)) {
965 				if (tmp->operational) {
966 					err = -EBUSY;
967 					goto abort;
968 				}
969 				removed_disk = i;
970 				break;
971 			}
972 		}
973 		if (removed_disk == -1) {
974 			MD_BUG();
975 			err = 1;
976 			goto abort;
977 		}
978 		break;
979 
980 	case DISKOP_HOT_ADD_DISK:
981 
982 		for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
983 			tmp = conf->mirrors + i;
984 			if (!tmp->used_slot) {
985 				added_disk = i;
986 				break;
987 			}
988 		}
989 		if (added_disk == -1) {
990 			MD_BUG();
991 			err = 1;
992 			goto abort;
993 		}
994 		break;
995 	}
996 
997 	switch (state) {
998 	/*
999 	 * Switch the spare disk to write-only mode:
1000 	 */
1001 	case DISKOP_SPARE_WRITE:
1002 		sdisk = conf->mirrors + spare_disk;
1003 		sdisk->operational = 1;
1004 		sdisk->write_only = 1;
1005 		break;
1006 	/*
1007 	 * Deactivate a spare disk:
1008 	 */
1009 	case DISKOP_SPARE_INACTIVE:
1010 		if (conf->start_future > 0) {
1011 			MD_BUG();
1012 			err = -EBUSY;
1013 			break;
1014 		}
1015 		sdisk = conf->mirrors + spare_disk;
1016 		sdisk->operational = 0;
1017 		sdisk->write_only = 0;
1018 		break;
1019 	/*
1020 	 * Activate (mark read-write) the (now sync) spare disk,
1021 	 * which means we switch its 'raid position' (->raid_disk)
1022 	 * with the failed disk. (only the first 'conf->nr_disks'
1023 	 * slots are used for 'real' disks and we must preserve this
1024 	 * property)
1025 	 */
1026 	case DISKOP_SPARE_ACTIVE:
1027 		if (conf->start_future > 0) {
1028 			MD_BUG();
1029 			err = -EBUSY;
1030 			break;
1031 		}
1032 		sdisk = conf->mirrors + spare_disk;
1033 		fdisk = conf->mirrors + failed_disk;
1034 
1035 		spare_desc = &sb->disks[sdisk->number];
1036 		failed_desc = &sb->disks[fdisk->number];
1037 
1038 		if (spare_desc != *d) {
1039 			MD_BUG();
1040 			err = 1;
1041 			goto abort;
1042 		}
1043 
1044 		if (spare_desc->raid_disk != sdisk->raid_disk) {
1045 			MD_BUG();
1046 			err = 1;
1047 			goto abort;
1048 		}
1049 
1050 		if (sdisk->raid_disk != spare_disk) {
1051 			MD_BUG();
1052 			err = 1;
1053 			goto abort;
1054 		}
1055 
1056 		if (failed_desc->raid_disk != fdisk->raid_disk) {
1057 			MD_BUG();
1058 			err = 1;
1059 			goto abort;
1060 		}
1061 
1062 		if (fdisk->raid_disk != failed_disk) {
1063 			MD_BUG();
1064 			err = 1;
1065 			goto abort;
1066 		}
1067 
1068 		/*
1069 		 * do the switch finally
1070 		 */
1071 		spare_rdev = find_rdev_nr(mddev, spare_desc->number);
1072 		failed_rdev = find_rdev_nr(mddev, failed_desc->number);
1073 
1074 		/* There must be a spare_rdev, but there may not be a
1075 		 * failed_rdev.  That slot might be empty...
1076 		 */
1077 		spare_rdev->desc_nr = failed_desc->number;
1078 		if (failed_rdev)
1079 			failed_rdev->desc_nr = spare_desc->number;
1080 
1081 		xchg_values(*spare_desc, *failed_desc);
1082 		xchg_values(*fdisk, *sdisk);
1083 
1084 		/*
1085 		 * (careful, 'failed' and 'spare' are switched from now on)
1086 		 *
1087 		 * we want to preserve linear numbering and we want to
1088 		 * give the proper raid_disk number to the now activated
1089 		 * disk. (this means we switch back these values)
1090 		 */
1091 
1092 		xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1093 		xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1094 		xchg_values(spare_desc->number, failed_desc->number);
1095 		xchg_values(sdisk->number, fdisk->number);
1096 
1097 		*d = failed_desc;
1098 
1099 		if (sdisk->dev == MKDEV(0,0))
1100 			sdisk->used_slot = 0;
1101 		/*
1102 		 * this really activates the spare.
1103 		 */
1104 		fdisk->spare = 0;
1105 		fdisk->write_only = 0;
1106 
1107 		/*
1108 		 * if we activate a spare, we definitely replace a
1109 		 * non-operational disk slot in the 'low' area of
1110 		 * the disk array.
1111 		 */
1112 
1113 		conf->working_disks++;
1114 
1115 		break;
1116 
1117 	case DISKOP_HOT_REMOVE_DISK:
1118 		rdisk = conf->mirrors + removed_disk;
1119 
1120 		if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1121 			MD_BUG();
1122 			err = 1;
1123 			goto abort;
1124 		}
1125 		rdisk->dev = MKDEV(0,0);
1126 		rdisk->used_slot = 0;
1127 		conf->nr_disks--;
1128 		break;
1129 
1130 	case DISKOP_HOT_ADD_DISK:
1131 		adisk = conf->mirrors + added_disk;
1132 		added_desc = *d;
1133 
1134 		if (added_disk != added_desc->number) {
1135 			MD_BUG();
1136 			err = 1;
1137 			goto abort;
1138 		}
1139 
1140 		adisk->number = added_desc->number;
1141 		adisk->raid_disk = added_desc->raid_disk;
1142 		adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1143 
1144 		adisk->operational = 0;
1145 		adisk->write_only = 0;
1146 		adisk->spare = 1;
1147 		adisk->used_slot = 1;
1148 		adisk->head_position = 0;
1149 		conf->nr_disks++;
1150 
1151 		break;
1152 
1153 	default:
1154 		MD_BUG();
1155 		err = 1;
1156 		goto abort;
1157 	}
1158 abort:
1159 	print_raid1_conf(conf);
1160 	md_spin_unlock_irq(&conf->device_lock);
1161 	if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
1162 		/* should move to "END_REBUILD" when such exists */
1163 		raid1_shrink_buffers(conf);
1164 
1165 	return err;
1166 }
1167 
1168 
1169 #define IO_ERROR KERN_ALERT \
1170 "raid1: %s: unrecoverable I/O read error for block %lu\n"
1171 
1172 #define REDIRECT_SECTOR KERN_ERR \
1173 "raid1: %s: redirecting sector %lu to another mirror\n"
1174 
1175 /*
1176  * This is a kernel thread which:
1177  *
1178  *	1.	Retries failed read operations on working mirrors.
1179  *	2.	Updates the raid superblock when problems are encountered.
1180  *	3.	Performs writes following reads for array synchronising.
1181  */
1182 static void end_sync_write(struct buffer_head *bh, int uptodate);
1183 static void end_sync_read(struct buffer_head *bh, int uptodate);
1184 
1185 static void raid1d (void *data)
1186 {
1187 	struct raid1_bh *r1_bh;
1188 	struct buffer_head *bh;
1189 	unsigned long flags;
1190 	raid1_conf_t *conf = data;
1191 	mddev_t *mddev = conf->mddev;
1192 	kdev_t dev;
1193 
1194 	if (mddev->sb_dirty)
1195 		md_update_sb(mddev);
1196 
1197 	for (;;) {
1198 		md_spin_lock_irqsave(&retry_list_lock, flags);
1199 		r1_bh = raid1_retry_list;
1200 		if (!r1_bh)
1201 			break;
1202 		raid1_retry_list = r1_bh->next_r1;
1203 		md_spin_unlock_irqrestore(&retry_list_lock, flags);
1204 
1205 		mddev = r1_bh->mddev;
1206 		bh = &r1_bh->bh_req;
1207 		switch(r1_bh->cmd) {
1208 		case SPECIAL:
1209 			/* have to allocate lots of bh structures and
1210 			 * schedule writes
1211 			 */
1212 			if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
1213 				int i, sum_bhs = 0;
1214 				int disks = MD_SB_DISKS;
1215 				struct buffer_head *bhl, *mbh;
1216 
1217 				conf = mddev_to_conf(mddev);
1218 				bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
1219 				spin_lock_irq(&conf->device_lock);
1220 				for (i = 0; i < disks ; i++) {
1221 					if (!conf->mirrors[i].operational)
1222 						continue;
1223 					if (i==conf->last_used)
1224 						/* we read from here, no need to write */
1225 						continue;
1226 					if (i < conf->raid_disks
1227 					    && !conf->resync_mirrors)
1228 						/* don't need to write this,
1229 						 * we are just rebuilding */
1230 						continue;
1231 					mbh = bhl;
1232 					if (!mbh) {
1233 						MD_BUG();
1234 						break;
1235 					}
1236 					bhl = mbh->b_next;
1237 					mbh->b_this_page = (struct buffer_head *)1;
1238 
1239 
1240 				/*
1241 				 * prepare mirrored bh (fields ordered for max mem throughput):
1242 				 */
1243 					mbh->b_blocknr    = bh->b_blocknr;
1244 					mbh->b_dev        = conf->mirrors[i].dev;
1245 					mbh->b_rdev	  = conf->mirrors[i].dev;
1246 					mbh->b_rsector	  = bh->b_blocknr;
1247 					mbh->b_state      = (1<<BH_Req) | (1<<BH_Dirty) |
1248 						(1<<BH_Mapped) | (1<<BH_Lock);
1249 					atomic_set(&mbh->b_count, 1);
1250 					mbh->b_size       = bh->b_size;
1251 					mbh->b_page	  = bh->b_page;
1252 					mbh->b_data	  = bh->b_data;
1253 					mbh->b_list       = BUF_LOCKED;
1254 					mbh->b_end_io     = end_sync_write;
1255 					mbh->b_private    = r1_bh;
1256 
1257 					mbh->b_next = r1_bh->mirror_bh_list;
1258 					r1_bh->mirror_bh_list = mbh;
1259 
1260 					sum_bhs++;
1261 				}
1262 				spin_unlock_irq(&conf->device_lock);
1263 				md_atomic_set(&r1_bh->remaining, sum_bhs);
1264 				if (bhl) raid1_free_bh(conf, bhl);
1265 				mbh = r1_bh->mirror_bh_list;
1266 
1267 				if (!sum_bhs) {
1268 					/* nowhere to write this to... I guess we
1269 					 * must be done
1270 					 */
1271 					sync_request_done(bh->b_blocknr, conf);
1272 					md_done_sync(mddev, bh->b_size>>9, 0);
1273 					raid1_free_buf(r1_bh);
1274 				} else
1275 				while (mbh) {
1276 					struct buffer_head *bh1 = mbh;
1277 					mbh = mbh->b_next;
1278 					generic_make_request(WRITE, bh1);
1279 					md_sync_acct(bh1->b_dev, bh1->b_size/512);
1280 				}
1281 			} else {
1282 				/* There is no point trying a read-for-reconstruct
1283 				 * as reconstruct is about to be aborted
1284 				 */
1285 
1286 				printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1287 				md_done_sync(mddev, bh->b_size>>9, 0);
1288 			}
1289 
1290 			break;
1291 		case READ:
1292 		case READA:
1293 			dev = bh->b_dev;
1294 			raid1_map (mddev, &bh->b_dev);
1295 			if (bh->b_dev == dev) {
1296 				printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1297 				raid1_end_bh_io(r1_bh, 0);
1298 			} else {
1299 				printk (REDIRECT_SECTOR,
1300 					partition_name(bh->b_dev), bh->b_blocknr);
1301 				bh->b_rdev = bh->b_dev;
1302 				bh->b_rsector = bh->b_blocknr;
1303 				generic_make_request (r1_bh->cmd, bh);
1304 			}
1305 			break;
1306 		}
1307 	}
1308 	md_spin_unlock_irqrestore(&retry_list_lock, flags);
1309 }
1310 #undef IO_ERROR
1311 #undef REDIRECT_SECTOR
1312 
1313 /*
1314  * Private kernel thread to reconstruct mirrors after an unclean
1315  * shutdown.
1316  */
1317 static void raid1syncd (void *data)
1318 {
1319 	raid1_conf_t *conf = data;
1320 	mddev_t *mddev = conf->mddev;
1321 
1322 	if (!conf->resync_mirrors)
1323 		return;
1324 	if (conf->resync_mirrors == 2)
1325 		return;
1326 	down(&mddev->recovery_sem);
1327 	if (!md_do_sync(mddev, NULL)) {
1328 		/*
1329 		 * Only if everything went Ok.
1330 		 */
1331 		conf->resync_mirrors = 0;
1332 	}
1333 
1334 	close_sync(conf);
1335 
1336 	up(&mddev->recovery_sem);
1337 	raid1_shrink_buffers(conf);
1338 
1339 	md_recover_arrays(); /* in case we are degraded and a spare is available */
1340 }
1341 
1342 /*
1343  * perform a "sync" on one "block"
1344  *
1345  * We need to make sure that no normal I/O request - particularly write
1346  * requests - conflict with active sync requests.
1347  * This is achieved by conceptually dividing the device space into a
1348  * number of sections:
1349  *  DONE: 0 .. a-1     These blocks are in-sync
1350  *  ACTIVE: a.. b-1    These blocks may have active sync requests, but
1351  *                     no normal IO requests
1352  *  READY: b .. c-1    These blocks have no normal IO requests - sync
1353  *                     request may be happening
1354  *  PENDING: c .. d-1  These blocks may have IO requests, but no new
1355  *                     ones will be added
1356  *  FUTURE:  d .. end  These blocks are not to be considered yet. IO may
1357  *                     be happening, but not sync
1358  *
1359  * We keep a
1360  *   phase    which flips (0 or 1) each time d moves and
1361  * a count of:
1362  *   z =  active io requests in FUTURE since d moved - marked with
1363  *        current phase
1364  *   y =  active io requests in FUTURE before d moved, or PENDING -
1365  *        marked with previous phase
1366  *   x =  active sync requests in READY
1367  *   w =  active sync requests in ACTIVE
1368  *   v =  active io requests in DONE
1369  *
1370  * Normally, a=b=c=d=0 and z= active io requests
1371  *   or a=b=c=d=END and v= active io requests
1372  * Allowed changes to a,b,c,d:
1373  * A:  c==d &&  y==0 -> d+=window, y=z, z=0, phase=!phase
1374  * B:  y==0 -> c=d
1375  * C:   b=c, w+=x, x=0
1376  * D:  w==0 -> a=b
1377  * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
1378  *
1379  * At start of sync we apply A.
1380  * When y reaches 0, we apply B then A, then begin sync requests.
1381  * When the sync point reaches c-1, we wait for y==0 and w==0, and
1382  * then apply B then A then D then C.
1383  * Finally, we apply E.
1384  *
1385  * The sync request simply issues a "read" against a working drive
1386  * This is marked so that on completion the raid1d thread is woken to
1387  * issue suitable write requests
1388  */
1389 
1390 static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
1391 {
1392 	raid1_conf_t *conf = mddev_to_conf(mddev);
1393 	struct mirror_info *mirror;
1394 	struct raid1_bh *r1_bh;
1395 	struct buffer_head *bh;
1396 	int bsize;
1397 	int disk;
1398 	int block_nr;
1399 	int buffs;
1400 	kdev_t dev;
1401 
1402 	if (!sector_nr) {
1403 		/* we want enough buffers to hold twice the window of 128 */
1404 		buffs = 128 *2 / (PAGE_SIZE>>9);
1405 		buffs = raid1_grow_buffers(conf, buffs);
1406 		if (buffs < 2)
1407 			goto nomem;
1408 		conf->window = buffs*(PAGE_SIZE>>9)/2;
1409 	}
1410 	spin_lock_irq(&conf->segment_lock);
1411 	if (!sector_nr) {
1412 		/* initialize ...*/
1413 		conf->start_active = 0;
1414 		conf->start_ready = 0;
1415 		conf->start_pending = 0;
1416 		conf->start_future = 0;
1417 		conf->phase = 0;
1418 
1419 		conf->cnt_future += conf->cnt_done+conf->cnt_pending;
1420 		conf->cnt_done = conf->cnt_pending = 0;
1421 		if (conf->cnt_ready || conf->cnt_active)
1422 			MD_BUG();
1423 	}
1424 	while (sector_nr >= conf->start_pending) {
1425 		PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
1426 			sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
1427 			conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
1428 		wait_event_lock_irq(conf->wait_done,
1429 					!conf->cnt_active,
1430 					conf->segment_lock);
1431 		wait_event_lock_irq(conf->wait_ready,
1432 					!conf->cnt_pending,
1433 					conf->segment_lock);
1434 		conf->start_active = conf->start_ready;
1435 		conf->start_ready = conf->start_pending;
1436 		conf->start_pending = conf->start_future;
1437 		conf->start_future = conf->start_future+conf->window;
1438 		// Note: falling off the end is not a problem
1439 		conf->phase = conf->phase ^1;
1440 		conf->cnt_active = conf->cnt_ready;
1441 		conf->cnt_ready = 0;
1442 		conf->cnt_pending = conf->cnt_future;
1443 		conf->cnt_future = 0;
1444 		wake_up(&conf->wait_done);
1445 	}
1446 	conf->cnt_ready++;
1447 	spin_unlock_irq(&conf->segment_lock);
1448 
1449 
1450 	/* If reconstructing, and >1 working disc,
1451 	 * could dedicate one to rebuild and others to
1452 	 * service read requests ..
1453 	 */
1454 	spin_lock_irq(&conf->device_lock);
1455 	disk = conf->last_used;
1456 	/* make sure disk is operational */
1457 	while (!conf->mirrors[disk].operational) {
1458 		if (disk <= 0) disk = conf->raid_disks;
1459 		disk--;
1460 		if (disk == conf->last_used)
1461 			break;
1462 	}
1463 	conf->last_used = disk;
1464 
1465 	mirror = conf->mirrors+conf->last_used;
1466 	dev = mirror->dev;
1467 	spin_unlock_irq(&conf->device_lock);
1468 
1469 	r1_bh = raid1_alloc_buf (conf);
1470 	r1_bh->master_bh = NULL;
1471 	r1_bh->mddev = mddev;
1472 	r1_bh->cmd = SPECIAL;
1473 	bh = &r1_bh->bh_req;
1474 
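	/*
	 * Grow the read into the largest naturally aligned block (up to one
	 * page) that still leaves room before the end of the device.
	 */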
1475 	block_nr = sector_nr;
1476 	bsize = 512;
1477 	while (!(block_nr & 1) && bsize < PAGE_SIZE
1478 			&& (block_nr+2)*(bsize>>9) <= (mddev->sb->size *2)) {
1479 		block_nr >>= 1;
1480 		bsize <<= 1;
1481 	}
1482 	bh->b_size = bsize;
1483 	bh->b_list = BUF_LOCKED;
1484 	bh->b_dev = dev;
1485 	bh->b_rdev = dev;
1486 	bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
1487 	if (!bh->b_page)
1488 		BUG();
1489 	if (!bh->b_data)
1490 		BUG();
1491 	if (bh->b_data != page_address(bh->b_page))
1492 		BUG();
1493 	bh->b_end_io = end_sync_read;
1494 	bh->b_private = r1_bh;
1495 	bh->b_blocknr = sector_nr;
1496 	bh->b_rsector = sector_nr;
1497 	init_waitqueue_head(&bh->b_wait);
1498 
1499 	generic_make_request(READ, bh);
1500 	md_sync_acct(bh->b_dev, bh->b_size/512);
1501 
1502 	return (bsize >> 9);
1503 
1504 nomem:
1505 	raid1_shrink_buffers(conf);
1506 	return -ENOMEM;
1507 }
1508 
1509 static void end_sync_read(struct buffer_head *bh, int uptodate)
1510 {
1511 	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1512 
1513 	/* we have read a block, now it needs to be re-written,
1514 	 * or re-read if the read failed.
1515 	 * We don't do much here, just schedule handling by raid1d
1516 	 */
1517 	if (!uptodate)
1518 		md_error (r1_bh->mddev, bh->b_dev);
1519 	else
1520 		set_bit(R1BH_Uptodate, &r1_bh->state);
1521 	raid1_reschedule_retry(r1_bh);
1522 }
1523 
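/*
 * Completion handler for the mirrored writes that raid1d issues for a
 * resync block: when the last write finishes, release the buffer and tell
 * the md core how this chunk went.
 */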
1524 static void end_sync_write(struct buffer_head *bh, int uptodate)
1525 {
1526  	struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1527 
1528 	if (!uptodate)
1529  		md_error (r1_bh->mddev, bh->b_dev);
1530 	if (atomic_dec_and_test(&r1_bh->remaining)) {
1531 		mddev_t *mddev = r1_bh->mddev;
1532  		unsigned long sect = bh->b_blocknr;
1533 		int size = bh->b_size;
1534 		raid1_free_buf(r1_bh);
1535 		sync_request_done(sect, mddev_to_conf(mddev));
1536 		md_done_sync(mddev,size>>9, uptodate);
1537 	}
1538 }
1539 
1540 #define INVALID_LEVEL KERN_WARNING \
1541 "raid1: md%d: raid level not set to mirroring (%d)\n"
1542 
1543 #define NO_SB KERN_ERR \
1544 "raid1: disabled mirror %s (couldn't access raid superblock)\n"
1545 
1546 #define ERRORS KERN_ERR \
1547 "raid1: disabled mirror %s (errors detected)\n"
1548 
1549 #define NOT_IN_SYNC KERN_ERR \
1550 "raid1: disabled mirror %s (not in sync)\n"
1551 
1552 #define INCONSISTENT KERN_ERR \
1553 "raid1: disabled mirror %s (inconsistent descriptor)\n"
1554 
1555 #define ALREADY_RUNNING KERN_ERR \
1556 "raid1: disabled mirror %s (mirror %d already operational)\n"
1557 
1558 #define OPERATIONAL KERN_INFO \
1559 "raid1: device %s operational as mirror %d\n"
1560 
1561 #define MEM_ERROR KERN_ERR \
1562 "raid1: couldn't allocate memory for md%d\n"
1563 
1564 #define SPARE KERN_INFO \
1565 "raid1: spare disk %s\n"
1566 
1567 #define NONE_OPERATIONAL KERN_ERR \
1568 "raid1: no operational mirrors for md%d\n"
1569 
1570 #define ARRAY_IS_ACTIVE KERN_INFO \
1571 "raid1: raid set md%d active with %d out of %d mirrors\n"
1572 
1573 #define THREAD_ERROR KERN_ERR \
1574 "raid1: couldn't allocate thread for md%d\n"
1575 
1576 #define START_RESYNC KERN_WARNING \
1577 "raid1: raid set md%d not clean; reconstructing mirrors\n"
1578 
1579 static int raid1_run (mddev_t *mddev)
1580 {
1581 	raid1_conf_t *conf;
1582 	int i, j, disk_idx;
1583 	struct mirror_info *disk;
1584 	mdp_super_t *sb = mddev->sb;
1585 	mdp_disk_t *descriptor;
1586 	mdk_rdev_t *rdev;
1587 	struct md_list_head *tmp;
1588 	int start_recovery = 0;
1589 
1590 	MOD_INC_USE_COUNT;
1591 
1592 	if (sb->level != 1) {
1593 		printk(INVALID_LEVEL, mdidx(mddev), sb->level);
1594 		goto out;
1595 	}
1596 	/*
1597 	 * copy the already verified devices into our private RAID1
1598 	 * bookkeeping area. [whatever we allocate in raid1_run(),
1599 	 * should be freed in raid1_stop()]
1600 	 */
1601 
1602 	conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
1603 	mddev->private = conf;
1604 	if (!conf) {
1605 		printk(MEM_ERROR, mdidx(mddev));
1606 		goto out;
1607 	}
1608 	memset(conf, 0, sizeof(*conf));
1609 
1610 	ITERATE_RDEV(mddev,rdev,tmp) {
1611 		if (rdev->faulty) {
1612 			printk(ERRORS, partition_name(rdev->dev));
1613 		} else {
1614 			if (!rdev->sb) {
1615 				MD_BUG();
1616 				continue;
1617 			}
1618 		}
1619 		if (rdev->desc_nr == -1) {
1620 			MD_BUG();
1621 			continue;
1622 		}
1623 		descriptor = &sb->disks[rdev->desc_nr];
1624 		disk_idx = descriptor->raid_disk;
1625 		disk = conf->mirrors + disk_idx;
1626 
1627 		if (disk_faulty(descriptor)) {
1628 			disk->number = descriptor->number;
1629 			disk->raid_disk = disk_idx;
1630 			disk->dev = rdev->dev;
1631 			disk->sect_limit = MAX_WORK_PER_DISK;
1632 			disk->operational = 0;
1633 			disk->write_only = 0;
1634 			disk->spare = 0;
1635 			disk->used_slot = 1;
1636 			disk->head_position = 0;
1637 			continue;
1638 		}
1639 		if (disk_active(descriptor)) {
1640 			if (!disk_sync(descriptor)) {
1641 				printk(NOT_IN_SYNC,
1642 					partition_name(rdev->dev));
1643 				continue;
1644 			}
1645 			if ((descriptor->number > MD_SB_DISKS) ||
1646 					 (disk_idx > sb->raid_disks)) {
1647 
1648 				printk(INCONSISTENT,
1649 					partition_name(rdev->dev));
1650 				continue;
1651 			}
1652 			if (disk->operational) {
1653 				printk(ALREADY_RUNNING,
1654 					partition_name(rdev->dev),
1655 					disk_idx);
1656 				continue;
1657 			}
1658 			printk(OPERATIONAL, partition_name(rdev->dev),
1659  					disk_idx);
1660 			disk->number = descriptor->number;
1661 			disk->raid_disk = disk_idx;
1662 			disk->dev = rdev->dev;
1663 			disk->sect_limit = MAX_WORK_PER_DISK;
1664 			disk->operational = 1;
1665 			disk->write_only = 0;
1666 			disk->spare = 0;
1667 			disk->used_slot = 1;
1668 			disk->head_position = 0;
1669 			conf->working_disks++;
1670 		} else {
1671 		/*
1672 		 * Must be a spare disk ..
1673 		 */
1674 			printk(SPARE, partition_name(rdev->dev));
1675 			disk->number = descriptor->number;
1676 			disk->raid_disk = disk_idx;
1677 			disk->dev = rdev->dev;
1678 			disk->sect_limit = MAX_WORK_PER_DISK;
1679 			disk->operational = 0;
1680 			disk->write_only = 0;
1681 			disk->spare = 1;
1682 			disk->used_slot = 1;
1683 			disk->head_position = 0;
1684 		}
1685 	}
1686 	conf->raid_disks = sb->raid_disks;
1687 	conf->nr_disks = sb->nr_disks;
1688 	conf->mddev = mddev;
1689 	conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1690 
1691 	conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
1692 	init_waitqueue_head(&conf->wait_buffer);
1693 	init_waitqueue_head(&conf->wait_done);
1694 	init_waitqueue_head(&conf->wait_ready);
1695 
1696 	if (!conf->working_disks) {
1697 		printk(NONE_OPERATIONAL, mdidx(mddev));
1698 		goto out_free_conf;
1699 	}
1700 
1701 
1702 	/* pre-allocate some buffer_head structures.
1703 	 * As a minimum, 1 r1bh and raid_disks buffer_heads
1704 	 * would probably get us by in tight memory situations,
1705 	 * but a few more is probably a good idea.
1706 	 * For now, try NR_RESERVED_BUFS r1bh and
1707 	 * NR_RESERVED_BUFS*raid_disks bufferheads
1708 	 * This will allow at least NR_RESERVED_BUFS concurrent
1709 	 * reads or writes even if kmalloc starts failing
1710 	 */
1711 	if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
1712 	    raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
1713 	                      < NR_RESERVED_BUFS*conf->raid_disks) {
1714 		printk(MEM_ERROR, mdidx(mddev));
1715 		goto out_free_conf;
1716 	}
1717 
1718 	for (i = 0; i < MD_SB_DISKS; i++) {
1719 
1720 		descriptor = sb->disks+i;
1721 		disk_idx = descriptor->raid_disk;
1722 		disk = conf->mirrors + disk_idx;
1723 
1724 		if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
1725 				!disk->used_slot) {
1726 
1727 			disk->number = descriptor->number;
1728 			disk->raid_disk = disk_idx;
1729 			disk->dev = MKDEV(0,0);
1730 
1731 			disk->operational = 0;
1732 			disk->write_only = 0;
1733 			disk->spare = 0;
1734 			disk->used_slot = 1;
1735 			disk->head_position = 0;
1736 		}
1737 	}
1738 
1739 	/*
1740 	 * find the first working one and use it as a starting point
1741 	 * to read balancing.
1742 	 */
1743 	for (j = 0; j < MD_SB_DISKS && !conf->mirrors[j].operational; j++)
1744 		/* nothing */;
1745 	conf->last_used = j;
1746 
1747 
1748 
1749 	{
1750 		const char * name = "raid1d";
1751 
1752 		conf->thread = md_register_thread(raid1d, conf, name);
1753 		if (!conf->thread) {
1754 			printk(THREAD_ERROR, mdidx(mddev));
1755 			goto out_free_conf;
1756 		}
1757 	}
1758 
1759 	if (!(sb->state & (1 << MD_SB_CLEAN)) &&
1760 	    (conf->working_disks > 1)) {
1761 		const char * name = "raid1syncd";
1762 
1763 		conf->resync_thread = md_register_thread(raid1syncd, conf,name);
1764 		if (!conf->resync_thread) {
1765 			printk(THREAD_ERROR, mdidx(mddev));
1766 			goto out_free_conf;
1767 		}
1768 
1769 		printk(START_RESYNC, mdidx(mddev));
1770 		conf->resync_mirrors = 1;
1771 		md_wakeup_thread(conf->resync_thread);
1772 	} else if (conf->working_disks != sb->raid_disks) {
1773 		printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1774 		start_recovery = 1;
1775 	}
1776 
1777 	/*
1778 	 * Regenerate the "device is in sync with the raid set" bit for
1779 	 * each device.
1780 	 */
1781 	for (i = 0; i < MD_SB_DISKS; i++) {
1782 		mark_disk_nonsync(sb->disks+i);
1783 		for (j = 0; j < sb->raid_disks; j++) {
1784 			if (!conf->mirrors[j].operational)
1785 				continue;
1786 			if (sb->disks[i].number == conf->mirrors[j].number)
1787 				mark_disk_sync(sb->disks+i);
1788 		}
1789 	}
1790 	sb->active_disks = conf->working_disks;
1791 
1792 	if (start_recovery)
1793 		md_recover_arrays();
1794 
1795 
1796 	printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
1797 	/*
1798 	 * Ok, everything is just fine now
1799 	 */
1800 	return 0;
1801 
1802 out_free_conf:
1803 	raid1_shrink_r1bh(conf);
1804 	raid1_shrink_bh(conf);
1805 	raid1_shrink_buffers(conf);
1806 	kfree(conf);
1807 	mddev->private = NULL;
1808 out:
1809 	MOD_DEC_USE_COUNT;
1810 	return -EIO;
1811 }
1812 
1813 #undef INVALID_LEVEL
1814 #undef NO_SB
1815 #undef ERRORS
1816 #undef NOT_IN_SYNC
1817 #undef INCONSISTENT
1818 #undef ALREADY_RUNNING
1819 #undef OPERATIONAL
1820 #undef SPARE
1821 #undef NONE_OPERATIONAL
1822 #undef ARRAY_IS_ACTIVE
1823 
1824 static int raid1_stop_resync (mddev_t *mddev)
1825 {
1826 	raid1_conf_t *conf = mddev_to_conf(mddev);
1827 
1828 	if (conf->resync_thread) {
1829 		if (conf->resync_mirrors) {
1830 			conf->resync_mirrors = 2;
1831 			md_interrupt_thread(conf->resync_thread);
1832 
1833 			printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
1834 			return 1;
1835 		}
1836 		return 0;
1837 	}
1838 	return 0;
1839 }
1840 
1841 static int raid1_restart_resync (mddev_t *mddev)
1842 {
1843 	raid1_conf_t *conf = mddev_to_conf(mddev);
1844 
1845 	if (conf->resync_mirrors) {
1846 		if (!conf->resync_thread) {
1847 			MD_BUG();
1848 			return 0;
1849 		}
1850 		conf->resync_mirrors = 1;
1851 		md_wakeup_thread(conf->resync_thread);
1852 		return 1;
1853 	}
1854 	return 0;
1855 }
1856 
1857 static int raid1_stop (mddev_t *mddev)
1858 {
1859 	raid1_conf_t *conf = mddev_to_conf(mddev);
1860 
1861 	md_unregister_thread(conf->thread);
1862 	if (conf->resync_thread)
1863 		md_unregister_thread(conf->resync_thread);
1864 	raid1_shrink_r1bh(conf);
1865 	raid1_shrink_bh(conf);
1866 	raid1_shrink_buffers(conf);
1867 	kfree(conf);
1868 	mddev->private = NULL;
1869 	MOD_DEC_USE_COUNT;
1870 	return 0;
1871 }
1872 
1873 static mdk_personality_t raid1_personality=
1874 {
1875 	name:		"raid1",
1876 	make_request:	raid1_make_request,
1877 	run:		raid1_run,
1878 	stop:		raid1_stop,
1879 	status:		raid1_status,
1880 	error_handler:	raid1_error,
1881 	diskop:		raid1_diskop,
1882 	stop_resync:	raid1_stop_resync,
1883 	restart_resync:	raid1_restart_resync,
1884 	sync_request:	raid1_sync_request
1885 };
1886 
1887 static int md__init raid1_init (void)
1888 {
1889 	return register_md_personality (RAID1, &raid1_personality);
1890 }
1891 
1892 static void raid1_exit (void)
1893 {
1894 	unregister_md_personality (RAID1);
1895 }
1896 
1897 module_init(raid1_init);
1898 module_exit(raid1_exit);
1899 MODULE_LICENSE("GPL");
1900