1 /*
2 * raid1.c : Multiple Devices driver for Linux
3 *
4 * Copyright (C) 1999, 2000 Ingo Molnar, Red Hat
5 *
6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
7 *
8 * RAID-1 management functions.
9 *
10 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
11 *
12 * Fixes to reconstruction by Jakob Østergaard <jakob@ostenfeld.dk>
13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
14 *
15 * This program is free software; you can redistribute it and/or modify
16 * it under the terms of the GNU General Public License as published by
17 * the Free Software Foundation; either version 2, or (at your option)
18 * any later version.
19 *
20 * You should have received a copy of the GNU General Public License
21 * (for example /usr/src/linux/COPYING); if not, write to the Free
22 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */
24
25 #include <linux/module.h>
26 #include <linux/config.h>
27 #include <linux/slab.h>
28 #include <linux/raid/raid1.h>
29 #include <asm/atomic.h>
30
31 #define MAJOR_NR MD_MAJOR
32 #define MD_DRIVER
33 #define MD_PERSONALITY
34
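/* sectors a mirror may service before read balancing considers handing the
 * next read to another mirror (stored per-disk as ->sect_limit) */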
35 #define MAX_WORK_PER_DISK 128
36
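/* number of raid1_bh structures (and raid_disks times as many buffer_heads)
 * kept pre-allocated so at least this many requests can proceed even when
 * kmalloc starts failing under memory pressure */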
37 #define NR_RESERVED_BUFS 32
38
39
40 /*
41 * The following can be used to debug the driver
42 */
43 #define RAID1_DEBUG 0
44
45 #if RAID1_DEBUG
46 #define PRINTK(x...) printk(x)
47 #define inline
48 #define __inline__
49 #else
50 #define PRINTK(x...) do { } while (0)
51 #endif
52
53
54 static mdk_personality_t raid1_personality;
55 static md_spinlock_t retry_list_lock = MD_SPIN_LOCK_UNLOCKED;
56 struct raid1_bh *raid1_retry_list = NULL, **raid1_retry_tail;
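/* raid1_retry_list is a global singly-linked FIFO (chained via ->next_r1) of
 * failed requests, drained by the raid1d thread; raid1_retry_tail points at
 * the last ->next_r1 link and is only meaningful while the list is non-empty.
 * Both are protected by retry_list_lock. */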
57
58 static struct buffer_head *raid1_alloc_bh(raid1_conf_t *conf, int cnt)
59 {
60 /* return a linked list of "cnt" struct buffer_heads.
61 * don't take any off the free list unless we know we can
62 * get all we need, otherwise we could deadlock
63 */
64 struct buffer_head *bh=NULL;
65
66 while(cnt) {
67 struct buffer_head *t;
68 md_spin_lock_irq(&conf->device_lock);
69 if (!conf->freebh_blocked && conf->freebh_cnt >= cnt)
70 while (cnt) {
71 t = conf->freebh;
72 conf->freebh = t->b_next;
73 t->b_next = bh;
74 bh = t;
75 t->b_state = 0;
76 conf->freebh_cnt--;
77 cnt--;
78 }
79 md_spin_unlock_irq(&conf->device_lock);
80 if (cnt == 0)
81 break;
82 t = kmem_cache_alloc(bh_cachep, SLAB_NOIO);
83 if (t) {
84 t->b_next = bh;
85 bh = t;
86 cnt--;
87 } else {
88 PRINTK("raid1: waiting for %d bh\n", cnt);
89 conf->freebh_blocked = 1;
90 wait_disk_event(conf->wait_buffer,
91 !conf->freebh_blocked ||
92 conf->freebh_cnt > conf->raid_disks * NR_RESERVED_BUFS/2);
93 conf->freebh_blocked = 0;
94 }
95 }
96 return bh;
97 }
98
99 static inline void raid1_free_bh(raid1_conf_t *conf, struct buffer_head *bh)
100 {
101 unsigned long flags;
102 spin_lock_irqsave(&conf->device_lock, flags);
103 while (bh) {
104 struct buffer_head *t = bh;
105 bh=bh->b_next;
106 if (t->b_pprev == NULL)
107 kmem_cache_free(bh_cachep, t);
108 else {
109 t->b_next= conf->freebh;
110 conf->freebh = t;
111 conf->freebh_cnt++;
112 }
113 }
114 spin_unlock_irqrestore(&conf->device_lock, flags);
115 wake_up(&conf->wait_buffer);
116 }
117
118 static int raid1_grow_bh(raid1_conf_t *conf, int cnt)
119 {
120 /* allocate cnt buffer_heads, possibly less if kmalloc fails */
121 int i = 0;
122
123 while (i < cnt) {
124 struct buffer_head *bh;
125 bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
126 if (!bh) break;
127
128 md_spin_lock_irq(&conf->device_lock);
129 bh->b_pprev = &conf->freebh;
130 bh->b_next = conf->freebh;
131 conf->freebh = bh;
132 conf->freebh_cnt++;
133 md_spin_unlock_irq(&conf->device_lock);
134
135 i++;
136 }
137 return i;
138 }
139
140 static void raid1_shrink_bh(raid1_conf_t *conf)
141 {
142 /* discard all buffer_heads */
143
144 md_spin_lock_irq(&conf->device_lock);
145 while (conf->freebh) {
146 struct buffer_head *bh = conf->freebh;
147 conf->freebh = bh->b_next;
148 kmem_cache_free(bh_cachep, bh);
149 conf->freebh_cnt--;
150 }
151 md_spin_unlock_irq(&conf->device_lock);
152 }
153
154
155 static struct raid1_bh *raid1_alloc_r1bh(raid1_conf_t *conf)
156 {
157 struct raid1_bh *r1_bh = NULL;
158
159 do {
160 md_spin_lock_irq(&conf->device_lock);
161 if (!conf->freer1_blocked && conf->freer1) {
162 r1_bh = conf->freer1;
163 conf->freer1 = r1_bh->next_r1;
164 conf->freer1_cnt--;
165 r1_bh->next_r1 = NULL;
166 r1_bh->state = (1 << R1BH_PreAlloc);
167 r1_bh->bh_req.b_state = 0;
168 }
169 md_spin_unlock_irq(&conf->device_lock);
170 if (r1_bh)
171 return r1_bh;
172 r1_bh = (struct raid1_bh *) kmalloc(sizeof(struct raid1_bh), GFP_NOIO);
173 if (r1_bh) {
174 memset(r1_bh, 0, sizeof(*r1_bh));
175 return r1_bh;
176 }
177 conf->freer1_blocked = 1;
178 wait_disk_event(conf->wait_buffer,
179 !conf->freer1_blocked ||
180 conf->freer1_cnt > NR_RESERVED_BUFS/2
181 );
182 conf->freer1_blocked = 0;
183 } while (1);
184 }
185
186 static inline void raid1_free_r1bh(struct raid1_bh *r1_bh)
187 {
188 struct buffer_head *bh = r1_bh->mirror_bh_list;
189 raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
190
191 r1_bh->mirror_bh_list = NULL;
192
193 if (test_bit(R1BH_PreAlloc, &r1_bh->state)) {
194 unsigned long flags;
195 spin_lock_irqsave(&conf->device_lock, flags);
196 r1_bh->next_r1 = conf->freer1;
197 conf->freer1 = r1_bh;
198 conf->freer1_cnt++;
199 spin_unlock_irqrestore(&conf->device_lock, flags);
200 /* don't need to wakeup wait_buffer because
201 * raid1_free_bh below will do that
202 */
203 } else {
204 kfree(r1_bh);
205 }
206 raid1_free_bh(conf, bh);
207 }
208
209 static int raid1_grow_r1bh (raid1_conf_t *conf, int cnt)
210 {
211 int i = 0;
212
213 while (i < cnt) {
214 struct raid1_bh *r1_bh;
215 r1_bh = (struct raid1_bh*)kmalloc(sizeof(*r1_bh), GFP_KERNEL);
216 if (!r1_bh)
217 break;
218 memset(r1_bh, 0, sizeof(*r1_bh));
219 set_bit(R1BH_PreAlloc, &r1_bh->state);
220 r1_bh->mddev = conf->mddev;
221
222 raid1_free_r1bh(r1_bh);
223 i++;
224 }
225 return i;
226 }
227
228 static void raid1_shrink_r1bh(raid1_conf_t *conf)
229 {
230 md_spin_lock_irq(&conf->device_lock);
231 while (conf->freer1) {
232 struct raid1_bh *r1_bh = conf->freer1;
233 conf->freer1 = r1_bh->next_r1;
234 conf->freer1_cnt--;
235 kfree(r1_bh);
236 }
237 md_spin_unlock_irq(&conf->device_lock);
238 }
239
240
241
242 static inline void raid1_free_buf(struct raid1_bh *r1_bh)
243 {
244 unsigned long flags;
245 struct buffer_head *bh = r1_bh->mirror_bh_list;
246 raid1_conf_t *conf = mddev_to_conf(r1_bh->mddev);
247 r1_bh->mirror_bh_list = NULL;
248
249 spin_lock_irqsave(&conf->device_lock, flags);
250 r1_bh->next_r1 = conf->freebuf;
251 conf->freebuf = r1_bh;
252 spin_unlock_irqrestore(&conf->device_lock, flags);
253 raid1_free_bh(conf, bh);
254 }
255
256 static struct raid1_bh *raid1_alloc_buf(raid1_conf_t *conf)
257 {
258 struct raid1_bh *r1_bh;
259
260 md_spin_lock_irq(&conf->device_lock);
261 wait_event_lock_irq(conf->wait_buffer, conf->freebuf, conf->device_lock);
262 r1_bh = conf->freebuf;
263 conf->freebuf = r1_bh->next_r1;
264 r1_bh->next_r1= NULL;
265 md_spin_unlock_irq(&conf->device_lock);
266
267 return r1_bh;
268 }
269
270 static int raid1_grow_buffers (raid1_conf_t *conf, int cnt)
271 {
272 int i = 0;
273 struct raid1_bh *head = NULL, **tail;
274 tail = &head;
275
276 while (i < cnt) {
277 struct raid1_bh *r1_bh;
278 struct page *page;
279
280 page = alloc_page(GFP_KERNEL);
281 if (!page)
282 break;
283
284 r1_bh = (struct raid1_bh *) kmalloc(sizeof(*r1_bh), GFP_KERNEL);
285 if (!r1_bh) {
286 __free_page(page);
287 break;
288 }
289 memset(r1_bh, 0, sizeof(*r1_bh));
290 r1_bh->bh_req.b_page = page;
291 r1_bh->bh_req.b_data = page_address(page);
292 *tail = r1_bh;
293 r1_bh->next_r1 = NULL;
294 tail = & r1_bh->next_r1;
295 i++;
296 }
297 /* this lock probably isn't needed, as at the time when
298 * we are allocating buffers, nobody else will be touching the
299 * freebuf list. But it doesn't hurt....
300 */
301 md_spin_lock_irq(&conf->device_lock);
302 *tail = conf->freebuf;
303 conf->freebuf = head;
304 md_spin_unlock_irq(&conf->device_lock);
305 return i;
306 }
307
308 static void raid1_shrink_buffers (raid1_conf_t *conf)
309 {
310 struct raid1_bh *head;
311 md_spin_lock_irq(&conf->device_lock);
312 head = conf->freebuf;
313 conf->freebuf = NULL;
314 md_spin_unlock_irq(&conf->device_lock);
315
316 while (head) {
317 struct raid1_bh *r1_bh = head;
318 head = r1_bh->next_r1;
319 __free_page(r1_bh->bh_req.b_page);
320 kfree(r1_bh);
321 }
322 }
323
324 static int raid1_map (mddev_t *mddev, kdev_t *rdev)
325 {
326 raid1_conf_t *conf = mddev_to_conf(mddev);
327 int i, disks = MD_SB_DISKS;
328 unsigned long flags;
329
330 /*
331 * Later we do read balancing on the read side
332 * now we use the first available disk.
333 */
334
335 md_spin_lock_irqsave(&conf->device_lock, flags);
336 for (i = 0; i < disks; i++) {
337 if (conf->mirrors[i].operational) {
338 *rdev = conf->mirrors[i].dev;
339 md_spin_unlock_irqrestore(&conf->device_lock, flags);
340 return (0);
341 }
342 }
343 md_spin_unlock_irqrestore(&conf->device_lock, flags);
344
345 printk (KERN_ERR "raid1_map(): huh, no more operational devices?\n");
346 return (-1);
347 }
348
349 static void raid1_reschedule_retry (struct raid1_bh *r1_bh)
350 {
351 unsigned long flags;
352 mddev_t *mddev = r1_bh->mddev;
353 raid1_conf_t *conf = mddev_to_conf(mddev);
354
355 md_spin_lock_irqsave(&retry_list_lock, flags);
356 if (raid1_retry_list == NULL)
357 raid1_retry_tail = &raid1_retry_list;
358 *raid1_retry_tail = r1_bh;
359 raid1_retry_tail = &r1_bh->next_r1;
360 r1_bh->next_r1 = NULL;
361 md_spin_unlock_irqrestore(&retry_list_lock, flags);
362 md_wakeup_thread(conf->thread);
363 }
364
365
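/* Retire one normal I/O from the segment accounting set up in
 * raid1_make_request(): the request was counted in DONE, in FUTURE for its
 * phase, or has since migrated into PENDING; when the last PENDING request
 * drains we wake whoever is waiting on wait_ready. */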
366 static void inline io_request_done(unsigned long sector, raid1_conf_t *conf, int phase)
367 {
368 unsigned long flags;
369 spin_lock_irqsave(&conf->segment_lock, flags);
370 if (sector < conf->start_active)
371 conf->cnt_done--;
372 else if (sector >= conf->start_future && conf->phase == phase)
373 conf->cnt_future--;
374 else if (!--conf->cnt_pending)
375 wake_up(&conf->wait_ready);
376
377 spin_unlock_irqrestore(&conf->segment_lock, flags);
378 }
379
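/* Retire one resync request: it was counted in READY or ACTIVE; when the
 * ACTIVE window empties, advance start_active and wake waiters on wait_done. */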
380 static void inline sync_request_done (unsigned long sector, raid1_conf_t *conf)
381 {
382 unsigned long flags;
383 spin_lock_irqsave(&conf->segment_lock, flags);
384 if (sector >= conf->start_ready)
385 --conf->cnt_ready;
386 else if (sector >= conf->start_active) {
387 if (!--conf->cnt_active) {
388 conf->start_active = conf->start_ready;
389 wake_up(&conf->wait_done);
390 }
391 }
392 spin_unlock_irqrestore(&conf->segment_lock, flags);
393 }
394
395 /*
396 * raid1_end_bh_io() is called when we have finished servicing a mirrored
397 * operation and are ready to return a success/failure code to the buffer
398 * cache layer.
399 */
400 static void raid1_end_bh_io (struct raid1_bh *r1_bh, int uptodate)
401 {
402 struct buffer_head *bh = r1_bh->master_bh;
403
404 io_request_done(bh->b_rsector, mddev_to_conf(r1_bh->mddev),
405 test_bit(R1BH_SyncPhase, &r1_bh->state));
406
407 bh->b_end_io(bh, uptodate);
408 raid1_free_r1bh(r1_bh);
409 }
410 void raid1_end_request (struct buffer_head *bh, int uptodate)
411 {
412 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
413
414 /*
415 * this branch is our 'one mirror IO has finished' event handler:
416 */
417 if (!uptodate)
418 md_error (r1_bh->mddev, bh->b_dev);
419 else
420 /*
421 * Set R1BH_Uptodate in our master buffer_head, so that
422 * we will return a good error code to the higher
423 * levels even if IO on some other mirrored buffer fails.
424 *
425 * The 'master' represents the complex operation to
426 * user-side. So if something waits for IO, then it will
427 * wait for the 'master' buffer_head.
428 */
429 set_bit (R1BH_Uptodate, &r1_bh->state);
430
431 /*
432 * We split up the read and write side, imho they are
433 * conceptually different.
434 */
435
436 if ( (r1_bh->cmd == READ) || (r1_bh->cmd == READA) ) {
437 /*
438 * we have only one buffer_head on the read side
439 */
440
441 if (uptodate) {
442 raid1_end_bh_io(r1_bh, uptodate);
443 return;
444 }
445 /*
446 * oops, read error:
447 */
448 printk(KERN_ERR "raid1: %s: rescheduling block %lu\n",
449 partition_name(bh->b_dev), bh->b_blocknr);
450 raid1_reschedule_retry(r1_bh);
451 return;
452 }
453
454 /*
455 * WRITE:
456 *
457 * Let's see if all mirrored write operations have finished
458 * already.
459 */
460
461 if (atomic_dec_and_test(&r1_bh->remaining))
462 raid1_end_bh_io(r1_bh, test_bit(R1BH_Uptodate, &r1_bh->state));
463 }
464
465 /*
466 * This routine returns the disk from which the requested read should
467 * be done. It keeps track of the last read position for every disk
468 * in the array and, when a new read request comes in, chooses the
469 * disk whose last position is nearest to the requested sector.
470 *
471 * TODO: now if there are 2 mirrors in the same 2 devices, performance
472 * degrades dramatically because position is mirror, not device based.
473 * This should be changed to be device based. Also atomic sequential
474 * reads should be somehow balanced.
475 */
476
477 static int raid1_read_balance (raid1_conf_t *conf, struct buffer_head *bh)
478 {
479 int new_disk = conf->last_used;
480 const int sectors = bh->b_size >> 9;
481 const unsigned long this_sector = bh->b_rsector;
482 int disk = new_disk;
483 unsigned long new_distance;
484 unsigned long current_distance;
485
486 /*
487 * Check if it is sane at all to balance
488 */
489
490 if (conf->resync_mirrors)
491 goto rb_out;
492
493
494 #if defined(CONFIG_ALPHA) && ((__GNUC__ < 3) || \
495 ((__GNUC__ == 3) && (__GNUC_MINOR__ < 3)))
496 /* Work around a compiler bug in older gcc */
497 new_disk = *(volatile int *)&new_disk;
498 #endif
499
500 /* make sure that disk is operational */
501 while( !conf->mirrors[new_disk].operational) {
502 if (new_disk <= 0) new_disk = conf->raid_disks;
503 new_disk--;
504 if (new_disk == disk) {
505 /*
506 * This means no working disk was found
507 * Nothing much to do, lets not change anything
508 * and hope for the best...
509 */
510
511 new_disk = conf->last_used;
512
513 goto rb_out;
514 }
515 }
516 disk = new_disk;
517 /* now disk == new_disk == starting point for search */
518
519 /*
520 * Don't touch anything for sequential reads.
521 */
522
523 if (this_sector == conf->mirrors[new_disk].head_position)
524 goto rb_out;
525
526 /*
527 * If reads have been done only on a single disk
528 * for a time, let's give another disk a chance.
529 * This is for kicking those idling disks so that
530 * they would find work near some hotspot.
531 */
532
533 if (conf->sect_count >= conf->mirrors[new_disk].sect_limit) {
534 conf->sect_count = 0;
535
536 #if defined(CONFIG_SPARC64) && (__GNUC__ == 2) && (__GNUC_MINOR__ == 92)
537 /* Work around a compiler bug in egcs-2.92.11 19980921 */
538 new_disk = *(volatile int *)&new_disk;
539 #endif
540 do {
541 if (new_disk<=0)
542 new_disk = conf->raid_disks;
543 new_disk--;
544 if (new_disk == disk)
545 break;
546 } while ((conf->mirrors[new_disk].write_only) ||
547 (!conf->mirrors[new_disk].operational));
548
549 goto rb_out;
550 }
551
552 current_distance = abs(this_sector -
553 conf->mirrors[disk].head_position);
554
555 /* Find the disk which is closest */
556
557 #if defined(CONFIG_ALPHA) && ((__GNUC__ < 3) || \
558 ((__GNUC__ == 3) && (__GNUC_MINOR__ < 3)))
559 /* Work around a compiler bug in older gcc */
560 disk = *(volatile int *)&disk;
561 #endif
562 do {
563 if (disk <= 0)
564 disk = conf->raid_disks;
565 disk--;
566
567 if ((conf->mirrors[disk].write_only) ||
568 (!conf->mirrors[disk].operational))
569 continue;
570
571 new_distance = abs(this_sector -
572 conf->mirrors[disk].head_position);
573
574 if (new_distance < current_distance) {
575 conf->sect_count = 0;
576 current_distance = new_distance;
577 new_disk = disk;
578 }
579 } while (disk != conf->last_used);
580
581 rb_out:
582 conf->mirrors[new_disk].head_position = this_sector + sectors;
583
584 conf->last_used = new_disk;
585 conf->sect_count += sectors;
586
587 return new_disk;
588 }
589
590 static int raid1_make_request (mddev_t *mddev, int rw,
591 struct buffer_head * bh)
592 {
593 raid1_conf_t *conf = mddev_to_conf(mddev);
594 struct buffer_head *bh_req, *bhl;
595 struct raid1_bh * r1_bh;
596 int disks = MD_SB_DISKS;
597 int i, sum_bhs = 0;
598 struct mirror_info *mirror;
599 kdev_t dev;
600
601 if (!buffer_locked(bh))
602 BUG();
603
604 /*
605 * make_request() can abort the operation when READA is being
606 * used and no empty request is available.
607 *
608 * Currently, just replace the command with READ/WRITE.
609 */
610 if (rw == READA)
611 rw = READ;
612
613 r1_bh = raid1_alloc_r1bh (conf);
614
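/* Register this request in the resync segment accounting: sectors below
 * start_active are counted in DONE, everything else in FUTURE (tagged with
 * the current phase), and we block while the target sector lies inside the
 * ACTIVE/READY/PENDING resync window. */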
615 spin_lock_irq(&conf->segment_lock);
616 wait_event_lock_irq(conf->wait_done,
617 bh->b_rsector < conf->start_active ||
618 bh->b_rsector >= conf->start_future,
619 conf->segment_lock);
620 if (bh->b_rsector < conf->start_active)
621 conf->cnt_done++;
622 else {
623 conf->cnt_future++;
624 if (conf->phase)
625 set_bit(R1BH_SyncPhase, &r1_bh->state);
626 }
627 spin_unlock_irq(&conf->segment_lock);
628
629 /*
630 * i think the read and write branch should be separated completely,
631 * since we want to do read balancing on the read side for example.
632 * Alternative implementations? :) --mingo
633 */
634
635 r1_bh->master_bh = bh;
636 r1_bh->mddev = mddev;
637 r1_bh->cmd = rw;
638
639 if (rw == READ) {
640 /*
641 * read balancing logic:
642 */
643 spin_lock_irq(&conf->device_lock);
644 mirror = conf->mirrors + raid1_read_balance(conf, bh);
645 dev = mirror->dev;
646 spin_unlock_irq(&conf->device_lock);
647
648 bh_req = &r1_bh->bh_req;
649 memcpy(bh_req, bh, sizeof(*bh));
650 bh_req->b_blocknr = bh->b_rsector;
651 bh_req->b_dev = dev;
652 bh_req->b_rdev = dev;
653 /* bh_req->b_rsector = bh->n_rsector; */
654 bh_req->b_end_io = raid1_end_request;
655 bh_req->b_private = r1_bh;
656 generic_make_request (rw, bh_req);
657 return 0;
658 }
659
660 /*
661 * WRITE:
662 */
663
664 bhl = raid1_alloc_bh(conf, conf->raid_disks);
665 spin_lock_irq(&conf->device_lock);
666 for (i = 0; i < disks; i++) {
667 struct buffer_head *mbh;
668 if (!conf->mirrors[i].operational)
669 continue;
670
671 /*
672 * We should use a private pool (size depending on NR_REQUEST),
673 * to avoid writes filling up the memory with bhs
674 *
675 * Such pools are much faster than kmalloc anyways (so we waste
676 * almost nothing by not using the master bh when writing and
677 * win a lot of cleanness) but for now we are cool enough. --mingo
678 *
679 * It's safe to sleep here, buffer heads cannot be used in a shared
680 * manner in the write branch. Look how we lock the buffer at the
681 * beginning of this function to grok the difference ;)
682 */
683 mbh = bhl;
684 if (mbh == NULL) {
685 MD_BUG();
686 break;
687 }
688 bhl = mbh->b_next;
689 mbh->b_next = NULL;
690 mbh->b_this_page = (struct buffer_head *)1;
691
692 /*
693 * prepare mirrored mbh (fields ordered for max mem throughput):
694 */
695 mbh->b_blocknr = bh->b_rsector;
696 mbh->b_dev = conf->mirrors[i].dev;
697 mbh->b_rdev = conf->mirrors[i].dev;
698 mbh->b_rsector = bh->b_rsector;
699 mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
700 (1<<BH_Mapped) | (1<<BH_Lock);
701
702 atomic_set(&mbh->b_count, 1);
703 mbh->b_size = bh->b_size;
704 mbh->b_page = bh->b_page;
705 mbh->b_data = bh->b_data;
706 mbh->b_list = BUF_LOCKED;
707 mbh->b_end_io = raid1_end_request;
708 mbh->b_private = r1_bh;
709
710 mbh->b_next = r1_bh->mirror_bh_list;
711 r1_bh->mirror_bh_list = mbh;
712 sum_bhs++;
713 }
714 spin_unlock_irq(&conf->device_lock);
715 if (bhl) raid1_free_bh(conf,bhl);
716 if (!sum_bhs) {
717 /* Gag - all mirrors non-operational.. */
718 raid1_end_bh_io(r1_bh, 0);
719 return 0;
720 }
721 md_atomic_set(&r1_bh->remaining, sum_bhs);
722
723 /*
724 * We have to be a bit careful about the semaphore above, that's
725 * why we start the requests separately. Since kmalloc() could
726 * fail, sleep and make_request() can sleep too, this is the
727 * safer solution. Imagine, end_request decreasing the semaphore
728 * before we could have set it up ... We could play tricks with
729 * the semaphore (presetting it and correcting at the end if
730 * sum_bhs is not 'n' but we have to do end_request by hand if
731 * all requests finish before we have had a chance to set up the
732 * semaphore correctly ... lots of races).
733 */
734 bh = r1_bh->mirror_bh_list;
735 while(bh) {
736 struct buffer_head *bh2 = bh;
737 bh = bh->b_next;
738 generic_make_request(rw, bh2);
739 }
740 return (0);
741 }
742
743 static void raid1_status(struct seq_file *seq, mddev_t *mddev)
744 {
745 raid1_conf_t *conf = mddev_to_conf(mddev);
746 int i;
747
748 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
749 conf->working_disks);
750 for (i = 0; i < conf->raid_disks; i++)
751 seq_printf(seq, "%s",
752 conf->mirrors[i].operational ? "U" : "_");
753 seq_printf(seq, "]");
754 }
755
756 #define LAST_DISK KERN_ALERT \
757 "raid1: only one disk left and IO error.\n"
758
759 #define NO_SPARE_DISK KERN_ALERT \
760 "raid1: no spare disk left, degrading mirror level by one.\n"
761
762 #define DISK_FAILED KERN_ALERT \
763 "raid1: Disk failure on %s, disabling device. \n" \
764 " Operation continuing on %d devices\n"
765
766 #define START_SYNCING KERN_ALERT \
767 "raid1: start syncing spare disk.\n"
768
769 #define ALREADY_SYNCING KERN_INFO \
770 "raid1: syncing already in progress.\n"
771
772 static void mark_disk_bad (mddev_t *mddev, int failed)
773 {
774 raid1_conf_t *conf = mddev_to_conf(mddev);
775 struct mirror_info *mirror = conf->mirrors+failed;
776 mdp_super_t *sb = mddev->sb;
777
778 mirror->operational = 0;
779 mark_disk_faulty(sb->disks+mirror->number);
780 mark_disk_nonsync(sb->disks+mirror->number);
781 mark_disk_inactive(sb->disks+mirror->number);
782 if (!mirror->write_only)
783 sb->active_disks--;
784 else
785 sb->spare_disks--;
786 sb->working_disks--;
787 sb->failed_disks++;
788 mddev->sb_dirty = 1;
789 md_wakeup_thread(conf->thread);
790 if (!mirror->write_only)
791 conf->working_disks--;
792 printk (DISK_FAILED, partition_name (mirror->dev),
793 conf->working_disks);
794 }
795
796 static int raid1_error (mddev_t *mddev, kdev_t dev)
797 {
798 raid1_conf_t *conf = mddev_to_conf(mddev);
799 struct mirror_info * mirrors = conf->mirrors;
800 int disks = MD_SB_DISKS;
801 int i;
802 unsigned long flags;
803
804 /* Find the drive.
805 * If it is not operational, then we have already marked it as dead
806 * else if it is the last working disk, ignore the error, let the
807 * next level up know.
808 * else mark the drive as failed
809 */
810
811 for (i = 0; i < disks; i++)
812 if (mirrors[i].dev==dev && mirrors[i].operational)
813 break;
814 if (i == disks)
815 return 0;
816
817 if (i < conf->raid_disks && conf->working_disks == 1) {
818 /* Don't fail the drive, act as though we were just a
819 * normal single drive
820 */
821
822 return 1;
823 }
824 md_spin_lock_irqsave(&conf->device_lock, flags);
825 mark_disk_bad(mddev, i);
826 md_spin_unlock_irqrestore(&conf->device_lock, flags);
827 return 0;
828 }
829
830 #undef LAST_DISK
831 #undef NO_SPARE_DISK
832 #undef DISK_FAILED
833 #undef START_SYNCING
834
835
836 static void print_raid1_conf (raid1_conf_t *conf)
837 {
838 int i;
839 struct mirror_info *tmp;
840
841 printk("RAID1 conf printout:\n");
842 if (!conf) {
843 printk("(conf==NULL)\n");
844 return;
845 }
846 printk(" --- wd:%d rd:%d nd:%d\n", conf->working_disks,
847 conf->raid_disks, conf->nr_disks);
848
849 for (i = 0; i < MD_SB_DISKS; i++) {
850 tmp = conf->mirrors + i;
851 printk(" disk %d, s:%d, o:%d, n:%d rd:%d us:%d dev:%s\n",
852 i, tmp->spare,tmp->operational,
853 tmp->number,tmp->raid_disk,tmp->used_slot,
854 partition_name(tmp->dev));
855 }
856 }
857
858 static void close_sync(raid1_conf_t *conf)
859 {
860 mddev_t *mddev = conf->mddev;
861 /* If reconstruction was interrupted, we need to close the "active" and "pending"
862 * holes.
863 * we know that there are no active rebuild requests, so cnt_active == cnt_ready == 0
864 */
865 /* this is really needed when recovery stops too... */
866 spin_lock_irq(&conf->segment_lock);
867 conf->start_active = conf->start_pending;
868 conf->start_ready = conf->start_pending;
869 wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
870 conf->start_active =conf->start_ready = conf->start_pending = conf->start_future;
871 conf->start_future = (mddev->sb->size<<1)+1;
872 conf->cnt_pending = conf->cnt_future;
873 conf->cnt_future = 0;
874 conf->phase = conf->phase ^1;
875 wait_event_lock_irq(conf->wait_ready, !conf->cnt_pending, conf->segment_lock);
876 conf->start_active = conf->start_ready = conf->start_pending = conf->start_future = 0;
877 conf->phase = 0;
878 conf->cnt_future = conf->cnt_done;
879 conf->cnt_done = 0;
880 spin_unlock_irq(&conf->segment_lock);
881 wake_up(&conf->wait_done);
882 }
883
884 static int raid1_diskop(mddev_t *mddev, mdp_disk_t **d, int state)
885 {
886 int err = 0;
887 int i, failed_disk=-1, spare_disk=-1, removed_disk=-1, added_disk=-1;
888 raid1_conf_t *conf = mddev->private;
889 struct mirror_info *tmp, *sdisk, *fdisk, *rdisk, *adisk;
890 mdp_super_t *sb = mddev->sb;
891 mdp_disk_t *failed_desc, *spare_desc, *added_desc;
892 mdk_rdev_t *spare_rdev, *failed_rdev;
893
894 if (conf->resync_mirrors)
895 return 1; /* Cannot do any diskops during a resync */
896
897 switch (state) {
898 case DISKOP_SPARE_ACTIVE:
899 case DISKOP_SPARE_INACTIVE:
900 /* need to wait for pending sync io before locking device */
901 close_sync(conf);
902 }
903
904 md_spin_lock_irq(&conf->device_lock);
905 /*
906 * Need the conf lock when printing out state else we get BUG()s
907 */
908 print_raid1_conf(conf);
909 /*
910 * find the disk ...
911 */
912 switch (state) {
913
914 case DISKOP_SPARE_ACTIVE:
915
916 /*
917 * Find the failed disk within the RAID1 configuration ...
918 * (this can only be in the first conf->working_disks part)
919 */
920 for (i = 0; i < conf->raid_disks; i++) {
921 tmp = conf->mirrors + i;
922 if ((!tmp->operational && !tmp->spare) ||
923 !tmp->used_slot) {
924 failed_disk = i;
925 break;
926 }
927 }
928 /*
929 * When we activate a spare disk we _must_ have a disk in
930 * the lower (active) part of the array to replace.
931 */
932 if ((failed_disk == -1) || (failed_disk >= conf->raid_disks)) {
933 MD_BUG();
934 err = 1;
935 goto abort;
936 }
937 /* fall through */
938
939 case DISKOP_SPARE_WRITE:
940 case DISKOP_SPARE_INACTIVE:
941
942 /*
943 * Find the spare disk ... (can only be in the 'high'
944 * area of the array)
945 */
946 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
947 tmp = conf->mirrors + i;
948 if (tmp->spare && tmp->number == (*d)->number) {
949 spare_disk = i;
950 break;
951 }
952 }
953 if (spare_disk == -1) {
954 MD_BUG();
955 err = 1;
956 goto abort;
957 }
958 break;
959
960 case DISKOP_HOT_REMOVE_DISK:
961
962 for (i = 0; i < MD_SB_DISKS; i++) {
963 tmp = conf->mirrors + i;
964 if (tmp->used_slot && (tmp->number == (*d)->number)) {
965 if (tmp->operational) {
966 err = -EBUSY;
967 goto abort;
968 }
969 removed_disk = i;
970 break;
971 }
972 }
973 if (removed_disk == -1) {
974 MD_BUG();
975 err = 1;
976 goto abort;
977 }
978 break;
979
980 case DISKOP_HOT_ADD_DISK:
981
982 for (i = conf->raid_disks; i < MD_SB_DISKS; i++) {
983 tmp = conf->mirrors + i;
984 if (!tmp->used_slot) {
985 added_disk = i;
986 break;
987 }
988 }
989 if (added_disk == -1) {
990 MD_BUG();
991 err = 1;
992 goto abort;
993 }
994 break;
995 }
996
997 switch (state) {
998 /*
999 * Switch the spare disk to write-only mode:
1000 */
1001 case DISKOP_SPARE_WRITE:
1002 sdisk = conf->mirrors + spare_disk;
1003 sdisk->operational = 1;
1004 sdisk->write_only = 1;
1005 break;
1006 /*
1007 * Deactivate a spare disk:
1008 */
1009 case DISKOP_SPARE_INACTIVE:
1010 if (conf->start_future > 0) {
1011 MD_BUG();
1012 err = -EBUSY;
1013 break;
1014 }
1015 sdisk = conf->mirrors + spare_disk;
1016 sdisk->operational = 0;
1017 sdisk->write_only = 0;
1018 break;
1019 /*
1020 * Activate (mark read-write) the (now sync) spare disk,
1021 * which means we switch its 'raid position' (->raid_disk)
1022 * with the failed disk. (only the first 'conf->nr_disks'
1023 * slots are used for 'real' disks and we must preserve this
1024 * property)
1025 */
1026 case DISKOP_SPARE_ACTIVE:
1027 if (conf->start_future > 0) {
1028 MD_BUG();
1029 err = -EBUSY;
1030 break;
1031 }
1032 sdisk = conf->mirrors + spare_disk;
1033 fdisk = conf->mirrors + failed_disk;
1034
1035 spare_desc = &sb->disks[sdisk->number];
1036 failed_desc = &sb->disks[fdisk->number];
1037
1038 if (spare_desc != *d) {
1039 MD_BUG();
1040 err = 1;
1041 goto abort;
1042 }
1043
1044 if (spare_desc->raid_disk != sdisk->raid_disk) {
1045 MD_BUG();
1046 err = 1;
1047 goto abort;
1048 }
1049
1050 if (sdisk->raid_disk != spare_disk) {
1051 MD_BUG();
1052 err = 1;
1053 goto abort;
1054 }
1055
1056 if (failed_desc->raid_disk != fdisk->raid_disk) {
1057 MD_BUG();
1058 err = 1;
1059 goto abort;
1060 }
1061
1062 if (fdisk->raid_disk != failed_disk) {
1063 MD_BUG();
1064 err = 1;
1065 goto abort;
1066 }
1067
1068 /*
1069 * do the switch finally
1070 */
1071 spare_rdev = find_rdev_nr(mddev, spare_desc->number);
1072 failed_rdev = find_rdev_nr(mddev, failed_desc->number);
1073
1074 /* There must be a spare_rdev, but there may not be a
1075 * failed_rdev. That slot might be empty...
1076 */
1077 spare_rdev->desc_nr = failed_desc->number;
1078 if (failed_rdev)
1079 failed_rdev->desc_nr = spare_desc->number;
1080
1081 xchg_values(*spare_desc, *failed_desc);
1082 xchg_values(*fdisk, *sdisk);
1083
1084 /*
1085 * (careful, 'failed' and 'spare' are switched from now on)
1086 *
1087 * we want to preserve linear numbering and we want to
1088 * give the proper raid_disk number to the now activated
1089 * disk. (this means we switch back these values)
1090 */
1091
1092 xchg_values(spare_desc->raid_disk, failed_desc->raid_disk);
1093 xchg_values(sdisk->raid_disk, fdisk->raid_disk);
1094 xchg_values(spare_desc->number, failed_desc->number);
1095 xchg_values(sdisk->number, fdisk->number);
1096
1097 *d = failed_desc;
1098
1099 if (sdisk->dev == MKDEV(0,0))
1100 sdisk->used_slot = 0;
1101 /*
1102 * this really activates the spare.
1103 */
1104 fdisk->spare = 0;
1105 fdisk->write_only = 0;
1106
1107 /*
1108 * if we activate a spare, we definitely replace a
1109 * non-operational disk slot in the 'low' area of
1110 * the disk array.
1111 */
1112
1113 conf->working_disks++;
1114
1115 break;
1116
1117 case DISKOP_HOT_REMOVE_DISK:
1118 rdisk = conf->mirrors + removed_disk;
1119
1120 if (rdisk->spare && (removed_disk < conf->raid_disks)) {
1121 MD_BUG();
1122 err = 1;
1123 goto abort;
1124 }
1125 rdisk->dev = MKDEV(0,0);
1126 rdisk->used_slot = 0;
1127 conf->nr_disks--;
1128 break;
1129
1130 case DISKOP_HOT_ADD_DISK:
1131 adisk = conf->mirrors + added_disk;
1132 added_desc = *d;
1133
1134 if (added_disk != added_desc->number) {
1135 MD_BUG();
1136 err = 1;
1137 goto abort;
1138 }
1139
1140 adisk->number = added_desc->number;
1141 adisk->raid_disk = added_desc->raid_disk;
1142 adisk->dev = MKDEV(added_desc->major,added_desc->minor);
1143
1144 adisk->operational = 0;
1145 adisk->write_only = 0;
1146 adisk->spare = 1;
1147 adisk->used_slot = 1;
1148 adisk->head_position = 0;
1149 conf->nr_disks++;
1150
1151 break;
1152
1153 default:
1154 MD_BUG();
1155 err = 1;
1156 goto abort;
1157 }
1158 abort:
1159 print_raid1_conf(conf);
1160 md_spin_unlock_irq(&conf->device_lock);
1161 if (state == DISKOP_SPARE_ACTIVE || state == DISKOP_SPARE_INACTIVE)
1162 /* should move to "END_REBUILD" when such exists */
1163 raid1_shrink_buffers(conf);
1164
1165 return err;
1166 }
1167
1168
1169 #define IO_ERROR KERN_ALERT \
1170 "raid1: %s: unrecoverable I/O read error for block %lu\n"
1171
1172 #define REDIRECT_SECTOR KERN_ERR \
1173 "raid1: %s: redirecting sector %lu to another mirror\n"
1174
1175 /*
1176 * This is a kernel thread which:
1177 *
1178 * 1. Retries failed read operations on working mirrors.
1179 * 2. Updates the raid superblock when problems are encountered.
1180 * 3. Performs writes following reads for array synchronising.
1181 */
1182 static void end_sync_write(struct buffer_head *bh, int uptodate);
1183 static void end_sync_read(struct buffer_head *bh, int uptodate);
1184
1185 static void raid1d (void *data)
1186 {
1187 struct raid1_bh *r1_bh;
1188 struct buffer_head *bh;
1189 unsigned long flags;
1190 raid1_conf_t *conf = data;
1191 mddev_t *mddev = conf->mddev;
1192 kdev_t dev;
1193
1194 if (mddev->sb_dirty)
1195 md_update_sb(mddev);
1196
1197 for (;;) {
1198 md_spin_lock_irqsave(&retry_list_lock, flags);
1199 r1_bh = raid1_retry_list;
1200 if (!r1_bh)
1201 break;
1202 raid1_retry_list = r1_bh->next_r1;
1203 md_spin_unlock_irqrestore(&retry_list_lock, flags);
1204
1205 mddev = r1_bh->mddev;
1206 bh = &r1_bh->bh_req;
1207 switch(r1_bh->cmd) {
1208 case SPECIAL:
1209 /* have to allocate lots of bh structures and
1210 * schedule writes
1211 */
1212 if (test_bit(R1BH_Uptodate, &r1_bh->state)) {
1213 int i, sum_bhs = 0;
1214 int disks = MD_SB_DISKS;
1215 struct buffer_head *bhl, *mbh;
1216
1217 conf = mddev_to_conf(mddev);
1218 bhl = raid1_alloc_bh(conf, conf->raid_disks); /* don't really need this many */
1219 spin_lock_irq(&conf->device_lock);
1220 for (i = 0; i < disks ; i++) {
1221 if (!conf->mirrors[i].operational)
1222 continue;
1223 if (i==conf->last_used)
1224 /* we read from here, no need to write */
1225 continue;
1226 if (i < conf->raid_disks
1227 && !conf->resync_mirrors)
1228 /* don't need to write this,
1229 * we are just rebuilding */
1230 continue;
1231 mbh = bhl;
1232 if (!mbh) {
1233 MD_BUG();
1234 break;
1235 }
1236 bhl = mbh->b_next;
1237 mbh->b_this_page = (struct buffer_head *)1;
1238
1239
1240 /*
1241 * prepare mirrored bh (fields ordered for max mem throughput):
1242 */
1243 mbh->b_blocknr = bh->b_blocknr;
1244 mbh->b_dev = conf->mirrors[i].dev;
1245 mbh->b_rdev = conf->mirrors[i].dev;
1246 mbh->b_rsector = bh->b_blocknr;
1247 mbh->b_state = (1<<BH_Req) | (1<<BH_Dirty) |
1248 (1<<BH_Mapped) | (1<<BH_Lock);
1249 atomic_set(&mbh->b_count, 1);
1250 mbh->b_size = bh->b_size;
1251 mbh->b_page = bh->b_page;
1252 mbh->b_data = bh->b_data;
1253 mbh->b_list = BUF_LOCKED;
1254 mbh->b_end_io = end_sync_write;
1255 mbh->b_private = r1_bh;
1256
1257 mbh->b_next = r1_bh->mirror_bh_list;
1258 r1_bh->mirror_bh_list = mbh;
1259
1260 sum_bhs++;
1261 }
1262 spin_unlock_irq(&conf->device_lock);
1263 md_atomic_set(&r1_bh->remaining, sum_bhs);
1264 if (bhl) raid1_free_bh(conf, bhl);
1265 mbh = r1_bh->mirror_bh_list;
1266
1267 if (!sum_bhs) {
1268 /* nowhere to write this to... I guess we
1269 * must be done
1270 */
1271 sync_request_done(bh->b_blocknr, conf);
1272 md_done_sync(mddev, bh->b_size>>9, 0);
1273 raid1_free_buf(r1_bh);
1274 } else
1275 while (mbh) {
1276 struct buffer_head *bh1 = mbh;
1277 mbh = mbh->b_next;
1278 generic_make_request(WRITE, bh1);
1279 md_sync_acct(bh1->b_dev, bh1->b_size/512);
1280 }
1281 } else {
1282 /* There is no point trying a read-for-reconstruct
1283 * as reconstruct is about to be aborted
1284 */
1285
1286 printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1287 md_done_sync(mddev, bh->b_size>>9, 0);
1288 }
1289
1290 break;
1291 case READ:
1292 case READA:
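/* A read failed on bh->b_dev: ask raid1_map() for an operational
 * mirror and resubmit the request there; if we get the same device
 * back, give up and complete the request with an error. */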
1293 dev = bh->b_dev;
1294 raid1_map (mddev, &bh->b_dev);
1295 if (bh->b_dev == dev) {
1296 printk (IO_ERROR, partition_name(bh->b_dev), bh->b_blocknr);
1297 raid1_end_bh_io(r1_bh, 0);
1298 } else {
1299 printk (REDIRECT_SECTOR,
1300 partition_name(bh->b_dev), bh->b_blocknr);
1301 bh->b_rdev = bh->b_dev;
1302 bh->b_rsector = bh->b_blocknr;
1303 generic_make_request (r1_bh->cmd, bh);
1304 }
1305 break;
1306 }
1307 }
1308 md_spin_unlock_irqrestore(&retry_list_lock, flags);
1309 }
1310 #undef IO_ERROR
1311 #undef REDIRECT_SECTOR
1312
1313 /*
1314 * Private kernel thread to reconstruct mirrors after an unclean
1315 * shutdown.
1316 */
1317 static void raid1syncd (void *data)
1318 {
1319 raid1_conf_t *conf = data;
1320 mddev_t *mddev = conf->mddev;
1321
1322 if (!conf->resync_mirrors)
1323 return;
1324 if (conf->resync_mirrors == 2)
1325 return;
1326 down(&mddev->recovery_sem);
1327 if (!md_do_sync(mddev, NULL)) {
1328 /*
1329 * Only if everything went Ok.
1330 */
1331 conf->resync_mirrors = 0;
1332 }
1333
1334 close_sync(conf);
1335
1336 up(&mddev->recovery_sem);
1337 raid1_shrink_buffers(conf);
1338
1339 md_recover_arrays(); /* in case we are degraded and a spare is available */
1340 }
1341
1342 /*
1343 * perform a "sync" on one "block"
1344 *
1345 * We need to make sure that no normal I/O request - particularly write
1346 * requests - conflict with active sync requests.
1347 * This is achieved by conceptually dividing the device space into a
1348 * number of sections:
1349 * DONE: 0 .. a-1 These blocks are in-sync
1350 * ACTIVE: a.. b-1 These blocks may have active sync requests, but
1351 * no normal IO requests
1352 * READY: b .. c-1 These blocks have no normal IO requests - sync
1353 * request may be happening
1354 * PENDING: c .. d-1 These blocks may have IO requests, but no new
1355 * ones will be added
1356 * FUTURE: d .. end These blocks are not to be considered yet. IO may
1357 * be happening, but not sync
1358 *
1359 * We keep a
1360 * phase which flips (0 or 1) each time d moves and
1361 * a count of:
1362 * z = active io requests in FUTURE since d moved - marked with
1363 * current phase
1364 * y = active io requests in FUTURE before d moved, or PENDING -
1365 * marked with previous phase
1366 * x = active sync requests in READY
1367 * w = active sync requests in ACTIVE
1368 * v = active io requests in DONE
1369 *
1370 * Normally, a=b=c=d=0 and z= active io requests
1371 * or a=b=c=d=END and v= active io requests
1372 * Allowed changes to a,b,c,d:
1373 * A: c==d && y==0 -> d+=window, y=z, z=0, phase=!phase
1374 * B: y==0 -> c=d
1375 * C: b=c, w+=x, x=0
1376 * D: w==0 -> a=b
1377 * E: a==b==c==d==end -> a=b=c=d=0, z=v, v=0
1378 *
1379 * At start of sync we apply A.
1380 * When y reaches 0, we apply B then A then begin sync requests.
1381 * When the sync point reaches c-1, we wait for y==0 and w==0, and
1382 * then apply B then A then D then C.
1383 * Finally, we apply E
1384 *
1385 * The sync request simply issues a "read" against a working drive
1386 * This is marked so that on completion the raid1d thread is woken to
1387 * issue suitable write requests
1388 */
1389
1390 static int raid1_sync_request (mddev_t *mddev, unsigned long sector_nr)
1391 {
1392 raid1_conf_t *conf = mddev_to_conf(mddev);
1393 struct mirror_info *mirror;
1394 struct raid1_bh *r1_bh;
1395 struct buffer_head *bh;
1396 int bsize;
1397 int disk;
1398 int block_nr;
1399 int buffs;
1400 kdev_t dev;
1401
1402 if (!sector_nr) {
1403 /* we want enough buffers to hold twice the window of 128 */
1404 buffs = 128 *2 / (PAGE_SIZE>>9);
1405 buffs = raid1_grow_buffers(conf, buffs);
1406 if (buffs < 2)
1407 goto nomem;
1408 conf->window = buffs*(PAGE_SIZE>>9)/2;
1409 }
1410 spin_lock_irq(&conf->segment_lock);
1411 if (!sector_nr) {
1412 /* initialize ...*/
1413 conf->start_active = 0;
1414 conf->start_ready = 0;
1415 conf->start_pending = 0;
1416 conf->start_future = 0;
1417 conf->phase = 0;
1418
1419 conf->cnt_future += conf->cnt_done+conf->cnt_pending;
1420 conf->cnt_done = conf->cnt_pending = 0;
1421 if (conf->cnt_ready || conf->cnt_active)
1422 MD_BUG();
1423 }
1424 while (sector_nr >= conf->start_pending) {
1425 PRINTK("wait .. sect=%lu start_active=%d ready=%d pending=%d future=%d, cnt_done=%d active=%d ready=%d pending=%d future=%d\n",
1426 sector_nr, conf->start_active, conf->start_ready, conf->start_pending, conf->start_future,
1427 conf->cnt_done, conf->cnt_active, conf->cnt_ready, conf->cnt_pending, conf->cnt_future);
1428 wait_event_lock_irq(conf->wait_done,
1429 !conf->cnt_active,
1430 conf->segment_lock);
1431 wait_event_lock_irq(conf->wait_ready,
1432 !conf->cnt_pending,
1433 conf->segment_lock);
1434 conf->start_active = conf->start_ready;
1435 conf->start_ready = conf->start_pending;
1436 conf->start_pending = conf->start_future;
1437 conf->start_future = conf->start_future+conf->window;
1438 // Note: falling off the end is not a problem
1439 conf->phase = conf->phase ^1;
1440 conf->cnt_active = conf->cnt_ready;
1441 conf->cnt_ready = 0;
1442 conf->cnt_pending = conf->cnt_future;
1443 conf->cnt_future = 0;
1444 wake_up(&conf->wait_done);
1445 }
1446 conf->cnt_ready++;
1447 spin_unlock_irq(&conf->segment_lock);
1448
1449
1450 /* If reconstructing, and >1 working disc,
1451 * could dedicate one to rebuild and others to
1452 * service read requests ..
1453 */
1454 spin_lock_irq(&conf->device_lock);
1455 disk = conf->last_used;
1456 /* make sure disk is operational */
1457 while (!conf->mirrors[disk].operational) {
1458 if (disk <= 0) disk = conf->raid_disks;
1459 disk--;
1460 if (disk == conf->last_used)
1461 break;
1462 }
1463 conf->last_used = disk;
1464
1465 mirror = conf->mirrors+conf->last_used;
1466 dev = mirror->dev;
1467 spin_unlock_irq(&conf->device_lock);
1468
1469 r1_bh = raid1_alloc_buf (conf);
1470 r1_bh->master_bh = NULL;
1471 r1_bh->mddev = mddev;
1472 r1_bh->cmd = SPECIAL;
1473 bh = &r1_bh->bh_req;
1474
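/* Grow the transfer from one 512-byte sector to the largest naturally
 * aligned power-of-two block (up to PAGE_SIZE) that still fits inside
 * the device. */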
1475 block_nr = sector_nr;
1476 bsize = 512;
1477 while (!(block_nr & 1) && bsize < PAGE_SIZE
1478 && (block_nr+2)*(bsize>>9) <= (mddev->sb->size *2)) {
1479 block_nr >>= 1;
1480 bsize <<= 1;
1481 }
1482 bh->b_size = bsize;
1483 bh->b_list = BUF_LOCKED;
1484 bh->b_dev = dev;
1485 bh->b_rdev = dev;
1486 bh->b_state = (1<<BH_Req) | (1<<BH_Mapped) | (1<<BH_Lock);
1487 if (!bh->b_page)
1488 BUG();
1489 if (!bh->b_data)
1490 BUG();
1491 if (bh->b_data != page_address(bh->b_page))
1492 BUG();
1493 bh->b_end_io = end_sync_read;
1494 bh->b_private = r1_bh;
1495 bh->b_blocknr = sector_nr;
1496 bh->b_rsector = sector_nr;
1497 init_waitqueue_head(&bh->b_wait);
1498
1499 generic_make_request(READ, bh);
1500 md_sync_acct(bh->b_dev, bh->b_size/512);
1501
1502 return (bsize >> 9);
1503
1504 nomem:
1505 raid1_shrink_buffers(conf);
1506 return -ENOMEM;
1507 }
1508
1509 static void end_sync_read(struct buffer_head *bh, int uptodate)
1510 {
1511 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1512
1513 /* we have read a block, now it needs to be re-written,
1514 * or re-read if the read failed.
1515 * We don't do much here, just schedule handling by raid1d
1516 */
1517 if (!uptodate)
1518 md_error (r1_bh->mddev, bh->b_dev);
1519 else
1520 set_bit(R1BH_Uptodate, &r1_bh->state);
1521 raid1_reschedule_retry(r1_bh);
1522 }
1523
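/* Called once per mirrored resync write; when the last one completes we
 * retire the block from the resync window and tell md how much was synced. */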
1524 static void end_sync_write(struct buffer_head *bh, int uptodate)
1525 {
1526 struct raid1_bh * r1_bh = (struct raid1_bh *)(bh->b_private);
1527
1528 if (!uptodate)
1529 md_error (r1_bh->mddev, bh->b_dev);
1530 if (atomic_dec_and_test(&r1_bh->remaining)) {
1531 mddev_t *mddev = r1_bh->mddev;
1532 unsigned long sect = bh->b_blocknr;
1533 int size = bh->b_size;
1534 raid1_free_buf(r1_bh);
1535 sync_request_done(sect, mddev_to_conf(mddev));
1536 md_done_sync(mddev,size>>9, uptodate);
1537 }
1538 }
1539
1540 #define INVALID_LEVEL KERN_WARNING \
1541 "raid1: md%d: raid level not set to mirroring (%d)\n"
1542
1543 #define NO_SB KERN_ERR \
1544 "raid1: disabled mirror %s (couldn't access raid superblock)\n"
1545
1546 #define ERRORS KERN_ERR \
1547 "raid1: disabled mirror %s (errors detected)\n"
1548
1549 #define NOT_IN_SYNC KERN_ERR \
1550 "raid1: disabled mirror %s (not in sync)\n"
1551
1552 #define INCONSISTENT KERN_ERR \
1553 "raid1: disabled mirror %s (inconsistent descriptor)\n"
1554
1555 #define ALREADY_RUNNING KERN_ERR \
1556 "raid1: disabled mirror %s (mirror %d already operational)\n"
1557
1558 #define OPERATIONAL KERN_INFO \
1559 "raid1: device %s operational as mirror %d\n"
1560
1561 #define MEM_ERROR KERN_ERR \
1562 "raid1: couldn't allocate memory for md%d\n"
1563
1564 #define SPARE KERN_INFO \
1565 "raid1: spare disk %s\n"
1566
1567 #define NONE_OPERATIONAL KERN_ERR \
1568 "raid1: no operational mirrors for md%d\n"
1569
1570 #define ARRAY_IS_ACTIVE KERN_INFO \
1571 "raid1: raid set md%d active with %d out of %d mirrors\n"
1572
1573 #define THREAD_ERROR KERN_ERR \
1574 "raid1: couldn't allocate thread for md%d\n"
1575
1576 #define START_RESYNC KERN_WARNING \
1577 "raid1: raid set md%d not clean; reconstructing mirrors\n"
1578
1579 static int raid1_run (mddev_t *mddev)
1580 {
1581 raid1_conf_t *conf;
1582 int i, j, disk_idx;
1583 struct mirror_info *disk;
1584 mdp_super_t *sb = mddev->sb;
1585 mdp_disk_t *descriptor;
1586 mdk_rdev_t *rdev;
1587 struct md_list_head *tmp;
1588 int start_recovery = 0;
1589
1590 MOD_INC_USE_COUNT;
1591
1592 if (sb->level != 1) {
1593 printk(INVALID_LEVEL, mdidx(mddev), sb->level);
1594 goto out;
1595 }
1596 /*
1597 * copy the already verified devices into our private RAID1
1598 * bookkeeping area. [whatever we allocate in raid1_run(),
1599 * should be freed in raid1_stop()]
1600 */
1601
1602 conf = kmalloc(sizeof(raid1_conf_t), GFP_KERNEL);
1603 mddev->private = conf;
1604 if (!conf) {
1605 printk(MEM_ERROR, mdidx(mddev));
1606 goto out;
1607 }
1608 memset(conf, 0, sizeof(*conf));
1609
1610 ITERATE_RDEV(mddev,rdev,tmp) {
1611 if (rdev->faulty) {
1612 printk(ERRORS, partition_name(rdev->dev));
1613 } else {
1614 if (!rdev->sb) {
1615 MD_BUG();
1616 continue;
1617 }
1618 }
1619 if (rdev->desc_nr == -1) {
1620 MD_BUG();
1621 continue;
1622 }
1623 descriptor = &sb->disks[rdev->desc_nr];
1624 disk_idx = descriptor->raid_disk;
1625 disk = conf->mirrors + disk_idx;
1626
1627 if (disk_faulty(descriptor)) {
1628 disk->number = descriptor->number;
1629 disk->raid_disk = disk_idx;
1630 disk->dev = rdev->dev;
1631 disk->sect_limit = MAX_WORK_PER_DISK;
1632 disk->operational = 0;
1633 disk->write_only = 0;
1634 disk->spare = 0;
1635 disk->used_slot = 1;
1636 disk->head_position = 0;
1637 continue;
1638 }
1639 if (disk_active(descriptor)) {
1640 if (!disk_sync(descriptor)) {
1641 printk(NOT_IN_SYNC,
1642 partition_name(rdev->dev));
1643 continue;
1644 }
1645 if ((descriptor->number > MD_SB_DISKS) ||
1646 (disk_idx > sb->raid_disks)) {
1647
1648 printk(INCONSISTENT,
1649 partition_name(rdev->dev));
1650 continue;
1651 }
1652 if (disk->operational) {
1653 printk(ALREADY_RUNNING,
1654 partition_name(rdev->dev),
1655 disk_idx);
1656 continue;
1657 }
1658 printk(OPERATIONAL, partition_name(rdev->dev),
1659 disk_idx);
1660 disk->number = descriptor->number;
1661 disk->raid_disk = disk_idx;
1662 disk->dev = rdev->dev;
1663 disk->sect_limit = MAX_WORK_PER_DISK;
1664 disk->operational = 1;
1665 disk->write_only = 0;
1666 disk->spare = 0;
1667 disk->used_slot = 1;
1668 disk->head_position = 0;
1669 conf->working_disks++;
1670 } else {
1671 /*
1672 * Must be a spare disk ..
1673 */
1674 printk(SPARE, partition_name(rdev->dev));
1675 disk->number = descriptor->number;
1676 disk->raid_disk = disk_idx;
1677 disk->dev = rdev->dev;
1678 disk->sect_limit = MAX_WORK_PER_DISK;
1679 disk->operational = 0;
1680 disk->write_only = 0;
1681 disk->spare = 1;
1682 disk->used_slot = 1;
1683 disk->head_position = 0;
1684 }
1685 }
1686 conf->raid_disks = sb->raid_disks;
1687 conf->nr_disks = sb->nr_disks;
1688 conf->mddev = mddev;
1689 conf->device_lock = MD_SPIN_LOCK_UNLOCKED;
1690
1691 conf->segment_lock = MD_SPIN_LOCK_UNLOCKED;
1692 init_waitqueue_head(&conf->wait_buffer);
1693 init_waitqueue_head(&conf->wait_done);
1694 init_waitqueue_head(&conf->wait_ready);
1695
1696 if (!conf->working_disks) {
1697 printk(NONE_OPERATIONAL, mdidx(mddev));
1698 goto out_free_conf;
1699 }
1700
1701
1702 /* pre-allocate some buffer_head structures.
1703 * As a minimum, 1 r1bh and raid_disks buffer_heads
1704 * would probably get us by in tight memory situations,
1705 * but a few more is probably a good idea.
1706 * For now, try NR_RESERVED_BUFS r1bh and
1707 * NR_RESERVED_BUFS*raid_disks bufferheads
1708 * This will allow at least NR_RESERVED_BUFS concurrent
1709 * reads or writes even if kmalloc starts failing
1710 */
1711 if (raid1_grow_r1bh(conf, NR_RESERVED_BUFS) < NR_RESERVED_BUFS ||
1712 raid1_grow_bh(conf, NR_RESERVED_BUFS*conf->raid_disks)
1713 < NR_RESERVED_BUFS*conf->raid_disks) {
1714 printk(MEM_ERROR, mdidx(mddev));
1715 goto out_free_conf;
1716 }
1717
1718 for (i = 0; i < MD_SB_DISKS; i++) {
1719
1720 descriptor = sb->disks+i;
1721 disk_idx = descriptor->raid_disk;
1722 disk = conf->mirrors + disk_idx;
1723
1724 if (disk_faulty(descriptor) && (disk_idx < conf->raid_disks) &&
1725 !disk->used_slot) {
1726
1727 disk->number = descriptor->number;
1728 disk->raid_disk = disk_idx;
1729 disk->dev = MKDEV(0,0);
1730
1731 disk->operational = 0;
1732 disk->write_only = 0;
1733 disk->spare = 0;
1734 disk->used_slot = 1;
1735 disk->head_position = 0;
1736 }
1737 }
1738
1739 /*
1740 * find the first working one and use it as a starting point
1741 * for read balancing.
1742 */
1743 for (j = 0; j < MD_SB_DISKS && !conf->mirrors[j].operational; j++)
1744 /* nothing */;
1745 conf->last_used = j;
1746
1747
1748
1749 {
1750 const char * name = "raid1d";
1751
1752 conf->thread = md_register_thread(raid1d, conf, name);
1753 if (!conf->thread) {
1754 printk(THREAD_ERROR, mdidx(mddev));
1755 goto out_free_conf;
1756 }
1757 }
1758
1759 if (!(sb->state & (1 << MD_SB_CLEAN)) &&
1760 (conf->working_disks > 1)) {
1761 const char * name = "raid1syncd";
1762
1763 conf->resync_thread = md_register_thread(raid1syncd, conf,name);
1764 if (!conf->resync_thread) {
1765 printk(THREAD_ERROR, mdidx(mddev));
1766 goto out_free_conf;
1767 }
1768
1769 printk(START_RESYNC, mdidx(mddev));
1770 conf->resync_mirrors = 1;
1771 md_wakeup_thread(conf->resync_thread);
1772 } else if (conf->working_disks != sb->raid_disks) {
1773 printk(KERN_ALERT "raid1: md%d, not all disks are operational -- trying to recover array\n", mdidx(mddev));
1774 start_recovery = 1;
1775 }
1776
1777 /*
1778 * Regenerate the "device is in sync with the raid set" bit for
1779 * each device.
1780 */
1781 for (i = 0; i < MD_SB_DISKS; i++) {
1782 mark_disk_nonsync(sb->disks+i);
1783 for (j = 0; j < sb->raid_disks; j++) {
1784 if (!conf->mirrors[j].operational)
1785 continue;
1786 if (sb->disks[i].number == conf->mirrors[j].number)
1787 mark_disk_sync(sb->disks+i);
1788 }
1789 }
1790 sb->active_disks = conf->working_disks;
1791
1792 if (start_recovery)
1793 md_recover_arrays();
1794
1795
1796 printk(ARRAY_IS_ACTIVE, mdidx(mddev), sb->active_disks, sb->raid_disks);
1797 /*
1798 * Ok, everything is just fine now
1799 */
1800 return 0;
1801
1802 out_free_conf:
1803 raid1_shrink_r1bh(conf);
1804 raid1_shrink_bh(conf);
1805 raid1_shrink_buffers(conf);
1806 kfree(conf);
1807 mddev->private = NULL;
1808 out:
1809 MOD_DEC_USE_COUNT;
1810 return -EIO;
1811 }
1812
1813 #undef INVALID_LEVEL
1814 #undef NO_SB
1815 #undef ERRORS
1816 #undef NOT_IN_SYNC
1817 #undef INCONSISTENT
1818 #undef ALREADY_RUNNING
1819 #undef OPERATIONAL
1820 #undef SPARE
1821 #undef NONE_OPERATIONAL
1822 #undef ARRAY_IS_ACTIVE
1823
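/* conf->resync_mirrors: 0 = no resync needed, 1 = resync pending/running,
 * 2 = resync was interrupted by raid1_stop_resync() and should be restarted
 * later via raid1_restart_resync(). */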
1824 static int raid1_stop_resync (mddev_t *mddev)
1825 {
1826 raid1_conf_t *conf = mddev_to_conf(mddev);
1827
1828 if (conf->resync_thread) {
1829 if (conf->resync_mirrors) {
1830 conf->resync_mirrors = 2;
1831 md_interrupt_thread(conf->resync_thread);
1832
1833 printk(KERN_INFO "raid1: mirror resync was not fully finished, restarting next time.\n");
1834 return 1;
1835 }
1836 return 0;
1837 }
1838 return 0;
1839 }
1840
1841 static int raid1_restart_resync (mddev_t *mddev)
1842 {
1843 raid1_conf_t *conf = mddev_to_conf(mddev);
1844
1845 if (conf->resync_mirrors) {
1846 if (!conf->resync_thread) {
1847 MD_BUG();
1848 return 0;
1849 }
1850 conf->resync_mirrors = 1;
1851 md_wakeup_thread(conf->resync_thread);
1852 return 1;
1853 }
1854 return 0;
1855 }
1856
1857 static int raid1_stop (mddev_t *mddev)
1858 {
1859 raid1_conf_t *conf = mddev_to_conf(mddev);
1860
1861 md_unregister_thread(conf->thread);
1862 if (conf->resync_thread)
1863 md_unregister_thread(conf->resync_thread);
1864 raid1_shrink_r1bh(conf);
1865 raid1_shrink_bh(conf);
1866 raid1_shrink_buffers(conf);
1867 kfree(conf);
1868 mddev->private = NULL;
1869 MOD_DEC_USE_COUNT;
1870 return 0;
1871 }
1872
1873 static mdk_personality_t raid1_personality=
1874 {
1875 name: "raid1",
1876 make_request: raid1_make_request,
1877 run: raid1_run,
1878 stop: raid1_stop,
1879 status: raid1_status,
1880 error_handler: raid1_error,
1881 diskop: raid1_diskop,
1882 stop_resync: raid1_stop_resync,
1883 restart_resync: raid1_restart_resync,
1884 sync_request: raid1_sync_request
1885 };
1886
1887 static int md__init raid1_init (void)
1888 {
1889 return register_md_personality (RAID1, &raid1_personality);
1890 }
1891
1892 static void raid1_exit (void)
1893 {
1894 unregister_md_personality (RAID1);
1895 }
1896
1897 module_init(raid1_init);
1898 module_exit(raid1_exit);
1899 MODULE_LICENSE("GPL");
1900