1 /*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33 /*
34 * The xfs_buf.c code provides an abstract buffer cache model on top
35 * of the Linux page cache. Cached metadata blocks for a file system
36 * are hashed to the inode for the block device. xfs_buf.c assembles
37 * buffers (xfs_buf_t) on demand to aggregate such cached pages for I/O.
38 *
39 * Written by Steve Lord, Jim Mostek, Russell Cattelan
40 * and Rajagopal Ananthanarayanan ("ananth") at SGI.
41 *
42 */
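
/*
 * Editor's sketch (not part of the original file): a minimal view of the
 * model described above, using only routines defined below.  The variables
 * target, blkno and nbbs are placeholders, and xfs_buf_relse() is assumed
 * to be the usual "unlock and drop the hold" wrapper declared elsewhere.
 *
 *	xfs_buf_t	*bp;
 *
 *	bp = xfs_buf_read_flags(target, blkno, nbbs, PBF_LOCK | PBF_MAPPED);
 *	if (bp && !bp->pb_error) {
 *		examine_metadata(bp->pb_addr);	// examine_metadata: placeholder
 *		xfs_buf_relse(bp);		// unlock + pagebuf_rele
 *	}
 */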
43
44 #include <linux/stddef.h>
45 #include <linux/errno.h>
46 #include <linux/slab.h>
47 #include <linux/pagemap.h>
48 #include <linux/init.h>
49 #include <linux/vmalloc.h>
50 #include <linux/blkdev.h>
51 #include <linux/locks.h>
52 #include <linux/sysctl.h>
53 #include <linux/proc_fs.h>
54
55 #include "xfs_linux.h"
56
57 #define BN_ALIGN_MASK ((1 << (PAGE_CACHE_SHIFT - BBSHIFT)) - 1)
58
59 #ifndef GFP_READAHEAD
60 #define GFP_READAHEAD 0
61 #endif
62
63 /*
64 * A backport of the 2.5 scheduler is used by many vendors of 2.4-based
65 * distributions.
66 * We can only guess its presence by the lack of the SCHED_YIELD flag.
67 * If the heuristic doesn't work, change this define by hand.
68 */
69 #ifndef SCHED_YIELD
70 #define __HAVE_NEW_SCHEDULER 1
71 #endif
72
73 /*
74 * cpumask_t is used for supporting NR_CPUS > BITS_PER_LONG.
75 * If support for this is present, migrate_to_cpu exists and provides
76 * a wrapper around the set_cpus_allowed routine.
77 */
78 #ifdef copy_cpumask
79 #define __HAVE_CPUMASK_T 1
80 #endif
81
82 #ifndef __HAVE_CPUMASK_T
83 # ifndef __HAVE_NEW_SCHEDULER
84 # define migrate_to_cpu(cpu) \
85 do { current->cpus_allowed = 1UL << (cpu); } while (0)
86 # else
87 # define migrate_to_cpu(cpu) \
88 set_cpus_allowed(current, 1UL << (cpu))
89 # endif
90 #endif
91
92 #ifndef VM_MAP
93 #define VM_MAP VM_ALLOC
94 #endif
95
96 /*
97 * File wide globals
98 */
99
100 STATIC kmem_cache_t *pagebuf_cache;
101 STATIC kmem_shaker_t pagebuf_shake;
102
103 #define MAX_IO_DAEMONS NR_CPUS
104 #define CPU_TO_DAEMON(cpu) (cpu)
105 STATIC int pb_logio_daemons[MAX_IO_DAEMONS];
106 STATIC struct list_head pagebuf_logiodone_tq[MAX_IO_DAEMONS];
107 STATIC wait_queue_head_t pagebuf_logiodone_wait[MAX_IO_DAEMONS];
108 STATIC int pb_dataio_daemons[MAX_IO_DAEMONS];
109 STATIC struct list_head pagebuf_dataiodone_tq[MAX_IO_DAEMONS];
110 STATIC wait_queue_head_t pagebuf_dataiodone_wait[MAX_IO_DAEMONS];
111
112 /*
113 * For pre-allocated buffer head pool
114 */
115
116 #define NR_RESERVED_BH 64
117 static wait_queue_head_t pb_resv_bh_wait;
118 static spinlock_t pb_resv_bh_lock = SPIN_LOCK_UNLOCKED;
119 struct buffer_head *pb_resv_bh = NULL; /* list of bh */
120 int pb_resv_bh_cnt = 0; /* # of bh available */
121
122 STATIC void _pagebuf_ioapply(xfs_buf_t *);
123 STATIC int pagebuf_daemon_wakeup(int, unsigned int);
124 STATIC void pagebuf_delwri_queue(xfs_buf_t *, int);
125 STATIC void pagebuf_runall_queues(struct list_head[]);
126
127 /*
128 * Pagebuf debugging
129 */
130
131 #ifdef PAGEBUF_TRACE
132 void
133 pagebuf_trace(
134 xfs_buf_t *pb,
135 char *id,
136 void *data,
137 void *ra)
138 {
139 ktrace_enter(pagebuf_trace_buf,
140 pb, id,
141 (void *)(unsigned long)pb->pb_flags,
142 (void *)(unsigned long)pb->pb_hold.counter,
143 (void *)(unsigned long)pb->pb_sema.count.counter,
144 (void *)current,
145 data, ra,
146 (void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff),
147 (void *)(unsigned long)(pb->pb_file_offset & 0xffffffff),
148 (void *)(unsigned long)pb->pb_buffer_length,
149 NULL, NULL, NULL, NULL, NULL);
150 }
151 ktrace_t *pagebuf_trace_buf;
152 #define PAGEBUF_TRACE_SIZE 4096
153 #define PB_TRACE(pb, id, data) \
154 pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0))
155 #else
156 #define PB_TRACE(pb, id, data) do { } while (0)
157 #endif
158
159 #ifdef PAGEBUF_LOCK_TRACKING
160 # define PB_SET_OWNER(pb) ((pb)->pb_last_holder = current->pid)
161 # define PB_CLEAR_OWNER(pb) ((pb)->pb_last_holder = -1)
162 # define PB_GET_OWNER(pb) ((pb)->pb_last_holder)
163 #else
164 # define PB_SET_OWNER(pb) do { } while (0)
165 # define PB_CLEAR_OWNER(pb) do { } while (0)
166 # define PB_GET_OWNER(pb) do { } while (0)
167 #endif
168
169 /*
170 * Pagebuf allocation / freeing.
171 */
172
173 #define pb_to_gfp(flags) \
174 (((flags) & PBF_READ_AHEAD) ? GFP_READAHEAD : \
175 ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL)
176
177 #define pb_to_km(flags) \
178 (((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
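
/*
 * Editor's note (illustrative, not original): the two macros above select
 * the allocation mode from the buffer flags, roughly:
 *
 *	pb_to_gfp(PBF_READ_AHEAD)	evaluates to GFP_READAHEAD
 *					(0 above when the kernel lacks it)
 *	pb_to_gfp(PBF_DONT_BLOCK)	evaluates to GFP_NOFS
 *	pb_to_gfp(0)			evaluates to GFP_KERNEL
 *	pb_to_km(PBF_DONT_BLOCK)	evaluates to KM_NOFS
 *	pb_to_km(0)			evaluates to KM_SLEEP
 */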
179
180
181 #define pagebuf_allocate(flags) \
182 kmem_zone_alloc(pagebuf_cache, pb_to_km(flags))
183 #define pagebuf_deallocate(pb) \
184 kmem_zone_free(pagebuf_cache, (pb));
185
186 /*
187 * Pagebuf hashing
188 */
189
190 #define NBITS 8
191 #define NHASH (1<<NBITS)
192
193 typedef struct {
194 struct list_head pb_hash;
195 spinlock_t pb_hash_lock;
196 } pb_hash_t;
197
198 STATIC pb_hash_t pbhash[NHASH];
199 #define pb_hash(pb) &pbhash[pb->pb_hash_index]
200
201 STATIC int
202 _bhash(
203 struct block_device *bdev,
204 loff_t base)
205 {
206 int bit, hval;
207
208 base >>= 9;
209 base ^= (unsigned long)bdev / L1_CACHE_BYTES;
210 for (bit = hval = 0; base && bit < sizeof(base) * 8; bit += NBITS) {
211 hval ^= (int)base & (NHASH-1);
212 base >>= NBITS;
213 }
214 return hval;
215 }
216
217 /*
218 * Mapping of multi-page buffers into contiguous virtual space
219 */
220
221 typedef struct a_list {
222 void *vm_addr;
223 struct a_list *next;
224 } a_list_t;
225
226 STATIC a_list_t *as_free_head;
227 STATIC int as_list_len;
228 STATIC spinlock_t as_lock = SPIN_LOCK_UNLOCKED;
229
230 /*
231 * Try to batch vunmaps because they are costly.
232 */
233 STATIC void
234 free_address(
235 void *addr)
236 {
237 a_list_t *aentry;
238
239 aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC);
240 if (aentry) {
241 spin_lock(&as_lock);
242 aentry->next = as_free_head;
243 aentry->vm_addr = addr;
244 as_free_head = aentry;
245 as_list_len++;
246 spin_unlock(&as_lock);
247 } else {
248 vunmap(addr);
249 }
250 }
251
252 STATIC void
253 purge_addresses(void)
254 {
255 a_list_t *aentry, *old;
256
257 if (as_free_head == NULL)
258 return;
259
260 spin_lock(&as_lock);
261 aentry = as_free_head;
262 as_free_head = NULL;
263 as_list_len = 0;
264 spin_unlock(&as_lock);
265
266 while ((old = aentry) != NULL) {
267 vunmap(aentry->vm_addr);
268 aentry = aentry->next;
269 kfree(old);
270 }
271 }
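
/*
 * Editor's sketch (not original code): how the two helpers above are paired
 * by callers in this file.  A multi-page mapping is torn down lazily, and
 * the backlog is drained from process context before new mappings are
 * created (see _pagebuf_map_pages and pagebuf_daemon):
 *
 *	free_address(bp->pb_addr - bp->pb_offset);	// defer the vunmap
 *	...
 *	if (as_list_len > 64)
 *		purge_addresses();			// drain the backlog
 */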
272
273 /*
274 * Internal pagebuf object manipulation
275 */
276
277 STATIC void
278 _pagebuf_initialize(
279 xfs_buf_t *pb,
280 xfs_buftarg_t *target,
281 loff_t range_base,
282 size_t range_length,
283 page_buf_flags_t flags)
284 {
285 /*
286 * We don't want certain flags to appear in pb->pb_flags.
287 */
288 flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD);
289
290 memset(pb, 0, sizeof(xfs_buf_t));
291 atomic_set(&pb->pb_hold, 1);
292 init_MUTEX_LOCKED(&pb->pb_iodonesema);
293 INIT_LIST_HEAD(&pb->pb_list);
294 INIT_LIST_HEAD(&pb->pb_hash_list);
295 init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */
296 PB_SET_OWNER(pb);
297 pb->pb_target = target;
298 pb->pb_file_offset = range_base;
299 /*
300 * Set buffer_length and count_desired to the same value initially.
301 * I/O routines should use count_desired, which will be the same in
302 * most cases but may be reset (e.g. XFS recovery).
303 */
304 pb->pb_buffer_length = pb->pb_count_desired = range_length;
305 pb->pb_flags = flags | PBF_NONE;
306 pb->pb_bn = XFS_BUF_DADDR_NULL;
307 atomic_set(&pb->pb_pin_count, 0);
308 init_waitqueue_head(&pb->pb_waiters);
309
310 XFS_STATS_INC(pb_create);
311 PB_TRACE(pb, "initialize", target);
312 }
313
314 /*
315 * Allocate a page array capable of holding a specified number
316 * of pages, and point the page buf at it.
317 */
318 STATIC int
319 _pagebuf_get_pages(
320 xfs_buf_t *pb,
321 int page_count,
322 page_buf_flags_t flags)
323 {
324 /* Make sure that we have a page list */
325 if (pb->pb_pages == NULL) {
326 pb->pb_offset = page_buf_poff(pb->pb_file_offset);
327 pb->pb_page_count = page_count;
328 if (page_count <= PB_PAGES) {
329 pb->pb_pages = pb->pb_page_array;
330 } else {
331 pb->pb_pages = kmem_alloc(sizeof(struct page *) *
332 page_count, pb_to_km(flags));
333 if (pb->pb_pages == NULL)
334 return -ENOMEM;
335 }
336 memset(pb->pb_pages, 0, sizeof(struct page *) * page_count);
337 }
338 return 0;
339 }
340
341 /*
342 * Frees pb_pages if it was malloced.
343 */
344 STATIC void
345 _pagebuf_free_pages(
346 xfs_buf_t *bp)
347 {
348 if (bp->pb_pages != bp->pb_page_array) {
349 kmem_free(bp->pb_pages,
350 bp->pb_page_count * sizeof(struct page *));
351 }
352 }
353
354 /*
355 * Releases the specified buffer.
356 *
357 * The modification state of any associated pages is left unchanged.
358 * The buffer must not be on any hash - use pagebuf_rele instead for
359 * hashed and refcounted buffers.
360 */
361 void
362 pagebuf_free(
363 xfs_buf_t *bp)
364 {
365 PB_TRACE(bp, "free", 0);
366
367 ASSERT(list_empty(&bp->pb_hash_list));
368
369 if (bp->pb_flags & _PBF_PAGE_CACHE) {
370 uint i;
371
372 if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1))
373 free_address(bp->pb_addr - bp->pb_offset);
374
375 for (i = 0; i < bp->pb_page_count; i++)
376 page_cache_release(bp->pb_pages[i]);
377 _pagebuf_free_pages(bp);
378 } else if (bp->pb_flags & _PBF_KMEM_ALLOC) {
379 /*
380 * XXX(hch): bp->pb_count_desired might be incorrect (see
381 * pagebuf_associate_memory for details), but fortunately
382 * the Linux version of kmem_free ignores the len argument..
383 */
384 kmem_free(bp->pb_addr, bp->pb_count_desired);
385 _pagebuf_free_pages(bp);
386 }
387
388 pagebuf_deallocate(bp);
389 }
390
391 /*
392 * Finds all pages for buffer in question and builds its page list.
393 */
394 STATIC int
395 _pagebuf_lookup_pages(
396 xfs_buf_t *bp,
397 uint flags)
398 {
399 struct address_space *mapping = bp->pb_target->pbr_mapping;
400 size_t blocksize = bp->pb_target->pbr_bsize;
401 int gfp_mask = pb_to_gfp(flags);
402 unsigned short page_count, i;
403 pgoff_t first;
404 loff_t end;
405 int error;
406
407 end = bp->pb_file_offset + bp->pb_buffer_length;
408 page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset);
409
410 error = _pagebuf_get_pages(bp, page_count, flags);
411 if (unlikely(error))
412 return error;
413 bp->pb_flags |= _PBF_PAGE_CACHE;
414
415 first = bp->pb_file_offset >> PAGE_CACHE_SHIFT;
416
417 for (i = 0; i < bp->pb_page_count; i++) {
418 struct page *page;
419 uint retries = 0;
420
421 retry:
422 page = find_or_create_page(mapping, first + i, gfp_mask);
423 if (unlikely(page == NULL)) {
424 if (flags & PBF_READ_AHEAD) {
425 bp->pb_page_count = i;
426 for (i = 0; i < bp->pb_page_count; i++)
427 unlock_page(bp->pb_pages[i]);
428 return -ENOMEM;
429 }
430
431 /*
432 * This could deadlock.
433 *
434 * But until all the XFS low-level code is revamped to
435 * handle buffer allocation failures we can't do much.
436 */
437 if (!(++retries % 100))
438 printk(KERN_ERR
439 "possible deadlock in %s (mode:0x%x)\n",
440 __FUNCTION__, gfp_mask);
441
442 XFS_STATS_INC(pb_page_retries);
443 pagebuf_daemon_wakeup(0, gfp_mask);
444 set_current_state(TASK_UNINTERRUPTIBLE);
445 schedule_timeout(10);
446 goto retry;
447 }
448
449 XFS_STATS_INC(pb_page_found);
450
451 /* if we need to do I/O on a page, record the fact */
452 if (!Page_Uptodate(page)) {
453 page_count--;
454 if (blocksize == PAGE_CACHE_SIZE && (flags & PBF_READ))
455 bp->pb_locked = 1;
456 }
457
458 bp->pb_pages[i] = page;
459 }
460
461 if (!bp->pb_locked) {
462 for (i = 0; i < bp->pb_page_count; i++)
463 unlock_page(bp->pb_pages[i]);
464 }
465
466 if (page_count) {
467 /* if we have any uptodate pages, mark that in the buffer */
468 bp->pb_flags &= ~PBF_NONE;
469
470 /* if some pages aren't uptodate, mark that in the buffer */
471 if (page_count != bp->pb_page_count)
472 bp->pb_flags |= PBF_PARTIAL;
473 }
474
475 PB_TRACE(bp, "lookup_pages", (long)page_count);
476 return error;
477 }
478
479 /*
480 * Map buffer into kernel address-space if necessary.
481 */
482 STATIC int
483 _pagebuf_map_pages(
484 xfs_buf_t *bp,
485 uint flags)
486 {
487 /* A single page buffer is always mappable */
488 if (bp->pb_page_count == 1) {
489 bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset;
490 bp->pb_flags |= PBF_MAPPED;
491 } else if (flags & PBF_MAPPED) {
492 if (as_list_len > 64)
493 purge_addresses();
494 bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count,
495 VM_MAP, PAGE_KERNEL);
496 if (unlikely(bp->pb_addr == NULL))
497 return -ENOMEM;
498 bp->pb_addr += bp->pb_offset;
499 bp->pb_flags |= PBF_MAPPED;
500 }
501
502 return 0;
503 }
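
/*
 * Editor's sketch (illustrative only): pb_addr is usable only once
 * PBF_MAPPED is set, which is always the case for single-page buffers but
 * happens for multi-page buffers only when the caller asked for a mapping.
 * Otherwise the data can be reached page by page via pagebuf_offset()
 * (byte_off is a placeholder):
 *
 *	if (bp->pb_flags & PBF_MAPPED)
 *		ptr = bp->pb_addr + byte_off;
 *	else
 *		ptr = pagebuf_offset(bp, byte_off);
 */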
504
505 /*
506 * Pre-allocation of a pool of buffer heads for use in
507 * low-memory situations.
508 */
509
510 /*
511 * _pagebuf_prealloc_bh
512 *
513 * Pre-allocate a pool of "count" buffer heads at startup.
514 * Puts them on a list at "pb_resv_bh"
515 * Returns number of bh actually allocated to pool.
516 */
517 STATIC int
518 _pagebuf_prealloc_bh(
519 int count)
520 {
521 struct buffer_head *bh;
522 int i;
523
524 for (i = 0; i < count; i++) {
525 bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
526 if (!bh)
527 break;
528 bh->b_pprev = &pb_resv_bh;
529 bh->b_next = pb_resv_bh;
530 pb_resv_bh = bh;
531 pb_resv_bh_cnt++;
532 }
533 return i;
534 }
535
536 /*
537 * _pagebuf_get_prealloc_bh
538 *
539 * Get one buffer head from our pre-allocated pool.
540 * If pool is empty, sleep 'til one comes back in.
541 * Returns aforementioned buffer head.
542 */
543 STATIC struct buffer_head *
544 _pagebuf_get_prealloc_bh(void)
545 {
546 unsigned long flags;
547 struct buffer_head *bh;
548 DECLARE_WAITQUEUE (wait, current);
549
550 spin_lock_irqsave(&pb_resv_bh_lock, flags);
551
552 if (pb_resv_bh_cnt < 1) {
553 add_wait_queue(&pb_resv_bh_wait, &wait);
554 do {
555 set_current_state(TASK_UNINTERRUPTIBLE);
556 spin_unlock_irqrestore(&pb_resv_bh_lock, flags);
557 run_task_queue(&tq_disk);
558 schedule();
559 spin_lock_irqsave(&pb_resv_bh_lock, flags);
560 } while (pb_resv_bh_cnt < 1);
561 __set_current_state(TASK_RUNNING);
562 remove_wait_queue(&pb_resv_bh_wait, &wait);
563 }
564
565 BUG_ON(pb_resv_bh_cnt < 1);
566 BUG_ON(!pb_resv_bh);
567
568 bh = pb_resv_bh;
569 pb_resv_bh = bh->b_next;
570 pb_resv_bh_cnt--;
571
572 spin_unlock_irqrestore(&pb_resv_bh_lock, flags);
573 return bh;
574 }
575
576 /*
577 * _pagebuf_free_bh
578 *
579 * Take care of buffer heads that we're finished with.
580 * Call this instead of just kmem_cache_free(bh_cachep, bh)
581 * when you're done with a bh.
582 *
583 * If our pre-allocated pool is full, just free the buffer head.
584 * Otherwise, put it back in the pool, and wake up anybody
585 * waiting for one.
586 */
587 STATIC inline void
588 _pagebuf_free_bh(
589 struct buffer_head *bh)
590 {
591 unsigned long flags;
592 int free;
593
594 if (! (free = pb_resv_bh_cnt >= NR_RESERVED_BH)) {
595 spin_lock_irqsave(&pb_resv_bh_lock, flags);
596
597 if (! (free = pb_resv_bh_cnt >= NR_RESERVED_BH)) {
598 bh->b_pprev = &pb_resv_bh;
599 bh->b_next = pb_resv_bh;
600 pb_resv_bh = bh;
601 pb_resv_bh_cnt++;
602
603 if (waitqueue_active(&pb_resv_bh_wait)) {
604 wake_up(&pb_resv_bh_wait);
605 }
606 }
607
608 spin_unlock_irqrestore(&pb_resv_bh_lock, flags);
609 }
610 if (free) {
611 kmem_cache_free(bh_cachep, bh);
612 }
613 }
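
/*
 * Editor's sketch (not original code): the intended use of the pool above,
 * mirroring what _pagebuf_page_io() does further down.  Try the slab cache
 * first, fall back to the reserved pool, and always release through
 * _pagebuf_free_bh() so that waiters are woken:
 *
 *	bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS);
 *	if (!bh)
 *		bh = _pagebuf_get_prealloc_bh();
 *	...
 *	_pagebuf_free_bh(bh);
 */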
614
615 /*
616 * Finding and Reading Buffers
617 */
618
619 /*
620 * _pagebuf_find
621 *
622 * Looks up, and creates if absent, a lockable buffer for
623 * a given range of an inode. The buffer is returned
624 * locked. If other overlapping buffers exist, they are
625 * released before the new buffer is created and locked,
626 * which may imply that this call will block until those buffers
627 * are unlocked. No I/O is implied by this call.
628 */
629 xfs_buf_t *
630 _pagebuf_find(			/* find buffer for block	*/
631 xfs_buftarg_t *target,/* target for block */
632 loff_t ioff, /* starting offset of range */
633 size_t isize, /* length of range */
634 page_buf_flags_t flags, /* PBF_TRYLOCK */
635 xfs_buf_t *new_pb)/* newly allocated buffer */
636 {
637 loff_t range_base;
638 size_t range_length;
639 int hval;
640 pb_hash_t *h;
641 xfs_buf_t *pb, *n;
642 int not_locked;
643
644 range_base = (ioff << BBSHIFT);
645 range_length = (isize << BBSHIFT);
646
647 /* Ensure we never do IOs smaller than the sector size */
648 BUG_ON(range_length < (1 << target->pbr_sshift));
649
650 /* Ensure we never do IOs that are not sector aligned */
651 BUG_ON(range_base & (loff_t)target->pbr_smask);
652
653 hval = _bhash(target->pbr_bdev, range_base);
654 h = &pbhash[hval];
655
656 spin_lock(&h->pb_hash_lock);
657
658 list_for_each_entry_safe(pb, n, &h->pb_hash, pb_hash_list) {
659 if (pb->pb_target == target &&
660 pb->pb_file_offset == range_base &&
661 pb->pb_buffer_length == range_length) {
662 /* If we look at something, bring it to the
663 * front of the list for next time
664 */
665 atomic_inc(&pb->pb_hold);
666 list_move(&pb->pb_hash_list, &h->pb_hash);
667 goto found;
668 }
669 }
670
671 /* No match found */
672 if (new_pb) {
673 _pagebuf_initialize(new_pb, target, range_base,
674 range_length, flags);
675 new_pb->pb_hash_index = hval;
676 list_add(&new_pb->pb_hash_list, &h->pb_hash);
677 } else {
678 XFS_STATS_INC(pb_miss_locked);
679 }
680
681 spin_unlock(&h->pb_hash_lock);
682 return (new_pb);
683
684 found:
685 spin_unlock(&h->pb_hash_lock);
686
687 /* Attempt to get the semaphore without sleeping;
688 * if this does not work then we need to drop the
689 * spinlock and do a hard attempt on the semaphore.
690 */
691 not_locked = down_trylock(&pb->pb_sema);
692 if (not_locked) {
693 if (!(flags & PBF_TRYLOCK)) {
694 /* wait for buffer ownership */
695 PB_TRACE(pb, "get_lock", 0);
696 pagebuf_lock(pb);
697 XFS_STATS_INC(pb_get_locked_waited);
698 } else {
699 /* We asked for a trylock and failed; no need
700 * to look at file offset and length here, since we
701 * know that this pagebuf at least overlaps our
702 * pagebuf and is locked.  Therefore our buffer
703 * either does not exist, or is this buffer.
704 */
705
706 pagebuf_rele(pb);
707 XFS_STATS_INC(pb_busy_locked);
708 return (NULL);
709 }
710 } else {
711 /* trylock worked */
712 PB_SET_OWNER(pb);
713 }
714
715 if (pb->pb_flags & PBF_STALE)
716 pb->pb_flags &= PBF_MAPPED;
717 PB_TRACE(pb, "got_lock", 0);
718 XFS_STATS_INC(pb_get_locked);
719 return (pb);
720 }
721
722 /*
723 * xfs_buf_get_flags assembles a buffer covering the specified range.
724 *
725 * Storage in memory for all portions of the buffer will be allocated,
726 * although backing storage may not be.
727 */
728 xfs_buf_t *
729 xfs_buf_get_flags(			/* allocate a buffer		*/
730 xfs_buftarg_t *target,/* target for buffer */
731 loff_t ioff, /* starting offset of range */
732 size_t isize, /* length of range */
733 page_buf_flags_t flags) /* PBF_TRYLOCK */
734 {
735 xfs_buf_t *pb, *new_pb;
736 int error = 0, i;
737
738 new_pb = pagebuf_allocate(flags);
739 if (unlikely(!new_pb))
740 return NULL;
741
742 pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
743 if (pb == new_pb) {
744 error = _pagebuf_lookup_pages(pb, flags);
745 if (error)
746 goto no_buffer;
747 } else {
748 pagebuf_deallocate(new_pb);
749 if (unlikely(pb == NULL))
750 return NULL;
751 }
752
753 for (i = 0; i < pb->pb_page_count; i++)
754 mark_page_accessed(pb->pb_pages[i]);
755
756 if (!(pb->pb_flags & PBF_MAPPED)) {
757 error = _pagebuf_map_pages(pb, flags);
758 if (unlikely(error)) {
759 printk(KERN_WARNING "%s: failed to map pages\n",
760 __FUNCTION__);
761 goto no_buffer;
762 }
763 }
764
765 XFS_STATS_INC(pb_get);
766
767 /*
768 * Always fill in the block number now; the mapped cases can do
769 * their own overlay of this later.
770 */
771 pb->pb_bn = ioff;
772 pb->pb_count_desired = pb->pb_buffer_length;
773
774 PB_TRACE(pb, "get", (unsigned long)flags);
775 return pb;
776
777 no_buffer:
778 if (flags & (PBF_LOCK | PBF_TRYLOCK))
779 pagebuf_unlock(pb);
780 pagebuf_rele(pb);
781 return NULL;
782 }
783
784 xfs_buf_t *
785 xfs_buf_read_flags(
786 xfs_buftarg_t *target,
787 loff_t ioff,
788 size_t isize,
789 page_buf_flags_t flags)
790 {
791 xfs_buf_t *pb;
792
793 flags |= PBF_READ;
794
795 pb = xfs_buf_get_flags(target, ioff, isize, flags);
796 if (pb) {
797 if (PBF_NOT_DONE(pb)) {
798 PB_TRACE(pb, "read", (unsigned long)flags);
799 XFS_STATS_INC(pb_get_read);
800 pagebuf_iostart(pb, flags);
801 } else if (flags & PBF_ASYNC) {
802 PB_TRACE(pb, "read_async", (unsigned long)flags);
803 /*
804 * Read ahead call which is already satisfied,
805 * drop the buffer
806 */
807 goto no_buffer;
808 } else {
809 PB_TRACE(pb, "read_done", (unsigned long)flags);
810 /* We do not want read in the flags */
811 pb->pb_flags &= ~PBF_READ;
812 }
813 }
814
815 return pb;
816
817 no_buffer:
818 if (flags & (PBF_LOCK | PBF_TRYLOCK))
819 pagebuf_unlock(pb);
820 pagebuf_rele(pb);
821 return NULL;
822 }
823
824 /*
825 * Create a skeletal pagebuf (no pages associated with it).
826 */
827 xfs_buf_t *
828 pagebuf_lookup(
829 xfs_buftarg_t *target,
830 loff_t ioff,
831 size_t isize,
832 page_buf_flags_t flags)
833 {
834 xfs_buf_t *pb;
835
836 flags |= _PBF_PRIVATE_BH;
837 pb = pagebuf_allocate(flags);
838 if (pb) {
839 _pagebuf_initialize(pb, target, ioff, isize, flags);
840 }
841 return pb;
842 }
843
844 /*
845 * If we are not low on memory then do the readahead in a
846 * deadlock-safe manner.
847 */
848 void
849 pagebuf_readahead(
850 xfs_buftarg_t *target,
851 loff_t ioff,
852 size_t isize,
853 page_buf_flags_t flags)
854 {
855 flags |= (PBF_TRYLOCK|PBF_ASYNC|PBF_READ_AHEAD);
856 xfs_buf_read_flags(target, ioff, isize, flags);
857 }
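
/*
 * Editor's sketch (illustrative): readahead is typically fired for blocks
 * that will probably be needed soon, with the real, blocking read issued
 * later.  If the readahead completed in the meantime, the second call finds
 * the pages uptodate and starts no new I/O (blkno and nbbs are placeholders):
 *
 *	pagebuf_readahead(target, blkno, nbbs, 0);
 *	...
 *	bp = xfs_buf_read_flags(target, blkno, nbbs, PBF_LOCK);
 */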
858
859 xfs_buf_t *
860 pagebuf_get_empty(
861 size_t len,
862 xfs_buftarg_t *target)
863 {
864 xfs_buf_t *pb;
865
866 pb = pagebuf_allocate(0);
867 if (pb)
868 _pagebuf_initialize(pb, target, 0, len, 0);
869 return pb;
870 }
871
872 static inline struct page *
873 mem_to_page(
874 void *addr)
875 {
876 if (((unsigned long)addr < VMALLOC_START) ||
877 ((unsigned long)addr >= VMALLOC_END)) {
878 return virt_to_page(addr);
879 } else {
880 return vmalloc_to_page(addr);
881 }
882 }
883
884 int
885 pagebuf_associate_memory(
886 xfs_buf_t *pb,
887 void *mem,
888 size_t len)
889 {
890 int rval;
891 int i = 0;
892 size_t ptr;
893 size_t end, end_cur;
894 off_t offset;
895 int page_count;
896
897 page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
898 offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
899 if (offset && (len > PAGE_CACHE_SIZE))
900 page_count++;
901
902 /* Free any previous set of page pointers */
903 if (pb->pb_pages)
904 _pagebuf_free_pages(pb);
905
906 pb->pb_pages = NULL;
907 pb->pb_addr = mem;
908
909 rval = _pagebuf_get_pages(pb, page_count, 0);
910 if (rval)
911 return rval;
912
913 pb->pb_offset = offset;
914 ptr = (size_t) mem & PAGE_CACHE_MASK;
915 end = PAGE_CACHE_ALIGN((size_t) mem + len);
916 end_cur = end;
917 /* set up first page */
918 pb->pb_pages[0] = mem_to_page(mem);
919
920 ptr += PAGE_CACHE_SIZE;
921 pb->pb_page_count = ++i;
922 while (ptr < end) {
923 pb->pb_pages[i] = mem_to_page((void *)ptr);
924 pb->pb_page_count = ++i;
925 ptr += PAGE_CACHE_SIZE;
926 }
927 pb->pb_locked = 0;
928
929 pb->pb_count_desired = pb->pb_buffer_length = len;
930 pb->pb_flags |= PBF_MAPPED | _PBF_PRIVATE_BH;
931
932 return 0;
933 }
934
935 xfs_buf_t *
936 pagebuf_get_no_daddr(
937 size_t len,
938 xfs_buftarg_t *target)
939 {
940 size_t malloc_len = len;
941 xfs_buf_t *bp;
942 void *data;
943 int error;
944
945 bp = pagebuf_allocate(0);
946 if (unlikely(bp == NULL))
947 goto fail;
948 _pagebuf_initialize(bp, target, 0, len, PBF_FORCEIO);
949
950 try_again:
951 data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL);
952 if (unlikely(data == NULL))
953 goto fail_free_buf;
954
955 /* check whether alignment matches.. */
956 if ((__psunsigned_t)data !=
957 ((__psunsigned_t)data & ~target->pbr_smask)) {
958 /* .. else double the size and try again */
959 kmem_free(data, malloc_len);
960 malloc_len <<= 1;
961 goto try_again;
962 }
963
964 error = pagebuf_associate_memory(bp, data, len);
965 if (error)
966 goto fail_free_mem;
967 bp->pb_flags |= _PBF_KMEM_ALLOC;
968
969 pagebuf_unlock(bp);
970
971 PB_TRACE(bp, "no_daddr", data);
972 return bp;
973 fail_free_mem:
974 kmem_free(data, malloc_len);
975 fail_free_buf:
976 pagebuf_free(bp);
977 fail:
978 return NULL;
979 }
980
981 /*
982 * pagebuf_hold
983 *
984 * Increment reference count on buffer, to hold the buffer concurrently
985 * with another thread which may release (free) the buffer asynchronously.
986 *
987 * Must hold the buffer already to call this function.
988 */
989 void
990 pagebuf_hold(
991 xfs_buf_t *pb)
992 {
993 atomic_inc(&pb->pb_hold);
994 PB_TRACE(pb, "hold", 0);
995 }
996
997 /*
998 * pagebuf_rele
999 *
1000 * pagebuf_rele releases a hold on the specified buffer.  If the
1001 * hold count is 1, pagebuf_rele calls pagebuf_free.
1002 */
1003 void
1004 pagebuf_rele(
1005 xfs_buf_t *pb)
1006 {
1007 pb_hash_t *hash = pb_hash(pb);
1008
1009 PB_TRACE(pb, "rele", pb->pb_relse);
1010
1011 if (atomic_dec_and_lock(&pb->pb_hold, &hash->pb_hash_lock)) {
1012 int do_free = 1;
1013
1014 if (pb->pb_relse) {
1015 atomic_inc(&pb->pb_hold);
1016 spin_unlock(&hash->pb_hash_lock);
1017 (*(pb->pb_relse)) (pb);
1018 spin_lock(&hash->pb_hash_lock);
1019 do_free = 0;
1020 }
1021
1022 if (pb->pb_flags & PBF_DELWRI) {
1023 pb->pb_flags |= PBF_ASYNC;
1024 atomic_inc(&pb->pb_hold);
1025 pagebuf_delwri_queue(pb, 0);
1026 do_free = 0;
1027 } else if (pb->pb_flags & PBF_FS_MANAGED) {
1028 do_free = 0;
1029 }
1030
1031 if (do_free) {
1032 list_del_init(&pb->pb_hash_list);
1033 spin_unlock(&hash->pb_hash_lock);
1034 xfs_buf_free(pb);
1035 } else {
1036 spin_unlock(&hash->pb_hash_lock);
1037 }
1038 }
1039 }
1040
1041
1042 /*
1043 * Mutual exclusion on buffers. Locking model:
1044 *
1045 * Buffers associated with inodes for which buffer locking
1046 * is not enabled are not protected by semaphores, and are
1047 * assumed to be exclusively owned by the caller. There is a
1048 * spinlock in the buffer, used by the caller when concurrent
1049 * access is possible.
1050 */
1051
1052 /*
1053 * pagebuf_cond_lock
1054 *
1055 * pagebuf_cond_lock locks a buffer object, if it is not already locked.
1056 * Note that this in no way
1057 * locks the underlying pages, so it is only useful for synchronizing
1058 * concurrent use of page buffer objects, not for synchronizing independent
1059 * access to the underlying pages.
1060 */
1061 int
1062 pagebuf_cond_lock(			/* lock buffer, if not locked	*/
1063 					/* (returns -EBUSY if locked)	*/
1064 xfs_buf_t *pb)
1065 {
1066 int locked;
1067
1068 locked = down_trylock(&pb->pb_sema) == 0;
1069 if (locked) {
1070 PB_SET_OWNER(pb);
1071 }
1072 PB_TRACE(pb, "cond_lock", (long)locked);
1073 return(locked ? 0 : -EBUSY);
1074 }
1075
1076 #if defined(DEBUG) || defined(XFS_BLI_TRACE)
1077 /*
1078 * pagebuf_lock_value
1079 *
1080 * Return lock value for a pagebuf
1081 */
1082 int
1083 pagebuf_lock_value(
1084 xfs_buf_t *pb)
1085 {
1086 return(atomic_read(&pb->pb_sema.count));
1087 }
1088 #endif
1089
1090 /*
1091 * pagebuf_lock
1092 *
1093 * pagebuf_lock locks a buffer object. Note that this in no way
1094 * locks the underlying pages, so it is only useful for synchronizing
1095 * concurrent use of page buffer objects, not for synchronizing independent
1096 * access to the underlying pages.
1097 */
1098 int
1099 pagebuf_lock(
1100 xfs_buf_t *pb)
1101 {
1102 PB_TRACE(pb, "lock", 0);
1103 if (atomic_read(&pb->pb_io_remaining))
1104 run_task_queue(&tq_disk);
1105 down(&pb->pb_sema);
1106 PB_SET_OWNER(pb);
1107 PB_TRACE(pb, "locked", 0);
1108 return 0;
1109 }
1110
1111 /*
1112 * pagebuf_unlock
1113 *
1114 * pagebuf_unlock releases the lock on the buffer object created by
1115 * pagebuf_lock or pagebuf_cond_lock (not any
1116 * pinning of underlying pages created by pagebuf_pin).
1117 */
1118 void
1119 pagebuf_unlock(			/* unlock buffer		*/
1120 xfs_buf_t *pb) /* buffer to unlock */
1121 {
1122 PB_CLEAR_OWNER(pb);
1123 up(&pb->pb_sema);
1124 PB_TRACE(pb, "unlock", 0);
1125 }
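
/*
 * Editor's sketch (illustrative only): a typical caller first tries the
 * lock without sleeping and falls back to a blocking acquire, using only
 * the routines above (can_sleep is a placeholder):
 *
 *	if (pagebuf_cond_lock(bp)) {		// -EBUSY, someone holds it
 *		if (!can_sleep)
 *			return;
 *		pagebuf_lock(bp);
 *	}
 *	...
 *	pagebuf_unlock(bp);
 */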
1126
1127
1128 /*
1129 * Pinning Buffer Storage in Memory
1130 */
1131
1132 /*
1133 * pagebuf_pin
1134 *
1135 * pagebuf_pin locks all of the memory represented by a buffer in
1136 * memory. Multiple calls to pagebuf_pin and pagebuf_unpin, for
1137 * the same or different buffers affecting a given page, will
1138 * properly count the number of outstanding "pin" requests. The
1139 * buffer may be released after the pagebuf_pin and a different
1140 * buffer used when calling pagebuf_unpin, if desired.
1141 * pagebuf_pin should be used by the file system when it wants to be
1142 * assured that no attempt will be made to force the affected
1143 * memory to disk. It does not assure that a given logical page
1144 * will not be moved to a different physical page.
1145 */
1146 void
1147 pagebuf_pin(
1148 xfs_buf_t *pb)
1149 {
1150 atomic_inc(&pb->pb_pin_count);
1151 PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter);
1152 }
1153
1154 /*
1155 * pagebuf_unpin
1156 *
1157 * pagebuf_unpin reverses the locking of memory performed by
1158 * pagebuf_pin.  Note that both functions affect the logical
1159 * pages associated with the buffer, not the buffer itself.
1160 */
1161 void
1162 pagebuf_unpin(
1163 xfs_buf_t *pb)
1164 {
1165 if (atomic_dec_and_test(&pb->pb_pin_count)) {
1166 wake_up_all(&pb->pb_waiters);
1167 }
1168 PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter);
1169 }
1170
1171 int
1172 pagebuf_ispin(
1173 xfs_buf_t *pb)
1174 {
1175 return atomic_read(&pb->pb_pin_count);
1176 }
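
/*
 * Editor's sketch (illustrative): pins bracket the window in which the
 * buffer contents must not be forced to disk, and flushers consult
 * pagebuf_ispin() (as pagebuf_daemon() does below) to skip pinned buffers:
 *
 *	pagebuf_pin(bp);
 *	...				// writeback of the pages is unsafe here
 *	pagebuf_unpin(bp);
 *
 *	if (!pagebuf_ispin(bp) && !pagebuf_cond_lock(bp))
 *		...			// safe to write the buffer back
 */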
1177
1178 /*
1179 * pagebuf_wait_unpin
1180 *
1181 * pagebuf_wait_unpin waits until all of the memory associated
1182 * with the buffer is no longer locked in memory.  It returns
1183 * immediately if none of the affected pages are locked.
1184 */
1185 static inline void
1186 _pagebuf_wait_unpin(
1187 xfs_buf_t *pb)
1188 {
1189 DECLARE_WAITQUEUE (wait, current);
1190
1191 if (atomic_read(&pb->pb_pin_count) == 0)
1192 return;
1193
1194 add_wait_queue(&pb->pb_waiters, &wait);
1195 for (;;) {
1196 set_current_state(TASK_UNINTERRUPTIBLE);
1197 if (atomic_read(&pb->pb_pin_count) == 0)
1198 break;
1199 if (atomic_read(&pb->pb_io_remaining))
1200 run_task_queue(&tq_disk);
1201 schedule();
1202 }
1203 remove_wait_queue(&pb->pb_waiters, &wait);
1204 set_current_state(TASK_RUNNING);
1205 }
1206
1207
1208 /*
1209 * Buffer Utility Routines
1210 */
1211
1212 /*
1213 * pagebuf_iodone
1214 *
1215 * pagebuf_iodone marks a buffer for which I/O is in progress
1216 * done with respect to that I/O. The pb_iodone routine, if
1217 * present, will be called as a side-effect.
1218 */
1219 void
1220 pagebuf_iodone_sched(
1221 void *v)
1222 {
1223 xfs_buf_t *bp = (xfs_buf_t *)v;
1224
1225 if (bp->pb_iodone)
1226 (*(bp->pb_iodone))(bp);
1227 else if (bp->pb_flags & PBF_ASYNC)
1228 xfs_buf_relse(bp);
1229 }
1230
1231 void
1232 pagebuf_iodone(
1233 xfs_buf_t *pb,
1234 int dataio,
1235 int schedule)
1236 {
1237 pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
1238 if (pb->pb_error == 0) {
1239 pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE);
1240 }
1241
1242 PB_TRACE(pb, "iodone", pb->pb_iodone);
1243
1244 if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) {
1245 if (schedule) {
1246 int daemon = CPU_TO_DAEMON(smp_processor_id());
1247
1248 INIT_TQUEUE(&pb->pb_iodone_sched,
1249 pagebuf_iodone_sched, (void *)pb);
1250 queue_task(&pb->pb_iodone_sched, dataio ?
1251 &pagebuf_dataiodone_tq[daemon] :
1252 &pagebuf_logiodone_tq[daemon]);
1253 wake_up(dataio ?
1254 &pagebuf_dataiodone_wait[daemon] :
1255 &pagebuf_logiodone_wait[daemon]);
1256 } else {
1257 pagebuf_iodone_sched(pb);
1258 }
1259 } else {
1260 up(&pb->pb_iodonesema);
1261 }
1262 }
1263
1264 /*
1265 * pagebuf_ioerror
1266 *
1267 * pagebuf_ioerror sets the error code for a buffer.
1268 */
1269 void
1270 pagebuf_ioerror(			/* mark/clear buffer error flag */
1271 xfs_buf_t *pb, /* buffer to mark */
1272 int error) /* error to store (0 if none) */
1273 {
1274 ASSERT(error >= 0 && error <= 0xffff);
1275 pb->pb_error = (unsigned short)error;
1276 PB_TRACE(pb, "ioerror", (unsigned long)error);
1277 }
1278
1279 /*
1280 * pagebuf_iostart
1281 *
1282 * pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
1283 * If necessary, it will arrange for any disk space allocation required,
1284 * and it will break up the request if the block mappings require it.
1285 * The pb_iodone routine in the buffer supplied will only be called
1286 * when all of the subsidiary I/O requests, if any, have been completed.
1287 * pagebuf_iostart calls the pagebuf_ioinitiate routine or
1288 * pagebuf_iorequest, if the former routine is not defined, to start
1289 * the I/O on a given low-level request.
1290 */
1291 int
1292 pagebuf_iostart(			/* start I/O on a buffer	*/
1293 xfs_buf_t *pb, /* buffer to start */
1294 page_buf_flags_t flags) /* PBF_LOCK, PBF_ASYNC, PBF_READ, */
1295 /* PBF_WRITE, PBF_DELWRI, */
1296 /* PBF_DONT_BLOCK */
1297 {
1298 int status = 0;
1299
1300 PB_TRACE(pb, "iostart", (unsigned long)flags);
1301
1302 if (flags & PBF_DELWRI) {
1303 pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC);
1304 pb->pb_flags |= flags & (PBF_DELWRI | PBF_ASYNC);
1305 pagebuf_delwri_queue(pb, 1);
1306 return status;
1307 }
1308
1309 pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | PBF_DELWRI | \
1310 PBF_READ_AHEAD | _PBF_RUN_QUEUES);
1311 pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \
1312 PBF_READ_AHEAD | _PBF_RUN_QUEUES);
1313
1314 BUG_ON(pb->pb_bn == XFS_BUF_DADDR_NULL);
1315
1316 /* For writes allow an alternate strategy routine to precede
1317 * the actual I/O request (which may not be issued at all in
1318 * a shutdown situation, for example).
1319 */
1320 status = (flags & PBF_WRITE) ?
1321 pagebuf_iostrategy(pb) : pagebuf_iorequest(pb);
1322
1323 /* Wait for I/O if we are not an async request.
1324 * Note: async I/O request completion will release the buffer,
1325 * and that can already be done by this point. So using the
1326 * buffer pointer from here on, after async I/O, is invalid.
1327 */
1328 if (!status && !(flags & PBF_ASYNC))
1329 status = pagebuf_iowait(pb);
1330
1331 return status;
1332 }
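
/*
 * Editor's sketch (illustrative only): the same entry point covers
 * synchronous, asynchronous and delayed writes, selected purely by flags
 * (remember that an async buffer may already be released by the time the
 * call returns):
 *
 *	error = pagebuf_iostart(bp, PBF_WRITE);			// write and wait
 *	error = pagebuf_iostart(bp, PBF_WRITE | PBF_ASYNC);	// fire and forget
 *	error = pagebuf_iostart(bp, PBF_DELWRI | PBF_ASYNC);	// queue for later
 */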
1333
1334
1335 /*
1336 * Helper routines for pagebuf_iorequest (pagebuf I/O completion)
1337 */
1338
1339 STATIC __inline__ int
1340 _pagebuf_iolocked(
1341 xfs_buf_t *pb)
1342 {
1343 ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE));
1344 if (pb->pb_target->pbr_bsize < PAGE_CACHE_SIZE)
1345 return pb->pb_locked;
1346 if (pb->pb_flags & PBF_READ)
1347 return pb->pb_locked;
1348 return (pb->pb_flags & _PBF_PAGE_CACHE);
1349 }
1350
1351 STATIC void
1352 _pagebuf_iodone(
1353 xfs_buf_t *pb,
1354 int schedule)
1355 {
1356 int i;
1357
1358 if (atomic_dec_and_test(&pb->pb_io_remaining) != 1)
1359 return;
1360
1361 if (_pagebuf_iolocked(pb))
1362 for (i = 0; i < pb->pb_page_count; i++)
1363 unlock_page(pb->pb_pages[i]);
1364 pb->pb_locked = 0;
1365 pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), schedule);
1366 }
1367
1368 STATIC void
1369 _end_io_pagebuf(
1370 struct buffer_head *bh,
1371 int uptodate,
1372 int fullpage)
1373 {
1374 struct page *page = bh->b_page;
1375 xfs_buf_t *pb = (xfs_buf_t *)bh->b_private;
1376
1377 mark_buffer_uptodate(bh, uptodate);
1378 put_bh(bh);
1379
1380 if (!uptodate) {
1381 SetPageError(page);
1382 pb->pb_error = EIO;
1383 }
1384
1385 if (fullpage) {
1386 unlock_buffer(bh);
1387 _pagebuf_free_bh(bh);
1388 if (!PageError(page))
1389 SetPageUptodate(page);
1390 } else {
1391 static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
1392 struct buffer_head *bp;
1393 unsigned long flags;
1394
1395 ASSERT(PageLocked(page));
1396 spin_lock_irqsave(&page_uptodate_lock, flags);
1397 clear_buffer_async(bh);
1398 unlock_buffer(bh);
1399 for (bp = bh->b_this_page; bp != bh; bp = bp->b_this_page) {
1400 if (buffer_locked(bp)) {
1401 if (buffer_async(bp))
1402 break;
1403 } else if (!buffer_uptodate(bp))
1404 break;
1405 }
1406 spin_unlock_irqrestore(&page_uptodate_lock, flags);
1407 if (bp == bh && !PageError(page))
1408 SetPageUptodate(page);
1409 }
1410
1411 _pagebuf_iodone(pb, 1);
1412 }
1413
1414 STATIC void
1415 _pagebuf_end_io_complete_pages(
1416 struct buffer_head *bh,
1417 int uptodate)
1418 {
1419 _end_io_pagebuf(bh, uptodate, 1);
1420 }
1421
1422 STATIC void
1423 _pagebuf_end_io_partial_pages(
1424 struct buffer_head *bh,
1425 int uptodate)
1426 {
1427 _end_io_pagebuf(bh, uptodate, 0);
1428 }
1429
1430 /*
1431 * Handling of buftargs.
1432 */
1433
1434 /*
1435 * Wait for any bufs with callbacks that have been submitted but
1436 * have not yet returned... walk the hash list for the target.
1437 */
1438 void
1439 xfs_wait_buftarg(
1440 xfs_buftarg_t *target)
1441 {
1442 xfs_buf_t *pb, *n;
1443 pb_hash_t *h;
1444 int i;
1445
1446 for (i = 0; i < NHASH; i++) {
1447 h = &pbhash[i];
1448 again:
1449 spin_lock(&h->pb_hash_lock);
1450 list_for_each_entry_safe(pb, n, &h->pb_hash, pb_hash_list) {
1451 if (pb->pb_target == target &&
1452 !(pb->pb_flags & PBF_FS_MANAGED)) {
1453 spin_unlock(&h->pb_hash_lock);
1454 delay(100);
1455 goto again;
1456 }
1457 }
1458 spin_unlock(&h->pb_hash_lock);
1459 }
1460 }
1461
1462 void
1463 xfs_free_buftarg(
1464 xfs_buftarg_t *btp,
1465 int external)
1466 {
1467 xfs_flush_buftarg(btp, 1);
1468 if (external)
1469 xfs_blkdev_put(btp->pbr_bdev);
1470 iput(btp->pbr_mapping->host);
1471 kmem_free(btp, sizeof(*btp));
1472 }
1473
1474 void
1475 xfs_incore_relse(
1476 xfs_buftarg_t *btp,
1477 int delwri_only,
1478 int wait)
1479 {
1480 destroy_buffers(btp->pbr_kdev);
1481 truncate_inode_pages(btp->pbr_mapping, 0LL);
1482 }
1483
1484 int
1485 xfs_setsize_buftarg(
1486 xfs_buftarg_t *btp,
1487 unsigned int blocksize,
1488 unsigned int sectorsize)
1489 {
1490 btp->pbr_bsize = blocksize;
1491 btp->pbr_sshift = ffs(sectorsize) - 1;
1492 btp->pbr_smask = sectorsize - 1;
1493
1494 if (set_blocksize(btp->pbr_kdev, sectorsize)) {
1495 printk(KERN_WARNING
1496 "XFS: Cannot set_blocksize to %u on device 0x%x\n",
1497 sectorsize, kdev_t_to_nr(btp->pbr_kdev));
1498 return EINVAL;
1499 }
1500 return 0;
1501 }
1502
1503 STATIC int
1504 xfs_mapping_buftarg(
1505 xfs_buftarg_t *btp,
1506 struct block_device *bdev)
1507 {
1508 kdev_t kdev;
1509 struct inode *inode;
1510 struct address_space *mapping;
1511 static struct address_space_operations mapping_aops = {
1512 .sync_page = block_sync_page,
1513 };
1514
1515 kdev = to_kdev_t(bdev->bd_dev);
1516 inode = new_inode(bdev->bd_inode->i_sb);
1517 if (!inode) {
1518 printk(KERN_WARNING
1519 "XFS: Cannot allocate mapping inode for device %s\n",
1520 XFS_BUFTARG_NAME(btp));
1521 return ENOMEM;
1522 }
1523 inode->i_mode = S_IFBLK;
1524 inode->i_dev = kdev;
1525 inode->i_rdev = kdev;
1526 inode->i_bdev = bdev;
1527 mapping = &inode->i_data;
1528 mapping->a_ops = &mapping_aops;
1529 mapping->gfp_mask = GFP_KERNEL;
1530 btp->pbr_mapping = mapping;
1531 return 0;
1532 }
1533
1534 xfs_buftarg_t *
1535 xfs_alloc_buftarg(
1536 struct block_device *bdev)
1537 {
1538 xfs_buftarg_t *btp;
1539 kdev_t kdev;
1540
1541 btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1542
1543 kdev = to_kdev_t(bdev->bd_dev);
1544 btp->pbr_dev = bdev->bd_dev;
1545 btp->pbr_kdev = kdev;
1546 btp->pbr_bdev = bdev;
1547 switch (MAJOR(btp->pbr_dev)) {
1548 case MD_MAJOR:
1549 case EVMS_MAJOR:
1550 btp->pbr_flags = PBR_ALIGNED_ONLY;
1551 break;
1552 case LOOP_MAJOR:
1553 case LVM_BLK_MAJOR:
1554 btp->pbr_flags = PBR_SECTOR_ONLY;
1555 break;
1556 }
1557 if (xfs_setsize_buftarg(btp, PAGE_CACHE_SIZE, get_hardsect_size(kdev)))
1558 goto error;
1559 if (xfs_mapping_buftarg(btp, bdev))
1560 goto error;
1561 return btp;
1562
1563 error:
1564 kmem_free(btp, sizeof(*btp));
1565 return NULL;
1566 }
1567
1568 /*
1569 * Initiate I/O on part of a page we are interested in
1570 */
1571 STATIC int
1572 _pagebuf_page_io(
1573 struct page *page, /* Page structure we are dealing with */
1574 xfs_buftarg_t *pbr, /* device parameters (bsz, ssz, dev) */
1575 xfs_buf_t *pb, /* pagebuf holding it, can be NULL */
1576 xfs_daddr_t bn, /* starting block number */
1577 size_t pg_offset, /* starting offset in page */
1578 size_t pg_length, /* count of data to process */
1579 int rw, /* read/write operation */
1580 int flush)
1581 {
1582 size_t sector;
1583 size_t blk_length = 0;
1584 struct buffer_head *bh, *head, *bufferlist[MAX_BUF_PER_PAGE];
1585 int sector_shift = pbr->pbr_sshift;
1586 int i = 0, cnt = 0;
1587 int public_bh = 0;
1588 int multi_ok;
1589
1590 if ((pbr->pbr_bsize < PAGE_CACHE_SIZE) &&
1591 !(pb->pb_flags & _PBF_PRIVATE_BH)) {
1592 int cache_ok;
1593
1594 cache_ok = !((pb->pb_flags & PBF_FORCEIO) || (rw == WRITE));
1595 public_bh = multi_ok = 1;
1596 sector = 1 << sector_shift;
1597
1598 ASSERT(PageLocked(page));
1599 if (!page_has_buffers(page))
1600 create_empty_buffers(page, pbr->pbr_kdev, sector);
1601
1602 i = sector >> BBSHIFT;
1603 bn -= (pg_offset >> BBSHIFT);
1604
1605 /* Find buffer_heads belonging to just this pagebuf */
1606 bh = head = page_buffers(page);
1607 do {
1608 if (buffer_uptodate(bh) && cache_ok)
1609 continue;
1610 if (blk_length < pg_offset)
1611 continue;
1612 if (blk_length >= pg_offset + pg_length)
1613 break;
1614
1615 lock_buffer(bh);
1616 get_bh(bh);
1617 bh->b_size = sector;
1618 bh->b_blocknr = bn;
1619 bufferlist[cnt++] = bh;
1620
1621 } while ((bn += i),
1622 (blk_length += sector),
1623 (bh = bh->b_this_page) != head);
1624
1625 goto request;
1626 }
1627
1628 /* Calculate the block offsets and length we will be using */
1629 if (pg_offset) {
1630 size_t block_offset;
1631
1632 block_offset = pg_offset >> sector_shift;
1633 block_offset = pg_offset - (block_offset << sector_shift);
1634 blk_length = (pg_length + block_offset + pbr->pbr_smask) >>
1635 sector_shift;
1636 } else {
1637 blk_length = (pg_length + pbr->pbr_smask) >> sector_shift;
1638 }
1639
1640 /* This will attempt to make a request bigger than the sector
1641 * size if we are well aligned.
1642 */
1643 switch (pb->pb_target->pbr_flags) {
1644 case 0:
1645 sector = blk_length << sector_shift;
1646 blk_length = 1;
1647 break;
1648 case PBR_ALIGNED_ONLY:
1649 if ((pg_offset == 0) && (pg_length == PAGE_CACHE_SIZE) &&
1650 (((unsigned int) bn) & BN_ALIGN_MASK) == 0) {
1651 sector = blk_length << sector_shift;
1652 blk_length = 1;
1653 break;
1654 }
1655 case PBR_SECTOR_ONLY:
1656 /* Fallthrough, same as default */
1657 default:
1658 sector = 1 << sector_shift;
1659 }
1660
1661 /* If we are doing I/O larger than the bh->b_size field can hold,
1662 * then we need to split this request up.
1663 */
1664 while (sector > ((1ULL << NBBY * sizeof(bh->b_size)) - 1)) {
1665 sector >>= 1;
1666 blk_length++;
1667 }
1668
1669 multi_ok = (blk_length != 1);
1670 i = sector >> BBSHIFT;
1671
1672 for (; blk_length > 0; bn += i, blk_length--, pg_offset += sector) {
1673 bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS);
1674 if (!bh)
1675 bh = _pagebuf_get_prealloc_bh();
1676 memset(bh, 0, sizeof(*bh));
1677 bh->b_blocknr = bn;
1678 bh->b_size = sector;
1679 bh->b_dev = pbr->pbr_kdev;
1680 set_buffer_locked(bh);
1681 set_bh_page(bh, page, pg_offset);
1682 init_waitqueue_head(&bh->b_wait);
1683 atomic_set(&bh->b_count, 1);
1684 bufferlist[cnt++] = bh;
1685 }
1686
1687 request:
1688 if (cnt) {
1689 void (*callback)(struct buffer_head *, int);
1690
1691 callback = (multi_ok && public_bh) ?
1692 _pagebuf_end_io_partial_pages :
1693 _pagebuf_end_io_complete_pages;
1694
1695 /* Account for additional buffers in progress */
1696 atomic_add(cnt, &pb->pb_io_remaining);
1697
1698 #ifdef RQ_WRITE_ORDERED
1699 if (flush)
1700 set_bit(BH_Ordered_Flush, &bufferlist[cnt-1]->b_state);
1701 #endif
1702
1703 for (i = 0; i < cnt; i++) {
1704 bh = bufferlist[i];
1705 init_buffer(bh, callback, pb);
1706 bh->b_rdev = bh->b_dev;
1707 bh->b_rsector = bh->b_blocknr;
1708 set_buffer_mapped(bh);
1709 set_buffer_async(bh);
1710 set_buffer_req(bh);
1711 if (rw == WRITE)
1712 set_buffer_uptodate(bh);
1713 generic_make_request(rw, bh);
1714 }
1715 return 0;
1716 }
1717
1718 /*
1719 * We have no I/O to submit, let the caller know that
1720 * we have skipped over this page entirely.
1721 */
1722 return 1;
1723 }
1724
1725 STATIC void
1726 _pagebuf_page_apply(
1727 xfs_buf_t *pb,
1728 loff_t offset,
1729 struct page *page,
1730 size_t pg_offset,
1731 size_t pg_length,
1732 int last)
1733 {
1734 xfs_daddr_t bn = pb->pb_bn;
1735 xfs_buftarg_t *pbr = pb->pb_target;
1736 loff_t pb_offset;
1737 int status, locking;
1738
1739 ASSERT(page);
1740 ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE));
1741
1742 if ((pbr->pbr_bsize == PAGE_CACHE_SIZE) &&
1743 (pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
1744 (pb->pb_flags & PBF_READ) && pb->pb_locked) {
1745 bn -= (pb->pb_offset >> BBSHIFT);
1746 pg_offset = 0;
1747 pg_length = PAGE_CACHE_SIZE;
1748 } else {
1749 pb_offset = offset - pb->pb_file_offset;
1750 if (pb_offset) {
1751 bn += (pb_offset + BBMASK) >> BBSHIFT;
1752 }
1753 }
1754
1755 locking = _pagebuf_iolocked(pb);
1756 if (pb->pb_flags & PBF_WRITE) {
1757 if (locking && !pb->pb_locked)
1758 lock_page(page);
1759 status = _pagebuf_page_io(page, pbr, pb, bn,
1760 pg_offset, pg_length, WRITE,
1761 last && (pb->pb_flags & PBF_FLUSH));
1762 } else {
1763 status = _pagebuf_page_io(page, pbr, pb, bn,
1764 pg_offset, pg_length, READ, 0);
1765 }
1766 if (status && locking && !(pb->pb_target->pbr_bsize < PAGE_CACHE_SIZE))
1767 unlock_page(page);
1768 }
1769
1770 /*
1771 * pagebuf_iorequest -- the core I/O request routine.
1772 */
1773 int
1774 pagebuf_iorequest(			/* start real I/O		*/
1775 xfs_buf_t *pb) /* buffer to convey to device */
1776 {
1777 PB_TRACE(pb, "iorequest", 0);
1778
1779 if (pb->pb_flags & PBF_DELWRI) {
1780 pagebuf_delwri_queue(pb, 1);
1781 return 0;
1782 }
1783
1784 if (pb->pb_flags & PBF_WRITE) {
1785 _pagebuf_wait_unpin(pb);
1786 }
1787
1788 pagebuf_hold(pb);
1789
1790 /* Set the count to 1 initially; this will stop an I/O
1791 * completion callout which happens before we have started
1792 * all the I/O from calling pagebuf_iodone too early.
1793 */
1794 atomic_set(&pb->pb_io_remaining, 1);
1795 _pagebuf_ioapply(pb);
1796 _pagebuf_iodone(pb, 0);
1797
1798 pagebuf_rele(pb);
1799 return 0;
1800 }
1801
1802 /*
1803 * pagebuf_iowait
1804 *
1805 * pagebuf_iowait waits for I/O to complete on the buffer supplied.
1806 * It returns immediately if no I/O is pending. In any case, it returns
1807 * the error code, if any, or 0 if there is no error.
1808 */
1809 int
1810 pagebuf_iowait(
1811 xfs_buf_t *pb)
1812 {
1813 PB_TRACE(pb, "iowait", 0);
1814 if (atomic_read(&pb->pb_io_remaining))
1815 run_task_queue(&tq_disk);
1816 if ((pb->pb_flags & PBF_FS_DATAIOD))
1817 pagebuf_runall_queues(pagebuf_dataiodone_tq);
1818 down(&pb->pb_iodonesema);
1819 PB_TRACE(pb, "iowaited", (long)pb->pb_error);
1820 return pb->pb_error;
1821 }
1822
1823 caddr_t
1824 pagebuf_offset(
1825 xfs_buf_t *pb,
1826 size_t offset)
1827 {
1828 struct page *page;
1829
1830 offset += pb->pb_offset;
1831
1832 page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
1833 return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
1834 }
1835
1836 /*
1837 * pagebuf_iomove
1838 *
1839 * Move data into or out of a buffer.
1840 */
1841 void
1842 pagebuf_iomove(
1843 xfs_buf_t *pb, /* buffer to process */
1844 size_t boff, /* starting buffer offset */
1845 size_t bsize, /* length to copy */
1846 caddr_t data, /* data address */
1847 page_buf_rw_t mode) /* read/write flag */
1848 {
1849 size_t bend, cpoff, csize;
1850 struct page *page;
1851
1852 bend = boff + bsize;
1853 while (boff < bend) {
1854 page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)];
1855 cpoff = page_buf_poff(boff + pb->pb_offset);
1856 csize = min_t(size_t,
1857 PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff);
1858
1859 ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
1860
1861 switch (mode) {
1862 case PBRW_ZERO:
1863 memset(page_address(page) + cpoff, 0, csize);
1864 break;
1865 case PBRW_READ:
1866 memcpy(data, page_address(page) + cpoff, csize);
1867 break;
1868 case PBRW_WRITE:
1869 memcpy(page_address(page) + cpoff, data, csize);
1870 }
1871
1872 boff += csize;
1873 data += csize;
1874 }
1875 }
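
/*
 * Editor's sketch (illustrative): pagebuf_iomove() is the safe way to copy
 * into or out of a buffer that may span several pages and therefore may not
 * be virtually contiguous (hdr and len are placeholders):
 *
 *	pagebuf_iomove(bp, 0, sizeof(*hdr), (caddr_t)hdr, PBRW_WRITE);	// copy in
 *	pagebuf_iomove(bp, 0, sizeof(*hdr), (caddr_t)hdr, PBRW_READ);	// copy out
 *	pagebuf_iomove(bp, 0, len, NULL, PBRW_ZERO);			// zero a range
 */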
1876
1877 /*
1878 * _pagebuf_ioapply
1879 *
1880 * Applies _pagebuf_page_apply to each page of the xfs_buf_t.
1881 */
1882 STATIC void
1883 _pagebuf_ioapply(			/* apply function to pages	*/
1884 xfs_buf_t *pb) /* buffer to examine */
1885 {
1886 int index;
1887 loff_t buffer_offset = pb->pb_file_offset;
1888 size_t buffer_len = pb->pb_count_desired;
1889 size_t page_offset, len;
1890 size_t cur_offset, cur_len;
1891
1892 cur_offset = pb->pb_offset;
1893 cur_len = buffer_len;
1894
1895 if (!pb->pb_locked && !(pb->pb_flags & PBF_DIRECTIO) &&
1896 (pb->pb_target->pbr_bsize < PAGE_CACHE_SIZE)) {
1897 for (index = 0; index < pb->pb_page_count; index++)
1898 lock_page(pb->pb_pages[index]);
1899 pb->pb_locked = 1;
1900 }
1901
1902 for (index = 0; index < pb->pb_page_count; index++) {
1903 if (cur_len == 0)
1904 break;
1905 if (cur_offset >= PAGE_CACHE_SIZE) {
1906 cur_offset -= PAGE_CACHE_SIZE;
1907 continue;
1908 }
1909
1910 page_offset = cur_offset;
1911 cur_offset = 0;
1912
1913 len = PAGE_CACHE_SIZE - page_offset;
1914 if (len > cur_len)
1915 len = cur_len;
1916 cur_len -= len;
1917
1918 _pagebuf_page_apply(pb, buffer_offset,
1919 pb->pb_pages[index], page_offset, len,
1920 index + 1 == pb->pb_page_count);
1921 buffer_offset += len;
1922 buffer_len -= len;
1923 }
1924
1925 /*
1926 * Run the block device task queue here, while we have
1927 * a hold on the pagebuf (important to have that hold).
1928 */
1929 if (pb->pb_flags & _PBF_RUN_QUEUES) {
1930 pb->pb_flags &= ~_PBF_RUN_QUEUES;
1931 if (atomic_read(&pb->pb_io_remaining) > 1)
1932 run_task_queue(&tq_disk);
1933 }
1934 }
1935
1936
1937 /*
1938 * Delayed write buffer list handling
1939 */
1940
1941 STATIC LIST_HEAD(pbd_delwrite_queue);
1942 STATIC spinlock_t pbd_delwrite_lock = SPIN_LOCK_UNLOCKED;
1943
1944 STATIC void
1945 pagebuf_delwri_queue(
1946 xfs_buf_t *pb,
1947 int unlock)
1948 {
1949 PB_TRACE(pb, "delwri_q", (long)unlock);
1950 ASSERT(pb->pb_flags & PBF_DELWRI);
1951
1952 spin_lock(&pbd_delwrite_lock);
1953 /* If already in the queue, dequeue and place at tail */
1954 if (!list_empty(&pb->pb_list)) {
1955 if (unlock)
1956 atomic_dec(&pb->pb_hold);
1957 list_del(&pb->pb_list);
1958 }
1959
1960 list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
1961 pb->pb_queuetime = jiffies;
1962 spin_unlock(&pbd_delwrite_lock);
1963
1964 if (unlock)
1965 pagebuf_unlock(pb);
1966 }
1967
1968 void
1969 pagebuf_delwri_dequeue(
1970 xfs_buf_t *pb)
1971 {
1972 int dequeued = 0;
1973
1974 spin_lock(&pbd_delwrite_lock);
1975 if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) {
1976 list_del_init(&pb->pb_list);
1977 dequeued = 1;
1978 }
1979 pb->pb_flags &= ~PBF_DELWRI;
1980 spin_unlock(&pbd_delwrite_lock);
1981
1982 if (dequeued)
1983 pagebuf_rele(pb);
1984
1985 PB_TRACE(pb, "delwri_dq", (long)dequeued);
1986 }
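
/*
 * Editor's note (illustrative): the delayed-write queue above ties the rest
 * of the file together.  pagebuf_iostart(pb, PBF_DELWRI | PBF_ASYNC) and
 * pagebuf_rele() add buffers to the queue, pagebuf_daemon() below writes
 * them out once they have aged (or sooner under memory pressure), and a
 * caller can intervene directly:
 *
 *	pagebuf_delwri_dequeue(bp);		// cancel a pending delayed write
 *	error = xfs_flush_buftarg(target, 1);	// or push the whole queue out
 */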
1987
1988
1989 /*
1990 * The pagebuf iodone daemons
1991 */
1992
1993 STATIC int
1994 pagebuf_iodone_daemon(
1995 void *__bind_cpu,
1996 const char *name,
1997 int pagebuf_daemons[],
1998 struct list_head pagebuf_iodone_tq[],
1999 wait_queue_head_t pagebuf_iodone_wait[])
2000 {
2001 int bind_cpu, cpu;
2002 DECLARE_WAITQUEUE (wait, current);
2003
2004 bind_cpu = (int) (long)__bind_cpu;
2005 cpu = CPU_TO_DAEMON(cpu_logical_map(bind_cpu));
2006
2007 /* Set up the thread */
2008 daemonize();
2009
2010 /* Avoid signals */
2011 sigmask_lock();
2012 sigfillset(&current->blocked);
2013 __recalc_sigpending(current);
2014 sigmask_unlock();
2015
2016 /* Migrate to the right CPU */
2017 migrate_to_cpu(cpu);
2018 #ifdef __HAVE_NEW_SCHEDULER
2019 if (smp_processor_id() != cpu)
2020 BUG();
2021 #else
2022 while (smp_processor_id() != cpu)
2023 schedule();
2024 #endif
2025
2026 sprintf(current->comm, "%s/%d", name, bind_cpu);
2027 INIT_LIST_HEAD(&pagebuf_iodone_tq[cpu]);
2028 init_waitqueue_head(&pagebuf_iodone_wait[cpu]);
2029 __set_current_state(TASK_INTERRUPTIBLE);
2030 mb();
2031
2032 pagebuf_daemons[cpu] = 1;
2033
2034 for (;;) {
2035 add_wait_queue(&pagebuf_iodone_wait[cpu], &wait);
2036
2037 if (TQ_ACTIVE(pagebuf_iodone_tq[cpu]))
2038 __set_task_state(current, TASK_RUNNING);
2039 schedule();
2040 remove_wait_queue(&pagebuf_iodone_wait[cpu], &wait);
2041 run_task_queue(&pagebuf_iodone_tq[cpu]);
2042 if (pagebuf_daemons[cpu] == 0)
2043 break;
2044 __set_current_state(TASK_INTERRUPTIBLE);
2045 }
2046
2047 pagebuf_daemons[cpu] = -1;
2048 wake_up_interruptible(&pagebuf_iodone_wait[cpu]);
2049 return 0;
2050 }
2051
2052 STATIC void
2053 pagebuf_runall_queues(
2054 struct list_head pagebuf_iodone_tq[])
2055 {
2056 int pcpu, cpu;
2057
2058 for (cpu = 0; cpu < min(smp_num_cpus, MAX_IO_DAEMONS); cpu++) {
2059 pcpu = CPU_TO_DAEMON(cpu_logical_map(cpu));
2060
2061 run_task_queue(&pagebuf_iodone_tq[pcpu]);
2062 }
2063 }
2064
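/*
 * Thin wrappers so kernel_thread() can start the log and data I/O
 * completion daemons with their respective task queue and wait queue
 * arrays.
 */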
2065 STATIC int
2066 pagebuf_logiodone_daemon(
2067 void *__bind_cpu)
2068 {
2069 return pagebuf_iodone_daemon(__bind_cpu, "xfslogd", pb_logio_daemons,
2070 pagebuf_logiodone_tq, pagebuf_logiodone_wait);
2071 }
2072
2073 STATIC int
2074 pagebuf_dataiodone_daemon(
2075 void *__bind_cpu)
2076 {
2077 return pagebuf_iodone_daemon(__bind_cpu, "xfsdatad", pb_dataio_daemons,
2078 pagebuf_dataiodone_tq, pagebuf_dataiodone_wait);
2079 }
2080
2081
2082 /* Defines for pagebuf daemon */
2083 STATIC DECLARE_COMPLETION(pagebuf_daemon_done);
2084 STATIC struct task_struct *pagebuf_daemon_task;
2085 STATIC int pagebuf_daemon_active;
2086 STATIC int force_flush;
2087
2088
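/*
 * Memory shaker callback, registered in pagebuf_init(): under memory
 * pressure, force an immediate flush of the delayed write queue by
 * setting force_flush and waking the xfsbufd daemon.
 */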
2089 STATIC int
2090 pagebuf_daemon_wakeup(
2091 int priority,
2092 unsigned int mask)
2093 {
2094 force_flush = 1;
2095 barrier();
2096 wake_up_process(pagebuf_daemon_task);
2097 return 0;
2098 }
2099
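/*
 * The delayed write daemon (xfsbufd).  It wakes up every
 * xfs_buf_timer_centisecs, moves unpinned buffers that have been
 * queued for at least xfs_buf_age_centisecs (or all unpinned buffers,
 * when a flush has been forced) onto a private list, and issues their
 * writes outside the delayed write list lock.
 */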
2100 STATIC int
2101 pagebuf_daemon(
2102 void *data)
2103 {
2104 struct list_head tmp;
2105 unsigned long age;
2106 xfs_buf_t *pb, *n;
2107 int count;
2108
2109 /* Set up the thread */
2110 daemonize();
2111
2112 /* Mark it active */
2113 pagebuf_daemon_task = current;
2114 pagebuf_daemon_active = 1;
2115 barrier();
2116
2117 /* Avoid signals */
2118 sigmask_lock();
2119 	sigfillset(&current->blocked);
2120 __recalc_sigpending(current);
2121 sigmask_unlock();
2122
2123 strcpy(current->comm, "xfsbufd");
2124 current->flags |= PF_MEMALLOC;
2125
2126 INIT_LIST_HEAD(&tmp);
2127 do {
2128 set_current_state(TASK_INTERRUPTIBLE);
2129 schedule_timeout((xfs_buf_timer_centisecs * HZ) / 100);
2130
2131 count = 0;
2132 age = (xfs_buf_age_centisecs * HZ) / 100;
2133 spin_lock(&pbd_delwrite_lock);
2134 list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
2135 PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb));
2136 ASSERT(pb->pb_flags & PBF_DELWRI);
2137
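/*
 * New buffers are added to the tail of the queue, so it stays
 * ordered by queue time; once an unforced walk meets a buffer
 * that is still too young to write, the rest are younger still
 * and the walk can stop.
 */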
2138 if (!pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) {
2139 if (!force_flush &&
2140 time_before(jiffies,
2141 pb->pb_queuetime + age)) {
2142 pagebuf_unlock(pb);
2143 break;
2144 }
2145
2146 pb->pb_flags &= ~PBF_DELWRI;
2147 pb->pb_flags |= PBF_WRITE;
2148 list_move(&pb->pb_list, &tmp);
2149 count++;
2150 }
2151 }
2152 spin_unlock(&pbd_delwrite_lock);
2153
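/*
 * Issue the writes now that the delayed write list lock has been
 * dropped, taking each buffer off the private list first.
 */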
2154 while (!list_empty(&tmp)) {
2155 pb = list_entry(tmp.next, xfs_buf_t, pb_list);
2156 list_del_init(&pb->pb_list);
2157 pagebuf_iostrategy(pb);
2158 }
2159
2160 if (as_list_len > 0)
2161 purge_addresses();
2162 if (count)
2163 run_task_queue(&tq_disk);
2164
2165 force_flush = 0;
2166 } while (pagebuf_daemon_active);
2167
2168 complete_and_exit(&pagebuf_daemon_done, 0);
2169 }
2170
2171 /*
2172 * Write out all delayed write buffers belonging to the given device and
2173 * return the number of pinned buffers that could not be flushed.  This
2174 * is used in filesystem error handling to preserve metadata consistency.
2175 */
2176 int
2177 xfs_flush_buftarg(
2178 xfs_buftarg_t *target,
2179 int wait)
2180 {
2181 struct list_head tmp;
2182 xfs_buf_t *pb, *n;
2183 int pincount = 0;
2184 int flush_cnt = 0;
2185
2186 pagebuf_runall_queues(pagebuf_dataiodone_tq);
2187 pagebuf_runall_queues(pagebuf_logiodone_tq);
2188
2189 INIT_LIST_HEAD(&tmp);
2190 spin_lock(&pbd_delwrite_lock);
2191 list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
2192
2193 if (pb->pb_target != target)
2194 continue;
2195
2196 ASSERT(pb->pb_flags & PBF_DELWRI);
2197 PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
2198 if (pagebuf_ispin(pb)) {
2199 pincount++;
2200 continue;
2201 }
2202
2203 pb->pb_flags &= ~PBF_DELWRI;
2204 pb->pb_flags |= PBF_WRITE;
2205 list_move(&pb->pb_list, &tmp);
2206 }
2207 spin_unlock(&pbd_delwrite_lock);
2208
2209 /*
2210 * Dropped the delayed write list lock, now walk the temporary list
2211 */
2212 list_for_each_entry_safe(pb, n, &tmp, pb_list) {
2213
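/*
 * With "wait" set the buffer stays on the private list so that its
 * completion can be waited for below; otherwise it is dropped from
 * the list now and completes asynchronously.
 */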
2214 if (wait)
2215 pb->pb_flags &= ~PBF_ASYNC;
2216 else
2217 list_del_init(&pb->pb_list);
2218
2219 pagebuf_lock(pb);
2220 pagebuf_iostrategy(pb);
2221
2222 if (++flush_cnt > 32) {
2223 run_task_queue(&tq_disk);
2224 flush_cnt = 0;
2225 }
2226 }
2227
2228 run_task_queue(&tq_disk);
2229
2230 /*
2231 * Remaining list items must be flushed before returning
2232 */
2233 while (!list_empty(&tmp)) {
2234 pb = list_entry(tmp.next, xfs_buf_t, pb_list);
2235
2236 list_del_init(&pb->pb_list);
2237
2238 xfs_iowait(pb);
2239 xfs_buf_relse(pb);
2240 }
2241
2242 return pincount;
2243 }
2244
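/*
 * Start the delayed write daemon plus one log and one data I/O
 * completion daemon per CPU, waiting for each completion daemon to
 * report that it is running before moving on.
 */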
2245 STATIC int
2246 pagebuf_daemon_start(void)
2247 {
2248 int cpu, pcpu;
2249
2250 kernel_thread(pagebuf_daemon, NULL, CLONE_FS|CLONE_FILES|CLONE_VM);
2251
2252 for (cpu = 0; cpu < min(smp_num_cpus, MAX_IO_DAEMONS); cpu++) {
2253 pcpu = CPU_TO_DAEMON(cpu_logical_map(cpu));
2254
2255 if (kernel_thread(pagebuf_logiodone_daemon,
2256 (void *)(long) cpu,
2257 CLONE_FS|CLONE_FILES|CLONE_VM) < 0) {
2258 printk("pagebuf_logiodone daemon failed to start\n");
2259 } else {
2260 while (!pb_logio_daemons[pcpu])
2261 yield();
2262 }
2263 }
2264 for (cpu = 0; cpu < min(smp_num_cpus, MAX_IO_DAEMONS); cpu++) {
2265 pcpu = CPU_TO_DAEMON(cpu_logical_map(cpu));
2266
2267 if (kernel_thread(pagebuf_dataiodone_daemon,
2268 (void *)(long) cpu,
2269 CLONE_FS|CLONE_FILES|CLONE_VM) < 0) {
2270 printk("pagebuf_dataiodone daemon failed to start\n");
2271 } else {
2272 while (!pb_dataio_daemons[pcpu])
2273 yield();
2274 }
2275 }
2276 return 0;
2277 }
2278
2279 /*
2280 * pagebuf_daemon_stop
2281 *
2282 * Note: do not mark as __exit, it is called from pagebuf_terminate.
2283 */
2284 STATIC void
2285 pagebuf_daemon_stop(void)
2286 {
2287 int cpu, pcpu;
2288
2289 pagebuf_daemon_active = 0;
2290 barrier();
2291 wait_for_completion(&pagebuf_daemon_done);
2292
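/*
 * Ask each per-CPU iodone daemon to exit by clearing its running
 * flag and waking it, then wait for it to acknowledge by setting
 * the flag to -1.
 */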
2293 for (pcpu = 0; pcpu < min(smp_num_cpus, MAX_IO_DAEMONS); pcpu++) {
2294 cpu = CPU_TO_DAEMON(cpu_logical_map(pcpu));
2295
2296 pb_logio_daemons[cpu] = 0;
2297 wake_up(&pagebuf_logiodone_wait[cpu]);
2298 wait_event_interruptible(pagebuf_logiodone_wait[cpu],
2299 pb_logio_daemons[cpu] == -1);
2300
2301 pb_dataio_daemons[cpu] = 0;
2302 wake_up(&pagebuf_dataiodone_wait[cpu]);
2303 wait_event_interruptible(pagebuf_dataiodone_wait[cpu],
2304 pb_dataio_daemons[cpu] == -1);
2305 }
2306 }
2307
2308 /*
2309 * Initialization and Termination
2310 */
2311
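/*
 * Set up everything the pagebuf layer needs at initialization time:
 * the xfs_buf_t slab cache, the reserved buffer head pool, optional
 * tracing, the flush and I/O completion daemons, the memory shaker
 * hook, and the buffer hash table.
 */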
2312 int __init
2313 pagebuf_init(void)
2314 {
2315 int i;
2316
2317 pagebuf_cache = kmem_cache_create("xfs_buf_t", sizeof(xfs_buf_t), 0,
2318 SLAB_HWCACHE_ALIGN, NULL, NULL);
2319 if (pagebuf_cache == NULL) {
2320 printk("XFS: couldn't init xfs_buf_t cache\n");
2321 return -ENOMEM;
2322 }
2323
2324 if (_pagebuf_prealloc_bh(NR_RESERVED_BH) < NR_RESERVED_BH) {
2325 printk("XFS: couldn't allocate %d reserved buffers\n",
2326 NR_RESERVED_BH);
2327 kmem_zone_destroy(pagebuf_cache);
2328 return -ENOMEM;
2329 }
2330 init_waitqueue_head(&pb_resv_bh_wait);
2331
2332 #ifdef PAGEBUF_TRACE
2333 pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP);
2334 #endif
2335
2336 pagebuf_daemon_start();
2337
2338 pagebuf_shake = kmem_shake_register(pagebuf_daemon_wakeup);
2339 if (pagebuf_shake == NULL) {
2340 pagebuf_terminate();
2341 return -ENOMEM;
2342 }
2343
2344 for (i = 0; i < NHASH; i++) {
2345 spin_lock_init(&pbhash[i].pb_hash_lock);
2346 INIT_LIST_HEAD(&pbhash[i].pb_hash);
2347 }
2348
2349 return 0;
2350 }
2351
2352 /*
2353 * pagebuf_terminate.
2354 *
2355 * Note: do not mark as __exit, this is also called from the __init code.
2356 */
2357 void
2358 pagebuf_terminate(void)
2359 {
2360 pagebuf_daemon_stop();
2361
2362 #ifdef PAGEBUF_TRACE
2363 ktrace_free(pagebuf_trace_buf);
2364 #endif
2365
2366 kmem_zone_destroy(pagebuf_cache);
2367 kmem_shake_deregister(pagebuf_shake);
2368 }
2369