1 /*
2  * Copyright (c) 2000-2004 Silicon Graphics, Inc.  All Rights Reserved.
3  *
4  * This program is free software; you can redistribute it and/or modify it
5  * under the terms of version 2 of the GNU General Public License as
6  * published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it would be useful, but
9  * WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11  *
12  * Further, this software is distributed without any warranty that it is
13  * free of the rightful claim of any third person regarding infringement
14  * or the like.  Any license provided herein, whether implied or
15  * otherwise, applies only to this software file.  Patent licenses, if
16  * any, provided herein do not apply to combinations of this program with
17  * other software, or any other product whatsoever.
18  *
19  * You should have received a copy of the GNU General Public License along
20  * with this program; if not, write the Free Software Foundation, Inc., 59
21  * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22  *
23  * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24  * Mountain View, CA  94043, or:
25  *
26  * http://www.sgi.com
27  *
28  * For further information regarding this notice, see:
29  *
30  * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31  */
32 
33 /*
34  *	The xfs_buf.c code provides an abstract buffer cache model on top
35  *	of the Linux page cache.  Cached metadata blocks for a file system
36  *	are hashed to the inode for the block device.  xfs_buf.c assembles
37  *	buffers (xfs_buf_t) on demand to aggregate such cached pages for I/O.
38  *
39  *      Written by Steve Lord, Jim Mostek, Russell Cattelan
40  *		    and Rajagopal Ananthanarayanan ("ananth") at SGI.
41  *
42  */
43 
44 #include <linux/stddef.h>
45 #include <linux/errno.h>
46 #include <linux/slab.h>
47 #include <linux/pagemap.h>
48 #include <linux/init.h>
49 #include <linux/vmalloc.h>
50 #include <linux/blkdev.h>
51 #include <linux/locks.h>
52 #include <linux/sysctl.h>
53 #include <linux/proc_fs.h>
54 
55 #include "xfs_linux.h"
56 
57 #define BN_ALIGN_MASK	((1 << (PAGE_CACHE_SHIFT - BBSHIFT)) - 1)
58 
59 #ifndef GFP_READAHEAD
60 #define GFP_READAHEAD	0
61 #endif
62 
63 /*
64  * A backport of the 2.5 scheduler is used by many vendors of 2.4-based
65  * distributions.
66  * We can only guess its presence by the lack of the SCHED_YIELD flag.
67  * If the heuristic doesn't work, change this define by hand.
68  */
69 #ifndef SCHED_YIELD
70 #define __HAVE_NEW_SCHEDULER	1
71 #endif
72 
73 /*
74  * cpumask_t is used for supporting NR_CPUS > BITS_PER_LONG.
75  * If support for this is present, migrate_to_cpu exists and provides
76  * a wrapper around the set_cpus_allowed routine.
77  */
78 #ifdef copy_cpumask
79 #define __HAVE_CPUMASK_T	1
80 #endif
81 
82 #ifndef __HAVE_CPUMASK_T
83 # ifndef __HAVE_NEW_SCHEDULER
84 #  define migrate_to_cpu(cpu)	\
85 	do { current->cpus_allowed = 1UL << (cpu); } while (0)
86 # else
87 #  define migrate_to_cpu(cpu)	\
88 	set_cpus_allowed(current, 1UL << (cpu))
89 # endif
90 #endif
91 
92 #ifndef VM_MAP
93 #define VM_MAP	VM_ALLOC
94 #endif
95 
96 /*
97  * File wide globals
98  */
99 
100 STATIC kmem_cache_t *pagebuf_cache;
101 STATIC kmem_shaker_t pagebuf_shake;
102 
103 #define MAX_IO_DAEMONS		NR_CPUS
104 #define CPU_TO_DAEMON(cpu)	(cpu)
105 STATIC int pb_logio_daemons[MAX_IO_DAEMONS];
106 STATIC struct list_head pagebuf_logiodone_tq[MAX_IO_DAEMONS];
107 STATIC wait_queue_head_t pagebuf_logiodone_wait[MAX_IO_DAEMONS];
108 STATIC int pb_dataio_daemons[MAX_IO_DAEMONS];
109 STATIC struct list_head pagebuf_dataiodone_tq[MAX_IO_DAEMONS];
110 STATIC wait_queue_head_t pagebuf_dataiodone_wait[MAX_IO_DAEMONS];
111 
112 /*
113  * For pre-allocated buffer head pool
114  */
115 
116 #define NR_RESERVED_BH	64
117 static wait_queue_head_t	pb_resv_bh_wait;
118 static spinlock_t		pb_resv_bh_lock = SPIN_LOCK_UNLOCKED;
119 struct buffer_head		*pb_resv_bh = NULL;	/* list of bh */
120 int				pb_resv_bh_cnt = 0;	/* # of bh available */
121 
122 STATIC void _pagebuf_ioapply(xfs_buf_t *);
123 STATIC int pagebuf_daemon_wakeup(int, unsigned int);
124 STATIC void pagebuf_delwri_queue(xfs_buf_t *, int);
125 STATIC void pagebuf_runall_queues(struct list_head[]);
126 
127 /*
128  * Pagebuf debugging
129  */
130 
131 #ifdef PAGEBUF_TRACE
132 void
133 pagebuf_trace(
134 	xfs_buf_t	*pb,
135 	char		*id,
136 	void		*data,
137 	void		*ra)
138 {
139 	ktrace_enter(pagebuf_trace_buf,
140 		pb, id,
141 		(void *)(unsigned long)pb->pb_flags,
142 		(void *)(unsigned long)pb->pb_hold.counter,
143 		(void *)(unsigned long)pb->pb_sema.count.counter,
144 		(void *)current,
145 		data, ra,
146 		(void *)(unsigned long)((pb->pb_file_offset>>32) & 0xffffffff),
147 		(void *)(unsigned long)(pb->pb_file_offset & 0xffffffff),
148 		(void *)(unsigned long)pb->pb_buffer_length,
149 		NULL, NULL, NULL, NULL, NULL);
150 }
151 ktrace_t *pagebuf_trace_buf;
152 #define PAGEBUF_TRACE_SIZE	4096
153 #define PB_TRACE(pb, id, data)	\
154 	pagebuf_trace(pb, id, (void *)data, (void *)__builtin_return_address(0))
155 #else
156 #define PB_TRACE(pb, id, data)	do { } while (0)
157 #endif
158 
159 #ifdef PAGEBUF_LOCK_TRACKING
160 # define PB_SET_OWNER(pb)	((pb)->pb_last_holder = current->pid)
161 # define PB_CLEAR_OWNER(pb)	((pb)->pb_last_holder = -1)
162 # define PB_GET_OWNER(pb)	((pb)->pb_last_holder)
163 #else
164 # define PB_SET_OWNER(pb)	do { } while (0)
165 # define PB_CLEAR_OWNER(pb)	do { } while (0)
166 # define PB_GET_OWNER(pb)	do { } while (0)
167 #endif
168 
169 /*
170  * Pagebuf allocation / freeing.
171  */
172 
173 #define pb_to_gfp(flags) \
174 	(((flags) & PBF_READ_AHEAD) ? GFP_READAHEAD : \
175 	 ((flags) & PBF_DONT_BLOCK) ? GFP_NOFS : GFP_KERNEL)
176 
177 #define pb_to_km(flags) \
178 	 (((flags) & PBF_DONT_BLOCK) ? KM_NOFS : KM_SLEEP)
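
/*
 * Illustrative sketch of the flag-to-allocation mapping above (comment
 * only, not compiled): readahead requests use the GFP_READAHEAD mask
 * (0 when the kernel does not define it, per the fallback earlier in
 * this file), buffers that must not recurse into the filesystem use
 * GFP_NOFS/KM_NOFS, and everything else may block in the allocator:
 *
 *	pb_to_gfp(PBF_READ_AHEAD | PBF_DONT_BLOCK)  == GFP_READAHEAD
 *	pb_to_gfp(PBF_DONT_BLOCK)                   == GFP_NOFS
 *	pb_to_gfp(0)                                == GFP_KERNEL
 *	pb_to_km(PBF_DONT_BLOCK)                    == KM_NOFS
 */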
179 
180 
181 #define pagebuf_allocate(flags) \
182 	kmem_zone_alloc(pagebuf_cache, pb_to_km(flags))
183 #define pagebuf_deallocate(pb) \
184 	kmem_zone_free(pagebuf_cache, (pb));
185 
186 /*
187  * Pagebuf hashing
188  */
189 
190 #define NBITS	8
191 #define NHASH	(1<<NBITS)
192 
193 typedef struct {
194 	struct list_head	pb_hash;
195 	spinlock_t		pb_hash_lock;
196 } pb_hash_t;
197 
198 STATIC pb_hash_t	pbhash[NHASH];
199 #define pb_hash(pb)	&pbhash[pb->pb_hash_index]
200 
201 STATIC int
202 _bhash(
203 	struct block_device *bdev,
204 	loff_t		base)
205 {
206 	int		bit, hval;
207 
208 	base >>= 9;
209 	base ^= (unsigned long)bdev / L1_CACHE_BYTES;
210 	for (bit = hval = 0; base && bit < sizeof(base) * 8; bit += NBITS) {
211 		hval ^= (int)base & (NHASH-1);
212 		base >>= NBITS;
213 	}
214 	return hval;
215 }
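
/*
 * Worked example for the hash above (comment only, not compiled, and
 * assuming for simplicity that the block_device pointer term XORs in
 * as zero): a byte offset of 0x12345600 becomes base = 0x91a2b after
 * the shift, and folding it NBITS (8) bits at a time gives
 *
 *	hval = 0x2b ^ 0x1a ^ 0x09 = 0x38
 *
 * i.e. bucket 56 of the NHASH (256) buckets in pbhash[].
 */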
216 
217 /*
218  * Mapping of multi-page buffers into contiguous virtual space
219  */
220 
221 typedef struct a_list {
222 	void		*vm_addr;
223 	struct a_list	*next;
224 } a_list_t;
225 
226 STATIC a_list_t		*as_free_head;
227 STATIC int		as_list_len;
228 STATIC spinlock_t	as_lock = SPIN_LOCK_UNLOCKED;
229 
230 /*
231  * Try to batch vunmaps because they are costly.
232  */
233 STATIC void
234 free_address(
235 	void		*addr)
236 {
237 	a_list_t	*aentry;
238 
239 	aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC);
240 	if (aentry) {
241 		spin_lock(&as_lock);
242 		aentry->next = as_free_head;
243 		aentry->vm_addr = addr;
244 		as_free_head = aentry;
245 		as_list_len++;
246 		spin_unlock(&as_lock);
247 	} else {
248 		vunmap(addr);
249 	}
250 }
251 
252 STATIC void
253 purge_addresses(void)
254 {
255 	a_list_t	*aentry, *old;
256 
257 	if (as_free_head == NULL)
258 		return;
259 
260 	spin_lock(&as_lock);
261 	aentry = as_free_head;
262 	as_free_head = NULL;
263 	as_list_len = 0;
264 	spin_unlock(&as_lock);
265 
266 	while ((old = aentry) != NULL) {
267 		vunmap(aentry->vm_addr);
268 		aentry = aentry->next;
269 		kfree(old);
270 	}
271 }
272 
273 /*
274  *	Internal pagebuf object manipulation
275  */
276 
277 STATIC void
278 _pagebuf_initialize(
279 	xfs_buf_t		*pb,
280 	xfs_buftarg_t		*target,
281 	loff_t			range_base,
282 	size_t			range_length,
283 	page_buf_flags_t	flags)
284 {
285 	/*
286 	 * We don't want certain flags to appear in pb->pb_flags.
287 	 */
288 	flags &= ~(PBF_LOCK|PBF_MAPPED|PBF_DONT_BLOCK|PBF_READ_AHEAD);
289 
290 	memset(pb, 0, sizeof(xfs_buf_t));
291 	atomic_set(&pb->pb_hold, 1);
292 	init_MUTEX_LOCKED(&pb->pb_iodonesema);
293 	INIT_LIST_HEAD(&pb->pb_list);
294 	INIT_LIST_HEAD(&pb->pb_hash_list);
295 	init_MUTEX_LOCKED(&pb->pb_sema); /* held, no waiters */
296 	PB_SET_OWNER(pb);
297 	pb->pb_target = target;
298 	pb->pb_file_offset = range_base;
299 	/*
300 	 * Set buffer_length and count_desired to the same value initially.
301 	 * I/O routines should use count_desired, which will be the same in
302 	 * most cases but may be reset (e.g. XFS recovery).
303 	 */
304 	pb->pb_buffer_length = pb->pb_count_desired = range_length;
305 	pb->pb_flags = flags | PBF_NONE;
306 	pb->pb_bn = XFS_BUF_DADDR_NULL;
307 	atomic_set(&pb->pb_pin_count, 0);
308 	init_waitqueue_head(&pb->pb_waiters);
309 
310 	XFS_STATS_INC(pb_create);
311 	PB_TRACE(pb, "initialize", target);
312 }
313 
314 /*
315  * Allocate a page array capable of holding a specified number
316  * of pages, and point the page buf at it.
317  */
318 STATIC int
319 _pagebuf_get_pages(
320 	xfs_buf_t		*pb,
321 	int			page_count,
322 	page_buf_flags_t	flags)
323 {
324 	/* Make sure that we have a page list */
325 	if (pb->pb_pages == NULL) {
326 		pb->pb_offset = page_buf_poff(pb->pb_file_offset);
327 		pb->pb_page_count = page_count;
328 		if (page_count <= PB_PAGES) {
329 			pb->pb_pages = pb->pb_page_array;
330 		} else {
331 			pb->pb_pages = kmem_alloc(sizeof(struct page *) *
332 					page_count, pb_to_km(flags));
333 			if (pb->pb_pages == NULL)
334 				return -ENOMEM;
335 		}
336 		memset(pb->pb_pages, 0, sizeof(struct page *) * page_count);
337 	}
338 	return 0;
339 }
340 
341 /*
342  *	Frees pb_pages if it was malloced.
343  */
344 STATIC void
345 _pagebuf_free_pages(
346 	xfs_buf_t	*bp)
347 {
348 	if (bp->pb_pages != bp->pb_page_array) {
349 		kmem_free(bp->pb_pages,
350 			  bp->pb_page_count * sizeof(struct page *));
351 	}
352 }
353 
354 /*
355  *	Releases the specified buffer.
356  *
357  * 	The modification state of any associated pages is left unchanged.
358  * 	The buffer must not be on any hash - use pagebuf_rele instead for
359  * 	hashed and refcounted buffers.
360  */
361 void
362 pagebuf_free(
363 	xfs_buf_t		*bp)
364 {
365 	PB_TRACE(bp, "free", 0);
366 
367 	ASSERT(list_empty(&bp->pb_hash_list));
368 
369 	if (bp->pb_flags & _PBF_PAGE_CACHE) {
370 		uint		i;
371 
372 		if ((bp->pb_flags & PBF_MAPPED) && (bp->pb_page_count > 1))
373 			free_address(bp->pb_addr - bp->pb_offset);
374 
375 		for (i = 0; i < bp->pb_page_count; i++)
376 			page_cache_release(bp->pb_pages[i]);
377 		_pagebuf_free_pages(bp);
378 	} else if (bp->pb_flags & _PBF_KMEM_ALLOC) {
379 		 /*
380 		  * XXX(hch): bp->pb_count_desired might be incorrect (see
381 		  * pagebuf_associate_memory for details), but fortunately
382 		  * the Linux version of kmem_free ignores the len argument..
383 		  */
384 		kmem_free(bp->pb_addr, bp->pb_count_desired);
385 		_pagebuf_free_pages(bp);
386 	}
387 
388 	pagebuf_deallocate(bp);
389 }
390 
391 /*
392  *	Finds all pages for the buffer in question and builds its page list.
393  */
394 STATIC int
395 _pagebuf_lookup_pages(
396 	xfs_buf_t		*bp,
397 	uint			flags)
398 {
399 	struct address_space	*mapping = bp->pb_target->pbr_mapping;
400 	size_t			blocksize = bp->pb_target->pbr_bsize;
401 	int			gfp_mask = pb_to_gfp(flags);
402 	unsigned short		page_count, i;
403 	pgoff_t			first;
404 	loff_t			end;
405 	int			error;
406 
407 	end = bp->pb_file_offset + bp->pb_buffer_length;
408 	page_count = page_buf_btoc(end) - page_buf_btoct(bp->pb_file_offset);
409 
410 	error = _pagebuf_get_pages(bp, page_count, flags);
411 	if (unlikely(error))
412 		return error;
413 	bp->pb_flags |= _PBF_PAGE_CACHE;
414 
415 	first = bp->pb_file_offset >> PAGE_CACHE_SHIFT;
416 
417 	for (i = 0; i < bp->pb_page_count; i++) {
418 		struct page	*page;
419 		uint		retries = 0;
420 
421 	      retry:
422 		page = find_or_create_page(mapping, first + i, gfp_mask);
423 		if (unlikely(page == NULL)) {
424 			if (flags & PBF_READ_AHEAD) {
425 				bp->pb_page_count = i;
426 				for (i = 0; i < bp->pb_page_count; i++)
427 					unlock_page(bp->pb_pages[i]);
428 				return -ENOMEM;
429 			}
430 
431 			/*
432 			 * This could deadlock.
433 			 *
434 			 * But until all the XFS lowlevel code is revamped to
435 			 * handle buffer allocation failures we can't do much.
436 			 */
437 			if (!(++retries % 100))
438 				printk(KERN_ERR
439 					"possible deadlock in %s (mode:0x%x)\n",
440 					__FUNCTION__, gfp_mask);
441 
442 			XFS_STATS_INC(pb_page_retries);
443 			pagebuf_daemon_wakeup(0, gfp_mask);
444 			set_current_state(TASK_UNINTERRUPTIBLE);
445 			schedule_timeout(10);
446 			goto retry;
447 		}
448 
449 		XFS_STATS_INC(pb_page_found);
450 
451 		/* if we need to do I/O on a page record the fact */
452 		if (!Page_Uptodate(page)) {
453 			page_count--;
454 			if (blocksize == PAGE_CACHE_SIZE && (flags & PBF_READ))
455 				bp->pb_locked = 1;
456 		}
457 
458 		bp->pb_pages[i] = page;
459 	}
460 
461 	if (!bp->pb_locked) {
462 		for (i = 0; i < bp->pb_page_count; i++)
463 			unlock_page(bp->pb_pages[i]);
464 	}
465 
466 	if (page_count) {
467 		/* if we have any uptodate pages, mark that in the buffer */
468 		bp->pb_flags &= ~PBF_NONE;
469 
470 		/* if some pages aren't uptodate, mark that in the buffer */
471 		if (page_count != bp->pb_page_count)
472 			bp->pb_flags |= PBF_PARTIAL;
473 	}
474 
475 	PB_TRACE(bp, "lookup_pages", (long)page_count);
476 	return error;
477 }
478 
479 /*
480  *	Map buffer into kernel address-space if necessary.
481  */
482 STATIC int
483 _pagebuf_map_pages(
484 	xfs_buf_t		*bp,
485 	uint			flags)
486 {
487 	/* A single page buffer is always mappable */
488 	if (bp->pb_page_count == 1) {
489 		bp->pb_addr = page_address(bp->pb_pages[0]) + bp->pb_offset;
490 		bp->pb_flags |= PBF_MAPPED;
491 	} else if (flags & PBF_MAPPED) {
492 		if (as_list_len > 64)
493 			purge_addresses();
494 		bp->pb_addr = vmap(bp->pb_pages, bp->pb_page_count,
495 				VM_MAP, PAGE_KERNEL);
496 		if (unlikely(bp->pb_addr == NULL))
497 			return -ENOMEM;
498 		bp->pb_addr += bp->pb_offset;
499 		bp->pb_flags |= PBF_MAPPED;
500 	}
501 
502 	return 0;
503 }
504 
505 /*
506  *	Pre-allocation of a pool of buffer heads for use in
507  *	low-memory situations.
508  */
509 
510 /*
511  *	_pagebuf_prealloc_bh
512  *
513  *	Pre-allocate a pool of "count" buffer heads at startup.
514  *	Puts them on a list at "pb_resv_bh"
515  *	Returns number of bh actually allocated to pool.
516  */
517 STATIC int
518 _pagebuf_prealloc_bh(
519 	int			count)
520 {
521 	struct buffer_head	*bh;
522 	int			i;
523 
524 	for (i = 0; i < count; i++) {
525 		bh = kmem_cache_alloc(bh_cachep, SLAB_KERNEL);
526 		if (!bh)
527 			break;
528 		bh->b_pprev = &pb_resv_bh;
529 		bh->b_next = pb_resv_bh;
530 		pb_resv_bh = bh;
531 		pb_resv_bh_cnt++;
532 	}
533 	return i;
534 }
535 
536 /*
537  *	_pagebuf_get_prealloc_bh
538  *
539  *	Get one buffer head from our pre-allocated pool.
540  *	If pool is empty, sleep 'til one comes back in.
541  *	Returns aforementioned buffer head.
542  */
543 STATIC struct buffer_head *
544 _pagebuf_get_prealloc_bh(void)
545 {
546 	unsigned long		flags;
547 	struct buffer_head	*bh;
548 	DECLARE_WAITQUEUE	(wait, current);
549 
550 	spin_lock_irqsave(&pb_resv_bh_lock, flags);
551 
552 	if (pb_resv_bh_cnt < 1) {
553 		add_wait_queue(&pb_resv_bh_wait, &wait);
554 		do {
555 			set_current_state(TASK_UNINTERRUPTIBLE);
556 			spin_unlock_irqrestore(&pb_resv_bh_lock, flags);
557 			run_task_queue(&tq_disk);
558 			schedule();
559 			spin_lock_irqsave(&pb_resv_bh_lock, flags);
560 		} while (pb_resv_bh_cnt < 1);
561 		__set_current_state(TASK_RUNNING);
562 		remove_wait_queue(&pb_resv_bh_wait, &wait);
563 	}
564 
565 	BUG_ON(pb_resv_bh_cnt < 1);
566 	BUG_ON(!pb_resv_bh);
567 
568 	bh = pb_resv_bh;
569 	pb_resv_bh = bh->b_next;
570 	pb_resv_bh_cnt--;
571 
572 	spin_unlock_irqrestore(&pb_resv_bh_lock, flags);
573 	return bh;
574 }
575 
576 /*
577  *	_pagebuf_free_bh
578  *
579  *	Take care of buffer heads that we're finished with.
580  *	Call this instead of just kmem_cache_free(bh_cachep, bh)
581  *	when you're done with a bh.
582  *
583  *	If our pre-allocated pool is full, just free the buffer head.
584  *	Otherwise, put it back in the pool, and wake up anybody
585  *	waiting for one.
586  */
587 STATIC inline void
588 _pagebuf_free_bh(
589 	struct buffer_head	*bh)
590 {
591 	unsigned long		flags;
592 	int			free;
593 
594 	if (! (free = pb_resv_bh_cnt >= NR_RESERVED_BH)) {
595 		spin_lock_irqsave(&pb_resv_bh_lock, flags);
596 
597 		if (! (free = pb_resv_bh_cnt >= NR_RESERVED_BH)) {
598 			bh->b_pprev = &pb_resv_bh;
599 			bh->b_next = pb_resv_bh;
600 			pb_resv_bh = bh;
601 			pb_resv_bh_cnt++;
602 
603 			if (waitqueue_active(&pb_resv_bh_wait)) {
604 				wake_up(&pb_resv_bh_wait);
605 			}
606 		}
607 
608 		spin_unlock_irqrestore(&pb_resv_bh_lock, flags);
609 	}
610 	if (free) {
611 		kmem_cache_free(bh_cachep, bh);
612 	}
613 }
614 
615 /*
616  *	Finding and Reading Buffers
617  */
618 
619 /*
620  *	_pagebuf_find
621  *
622  *	Looks up, and creates if absent, a lockable buffer for
623  *	a given range of an inode.  The buffer is returned
624  *	locked.	 If other overlapping buffers exist, they are
625  *	released before the new buffer is created and locked,
626  *	which may imply that this call will block until those buffers
627  *	are unlocked.  No I/O is implied by this call.
628  */
629 xfs_buf_t *
630 _pagebuf_find(				/* find buffer for block	*/
631 	xfs_buftarg_t		*target,/* target for block		*/
632 	loff_t			ioff,	/* starting offset of range	*/
633 	size_t			isize,	/* length of range		*/
634 	page_buf_flags_t	flags,	/* PBF_TRYLOCK			*/
635 	xfs_buf_t		*new_pb)/* newly allocated buffer	*/
636 {
637 	loff_t			range_base;
638 	size_t			range_length;
639 	int			hval;
640 	pb_hash_t		*h;
641 	xfs_buf_t		*pb, *n;
642 	int			not_locked;
643 
644 	range_base = (ioff << BBSHIFT);
645 	range_length = (isize << BBSHIFT);
646 
647 	/* Ensure we never do IOs smaller than the sector size */
648 	BUG_ON(range_length < (1 << target->pbr_sshift));
649 
650 	/* Ensure we never do IOs that are not sector aligned */
651 	BUG_ON(range_base & (loff_t)target->pbr_smask);
652 
653 	hval = _bhash(target->pbr_bdev, range_base);
654 	h = &pbhash[hval];
655 
656 	spin_lock(&h->pb_hash_lock);
657 
658 	list_for_each_entry_safe(pb, n, &h->pb_hash, pb_hash_list) {
659 		if (pb->pb_target == target &&
660 		    pb->pb_file_offset == range_base &&
661 		    pb->pb_buffer_length == range_length) {
662 			/* If we look at something, bring it to the
663 			 * front of the list for next time
664 			 */
665 			atomic_inc(&pb->pb_hold);
666 			list_move(&pb->pb_hash_list, &h->pb_hash);
667 			goto found;
668 		}
669 	}
670 
671 	/* No match found */
672 	if (new_pb) {
673 		_pagebuf_initialize(new_pb, target, range_base,
674 				range_length, flags);
675 		new_pb->pb_hash_index = hval;
676 		list_add(&new_pb->pb_hash_list, &h->pb_hash);
677 	} else {
678 		XFS_STATS_INC(pb_miss_locked);
679 	}
680 
681 	spin_unlock(&h->pb_hash_lock);
682 	return (new_pb);
683 
684 found:
685 	spin_unlock(&h->pb_hash_lock);
686 
687 	/* Attempt to get the semaphore without sleeping,
688 	 * if this does not work then we need to drop the
689 	 * spinlock and do a hard attempt on the semaphore.
690 	 */
691 	not_locked = down_trylock(&pb->pb_sema);
692 	if (not_locked) {
693 		if (!(flags & PBF_TRYLOCK)) {
694 			/* wait for buffer ownership */
695 			PB_TRACE(pb, "get_lock", 0);
696 			pagebuf_lock(pb);
697 			XFS_STATS_INC(pb_get_locked_waited);
698 		} else {
699 			/* We asked for a trylock and failed; there is no
700 			 * need to look at file offset and length here - we
701 			 * know that this pagebuf at least overlaps our
702 			 * pagebuf and is locked, therefore our buffer
703 			 * either does not exist, or is this buffer.
704 			 */
705 
706 			pagebuf_rele(pb);
707 			XFS_STATS_INC(pb_busy_locked);
708 			return (NULL);
709 		}
710 	} else {
711 		/* trylock worked */
712 		PB_SET_OWNER(pb);
713 	}
714 
715 	if (pb->pb_flags & PBF_STALE)
716 		pb->pb_flags &= PBF_MAPPED;
717 	PB_TRACE(pb, "got_lock", 0);
718 	XFS_STATS_INC(pb_get_locked);
719 	return (pb);
720 }
721 
722 /*
723  *	xfs_buf_get_flags assembles a buffer covering the specified range.
724  *
725  *	Storage in memory for all portions of the buffer will be allocated,
726  *	although backing storage may not be.
727  */
728 xfs_buf_t *
729 xfs_buf_get_flags(			/* allocate a buffer		*/
730 	xfs_buftarg_t		*target,/* target for buffer		*/
731 	loff_t			ioff,	/* starting offset of range	*/
732 	size_t			isize,	/* length of range		*/
733 	page_buf_flags_t	flags)	/* PBF_TRYLOCK			*/
734 {
735 	xfs_buf_t		*pb, *new_pb;
736 	int			error = 0, i;
737 
738 	new_pb = pagebuf_allocate(flags);
739 	if (unlikely(!new_pb))
740 		return NULL;
741 
742 	pb = _pagebuf_find(target, ioff, isize, flags, new_pb);
743 	if (pb == new_pb) {
744 		error = _pagebuf_lookup_pages(pb, flags);
745 		if (error)
746 			goto no_buffer;
747 	} else {
748 		pagebuf_deallocate(new_pb);
749 		if (unlikely(pb == NULL))
750 			return NULL;
751 	}
752 
753 	for (i = 0; i < pb->pb_page_count; i++)
754 		mark_page_accessed(pb->pb_pages[i]);
755 
756 	if (!(pb->pb_flags & PBF_MAPPED)) {
757 		error = _pagebuf_map_pages(pb, flags);
758 		if (unlikely(error)) {
759 			printk(KERN_WARNING "%s: failed to map pages\n",
760 					__FUNCTION__);
761 			goto no_buffer;
762 		}
763 	}
764 
765 	XFS_STATS_INC(pb_get);
766 
767 	/*
768 	 * Always fill in the block number now, the mapped cases can do
769 	 * their own overlay of this later.
770 	 */
771 	pb->pb_bn = ioff;
772 	pb->pb_count_desired = pb->pb_buffer_length;
773 
774 	PB_TRACE(pb, "get", (unsigned long)flags);
775 	return pb;
776 
777  no_buffer:
778 	if (flags & (PBF_LOCK | PBF_TRYLOCK))
779 		pagebuf_unlock(pb);
780 	pagebuf_rele(pb);
781 	return NULL;
782 }
783 
784 xfs_buf_t *
785 xfs_buf_read_flags(
786 	xfs_buftarg_t		*target,
787 	loff_t			ioff,
788 	size_t			isize,
789 	page_buf_flags_t	flags)
790 {
791 	xfs_buf_t		*pb;
792 
793 	flags |= PBF_READ;
794 
795 	pb = xfs_buf_get_flags(target, ioff, isize, flags);
796 	if (pb) {
797 		if (PBF_NOT_DONE(pb)) {
798 			PB_TRACE(pb, "read", (unsigned long)flags);
799 			XFS_STATS_INC(pb_get_read);
800 			pagebuf_iostart(pb, flags);
801 		} else if (flags & PBF_ASYNC) {
802 			PB_TRACE(pb, "read_async", (unsigned long)flags);
803 			/*
804 			 * Read ahead call which is already satisfied,
805 			 * drop the buffer
806 			 */
807 			goto no_buffer;
808 		} else {
809 			PB_TRACE(pb, "read_done", (unsigned long)flags);
810 			/* We do not want read in the flags */
811 			pb->pb_flags &= ~PBF_READ;
812 		}
813 	}
814 
815 	return pb;
816 
817  no_buffer:
818 	if (flags & (PBF_LOCK | PBF_TRYLOCK))
819 		pagebuf_unlock(pb);
820 	pagebuf_rele(pb);
821 	return NULL;
822 }
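
/*
 * Illustrative usage sketch (comment only, not compiled; "target",
 * "blkno" and "nbbs" are hypothetical caller values): a typical
 * synchronous metadata read obtains a locked buffer, checks pb_error,
 * and drops its hold when done.
 *
 *	xfs_buf_t	*bp;
 *
 *	bp = xfs_buf_read_flags(target, blkno, nbbs, PBF_LOCK | PBF_READ);
 *	if (bp) {
 *		if (!bp->pb_error)
 *			... access the data via pagebuf_offset(bp, 0) ...
 *		pagebuf_unlock(bp);
 *		pagebuf_rele(bp);
 *	}
 */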
823 
824 /*
825  * Create a skeletal pagebuf (no pages associated with it).
826  */
827 xfs_buf_t *
828 pagebuf_lookup(
829 	xfs_buftarg_t		*target,
830 	loff_t			ioff,
831 	size_t			isize,
832 	page_buf_flags_t	flags)
833 {
834 	xfs_buf_t		*pb;
835 
836 	flags |= _PBF_PRIVATE_BH;
837 	pb = pagebuf_allocate(flags);
838 	if (pb) {
839 		_pagebuf_initialize(pb, target, ioff, isize, flags);
840 	}
841 	return pb;
842 }
843 
844 /*
845  * If we are not low on memory then do the readahead in a
846  * deadlock-safe manner.
847  */
848 void
849 pagebuf_readahead(
850 	xfs_buftarg_t		*target,
851 	loff_t			ioff,
852 	size_t			isize,
853 	page_buf_flags_t	flags)
854 {
855 	flags |= (PBF_TRYLOCK|PBF_ASYNC|PBF_READ_AHEAD);
856 	xfs_buf_read_flags(target, ioff, isize, flags);
857 }
858 
859 xfs_buf_t *
860 pagebuf_get_empty(
861 	size_t			len,
862 	xfs_buftarg_t		*target)
863 {
864 	xfs_buf_t		*pb;
865 
866 	pb = pagebuf_allocate(0);
867 	if (pb)
868 		_pagebuf_initialize(pb, target, 0, len, 0);
869 	return pb;
870 }
871 
872 static inline struct page *
873 mem_to_page(
874 	void			*addr)
875 {
876 	if (((unsigned long)addr < VMALLOC_START) ||
877 	    ((unsigned long)addr >= VMALLOC_END)) {
878 		return virt_to_page(addr);
879 	} else {
880 		return vmalloc_to_page(addr);
881 	}
882 }
883 
884 int
885 pagebuf_associate_memory(
886 	xfs_buf_t		*pb,
887 	void			*mem,
888 	size_t			len)
889 {
890 	int			rval;
891 	int			i = 0;
892 	size_t			ptr;
893 	size_t			end, end_cur;
894 	off_t			offset;
895 	int			page_count;
896 
897 	page_count = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT;
898 	offset = (off_t) mem - ((off_t)mem & PAGE_CACHE_MASK);
899 	if (offset && (len > PAGE_CACHE_SIZE))
900 		page_count++;
901 
902 	/* Free any previous set of page pointers */
903 	if (pb->pb_pages)
904 		_pagebuf_free_pages(pb);
905 
906 	pb->pb_pages = NULL;
907 	pb->pb_addr = mem;
908 
909 	rval = _pagebuf_get_pages(pb, page_count, 0);
910 	if (rval)
911 		return rval;
912 
913 	pb->pb_offset = offset;
914 	ptr = (size_t) mem & PAGE_CACHE_MASK;
915 	end = PAGE_CACHE_ALIGN((size_t) mem + len);
916 	end_cur = end;
917 	/* set up first page */
918 	pb->pb_pages[0] = mem_to_page(mem);
919 
920 	ptr += PAGE_CACHE_SIZE;
921 	pb->pb_page_count = ++i;
922 	while (ptr < end) {
923 		pb->pb_pages[i] = mem_to_page((void *)ptr);
924 		pb->pb_page_count = ++i;
925 		ptr += PAGE_CACHE_SIZE;
926 	}
927 	pb->pb_locked = 0;
928 
929 	pb->pb_count_desired = pb->pb_buffer_length = len;
930 	pb->pb_flags |= PBF_MAPPED | _PBF_PRIVATE_BH;
931 
932 	return 0;
933 }
934 
935 xfs_buf_t *
936 pagebuf_get_no_daddr(
937 	size_t			len,
938 	xfs_buftarg_t		*target)
939 {
940 	size_t			malloc_len = len;
941 	xfs_buf_t		*bp;
942 	void			*data;
943 	int			error;
944 
945 	bp = pagebuf_allocate(0);
946 	if (unlikely(bp == NULL))
947 		goto fail;
948 	_pagebuf_initialize(bp, target, 0, len, PBF_FORCEIO);
949 
950  try_again:
951 	data = kmem_alloc(malloc_len, KM_SLEEP | KM_MAYFAIL);
952 	if (unlikely(data == NULL))
953 		goto fail_free_buf;
954 
955 	/* check whether alignment matches.. */
956 	if ((__psunsigned_t)data !=
957 	    ((__psunsigned_t)data & ~target->pbr_smask)) {
958 		/* .. else double the size and try again */
959 		kmem_free(data, malloc_len);
960 		malloc_len <<= 1;
961 		goto try_again;
962 	}
963 
964 	error = pagebuf_associate_memory(bp, data, len);
965 	if (error)
966 		goto fail_free_mem;
967 	bp->pb_flags |= _PBF_KMEM_ALLOC;
968 
969 	pagebuf_unlock(bp);
970 
971 	PB_TRACE(bp, "no_daddr", data);
972 	return bp;
973  fail_free_mem:
974 	kmem_free(data, malloc_len);
975  fail_free_buf:
976 	pagebuf_free(bp);
977  fail:
978 	return NULL;
979 }
980 
981 /*
982  *	pagebuf_hold
983  *
984  *	Increment reference count on buffer, to hold the buffer concurrently
985  *	with another thread which may release (free) the buffer asynchronously.
986  *
987  *	Must hold the buffer already to call this function.
988  */
989 void
990 pagebuf_hold(
991 	xfs_buf_t		*pb)
992 {
993 	atomic_inc(&pb->pb_hold);
994 	PB_TRACE(pb, "hold", 0);
995 }
996 
997 /*
998  *	pagebuf_rele
999  *
1000  *	pagebuf_rele releases a hold on the specified buffer.  If the
1001  *	hold count is 1, pagebuf_rele calls pagebuf_free.
1002  */
1003 void
1004 pagebuf_rele(
1005 	xfs_buf_t		*pb)
1006 {
1007 	pb_hash_t		*hash = pb_hash(pb);
1008 
1009 	PB_TRACE(pb, "rele", pb->pb_relse);
1010 
1011 	if (atomic_dec_and_lock(&pb->pb_hold, &hash->pb_hash_lock)) {
1012 		int		do_free = 1;
1013 
1014 		if (pb->pb_relse) {
1015 			atomic_inc(&pb->pb_hold);
1016 			spin_unlock(&hash->pb_hash_lock);
1017 			(*(pb->pb_relse)) (pb);
1018 			spin_lock(&hash->pb_hash_lock);
1019 			do_free = 0;
1020 		}
1021 
1022 		if (pb->pb_flags & PBF_DELWRI) {
1023 			pb->pb_flags |= PBF_ASYNC;
1024 			atomic_inc(&pb->pb_hold);
1025 			pagebuf_delwri_queue(pb, 0);
1026 			do_free = 0;
1027 		} else if (pb->pb_flags & PBF_FS_MANAGED) {
1028 			do_free = 0;
1029 		}
1030 
1031 		if (do_free) {
1032 			list_del_init(&pb->pb_hash_list);
1033 			spin_unlock(&hash->pb_hash_lock);
1034 			xfs_buf_free(pb);
1035 		} else {
1036 			spin_unlock(&hash->pb_hash_lock);
1037 		}
1038 	}
1039 }
1040 
1041 
1042 /*
1043  *	Mutual exclusion on buffers.  Locking model:
1044  *
1045  *	Buffers associated with inodes for which buffer locking
1046  *	is not enabled are not protected by semaphores, and are
1047  *	assumed to be exclusively owned by the caller.  There is a
1048  *	spinlock in the buffer, used by the caller when concurrent
1049  *	access is possible.
1050  */
1051 
1052 /*
1053  *	pagebuf_cond_lock
1054  *
1055  *	pagebuf_cond_lock locks a buffer object, if it is not already locked.
1056  *	Note that this in no way
1057  *	locks the underlying pages, so it is only useful for synchronizing
1058  *	concurrent use of page buffer objects, not for synchronizing independent
1059  *	access to the underlying pages.
1060  */
1061 int
1062 pagebuf_cond_lock(			/* lock buffer, if not locked	*/
1063 					/* returns -EBUSY if locked)	*/
1064 	xfs_buf_t		*pb)
1065 {
1066 	int			locked;
1067 
1068 	locked = down_trylock(&pb->pb_sema) == 0;
1069 	if (locked) {
1070 		PB_SET_OWNER(pb);
1071 	}
1072 	PB_TRACE(pb, "cond_lock", (long)locked);
1073 	return(locked ? 0 : -EBUSY);
1074 }
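
/*
 * Illustrative sketch (comment only, not compiled; "can_sleep" is a
 * hypothetical caller condition): try the non-blocking lock first and
 * fall back to the blocking pagebuf_lock only when sleeping is allowed.
 *
 *	if (pagebuf_cond_lock(bp) == -EBUSY) {
 *		if (!can_sleep)
 *			return NULL;
 *		pagebuf_lock(bp);
 *	}
 *	... buffer is now owned by this thread ...
 *	pagebuf_unlock(bp);
 */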
1075 
1076 #if defined(DEBUG) || defined(XFS_BLI_TRACE)
1077 /*
1078  *	pagebuf_lock_value
1079  *
1080  *	Return lock value for a pagebuf
1081  */
1082 int
1083 pagebuf_lock_value(
1084 	xfs_buf_t		*pb)
1085 {
1086 	return(atomic_read(&pb->pb_sema.count));
1087 }
1088 #endif
1089 
1090 /*
1091  *	pagebuf_lock
1092  *
1093  *	pagebuf_lock locks a buffer object.  Note that this in no way
1094  *	locks the underlying pages, so it is only useful for synchronizing
1095  *	concurrent use of page buffer objects, not for synchronizing independent
1096  *	access to the underlying pages.
1097  */
1098 int
1099 pagebuf_lock(
1100 	xfs_buf_t		*pb)
1101 {
1102 	PB_TRACE(pb, "lock", 0);
1103 	if (atomic_read(&pb->pb_io_remaining))
1104 		run_task_queue(&tq_disk);
1105 	down(&pb->pb_sema);
1106 	PB_SET_OWNER(pb);
1107 	PB_TRACE(pb, "locked", 0);
1108 	return 0;
1109 }
1110 
1111 /*
1112  *	pagebuf_unlock
1113  *
1114  *	pagebuf_unlock releases the lock on the buffer object created by
1115  *	pagebuf_lock or pagebuf_cond_lock (not any
1116  *	pinning of underlying pages created by pagebuf_pin).
1117  */
1118 void
1119 pagebuf_unlock(				/* unlock buffer		*/
1120 	xfs_buf_t		*pb)	/* buffer to unlock		*/
1121 {
1122 	PB_CLEAR_OWNER(pb);
1123 	up(&pb->pb_sema);
1124 	PB_TRACE(pb, "unlock", 0);
1125 }
1126 
1127 
1128 /*
1129  *	Pinning Buffer Storage in Memory
1130  */
1131 
1132 /*
1133  *	pagebuf_pin
1134  *
1135  *	pagebuf_pin locks all of the memory represented by a buffer in
1136  *	memory.  Multiple calls to pagebuf_pin and pagebuf_unpin, for
1137  *	the same or different buffers affecting a given page, will
1138  *	properly count the number of outstanding "pin" requests.  The
1139  *	buffer may be released after the pagebuf_pin and a different
1140  *	buffer used when calling pagebuf_unpin, if desired.
1141  *	pagebuf_pin should be used by the file system when it wants to be
1142  *	assured that no attempt will be made to force the affected
1143  *	memory to disk.	 It does not assure that a given logical page
1144  *	will not be moved to a different physical page.
1145  */
1146 void
1147 pagebuf_pin(
1148 	xfs_buf_t		*pb)
1149 {
1150 	atomic_inc(&pb->pb_pin_count);
1151 	PB_TRACE(pb, "pin", (long)pb->pb_pin_count.counter);
1152 }
1153 
1154 /*
1155  *	pagebuf_unpin
1156  *
1157  *	pagebuf_unpin reverses the locking of memory performed by
1158  *	pagebuf_pin.  Note that both functions affect the logical
1159  *	pages associated with the buffer, not the buffer itself.
1160  */
1161 void
1162 pagebuf_unpin(
1163 	xfs_buf_t		*pb)
1164 {
1165 	if (atomic_dec_and_test(&pb->pb_pin_count)) {
1166 		wake_up_all(&pb->pb_waiters);
1167 	}
1168 	PB_TRACE(pb, "unpin", (long)pb->pb_pin_count.counter);
1169 }
1170 
1171 int
1172 pagebuf_ispin(
1173 	xfs_buf_t		*pb)
1174 {
1175 	return atomic_read(&pb->pb_pin_count);
1176 }
1177 
1178 /*
1179  *	pagebuf_wait_unpin
1180  *
1181  *	pagebuf_wait_unpin waits until all of the memory associated
1182  *	with the buffer is no longer locked in memory.  It returns
1183  *	immediately if none of the affected pages are locked.
1184  */
1185 static inline void
1186 _pagebuf_wait_unpin(
1187 	xfs_buf_t		*pb)
1188 {
1189 	DECLARE_WAITQUEUE	(wait, current);
1190 
1191 	if (atomic_read(&pb->pb_pin_count) == 0)
1192 		return;
1193 
1194 	add_wait_queue(&pb->pb_waiters, &wait);
1195 	for (;;) {
1196 		set_current_state(TASK_UNINTERRUPTIBLE);
1197 		if (atomic_read(&pb->pb_pin_count) == 0)
1198 			break;
1199 		if (atomic_read(&pb->pb_io_remaining))
1200 			run_task_queue(&tq_disk);
1201 		schedule();
1202 	}
1203 	remove_wait_queue(&pb->pb_waiters, &wait);
1204 	set_current_state(TASK_RUNNING);
1205 }
1206 
1207 
1208 /*
1209  *	Buffer Utility Routines
1210  */
1211 
1212 /*
1213  *	pagebuf_iodone
1214  *
1215  *	pagebuf_iodone marks a buffer for which I/O is in progress
1216  *	done with respect to that I/O.	The pb_iodone routine, if
1217  *	present, will be called as a side-effect.
1218  */
1219 void
1220 pagebuf_iodone_sched(
1221 	void			*v)
1222 {
1223 	xfs_buf_t		*bp = (xfs_buf_t *)v;
1224 
1225 	if (bp->pb_iodone)
1226 		(*(bp->pb_iodone))(bp);
1227 	else if (bp->pb_flags & PBF_ASYNC)
1228 		xfs_buf_relse(bp);
1229 }
1230 
1231 void
1232 pagebuf_iodone(
1233 	xfs_buf_t		*pb,
1234 	int			dataio,
1235 	int			schedule)
1236 {
1237 	pb->pb_flags &= ~(PBF_READ | PBF_WRITE);
1238 	if (pb->pb_error == 0) {
1239 		pb->pb_flags &= ~(PBF_PARTIAL | PBF_NONE);
1240 	}
1241 
1242 	PB_TRACE(pb, "iodone", pb->pb_iodone);
1243 
1244 	if ((pb->pb_iodone) || (pb->pb_flags & PBF_ASYNC)) {
1245 		if (schedule) {
1246 			int	daemon = CPU_TO_DAEMON(smp_processor_id());
1247 
1248 			INIT_TQUEUE(&pb->pb_iodone_sched,
1249 				pagebuf_iodone_sched, (void *)pb);
1250 			queue_task(&pb->pb_iodone_sched, dataio ?
1251 				&pagebuf_dataiodone_tq[daemon] :
1252 				&pagebuf_logiodone_tq[daemon]);
1253 			wake_up(dataio ?
1254 				&pagebuf_dataiodone_wait[daemon] :
1255 				&pagebuf_logiodone_wait[daemon]);
1256 		} else {
1257 			pagebuf_iodone_sched(pb);
1258 		}
1259 	} else {
1260 		up(&pb->pb_iodonesema);
1261 	}
1262 }
1263 
1264 /*
1265  *	pagebuf_ioerror
1266  *
1267  *	pagebuf_ioerror sets the error code for a buffer.
1268  */
1269 void
1270 pagebuf_ioerror(			/* mark/clear buffer error flag */
1271 	xfs_buf_t		*pb,	/* buffer to mark		*/
1272 	int			error)	/* error to store (0 if none)	*/
1273 {
1274 	ASSERT(error >= 0 && error <= 0xffff);
1275 	pb->pb_error = (unsigned short)error;
1276 	PB_TRACE(pb, "ioerror", (unsigned long)error);
1277 }
1278 
1279 /*
1280  *	pagebuf_iostart
1281  *
1282  *	pagebuf_iostart initiates I/O on a buffer, based on the flags supplied.
1283  *	If necessary, it will arrange for any disk space allocation required,
1284  *	and it will break up the request if the block mappings require it.
1285  *	The pb_iodone routine in the buffer supplied will only be called
1286  *	when all of the subsidiary I/O requests, if any, have been completed.
1287  *	pagebuf_iostart calls the pagebuf_ioinitiate routine or
1288  *	pagebuf_iorequest, if the former routine is not defined, to start
1289  *	the I/O on a given low-level request.
1290  */
1291 int
1292 pagebuf_iostart(			/* start I/O on a buffer	  */
1293 	xfs_buf_t		*pb,	/* buffer to start		  */
1294 	page_buf_flags_t	flags)	/* PBF_LOCK, PBF_ASYNC, PBF_READ, */
1295 					/* PBF_WRITE, PBF_DELWRI,	  */
1296 					/* PBF_DONT_BLOCK		  */
1297 {
1298 	int			status = 0;
1299 
1300 	PB_TRACE(pb, "iostart", (unsigned long)flags);
1301 
1302 	if (flags & PBF_DELWRI) {
1303 		pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC);
1304 		pb->pb_flags |= flags & (PBF_DELWRI | PBF_ASYNC);
1305 		pagebuf_delwri_queue(pb, 1);
1306 		return status;
1307 	}
1308 
1309 	pb->pb_flags &= ~(PBF_READ | PBF_WRITE | PBF_ASYNC | PBF_DELWRI | \
1310 			PBF_READ_AHEAD | _PBF_RUN_QUEUES);
1311 	pb->pb_flags |= flags & (PBF_READ | PBF_WRITE | PBF_ASYNC | \
1312 			PBF_READ_AHEAD | _PBF_RUN_QUEUES);
1313 
1314 	BUG_ON(pb->pb_bn == XFS_BUF_DADDR_NULL);
1315 
1316 	/* For writes allow an alternate strategy routine to precede
1317 	 * the actual I/O request (which may not be issued at all in
1318 	 * a shutdown situation, for example).
1319 	 */
1320 	status = (flags & PBF_WRITE) ?
1321 		pagebuf_iostrategy(pb) : pagebuf_iorequest(pb);
1322 
1323 	/* Wait for I/O if we are not an async request.
1324 	 * Note: async I/O request completion will release the buffer,
1325 	 * and that can already be done by this point.  So using the
1326 	 * buffer pointer from here on, after async I/O, is invalid.
1327 	 */
1328 	if (!status && !(flags & PBF_ASYNC))
1329 		status = pagebuf_iowait(pb);
1330 
1331 	return status;
1332 }
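
/*
 * Illustrative sketch of the three main submission modes (comment
 * only, not compiled):
 *
 *	error = pagebuf_iostart(bp, PBF_WRITE);
 *		synchronous - waits in pagebuf_iowait, returns pb_error
 *	(void) pagebuf_iostart(bp, PBF_WRITE | PBF_ASYNC);
 *		asynchronous - completion may release the buffer, so bp
 *		must not be touched afterwards
 *	(void) pagebuf_iostart(bp, PBF_DELWRI | PBF_ASYNC);
 *		delayed write - only queued on the delwri list, no I/O
 *		is issued here
 */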
1333 
1334 
1335 /*
1336  * Helper routines for pagebuf_iorequest (pagebuf I/O completion)
1337  */
1338 
1339 STATIC __inline__ int
1340 _pagebuf_iolocked(
1341 	xfs_buf_t		*pb)
1342 {
1343 	ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE));
1344 	if (pb->pb_target->pbr_bsize < PAGE_CACHE_SIZE)
1345 		return pb->pb_locked;
1346 	if (pb->pb_flags & PBF_READ)
1347 		return pb->pb_locked;
1348 	return (pb->pb_flags & _PBF_PAGE_CACHE);
1349 }
1350 
1351 STATIC void
1352 _pagebuf_iodone(
1353 	xfs_buf_t		*pb,
1354 	int			schedule)
1355 {
1356 	int			i;
1357 
1358 	if (atomic_dec_and_test(&pb->pb_io_remaining) != 1)
1359 		return;
1360 
1361 	if (_pagebuf_iolocked(pb))
1362 		for (i = 0; i < pb->pb_page_count; i++)
1363 			unlock_page(pb->pb_pages[i]);
1364 	pb->pb_locked = 0;
1365 	pagebuf_iodone(pb, (pb->pb_flags & PBF_FS_DATAIOD), schedule);
1366 }
1367 
1368 STATIC void
1369 _end_io_pagebuf(
1370 	struct buffer_head	*bh,
1371 	int			uptodate,
1372 	int			fullpage)
1373 {
1374 	struct page		*page = bh->b_page;
1375 	xfs_buf_t		*pb = (xfs_buf_t *)bh->b_private;
1376 
1377 	mark_buffer_uptodate(bh, uptodate);
1378 	put_bh(bh);
1379 
1380 	if (!uptodate) {
1381 		SetPageError(page);
1382 		pb->pb_error = EIO;
1383 	}
1384 
1385 	if (fullpage) {
1386 		unlock_buffer(bh);
1387 		_pagebuf_free_bh(bh);
1388 		if (!PageError(page))
1389 			SetPageUptodate(page);
1390 	} else {
1391 		static spinlock_t page_uptodate_lock = SPIN_LOCK_UNLOCKED;
1392 		struct buffer_head *bp;
1393 		unsigned long flags;
1394 
1395 		ASSERT(PageLocked(page));
1396 		spin_lock_irqsave(&page_uptodate_lock, flags);
1397 		clear_buffer_async(bh);
1398 		unlock_buffer(bh);
1399 		for (bp = bh->b_this_page; bp != bh; bp = bp->b_this_page) {
1400 			if (buffer_locked(bp)) {
1401 				if (buffer_async(bp))
1402 					break;
1403 			} else if (!buffer_uptodate(bp))
1404 				break;
1405 		}
1406 		spin_unlock_irqrestore(&page_uptodate_lock, flags);
1407 		if (bp == bh && !PageError(page))
1408 			SetPageUptodate(page);
1409 	}
1410 
1411 	_pagebuf_iodone(pb, 1);
1412 }
1413 
1414 STATIC void
1415 _pagebuf_end_io_complete_pages(
1416 	struct buffer_head	*bh,
1417 	int			uptodate)
1418 {
1419 	_end_io_pagebuf(bh, uptodate, 1);
1420 }
1421 
1422 STATIC void
1423 _pagebuf_end_io_partial_pages(
1424 	struct buffer_head	*bh,
1425 	int			uptodate)
1426 {
1427 	_end_io_pagebuf(bh, uptodate, 0);
1428 }
1429 
1430 /*
1431  *	Handling of buftargs.
1432  */
1433 
1434 /*
1435  * Wait for any bufs with callbacks that have been submitted but
1436  * have not yet returned... walk the hash list for the target.
1437  */
1438 void
1439 xfs_wait_buftarg(
1440 	xfs_buftarg_t *target)
1441 {
1442 	xfs_buf_t	*pb, *n;
1443 	pb_hash_t	*h;
1444 	int		i;
1445 
1446 	for (i = 0; i < NHASH; i++) {
1447 		h = &pbhash[i];
1448 again:
1449 		spin_lock(&h->pb_hash_lock);
1450 		list_for_each_entry_safe(pb, n, &h->pb_hash, pb_hash_list) {
1451 			if (pb->pb_target == target &&
1452 					!(pb->pb_flags & PBF_FS_MANAGED)) {
1453 				spin_unlock(&h->pb_hash_lock);
1454 				delay(100);
1455 				goto again;
1456 			}
1457 		}
1458 		spin_unlock(&h->pb_hash_lock);
1459 	}
1460 }
1461 
1462 void
1463 xfs_free_buftarg(
1464 	xfs_buftarg_t		*btp,
1465 	int			external)
1466 {
1467 	xfs_flush_buftarg(btp, 1);
1468 	if (external)
1469 		xfs_blkdev_put(btp->pbr_bdev);
1470 	iput(btp->pbr_mapping->host);
1471 	kmem_free(btp, sizeof(*btp));
1472 }
1473 
1474 void
1475 xfs_incore_relse(
1476 	xfs_buftarg_t		*btp,
1477 	int			delwri_only,
1478 	int			wait)
1479 {
1480 	destroy_buffers(btp->pbr_kdev);
1481 	truncate_inode_pages(btp->pbr_mapping, 0LL);
1482 }
1483 
1484 int
1485 xfs_setsize_buftarg(
1486 	xfs_buftarg_t		*btp,
1487 	unsigned int		blocksize,
1488 	unsigned int		sectorsize)
1489 {
1490 	btp->pbr_bsize = blocksize;
1491 	btp->pbr_sshift = ffs(sectorsize) - 1;
1492 	btp->pbr_smask = sectorsize - 1;
1493 
1494 	if (set_blocksize(btp->pbr_kdev, sectorsize)) {
1495 		printk(KERN_WARNING
1496 			"XFS: Cannot set_blocksize to %u on device 0x%x\n",
1497 			sectorsize, kdev_t_to_nr(btp->pbr_kdev));
1498 		return EINVAL;
1499 	}
1500 	return 0;
1501 }
1502 
1503 STATIC int
1504 xfs_mapping_buftarg(
1505 	xfs_buftarg_t		*btp,
1506 	struct block_device	*bdev)
1507 {
1508 	kdev_t			kdev;
1509 	struct inode		*inode;
1510 	struct address_space	*mapping;
1511 	static struct address_space_operations mapping_aops = {
1512 		.sync_page = block_sync_page,
1513 	};
1514 
1515 	kdev = to_kdev_t(bdev->bd_dev);
1516 	inode = new_inode(bdev->bd_inode->i_sb);
1517 	if (!inode) {
1518 		printk(KERN_WARNING
1519 			"XFS: Cannot allocate mapping inode for device %s\n",
1520 			XFS_BUFTARG_NAME(btp));
1521 		return ENOMEM;
1522 	}
1523 	inode->i_mode = S_IFBLK;
1524 	inode->i_dev  = kdev;
1525 	inode->i_rdev = kdev;
1526 	inode->i_bdev = bdev;
1527 	mapping = &inode->i_data;
1528 	mapping->a_ops = &mapping_aops;
1529 	mapping->gfp_mask = GFP_KERNEL;
1530 	btp->pbr_mapping = mapping;
1531 	return 0;
1532 }
1533 
1534 xfs_buftarg_t *
1535 xfs_alloc_buftarg(
1536 	struct block_device	*bdev)
1537 {
1538 	xfs_buftarg_t		*btp;
1539 	kdev_t			kdev;
1540 
1541 	btp = kmem_zalloc(sizeof(*btp), KM_SLEEP);
1542 
1543 	kdev = to_kdev_t(bdev->bd_dev);
1544 	btp->pbr_dev =  bdev->bd_dev;
1545 	btp->pbr_kdev = kdev;
1546 	btp->pbr_bdev = bdev;
1547 	switch (MAJOR(btp->pbr_dev)) {
1548 	case MD_MAJOR:
1549 	case EVMS_MAJOR:
1550 		btp->pbr_flags = PBR_ALIGNED_ONLY;
1551 		break;
1552 	case LOOP_MAJOR:
1553 	case LVM_BLK_MAJOR:
1554 		btp->pbr_flags = PBR_SECTOR_ONLY;
1555 		break;
1556 	}
1557 	if (xfs_setsize_buftarg(btp, PAGE_CACHE_SIZE, get_hardsect_size(kdev)))
1558 		goto error;
1559 	if (xfs_mapping_buftarg(btp, bdev))
1560 		goto error;
1561 	return btp;
1562 
1563 error:
1564 	kmem_free(btp, sizeof(*btp));
1565 	return NULL;
1566 }
1567 
1568 /*
1569  * Initiate I/O on part of a page we are interested in
1570  */
1571 STATIC int
1572 _pagebuf_page_io(
1573 	struct page		*page,	/* Page structure we are dealing with */
1574 	xfs_buftarg_t		*pbr,	/* device parameters (bsz, ssz, dev) */
1575 	xfs_buf_t		*pb,	/* pagebuf holding it, can be NULL */
1576 	xfs_daddr_t		bn,	/* starting block number */
1577 	size_t			pg_offset,	/* starting offset in page */
1578 	size_t			pg_length,	/* count of data to process */
1579 	int			rw,	/* read/write operation */
1580 	int			flush)
1581 {
1582 	size_t			sector;
1583 	size_t			blk_length = 0;
1584 	struct buffer_head	*bh, *head, *bufferlist[MAX_BUF_PER_PAGE];
1585 	int			sector_shift = pbr->pbr_sshift;
1586 	int			i = 0, cnt = 0;
1587 	int			public_bh = 0;
1588 	int			multi_ok;
1589 
1590 	if ((pbr->pbr_bsize < PAGE_CACHE_SIZE) &&
1591 	    !(pb->pb_flags & _PBF_PRIVATE_BH)) {
1592 		int		cache_ok;
1593 
1594 		cache_ok = !((pb->pb_flags & PBF_FORCEIO) || (rw == WRITE));
1595 		public_bh = multi_ok = 1;
1596 		sector = 1 << sector_shift;
1597 
1598 		ASSERT(PageLocked(page));
1599 		if (!page_has_buffers(page))
1600 			create_empty_buffers(page, pbr->pbr_kdev, sector);
1601 
1602 		i = sector >> BBSHIFT;
1603 		bn -= (pg_offset >> BBSHIFT);
1604 
1605 		/* Find buffer_heads belonging to just this pagebuf */
1606 		bh = head = page_buffers(page);
1607 		do {
1608 			if (buffer_uptodate(bh) && cache_ok)
1609 				continue;
1610 			if (blk_length < pg_offset)
1611 				continue;
1612 			if (blk_length >= pg_offset + pg_length)
1613 				break;
1614 
1615 			lock_buffer(bh);
1616 			get_bh(bh);
1617 			bh->b_size = sector;
1618 			bh->b_blocknr = bn;
1619 			bufferlist[cnt++] = bh;
1620 
1621 		} while ((bn += i),
1622 			 (blk_length += sector),
1623 			  (bh = bh->b_this_page) != head);
1624 
1625 		goto request;
1626 	}
1627 
1628 	/* Calculate the block offsets and length we will be using */
1629 	if (pg_offset) {
1630 		size_t		block_offset;
1631 
1632 		block_offset = pg_offset >> sector_shift;
1633 		block_offset = pg_offset - (block_offset << sector_shift);
1634 		blk_length = (pg_length + block_offset + pbr->pbr_smask) >>
1635 								sector_shift;
1636 	} else {
1637 		blk_length = (pg_length + pbr->pbr_smask) >> sector_shift;
1638 	}
1639 
1640 	/* This will attempt to make a request bigger than the sector
1641 	 * size if we are well aligned.
1642 	 */
1643 	switch (pb->pb_target->pbr_flags) {
1644 	case 0:
1645 		sector = blk_length << sector_shift;
1646 		blk_length = 1;
1647 		break;
1648 	case PBR_ALIGNED_ONLY:
1649 		if ((pg_offset == 0) && (pg_length == PAGE_CACHE_SIZE) &&
1650 		    (((unsigned int) bn) & BN_ALIGN_MASK) == 0) {
1651 			sector = blk_length << sector_shift;
1652 			blk_length = 1;
1653 			break;
1654 		}
1655 	case PBR_SECTOR_ONLY:
1656 		/* Fallthrough, same as default */
1657 	default:
1658 		sector = 1 << sector_shift;
1659 	}
1660 
1661 	/* If we are doing I/O larger than the bh->b_size field then
1662 	 * we need to split this request up.
1663 	 */
1664 	while (sector > ((1ULL << NBBY * sizeof(bh->b_size)) - 1)) {
1665 		sector >>= 1;
1666 		blk_length++;
1667 	}
1668 
1669 	multi_ok = (blk_length != 1);
1670 	i = sector >> BBSHIFT;
1671 
1672 	for (; blk_length > 0; bn += i, blk_length--, pg_offset += sector) {
1673 		bh = kmem_cache_alloc(bh_cachep, SLAB_NOFS);
1674 		if (!bh)
1675 			bh = _pagebuf_get_prealloc_bh();
1676 		memset(bh, 0, sizeof(*bh));
1677 		bh->b_blocknr = bn;
1678 		bh->b_size = sector;
1679 		bh->b_dev = pbr->pbr_kdev;
1680 		set_buffer_locked(bh);
1681 		set_bh_page(bh, page, pg_offset);
1682 		init_waitqueue_head(&bh->b_wait);
1683 		atomic_set(&bh->b_count, 1);
1684 		bufferlist[cnt++] = bh;
1685 	}
1686 
1687 request:
1688 	if (cnt) {
1689 		void	(*callback)(struct buffer_head *, int);
1690 
1691 		callback = (multi_ok && public_bh) ?
1692 				_pagebuf_end_io_partial_pages :
1693 				_pagebuf_end_io_complete_pages;
1694 
1695 		/* Account for additional buffers in progress */
1696 		atomic_add(cnt, &pb->pb_io_remaining);
1697 
1698 #ifdef RQ_WRITE_ORDERED
1699 		if (flush)
1700 			set_bit(BH_Ordered_Flush, &bufferlist[cnt-1]->b_state);
1701 #endif
1702 
1703 		for (i = 0; i < cnt; i++) {
1704 			bh = bufferlist[i];
1705 			init_buffer(bh, callback, pb);
1706 			bh->b_rdev = bh->b_dev;
1707 			bh->b_rsector = bh->b_blocknr;
1708 			set_buffer_mapped(bh);
1709 			set_buffer_async(bh);
1710 			set_buffer_req(bh);
1711 			if (rw == WRITE)
1712 				set_buffer_uptodate(bh);
1713 			generic_make_request(rw, bh);
1714 		}
1715 		return 0;
1716 	}
1717 
1718 	/*
1719 	 * We have no I/O to submit; let the caller know that
1720 	 * we have skipped over this page entirely.
1721 	 */
1722 	return 1;
1723 }
1724 
1725 STATIC void
1726 _pagebuf_page_apply(
1727 	xfs_buf_t		*pb,
1728 	loff_t			offset,
1729 	struct page		*page,
1730 	size_t			pg_offset,
1731 	size_t			pg_length,
1732 	int			last)
1733 {
1734 	xfs_daddr_t		bn = pb->pb_bn;
1735 	xfs_buftarg_t		*pbr = pb->pb_target;
1736 	loff_t			pb_offset;
1737 	int			status, locking;
1738 
1739 	ASSERT(page);
1740 	ASSERT(pb->pb_flags & (PBF_READ|PBF_WRITE));
1741 
1742 	if ((pbr->pbr_bsize == PAGE_CACHE_SIZE) &&
1743 	    (pb->pb_buffer_length < PAGE_CACHE_SIZE) &&
1744 	    (pb->pb_flags & PBF_READ) && pb->pb_locked) {
1745 		bn -= (pb->pb_offset >> BBSHIFT);
1746 		pg_offset = 0;
1747 		pg_length = PAGE_CACHE_SIZE;
1748 	} else {
1749 		pb_offset = offset - pb->pb_file_offset;
1750 		if (pb_offset) {
1751 			bn += (pb_offset + BBMASK) >> BBSHIFT;
1752 		}
1753 	}
1754 
1755 	locking = _pagebuf_iolocked(pb);
1756 	if (pb->pb_flags & PBF_WRITE) {
1757 		if (locking && !pb->pb_locked)
1758 			lock_page(page);
1759 		status = _pagebuf_page_io(page, pbr, pb, bn,
1760 				pg_offset, pg_length, WRITE,
1761 				last && (pb->pb_flags & PBF_FLUSH));
1762 	} else {
1763 		status = _pagebuf_page_io(page, pbr, pb, bn,
1764 				pg_offset, pg_length, READ, 0);
1765 	}
1766 	if (status && locking && !(pb->pb_target->pbr_bsize < PAGE_CACHE_SIZE))
1767 		unlock_page(page);
1768 }
1769 
1770 /*
1771  *	pagebuf_iorequest -- the core I/O request routine.
1772  */
1773 int
1774 pagebuf_iorequest(			/* start real I/O		*/
1775 	xfs_buf_t		*pb)	/* buffer to convey to device	*/
1776 {
1777 	PB_TRACE(pb, "iorequest", 0);
1778 
1779 	if (pb->pb_flags & PBF_DELWRI) {
1780 		pagebuf_delwri_queue(pb, 1);
1781 		return 0;
1782 	}
1783 
1784 	if (pb->pb_flags & PBF_WRITE) {
1785 		_pagebuf_wait_unpin(pb);
1786 	}
1787 
1788 	pagebuf_hold(pb);
1789 
1790 	/* Set the count to 1 initially; this will stop an I/O
1791 	 * completion callout which happens before we have started
1792 	 * all the I/O from calling pagebuf_iodone too early.
1793 	 */
1794 	atomic_set(&pb->pb_io_remaining, 1);
1795 	_pagebuf_ioapply(pb);
1796 	_pagebuf_iodone(pb, 0);
1797 
1798 	pagebuf_rele(pb);
1799 	return 0;
1800 }
1801 
1802 /*
1803  *	pagebuf_iowait
1804  *
1805  *	pagebuf_iowait waits for I/O to complete on the buffer supplied.
1806  *	It returns immediately if no I/O is pending.  In any case, it returns
1807  *	the error code, if any, or 0 if there is no error.
1808  */
1809 int
1810 pagebuf_iowait(
1811 	xfs_buf_t		*pb)
1812 {
1813 	PB_TRACE(pb, "iowait", 0);
1814 	if (atomic_read(&pb->pb_io_remaining))
1815 		run_task_queue(&tq_disk);
1816 	if ((pb->pb_flags & PBF_FS_DATAIOD))
1817 		pagebuf_runall_queues(pagebuf_dataiodone_tq);
1818 	down(&pb->pb_iodonesema);
1819 	PB_TRACE(pb, "iowaited", (long)pb->pb_error);
1820 	return pb->pb_error;
1821 }
1822 
1823 caddr_t
1824 pagebuf_offset(
1825 	xfs_buf_t		*pb,
1826 	size_t			offset)
1827 {
1828 	struct page		*page;
1829 
1830 	offset += pb->pb_offset;
1831 
1832 	page = pb->pb_pages[offset >> PAGE_CACHE_SHIFT];
1833 	return (caddr_t) page_address(page) + (offset & (PAGE_CACHE_SIZE - 1));
1834 }
1835 
1836 /*
1837  *	pagebuf_iomove
1838  *
1839  *	Move data into or out of a buffer.
1840  */
1841 void
1842 pagebuf_iomove(
1843 	xfs_buf_t		*pb,	/* buffer to process		*/
1844 	size_t			boff,	/* starting buffer offset	*/
1845 	size_t			bsize,	/* length to copy		*/
1846 	caddr_t			data,	/* data address			*/
1847 	page_buf_rw_t		mode)	/* read/write flag		*/
1848 {
1849 	size_t			bend, cpoff, csize;
1850 	struct page		*page;
1851 
1852 	bend = boff + bsize;
1853 	while (boff < bend) {
1854 		page = pb->pb_pages[page_buf_btoct(boff + pb->pb_offset)];
1855 		cpoff = page_buf_poff(boff + pb->pb_offset);
1856 		csize = min_t(size_t,
1857 			      PAGE_CACHE_SIZE-cpoff, pb->pb_count_desired-boff);
1858 
1859 		ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE));
1860 
1861 		switch (mode) {
1862 		case PBRW_ZERO:
1863 			memset(page_address(page) + cpoff, 0, csize);
1864 			break;
1865 		case PBRW_READ:
1866 			memcpy(data, page_address(page) + cpoff, csize);
1867 			break;
1868 		case PBRW_WRITE:
1869 			memcpy(page_address(page) + cpoff, data, csize);
1870 		}
1871 
1872 		boff += csize;
1873 		data += csize;
1874 	}
1875 }
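
/*
 * Illustrative usage sketch (comment only, not compiled; "hdr" is a
 * hypothetical caller structure): copy a header into the start of a
 * buffer, or back out of it, page by page, whether or not the buffer
 * has a mapped pb_addr.
 *
 *	pagebuf_iomove(bp, 0, sizeof(hdr), (caddr_t)&hdr, PBRW_WRITE);
 *	pagebuf_iomove(bp, 0, sizeof(hdr), (caddr_t)&hdr, PBRW_READ);
 */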
1876 
1877 /*
1878  *	_pagebuf_ioapply
1879  *
1880  *	Applies _pagebuf_page_apply to each page of the xfs_buf_t.
1881  */
1882 STATIC void
1883 _pagebuf_ioapply(			/* apply function to pages	*/
1884 	xfs_buf_t		*pb)	/* buffer to examine		*/
1885 {
1886 	int			index;
1887 	loff_t			buffer_offset = pb->pb_file_offset;
1888 	size_t			buffer_len = pb->pb_count_desired;
1889 	size_t			page_offset, len;
1890 	size_t			cur_offset, cur_len;
1891 
1892 	cur_offset = pb->pb_offset;
1893 	cur_len = buffer_len;
1894 
1895 	if (!pb->pb_locked && !(pb->pb_flags & PBF_DIRECTIO) &&
1896 	    (pb->pb_target->pbr_bsize < PAGE_CACHE_SIZE)) {
1897 		for (index = 0; index < pb->pb_page_count; index++)
1898 			lock_page(pb->pb_pages[index]);
1899 		pb->pb_locked = 1;
1900 	}
1901 
1902 	for (index = 0; index < pb->pb_page_count; index++) {
1903 		if (cur_len == 0)
1904 			break;
1905 		if (cur_offset >= PAGE_CACHE_SIZE) {
1906 			cur_offset -= PAGE_CACHE_SIZE;
1907 			continue;
1908 		}
1909 
1910 		page_offset = cur_offset;
1911 		cur_offset = 0;
1912 
1913 		len = PAGE_CACHE_SIZE - page_offset;
1914 		if (len > cur_len)
1915 			len = cur_len;
1916 		cur_len -= len;
1917 
1918 		_pagebuf_page_apply(pb, buffer_offset,
1919 				pb->pb_pages[index], page_offset, len,
1920 				index + 1 == pb->pb_page_count);
1921 		buffer_offset += len;
1922 		buffer_len -= len;
1923 	}
1924 
1925 	/*
1926 	 * Run the block device task queue here, while we have
1927 	 * a hold on the pagebuf (important to have that hold).
1928 	 */
1929 	if (pb->pb_flags & _PBF_RUN_QUEUES) {
1930 		pb->pb_flags &= ~_PBF_RUN_QUEUES;
1931 		if (atomic_read(&pb->pb_io_remaining) > 1)
1932 			run_task_queue(&tq_disk);
1933 	}
1934 }
1935 
1936 
1937 /*
1938  * Delayed write buffer list handling
1939  */
1940 
1941 STATIC LIST_HEAD(pbd_delwrite_queue);
1942 STATIC spinlock_t pbd_delwrite_lock = SPIN_LOCK_UNLOCKED;
1943 
1944 STATIC void
1945 pagebuf_delwri_queue(
1946 	xfs_buf_t		*pb,
1947 	int			unlock)
1948 {
1949 	PB_TRACE(pb, "delwri_q", (long)unlock);
1950 	ASSERT(pb->pb_flags & PBF_DELWRI);
1951 
1952 	spin_lock(&pbd_delwrite_lock);
1953 	/* If already in the queue, dequeue and place at tail */
1954 	if (!list_empty(&pb->pb_list)) {
1955 		if (unlock)
1956 			atomic_dec(&pb->pb_hold);
1957 		list_del(&pb->pb_list);
1958 	}
1959 
1960 	list_add_tail(&pb->pb_list, &pbd_delwrite_queue);
1961 	pb->pb_queuetime = jiffies;
1962 	spin_unlock(&pbd_delwrite_lock);
1963 
1964 	if (unlock)
1965 		pagebuf_unlock(pb);
1966 }
1967 
1968 void
1969 pagebuf_delwri_dequeue(
1970 	xfs_buf_t		*pb)
1971 {
1972 	int			dequeued = 0;
1973 
1974 	spin_lock(&pbd_delwrite_lock);
1975 	if ((pb->pb_flags & PBF_DELWRI) && !list_empty(&pb->pb_list)) {
1976 		list_del_init(&pb->pb_list);
1977 		dequeued = 1;
1978 	}
1979 	pb->pb_flags &= ~PBF_DELWRI;
1980 	spin_unlock(&pbd_delwrite_lock);
1981 
1982 	if (dequeued)
1983 		pagebuf_rele(pb);
1984 
1985 	PB_TRACE(pb, "delwri_dq", (long)dequeued);
1986 }
1987 
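/*
 * Illustrative sketch (not compiled in): the delayed-write pattern the two
 * routines above expect.  The helper is hypothetical; the real queueing is
 * done from the pagebuf write paths, but the ordering shown - mark PBF_DELWRI
 * on a locked buffer, then let pagebuf_delwri_queue() drop the lock - matches
 * the assertions above.
 */
#if 0
STATIC void
pagebuf_delwri_example(
	xfs_buf_t		*bp)	/* locked, held buffer		*/
{
	bp->pb_flags |= PBF_DELWRI;	/* required by the ASSERT above	*/
	pagebuf_delwri_queue(bp, 1);	/* queue at the tail and unlock	*/

	/*
	 * A later synchronous write or invalidation must pull the buffer
	 * back off the delwri queue first:
	 */
	pagebuf_delwri_dequeue(bp);
}
#endif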
1988 
1989 /*
1990  * The pagebuf iodone daemons
1991  */
1992 
1993 STATIC int
1994 pagebuf_iodone_daemon(
1995 	void			*__bind_cpu,
1996 	const char		*name,
1997 	int			pagebuf_daemons[],
1998 	struct list_head	pagebuf_iodone_tq[],
1999 	wait_queue_head_t	pagebuf_iodone_wait[])
2000 {
2001 	int			bind_cpu, cpu;
2002 	DECLARE_WAITQUEUE	(wait, current);
2003 
2004 	bind_cpu = (int) (long)__bind_cpu;
2005 	cpu = CPU_TO_DAEMON(cpu_logical_map(bind_cpu));
2006 
2007 	/*  Set up the thread  */
2008 	daemonize();
2009 
2010 	/* Avoid signals */
2011 	sigmask_lock();
2012 	sigfillset(&current->blocked);
2013 	__recalc_sigpending(current);
2014 	sigmask_unlock();
2015 
2016 	/* Migrate to the right CPU */
2017 	migrate_to_cpu(cpu);
2018 #ifdef __HAVE_NEW_SCHEDULER
2019 	if (smp_processor_id() != cpu)
2020 		BUG();
2021 #else
2022 	while (smp_processor_id() != cpu)
2023 		schedule();
2024 #endif
2025 
2026 	sprintf(current->comm, "%s/%d", name, bind_cpu);
2027 	INIT_LIST_HEAD(&pagebuf_iodone_tq[cpu]);
2028 	init_waitqueue_head(&pagebuf_iodone_wait[cpu]);
2029 	__set_current_state(TASK_INTERRUPTIBLE);
2030 	mb();
2031 
2032 	pagebuf_daemons[cpu] = 1;
2033 
2034 	for (;;) {
2035 		add_wait_queue(&pagebuf_iodone_wait[cpu], &wait);
2036 
2037 		if (TQ_ACTIVE(pagebuf_iodone_tq[cpu]))
2038 			__set_task_state(current, TASK_RUNNING);
2039 		schedule();
2040 		remove_wait_queue(&pagebuf_iodone_wait[cpu], &wait);
2041 		run_task_queue(&pagebuf_iodone_tq[cpu]);
2042 		if (pagebuf_daemons[cpu] == 0)
2043 			break;
2044 		__set_current_state(TASK_INTERRUPTIBLE);
2045 	}
2046 
2047 	pagebuf_daemons[cpu] = -1;
2048 	wake_up_interruptible(&pagebuf_iodone_wait[cpu]);
2049 	return 0;
2050 }
2051 
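/*
 * Illustrative sketch (not compiled in): how an I/O completion path is
 * expected to hand work to one of the per-CPU iodone daemons above.  The
 * pb_iodone_sched tq_struct member is an assumption here; the real handoff
 * lives earlier in this file, but the mechanism is the standard 2.4 one:
 * queue the work element, then wake the daemon bound to this CPU.
 */
#if 0
STATIC void
pagebuf_iodone_handoff(
	xfs_buf_t		*bp)
{
	int			cpu = CPU_TO_DAEMON(smp_processor_id());

	queue_task(&bp->pb_iodone_sched, &pagebuf_dataiodone_tq[cpu]);
	wake_up(&pagebuf_dataiodone_wait[cpu]);
}
#endif
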
2052 STATIC void
2053 pagebuf_runall_queues(
2054 	struct list_head	pagebuf_iodone_tq[])
2055 {
2056 	int	pcpu, cpu;
2057 
2058 	for (cpu = 0; cpu < min(smp_num_cpus, MAX_IO_DAEMONS); cpu++) {
2059 		pcpu = CPU_TO_DAEMON(cpu_logical_map(cpu));
2060 
2061 		run_task_queue(&pagebuf_iodone_tq[pcpu]);
2062 	}
2063 }
2064 
2065 STATIC int
2066 pagebuf_logiodone_daemon(
2067 	void			*__bind_cpu)
2068 {
2069 	return pagebuf_iodone_daemon(__bind_cpu, "xfslogd", pb_logio_daemons,
2070 			pagebuf_logiodone_tq, pagebuf_logiodone_wait);
2071 }
2072 
2073 STATIC int
2074 pagebuf_dataiodone_daemon(
2075 	void			*__bind_cpu)
2076 {
2077 	return pagebuf_iodone_daemon(__bind_cpu, "xfsdatad", pb_dataio_daemons,
2078 			pagebuf_dataiodone_tq, pagebuf_dataiodone_wait);
2079 }
2080 
2081 
2082 /* Defines for pagebuf daemon */
2083 STATIC DECLARE_COMPLETION(pagebuf_daemon_done);
2084 STATIC struct task_struct *pagebuf_daemon_task;
2085 STATIC int pagebuf_daemon_active;
2086 STATIC int force_flush;
2087 
2088 
2089 STATIC int
2090 pagebuf_daemon_wakeup(
2091 	int			priority,
2092 	unsigned int		mask)
2093 {
2094 	force_flush = 1;
2095 	barrier();
2096 	wake_up_process(pagebuf_daemon_task);
2097 	return 0;
2098 }
2099 
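/*
 * xfsbufd: the delayed-write flush daemon.  Every xfs_buf_timer_centisecs
 * (converted to jiffies below) it walks the delwri queue, oldest entries
 * first, and pushes out buffers that have been queued for longer than
 * xfs_buf_age_centisecs, or everything when force_flush has been set by the
 * memory shaker.  For example, with HZ == 100 and settings of timer == 100
 * and age == 1500 centiseconds, the daemon wakes once per second and writes
 * out buffers that have sat on the queue for fifteen seconds or more.
 */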
2100 STATIC int
2101 pagebuf_daemon(
2102 	void			*data)
2103 {
2104 	struct list_head	tmp;
2105 	unsigned long		age;
2106 	xfs_buf_t		*pb, *n;
2107 	int			count;
2108 
2109 	/*  Set up the thread  */
2110 	daemonize();
2111 
2112 	/* Mark it active */
2113 	pagebuf_daemon_task = current;
2114 	pagebuf_daemon_active = 1;
2115 	barrier();
2116 
2117 	/* Avoid signals */
2118 	sigmask_lock();
2119 	sigfillset(&current->blocked);
2120 	__recalc_sigpending(current);
2121 	sigmask_unlock();
2122 
2123 	strcpy(current->comm, "xfsbufd");
2124 	current->flags |= PF_MEMALLOC;
2125 
2126 	INIT_LIST_HEAD(&tmp);
2127 	do {
2128 		set_current_state(TASK_INTERRUPTIBLE);
2129 		schedule_timeout((xfs_buf_timer_centisecs * HZ) / 100);
2130 
2131 		count = 0;
2132 		age = (xfs_buf_age_centisecs * HZ) / 100;
2133 		spin_lock(&pbd_delwrite_lock);
2134 		list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
2135 			PB_TRACE(pb, "walkq1", (long)pagebuf_ispin(pb));
2136 			ASSERT(pb->pb_flags & PBF_DELWRI);
2137 
2138 			if (!pagebuf_ispin(pb) && !pagebuf_cond_lock(pb)) {
2139 				if (!force_flush &&
2140 				    time_before(jiffies,
2141 						pb->pb_queuetime + age)) {
2142 					pagebuf_unlock(pb);
2143 					break;
2144 				}
2145 
2146 				pb->pb_flags &= ~PBF_DELWRI;
2147 				pb->pb_flags |= PBF_WRITE;
2148 				list_move(&pb->pb_list, &tmp);
2149 				count++;
2150 			}
2151 		}
2152 		spin_unlock(&pbd_delwrite_lock);
2153 
2154 		while (!list_empty(&tmp)) {
2155 			pb = list_entry(tmp.next, xfs_buf_t, pb_list);
2156 			list_del_init(&pb->pb_list);
2157 			pagebuf_iostrategy(pb);
2158 		}
2159 
2160 		if (as_list_len > 0)
2161 			purge_addresses();
2162 		if (count)
2163 			run_task_queue(&tq_disk);
2164 
2165 		force_flush = 0;
2166 	} while (pagebuf_daemon_active);
2167 
2168 	complete_and_exit(&pagebuf_daemon_done, 0);
2169 }
2170 
2171 /*
2172  * Flush all pending delayed-write buffers belonging to the given target,
2173  * optionally waiting for I/O completion; returns the number of pinned
2174  * buffers skipped.  Used in error handling to preserve metadata consistency.
2175  */
2176 int
2177 xfs_flush_buftarg(
2178 	xfs_buftarg_t		*target,
2179 	int			wait)
2180 {
2181 	struct list_head	tmp;
2182 	xfs_buf_t		*pb, *n;
2183 	int			pincount = 0;
2184 	int			flush_cnt = 0;
2185 
2186 	pagebuf_runall_queues(pagebuf_dataiodone_tq);
2187 	pagebuf_runall_queues(pagebuf_logiodone_tq);
2188 
2189 	INIT_LIST_HEAD(&tmp);
2190 	spin_lock(&pbd_delwrite_lock);
2191 	list_for_each_entry_safe(pb, n, &pbd_delwrite_queue, pb_list) {
2192 
2193 		if (pb->pb_target != target)
2194 			continue;
2195 
2196 		ASSERT(pb->pb_flags & PBF_DELWRI);
2197 		PB_TRACE(pb, "walkq2", (long)pagebuf_ispin(pb));
2198 		if (pagebuf_ispin(pb)) {
2199 			pincount++;
2200 			continue;
2201 		}
2202 
2203 		pb->pb_flags &= ~PBF_DELWRI;
2204 		pb->pb_flags |= PBF_WRITE;
2205 		list_move(&pb->pb_list, &tmp);
2206 	}
2207 	spin_unlock(&pbd_delwrite_lock);
2208 
2209 	/*
2210 	 * Dropped the delayed write list lock, now walk the temporary list
2211 	 */
2212 	list_for_each_entry_safe(pb, n, &tmp, pb_list) {
2213 
2214 		if (wait)
2215 			pb->pb_flags &= ~PBF_ASYNC;
2216 		else
2217 			list_del_init(&pb->pb_list);
2218 
2219 		pagebuf_lock(pb);
2220 		pagebuf_iostrategy(pb);
2221 
2222 		if (++flush_cnt > 32) {
2223 			run_task_queue(&tq_disk);
2224 			flush_cnt = 0;
2225 		}
2226 	}
2227 
2228 	run_task_queue(&tq_disk);
2229 
2230 	/*
2231 	 * Remaining list items must be flushed before returning
2232 	 */
2233 	while (!list_empty(&tmp)) {
2234 		pb = list_entry(tmp.next, xfs_buf_t, pb_list);
2235 
2236 		list_del_init(&pb->pb_list);
2237 
2238 		xfs_iowait(pb);
2239 		xfs_buf_relse(pb);
2240 	}
2241 
2242 	return pincount;
2243 }
2244 
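/*
 * Illustrative sketch (not compiled in): how an unmount-style caller might
 * use xfs_flush_buftarg().  A non-zero return means pinned buffers were
 * skipped, so the caller would typically force the log and retry; the
 * xfs_log_force() call is an assumption about that caller, not something
 * defined in this file.
 */
#if 0
STATIC void
xfs_flush_buftarg_example(
	xfs_mount_t		*mp,
	xfs_buftarg_t		*target)
{
	while (xfs_flush_buftarg(target, 1) != 0)
		xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE | XFS_LOG_SYNC);
}
#endif
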
2245 STATIC int
2246 pagebuf_daemon_start(void)
2247 {
2248 	int		cpu, pcpu;
2249 
2250 	kernel_thread(pagebuf_daemon, NULL, CLONE_FS|CLONE_FILES|CLONE_VM);
2251 
2252 	for (cpu = 0; cpu < min(smp_num_cpus, MAX_IO_DAEMONS); cpu++) {
2253 		pcpu = CPU_TO_DAEMON(cpu_logical_map(cpu));
2254 
2255 		if (kernel_thread(pagebuf_logiodone_daemon,
2256 				(void *)(long) cpu,
2257 				CLONE_FS|CLONE_FILES|CLONE_VM) < 0) {
2258 			printk("pagebuf_logiodone daemon failed to start\n");
2259 		} else {
2260 			while (!pb_logio_daemons[pcpu])
2261 				yield();
2262 		}
2263 	}
2264 	for (cpu = 0; cpu < min(smp_num_cpus, MAX_IO_DAEMONS); cpu++) {
2265 		pcpu = CPU_TO_DAEMON(cpu_logical_map(cpu));
2266 
2267 		if (kernel_thread(pagebuf_dataiodone_daemon,
2268 				(void *)(long) cpu,
2269 				CLONE_FS|CLONE_FILES|CLONE_VM) < 0) {
2270 			printk("pagebuf_dataiodone daemon failed to start\n");
2271 		} else {
2272 			while (!pb_dataio_daemons[pcpu])
2273 				yield();
2274 		}
2275 	}
2276 	return 0;
2277 }
2278 
2279 /*
2280  * pagebuf_daemon_stop
2281  *
2282  * Note: do not mark as __exit; it is called from pagebuf_terminate.
2283  */
2284 STATIC void
2285 pagebuf_daemon_stop(void)
2286 {
2287 	int		cpu, pcpu;
2288 
2289 	pagebuf_daemon_active = 0;
2290 	barrier();
2291 	wait_for_completion(&pagebuf_daemon_done);
2292 
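	/*
	 * Shut down the per-CPU iodone daemons.  The handshake uses the
	 * pb_*io_daemons[] slots: 1 means running, 0 asks the daemon to
	 * exit, and the daemon acknowledges by storing -1 and waking us.
	 */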
2293 	for (pcpu = 0; pcpu < min(smp_num_cpus, MAX_IO_DAEMONS); pcpu++) {
2294 		cpu = CPU_TO_DAEMON(cpu_logical_map(pcpu));
2295 
2296 		pb_logio_daemons[cpu] = 0;
2297 		wake_up(&pagebuf_logiodone_wait[cpu]);
2298 		wait_event_interruptible(pagebuf_logiodone_wait[cpu],
2299 				pb_logio_daemons[cpu] == -1);
2300 
2301 		pb_dataio_daemons[cpu] = 0;
2302 		wake_up(&pagebuf_dataiodone_wait[cpu]);
2303 		wait_event_interruptible(pagebuf_dataiodone_wait[cpu],
2304 				pb_dataio_daemons[cpu] == -1);
2305 	}
2306 }
2307 
2308 /*
2309  *	Initialization and Termination
2310  */
2311 
2312 int __init
2313 pagebuf_init(void)
2314 {
2315 	int			i;
2316 
2317 	pagebuf_cache = kmem_cache_create("xfs_buf_t", sizeof(xfs_buf_t), 0,
2318 			SLAB_HWCACHE_ALIGN, NULL, NULL);
2319 	if (pagebuf_cache == NULL) {
2320 		printk("XFS: couldn't init xfs_buf_t cache\n");
2321 		return -ENOMEM;
2322 	}
2323 
2324 	if (_pagebuf_prealloc_bh(NR_RESERVED_BH) < NR_RESERVED_BH) {
2325 		printk("XFS: couldn't allocate %d reserved buffers\n",
2326 			NR_RESERVED_BH);
2327 		kmem_zone_destroy(pagebuf_cache);
2328 		return -ENOMEM;
2329 	}
2330 	init_waitqueue_head(&pb_resv_bh_wait);
2331 
2332 #ifdef PAGEBUF_TRACE
2333 	pagebuf_trace_buf = ktrace_alloc(PAGEBUF_TRACE_SIZE, KM_SLEEP);
2334 #endif
2335 
2336 	pagebuf_daemon_start();
2337 
2338 	pagebuf_shake = kmem_shake_register(pagebuf_daemon_wakeup);
2339 	if (pagebuf_shake == NULL) {
2340 		pagebuf_terminate();
2341 		return -ENOMEM;
2342 	}
2343 
2344 	for (i = 0; i < NHASH; i++) {
2345 		spin_lock_init(&pbhash[i].pb_hash_lock);
2346 		INIT_LIST_HEAD(&pbhash[i].pb_hash);
2347 	}
2348 
2349 	return 0;
2350 }
2351 
2352 /*
2353  *	pagebuf_terminate.
2354  *
2355  * Note: do not mark as __exit; this is also called from the __init code.
2356  */
2357 void
2358 pagebuf_terminate(void)
2359 {
2360 	pagebuf_daemon_stop();
2361 
2362 #ifdef PAGEBUF_TRACE
2363 	ktrace_free(pagebuf_trace_buf);
2364 #endif
2365 
2366 	kmem_zone_destroy(pagebuf_cache);
2367 	kmem_shake_deregister(pagebuf_shake);
2368 }
2369
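
/*
 * Illustrative sketch (not compiled in): the expected pairing of
 * pagebuf_init() and pagebuf_terminate().  The example routines are
 * hypothetical; in practice it is the XFS module init path that calls
 * pagebuf_init() before registering the filesystem, and pagebuf_terminate()
 * on its error and exit paths.
 */
#if 0
STATIC int __init
pagebuf_example_init(void)
{
	int	error = pagebuf_init();

	if (error)
		return error;
	/* ... register the filesystem, sysctls, etc. ... */
	return 0;
}

STATIC void __exit
pagebuf_example_exit(void)
{
	pagebuf_terminate();
}
#endif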