1 /*
2 * Copyright (c) 2000-2004 Silicon Graphics, Inc. All Rights Reserved.
3 *
4 * This program is free software; you can redistribute it and/or modify it
5 * under the terms of version 2 of the GNU General Public License as
6 * published by the Free Software Foundation.
7 *
8 * This program is distributed in the hope that it would be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
11 *
12 * Further, this software is distributed without any warranty that it is
13 * free of the rightful claim of any third person regarding infringement
14 * or the like. Any license provided herein, whether implied or
15 * otherwise, applies only to this software file. Patent licenses, if
16 * any, provided herein do not apply to combinations of this program with
17 * other software, or any other product whatsoever.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write the Free Software Foundation, Inc., 59
21 * Temple Place - Suite 330, Boston MA 02111-1307, USA.
22 *
23 * Contact information: Silicon Graphics, Inc., 1600 Amphitheatre Pkwy,
24 * Mountain View, CA 94043, or:
25 *
26 * http://www.sgi.com
27 *
28 * For further information regarding this notice, see:
29 *
30 * http://oss.sgi.com/projects/GenInfo/SGIGPLNoticeExplan/
31 */
32
33 #include "xfs.h"
34 #include "xfs_inum.h"
35 #include "xfs_log.h"
36 #include "xfs_sb.h"
37 #include "xfs_dir.h"
38 #include "xfs_dir2.h"
39 #include "xfs_trans.h"
40 #include "xfs_dmapi.h"
41 #include "xfs_mount.h"
42 #include "xfs_bmap_btree.h"
43 #include "xfs_alloc_btree.h"
44 #include "xfs_ialloc_btree.h"
45 #include "xfs_alloc.h"
46 #include "xfs_btree.h"
47 #include "xfs_attr_sf.h"
48 #include "xfs_dir_sf.h"
49 #include "xfs_dir2_sf.h"
50 #include "xfs_dinode.h"
51 #include "xfs_inode.h"
52 #include "xfs_error.h"
53 #include "xfs_rw.h"
54 #include "xfs_iomap.h"
55 #include <linux/iobuf.h>
56
57 STATIC void xfs_count_page_state(struct page *, int *, int *, int *);
58 STATIC void xfs_convert_page(struct inode *, struct page *,
59 xfs_iomap_t *, void *, int, int);
60
61 #if defined(XFS_RW_TRACE)
62 void
63 xfs_page_trace(
64 int tag,
65 struct inode *inode,
66 struct page *page,
67 int mask)
68 {
69 xfs_inode_t *ip;
70 bhv_desc_t *bdp;
71 vnode_t *vp = LINVFS_GET_VP(inode);
72 loff_t isize = i_size_read(inode);
73 loff_t offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
74 int delalloc = -1, unmapped = -1, unwritten = -1;
75
76 if (page_has_buffers(page))
77 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
78
79 bdp = vn_bhv_lookup(VN_BHV_HEAD(vp), &xfs_vnodeops);
80 ip = XFS_BHVTOI(bdp);
81 if (!ip->i_rwtrace)
82 return;
83
84 ktrace_enter(ip->i_rwtrace,
85 (void *)((unsigned long)tag),
86 (void *)ip,
87 (void *)inode,
88 (void *)page,
89 (void *)((unsigned long)mask),
90 (void *)((unsigned long)((ip->i_d.di_size >> 32) & 0xffffffff)),
91 (void *)((unsigned long)(ip->i_d.di_size & 0xffffffff)),
92 (void *)((unsigned long)((isize >> 32) & 0xffffffff)),
93 (void *)((unsigned long)(isize & 0xffffffff)),
94 (void *)((unsigned long)((offset >> 32) & 0xffffffff)),
95 (void *)((unsigned long)(offset & 0xffffffff)),
96 (void *)((unsigned long)delalloc),
97 (void *)((unsigned long)unmapped),
98 (void *)((unsigned long)unwritten),
99 (void *)NULL,
100 (void *)NULL);
101 }
102 #else
103 #define xfs_page_trace(tag, inode, page, mask)
104 #endif
105
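/*
 * I/O completion handler for buffers taking part in an unwritten extent
 * conversion: clear the buffer's unwritten state, record any I/O error on
 * the tracking pagebuf, and drop one reference from pb_io_remaining so the
 * conversion fires once all buffers have completed.
 */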
106 void
107 linvfs_unwritten_done(
108 struct buffer_head *bh,
109 int uptodate)
110 {
111 xfs_buf_t *pb = (xfs_buf_t *)bh->b_private;
112
113 ASSERT(buffer_unwritten(bh));
114 bh->b_end_io = NULL;
115 clear_buffer_unwritten(bh);
116 if (!uptodate)
117 pagebuf_ioerror(pb, EIO);
118 if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
119 pagebuf_iodone(pb, 1, 1);
120 }
121 end_buffer_io_async(bh, uptodate);
122 }
123
124 /*
125 * Issue transactions to convert a buffer range from unwritten
126 * to written extents.
127 */
128 STATIC void
129 linvfs_unwritten_convert(
130 xfs_buf_t *bp)
131 {
132 vnode_t *vp = XFS_BUF_FSPRIVATE(bp, vnode_t *);
133 int error;
134
135 BUG_ON(atomic_read(&bp->pb_hold) < 1);
136 VOP_BMAP(vp, XFS_BUF_OFFSET(bp), XFS_BUF_SIZE(bp),
137 BMAPI_UNWRITTEN, NULL, NULL, error);
138 XFS_BUF_SET_FSPRIVATE(bp, NULL);
139 XFS_BUF_CLR_IODONE_FUNC(bp);
140 XFS_BUF_UNDATAIO(bp);
141 iput(LINVFS_GET_IP(vp));
142 pagebuf_iodone(bp, 0, 0);
143 }
144
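/*
 * Ask the filesystem, via VOP_BMAP, for an extent mapping covering the
 * byte range [offset, offset + count); mark the vnode modified when the
 * request may allocate or write.  Returns a negative errno on failure.
 */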
145 STATIC int
146 xfs_map_blocks(
147 struct inode *inode,
148 loff_t offset,
149 ssize_t count,
150 xfs_iomap_t *iomapp,
151 int flags)
152 {
153 vnode_t *vp = LINVFS_GET_VP(inode);
154 int error, niomaps = 1;
155
156 VOP_BMAP(vp, offset, count, flags, iomapp, &niomaps, error);
157 if (!error && (flags & (BMAPI_WRITE|BMAPI_ALLOCATE)))
158 VMODIFY(vp);
159 return -error;
160 }
161
162 /*
163 * Find the mapping in the @iomapp block map that covers the given
164 * @offset within @page; returns NULL if the offset lies outside it.
165 */
166 STATIC xfs_iomap_t *
167 xfs_offset_to_map(
168 struct page *page,
169 xfs_iomap_t *iomapp,
170 unsigned long offset)
171 {
172 loff_t full_offset; /* offset from start of file */
173
174 ASSERT(offset < PAGE_CACHE_SIZE);
175
176 full_offset = page->index; /* NB: using 64bit number */
177 full_offset <<= PAGE_CACHE_SHIFT; /* offset from file start */
178 full_offset += offset; /* offset from page start */
179
180 if (full_offset < iomapp->iomap_offset)
181 return NULL;
182 if (iomapp->iomap_offset + (iomapp->iomap_bsize -1) >= full_offset)
183 return iomapp;
184 return NULL;
185 }
186
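/*
 * Wire a buffer_head to its on-disk location: translate the page offset
 * into a block number within the iomap, then set the buffer's block
 * number and target device, mark it mapped, and clear any delalloc state.
 */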
187 STATIC void
188 xfs_map_at_offset(
189 struct page *page,
190 struct buffer_head *bh,
191 unsigned long offset,
192 int block_bits,
193 xfs_iomap_t *iomapp)
194 {
195 xfs_daddr_t bn;
196 loff_t delta;
197 int sector_shift;
198
199 ASSERT(!(iomapp->iomap_flags & IOMAP_HOLE));
200 ASSERT(!(iomapp->iomap_flags & IOMAP_DELAY));
201 ASSERT(iomapp->iomap_bn != IOMAP_DADDR_NULL);
202
203 delta = page->index;
204 delta <<= PAGE_CACHE_SHIFT;
205 delta += offset;
206 delta -= iomapp->iomap_offset;
207 delta >>= block_bits;
208
209 sector_shift = block_bits - BBSHIFT;
210 bn = iomapp->iomap_bn >> sector_shift;
211 bn += delta;
212 BUG_ON(!bn && !(iomapp->iomap_flags & IOMAP_REALTIME));
213 ASSERT((bn << sector_shift) >= iomapp->iomap_bn);
214
215 lock_buffer(bh);
216 bh->b_blocknr = bn;
217 bh->b_dev = iomapp->iomap_target->pbr_kdev;
218 set_buffer_mapped(bh);
219 clear_buffer_delay(bh);
220 }
221
222 /*
223 * Look for a page at index which is unlocked and has buffers flagged
224 * unwritten (belonging to our extent) at its head. Returns the page
225 * locked and with an extra reference count, and the length of the
226 * unwritten extent component on this page that we can write,
227 * in units of filesystem blocks.
228 */
229 STATIC struct page *
230 xfs_probe_unwritten_page(
231 struct address_space *mapping,
232 pgoff_t index,
233 xfs_iomap_t *iomapp,
234 xfs_buf_t *pb,
235 unsigned long max_offset,
236 unsigned long *fsbs,
237 unsigned int bbits)
238 {
239 struct page *page;
240
241 page = find_trylock_page(mapping, index);
242 if (!page)
243 return 0;
244
245 if (page->mapping && page_has_buffers(page)) {
246 struct buffer_head *bh, *head;
247 unsigned long p_offset = 0;
248
249 *fsbs = 0;
250 bh = head = page_buffers(page);
251 do {
252 if (!buffer_unwritten(bh) || !buffer_uptodate(bh))
253 break;
254 if (!xfs_offset_to_map(page, iomapp, p_offset))
255 break;
256 if (p_offset >= max_offset)
257 break;
258 xfs_map_at_offset(page, bh, p_offset, bbits, iomapp);
259 set_buffer_unwritten_io(bh);
260 bh->b_private = pb;
261 p_offset += bh->b_size;
262 (*fsbs)++;
263 } while ((bh = bh->b_this_page) != head);
264
265 if (p_offset)
266 return page;
267 }
268
269 unlock_page(page);
270 return NULL;
271 }
272
273 /*
274 * Look for a page at index which is unlocked and not mapped
275 * yet - clustering for mmap write case.
276 */
277 STATIC unsigned int
278 xfs_probe_unmapped_page(
279 struct address_space *mapping,
280 pgoff_t index,
281 unsigned int pg_offset)
282 {
283 struct page *page;
284 int ret = 0;
285
286 page = find_trylock_page(mapping, index);
287 if (!page)
288 return 0;
289
290 if (page->mapping && PageDirty(page)) {
291 if (page_has_buffers(page)) {
292 struct buffer_head *bh, *head;
293
294 bh = head = page_buffers(page);
295 do {
296 if (buffer_mapped(bh) || !buffer_uptodate(bh))
297 break;
298 ret += bh->b_size;
299 if (ret >= pg_offset)
300 break;
301 } while ((bh = bh->b_this_page) != head);
302 } else
303 ret = PAGE_CACHE_SIZE;
304 }
305
306 unlock_page(page);
307 return ret;
308 }
309
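/*
 * Measure the run of unmapped space starting at bh: sum unmapped buffers
 * forward on this page and, if the run reaches the end of the page, keep
 * probing following dirty pages (bounded at 64 pages).  The result sizes
 * the allocation request for the mmap-write path.
 */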
310 STATIC unsigned int
311 xfs_probe_unmapped_cluster(
312 struct inode *inode,
313 struct page *startpage,
314 struct buffer_head *bh,
315 struct buffer_head *head)
316 {
317 pgoff_t tindex, tlast, tloff;
318 unsigned int pg_offset, len, total = 0;
319 struct address_space *mapping = inode->i_mapping;
320
321 /* First sum forwards in this page */
322 do {
323 if (buffer_mapped(bh))
324 break;
325 total += bh->b_size;
326 } while ((bh = bh->b_this_page) != head);
327
328 /* If we reached the end of the page, sum forwards in
329 * following pages.
330 */
331 if (bh == head) {
332 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
333 /* Prune this back to avoid pathological behavior */
334 tloff = min(tlast, startpage->index + 64);
335 for (tindex = startpage->index + 1; tindex < tloff; tindex++) {
336 len = xfs_probe_unmapped_page(mapping, tindex,
337 PAGE_CACHE_SIZE);
338 if (!len)
339 return total;
340 total += len;
341 }
342 if (tindex == tlast &&
343 (pg_offset = i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
344 total += xfs_probe_unmapped_page(mapping,
345 tindex, pg_offset);
346 }
347 }
348 return total;
349 }
350
351 /*
352 * Probe for a given page (index) in the inode and test whether it has
353 * delayed-allocation buffers and no unwritten buffers. Returns the page
354 * locked and with an extra reference count.
355 */
356 STATIC struct page *
357 xfs_probe_delalloc_page(
358 struct inode *inode,
359 pgoff_t index)
360 {
361 struct page *page;
362
363 page = find_trylock_page(inode->i_mapping, index);
364 if (!page)
365 return NULL;
366
367 if (page->mapping && page_has_buffers(page)) {
368 struct buffer_head *bh, *head;
369 int acceptable = 0;
370
371 bh = head = page_buffers(page);
372 do {
373 if (buffer_unwritten(bh)) {
374 acceptable = 0;
375 break;
376 } else if (buffer_delay(bh)) {
377 acceptable = 1;
378 }
379 } while ((bh = bh->b_this_page) != head);
380
381 if (acceptable)
382 return page;
383 }
384
385 unlock_page(page);
386 return NULL;
387 }
388
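/*
 * Map a run of unwritten buffers to their real disk blocks and attach
 * them to a single pagebuf whose completion handler
 * (linvfs_unwritten_convert) converts the extent from unwritten to
 * written.  If the run reaches the end of the page, buffers on following
 * pages covered by the same iomap are pulled in as well.
 */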
389 STATIC int
390 xfs_map_unwritten(
391 struct inode *inode,
392 struct page *start_page,
393 struct buffer_head *head,
394 struct buffer_head *curr,
395 unsigned long p_offset,
396 int block_bits,
397 xfs_iomap_t *iomapp,
398 int startio,
399 int all_bh)
400 {
401 struct buffer_head *bh = curr;
402 xfs_iomap_t *tmp;
403 xfs_buf_t *pb;
404 loff_t offset, size;
405 unsigned long nblocks = 0;
406
407 offset = start_page->index;
408 offset <<= PAGE_CACHE_SHIFT;
409 offset += p_offset;
410
411 /* Get an "empty" pagebuf to manage I/O completion.
412 * Proper values will be set before returning. */
413 pb = pagebuf_lookup(iomapp->iomap_target, 0, 0, 0);
414 if (!pb)
415 return -EAGAIN;
416
417 /* Take a reference to the inode to prevent it from
418 * being reclaimed while we have outstanding unwritten
419 * extent IO on it.
420 */
421 if ((igrab(inode)) != inode) {
422 pagebuf_free(pb);
423 return -EAGAIN;
424 }
425
426 /* Set the count to 1 initially; this stops an I/O
427 * completion callout which happens before we have started
428 * all the I/O from calling pagebuf_iodone too early.
429 */
430 atomic_set(&pb->pb_io_remaining, 1);
431
432 /* First map forwards in the page consecutive buffers
433 * covering this unwritten extent
434 */
435 do {
436 if (!buffer_unwritten(bh))
437 break;
438 tmp = xfs_offset_to_map(start_page, iomapp, p_offset);
439 if (!tmp)
440 break;
441 xfs_map_at_offset(start_page, bh, p_offset, block_bits, iomapp);
442 set_buffer_unwritten_io(bh);
443 bh->b_private = pb;
444 p_offset += bh->b_size;
445 nblocks++;
446 } while ((bh = bh->b_this_page) != head);
447
448 atomic_add(nblocks, &pb->pb_io_remaining);
449
450 /* If we reached the end of the page, map forwards in any
451 * following pages which are also covered by this extent.
452 */
453 if (bh == head) {
454 struct address_space *mapping = inode->i_mapping;
455 pgoff_t tindex, tloff, tlast;
456 unsigned long bs;
457 unsigned int pg_offset, bbits = inode->i_blkbits;
458 struct page *page;
459
460 tlast = i_size_read(inode) >> PAGE_CACHE_SHIFT;
461 tloff = (iomapp->iomap_offset + iomapp->iomap_bsize) >> PAGE_CACHE_SHIFT;
462 tloff = min(tlast, tloff);
463 for (tindex = start_page->index + 1; tindex < tloff; tindex++) {
464 page = xfs_probe_unwritten_page(mapping,
465 tindex, iomapp, pb,
466 PAGE_CACHE_SIZE, &bs, bbits);
467 if (!page)
468 break;
469 nblocks += bs;
470 atomic_add(bs, &pb->pb_io_remaining);
471 xfs_convert_page(inode, page, iomapp, pb,
472 startio, all_bh);
473 /* stop if converting the next page might add
474 * enough blocks that the corresponding byte
475 * count won't fit in our ulong page buf length */
476 if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
477 goto enough;
478 }
479
480 if (tindex == tlast &&
481 (pg_offset = (i_size_read(inode) & (PAGE_CACHE_SIZE - 1)))) {
482 page = xfs_probe_unwritten_page(mapping,
483 tindex, iomapp, pb,
484 pg_offset, &bs, bbits);
485 if (page) {
486 nblocks += bs;
487 atomic_add(bs, &pb->pb_io_remaining);
488 xfs_convert_page(inode, page, iomapp, pb,
489 startio, all_bh);
490 if (nblocks >= ((ULONG_MAX - PAGE_SIZE) >> block_bits))
491 goto enough;
492 }
493 }
494 }
495
496 enough:
497 size = nblocks; /* NB: using 64bit number here */
498 size <<= block_bits; /* convert fsb's to byte range */
499
500 XFS_BUF_DATAIO(pb);
501 XFS_BUF_ASYNC(pb);
502 XFS_BUF_SET_SIZE(pb, size);
503 XFS_BUF_SET_COUNT(pb, size);
504 XFS_BUF_SET_OFFSET(pb, offset);
505 XFS_BUF_SET_FSPRIVATE(pb, LINVFS_GET_VP(inode));
506 XFS_BUF_SET_IODONE_FUNC(pb, linvfs_unwritten_convert);
507
508 if (atomic_dec_and_test(&pb->pb_io_remaining) == 1) {
509 pagebuf_iodone(pb, 1, 1);
510 }
511
512 return 0;
513 }
514
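/*
 * Mark the gathered buffers for async write I/O and submit them; if no
 * buffers were gathered, just unlock the page.
 */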
515 STATIC void
516 xfs_submit_page(
517 struct page *page,
518 struct buffer_head *bh_arr[],
519 int bh_count)
520 {
521 struct buffer_head *bh;
522 int i;
523
524 if (bh_count) {
525 for (i = 0; i < bh_count; i++) {
526 bh = bh_arr[i];
527 set_buffer_async_io(bh);
528 if (buffer_unwritten(bh))
529 set_buffer_unwritten_io(bh);
530 set_buffer_uptodate(bh);
531 clear_buffer_dirty(bh);
532 }
533
534 for (i = 0; i < bh_count; i++) {
535 refile_buffer(bh_arr[i]);
536 submit_bh(WRITE, bh_arr[i]);
537 }
538 } else {
539 unlock_page(page);
540 }
541 }
542
543 /*
544 * Allocate & map buffers for a page given the extent map, then write it out.
545 * Except for the original page of a writepage, this is called on
546 * delalloc/unwritten pages only; for the original page it is possible
547 * that the page has no mapping at all.
548 */
549 STATIC void
550 xfs_convert_page(
551 struct inode *inode,
552 struct page *page,
553 xfs_iomap_t *iomapp,
554 void *private,
555 int startio,
556 int all_bh)
557 {
558 struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head;
559 xfs_iomap_t *mp = iomapp, *tmp;
560 unsigned long end, offset;
561 pgoff_t end_index;
562 int i = 0, index = 0;
563 int bbits = inode->i_blkbits;
564
565 end_index = i_size_read(inode) >> PAGE_CACHE_SHIFT;
566 if (page->index < end_index) {
567 end = PAGE_CACHE_SIZE;
568 } else {
569 end = i_size_read(inode) & (PAGE_CACHE_SIZE-1);
570 }
571 bh = head = page_buffers(page);
572 do {
573 offset = i << bbits;
574 if (offset >= end)
575 break;
576 if (!(PageUptodate(page) || buffer_uptodate(bh)))
577 continue;
578 if (buffer_mapped(bh) && all_bh &&
579 !buffer_unwritten(bh) && !buffer_delay(bh)) {
580 if (startio) {
581 lock_buffer(bh);
582 bh_arr[index++] = bh;
583 }
584 continue;
585 }
586 tmp = xfs_offset_to_map(page, mp, offset);
587 if (!tmp)
588 continue;
589 ASSERT(!(tmp->iomap_flags & IOMAP_HOLE));
590 ASSERT(!(tmp->iomap_flags & IOMAP_DELAY));
591
592 /* If this is a new unwritten extent buffer (i.e. one
593 * that we haven't passed in private data for), we must
594 * now map this buffer too.
595 */
596 if (buffer_unwritten(bh) && !bh->b_end_io) {
597 ASSERT(tmp->iomap_flags & IOMAP_UNWRITTEN);
598 xfs_map_unwritten(inode, page, head, bh,
599 offset, bbits, tmp, startio, all_bh);
600 } else if (! (buffer_unwritten(bh) && buffer_locked(bh))) {
601 xfs_map_at_offset(page, bh, offset, bbits, tmp);
602 if (buffer_unwritten(bh)) {
603 set_buffer_unwritten_io(bh);
604 bh->b_private = private;
605 ASSERT(private);
606 }
607 }
608 if (startio) {
609 bh_arr[index++] = bh;
610 } else {
611 unlock_buffer(bh);
612 mark_buffer_dirty(bh);
613 }
614 } while (i++, (bh = bh->b_this_page) != head);
615
616 if (startio) {
617 xfs_submit_page(page, bh_arr, index);
618 } else {
619 unlock_page(page);
620 }
621 }
622
623 /*
624 * Convert & write out a cluster of pages in the same extent as defined
625 * by iomapp and following the start page.
626 */
627 STATIC void
628 xfs_cluster_write(
629 struct inode *inode,
630 pgoff_t tindex,
631 xfs_iomap_t *iomapp,
632 int startio,
633 int all_bh,
634 pgoff_t tlast)
635 {
636 struct page *page;
637
638 for (; tindex <= tlast; tindex++) {
639 page = xfs_probe_delalloc_page(inode, tindex);
640 if (!page)
641 break;
642 xfs_convert_page(inode, page, iomapp, NULL, startio, all_bh);
643 }
644 }
645
646 /*
647 * Calling this without startio set means we are being asked to make a dirty
648 * page ready for freeing its buffers.  When called with startio set then
649 * we are coming from writepage.
650 *
651 * When called with startio set it is important that we write the WHOLE
652 * page if possible.
653 * The bh->b_state flags cannot tell whether any of the blocks (or which
654 * blocks, for that matter) are dirty due to mmap writes, so buffer
655 * uptodate is only valid if the page itself isn't completely uptodate.
656 * Some layers may clear the page dirty flag prior to calling writepage,
657 * under the assumption that the entire page will be written out; by not
658 * writing out the whole page, the page can be reused before all valid
659 * dirty data is written out.  Note: in the case of a page that has been
660 * dirtied by mmap write but only partially set up by block_prepare_write,
661 * the bh->b_state flags will not agree and only the buffers set up by
662 * BPW/BCW will have valid state; thus the whole page must be written out.
663 */
664
665 STATIC int
666 xfs_page_state_convert(
667 struct inode *inode,
668 struct page *page,
669 int startio,
670 int unmapped) /* also implies page uptodate */
671 {
672 struct buffer_head *bh_arr[MAX_BUF_PER_PAGE], *bh, *head;
673 xfs_iomap_t *iomp, iomap;
674 loff_t offset;
675 unsigned long p_offset = 0;
676 __uint64_t end_offset;
677 pgoff_t end_index, last_index, tlast;
678 int len, err, i, cnt = 0, uptodate = 1;
679 int flags = startio ? 0 : BMAPI_TRYLOCK;
680 int page_dirty = 1;
681 int delalloc = 0;
682
683
684 /* Are we off the end of the file ? */
685 offset = i_size_read(inode);
686 end_index = offset >> PAGE_CACHE_SHIFT;
687 last_index = (offset - 1) >> PAGE_CACHE_SHIFT;
688 if (page->index >= end_index) {
689 if ((page->index >= end_index + 1) ||
690 !(i_size_read(inode) & (PAGE_CACHE_SIZE - 1))) {
691 err = -EIO;
692 goto error;
693 }
694 }
695
696 offset = (loff_t)page->index << PAGE_CACHE_SHIFT;
697 end_offset = min_t(unsigned long long,
698 offset + PAGE_CACHE_SIZE, i_size_read(inode));
699
700 bh = head = page_buffers(page);
701 iomp = NULL;
702
703 len = bh->b_size;
704 do {
705 if (offset >= end_offset)
706 break;
707 if (!buffer_uptodate(bh))
708 uptodate = 0;
709 if (!(PageUptodate(page) || buffer_uptodate(bh)) && !startio)
710 continue;
711
712 if (iomp) {
713 iomp = xfs_offset_to_map(page, &iomap, p_offset);
714 }
715
716 /*
717 * First case, map an unwritten extent and prepare for
718 * extent state conversion transaction on completion.
719 */
720 if (buffer_unwritten(bh)) {
721 if (!startio)
722 continue;
723 if (!iomp) {
724 err = xfs_map_blocks(inode, offset, len, &iomap,
725 BMAPI_READ|BMAPI_IGNSTATE);
726 if (err) {
727 goto error;
728 }
729 iomp = xfs_offset_to_map(page, &iomap,
730 p_offset);
731 }
732 if (iomp) {
733 if (!bh->b_end_io) {
734 err = xfs_map_unwritten(inode, page,
735 head, bh, p_offset,
736 inode->i_blkbits, iomp,
737 startio, unmapped);
738 if (err) {
739 goto error;
740 }
741 } else {
742 set_bit(BH_Lock, &bh->b_state);
743 }
744 BUG_ON(!buffer_locked(bh));
745 bh_arr[cnt++] = bh;
746 page_dirty = 0;
747 }
748 /*
749 * Second case, allocate space for a delalloc buffer.
750 * We can return EAGAIN here in the release page case.
751 */
752 } else if (buffer_delay(bh)) {
753 if (!iomp) {
754 delalloc = 1;
755 err = xfs_map_blocks(inode, offset, len, &iomap,
756 BMAPI_ALLOCATE | flags);
757 if (err) {
758 goto error;
759 }
760 iomp = xfs_offset_to_map(page, &iomap,
761 p_offset);
762 }
763 if (iomp) {
764 xfs_map_at_offset(page, bh, p_offset,
765 inode->i_blkbits, iomp);
766 if (startio) {
767 bh_arr[cnt++] = bh;
768 } else {
769 unlock_buffer(bh);
770 mark_buffer_dirty(bh);
771 }
772 page_dirty = 0;
773 }
774 } else if ((buffer_uptodate(bh) || PageUptodate(page)) &&
775 (unmapped || startio)) {
776
777 if (!buffer_mapped(bh)) {
778 int size;
779
780 /*
781 * Getting here implies an unmapped buffer
782 * was found, and we are in a path where we
783 * need to write the whole page out.
784 */
785 if (!iomp) {
786 size = xfs_probe_unmapped_cluster(
787 inode, page, bh, head);
788 err = xfs_map_blocks(inode, offset,
789 size, &iomap,
790 BMAPI_WRITE|BMAPI_MMAP);
791 if (err) {
792 goto error;
793 }
794 iomp = xfs_offset_to_map(page, &iomap,
795 p_offset);
796 }
797 if (iomp) {
798 xfs_map_at_offset(page,
799 bh, p_offset,
800 inode->i_blkbits, iomp);
801 if (startio) {
802 bh_arr[cnt++] = bh;
803 } else {
804 unlock_buffer(bh);
805 mark_buffer_dirty(bh);
806 }
807 page_dirty = 0;
808 }
809 } else if (startio) {
810 if (buffer_uptodate(bh) &&
811 !test_and_set_bit(BH_Lock, &bh->b_state)) {
812 bh_arr[cnt++] = bh;
813 page_dirty = 0;
814 }
815 }
816 }
817 } while (offset += len, p_offset += len,
818 ((bh = bh->b_this_page) != head));
819
820 if (uptodate && bh == head)
821 SetPageUptodate(page);
822
823 if (startio)
824 xfs_submit_page(page, bh_arr, cnt);
825
826 if (iomp) {
827 tlast = (iomp->iomap_offset + iomp->iomap_bsize - 1) >>
828 PAGE_CACHE_SHIFT;
829 if (delalloc && (tlast > last_index))
830 tlast = last_index;
831 xfs_cluster_write(inode, page->index + 1, iomp,
832 startio, unmapped, tlast);
833 }
834
835 return page_dirty;
836
837 error:
838 for (i = 0; i < cnt; i++) {
839 unlock_buffer(bh_arr[i]);
840 }
841
842 /*
843 * If it's delalloc and we have nowhere to put it,
844 * throw it away, unless the lower layers told
845 * us to try again.
846 */
847 if (err != -EAGAIN) {
848 if (!unmapped) {
849 block_flushpage(page, 0);
850 }
851 ClearPageUptodate(page);
852 }
853 return err;
854 }
855
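/*
 * Common get_block implementation: map a single block at iblock via
 * VOP_BMAP and translate the resulting iomap into buffer_head state
 * (mapped, new, unwritten, delayed) for the buffered and direct I/O paths.
 */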
856 STATIC int
857 linvfs_get_block_core(
858 struct inode *inode,
859 long iblock,
860 struct buffer_head *bh_result,
861 int create,
862 int direct,
863 bmapi_flags_t flags)
864 {
865 vnode_t *vp = LINVFS_GET_VP(inode);
866 xfs_iomap_t iomap;
867 int retpbbm = 1;
868 int error;
869 ssize_t size;
870 loff_t offset = (loff_t)iblock << inode->i_blkbits;
871
872 size = 1 << inode->i_blkbits;
873 VOP_BMAP(vp, offset, size,
874 create ? flags : BMAPI_READ, &iomap, &retpbbm, error);
875 if (error)
876 return -error;
877
878 if (retpbbm == 0)
879 return 0;
880
881 if (iomap.iomap_bn != IOMAP_DADDR_NULL) {
882 xfs_daddr_t bn;
883 loff_t delta;
884
885 /* For unwritten extents do not report a disk address in
886 * the read case (treat as if we're reading into a hole).
887 */
888 if (create || !(iomap.iomap_flags & IOMAP_UNWRITTEN)) {
889 delta = offset - iomap.iomap_offset;
890 delta >>= inode->i_blkbits;
891
892 bn = iomap.iomap_bn >> (inode->i_blkbits - BBSHIFT);
893 bn += delta;
894 BUG_ON(!bn && !(iomap.iomap_flags & IOMAP_REALTIME));
895 bh_result->b_blocknr = bn;
896 set_buffer_mapped(bh_result);
897 }
898 if (create && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
899 set_buffer_unwritten(bh_result);
900 set_buffer_delay(bh_result);
901 }
902 }
903
904 /* If this is a realtime file, data may be on a different device */
905 bh_result->b_dev = iomap.iomap_target->pbr_kdev;
906
907 /* If we previously allocated a block out beyond eof and
908 * we are now coming back to use it then we will need to
909 * flag it as new even if it has a disk address.
910 */
911 if (create &&
912 ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
913 (offset >= i_size_read(inode)))) {
914 set_buffer_new(bh_result);
915 }
916
917 if (iomap.iomap_flags & IOMAP_DELAY) {
918 BUG_ON(direct);
919 if (create) {
920 set_buffer_mapped(bh_result);
921 set_buffer_uptodate(bh_result);
922 }
923 set_buffer_delay(bh_result);
924 }
925
926 return 0;
927 }
928
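/* get_block callback for the buffered I/O paths (readpage, prepare_write). */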
929 int
930 linvfs_get_block(
931 struct inode *inode,
932 long iblock,
933 struct buffer_head *bh_result,
934 int create)
935 {
936 return linvfs_get_block_core(inode, iblock, bh_result,
937 create, 0, BMAPI_WRITE);
938 }
939
940 STATIC int
941 linvfs_get_block_direct(
942 struct inode *inode,
943 long iblock,
944 struct buffer_head *bh_result,
945 int create)
946 {
947 return linvfs_get_block_core(inode, iblock, bh_result,
948 create, 1, BMAPI_WRITE|BMAPI_DIRECT);
949 }
950
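/*
 * Flush and convert any delayed allocations before the block lookup, so
 * that generic_block_bmap (which maps with create == 0) reports real
 * disk blocks.
 */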
951 STATIC int
952 linvfs_bmap(
953 struct address_space *mapping,
954 long block)
955 {
956 struct inode *inode = (struct inode *)mapping->host;
957 vnode_t *vp = LINVFS_GET_VP(inode);
958 int error;
959
960 vn_trace_entry(vp, "linvfs_bmap", (inst_t *)__return_address);
961
962 VOP_RWLOCK(vp, VRWLOCK_READ);
963 VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error);
964 VOP_RWUNLOCK(vp, VRWLOCK_READ);
965 return generic_block_bmap(mapping, block, linvfs_get_block_direct);
966 }
967
968 STATIC int
969 linvfs_readpage(
970 struct file *unused,
971 struct page *page)
972 {
973 return block_read_full_page(page, linvfs_get_block);
974 }
975
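/*
 * Walk the buffers on a page and report whether any are delayed
 * allocation, unmapped (but uptodate), or unwritten.
 */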
976 STATIC void
977 xfs_count_page_state(
978 struct page *page,
979 int *delalloc,
980 int *unmapped,
981 int *unwritten)
982 {
983 struct buffer_head *bh, *head;
984
985 *delalloc = *unmapped = *unwritten = 0;
986
987 bh = head = page_buffers(page);
988 do {
989 if (buffer_uptodate(bh) && !buffer_mapped(bh))
990 (*unmapped) = 1;
991 else if (buffer_unwritten(bh) && !buffer_delay(bh))
992 clear_buffer_unwritten(bh);
993 else if (buffer_unwritten(bh))
994 (*unwritten) = 1;
995 else if (buffer_delay(bh))
996 (*delalloc) = 1;
997 } while ((bh = bh->b_this_page) != head);
998 }
999
1000
1001 /*
1002 * writepage: Called from one of two places:
1003 *
1004 * 1. we are flushing a delalloc buffer head.
1005 *
1006 * 2. we are writing out a dirty page. Typically the page dirty
1007 * state is cleared before we get here. In this case it is
1008 * conceivable we have no buffer heads.
1009 *
1010 * For delalloc space on the page we need to allocate space and
1011 * flush it. For unmapped buffer heads on the page we should
1012 * allocate space if the page is uptodate. For any other dirty
1013 * buffer heads on the page we should flush them.
1014 *
1015 * If we detect that a transaction would be required to flush
1016 * the page, we have to check the process flags first; if we
1017 * are already in a transaction or disk I/O during allocations
1018 * is off, we need to fail the writepage and redirty the page.
1019 */
1020
1021 STATIC int
1022 linvfs_writepage(
1023 struct page *page)
1024 {
1025 int error;
1026 int need_trans;
1027 int delalloc, unmapped, unwritten;
1028 struct inode *inode = page->mapping->host;
1029
1030 xfs_page_trace(XFS_WRITEPAGE_ENTER, inode, page, 0);
1031
1032 /*
1033 * We need a transaction if:
1034 * 1. There are delalloc buffers on the page
1035 * 2. The page is uptodate and we have unmapped buffers
1036 * 3. The page is uptodate and we have no buffers
1037 * 4. There are unwritten buffers on the page
1038 */
1039
1040 if (!page_has_buffers(page)) {
1041 unmapped = 1;
1042 need_trans = 1;
1043 } else {
1044 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
1045 if (!PageUptodate(page))
1046 unmapped = 0;
1047 need_trans = delalloc + unmapped + unwritten;
1048 }
1049
1050 /*
1051 * If we need a transaction and the process flags say
1052 * we are already in a transaction, or no IO is allowed,
1053 * then mark the page dirty again and leave the page
1054 * as is.
1055 */
1056
1057 if ((PFLAGS_TEST_FSTRANS() || PFLAGS_TEST_NOIO()) && need_trans)
1058 goto out_fail;
1059
1060 /*
1061 * Delay hooking up buffer heads until we have
1062 * made our go/no-go decision.
1063 */
1064 if (!page_has_buffers(page))
1065 create_empty_buffers(page, inode->i_dev, 1 << inode->i_blkbits);
1066
1067 /*
1068 * Convert delayed allocate, unwritten or unmapped space
1069 * to real space and flush out to disk.
1070 */
1071 if (need_trans)
1072 PFLAGS_SET_NOIO();
1073 error = xfs_page_state_convert(inode, page, 1, unmapped);
1074 if (need_trans)
1075 PFLAGS_CLEAR_NOIO();
1076 if (error == -EAGAIN)
1077 goto out_fail;
1078
1079 if (unlikely(error < 0)) {
1080 unlock_page(page);
1081 return error;
1082 }
1083
1084 return 0;
1085
1086 out_fail:
1087 SetPageDirty(page);
1088 unlock_page(page);
1089 return 0;
1090 }
1091
1092 /*
1093 * Called to move a page into cleanable state - and from there
1094 * to be released. Possibly the page is already clean. We always
1095 * have buffer heads in this call.
1096 *
1097 * Returns 1 if the page is ok to release, 0 otherwise.
1098 *
1099 * Possible scenarios are:
1100 *
1101 * 1. We are being called to release a page which has been written
1102 * to via regular I/O. Buffer heads will be dirty and possibly
1103 * delalloc. If there are no delalloc buffer heads in this case
1104 * then we can simply say the page is releasable.
1105 *
1106 * 2. We are called to release a page which has been written via
1107 * mmap; all we need to do is ensure there is no delalloc
1108 * state left in the buffer heads. Once that is done we can let
1109 * the caller free them and come back to the data later via writepage.
1110 */
1111 STATIC int
1112 linvfs_release_page(
1113 struct page *page,
1114 int gfp_mask)
1115 {
1116 struct inode *inode = page->mapping->host;
1117 int dirty, delalloc, unmapped, unwritten;
1118
1119 xfs_page_trace(XFS_RELEASEPAGE_ENTER, inode, page, gfp_mask);
1120
1121 xfs_count_page_state(page, &delalloc, &unmapped, &unwritten);
1122 if (!delalloc && !unwritten)
1123 return 1;
1124
1125 if (!(gfp_mask & __GFP_FS))
1126 return 0;
1127
1128 /* If we are already inside a transaction or the thread cannot
1129 * do I/O, we cannot release this page.
1130 */
1131 if (PFLAGS_TEST_FSTRANS() || PFLAGS_TEST_NOIO())
1132 return 0;
1133
1134 /*
1135 * Convert delalloc space to real space, but do not flush the
1136 * data out to disk; that will be done by the caller.
1137 * We never need to allocate space here - we will always
1138 * come back to writepage in that case.
1139 */
1140 dirty = xfs_page_state_convert(inode, page, 0, 0);
1141 return (dirty == 0 && !unwritten) ? 1 : 0;
1142 }
1143
1144 STATIC int
1145 linvfs_prepare_write(
1146 struct file *file,
1147 struct page *page,
1148 unsigned int from,
1149 unsigned int to)
1150 {
1151 return block_prepare_write(page, from, to, linvfs_get_block);
1152 }
1153
1154 /*
1155 * Initiate I/O on a kiobuf of user memory
1156 */
1157 STATIC int
1158 linvfs_direct_IO(
1159 int rw,
1160 struct inode *inode,
1161 struct kiobuf *iobuf,
1162 unsigned long blocknr,
1163 int blocksize)
1164 {
1165 struct page **maplist;
1166 size_t page_offset;
1167 xfs_buf_t *pb;
1168 xfs_iomap_t iomap;
1169 int niomap, error = 0;
1170 int pb_flags, map_flags, pg_index = 0;
1171 size_t length, total;
1172 loff_t offset, map_size;
1173 size_t size;
1174 vnode_t *vp = LINVFS_GET_VP(inode);
1175
1176 /* Note - although the iomap could have a 64-bit size,
1177 * kiobuf->length is only an int, so the min(map_size, length)
1178 * test will keep us from overflowing the pagebuf size_t size.
1179 */
1180 total = length = iobuf->length;
1181 offset = blocknr;
1182 offset <<= inode->i_blkbits;
1183
1184 maplist = iobuf->maplist;
1185 page_offset = iobuf->offset;
1186
1187 map_flags = (rw ? BMAPI_WRITE : BMAPI_READ) | BMAPI_DIRECT;
1188 pb_flags = (rw ? PBF_WRITE : PBF_READ) | PBF_FORCEIO | PBF_DIRECTIO;
1189 while (length) {
1190 niomap = 1;
1191 VOP_BMAP(vp, offset, length, map_flags, &iomap, &niomap, error);
1192 if (unlikely(!error && niomap &&
1193 (iomap.iomap_flags & IOMAP_DELAY))) {
1194 #ifdef DEBUG
1195 printk(
1196 "XFS: %s - direct IO (%lld:%ld) into a delayed allocate extent?\n",
1197 __FUNCTION__, (long long)offset, (long)length);
1198 xfs_stack_trace();
1199 #endif
1200 VOP_BMAP(vp, offset, length, BMAPI_ALLOCATE,
1201 &iomap, &niomap, error);
1202 }
1203 if (error)
1204 break;
1205
1206 if (rw == WRITE)
1207 VMODIFY(vp);
1208
1209 BUG_ON(niomap != 1);
1210 BUG_ON(iomap.iomap_flags & IOMAP_DELAY);
1211
1212 map_size = iomap.iomap_bsize - iomap.iomap_delta;
1213 size = (size_t)min(map_size, (loff_t)length);
1214
1215 if ((iomap.iomap_flags & IOMAP_HOLE) ||
1216 ((iomap.iomap_flags & IOMAP_UNWRITTEN) && rw == READ)) {
1217 size_t zero_len = size;
1218
1219 if (rw == WRITE)
1220 break;
1221
1222 /* Need to zero it all */
1223 while (zero_len) {
1224 struct page *page;
1225 size_t pg_len;
1226
1227 pg_len = min((size_t)
1228 (PAGE_CACHE_SIZE - page_offset),
1229 zero_len);
1230
1231 page = maplist[pg_index];
1232
1233 memset(kmap(page) + page_offset, 0, pg_len);
1234 flush_dcache_page(page);
1235 kunmap(page);
1236
1237 zero_len -= pg_len;
1238 if ((pg_len + page_offset) == PAGE_CACHE_SIZE) {
1239 pg_index++;
1240 page_offset = 0;
1241 } else {
1242 page_offset = (page_offset + pg_len) &
1243 ~PAGE_CACHE_MASK;
1244 }
1245 }
1246 } else {
1247 int pg_count;
1248
1249 pg_count = (size + page_offset + PAGE_CACHE_SIZE - 1)
1250 >> PAGE_CACHE_SHIFT;
1251 if ((pb = pagebuf_lookup(iomap.iomap_target, offset,
1252 size, pb_flags)) == NULL) {
1253 error = ENOMEM;
1254 break;
1255 }
1256 /* Need to hook up pagebuf to kiobuf pages */
1257 pb->pb_pages = &maplist[pg_index];
1258 pb->pb_offset = page_offset;
1259 pb->pb_page_count = pg_count;
1260 pb->pb_bn = iomap.iomap_bn + (iomap.iomap_delta >> BBSHIFT);
1261
1262 error = pagebuf_iostart(pb, pb_flags);
1263 if (!error && (iomap.iomap_flags & IOMAP_UNWRITTEN)) {
1264 VOP_BMAP(vp, XFS_BUF_OFFSET(pb),
1265 XFS_BUF_SIZE(pb),
1266 BMAPI_UNWRITTEN, NULL, NULL, error);
1267 }
1268
1269 pagebuf_rele(pb);
1270
1271 if (error)
1272 break;
1273
1274 page_offset = (page_offset + size) & ~PAGE_CACHE_MASK;
1275 if (page_offset)
1276 pg_count--;
1277 pg_index += pg_count;
1278 }
1279
1280 offset += size;
1281 length -= size;
1282 }
1283
1284 if (error)
1285 return -error;
1286 return (int)(total - length);
1287 }
1288
1289
1290 struct address_space_operations linvfs_aops = {
1291 .readpage = linvfs_readpage,
1292 .writepage = linvfs_writepage,
1293 .sync_page = block_sync_page,
1294 .releasepage = linvfs_release_page,
1295 .prepare_write = linvfs_prepare_write,
1296 .commit_write = generic_commit_write,
1297 .bmap = linvfs_bmap,
1298 .direct_IO = linvfs_direct_IO,
1299 };
1300
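/*
 * Note: these address space operations are attached to regular-file
 * inodes elsewhere in the Linux glue code, roughly (a sketch, not the
 * verbatim source):
 *
 *	inode->i_mapping->a_ops = &linvfs_aops;
 *
 * after which the generic VM and VFS paths call back into the routines
 * in this file.
 */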