1 /*
2  * Copyright (c) 2000-2006 Silicon Graphics, Inc.
3  * All Rights Reserved.
4  *
5  * This program is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU General Public License as
7  * published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope that it would be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  * GNU General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public License
15  * along with this program; if not, write the Free Software Foundation,
16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17  */
18 #include "xfs.h"
19 #include "xfs_fs.h"
20 #include "xfs_types.h"
21 #include "xfs_bit.h"
22 #include "xfs_log.h"
23 #include "xfs_inum.h"
24 #include "xfs_trans.h"
25 #include "xfs_sb.h"
26 #include "xfs_ag.h"
27 #include "xfs_mount.h"
28 #include "xfs_error.h"
29 #include "xfs_bmap_btree.h"
30 #include "xfs_alloc_btree.h"
31 #include "xfs_ialloc_btree.h"
32 #include "xfs_dinode.h"
33 #include "xfs_inode.h"
34 #include "xfs_inode_item.h"
35 #include "xfs_alloc.h"
36 #include "xfs_ialloc.h"
37 #include "xfs_log_priv.h"
38 #include "xfs_buf_item.h"
39 #include "xfs_log_recover.h"
40 #include "xfs_extfree_item.h"
41 #include "xfs_trans_priv.h"
42 #include "xfs_quota.h"
43 #include "xfs_rw.h"
44 #include "xfs_utils.h"
45 #include "xfs_trace.h"
46 
47 STATIC int	xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
48 STATIC int	xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
49 #if defined(DEBUG)
50 STATIC void	xlog_recover_check_summary(xlog_t *);
51 #else
52 #define	xlog_recover_check_summary(log)
53 #endif
54 
55 /*
56  * This structure is used during recovery to record the buf log items which
57  * have been canceled and should not be replayed.
58  */
59 struct xfs_buf_cancel {
60 	xfs_daddr_t		bc_blkno;
61 	uint			bc_len;
62 	int			bc_refcount;
63 	struct list_head	bc_list;
64 };
65 
66 /*
67  * Sector aligned buffer routines for buffer create/read/write/access
68  */
69 
70 /*
71  * Verify the given count of basic blocks is a valid number of blocks
72  * to specify for an operation involving the given XFS log buffer.
73  * Returns nonzero if the count is valid, 0 otherwise.
74  */
75 
76 static inline int
77 xlog_buf_bbcount_valid(
78 	xlog_t		*log,
79 	int		bbcount)
80 {
81 	return bbcount > 0 && bbcount <= log->l_logBBsize;
82 }
83 
84 /*
85  * Allocate a buffer to hold log data.  The buffer needs to be able
86  * to map to a range of nbblks basic blocks at any valid (basic
87  * block) offset within the log.
88  */
89 STATIC xfs_buf_t *
90 xlog_get_bp(
91 	xlog_t		*log,
92 	int		nbblks)
93 {
94 	struct xfs_buf	*bp;
95 
96 	if (!xlog_buf_bbcount_valid(log, nbblks)) {
97 		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
98 			nbblks);
99 		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
100 		return NULL;
101 	}
102 
103 	/*
104 	 * We do log I/O in units of log sectors (a power-of-2
105 	 * multiple of the basic block size), so we round up the
106 	 * requested size to accommodate the basic blocks required
107 	 * for complete log sectors.
108 	 *
109 	 * In addition, the buffer may be used for a non-sector-
110 	 * aligned block offset, in which case an I/O of the
111 	 * requested size could extend beyond the end of the
112 	 * buffer.  If the requested size is only 1 basic block it
113 	 * will never straddle a sector boundary, so this won't be
114 	 * an issue.  Nor will this be a problem if the log I/O is
115 	 * done in basic blocks (sector size 1).  But otherwise we
116 	 * extend the buffer by one extra log sector to ensure
117 	 * there's space to accommodate this possibility.
118 	 */
119 	if (nbblks > 1 && log->l_sectBBsize > 1)
120 		nbblks += log->l_sectBBsize;
121 	nbblks = round_up(nbblks, log->l_sectBBsize);
122 
123 	bp = xfs_buf_get_uncached(log->l_mp->m_logdev_targp, BBTOB(nbblks), 0);
124 	if (bp)
125 		xfs_buf_unlock(bp);
126 	return bp;
127 }
128 
129 STATIC void
130 xlog_put_bp(
131 	xfs_buf_t	*bp)
132 {
133 	xfs_buf_free(bp);
134 }
135 
136 /*
137  * Return the address of the start of the given block number's data
138  * in a log buffer.  The buffer covers a log sector-aligned region.
139  */
140 STATIC xfs_caddr_t
141 xlog_align(
142 	xlog_t		*log,
143 	xfs_daddr_t	blk_no,
144 	int		nbblks,
145 	xfs_buf_t	*bp)
146 {
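	/* offset of blk_no within its log sector; l_sectBBsize is a power of 2 */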
147 	xfs_daddr_t	offset = blk_no & ((xfs_daddr_t)log->l_sectBBsize - 1);
148 
149 	ASSERT(BBTOB(offset + nbblks) <= XFS_BUF_SIZE(bp));
150 	return bp->b_addr + BBTOB(offset);
151 }
152 
153 
154 /*
155  * nbblks should be uint, but oh well.  Just want to catch that 32-bit length.
156  */
157 STATIC int
158 xlog_bread_noalign(
159 	xlog_t		*log,
160 	xfs_daddr_t	blk_no,
161 	int		nbblks,
162 	xfs_buf_t	*bp)
163 {
164 	int		error;
165 
166 	if (!xlog_buf_bbcount_valid(log, nbblks)) {
167 		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
168 			nbblks);
169 		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
170 		return EFSCORRUPTED;
171 	}
172 
173 	blk_no = round_down(blk_no, log->l_sectBBsize);
174 	nbblks = round_up(nbblks, log->l_sectBBsize);
175 
176 	ASSERT(nbblks > 0);
177 	ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
178 
179 	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
180 	XFS_BUF_READ(bp);
181 	XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
182 
183 	xfsbdstrat(log->l_mp, bp);
184 	error = xfs_buf_iowait(bp);
185 	if (error)
186 		xfs_buf_ioerror_alert(bp, __func__);
187 	return error;
188 }
189 
190 STATIC int
191 xlog_bread(
192 	xlog_t		*log,
193 	xfs_daddr_t	blk_no,
194 	int		nbblks,
195 	xfs_buf_t	*bp,
196 	xfs_caddr_t	*offset)
197 {
198 	int		error;
199 
200 	error = xlog_bread_noalign(log, blk_no, nbblks, bp);
201 	if (error)
202 		return error;
203 
204 	*offset = xlog_align(log, blk_no, nbblks, bp);
205 	return 0;
206 }
207 
208 /*
209  * Read at an offset into the buffer. Returns with the buffer in its original
210  * state regardless of the result of the read.
211  */
212 STATIC int
213 xlog_bread_offset(
214 	xlog_t		*log,
215 	xfs_daddr_t	blk_no,		/* block to read from */
216 	int		nbblks,		/* blocks to read */
217 	xfs_buf_t	*bp,
218 	xfs_caddr_t	offset)
219 {
220 	xfs_caddr_t	orig_offset = bp->b_addr;
221 	int		orig_len = bp->b_buffer_length;
222 	int		error, error2;
223 
224 	error = xfs_buf_associate_memory(bp, offset, BBTOB(nbblks));
225 	if (error)
226 		return error;
227 
228 	error = xlog_bread_noalign(log, blk_no, nbblks, bp);
229 
230 	/* must reset buffer pointer even on error */
231 	error2 = xfs_buf_associate_memory(bp, orig_offset, orig_len);
232 	if (error)
233 		return error;
234 	return error2;
235 }
236 
237 /*
238  * Write out the buffer at the given block for the given number of blocks.
239  * The buffer is kept locked across the write and is returned locked.
240  * This can only be used for synchronous log writes.
241  */
242 STATIC int
243 xlog_bwrite(
244 	xlog_t		*log,
245 	xfs_daddr_t	blk_no,
246 	int		nbblks,
247 	xfs_buf_t	*bp)
248 {
249 	int		error;
250 
251 	if (!xlog_buf_bbcount_valid(log, nbblks)) {
252 		xfs_warn(log->l_mp, "Invalid block length (0x%x) for buffer",
253 			nbblks);
254 		XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
255 		return EFSCORRUPTED;
256 	}
257 
258 	blk_no = round_down(blk_no, log->l_sectBBsize);
259 	nbblks = round_up(nbblks, log->l_sectBBsize);
260 
261 	ASSERT(nbblks > 0);
262 	ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
263 
264 	XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
265 	XFS_BUF_ZEROFLAGS(bp);
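	/* take an extra reference and the lock; xfs_buf_relse() below drops both */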
266 	xfs_buf_hold(bp);
267 	xfs_buf_lock(bp);
268 	XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
269 
270 	error = xfs_bwrite(bp);
271 	if (error)
272 		xfs_buf_ioerror_alert(bp, __func__);
273 	xfs_buf_relse(bp);
274 	return error;
275 }
276 
277 #ifdef DEBUG
278 /*
279  * dump debug superblock and log record information
280  */
281 STATIC void
282 xlog_header_check_dump(
283 	xfs_mount_t		*mp,
284 	xlog_rec_header_t	*head)
285 {
286 	xfs_debug(mp, "%s:  SB : uuid = %pU, fmt = %d\n",
287 		__func__, &mp->m_sb.sb_uuid, XLOG_FMT);
288 	xfs_debug(mp, "    log : uuid = %pU, fmt = %d\n",
289 		&head->h_fs_uuid, be32_to_cpu(head->h_fmt));
290 }
291 #else
292 #define xlog_header_check_dump(mp, head)
293 #endif
294 
295 /*
296  * check log record header for recovery
297  */
298 STATIC int
299 xlog_header_check_recover(
300 	xfs_mount_t		*mp,
301 	xlog_rec_header_t	*head)
302 {
303 	ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
304 
305 	/*
306 	 * IRIX doesn't write the h_fmt field and leaves it zeroed
307 	 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
308 	 * a dirty log created in IRIX.
309 	 */
310 	if (unlikely(head->h_fmt != cpu_to_be32(XLOG_FMT))) {
311 		xfs_warn(mp,
312 	"dirty log written in incompatible format - can't recover");
313 		xlog_header_check_dump(mp, head);
314 		XFS_ERROR_REPORT("xlog_header_check_recover(1)",
315 				 XFS_ERRLEVEL_HIGH, mp);
316 		return XFS_ERROR(EFSCORRUPTED);
317 	} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
318 		xfs_warn(mp,
319 	"dirty log entry has mismatched uuid - can't recover");
320 		xlog_header_check_dump(mp, head);
321 		XFS_ERROR_REPORT("xlog_header_check_recover(2)",
322 				 XFS_ERRLEVEL_HIGH, mp);
323 		return XFS_ERROR(EFSCORRUPTED);
324 	}
325 	return 0;
326 }
327 
328 /*
329  * read the head block of the log and check the header
330  */
331 STATIC int
332 xlog_header_check_mount(
333 	xfs_mount_t		*mp,
334 	xlog_rec_header_t	*head)
335 {
336 	ASSERT(head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM));
337 
338 	if (uuid_is_nil(&head->h_fs_uuid)) {
339 		/*
340 		 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
341 		 * h_fs_uuid is nil, we assume this log was last mounted
342 		 * by IRIX and continue.
343 		 */
344 		xfs_warn(mp, "nil uuid in log - IRIX style log");
345 	} else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
346 		xfs_warn(mp, "log has mismatched uuid - can't recover");
347 		xlog_header_check_dump(mp, head);
348 		XFS_ERROR_REPORT("xlog_header_check_mount",
349 				 XFS_ERRLEVEL_HIGH, mp);
350 		return XFS_ERROR(EFSCORRUPTED);
351 	}
352 	return 0;
353 }
354 
355 STATIC void
356 xlog_recover_iodone(
357 	struct xfs_buf	*bp)
358 {
359 	if (bp->b_error) {
360 		/*
361 		 * We're not going to bother about retrying
362 		 * this during recovery. One strike!
363 		 */
364 		xfs_buf_ioerror_alert(bp, __func__);
365 		xfs_force_shutdown(bp->b_target->bt_mount,
366 					SHUTDOWN_META_IO_ERROR);
367 	}
368 	bp->b_iodone = NULL;
369 	xfs_buf_ioend(bp, 0);
370 }
371 
372 /*
373  * This routine finds (to an approximation) the first block in the physical
374  * log which contains the given cycle.  It uses a binary search algorithm.
375  * Note that the algorithm can not be perfect because the disk will not
376  * necessarily be perfect.
377  */
378 STATIC int
379 xlog_find_cycle_start(
380 	xlog_t		*log,
381 	xfs_buf_t	*bp,
382 	xfs_daddr_t	first_blk,
383 	xfs_daddr_t	*last_blk,
384 	uint		cycle)
385 {
386 	xfs_caddr_t	offset;
387 	xfs_daddr_t	mid_blk;
388 	xfs_daddr_t	end_blk;
389 	uint		mid_cycle;
390 	int		error;
391 
392 	end_blk = *last_blk;
393 	mid_blk = BLK_AVG(first_blk, end_blk);
394 	while (mid_blk != first_blk && mid_blk != end_blk) {
395 		error = xlog_bread(log, mid_blk, 1, bp, &offset);
396 		if (error)
397 			return error;
398 		mid_cycle = xlog_get_cycle(offset);
399 		if (mid_cycle == cycle)
400 			end_blk = mid_blk;   /* last_half_cycle == mid_cycle */
401 		else
402 			first_blk = mid_blk; /* first_half_cycle == mid_cycle */
403 		mid_blk = BLK_AVG(first_blk, end_blk);
404 	}
405 	ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
406 	       (mid_blk == end_blk && mid_blk-1 == first_blk));
407 
408 	*last_blk = end_blk;
409 
410 	return 0;
411 }
412 
413 /*
414  * Check that a range of blocks does not contain stop_on_cycle_no.
415  * Fill in *new_blk with the block offset where such a block is
416  * found, or with -1 (an invalid block number) if there is no such
417  * block in the range.  The scan needs to occur from front to back
418  * and the pointer into the region must be updated since a later
419  * routine will need to perform another test.
420  */
421 STATIC int
422 xlog_find_verify_cycle(
423 	xlog_t		*log,
424 	xfs_daddr_t	start_blk,
425 	int		nbblks,
426 	uint		stop_on_cycle_no,
427 	xfs_daddr_t	*new_blk)
428 {
429 	xfs_daddr_t	i, j;
430 	uint		cycle;
431 	xfs_buf_t	*bp;
432 	xfs_daddr_t	bufblks;
433 	xfs_caddr_t	buf = NULL;
434 	int		error = 0;
435 
436 	/*
437 	 * Greedily allocate a buffer big enough to handle the full
438 	 * range of basic blocks we'll be examining.  If that fails,
439 	 * try a smaller size.  We need to be able to read at least
440 	 * a log sector, or we're out of luck.
441 	 */
442 	bufblks = 1 << ffs(nbblks);
443 	while (!(bp = xlog_get_bp(log, bufblks))) {
444 		bufblks >>= 1;
445 		if (bufblks < log->l_sectBBsize)
446 			return ENOMEM;
447 	}
448 
449 	for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
450 		int	bcount;
451 
452 		bcount = min(bufblks, (start_blk + nbblks - i));
453 
454 		error = xlog_bread(log, i, bcount, bp, &buf);
455 		if (error)
456 			goto out;
457 
458 		for (j = 0; j < bcount; j++) {
459 			cycle = xlog_get_cycle(buf);
460 			if (cycle == stop_on_cycle_no) {
461 				*new_blk = i+j;
462 				goto out;
463 			}
464 
465 			buf += BBSIZE;
466 		}
467 	}
468 
469 	*new_blk = -1;
470 
471 out:
472 	xlog_put_bp(bp);
473 	return error;
474 }
475 
476 /*
477  * Potentially back up over a partial log record write.
478  *
479  * In the typical case, last_blk is the number of the block directly after
480  * a good log record.  Therefore, we subtract one to get the block number
481  * of the last block in the given buffer.  extra_bblks contains the number
482  * of blocks we would have read on a previous read.  This happens when the
483  * last log record is split over the end of the physical log.
484  *
485  * extra_bblks is the number of blocks potentially verified on a previous
486  * call to this routine.
487  */
488 STATIC int
489 xlog_find_verify_log_record(
490 	xlog_t			*log,
491 	xfs_daddr_t		start_blk,
492 	xfs_daddr_t		*last_blk,
493 	int			extra_bblks)
494 {
495 	xfs_daddr_t		i;
496 	xfs_buf_t		*bp;
497 	xfs_caddr_t		offset = NULL;
498 	xlog_rec_header_t	*head = NULL;
499 	int			error = 0;
500 	int			smallmem = 0;
501 	int			num_blks = *last_blk - start_blk;
502 	int			xhdrs;
503 
504 	ASSERT(start_blk != 0 || *last_blk != start_blk);
505 
506 	if (!(bp = xlog_get_bp(log, num_blks))) {
507 		if (!(bp = xlog_get_bp(log, 1)))
508 			return ENOMEM;
509 		smallmem = 1;
510 	} else {
511 		error = xlog_bread(log, start_blk, num_blks, bp, &offset);
512 		if (error)
513 			goto out;
514 		offset += ((num_blks - 1) << BBSHIFT);
515 	}
516 
517 	for (i = (*last_blk) - 1; i >= 0; i--) {
518 		if (i < start_blk) {
519 			/* valid log record not found */
520 			xfs_warn(log->l_mp,
521 		"Log inconsistent (didn't find previous header)");
522 			ASSERT(0);
523 			error = XFS_ERROR(EIO);
524 			goto out;
525 		}
526 
527 		if (smallmem) {
528 			error = xlog_bread(log, i, 1, bp, &offset);
529 			if (error)
530 				goto out;
531 		}
532 
533 		head = (xlog_rec_header_t *)offset;
534 
535 		if (head->h_magicno == cpu_to_be32(XLOG_HEADER_MAGIC_NUM))
536 			break;
537 
538 		if (!smallmem)
539 			offset -= BBSIZE;
540 	}
541 
542 	/*
543 	 * We hit the beginning of the physical log & still no header.  Return
544 	 * to caller.  If caller can handle a return of -1, then this routine
545 	 * will be called again for the end of the physical log.
546 	 */
547 	if (i == -1) {
548 		error = -1;
549 		goto out;
550 	}
551 
552 	/*
553 	 * We have the final block of the good log (the first block
554 	 * of the log record _before_ the head). So we check the uuid.
555 	 */
556 	if ((error = xlog_header_check_mount(log->l_mp, head)))
557 		goto out;
558 
559 	/*
560 	 * We may have found a log record header before we expected one.
561 	 * last_blk will be the 1st block # with a given cycle #.  We may end
562 	 * up reading an entire log record.  In this case, we don't want to
563 	 * reset last_blk.  Only when last_blk points in the middle of a log
564 	 * record do we update last_blk.
565 	 */
566 	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
567 		uint	h_size = be32_to_cpu(head->h_size);
568 
569 		xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
570 		if (h_size % XLOG_HEADER_CYCLE_SIZE)
571 			xhdrs++;
572 	} else {
573 		xhdrs = 1;
574 	}
575 
576 	if (*last_blk - i + extra_bblks !=
577 	    BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
578 		*last_blk = i;
579 
580 out:
581 	xlog_put_bp(bp);
582 	return error;
583 }
584 
585 /*
586  * Head is defined to be the point of the log where the next log write
587  * could go.  This means that incomplete LR writes at the end are
588  * eliminated when calculating the head.  We aren't guaranteed that previous
589  * LR writes have complete transactions.  We only know that blocks with a
590  * cycle number of the current cycle number - 1 won't be present in the log
591  * if we start writing from our current block number.
592  *
593  * last_blk contains the block number of the first block with a given
594  * cycle number.
595  *
596  * Return: zero if normal, non-zero if error.
597  */
598 STATIC int
599 xlog_find_head(
600 	xlog_t 		*log,
601 	xfs_daddr_t	*return_head_blk)
602 {
603 	xfs_buf_t	*bp;
604 	xfs_caddr_t	offset;
605 	xfs_daddr_t	new_blk, first_blk, start_blk, last_blk, head_blk;
606 	int		num_scan_bblks;
607 	uint		first_half_cycle, last_half_cycle;
608 	uint		stop_on_cycle;
609 	int		error, log_bbnum = log->l_logBBsize;
610 
611 	/* Is the end of the log device zeroed? */
612 	if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
613 		*return_head_blk = first_blk;
614 
615 		/* Is the whole lot zeroed? */
616 		if (!first_blk) {
617 			/* Linux XFS shouldn't generate totally zeroed logs -
618 			 * mkfs etc write a dummy unmount record to a fresh
619 			 * log so we can store the uuid in there
620 			 */
621 			xfs_warn(log->l_mp, "totally zeroed log");
622 		}
623 
624 		return 0;
625 	} else if (error) {
626 		xfs_warn(log->l_mp, "empty log check failed");
627 		return error;
628 	}
629 
630 	first_blk = 0;			/* get cycle # of 1st block */
631 	bp = xlog_get_bp(log, 1);
632 	if (!bp)
633 		return ENOMEM;
634 
635 	error = xlog_bread(log, 0, 1, bp, &offset);
636 	if (error)
637 		goto bp_err;
638 
639 	first_half_cycle = xlog_get_cycle(offset);
640 
641 	last_blk = head_blk = log_bbnum - 1;	/* get cycle # of last block */
642 	error = xlog_bread(log, last_blk, 1, bp, &offset);
643 	if (error)
644 		goto bp_err;
645 
646 	last_half_cycle = xlog_get_cycle(offset);
647 	ASSERT(last_half_cycle != 0);
648 
649 	/*
650 	 * If the 1st half cycle number is equal to the last half cycle number,
651 	 * then the entire log is stamped with the same cycle number.  In this
652 	 * case, head_blk can't be set to zero (which makes sense).  The below
653 	 * math doesn't work out properly with head_blk equal to zero.  Instead,
654 	 * we set it to log_bbnum which is an invalid block number, but this
655 	 * value makes the math correct.  If head_blk doesn't change through
656 	 * all the tests below, *head_blk is set to zero at the very end rather
657 	 * than log_bbnum.  In a sense, log_bbnum and zero are the same block
658 	 * in a circular file.
659 	 */
660 	if (first_half_cycle == last_half_cycle) {
661 		/*
662 		 * In this case we believe that the entire log should have
663 		 * cycle number last_half_cycle.  We need to scan backwards
664 		 * from the end verifying that there are no holes still
665 		 * containing last_half_cycle - 1.  If we find such a hole,
666 		 * then the start of that hole will be the new head.  The
667 		 * simple case looks like
668 		 *        x | x ... | x - 1 | x
669 		 * Another case that fits this picture would be
670 		 *        x | x + 1 | x ... | x
671 		 * In this case the head really is somewhere at the end of the
672 		 * log, as one of the latest writes at the beginning was
673 		 * incomplete.
674 		 * One more case is
675 		 *        x | x + 1 | x ... | x - 1 | x
676 		 * This is really the combination of the above two cases, and
677 		 * the head has to end up at the start of the x-1 hole at the
678 		 * end of the log.
679 		 *
680 		 * In the 256k log case, we will read from the beginning to the
681 		 * end of the log and search for cycle numbers equal to x-1.
682 		 * We don't worry about the x+1 blocks that we encounter,
683 		 * because we know that they cannot be the head since the log
684 		 * started with x.
685 		 */
686 		head_blk = log_bbnum;
687 		stop_on_cycle = last_half_cycle - 1;
688 	} else {
689 		/*
690 		 * In this case we want to find the first block with cycle
691 		 * number matching last_half_cycle.  We expect the log to be
692 		 * some variation on
693 		 *        x + 1 ... | x ... | x
694 		 * The first block with cycle number x (last_half_cycle) will
695 		 * be where the new head belongs.  First we do a binary search
696 		 * for the first occurrence of last_half_cycle.  The binary
697 		 * search may not be totally accurate, so then we scan back
698 		 * from there looking for occurrences of last_half_cycle before
699 		 * us.  If that backwards scan wraps around the beginning of
700 		 * the log, then we look for occurrences of last_half_cycle - 1
701 		 * at the end of the log.  The cases we're looking for look
702 		 * like
703 		 *                               v binary search stopped here
704 		 *        x + 1 ... | x | x + 1 | x ... | x
705 		 *                   ^ but we want to locate this spot
706 		 * or
707 		 *        <---------> less than scan distance
708 		 *        x + 1 ... | x ... | x - 1 | x
709 		 *                           ^ we want to locate this spot
710 		 */
711 		stop_on_cycle = last_half_cycle;
712 		if ((error = xlog_find_cycle_start(log, bp, first_blk,
713 						&head_blk, last_half_cycle)))
714 			goto bp_err;
715 	}
716 
717 	/*
718 	 * Now validate the answer.  Scan back some number of maximum possible
719 	 * blocks and make sure each one has the expected cycle number.  The
720 	 * maximum is determined by the total possible amount of buffering
721 	 * in the in-core log.  The following number can be made tighter if
722 	 * we actually look at the block size of the filesystem.
723 	 */
724 	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
725 	if (head_blk >= num_scan_bblks) {
726 		/*
727 		 * We are guaranteed that the entire check can be performed
728 		 * in one buffer.
729 		 */
730 		start_blk = head_blk - num_scan_bblks;
731 		if ((error = xlog_find_verify_cycle(log,
732 						start_blk, num_scan_bblks,
733 						stop_on_cycle, &new_blk)))
734 			goto bp_err;
735 		if (new_blk != -1)
736 			head_blk = new_blk;
737 	} else {		/* need to read 2 parts of log */
738 		/*
739 		 * We are going to scan backwards in the log in two parts.
740 		 * First we scan the physical end of the log.  In this part
741 		 * of the log, we are looking for blocks with cycle number
742 		 * last_half_cycle - 1.
743 		 * If we find one, then we know that the log starts there, as
744 		 * we've found a hole that didn't get written in going around
745 		 * the end of the physical log.  The simple case for this is
746 		 *        x + 1 ... | x ... | x - 1 | x
747 		 *        <---------> less than scan distance
748 		 * If all of the blocks at the end of the log have cycle number
749 		 * last_half_cycle, then we check the blocks at the start of
750 		 * the log looking for occurrences of last_half_cycle.  If we
751 		 * find one, then our current estimate for the location of the
752 		 * first occurrence of last_half_cycle is wrong and we move
753 		 * back to the hole we've found.  This case looks like
754 		 *        x + 1 ... | x | x + 1 | x ...
755 		 *                               ^ binary search stopped here
756 		 * Another case we need to handle that only occurs in 256k
757 		 * logs is
758 		 *        x + 1 ... | x ... | x+1 | x ...
759 		 *                   ^ binary search stops here
760 		 * In a 256k log, the scan at the end of the log will see the
761 		 * x + 1 blocks.  We need to skip past those since that is
762 		 * certainly not the head of the log.  By searching for
763 		 * last_half_cycle-1 we accomplish that.
764 		 */
765 		ASSERT(head_blk <= INT_MAX &&
766 			(xfs_daddr_t) num_scan_bblks >= head_blk);
767 		start_blk = log_bbnum - (num_scan_bblks - head_blk);
768 		if ((error = xlog_find_verify_cycle(log, start_blk,
769 					num_scan_bblks - (int)head_blk,
770 					(stop_on_cycle - 1), &new_blk)))
771 			goto bp_err;
772 		if (new_blk != -1) {
773 			head_blk = new_blk;
774 			goto validate_head;
775 		}
776 
777 		/*
778 		 * Scan beginning of log now.  The last part of the physical
779 		 * log is good.  This scan needs to verify that it doesn't find
780 		 * the last_half_cycle.
781 		 */
782 		start_blk = 0;
783 		ASSERT(head_blk <= INT_MAX);
784 		if ((error = xlog_find_verify_cycle(log,
785 					start_blk, (int)head_blk,
786 					stop_on_cycle, &new_blk)))
787 			goto bp_err;
788 		if (new_blk != -1)
789 			head_blk = new_blk;
790 	}
791 
792 validate_head:
793 	/*
794 	 * Now we need to make sure head_blk is not pointing to a block in
795 	 * the middle of a log record.
796 	 */
797 	num_scan_bblks = XLOG_REC_SHIFT(log);
798 	if (head_blk >= num_scan_bblks) {
799 		start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
800 
801 		/* start ptr at last block ptr before head_blk */
802 		if ((error = xlog_find_verify_log_record(log, start_blk,
803 							&head_blk, 0)) == -1) {
804 			error = XFS_ERROR(EIO);
805 			goto bp_err;
806 		} else if (error)
807 			goto bp_err;
808 	} else {
809 		start_blk = 0;
810 		ASSERT(head_blk <= INT_MAX);
811 		if ((error = xlog_find_verify_log_record(log, start_blk,
812 							&head_blk, 0)) == -1) {
813 			/* We hit the beginning of the log during our search */
814 			start_blk = log_bbnum - (num_scan_bblks - head_blk);
815 			new_blk = log_bbnum;
816 			ASSERT(start_blk <= INT_MAX &&
817 				(xfs_daddr_t) log_bbnum-start_blk >= 0);
818 			ASSERT(head_blk <= INT_MAX);
819 			if ((error = xlog_find_verify_log_record(log,
820 							start_blk, &new_blk,
821 							(int)head_blk)) == -1) {
822 				error = XFS_ERROR(EIO);
823 				goto bp_err;
824 			} else if (error)
825 				goto bp_err;
826 			if (new_blk != log_bbnum)
827 				head_blk = new_blk;
828 		} else if (error)
829 			goto bp_err;
830 	}
831 
832 	xlog_put_bp(bp);
833 	if (head_blk == log_bbnum)
834 		*return_head_blk = 0;
835 	else
836 		*return_head_blk = head_blk;
837 	/*
838 	 * When returning here, we have a good block number.  Bad block
839 	 * means that during a previous crash, we didn't have a clean break
840 	 * from cycle number N to cycle number N-1.  In this case, we need
841 	 * to find the first block with cycle number N-1.
842 	 */
843 	return 0;
844 
845  bp_err:
846 	xlog_put_bp(bp);
847 
848 	if (error)
849 		xfs_warn(log->l_mp, "failed to find log head");
850 	return error;
851 }
852 
853 /*
854  * Find the sync block number or the tail of the log.
855  *
856  * This will be the block number of the last record to have its
857  * associated buffers synced to disk.  Every log record header has
858  * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
859  * to get a sync block number.  The only concern is to figure out which
860  * log record header to believe.
861  *
862  * The following algorithm uses the log record header with the largest
863  * lsn.  The entire log record does not need to be valid.  We only care
864  * that the header is valid.
865  *
866  * We could speed up the search by using the current head_blk buffer, but it
867  * is not available.
868  */
869 STATIC int
870 xlog_find_tail(
871 	xlog_t			*log,
872 	xfs_daddr_t		*head_blk,
873 	xfs_daddr_t		*tail_blk)
874 {
875 	xlog_rec_header_t	*rhead;
876 	xlog_op_header_t	*op_head;
877 	xfs_caddr_t		offset = NULL;
878 	xfs_buf_t		*bp;
879 	int			error, i, found;
880 	xfs_daddr_t		umount_data_blk;
881 	xfs_daddr_t		after_umount_blk;
882 	xfs_lsn_t		tail_lsn;
883 	int			hblks;
884 
885 	found = 0;
886 
887 	/*
888 	 * Find previous log record
889 	 */
890 	if ((error = xlog_find_head(log, head_blk)))
891 		return error;
892 
893 	bp = xlog_get_bp(log, 1);
894 	if (!bp)
895 		return ENOMEM;
896 	if (*head_blk == 0) {				/* special case */
897 		error = xlog_bread(log, 0, 1, bp, &offset);
898 		if (error)
899 			goto done;
900 
901 		if (xlog_get_cycle(offset) == 0) {
902 			*tail_blk = 0;
903 			/* leave all other log inited values alone */
904 			goto done;
905 		}
906 	}
907 
908 	/*
909 	 * Search backwards looking for log record header block
910 	 */
911 	ASSERT(*head_blk < INT_MAX);
912 	for (i = (int)(*head_blk) - 1; i >= 0; i--) {
913 		error = xlog_bread(log, i, 1, bp, &offset);
914 		if (error)
915 			goto done;
916 
917 		if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
918 			found = 1;
919 			break;
920 		}
921 	}
922 	/*
923 	 * If we haven't found the log record header block, start looking
924 	 * again from the end of the physical log.  XXXmiken: There should be
925 	 * a check here to make sure we didn't search more than N blocks in
926 	 * the previous code.
927 	 */
928 	if (!found) {
929 		for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
930 			error = xlog_bread(log, i, 1, bp, &offset);
931 			if (error)
932 				goto done;
933 
934 			if (*(__be32 *)offset ==
935 			    cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
936 				found = 2;
937 				break;
938 			}
939 		}
940 	}
941 	if (!found) {
942 		xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
943 		ASSERT(0);
944 		return XFS_ERROR(EIO);
945 	}
946 
947 	/* find blk_no of tail of log */
948 	rhead = (xlog_rec_header_t *)offset;
949 	*tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
950 
951 	/*
952 	 * Reset log values according to the state of the log when we
953 	 * crashed.  In the case where head_blk == 0, we bump curr_cycle
954 	 * by one because the next write starts a new cycle rather than
955 	 * continuing the cycle of the last good log record.  At this
956 	 * point we have guaranteed that all partial log records have been
957 	 * accounted for.  Therefore, we know that the last good log record
958 	 * written was complete and ended exactly on the end boundary
959 	 * of the physical log.
960 	 */
961 	log->l_prev_block = i;
962 	log->l_curr_block = (int)*head_blk;
963 	log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
964 	if (found == 2)
965 		log->l_curr_cycle++;
966 	atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
967 	atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
968 	xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
969 					BBTOB(log->l_curr_block));
970 	xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
971 					BBTOB(log->l_curr_block));
972 
973 	/*
974 	 * Look for unmount record.  If we find it, then we know there
975 	 * was a clean unmount.  Since 'i' could be the last block in
976 	 * the physical log, we convert to a log block before comparing
977 	 * to the head_blk.
978 	 *
979 	 * Save the current tail lsn to use to pass to
980 	 * xlog_clear_stale_blocks() below.  We won't want to clear the
981 	 * unmount record if there is one, so we pass the lsn of the
982 	 * unmount record rather than the block after it.
983 	 */
984 	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
985 		int	h_size = be32_to_cpu(rhead->h_size);
986 		int	h_version = be32_to_cpu(rhead->h_version);
987 
988 		if ((h_version & XLOG_VERSION_2) &&
989 		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
990 			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
991 			if (h_size % XLOG_HEADER_CYCLE_SIZE)
992 				hblks++;
993 		} else {
994 			hblks = 1;
995 		}
996 	} else {
997 		hblks = 1;
998 	}
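	/*
	 * after_umount_blk is the block just past the last record (header
	 * plus data), wrapped modulo the size of the physical log.
	 */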
999 	after_umount_blk = (i + hblks + (int)
1000 		BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
1001 	tail_lsn = atomic64_read(&log->l_tail_lsn);
1002 	if (*head_blk == after_umount_blk &&
1003 	    be32_to_cpu(rhead->h_num_logops) == 1) {
1004 		umount_data_blk = (i + hblks) % log->l_logBBsize;
1005 		error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
1006 		if (error)
1007 			goto done;
1008 
1009 		op_head = (xlog_op_header_t *)offset;
1010 		if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
1011 			/*
1012 			 * Set tail and last sync so that newly written
1013 			 * log records will point recovery to after the
1014 			 * current unmount record.
1015 			 */
1016 			xlog_assign_atomic_lsn(&log->l_tail_lsn,
1017 					log->l_curr_cycle, after_umount_blk);
1018 			xlog_assign_atomic_lsn(&log->l_last_sync_lsn,
1019 					log->l_curr_cycle, after_umount_blk);
1020 			*tail_blk = after_umount_blk;
1021 
1022 			/*
1023 			 * Note that the unmount was clean. If the unmount
1024 			 * was not clean, we need to know this to rebuild the
1025 			 * superblock counters from the perag headers if we
1026 			 * have a filesystem using non-persistent counters.
1027 			 */
1028 			log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
1029 		}
1030 	}
1031 
1032 	/*
1033 	 * Make sure that there are no blocks in front of the head
1034 	 * with the same cycle number as the head.  This can happen
1035 	 * because we allow multiple outstanding log writes concurrently,
1036 	 * and the later writes might make it out before earlier ones.
1037 	 *
1038 	 * We use the lsn from before modifying it so that we'll never
1039 	 * overwrite the unmount record after a clean unmount.
1040 	 *
1041 	 * Do this only if we are going to recover the filesystem
1042 	 *
1043 	 * NOTE: This used to say "if (!readonly)"
1044 	 * However on Linux, we can & do recover a read-only filesystem.
1045 	 * We only skip recovery if NORECOVERY is specified on mount,
1046 	 * in which case we would not be here.
1047 	 *
1048 	 * But... if the -device- itself is readonly, just skip this.
1049 	 * We can't recover this device anyway, so it won't matter.
1050 	 */
1051 	if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
1052 		error = xlog_clear_stale_blocks(log, tail_lsn);
1053 
1054 done:
1055 	xlog_put_bp(bp);
1056 
1057 	if (error)
1058 		xfs_warn(log->l_mp, "failed to locate log tail");
1059 	return error;
1060 }
1061 
1062 /*
1063  * Is the log zeroed at all?
1064  *
1065  * The last binary search should be changed to perform an X block read
1066  * once X becomes small enough.  You can then search linearly through
1067  * the X blocks.  This will cut down on the number of reads we need to do.
1068  *
1069  * If the log is partially zeroed, this routine will pass back the blkno
1070  * of the first block with cycle number 0.  It won't have a complete LR
1071  * preceding it.
1072  *
1073  * Return:
1074  *	0  => the log is completely written to
1075  *	-1 => use *blk_no as the first block of the log
1076  *	>0 => error has occurred
1077  */
1078 STATIC int
1079 xlog_find_zeroed(
1080 	xlog_t		*log,
1081 	xfs_daddr_t	*blk_no)
1082 {
1083 	xfs_buf_t	*bp;
1084 	xfs_caddr_t	offset;
1085 	uint	        first_cycle, last_cycle;
1086 	xfs_daddr_t	new_blk, last_blk, start_blk;
1087 	xfs_daddr_t     num_scan_bblks;
1088 	int	        error, log_bbnum = log->l_logBBsize;
1089 
1090 	*blk_no = 0;
1091 
1092 	/* check totally zeroed log */
1093 	bp = xlog_get_bp(log, 1);
1094 	if (!bp)
1095 		return ENOMEM;
1096 	error = xlog_bread(log, 0, 1, bp, &offset);
1097 	if (error)
1098 		goto bp_err;
1099 
1100 	first_cycle = xlog_get_cycle(offset);
1101 	if (first_cycle == 0) {		/* completely zeroed log */
1102 		*blk_no = 0;
1103 		xlog_put_bp(bp);
1104 		return -1;
1105 	}
1106 
1107 	/* check partially zeroed log */
1108 	error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
1109 	if (error)
1110 		goto bp_err;
1111 
1112 	last_cycle = xlog_get_cycle(offset);
1113 	if (last_cycle != 0) {		/* log completely written to */
1114 		xlog_put_bp(bp);
1115 		return 0;
1116 	} else if (first_cycle != 1) {
1117 		/*
1118 		 * If the cycle of the last block is zero, the cycle of
1119 		 * the first block must be 1. If it's not, maybe we're
1120 		 * not looking at a log... Bail out.
1121 		 */
1122 		xfs_warn(log->l_mp,
1123 			"Log inconsistent or not a log (last==0, first!=1)");
1124 		return XFS_ERROR(EINVAL);
1125 	}
1126 
1127 	/* we have a partially zeroed log */
1128 	last_blk = log_bbnum-1;
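	/* binary search for the first block whose cycle number is 0 */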
1129 	if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1130 		goto bp_err;
1131 
1132 	/*
1133 	 * Validate the answer.  Because there is no way to guarantee that
1134 	 * the entire log is made up of log records which are the same size,
1135 	 * we scan over the defined maximum blocks.  At this point, the maximum
1136 	 * is not chosen to mean anything special.   XXXmiken
1137 	 */
1138 	num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1139 	ASSERT(num_scan_bblks <= INT_MAX);
1140 
1141 	if (last_blk < num_scan_bblks)
1142 		num_scan_bblks = last_blk;
1143 	start_blk = last_blk - num_scan_bblks;
1144 
1145 	/*
1146 	 * We search for any instances of cycle number 0 that occur before
1147 	 * our current estimate of the head.  What we're trying to detect is
1148 	 *        1 ... | 0 | 1 | 0...
1149 	 *                       ^ binary search ends here
1150 	 */
1151 	if ((error = xlog_find_verify_cycle(log, start_blk,
1152 					 (int)num_scan_bblks, 0, &new_blk)))
1153 		goto bp_err;
1154 	if (new_blk != -1)
1155 		last_blk = new_blk;
1156 
1157 	/*
1158 	 * Potentially back up over a partial log record write.  We don't need
1159 	 * to search the end of the log because we know it is zero.
1160 	 */
1161 	if ((error = xlog_find_verify_log_record(log, start_blk,
1162 				&last_blk, 0)) == -1) {
1163 	    error = XFS_ERROR(EIO);
1164 	    goto bp_err;
1165 	} else if (error)
1166 	    goto bp_err;
1167 
1168 	*blk_no = last_blk;
1169 bp_err:
1170 	xlog_put_bp(bp);
1171 	if (error)
1172 		return error;
1173 	return -1;
1174 }
1175 
1176 /*
1177  * These are simple subroutines used by xlog_clear_stale_blocks() below
1178  * to initialize a buffer full of empty log record headers and write
1179  * them into the log.
1180  */
1181 STATIC void
1182 xlog_add_record(
1183 	xlog_t			*log,
1184 	xfs_caddr_t		buf,
1185 	int			cycle,
1186 	int			block,
1187 	int			tail_cycle,
1188 	int			tail_block)
1189 {
1190 	xlog_rec_header_t	*recp = (xlog_rec_header_t *)buf;
1191 
1192 	memset(buf, 0, BBSIZE);
1193 	recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1194 	recp->h_cycle = cpu_to_be32(cycle);
1195 	recp->h_version = cpu_to_be32(
1196 			xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
1197 	recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1198 	recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1199 	recp->h_fmt = cpu_to_be32(XLOG_FMT);
1200 	memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1201 }
1202 
1203 STATIC int
1204 xlog_write_log_records(
1205 	xlog_t		*log,
1206 	int		cycle,
1207 	int		start_block,
1208 	int		blocks,
1209 	int		tail_cycle,
1210 	int		tail_block)
1211 {
1212 	xfs_caddr_t	offset;
1213 	xfs_buf_t	*bp;
1214 	int		balign, ealign;
1215 	int		sectbb = log->l_sectBBsize;
1216 	int		end_block = start_block + blocks;
1217 	int		bufblks;
1218 	int		error = 0;
1219 	int		i, j = 0;
1220 
1221 	/*
1222 	 * Greedily allocate a buffer big enough to handle the full
1223 	 * range of basic blocks to be written.  If that fails, try
1224 	 * a smaller size.  We need to be able to write at least a
1225 	 * log sector, or we're out of luck.
1226 	 */
1227 	bufblks = 1 << ffs(blocks);
1228 	while (!(bp = xlog_get_bp(log, bufblks))) {
1229 		bufblks >>= 1;
1230 		if (bufblks < sectbb)
1231 			return ENOMEM;
1232 	}
1233 
1234 	/* We may need to do a read at the start to fill in part of
1235 	 * the buffer in the starting sector not covered by the first
1236 	 * write below.
1237 	 */
1238 	balign = round_down(start_block, sectbb);
1239 	if (balign != start_block) {
1240 		error = xlog_bread_noalign(log, start_block, 1, bp);
1241 		if (error)
1242 			goto out_put_bp;
1243 
1244 		j = start_block - balign;
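		/*
		 * The first j basic blocks of the buffer now hold data read
		 * back from the partially used start sector; they are written
		 * out again unchanged by the first xlog_bwrite() below.
		 */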
1245 	}
1246 
1247 	for (i = start_block; i < end_block; i += bufblks) {
1248 		int		bcount, endcount;
1249 
1250 		bcount = min(bufblks, end_block - start_block);
1251 		endcount = bcount - j;
1252 
1253 		/* We may need to do a read at the end to fill in part of
1254 		 * the buffer in the final sector not covered by the write.
1255 		 * If this is the same sector as the above read, skip it.
1256 		 */
1257 		ealign = round_down(end_block, sectbb);
1258 		if (j == 0 && (start_block + endcount > ealign)) {
1259 			offset = bp->b_addr + BBTOB(ealign - start_block);
1260 			error = xlog_bread_offset(log, ealign, sectbb,
1261 							bp, offset);
1262 			if (error)
1263 				break;
1264 
1265 		}
1266 
1267 		offset = xlog_align(log, start_block, endcount, bp);
1268 		for (; j < endcount; j++) {
1269 			xlog_add_record(log, offset, cycle, i+j,
1270 					tail_cycle, tail_block);
1271 			offset += BBSIZE;
1272 		}
1273 		error = xlog_bwrite(log, start_block, endcount, bp);
1274 		if (error)
1275 			break;
1276 		start_block += endcount;
1277 		j = 0;
1278 	}
1279 
1280  out_put_bp:
1281 	xlog_put_bp(bp);
1282 	return error;
1283 }
1284 
1285 /*
1286  * This routine is called to blow away any incomplete log writes out
1287  * in front of the log head.  We do this so that we won't become confused
1288  * if we come up, write only a little bit more, and then crash again.
1289  * If we leave the partial log records out there, this situation could
1290  * cause us to think those partial writes are valid blocks since they
1291  * have the current cycle number.  We get rid of them by overwriting them
1292  * with empty log records with the old cycle number rather than the
1293  * current one.
1294  *
1295  * The tail lsn is passed in rather than taken from
1296  * the log so that we will not write over the unmount record after a
1297  * clean unmount in a 512 block log.  Doing so would leave the log without
1298  * any valid log records in it until a new one was written.  If we crashed
1299  * during that time we would not be able to recover.
1300  */
1301 STATIC int
1302 xlog_clear_stale_blocks(
1303 	xlog_t		*log,
1304 	xfs_lsn_t	tail_lsn)
1305 {
1306 	int		tail_cycle, head_cycle;
1307 	int		tail_block, head_block;
1308 	int		tail_distance, max_distance;
1309 	int		distance;
1310 	int		error;
1311 
1312 	tail_cycle = CYCLE_LSN(tail_lsn);
1313 	tail_block = BLOCK_LSN(tail_lsn);
1314 	head_cycle = log->l_curr_cycle;
1315 	head_block = log->l_curr_block;
1316 
1317 	/*
1318 	 * Figure out the distance between the new head of the log
1319 	 * and the tail.  We want to write over any blocks beyond the
1320 	 * head that we may have written just before the crash, but
1321 	 * we don't want to overwrite the tail of the log.
1322 	 */
1323 	if (head_cycle == tail_cycle) {
1324 		/*
1325 		 * The tail is behind the head in the physical log,
1326 		 * so the distance from the head to the tail is the
1327 		 * distance from the head to the end of the log plus
1328 		 * the distance from the beginning of the log to the
1329 		 * tail.
1330 		 */
1331 		if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
1332 			XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
1333 					 XFS_ERRLEVEL_LOW, log->l_mp);
1334 			return XFS_ERROR(EFSCORRUPTED);
1335 		}
1336 		tail_distance = tail_block + (log->l_logBBsize - head_block);
1337 	} else {
1338 		/*
1339 		 * The head is behind the tail in the physical log,
1340 		 * so the distance from the head to the tail is just
1341 		 * the tail block minus the head block.
1342 		 */
1343 		if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
1344 			XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
1345 					 XFS_ERRLEVEL_LOW, log->l_mp);
1346 			return XFS_ERROR(EFSCORRUPTED);
1347 		}
1348 		tail_distance = tail_block - head_block;
1349 	}
1350 
1351 	/*
1352 	 * If the head is right up against the tail, we can't clear
1353 	 * anything.
1354 	 */
1355 	if (tail_distance <= 0) {
1356 		ASSERT(tail_distance == 0);
1357 		return 0;
1358 	}
1359 
1360 	max_distance = XLOG_TOTAL_REC_SHIFT(log);
1361 	/*
1362 	 * Take the smaller of the maximum amount of outstanding I/O
1363 	 * we could have and the distance to the tail to clear out.
1364 	 * We take the smaller so that we don't overwrite the tail and
1365 	 * we don't waste all day writing from the head to the tail
1366 	 * for no reason.
1367 	 */
1368 	max_distance = MIN(max_distance, tail_distance);
1369 
1370 	if ((head_block + max_distance) <= log->l_logBBsize) {
1371 		/*
1372 		 * We can stomp all the blocks we need to without
1373 		 * wrapping around the end of the log.  Just do it
1374 		 * in a single write.  Use the cycle number of the
1375 		 * current cycle minus one so that the log will look like:
1376 		 *     n ... | n - 1 ...
1377 		 */
1378 		error = xlog_write_log_records(log, (head_cycle - 1),
1379 				head_block, max_distance, tail_cycle,
1380 				tail_block);
1381 		if (error)
1382 			return error;
1383 	} else {
1384 		/*
1385 		 * We need to wrap around the end of the physical log in
1386 		 * order to clear all the blocks.  Do it in two separate
1387 		 * I/Os.  The first write should be from the head to the
1388 		 * end of the physical log, and it should use the current
1389 		 * cycle number minus one just like above.
1390 		 */
1391 		distance = log->l_logBBsize - head_block;
1392 		error = xlog_write_log_records(log, (head_cycle - 1),
1393 				head_block, distance, tail_cycle,
1394 				tail_block);
1395 
1396 		if (error)
1397 			return error;
1398 
1399 		/*
1400 		 * Now write the blocks at the start of the physical log.
1401 		 * This writes the remainder of the blocks we want to clear.
1402 		 * It uses the current cycle number since we're now on the
1403 		 * same cycle as the head so that we get:
1404 		 *    n ... n ... | n - 1 ...
1405 		 *    ^^^^^ blocks we're writing
1406 		 */
1407 		distance = max_distance - (log->l_logBBsize - head_block);
1408 		error = xlog_write_log_records(log, head_cycle, 0, distance,
1409 				tail_cycle, tail_block);
1410 		if (error)
1411 			return error;
1412 	}
1413 
1414 	return 0;
1415 }
1416 
1417 /******************************************************************************
1418  *
1419  *		Log recover routines
1420  *
1421  ******************************************************************************
1422  */
1423 
1424 STATIC xlog_recover_t *
1425 xlog_recover_find_tid(
1426 	struct hlist_head	*head,
1427 	xlog_tid_t		tid)
1428 {
1429 	xlog_recover_t		*trans;
1430 	struct hlist_node	*n;
1431 
1432 	hlist_for_each_entry(trans, n, head, r_list) {
1433 		if (trans->r_log_tid == tid)
1434 			return trans;
1435 	}
1436 	return NULL;
1437 }
1438 
1439 STATIC void
1440 xlog_recover_new_tid(
1441 	struct hlist_head	*head,
1442 	xlog_tid_t		tid,
1443 	xfs_lsn_t		lsn)
1444 {
1445 	xlog_recover_t		*trans;
1446 
1447 	trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1448 	trans->r_log_tid   = tid;
1449 	trans->r_lsn	   = lsn;
1450 	INIT_LIST_HEAD(&trans->r_itemq);
1451 
1452 	INIT_HLIST_NODE(&trans->r_list);
1453 	hlist_add_head(&trans->r_list, head);
1454 }
1455 
1456 STATIC void
1457 xlog_recover_add_item(
1458 	struct list_head	*head)
1459 {
1460 	xlog_recover_item_t	*item;
1461 
1462 	item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
1463 	INIT_LIST_HEAD(&item->ri_list);
1464 	list_add_tail(&item->ri_list, head);
1465 }
1466 
1467 STATIC int
1468 xlog_recover_add_to_cont_trans(
1469 	struct log		*log,
1470 	xlog_recover_t		*trans,
1471 	xfs_caddr_t		dp,
1472 	int			len)
1473 {
1474 	xlog_recover_item_t	*item;
1475 	xfs_caddr_t		ptr, old_ptr;
1476 	int			old_len;
1477 
1478 	if (list_empty(&trans->r_itemq)) {
1479 		/* finish copying rest of trans header */
1480 		xlog_recover_add_item(&trans->r_itemq);
1481 		ptr = (xfs_caddr_t) &trans->r_theader +
1482 				sizeof(xfs_trans_header_t) - len;
1483 		memcpy(ptr, dp, len); /* d, s, l */
1484 		return 0;
1485 	}
1486 	/* take the tail entry */
1487 	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1488 
1489 	old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1490 	old_len = item->ri_buf[item->ri_cnt-1].i_len;
1491 
1492 	ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP);
1493 	memcpy(&ptr[old_len], dp, len); /* d, s, l */
1494 	item->ri_buf[item->ri_cnt-1].i_len += len;
1495 	item->ri_buf[item->ri_cnt-1].i_addr = ptr;
1496 	trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
1497 	return 0;
1498 }
1499 
1500 /*
1501  * The next region to add is the start of a new region.  It could be
1502  * a whole region or it could be the first part of a new region.  Because
1503  * of this, the assumption here is that the type and size fields of all
1504  * format structures fit into the first 32 bits of the structure.
1505  *
1506  * This works because all regions must be 32 bit aligned.  Therefore, we
1507  * either have both fields or we have neither field.  In the case we have
1508  * neither field, the data part of the region is zero length.  We only have
1509  * a log_op_header and can throw away the header since a new one will appear
1510  * later.  If we have at least 4 bytes, then we can determine how many regions
1511  * will appear in the current log item.
1512  */
1513 STATIC int
1514 xlog_recover_add_to_trans(
1515 	struct log		*log,
1516 	xlog_recover_t		*trans,
1517 	xfs_caddr_t		dp,
1518 	int			len)
1519 {
1520 	xfs_inode_log_format_t	*in_f;			/* any will do */
1521 	xlog_recover_item_t	*item;
1522 	xfs_caddr_t		ptr;
1523 
1524 	if (!len)
1525 		return 0;
1526 	if (list_empty(&trans->r_itemq)) {
1527 		/* we need to catch log corruptions here */
1528 		if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1529 			xfs_warn(log->l_mp, "%s: bad header magic number",
1530 				__func__);
1531 			ASSERT(0);
1532 			return XFS_ERROR(EIO);
1533 		}
1534 		if (len == sizeof(xfs_trans_header_t))
1535 			xlog_recover_add_item(&trans->r_itemq);
1536 		memcpy(&trans->r_theader, dp, len); /* d, s, l */
1537 		return 0;
1538 	}
1539 
1540 	ptr = kmem_alloc(len, KM_SLEEP);
1541 	memcpy(ptr, dp, len);
1542 	in_f = (xfs_inode_log_format_t *)ptr;
1543 
1544 	/* take the tail entry */
1545 	item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1546 	if (item->ri_total != 0 &&
1547 	     item->ri_total == item->ri_cnt) {
1548 		/* tail item is in use, get a new one */
1549 		xlog_recover_add_item(&trans->r_itemq);
1550 		item = list_entry(trans->r_itemq.prev,
1551 					xlog_recover_item_t, ri_list);
1552 	}
1553 
1554 	if (item->ri_total == 0) {		/* first region to be added */
1555 		if (in_f->ilf_size == 0 ||
1556 		    in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1557 			xfs_warn(log->l_mp,
1558 		"bad number of regions (%d) in inode log format",
1559 				  in_f->ilf_size);
1560 			ASSERT(0);
1561 			return XFS_ERROR(EIO);
1562 		}
1563 
1564 		item->ri_total = in_f->ilf_size;
1565 		item->ri_buf =
1566 			kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
1567 				    KM_SLEEP);
1568 	}
1569 	ASSERT(item->ri_total > item->ri_cnt);
1570 	/* Description region is ri_buf[0] */
1571 	item->ri_buf[item->ri_cnt].i_addr = ptr;
1572 	item->ri_buf[item->ri_cnt].i_len  = len;
1573 	item->ri_cnt++;
1574 	trace_xfs_log_recover_item_add(log, trans, item, 0);
1575 	return 0;
1576 }
1577 
1578 /*
1579  * Sort the log items in the transaction. Cancelled buffers need
1580  * to be put first so they are processed before any items that might
1581  * modify the buffers. If they are cancelled, then the modifications
1582  * don't need to be replayed.
1583  */
1584 STATIC int
1585 xlog_recover_reorder_trans(
1586 	struct log		*log,
1587 	xlog_recover_t		*trans,
1588 	int			pass)
1589 {
1590 	xlog_recover_item_t	*item, *n;
1591 	LIST_HEAD(sort_list);
1592 
1593 	list_splice_init(&trans->r_itemq, &sort_list);
1594 	list_for_each_entry_safe(item, n, &sort_list, ri_list) {
1595 		xfs_buf_log_format_t	*buf_f = item->ri_buf[0].i_addr;
1596 
1597 		switch (ITEM_TYPE(item)) {
1598 		case XFS_LI_BUF:
1599 			if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1600 				trace_xfs_log_recover_item_reorder_head(log,
1601 							trans, item, pass);
1602 				list_move(&item->ri_list, &trans->r_itemq);
1603 				break;
1604 			}
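			/* cancelled buffers fall through and are moved to the tail */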
1605 		case XFS_LI_INODE:
1606 		case XFS_LI_DQUOT:
1607 		case XFS_LI_QUOTAOFF:
1608 		case XFS_LI_EFD:
1609 		case XFS_LI_EFI:
1610 			trace_xfs_log_recover_item_reorder_tail(log,
1611 							trans, item, pass);
1612 			list_move_tail(&item->ri_list, &trans->r_itemq);
1613 			break;
1614 		default:
1615 			xfs_warn(log->l_mp,
1616 				"%s: unrecognized type of log operation",
1617 				__func__);
1618 			ASSERT(0);
1619 			return XFS_ERROR(EIO);
1620 		}
1621 	}
1622 	ASSERT(list_empty(&sort_list));
1623 	return 0;
1624 }
1625 
1626 /*
1627  * Build up the table of buf cancel records so that we don't replay
1628  * cancelled data in the second pass.  For buffer records that are
1629  * not cancel records, there is nothing to do here so we just return.
1630  *
1631  * If we get a cancel record which is already in the table, this indicates
1632  * that the buffer was cancelled multiple times.  In order to ensure
1633  * that during pass 2 we keep the record in the table until we reach its
1634  * last occurrence in the log, we keep a reference count in the cancel
1635  * record in the table to tell us how many times we expect to see this
1636  * record during the second pass.
1637  */
1638 STATIC int
1639 xlog_recover_buffer_pass1(
1640 	struct log		*log,
1641 	xlog_recover_item_t	*item)
1642 {
1643 	xfs_buf_log_format_t	*buf_f = item->ri_buf[0].i_addr;
1644 	struct list_head	*bucket;
1645 	struct xfs_buf_cancel	*bcp;
1646 
1647 	/*
1648 	 * If this isn't a cancel buffer item, then just return.
1649 	 */
1650 	if (!(buf_f->blf_flags & XFS_BLF_CANCEL)) {
1651 		trace_xfs_log_recover_buf_not_cancel(log, buf_f);
1652 		return 0;
1653 	}
1654 
1655 	/*
1656 	 * Insert an xfs_buf_cancel record into the hash table of them.
1657 	 * If there is already an identical record, bump its reference count.
1658 	 */
1659 	bucket = XLOG_BUF_CANCEL_BUCKET(log, buf_f->blf_blkno);
1660 	list_for_each_entry(bcp, bucket, bc_list) {
1661 		if (bcp->bc_blkno == buf_f->blf_blkno &&
1662 		    bcp->bc_len == buf_f->blf_len) {
1663 			bcp->bc_refcount++;
1664 			trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
1665 			return 0;
1666 		}
1667 	}
1668 
1669 	bcp = kmem_alloc(sizeof(struct xfs_buf_cancel), KM_SLEEP);
1670 	bcp->bc_blkno = buf_f->blf_blkno;
1671 	bcp->bc_len = buf_f->blf_len;
1672 	bcp->bc_refcount = 1;
1673 	list_add_tail(&bcp->bc_list, bucket);
1674 
1675 	trace_xfs_log_recover_buf_cancel_add(log, buf_f);
1676 	return 0;
1677 }
1678 
1679 /*
1680  * Check to see whether the buffer being recovered has a corresponding
1681  * entry in the buffer cancel record table.  If it does then return 1
1682  * so that it will be cancelled, otherwise return 0.  If the buffer is
1683  * actually a buffer cancel item (XFS_BLF_CANCEL is set), then decrement
1684  * the refcount on the entry in the table and remove it from the table
1685  * if this is the last reference.
1686  *
1687  * We remove the cancel record from the table when we encounter its
1688  * last occurrence in the log so that if the same buffer is re-used
1689  * again after its last cancellation we actually replay the changes
1690  * made at that point.
1691  */
1692 STATIC int
1693 xlog_check_buffer_cancelled(
1694 	struct log		*log,
1695 	xfs_daddr_t		blkno,
1696 	uint			len,
1697 	ushort			flags)
1698 {
1699 	struct list_head	*bucket;
1700 	struct xfs_buf_cancel	*bcp;
1701 
1702 	if (log->l_buf_cancel_table == NULL) {
1703 		/*
1704 		 * There is nothing in the table built in pass one,
1705 		 * so this buffer must not be cancelled.
1706 		 */
1707 		ASSERT(!(flags & XFS_BLF_CANCEL));
1708 		return 0;
1709 	}
1710 
1711 	/*
1712 	 * Search for an entry in the cancel table that matches our buffer.
1713 	 */
1714 	bucket = XLOG_BUF_CANCEL_BUCKET(log, blkno);
1715 	list_for_each_entry(bcp, bucket, bc_list) {
1716 		if (bcp->bc_blkno == blkno && bcp->bc_len == len)
1717 			goto found;
1718 	}
1719 
1720 	/*
1721 	 * We didn't find a corresponding entry in the table, so return 0 so
1722 	 * that the buffer is NOT cancelled.
1723 	 */
1724 	ASSERT(!(flags & XFS_BLF_CANCEL));
1725 	return 0;
1726 
1727 found:
1728 	/*
1729 	 * We've got a match, so return 1 so that the recovery of this buffer
1730 	 * is cancelled.  If this buffer is actually a buffer cancel log
1731 	 * item, then decrement the refcount on the one in the table and
1732 	 * remove it if this is the last reference.
1733 	 */
1734 	if (flags & XFS_BLF_CANCEL) {
1735 		if (--bcp->bc_refcount == 0) {
1736 			list_del(&bcp->bc_list);
1737 			kmem_free(bcp);
1738 		}
1739 	}
1740 	return 1;
1741 }
1742 
1743 /*
1744  * Perform recovery for a buffer full of inodes.  In these buffers, the only
1745  * data which should be recovered is that which corresponds to the
1746  * di_next_unlinked pointers in the on disk inode structures.  The rest of the
1747  * data for the inodes is always logged through the inodes themselves rather
1748  * than the inode buffer and is recovered in xlog_recover_inode_pass2().
1749  *
1750  * The only time when buffers full of inodes are fully recovered is when the
1751  * buffer is full of newly allocated inodes.  In this case the buffer will
1752  * not be marked as an inode buffer and so will be sent to
1753  * xlog_recover_do_reg_buffer() below during recovery.
1754  */
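/*
 * Rough example, assuming 256 byte inodes: for each inode i in the
 * buffer, only the 4 byte word at offset
 * i * 256 + offsetof(xfs_dinode_t, di_next_unlinked) is copied from the
 * logged region; every other byte of the buffer is left as it is on
 * disk.
 */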
1755 STATIC int
1756 xlog_recover_do_inode_buffer(
1757 	struct xfs_mount	*mp,
1758 	xlog_recover_item_t	*item,
1759 	struct xfs_buf		*bp,
1760 	xfs_buf_log_format_t	*buf_f)
1761 {
1762 	int			i;
1763 	int			item_index = 0;
1764 	int			bit = 0;
1765 	int			nbits = 0;
1766 	int			reg_buf_offset = 0;
1767 	int			reg_buf_bytes = 0;
1768 	int			next_unlinked_offset;
1769 	int			inodes_per_buf;
1770 	xfs_agino_t		*logged_nextp;
1771 	xfs_agino_t		*buffer_nextp;
1772 
1773 	trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);
1774 
1775 	inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
1776 	for (i = 0; i < inodes_per_buf; i++) {
1777 		next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
1778 			offsetof(xfs_dinode_t, di_next_unlinked);
1779 
1780 		while (next_unlinked_offset >=
1781 		       (reg_buf_offset + reg_buf_bytes)) {
1782 			/*
1783 			 * The next di_next_unlinked field is beyond
1784 			 * the current logged region.  Find the next
1785 			 * logged region that contains or is beyond
1786 			 * the current di_next_unlinked field.
1787 			 */
1788 			bit += nbits;
1789 			bit = xfs_next_bit(buf_f->blf_data_map,
1790 					   buf_f->blf_map_size, bit);
1791 
1792 			/*
1793 			 * If there are no more logged regions in the
1794 			 * buffer, then we're done.
1795 			 */
1796 			if (bit == -1)
1797 				return 0;
1798 
1799 			nbits = xfs_contig_bits(buf_f->blf_data_map,
1800 						buf_f->blf_map_size, bit);
1801 			ASSERT(nbits > 0);
1802 			reg_buf_offset = bit << XFS_BLF_SHIFT;
1803 			reg_buf_bytes = nbits << XFS_BLF_SHIFT;
1804 			item_index++;
1805 		}
1806 
1807 		/*
1808 		 * If the current logged region starts after the current
1809 		 * di_next_unlinked field, then move on to the next
1810 		 * di_next_unlinked field.
1811 		 */
1812 		if (next_unlinked_offset < reg_buf_offset)
1813 			continue;
1814 
1815 		ASSERT(item->ri_buf[item_index].i_addr != NULL);
1816 		ASSERT((item->ri_buf[item_index].i_len % XFS_BLF_CHUNK) == 0);
1817 		ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));
1818 
1819 		/*
1820 		 * The current logged region contains a copy of the
1821 		 * current di_next_unlinked field.  Extract its value
1822 		 * and copy it to the buffer copy.
1823 		 */
1824 		logged_nextp = item->ri_buf[item_index].i_addr +
1825 				next_unlinked_offset - reg_buf_offset;
1826 		if (unlikely(*logged_nextp == 0)) {
1827 			xfs_alert(mp,
1828 		"Bad inode buffer log record (ptr = 0x%p, bp = 0x%p). "
1829 		"Trying to replay bad (0) inode di_next_unlinked field.",
1830 				item, bp);
1831 			XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
1832 					 XFS_ERRLEVEL_LOW, mp);
1833 			return XFS_ERROR(EFSCORRUPTED);
1834 		}
1835 
1836 		buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
1837 					      next_unlinked_offset);
1838 		*buffer_nextp = *logged_nextp;
1839 	}
1840 
1841 	return 0;
1842 }
1843 
1844 /*
1845  * Perform a 'normal' buffer recovery.  Each logged region of the
1846  * buffer should be copied over the corresponding region in the
1847  * given buffer.  The bitmap in the buf log format structure indicates
1848  * where to place the logged data.
1849  */
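/*
 * As a concrete example: each bit in blf_data_map covers one
 * XFS_BLF_CHUNK (128 byte) chunk of the buffer, so a run of nbits
 * contiguous bits starting at 'bit' describes the byte range
 * [bit << XFS_BLF_SHIFT, (bit + nbits) << XFS_BLF_SHIFT).  Bits 2 and 3
 * set, for instance, mean bytes 256..511 of the buffer are copied in
 * from the matching ri_buf[] region.
 */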
1850 STATIC void
1851 xlog_recover_do_reg_buffer(
1852 	struct xfs_mount	*mp,
1853 	xlog_recover_item_t	*item,
1854 	struct xfs_buf		*bp,
1855 	xfs_buf_log_format_t	*buf_f)
1856 {
1857 	int			i;
1858 	int			bit;
1859 	int			nbits;
1860 	int                     error;
1861 
1862 	trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);
1863 
1864 	bit = 0;
1865 	i = 1;  /* 0 is the buf format structure */
1866 	while (1) {
1867 		bit = xfs_next_bit(buf_f->blf_data_map,
1868 				   buf_f->blf_map_size, bit);
1869 		if (bit == -1)
1870 			break;
1871 		nbits = xfs_contig_bits(buf_f->blf_data_map,
1872 					buf_f->blf_map_size, bit);
1873 		ASSERT(nbits > 0);
1874 		ASSERT(item->ri_buf[i].i_addr != NULL);
1875 		ASSERT(item->ri_buf[i].i_len % XFS_BLF_CHUNK == 0);
1876 		ASSERT(XFS_BUF_COUNT(bp) >=
1877 		       ((uint)bit << XFS_BLF_SHIFT)+(nbits<<XFS_BLF_SHIFT));
1878 
1879 		/*
1880 		 * Do a sanity check if this is a dquot buffer. Just checking
1881 		 * the first dquot in the buffer should do. XXX: This is
1882 		 * probably a good thing to do for other buf types also.
1883 		 */
1884 		error = 0;
1885 		if (buf_f->blf_flags &
1886 		   (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
1887 			if (item->ri_buf[i].i_addr == NULL) {
1888 				xfs_alert(mp,
1889 					"XFS: NULL dquot in %s.", __func__);
1890 				goto next;
1891 			}
1892 			if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
1893 				xfs_alert(mp,
1894 					"XFS: dquot too small (%d) in %s.",
1895 					item->ri_buf[i].i_len, __func__);
1896 				goto next;
1897 			}
1898 			error = xfs_qm_dqcheck(mp, item->ri_buf[i].i_addr,
1899 					       -1, 0, XFS_QMOPT_DOWARN,
1900 					       "dquot_buf_recover");
1901 			if (error)
1902 				goto next;
1903 		}
1904 
1905 		memcpy(xfs_buf_offset(bp,
1906 			(uint)bit << XFS_BLF_SHIFT),	/* dest */
1907 			item->ri_buf[i].i_addr,		/* source */
1908 			nbits<<XFS_BLF_SHIFT);		/* length */
1909  next:
1910 		i++;
1911 		bit += nbits;
1912 	}
1913 
1914 	/* Shouldn't be any more regions */
1915 	ASSERT(i == item->ri_total);
1916 }
1917 
1918 /*
1919  * Do some primitive error checking on ondisk dquot data structures.
1920  */
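/*
 * A note on how this is used in this file: the recovery callers
 * (xlog_recover_do_reg_buffer() above and xlog_recover_dquot_pass2()
 * below) pass XFS_QMOPT_DOWARN without XFS_QMOPT_DQREPAIR, so bad
 * dquots are only reported during recovery.  The repair path at the
 * bottom of this function is typically only exercised by quotacheck,
 * which passes XFS_QMOPT_DQREPAIR along with a real dquot id.
 */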
1921 int
1922 xfs_qm_dqcheck(
1923 	struct xfs_mount *mp,
1924 	xfs_disk_dquot_t *ddq,
1925 	xfs_dqid_t	 id,
1926 	uint		 type,	  /* used only when IO_dorepair is true */
1927 	uint		 flags,
1928 	char		 *str)
1929 {
1930 	xfs_dqblk_t	 *d = (xfs_dqblk_t *)ddq;
1931 	int		errs = 0;
1932 
1933 	/*
1934 	 * We can encounter an uninitialized dquot buffer for 2 reasons:
1935 	 * 1. If we crash while deleting the quotainode(s), and those blks got
1936 	 *    used for user data. This is because we take the path of regular
1937 	 *    file deletion; however, the size field of quotainodes is never
1938 	 *    updated, so all the tricks that we play in itruncate_finish
1939 	 *    don't quite matter.
1940 	 *
1941 	 * 2. We don't play the quota buffers when there's a quotaoff logitem.
1942 	 *    But the allocation will be replayed so we'll end up with an
1943 	 *    uninitialized quota block.
1944 	 *
1945 	 * This is all fine; things are still consistent, and we haven't lost
1946 	 * any quota information. Just don't complain about bad dquot blks.
1947 	 */
1948 	if (ddq->d_magic != cpu_to_be16(XFS_DQUOT_MAGIC)) {
1949 		if (flags & XFS_QMOPT_DOWARN)
1950 			xfs_alert(mp,
1951 			"%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
1952 			str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
1953 		errs++;
1954 	}
1955 	if (ddq->d_version != XFS_DQUOT_VERSION) {
1956 		if (flags & XFS_QMOPT_DOWARN)
1957 			xfs_alert(mp,
1958 			"%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
1959 			str, id, ddq->d_version, XFS_DQUOT_VERSION);
1960 		errs++;
1961 	}
1962 
1963 	if (ddq->d_flags != XFS_DQ_USER &&
1964 	    ddq->d_flags != XFS_DQ_PROJ &&
1965 	    ddq->d_flags != XFS_DQ_GROUP) {
1966 		if (flags & XFS_QMOPT_DOWARN)
1967 			xfs_alert(mp,
1968 			"%s : XFS dquot ID 0x%x, unknown flags 0x%x",
1969 			str, id, ddq->d_flags);
1970 		errs++;
1971 	}
1972 
1973 	if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
1974 		if (flags & XFS_QMOPT_DOWARN)
1975 			xfs_alert(mp,
1976 			"%s : ondisk-dquot 0x%p, ID mismatch: "
1977 			"0x%x expected, found id 0x%x",
1978 			str, ddq, id, be32_to_cpu(ddq->d_id));
1979 		errs++;
1980 	}
1981 
1982 	if (!errs && ddq->d_id) {
1983 		if (ddq->d_blk_softlimit &&
1984 		    be64_to_cpu(ddq->d_bcount) >
1985 				be64_to_cpu(ddq->d_blk_softlimit)) {
1986 			if (!ddq->d_btimer) {
1987 				if (flags & XFS_QMOPT_DOWARN)
1988 					xfs_alert(mp,
1989 			"%s : Dquot ID 0x%x (0x%p) BLK TIMER NOT STARTED",
1990 					str, (int)be32_to_cpu(ddq->d_id), ddq);
1991 				errs++;
1992 			}
1993 		}
1994 		if (ddq->d_ino_softlimit &&
1995 		    be64_to_cpu(ddq->d_icount) >
1996 				be64_to_cpu(ddq->d_ino_softlimit)) {
1997 			if (!ddq->d_itimer) {
1998 				if (flags & XFS_QMOPT_DOWARN)
1999 					xfs_alert(mp,
2000 			"%s : Dquot ID 0x%x (0x%p) INODE TIMER NOT STARTED",
2001 					str, (int)be32_to_cpu(ddq->d_id), ddq);
2002 				errs++;
2003 			}
2004 		}
2005 		if (ddq->d_rtb_softlimit &&
2006 		    be64_to_cpu(ddq->d_rtbcount) >
2007 				be64_to_cpu(ddq->d_rtb_softlimit)) {
2008 			if (!ddq->d_rtbtimer) {
2009 				if (flags & XFS_QMOPT_DOWARN)
2010 					xfs_alert(mp,
2011 			"%s : Dquot ID 0x%x (0x%p) RTBLK TIMER NOT STARTED",
2012 					str, (int)be32_to_cpu(ddq->d_id), ddq);
2013 				errs++;
2014 			}
2015 		}
2016 	}
2017 
2018 	if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
2019 		return errs;
2020 
2021 	if (flags & XFS_QMOPT_DOWARN)
2022 		xfs_notice(mp, "Re-initializing dquot ID 0x%x", id);
2023 
2024 	/*
2025 	 * Typically, a repair is only requested by quotacheck.
2026 	 */
2027 	ASSERT(id != -1);
2028 	ASSERT(flags & XFS_QMOPT_DQREPAIR);
2029 	memset(d, 0, sizeof(xfs_dqblk_t));
2030 
2031 	d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
2032 	d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
2033 	d->dd_diskdq.d_flags = type;
2034 	d->dd_diskdq.d_id = cpu_to_be32(id);
2035 
2036 	return errs;
2037 }
2038 
2039 /*
2040  * Perform a dquot buffer recovery.
2041  * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
2042  * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2043  * Else, treat it as a regular buffer and do recovery.
2044  */
2045 STATIC void
2046 xlog_recover_do_dquot_buffer(
2047 	xfs_mount_t		*mp,
2048 	xlog_t			*log,
2049 	xlog_recover_item_t	*item,
2050 	xfs_buf_t		*bp,
2051 	xfs_buf_log_format_t	*buf_f)
2052 {
2053 	uint			type;
2054 
2055 	trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2056 
2057 	/*
2058 	 * Filesystems are required to send in quota flags at mount time.
2059 	 */
2060 	if (mp->m_qflags == 0) {
2061 		return;
2062 	}
2063 
2064 	type = 0;
2065 	if (buf_f->blf_flags & XFS_BLF_UDQUOT_BUF)
2066 		type |= XFS_DQ_USER;
2067 	if (buf_f->blf_flags & XFS_BLF_PDQUOT_BUF)
2068 		type |= XFS_DQ_PROJ;
2069 	if (buf_f->blf_flags & XFS_BLF_GDQUOT_BUF)
2070 		type |= XFS_DQ_GROUP;
2071 	/*
2072 	 * This type of quotas was turned off, so ignore this buffer
2073 	 */
2074 	if (log->l_quotaoffs_flag & type)
2075 		return;
2076 
2077 	xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2078 }
2079 
2080 /*
2081  * This routine replays a modification made to a buffer at runtime.
2082  * There are actually two types of buffer, regular and inode, which
2083  * are handled differently.  Inode buffers are handled differently
2084  * in that we only recover a specific set of data from them, namely
2085  * the inode di_next_unlinked fields.  This is because all other inode
2086  * data is actually logged via inode records and any data we replay
2087  * here which overlaps that may be stale.
2088  *
2089  * When meta-data buffers are freed at run time we log a buffer item
2090  * with the XFS_BLF_CANCEL bit set to indicate that previous copies
2091  * of the buffer in the log should not be replayed at recovery time.
2092  * This is so that if the blocks covered by the buffer are reused for
2093  * file data before we crash we don't end up replaying old, freed
2094  * meta-data into a user's file.
2095  *
2096  * To handle the cancellation of buffer log items, we make two passes
2097  * over the log during recovery.  During the first we build a table of
2098  * those buffers which have been cancelled, and during the second we
2099  * only replay those buffers which do not have corresponding cancel
2100  * records in the table.  See xlog_recover_buffer_pass1() above and this
2101  * function for details on the implementation of the cancel record table.
2102  */
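/*
 * In outline, the two passes described above work like this:
 *
 *	pass 1:	for each XFS_LI_BUF item with XFS_BLF_CANCEL set,
 *		add or reference a record in l_buf_cancel_table
 *	pass 2:	for each XFS_LI_BUF item,
 *		if xlog_check_buffer_cancelled() finds a matching cancel
 *		record, skip the item; otherwise read the buffer and copy
 *		the logged regions over it (inode, dquot or regular
 *		variants below).
 */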
2103 STATIC int
2104 xlog_recover_buffer_pass2(
2105 	xlog_t			*log,
2106 	xlog_recover_item_t	*item)
2107 {
2108 	xfs_buf_log_format_t	*buf_f = item->ri_buf[0].i_addr;
2109 	xfs_mount_t		*mp = log->l_mp;
2110 	xfs_buf_t		*bp;
2111 	int			error;
2112 	uint			buf_flags;
2113 
2114 	/*
2115 	 * In this pass we only want to recover all the buffers which have
2116 	 * not been cancelled and are not cancellation buffers themselves.
2117 	 */
2118 	if (xlog_check_buffer_cancelled(log, buf_f->blf_blkno,
2119 			buf_f->blf_len, buf_f->blf_flags)) {
2120 		trace_xfs_log_recover_buf_cancel(log, buf_f);
2121 		return 0;
2122 	}
2123 
2124 	trace_xfs_log_recover_buf_recover(log, buf_f);
2125 
2126 	buf_flags = XBF_LOCK;
2127 	if (!(buf_f->blf_flags & XFS_BLF_INODE_BUF))
2128 		buf_flags |= XBF_MAPPED;
2129 
2130 	bp = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len,
2131 			  buf_flags);
2132 	if (!bp)
2133 		return XFS_ERROR(ENOMEM);
2134 	error = bp->b_error;
2135 	if (error) {
2136 		xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#1)");
2137 		xfs_buf_relse(bp);
2138 		return error;
2139 	}
2140 
2141 	if (buf_f->blf_flags & XFS_BLF_INODE_BUF) {
2142 		error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
2143 	} else if (buf_f->blf_flags &
2144 		  (XFS_BLF_UDQUOT_BUF|XFS_BLF_PDQUOT_BUF|XFS_BLF_GDQUOT_BUF)) {
2145 		xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2146 	} else {
2147 		xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
2148 	}
2149 	if (error) {
		xfs_buf_relse(bp);	/* don't leak the buffer on error */
2150 		return XFS_ERROR(error);
	}
2151 
2152 	/*
2153 	 * Perform delayed write on the buffer.  Asynchronous writes will be
2154 	 * slower when taking into account all the buffers to be flushed.
2155 	 *
2156 	 * Also make sure that only inode buffers with good sizes stay in
2157 	 * the buffer cache.  The kernel moves inodes in buffers of 1 block
2158 	 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger.  The inode
2159 	 * buffers in the log can be a different size if the log was generated
2160 	 * by an older kernel using unclustered inode buffers or a newer kernel
2161 	 * running with a different inode cluster size.  Regardless, if the
2162 	 * inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
2163 	 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
2164 	 * the buffer out of the buffer cache so that the buffer won't
2165 	 * overlap with future reads of those inodes.
2166 	 */
2167 	if (XFS_DINODE_MAGIC ==
2168 	    be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
2169 	    (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
2170 			(__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2171 		xfs_buf_stale(bp);
2172 		error = xfs_bwrite(bp);
2173 	} else {
2174 		ASSERT(bp->b_target->bt_mount == mp);
2175 		bp->b_iodone = xlog_recover_iodone;
2176 		xfs_buf_delwri_queue(bp);
2177 	}
2178 
2179 	xfs_buf_relse(bp);
2180 	return error;
2181 }
2182 
2183 STATIC int
2184 xlog_recover_inode_pass2(
2185 	xlog_t			*log,
2186 	xlog_recover_item_t	*item)
2187 {
2188 	xfs_inode_log_format_t	*in_f;
2189 	xfs_mount_t		*mp = log->l_mp;
2190 	xfs_buf_t		*bp;
2191 	xfs_dinode_t		*dip;
2192 	int			len;
2193 	xfs_caddr_t		src;
2194 	xfs_caddr_t		dest;
2195 	int			error;
2196 	int			attr_index;
2197 	uint			fields;
2198 	xfs_icdinode_t		*dicp;
2199 	int			need_free = 0;
2200 
2201 	if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2202 		in_f = item->ri_buf[0].i_addr;
2203 	} else {
2204 		in_f = kmem_alloc(sizeof(xfs_inode_log_format_t), KM_SLEEP);
2205 		need_free = 1;
2206 		error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2207 		if (error)
2208 			goto error;
2209 	}
2210 
2211 	/*
2212 	 * The inode's buffer may have been cancelled (freed); look out
2213 	 * for that, and do not replay the inode.
2214 	 */
2215 	if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2216 					in_f->ilf_len, 0)) {
2217 		error = 0;
2218 		trace_xfs_log_recover_inode_cancel(log, in_f);
2219 		goto error;
2220 	}
2221 	trace_xfs_log_recover_inode_recover(log, in_f);
2222 
2223 	bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
2224 			  XBF_LOCK);
2225 	if (!bp) {
2226 		error = ENOMEM;
2227 		goto error;
2228 	}
2229 	error = bp->b_error;
2230 	if (error) {
2231 		xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
2232 		xfs_buf_relse(bp);
2233 		goto error;
2234 	}
2235 	ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
2236 	dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
2237 
2238 	/*
2239 	 * Make sure the place we're flushing out to really looks
2240 	 * like an inode!
2241 	 */
2242 	if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
2243 		xfs_buf_relse(bp);
2244 		xfs_alert(mp,
2245 	"%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
2246 			__func__, dip, bp, in_f->ilf_ino);
2247 		XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
2248 				 XFS_ERRLEVEL_LOW, mp);
2249 		error = EFSCORRUPTED;
2250 		goto error;
2251 	}
2252 	dicp = item->ri_buf[1].i_addr;
2253 	if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2254 		xfs_buf_relse(bp);
2255 		xfs_alert(mp,
2256 			"%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
2257 			__func__, item, in_f->ilf_ino);
2258 		XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
2259 				 XFS_ERRLEVEL_LOW, mp);
2260 		error = EFSCORRUPTED;
2261 		goto error;
2262 	}
2263 
2264 	/* Skip replay when the on disk inode is newer than the log one */
2265 	if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
2266 		/*
2267 		 * Deal with the wrap case, DI_MAX_FLUSH is less
2268 		 * than smaller numbers
2269 		 */
2270 		if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
2271 		    dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
2272 			/* do nothing */
2273 		} else {
2274 			xfs_buf_relse(bp);
2275 			trace_xfs_log_recover_inode_skip(log, in_f);
2276 			error = 0;
2277 			goto error;
2278 		}
2279 	}
2280 	/* Take the opportunity to reset the flush iteration count */
2281 	dicp->di_flushiter = 0;
2282 
2283 	if (unlikely(S_ISREG(dicp->di_mode))) {
2284 		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2285 		    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2286 			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
2287 					 XFS_ERRLEVEL_LOW, mp, dicp);
2288 			xfs_buf_relse(bp);
2289 			xfs_alert(mp,
2290 		"%s: Bad regular inode log record, rec ptr 0x%p, "
2291 		"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2292 				__func__, item, dip, bp, in_f->ilf_ino);
2293 			error = EFSCORRUPTED;
2294 			goto error;
2295 		}
2296 	} else if (unlikely(S_ISDIR(dicp->di_mode))) {
2297 		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2298 		    (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2299 		    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2300 			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
2301 					     XFS_ERRLEVEL_LOW, mp, dicp);
2302 			xfs_buf_relse(bp);
2303 			xfs_alert(mp,
2304 		"%s: Bad dir inode log record, rec ptr 0x%p, "
2305 		"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2306 				__func__, item, dip, bp, in_f->ilf_ino);
2307 			error = EFSCORRUPTED;
2308 			goto error;
2309 		}
2310 	}
2311 	if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2312 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
2313 				     XFS_ERRLEVEL_LOW, mp, dicp);
2314 		xfs_buf_relse(bp);
2315 		xfs_alert(mp,
2316 	"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2317 	"dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2318 			__func__, item, dip, bp, in_f->ilf_ino,
2319 			dicp->di_nextents + dicp->di_anextents,
2320 			dicp->di_nblocks);
2321 		error = EFSCORRUPTED;
2322 		goto error;
2323 	}
2324 	if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2325 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
2326 				     XFS_ERRLEVEL_LOW, mp, dicp);
2327 		xfs_buf_relse(bp);
2328 		xfs_alert(mp,
2329 	"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
2330 	"dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
2331 			item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
2332 		error = EFSCORRUPTED;
2333 		goto error;
2334 	}
2335 	if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
2336 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
2337 				     XFS_ERRLEVEL_LOW, mp, dicp);
2338 		xfs_buf_relse(bp);
2339 		xfs_alert(mp,
2340 			"%s: Bad inode log record length %d, rec ptr 0x%p",
2341 			__func__, item->ri_buf[1].i_len, item);
2342 		error = EFSCORRUPTED;
2343 		goto error;
2344 	}
2345 
2346 	/* The core is in in-core format */
2347 	xfs_dinode_to_disk(dip, item->ri_buf[1].i_addr);
2348 
2349 	/* the rest is in on-disk format */
2350 	if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
2351 		memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
2352 			item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
2353 			item->ri_buf[1].i_len  - sizeof(struct xfs_icdinode));
2354 	}
2355 
2356 	fields = in_f->ilf_fields;
2357 	switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2358 	case XFS_ILOG_DEV:
2359 		xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
2360 		break;
2361 	case XFS_ILOG_UUID:
2362 		memcpy(XFS_DFORK_DPTR(dip),
2363 		       &in_f->ilf_u.ilfu_uuid,
2364 		       sizeof(uuid_t));
2365 		break;
2366 	}
2367 
2368 	if (in_f->ilf_size == 2)
2369 		goto write_inode_buffer;
2370 	len = item->ri_buf[2].i_len;
2371 	src = item->ri_buf[2].i_addr;
2372 	ASSERT(in_f->ilf_size <= 4);
2373 	ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
2374 	ASSERT(!(fields & XFS_ILOG_DFORK) ||
2375 	       (len == in_f->ilf_dsize));
2376 
2377 	switch (fields & XFS_ILOG_DFORK) {
2378 	case XFS_ILOG_DDATA:
2379 	case XFS_ILOG_DEXT:
2380 		memcpy(XFS_DFORK_DPTR(dip), src, len);
2381 		break;
2382 
2383 	case XFS_ILOG_DBROOT:
2384 		xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
2385 				 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
2386 				 XFS_DFORK_DSIZE(dip, mp));
2387 		break;
2388 
2389 	default:
2390 		/*
2391 		 * There are no data fork flags set.
2392 		 */
2393 		ASSERT((fields & XFS_ILOG_DFORK) == 0);
2394 		break;
2395 	}
2396 
2397 	/*
2398 	 * If we logged any attribute data, recover it.  There may or
2399 	 * may not have been any other non-core data logged in this
2400 	 * transaction.
2401 	 */
2402 	if (in_f->ilf_fields & XFS_ILOG_AFORK) {
2403 		if (in_f->ilf_fields & XFS_ILOG_DFORK) {
2404 			attr_index = 3;
2405 		} else {
2406 			attr_index = 2;
2407 		}
2408 		len = item->ri_buf[attr_index].i_len;
2409 		src = item->ri_buf[attr_index].i_addr;
2410 		ASSERT(len == in_f->ilf_asize);
2411 
2412 		switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
2413 		case XFS_ILOG_ADATA:
2414 		case XFS_ILOG_AEXT:
2415 			dest = XFS_DFORK_APTR(dip);
2416 			ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
2417 			memcpy(dest, src, len);
2418 			break;
2419 
2420 		case XFS_ILOG_ABROOT:
2421 			dest = XFS_DFORK_APTR(dip);
2422 			xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
2423 					 len, (xfs_bmdr_block_t*)dest,
2424 					 XFS_DFORK_ASIZE(dip, mp));
2425 			break;
2426 
2427 		default:
2428 			xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
2429 			ASSERT(0);
2430 			xfs_buf_relse(bp);
2431 			error = EIO;
2432 			goto error;
2433 		}
2434 	}
2435 
2436 write_inode_buffer:
2437 	ASSERT(bp->b_target->bt_mount == mp);
2438 	bp->b_iodone = xlog_recover_iodone;
2439 	xfs_buf_delwri_queue(bp);
2440 	xfs_buf_relse(bp);
2441 error:
2442 	if (need_free)
2443 		kmem_free(in_f);
2444 	return XFS_ERROR(error);
2445 }
2446 
2447 /*
2448  * Recover QUOTAOFF records. We simply make a note of it in the xlog_t
2449  * structure, so that we know not to do any dquot item or dquot buffer
2450  * recovery of that type.
2451  */
2452 STATIC int
2453 xlog_recover_quotaoff_pass1(
2454 	xlog_t			*log,
2455 	xlog_recover_item_t	*item)
2456 {
2457 	xfs_qoff_logformat_t	*qoff_f = item->ri_buf[0].i_addr;
2458 	ASSERT(qoff_f);
2459 
2460 	/*
2461 	 * The logitem format's flag tells us if this was user quotaoff,
2462 	 * group/project quotaoff or both.
2463 	 */
2464 	if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
2465 		log->l_quotaoffs_flag |= XFS_DQ_USER;
2466 	if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
2467 		log->l_quotaoffs_flag |= XFS_DQ_PROJ;
2468 	if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2469 		log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2470 
2471 	return (0);
2472 }
2473 
2474 /*
2475  * Recover a dquot record
2476  */
2477 STATIC int
2478 xlog_recover_dquot_pass2(
2479 	xlog_t			*log,
2480 	xlog_recover_item_t	*item)
2481 {
2482 	xfs_mount_t		*mp = log->l_mp;
2483 	xfs_buf_t		*bp;
2484 	struct xfs_disk_dquot	*ddq, *recddq;
2485 	int			error;
2486 	xfs_dq_logformat_t	*dq_f;
2487 	uint			type;
2488 
2489 
2490 	/*
2491 	 * Filesystems are required to send in quota flags at mount time.
2492 	 */
2493 	if (mp->m_qflags == 0)
2494 		return (0);
2495 
2496 	recddq = item->ri_buf[1].i_addr;
2497 	if (recddq == NULL) {
2498 		xfs_alert(log->l_mp, "NULL dquot in %s.", __func__);
2499 		return XFS_ERROR(EIO);
2500 	}
2501 	if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
2502 		xfs_alert(log->l_mp, "dquot too small (%d) in %s.",
2503 			item->ri_buf[1].i_len, __func__);
2504 		return XFS_ERROR(EIO);
2505 	}
2506 
2507 	/*
2508 	 * This type of quotas was turned off, so ignore this record.
2509 	 */
2510 	type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
2511 	ASSERT(type);
2512 	if (log->l_quotaoffs_flag & type)
2513 		return (0);
2514 
2515 	/*
2516 	 * At this point we know that quota was _not_ turned off.
2517 	 * Since the mount flags do not indicate otherwise, this
2518 	 * must mean that quota is on, and the dquot needs to be replayed.
2519 	 * Remember that we may not have fully recovered the superblock yet,
2520 	 * so we can't do the usual trick of looking at the SB quota bits.
2521 	 *
2522 	 * The other possibility, of course, is that the quota subsystem was
2523 	 * removed since the last mount - ENOSYS.
2524 	 */
2525 	dq_f = item->ri_buf[0].i_addr;
2526 	ASSERT(dq_f);
2527 	error = xfs_qm_dqcheck(mp, recddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2528 			   "xlog_recover_dquot_pass2 (log copy)");
2529 	if (error)
2530 		return XFS_ERROR(EIO);
2531 	ASSERT(dq_f->qlf_len == 1);
2532 
2533 	error = xfs_read_buf(mp, mp->m_ddev_targp,
2534 			     dq_f->qlf_blkno,
2535 			     XFS_FSB_TO_BB(mp, dq_f->qlf_len),
2536 			     0, &bp);
2537 	if (error) {
2538 		xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#3)");
2539 		return error;
2540 	}
2541 	ASSERT(bp);
2542 	ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);
2543 
2544 	/*
2545 	 * At least the magic num portion should be on disk because this
2546 	 * was among a chunk of dquots created earlier, and we did some
2547 	 * minimal initialization then.
2548 	 */
2549 	error = xfs_qm_dqcheck(mp, ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
2550 			   "xlog_recover_dquot_pass2");
2551 	if (error) {
2552 		xfs_buf_relse(bp);
2553 		return XFS_ERROR(EIO);
2554 	}
2555 
2556 	memcpy(ddq, recddq, item->ri_buf[1].i_len);
2557 
2558 	ASSERT(dq_f->qlf_size == 2);
2559 	ASSERT(bp->b_target->bt_mount == mp);
2560 	bp->b_iodone = xlog_recover_iodone;
2561 	xfs_buf_delwri_queue(bp);
2562 	xfs_buf_relse(bp);
2563 
2564 	return (0);
2565 }
2566 
2567 /*
2568  * This routine is called to create an in-core extent free intent
2569  * item from the efi format structure which was logged on disk.
2570  * It allocates an in-core efi, copies the extents from the format
2571  * structure into it, and adds the efi to the AIL with the given
2572  * LSN.
2573  */
2574 STATIC int
2575 xlog_recover_efi_pass2(
2576 	xlog_t			*log,
2577 	xlog_recover_item_t	*item,
2578 	xfs_lsn_t		lsn)
2579 {
2580 	int			error;
2581 	xfs_mount_t		*mp = log->l_mp;
2582 	xfs_efi_log_item_t	*efip;
2583 	xfs_efi_log_format_t	*efi_formatp;
2584 
2585 	efi_formatp = item->ri_buf[0].i_addr;
2586 
2587 	efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
2588 	if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
2589 					 &(efip->efi_format)))) {
2590 		xfs_efi_item_free(efip);
2591 		return error;
2592 	}
2593 	atomic_set(&efip->efi_next_extent, efi_formatp->efi_nextents);
2594 
2595 	spin_lock(&log->l_ailp->xa_lock);
2596 	/*
2597 	 * xfs_trans_ail_update() drops the AIL lock.
2598 	 */
2599 	xfs_trans_ail_update(log->l_ailp, &efip->efi_item, lsn);
2600 	return 0;
2601 }
2602 
2603 
2604 /*
2605  * This routine is called when an efd format structure is found in
2606  * a committed transaction in the log.  Its purpose is to cancel
2607  * the corresponding efi if it was still in the log.  To do this
2608  * it searches the AIL for the efi with an id equal to that in the
2609  * efd format structure.  If we find it, we remove the efi from the
2610  * AIL and free it.
2611  */
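/*
 * Sketch of the EFI/EFD pairing during recovery: an EFI with efi_id X
 * is inserted into the AIL by xlog_recover_efi_pass2() above; a later
 * EFD whose efd_efi_id matches X removes it again here.  Any EFI still
 * sitting in the AIL once the log has been fully processed had no
 * matching EFD and is finished off by xlog_recover_process_efis().
 */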
2612 STATIC int
2613 xlog_recover_efd_pass2(
2614 	xlog_t			*log,
2615 	xlog_recover_item_t	*item)
2616 {
2617 	xfs_efd_log_format_t	*efd_formatp;
2618 	xfs_efi_log_item_t	*efip = NULL;
2619 	xfs_log_item_t		*lip;
2620 	__uint64_t		efi_id;
2621 	struct xfs_ail_cursor	cur;
2622 	struct xfs_ail		*ailp = log->l_ailp;
2623 
2624 	efd_formatp = item->ri_buf[0].i_addr;
2625 	ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
2626 		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
2627 	       (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
2628 		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
2629 	efi_id = efd_formatp->efd_efi_id;
2630 
2631 	/*
2632 	 * Search for the efi with the id in the efd format structure
2633 	 * in the AIL.
2634 	 */
2635 	spin_lock(&ailp->xa_lock);
2636 	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2637 	while (lip != NULL) {
2638 		if (lip->li_type == XFS_LI_EFI) {
2639 			efip = (xfs_efi_log_item_t *)lip;
2640 			if (efip->efi_format.efi_id == efi_id) {
2641 				/*
2642 				 * xfs_trans_ail_delete() drops the
2643 				 * AIL lock.
2644 				 */
2645 				xfs_trans_ail_delete(ailp, lip);
2646 				xfs_efi_item_free(efip);
2647 				spin_lock(&ailp->xa_lock);
2648 				break;
2649 			}
2650 		}
2651 		lip = xfs_trans_ail_cursor_next(ailp, &cur);
2652 	}
2653 	xfs_trans_ail_cursor_done(ailp, &cur);
2654 	spin_unlock(&ailp->xa_lock);
2655 
2656 	return 0;
2657 }
2658 
2659 /*
2660  * Free up any resources allocated by the transaction
2661  *
2662  * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2663  */
2664 STATIC void
2665 xlog_recover_free_trans(
2666 	struct xlog_recover	*trans)
2667 {
2668 	xlog_recover_item_t	*item, *n;
2669 	int			i;
2670 
2671 	list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2672 		/* Free the regions in the item. */
2673 		list_del(&item->ri_list);
2674 		for (i = 0; i < item->ri_cnt; i++)
2675 			kmem_free(item->ri_buf[i].i_addr);
2676 		/* Free the item itself */
2677 		kmem_free(item->ri_buf);
2678 		kmem_free(item);
2679 	}
2680 	/* Free the transaction recover structure */
2681 	kmem_free(trans);
2682 }
2683 
2684 STATIC int
2685 xlog_recover_commit_pass1(
2686 	struct log		*log,
2687 	struct xlog_recover	*trans,
2688 	xlog_recover_item_t	*item)
2689 {
2690 	trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS1);
2691 
2692 	switch (ITEM_TYPE(item)) {
2693 	case XFS_LI_BUF:
2694 		return xlog_recover_buffer_pass1(log, item);
2695 	case XFS_LI_QUOTAOFF:
2696 		return xlog_recover_quotaoff_pass1(log, item);
2697 	case XFS_LI_INODE:
2698 	case XFS_LI_EFI:
2699 	case XFS_LI_EFD:
2700 	case XFS_LI_DQUOT:
2701 		/* nothing to do in pass 1 */
2702 		return 0;
2703 	default:
2704 		xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2705 			__func__, ITEM_TYPE(item));
2706 		ASSERT(0);
2707 		return XFS_ERROR(EIO);
2708 	}
2709 }
2710 
2711 STATIC int
2712 xlog_recover_commit_pass2(
2713 	struct log		*log,
2714 	struct xlog_recover	*trans,
2715 	xlog_recover_item_t	*item)
2716 {
2717 	trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
2718 
2719 	switch (ITEM_TYPE(item)) {
2720 	case XFS_LI_BUF:
2721 		return xlog_recover_buffer_pass2(log, item);
2722 	case XFS_LI_INODE:
2723 		return xlog_recover_inode_pass2(log, item);
2724 	case XFS_LI_EFI:
2725 		return xlog_recover_efi_pass2(log, item, trans->r_lsn);
2726 	case XFS_LI_EFD:
2727 		return xlog_recover_efd_pass2(log, item);
2728 	case XFS_LI_DQUOT:
2729 		return xlog_recover_dquot_pass2(log, item);
2730 	case XFS_LI_QUOTAOFF:
2731 		/* nothing to do in pass2 */
2732 		return 0;
2733 	default:
2734 		xfs_warn(log->l_mp, "%s: invalid item type (%d)",
2735 			__func__, ITEM_TYPE(item));
2736 		ASSERT(0);
2737 		return XFS_ERROR(EIO);
2738 	}
2739 }
2740 
2741 /*
2742  * Perform the transaction.
2743  *
2744  * If the transaction modifies a buffer or inode, do it now.  Otherwise,
2745  * EFIs and EFDs get queued up by adding entries into the AIL for them.
2746  */
2747 STATIC int
2748 xlog_recover_commit_trans(
2749 	struct log		*log,
2750 	struct xlog_recover	*trans,
2751 	int			pass)
2752 {
2753 	int			error = 0;
2754 	xlog_recover_item_t	*item;
2755 
2756 	hlist_del(&trans->r_list);
2757 
2758 	error = xlog_recover_reorder_trans(log, trans, pass);
2759 	if (error)
2760 		return error;
2761 
2762 	list_for_each_entry(item, &trans->r_itemq, ri_list) {
2763 		if (pass == XLOG_RECOVER_PASS1)
2764 			error = xlog_recover_commit_pass1(log, trans, item);
2765 		else
2766 			error = xlog_recover_commit_pass2(log, trans, item);
2767 		if (error)
2768 			return error;
2769 	}
2770 
2771 	xlog_recover_free_trans(trans);
2772 	return 0;
2773 }
2774 
2775 STATIC int
2776 xlog_recover_unmount_trans(
2777 	struct log		*log,
2778 	xlog_recover_t		*trans)
2779 {
2780 	/* Do nothing now */
2781 	xfs_warn(log->l_mp, "%s: Unmount LR", __func__);
2782 	return 0;
2783 }
2784 
2785 /*
2786  * There are two valid states of the r_state field.  0 indicates that the
2787  * transaction structure is in a normal state: either we have seen the
2788  * start of the transaction, or the last operation we added was not a
2789  * partial operation.  If the last operation we added to the transaction was a
2790  * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2791  *
2792  * NOTE: skip LRs with 0 data length.
2793  */
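/*
 * A rough sketch of how the op headers found in a log record are
 * dispatched below:
 *
 *	XLOG_START_TRANS		-> start a new xlog_recover_t for the tid
 *	0 / XLOG_CONTINUE_TRANS		-> xlog_recover_add_to_trans()
 *	XLOG_WAS_CONT_TRANS		-> xlog_recover_add_to_cont_trans()
 *	XLOG_COMMIT_TRANS		-> xlog_recover_commit_trans()
 *	XLOG_UNMOUNT_TRANS		-> xlog_recover_unmount_trans()
 */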
2794 STATIC int
2795 xlog_recover_process_data(
2796 	xlog_t			*log,
2797 	struct hlist_head	rhash[],
2798 	xlog_rec_header_t	*rhead,
2799 	xfs_caddr_t		dp,
2800 	int			pass)
2801 {
2802 	xfs_caddr_t		lp;
2803 	int			num_logops;
2804 	xlog_op_header_t	*ohead;
2805 	xlog_recover_t		*trans;
2806 	xlog_tid_t		tid;
2807 	int			error;
2808 	unsigned long		hash;
2809 	uint			flags;
2810 
2811 	lp = dp + be32_to_cpu(rhead->h_len);
2812 	num_logops = be32_to_cpu(rhead->h_num_logops);
2813 
2814 	/* check the log format matches our own - else we can't recover */
2815 	if (xlog_header_check_recover(log->l_mp, rhead))
2816 		return (XFS_ERROR(EIO));
2817 
2818 	while ((dp < lp) && num_logops) {
2819 		ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
2820 		ohead = (xlog_op_header_t *)dp;
2821 		dp += sizeof(xlog_op_header_t);
2822 		if (ohead->oh_clientid != XFS_TRANSACTION &&
2823 		    ohead->oh_clientid != XFS_LOG) {
2824 			xfs_warn(log->l_mp, "%s: bad clientid 0x%x",
2825 					__func__, ohead->oh_clientid);
2826 			ASSERT(0);
2827 			return (XFS_ERROR(EIO));
2828 		}
2829 		tid = be32_to_cpu(ohead->oh_tid);
2830 		hash = XLOG_RHASH(tid);
2831 		trans = xlog_recover_find_tid(&rhash[hash], tid);
2832 		if (trans == NULL) {		   /* not found; add new tid */
2833 			if (ohead->oh_flags & XLOG_START_TRANS)
2834 				xlog_recover_new_tid(&rhash[hash], tid,
2835 					be64_to_cpu(rhead->h_lsn));
2836 		} else {
2837 			if (dp + be32_to_cpu(ohead->oh_len) > lp) {
2838 				xfs_warn(log->l_mp, "%s: bad length 0x%x",
2839 					__func__, be32_to_cpu(ohead->oh_len));
2840 				WARN_ON(1);
2841 				return (XFS_ERROR(EIO));
2842 			}
2843 			flags = ohead->oh_flags & ~XLOG_END_TRANS;
2844 			if (flags & XLOG_WAS_CONT_TRANS)
2845 				flags &= ~XLOG_CONTINUE_TRANS;
2846 			switch (flags) {
2847 			case XLOG_COMMIT_TRANS:
2848 				error = xlog_recover_commit_trans(log,
2849 								trans, pass);
2850 				break;
2851 			case XLOG_UNMOUNT_TRANS:
2852 				error = xlog_recover_unmount_trans(log, trans);
2853 				break;
2854 			case XLOG_WAS_CONT_TRANS:
2855 				error = xlog_recover_add_to_cont_trans(log,
2856 						trans, dp,
2857 						be32_to_cpu(ohead->oh_len));
2858 				break;
2859 			case XLOG_START_TRANS:
2860 				xfs_warn(log->l_mp, "%s: bad transaction",
2861 					__func__);
2862 				ASSERT(0);
2863 				error = XFS_ERROR(EIO);
2864 				break;
2865 			case 0:
2866 			case XLOG_CONTINUE_TRANS:
2867 				error = xlog_recover_add_to_trans(log, trans,
2868 						dp, be32_to_cpu(ohead->oh_len));
2869 				break;
2870 			default:
2871 				xfs_warn(log->l_mp, "%s: bad flag 0x%x",
2872 					__func__, flags);
2873 				ASSERT(0);
2874 				error = XFS_ERROR(EIO);
2875 				break;
2876 			}
2877 			if (error)
2878 				return error;
2879 		}
2880 		dp += be32_to_cpu(ohead->oh_len);
2881 		num_logops--;
2882 	}
2883 	return 0;
2884 }
2885 
2886 /*
2887  * Process an extent free intent item that was recovered from
2888  * the log.  We need to free the extents that it describes.
2889  */
2890 STATIC int
2891 xlog_recover_process_efi(
2892 	xfs_mount_t		*mp,
2893 	xfs_efi_log_item_t	*efip)
2894 {
2895 	xfs_efd_log_item_t	*efdp;
2896 	xfs_trans_t		*tp;
2897 	int			i;
2898 	int			error = 0;
2899 	xfs_extent_t		*extp;
2900 	xfs_fsblock_t		startblock_fsb;
2901 
2902 	ASSERT(!test_bit(XFS_EFI_RECOVERED, &efip->efi_flags));
2903 
2904 	/*
2905 	 * First check the validity of the extents described by the
2906 	 * EFI.  If any are bad, then assume that all are bad and
2907 	 * just toss the EFI.
2908 	 */
2909 	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
2910 		extp = &(efip->efi_format.efi_extents[i]);
2911 		startblock_fsb = XFS_BB_TO_FSB(mp,
2912 				   XFS_FSB_TO_DADDR(mp, extp->ext_start));
2913 		if ((startblock_fsb == 0) ||
2914 		    (extp->ext_len == 0) ||
2915 		    (startblock_fsb >= mp->m_sb.sb_dblocks) ||
2916 		    (extp->ext_len >= mp->m_sb.sb_agblocks)) {
2917 			/*
2918 			 * This will pull the EFI from the AIL and
2919 			 * free the memory associated with it.
2920 			 */
2921 			xfs_efi_release(efip, efip->efi_format.efi_nextents);
2922 			return XFS_ERROR(EIO);
2923 		}
2924 	}
2925 
2926 	tp = xfs_trans_alloc(mp, 0);
2927 	error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
2928 	if (error)
2929 		goto abort_error;
2930 	efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);
2931 
2932 	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
2933 		extp = &(efip->efi_format.efi_extents[i]);
2934 		error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
2935 		if (error)
2936 			goto abort_error;
2937 		xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
2938 					 extp->ext_len);
2939 	}
2940 
2941 	set_bit(XFS_EFI_RECOVERED, &efip->efi_flags);
2942 	error = xfs_trans_commit(tp, 0);
2943 	return error;
2944 
2945 abort_error:
2946 	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
2947 	return error;
2948 }
2949 
2950 /*
2951  * When this is called, all of the EFIs which did not have
2952  * corresponding EFDs should be in the AIL.  What we do now
2953  * is free the extents associated with each one.
2954  *
2955  * Since we process the EFIs in normal transactions, they
2956  * will be removed at some point after the commit.  This prevents
2957  * us from just walking down the list processing each one.
2958  * We'll use a flag in the EFI to skip those that we've already
2959  * processed and use the AIL iteration mechanism's generation
2960  * count to try to speed this up at least a bit.
2961  *
2962  * When we start, we know that the EFIs are the only things in
2963  * the AIL.  As we process them, however, other items are added
2964  * to the AIL.  Since everything added to the AIL must come after
2965  * everything already in the AIL, we stop processing as soon as
2966  * we see something other than an EFI in the AIL.
2967  */
2968 STATIC int
2969 xlog_recover_process_efis(
2970 	xlog_t			*log)
2971 {
2972 	xfs_log_item_t		*lip;
2973 	xfs_efi_log_item_t	*efip;
2974 	int			error = 0;
2975 	struct xfs_ail_cursor	cur;
2976 	struct xfs_ail		*ailp;
2977 
2978 	ailp = log->l_ailp;
2979 	spin_lock(&ailp->xa_lock);
2980 	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
2981 	while (lip != NULL) {
2982 		/*
2983 		 * We're done when we see something other than an EFI.
2984 		 * There should be no EFIs left in the AIL now.
2985 		 */
2986 		if (lip->li_type != XFS_LI_EFI) {
2987 #ifdef DEBUG
2988 			for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
2989 				ASSERT(lip->li_type != XFS_LI_EFI);
2990 #endif
2991 			break;
2992 		}
2993 
2994 		/*
2995 		 * Skip EFIs that we've already processed.
2996 		 */
2997 		efip = (xfs_efi_log_item_t *)lip;
2998 		if (test_bit(XFS_EFI_RECOVERED, &efip->efi_flags)) {
2999 			lip = xfs_trans_ail_cursor_next(ailp, &cur);
3000 			continue;
3001 		}
3002 
3003 		spin_unlock(&ailp->xa_lock);
3004 		error = xlog_recover_process_efi(log->l_mp, efip);
3005 		spin_lock(&ailp->xa_lock);
3006 		if (error)
3007 			goto out;
3008 		lip = xfs_trans_ail_cursor_next(ailp, &cur);
3009 	}
3010 out:
3011 	xfs_trans_ail_cursor_done(ailp, &cur);
3012 	spin_unlock(&ailp->xa_lock);
3013 	return error;
3014 }
3015 
3016 /*
3017  * This routine performs a transaction to null out a bad inode pointer
3018  * in an agi unlinked inode hash bucket.
3019  */
3020 STATIC void
3021 xlog_recover_clear_agi_bucket(
3022 	xfs_mount_t	*mp,
3023 	xfs_agnumber_t	agno,
3024 	int		bucket)
3025 {
3026 	xfs_trans_t	*tp;
3027 	xfs_agi_t	*agi;
3028 	xfs_buf_t	*agibp;
3029 	int		offset;
3030 	int		error;
3031 
3032 	tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
3033 	error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
3034 				  0, 0, 0);
3035 	if (error)
3036 		goto out_abort;
3037 
3038 	error = xfs_read_agi(mp, tp, agno, &agibp);
3039 	if (error)
3040 		goto out_abort;
3041 
3042 	agi = XFS_BUF_TO_AGI(agibp);
3043 	agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
3044 	offset = offsetof(xfs_agi_t, agi_unlinked) +
3045 		 (sizeof(xfs_agino_t) * bucket);
3046 	xfs_trans_log_buf(tp, agibp, offset,
3047 			  (offset + sizeof(xfs_agino_t) - 1));
3048 
3049 	error = xfs_trans_commit(tp, 0);
3050 	if (error)
3051 		goto out_error;
3052 	return;
3053 
3054 out_abort:
3055 	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
3056 out_error:
3057 	xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno);
3058 	return;
3059 }
3060 
3061 STATIC xfs_agino_t
3062 xlog_recover_process_one_iunlink(
3063 	struct xfs_mount		*mp,
3064 	xfs_agnumber_t			agno,
3065 	xfs_agino_t			agino,
3066 	int				bucket)
3067 {
3068 	struct xfs_buf			*ibp;
3069 	struct xfs_dinode		*dip;
3070 	struct xfs_inode		*ip;
3071 	xfs_ino_t			ino;
3072 	int				error;
3073 
3074 	ino = XFS_AGINO_TO_INO(mp, agno, agino);
3075 	error = xfs_iget(mp, NULL, ino, 0, 0, &ip);
3076 	if (error)
3077 		goto fail;
3078 
3079 	/*
3080 	 * Get the on disk inode to find the next inode in the bucket.
3081 	 */
3082 	error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK);
3083 	if (error)
3084 		goto fail_iput;
3085 
3086 	ASSERT(ip->i_d.di_nlink == 0);
3087 	ASSERT(ip->i_d.di_mode != 0);
3088 
3089 	/* setup for the next pass */
3090 	agino = be32_to_cpu(dip->di_next_unlinked);
3091 	xfs_buf_relse(ibp);
3092 
3093 	/*
3094 	 * Prevent any DMAPI event from being sent when the reference on
3095 	 * the inode is dropped.
3096 	 */
3097 	ip->i_d.di_dmevmask = 0;
3098 
3099 	IRELE(ip);
3100 	return agino;
3101 
3102  fail_iput:
3103 	IRELE(ip);
3104  fail:
3105 	/*
3106 	 * We can't read in the inode this bucket points to, or this inode
3107 	 * is messed up.  Just ditch this bucket of inodes.  We will lose
3108 	 * some inodes and space, but at least we won't hang.
3109 	 *
3110 	 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
3111 	 * clear the inode pointer in the bucket.
3112 	 */
3113 	xlog_recover_clear_agi_bucket(mp, agno, bucket);
3114 	return NULLAGINO;
3115 }
3116 
3117 /*
3118  * xlog_iunlink_recover
3119  *
3120  * This is called during recovery to process any inodes which
3121  * we unlinked but did not free when the system crashed.  These
3122  * inodes will be on the lists in the AGI blocks.  What we do
3123  * here is scan all the AGIs and fully truncate and free any
3124  * inodes found on the lists.  Each inode is removed from the
3125  * lists when it has been fully truncated and is freed.  The
3126  * freeing of the inode and its removal from the list must be
3127  * atomic.
3128  */
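/*
 * In outline, the scan below is:
 *
 *	for each AG:
 *		read the AGI
 *		for each of the XFS_AGI_UNLINKED_BUCKETS hash buckets:
 *			follow the di_next_unlinked chain one inode at a
 *			time via xlog_recover_process_one_iunlink() until
 *			NULLAGINO is reached
 */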
3129 STATIC void
3130 xlog_recover_process_iunlinks(
3131 	xlog_t		*log)
3132 {
3133 	xfs_mount_t	*mp;
3134 	xfs_agnumber_t	agno;
3135 	xfs_agi_t	*agi;
3136 	xfs_buf_t	*agibp;
3137 	xfs_agino_t	agino;
3138 	int		bucket;
3139 	int		error;
3140 	uint		mp_dmevmask;
3141 
3142 	mp = log->l_mp;
3143 
3144 	/*
3145 	 * Prevent any DMAPI event from being sent while in this function.
3146 	 */
3147 	mp_dmevmask = mp->m_dmevmask;
3148 	mp->m_dmevmask = 0;
3149 
3150 	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3151 		/*
3152 		 * Find the agi for this ag.
3153 		 */
3154 		error = xfs_read_agi(mp, NULL, agno, &agibp);
3155 		if (error) {
3156 			/*
3157 			 * AGI is b0rked. Don't process it.
3158 			 *
3159 			 * We should probably mark the filesystem as corrupt
3160 			 * after we've recovered all the ag's we can....
3161 			 */
3162 			continue;
3163 		}
3164 		/*
3165 		 * Unlock the buffer so that it can be acquired in the normal
3166 		 * course of the transaction to truncate and free each inode.
3167 		 * Because we are not racing with anyone else here for the AGI
3168 		 * buffer, we don't even need to hold it locked to read the
3169 		 * initial unlinked bucket entries out of the buffer. We keep
3170 		 * a reference to the buffer, though, so that it stays pinned
3171 		 * in memory while we need it.
3172 		 */
3173 		agi = XFS_BUF_TO_AGI(agibp);
3174 		xfs_buf_unlock(agibp);
3175 
3176 		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
3177 			agino = be32_to_cpu(agi->agi_unlinked[bucket]);
3178 			while (agino != NULLAGINO) {
3179 				agino = xlog_recover_process_one_iunlink(mp,
3180 							agno, agino, bucket);
3181 			}
3182 		}
3183 		xfs_buf_rele(agibp);
3184 	}
3185 
3186 	mp->m_dmevmask = mp_dmevmask;
3187 }
3188 
3189 
3190 #ifdef DEBUG
3191 STATIC void
3192 xlog_pack_data_checksum(
3193 	xlog_t		*log,
3194 	xlog_in_core_t	*iclog,
3195 	int		size)
3196 {
3197 	int		i;
3198 	__be32		*up;
3199 	uint		chksum = 0;
3200 
3201 	up = (__be32 *)iclog->ic_datap;
3202 	/* divide length by 4 to get # words */
3203 	for (i = 0; i < (size >> 2); i++) {
3204 		chksum ^= be32_to_cpu(*up);
3205 		up++;
3206 	}
3207 	iclog->ic_header.h_chksum = cpu_to_be32(chksum);
3208 }
3209 #else
3210 #define xlog_pack_data_checksum(log, iclog, size)
3211 #endif
3212 
3213 /*
3214  * Stamp cycle number in every block
3215  */
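/*
 * Rough example: the first __be32 of each 512 byte basic block of log
 * data is saved into h_cycle_data[] (and, for v2 logs with records
 * larger than XLOG_HEADER_CYCLE_SIZE, into the extended headers that
 * follow) and then overwritten with the current cycle number.
 * xlog_unpack_data() below restores the saved words at recovery time;
 * the stamped cycle numbers are what lets the head/tail search detect
 * where the log wraps.
 */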
3216 void
3217 xlog_pack_data(
3218 	xlog_t			*log,
3219 	xlog_in_core_t		*iclog,
3220 	int			roundoff)
3221 {
3222 	int			i, j, k;
3223 	int			size = iclog->ic_offset + roundoff;
3224 	__be32			cycle_lsn;
3225 	xfs_caddr_t		dp;
3226 
3227 	xlog_pack_data_checksum(log, iclog, size);
3228 
3229 	cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);
3230 
3231 	dp = iclog->ic_datap;
3232 	for (i = 0; i < BTOBB(size) &&
3233 		i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3234 		iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
3235 		*(__be32 *)dp = cycle_lsn;
3236 		dp += BBSIZE;
3237 	}
3238 
3239 	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3240 		xlog_in_core_2_t *xhdr = iclog->ic_data;
3241 
3242 		for ( ; i < BTOBB(size); i++) {
3243 			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3244 			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3245 			xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
3246 			*(__be32 *)dp = cycle_lsn;
3247 			dp += BBSIZE;
3248 		}
3249 
3250 		for (i = 1; i < log->l_iclog_heads; i++) {
3251 			xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
3252 		}
3253 	}
3254 }
3255 
3256 STATIC void
3257 xlog_unpack_data(
3258 	xlog_rec_header_t	*rhead,
3259 	xfs_caddr_t		dp,
3260 	xlog_t			*log)
3261 {
3262 	int			i, j, k;
3263 
3264 	for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
3265 		  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
3266 		*(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
3267 		dp += BBSIZE;
3268 	}
3269 
3270 	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3271 		xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
3272 		for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
3273 			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3274 			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3275 			*(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
3276 			dp += BBSIZE;
3277 		}
3278 	}
3279 }
3280 
3281 STATIC int
3282 xlog_valid_rec_header(
3283 	xlog_t			*log,
3284 	xlog_rec_header_t	*rhead,
3285 	xfs_daddr_t		blkno)
3286 {
3287 	int			hlen;
3288 
3289 	if (unlikely(rhead->h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM))) {
3290 		XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
3291 				XFS_ERRLEVEL_LOW, log->l_mp);
3292 		return XFS_ERROR(EFSCORRUPTED);
3293 	}
3294 	if (unlikely(
3295 	    (!rhead->h_version ||
3296 	    (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
3297 		xfs_warn(log->l_mp, "%s: unrecognised log version (%d).",
3298 			__func__, be32_to_cpu(rhead->h_version));
3299 		return XFS_ERROR(EIO);
3300 	}
3301 
3302 	/* LR body must have data or it wouldn't have been written */
3303 	hlen = be32_to_cpu(rhead->h_len);
3304 	if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
3305 		XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
3306 				XFS_ERRLEVEL_LOW, log->l_mp);
3307 		return XFS_ERROR(EFSCORRUPTED);
3308 	}
3309 	if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
3310 		XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
3311 				XFS_ERRLEVEL_LOW, log->l_mp);
3312 		return XFS_ERROR(EFSCORRUPTED);
3313 	}
3314 	return 0;
3315 }
3316 
3317 /*
3318  * Read the log from tail to head and process the log records found.
3319  * Handle the two cases where the tail and head are in the same cycle
3320  * and where the active portion of the log wraps around the end of
3321  * the physical log separately.  The pass parameter is passed through
3322  * to the routines called to process the data and is not looked at
3323  * here.
3324  */
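/*
 * For example, if tail_blk > head_blk the active region wraps: blocks
 * [tail_blk, l_logBBsize) are processed first and then [0, head_blk).
 * A record header or body that straddles the physical end of the log is
 * read in two pieces into the same buffer (the split_hblks/split_bblks
 * handling below).
 */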
3325 STATIC int
3326 xlog_do_recovery_pass(
3327 	xlog_t			*log,
3328 	xfs_daddr_t		head_blk,
3329 	xfs_daddr_t		tail_blk,
3330 	int			pass)
3331 {
3332 	xlog_rec_header_t	*rhead;
3333 	xfs_daddr_t		blk_no;
3334 	xfs_caddr_t		offset;
3335 	xfs_buf_t		*hbp, *dbp;
3336 	int			error = 0, h_size;
3337 	int			bblks, split_bblks;
3338 	int			hblks, split_hblks, wrapped_hblks;
3339 	struct hlist_head	rhash[XLOG_RHASH_SIZE];
3340 
3341 	ASSERT(head_blk != tail_blk);
3342 
3343 	/*
3344 	 * Read the header of the tail block and get the iclog buffer size from
3345 	 * h_size.  Use this to tell how many sectors make up the log header.
3346 	 */
3347 	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
3348 		/*
3349 		 * When using variable length iclogs, read first sector of
3350 		 * iclog header and extract the header size from it.  Get a
3351 		 * new hbp that is the correct size.
3352 		 */
3353 		hbp = xlog_get_bp(log, 1);
3354 		if (!hbp)
3355 			return ENOMEM;
3356 
3357 		error = xlog_bread(log, tail_blk, 1, hbp, &offset);
3358 		if (error)
3359 			goto bread_err1;
3360 
3361 		rhead = (xlog_rec_header_t *)offset;
3362 		error = xlog_valid_rec_header(log, rhead, tail_blk);
3363 		if (error)
3364 			goto bread_err1;
3365 		h_size = be32_to_cpu(rhead->h_size);
3366 		if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
3367 		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
3368 			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
3369 			if (h_size % XLOG_HEADER_CYCLE_SIZE)
3370 				hblks++;
3371 			xlog_put_bp(hbp);
3372 			hbp = xlog_get_bp(log, hblks);
3373 		} else {
3374 			hblks = 1;
3375 		}
3376 	} else {
3377 		ASSERT(log->l_sectBBsize == 1);
3378 		hblks = 1;
3379 		hbp = xlog_get_bp(log, 1);
3380 		h_size = XLOG_BIG_RECORD_BSIZE;
3381 	}
3382 
3383 	if (!hbp)
3384 		return ENOMEM;
3385 	dbp = xlog_get_bp(log, BTOBB(h_size));
3386 	if (!dbp) {
3387 		xlog_put_bp(hbp);
3388 		return ENOMEM;
3389 	}
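	/* The data buffer must hold the largest record body, bounded by the iclog size h_size */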
3390 
3391 	memset(rhash, 0, sizeof(rhash));
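	/*
	 * Simple case: the active region of the log does not wrap, so walk
	 * it sequentially from tail_blk to head_blk, reading each record
	 * header and body and feeding them to this recovery pass.
	 */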
3392 	if (tail_blk <= head_blk) {
3393 		for (blk_no = tail_blk; blk_no < head_blk; ) {
3394 			error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3395 			if (error)
3396 				goto bread_err2;
3397 
3398 			rhead = (xlog_rec_header_t *)offset;
3399 			error = xlog_valid_rec_header(log, rhead, blk_no);
3400 			if (error)
3401 				goto bread_err2;
3402 
3403 			/* blocks in data section */
3404 			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3405 			error = xlog_bread(log, blk_no + hblks, bblks, dbp,
3406 					   &offset);
3407 			if (error)
3408 				goto bread_err2;
3409 
3410 			xlog_unpack_data(rhead, offset, log);
3411 			if ((error = xlog_recover_process_data(log,
3412 						rhash, rhead, offset, pass)))
3413 				goto bread_err2;
3414 			blk_no += bblks + hblks;
3415 		}
3416 	} else {
3417 		/*
3418 		 * Perform recovery around the end of the physical log.
3419 		 * When the head is not on the same cycle number as the tail,
3420 		 * we can't do a sequential recovery as above.
3421 		 */
3422 		blk_no = tail_blk;
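		/*
		 * First process records from the tail to the physical end of
		 * the log, allowing for headers and bodies that wrap around
		 * the end of the device.
		 */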
3423 		while (blk_no < log->l_logBBsize) {
3424 			/*
3425 			 * Check for header wrapping around physical end-of-log
3426 			 */
3427 			offset = hbp->b_addr;
3428 			split_hblks = 0;
3429 			wrapped_hblks = 0;
3430 			if (blk_no + hblks <= log->l_logBBsize) {
3431 				/* Read header in one read */
3432 				error = xlog_bread(log, blk_no, hblks, hbp,
3433 						   &offset);
3434 				if (error)
3435 					goto bread_err2;
3436 			} else {
3437 				/* This LR is split across physical log end */
3438 				if (blk_no != log->l_logBBsize) {
3439 					/* some data before physical log end */
3440 					ASSERT(blk_no <= INT_MAX);
3441 					split_hblks = log->l_logBBsize - (int)blk_no;
3442 					ASSERT(split_hblks > 0);
3443 					error = xlog_bread(log, blk_no,
3444 							   split_hblks, hbp,
3445 							   &offset);
3446 					if (error)
3447 						goto bread_err2;
3448 				}
3449 
3450 				/*
3451 				 * Note: this black magic still works with
3452 				 * large sector sizes (non-512) only because:
3453 				 * - we increased the buffer size originally
3454 				 *   by 1 sector giving us enough extra space
3455 				 *   for the second read;
3456 				 * - the log start is guaranteed to be sector
3457 				 *   aligned;
3458 				 * - we read the log end (LR header start)
3459 				 *   _first_, then the log start (LR header end)
3460 				 *   - order is important.
3461 				 */
3462 				wrapped_hblks = hblks - split_hblks;
3463 				error = xlog_bread_offset(log, 0,
3464 						wrapped_hblks, hbp,
3465 						offset + BBTOB(split_hblks));
3466 				if (error)
3467 					goto bread_err2;
3468 			}
3469 			rhead = (xlog_rec_header_t *)offset;
3470 			error = xlog_valid_rec_header(log, rhead,
3471 						split_hblks ? blk_no : 0);
3472 			if (error)
3473 				goto bread_err2;
3474 
3475 			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3476 			blk_no += hblks;
3477 
3478 			/* Read in data for log record */
3479 			if (blk_no + bblks <= log->l_logBBsize) {
3480 				error = xlog_bread(log, blk_no, bblks, dbp,
3481 						   &offset);
3482 				if (error)
3483 					goto bread_err2;
3484 			} else {
3485 				/* This log record is split across the
3486 				 * physical end of log */
3487 				offset = dbp->b_addr;
3488 				split_bblks = 0;
3489 				if (blk_no != log->l_logBBsize) {
3490 					/* some data is before the physical
3491 					 * end of log */
3492 					ASSERT(!wrapped_hblks);
3493 					ASSERT(blk_no <= INT_MAX);
3494 					split_bblks =
3495 						log->l_logBBsize - (int)blk_no;
3496 					ASSERT(split_bblks > 0);
3497 					error = xlog_bread(log, blk_no,
3498 							split_bblks, dbp,
3499 							&offset);
3500 					if (error)
3501 						goto bread_err2;
3502 				}
3503 
3504 				/*
3505 				 * Note: this black magic still works with
3506 				 * large sector sizes (non-512) only because:
3507 				 * - we increased the buffer size originally
3508 				 *   by 1 sector giving us enough extra space
3509 				 *   for the second read;
3510 				 * - the log start is guaranteed to be sector
3511 				 *   aligned;
3512 				 * - we read the log end (LR header start)
3513 				 *   _first_, then the log start (LR header end)
3514 				 *   - order is important.
3515 				 */
3516 				error = xlog_bread_offset(log, 0,
3517 						bblks - split_bblks, dbp,
3518 						offset + BBTOB(split_bblks));
3519 				if (error)
3520 					goto bread_err2;
3521 			}
3522 			xlog_unpack_data(rhead, offset, log);
3523 			if ((error = xlog_recover_process_data(log, rhash,
3524 							rhead, offset, pass)))
3525 				goto bread_err2;
3526 			blk_no += bblks;
3527 		}
3528 
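		/*
		 * blk_no has passed the physical end of the log; wrap it back
		 * to the equivalent offset at the start of the device.
		 */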
3529 		ASSERT(blk_no >= log->l_logBBsize);
3530 		blk_no -= log->l_logBBsize;
3531 
3532 		/* read first part of physical log */
3533 		while (blk_no < head_blk) {
3534 			error = xlog_bread(log, blk_no, hblks, hbp, &offset);
3535 			if (error)
3536 				goto bread_err2;
3537 
3538 			rhead = (xlog_rec_header_t *)offset;
3539 			error = xlog_valid_rec_header(log, rhead, blk_no);
3540 			if (error)
3541 				goto bread_err2;
3542 
3543 			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
3544 			error = xlog_bread(log, blk_no+hblks, bblks, dbp,
3545 					   &offset);
3546 			if (error)
3547 				goto bread_err2;
3548 
3549 			xlog_unpack_data(rhead, offset, log);
3550 			if ((error = xlog_recover_process_data(log, rhash,
3551 							rhead, offset, pass)))
3552 				goto bread_err2;
3553 			blk_no += bblks + hblks;
3554 		}
3555 	}
3556 
3557  bread_err2:
3558 	xlog_put_bp(dbp);
3559  bread_err1:
3560 	xlog_put_bp(hbp);
3561 	return error;
3562 }
3563 
3564 /*
3565  * Do the recovery of the log.  We actually do this in two phases.
3566  * The two passes are necessary in order to implement the function
3567  * of cancelling a record written into the log.  The first pass
3568  * determines those things which have been cancelled, and the
3569  * second pass replays log items normally except for those which
3570  * have been cancelled.  The handling of the replay and cancellations
3571  * takes place in the log item type specific routines.
3572  *
3573  * The table of items which have cancel records in the log is allocated
3574  * and freed at this level, since only here do we know when all of
3575  * the log recovery has been completed.
3576  */
3577 STATIC int
3578 xlog_do_log_recovery(
3579 	xlog_t		*log,
3580 	xfs_daddr_t	head_blk,
3581 	xfs_daddr_t	tail_blk)
3582 {
3583 	int		error, i;
3584 
3585 	ASSERT(head_blk != tail_blk);
3586 
3587 	/*
3588 	 * First do a pass to find all of the cancelled buf log items.
3589 	 * Store them in the buf_cancel_table for use in the second pass.
3590 	 */
3591 	log->l_buf_cancel_table = kmem_zalloc(XLOG_BC_TABLE_SIZE *
3592 						 sizeof(struct list_head),
3593 						 KM_SLEEP);
3594 	for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3595 		INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
3596 
3597 	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3598 				      XLOG_RECOVER_PASS1);
3599 	if (error != 0) {
3600 		kmem_free(log->l_buf_cancel_table);
3601 		log->l_buf_cancel_table = NULL;
3602 		return error;
3603 	}
3604 	/*
3605 	 * Then do a second pass to actually recover the items in the log.
3606 	 * When it is complete free the table of buf cancel items.
3607 	 */
3608 	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3609 				      XLOG_RECOVER_PASS2);
3610 #ifdef DEBUG
3611 	if (!error) {
3612 		int	i;
3613 
3614 		for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3615 			ASSERT(list_empty(&log->l_buf_cancel_table[i]));
3616 	}
3617 #endif	/* DEBUG */
3618 
3619 	kmem_free(log->l_buf_cancel_table);
3620 	log->l_buf_cancel_table = NULL;
3621 
3622 	return error;
3623 }
3624 
3625 /*
3626  * Do the actual recovery
3627  */
3628 STATIC int
3629 xlog_do_recover(
3630 	xlog_t		*log,
3631 	xfs_daddr_t	head_blk,
3632 	xfs_daddr_t	tail_blk)
3633 {
3634 	int		error;
3635 	xfs_buf_t	*bp;
3636 	xfs_sb_t	*sbp;
3637 
3638 	/*
3639 	 * First replay the images in the log.
3640 	 */
3641 	error = xlog_do_log_recovery(log, head_blk, tail_blk);
3642 	if (error) {
3643 		return error;
3644 	}
3645 
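	/*
	 * Push out and wait on all delayed-write metadata buffers dirtied by
	 * replay so that any I/O errors surface before recovery is declared
	 * complete.
	 */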
3646 	xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1);
3647 
3648 	/*
3649 	 * If IO errors happened during recovery, bail out.
3650 	 */
3651 	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
3652 		return (EIO);
3653 	}
3654 
3655 	/*
3656 	 * We now update the tail_lsn since much of the recovery has completed
3657 	 * and there may be space available to use.  If there were no extent
3658 	 * frees or iunlinks, we can free up the entire log and set the tail_lsn
3659 	 * be the last_sync_lsn.  This was set in xlog_find_tail to be the
3660 	 * lsn of the last known good LR on disk.  If there are extent frees
3661 	 * or iunlinks they will have some entries in the AIL; so we look at
3662 	 * the AIL to determine how to set the tail_lsn.
3663 	 */
3664 	xlog_assign_tail_lsn(log->l_mp);
3665 
3666 	/*
3667 	 * Now that we've finished replaying all buffer and inode
3668 	 * updates, re-read in the superblock.
3669 	 */
3670 	bp = xfs_getsb(log->l_mp, 0);
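	/* Reset the buffer state and set it up for a synchronous read */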
3671 	XFS_BUF_UNDONE(bp);
3672 	ASSERT(!(XFS_BUF_ISWRITE(bp)));
3673 	ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
3674 	XFS_BUF_READ(bp);
3675 	XFS_BUF_UNASYNC(bp);
3676 	xfsbdstrat(log->l_mp, bp);
3677 	error = xfs_buf_iowait(bp);
3678 	if (error) {
3679 		xfs_buf_ioerror_alert(bp, __func__);
3680 		ASSERT(0);
3681 		xfs_buf_relse(bp);
3682 		return error;
3683 	}
3684 
3685 	/* Convert superblock from on-disk format */
3686 	sbp = &log->l_mp->m_sb;
3687 	xfs_sb_from_disk(log->l_mp, XFS_BUF_TO_SBP(bp));
3688 	ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
3689 	ASSERT(xfs_sb_good_version(sbp));
3690 	xfs_buf_relse(bp);
3691 
3692 	/* We've re-read the superblock so re-initialize per-cpu counters */
3693 	xfs_icsb_reinit_counters(log->l_mp);
3694 
3695 	xlog_recover_check_summary(log);
3696 
3697 	/* Normal transactions can now occur */
3698 	log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
3699 	return 0;
3700 }
3701 
3702 /*
3703  * Perform recovery and re-initialize some log variables in xlog_find_tail.
3704  *
3705  * Return error or zero.
3706  */
3707 int
3708 xlog_recover(
3709 	xlog_t		*log)
3710 {
3711 	xfs_daddr_t	head_blk, tail_blk;
3712 	int		error;
3713 
3714 	/* find the tail of the log */
3715 	if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
3716 		return error;
3717 
3718 	if (tail_blk != head_blk) {
3719 		/* There used to be a comment here:
3720 		 *
3721 		 * disallow recovery on read-only mounts.  note -- mount
3722 		 * checks for ENOSPC and turns it into an intelligent
3723 		 * error message.
3724 		 * ...but this is no longer true.  Now, unless you specify
3725 		 * NORECOVERY (in which case this function would never be
3726 		 * called), we just go ahead and recover.  We do this all
3727 		 * under the vfs layer, so we can get away with it unless
3728 		 * the device itself is read-only, in which case we fail.
3729 		 */
3730 		if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
3731 			return error;
3732 		}
3733 
3734 		xfs_notice(log->l_mp, "Starting recovery (logdev: %s)",
3735 				log->l_mp->m_logname ? log->l_mp->m_logname
3736 						     : "internal");
3737 
3738 		error = xlog_do_recover(log, head_blk, tail_blk);
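		/* xlog_recover_finish() must still process EFIs and unlinked inodes */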
3739 		log->l_flags |= XLOG_RECOVERY_NEEDED;
3740 	}
3741 	return error;
3742 }
3743 
3744 /*
3745  * In the first part of recovery we replay inodes and buffers and build
3746  * up the list of extent free items which need to be processed.  Here
3747  * we process the extent free items and clean up the on disk unlinked
3748  * inode lists.  This is separated from the first part of recovery so
3749  * that the root and real-time bitmap inodes can be read in from disk in
3750  * between the two stages.  This is necessary so that we can free space
3751  * in the real-time portion of the file system.
3752  */
3753 int
3754 xlog_recover_finish(
3755 	xlog_t		*log)
3756 {
3757 	/*
3758 	 * Now we're ready to do the transactions needed for the
3759 	 * rest of recovery.  Start with completing all the extent
3760 	 * free intent records and then process the unlinked inode
3761 	 * lists.  At this point, we essentially run in normal mode
3762 	 * except that we're still performing recovery actions
3763 	 * rather than accepting new requests.
3764 	 */
3765 	if (log->l_flags & XLOG_RECOVERY_NEEDED) {
3766 		int	error;
3767 		error = xlog_recover_process_efis(log);
3768 		if (error) {
3769 			xfs_alert(log->l_mp, "Failed to recover EFIs");
3770 			return error;
3771 		}
3772 		/*
3773 		 * Sync the log to get all the EFIs out of the AIL.
3774 		 * This isn't absolutely necessary, but it helps in
3775 		 * case the unlink transactions would have problems
3776 		 * pushing the EFIs out of the way.
3777 		 */
3778 		xfs_log_force(log->l_mp, XFS_LOG_SYNC);
3779 
3780 		xlog_recover_process_iunlinks(log);
3781 
3782 		xlog_recover_check_summary(log);
3783 
3784 		xfs_notice(log->l_mp, "Ending recovery (logdev: %s)",
3785 				log->l_mp->m_logname ? log->l_mp->m_logname
3786 						     : "internal");
3787 		log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3788 	} else {
3789 		xfs_info(log->l_mp, "Ending clean mount");
3790 	}
3791 	return 0;
3792 }
3793 
3794 
3795 #if defined(DEBUG)
3796 /*
3797  * Read all of the agf and agi counters and accumulate the filesystem-wide
3798  * free block and inode totals from them.
3799  */
3800 void
3801 xlog_recover_check_summary(
3802 	xlog_t		*log)
3803 {
3804 	xfs_mount_t	*mp;
3805 	xfs_agf_t	*agfp;
3806 	xfs_buf_t	*agfbp;
3807 	xfs_buf_t	*agibp;
3808 	xfs_agnumber_t	agno;
3809 	__uint64_t	freeblks;
3810 	__uint64_t	itotal;
3811 	__uint64_t	ifree;
3812 	int		error;
3813 
3814 	mp = log->l_mp;
3815 
3816 	freeblks = 0LL;
3817 	itotal = 0LL;
3818 	ifree = 0LL;
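	/* Walk every AG, summing free blocks from the AGF and inode counts from the AGI */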
3819 	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
3820 		error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
3821 		if (error) {
3822 			xfs_alert(mp, "%s agf read failed agno %d error %d",
3823 						__func__, agno, error);
3824 		} else {
3825 			agfp = XFS_BUF_TO_AGF(agfbp);
3826 			freeblks += be32_to_cpu(agfp->agf_freeblks) +
3827 				    be32_to_cpu(agfp->agf_flcount);
3828 			xfs_buf_relse(agfbp);
3829 		}
3830 
3831 		error = xfs_read_agi(mp, NULL, agno, &agibp);
3832 		if (error) {
3833 			xfs_alert(mp, "%s agi read failed agno %d error %d",
3834 						__func__, agno, error);
3835 		} else {
3836 			struct xfs_agi	*agi = XFS_BUF_TO_AGI(agibp);
3837 
3838 			itotal += be32_to_cpu(agi->agi_count);
3839 			ifree += be32_to_cpu(agi->agi_freecount);
3840 			xfs_buf_relse(agibp);
3841 		}
3842 	}
3843 }
3844 #endif /* DEBUG */
3845