/*
 * linux/fs/jbd2/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd2.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/crc32.h>
#include <linux/writeback.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
#include <trace/events/jbd2.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}

/*
 * When an ext4 file is truncated, it is possible that some pages are not
 * successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}

/*
 * Done it all: now submit the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_submit_commit_record(journal_t *journal,
					transaction_t *commit_transaction,
					struct buffer_head **cbh,
					__u32 crc32_sum)
{
	struct journal_head *descriptor;
	struct commit_header *tmp;
	struct buffer_head *bh;
	int ret;
	struct timespec now = current_kernel_time();

	*cbh = NULL;

	if (is_journal_aborted(journal))
		return 0;

	descriptor = jbd2_journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	tmp = (struct commit_header *)bh->b_data;
	tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
	tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
	tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);

	if (JBD2_HAS_COMPAT_FEATURE(journal,
				    JBD2_FEATURE_COMPAT_CHECKSUM)) {
		tmp->h_chksum_type	= JBD2_CRC32_CHKSUM;
		tmp->h_chksum_size	= JBD2_CRC32_CHKSUM_SIZE;
		tmp->h_chksum[0]	= cpu_to_be32(crc32_sum);
	}
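	/*
	 * With COMPAT_CHECKSUM, h_chksum[0] carries the crc32 that was
	 * chained over every block of this transaction (see
	 * jbd2_checksum_data() below), so recovery can verify the whole
	 * transaction from the commit block alone.
	 */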

	JBUFFER_TRACE(descriptor, "submit commit block");
	lock_buffer(bh);
	clear_buffer_dirty(bh);
	set_buffer_uptodate(bh);
	bh->b_end_io = journal_end_buffer_io_sync;

	if (journal->j_flags & JBD2_BARRIER &&
	    !JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT))
		ret = submit_bh(WRITE_SYNC | WRITE_FLUSH_FUA, bh);
	else
		ret = submit_bh(WRITE_SYNC, bh);
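	/*
	 * In the barrier case above, FLUSH drains the device cache so all
	 * previously submitted journal blocks are stable before the commit
	 * block, while FUA makes the commit block itself durable.  With an
	 * async (checksummed) commit that ordering is not needed here and a
	 * plain WRITE_SYNC suffices; the flush is issued later instead.
	 */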

	*cbh = bh;
	return ret;
}

/*
 * This function, along with journal_submit_commit_record(), allows the
 * commit record to be written asynchronously.
 */
static int journal_wait_on_commit_record(journal_t *journal,
					 struct buffer_head *bh)
{
	int ret = 0;

	clear_buffer_dirty(bh);
	wait_on_buffer(bh);

	if (unlikely(!buffer_uptodate(bh)))
		ret = -EIO;
	put_bh(bh);            /* One for getblk() */
	jbd2_journal_put_journal_head(bh2jh(bh));

	return ret;
}

/*
 * Write the filemap data using writepage() address_space_operations.
 * We don't do block allocation here even for delalloc. We don't
 * use writepages() because with delayed allocation we may be doing
 * block allocation in writepages().
 */
static int journal_submit_inode_data_buffers(struct address_space *mapping)
{
	int ret;
	struct writeback_control wbc = {
		.sync_mode =  WB_SYNC_ALL,
		.nr_to_write = mapping->nrpages * 2,
		.range_start = 0,
		.range_end = i_size_read(mapping->host),
	};
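	/*
	 * nr_to_write is given some slack (nrpages * 2), presumably so the
	 * WB_SYNC_ALL writeout is not cut short, and the range stops at
	 * i_size: only blocks that are already allocated need writing here.
	 */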

	ret = generic_writepages(mapping, &wbc);
	return ret;
}

/*
 * Submit all the data buffers of inodes associated with the transaction to
 * disk.
 *
 * We are in a committing transaction. Therefore no new inode can be added to
 * our inode list. We use the JI_COMMIT_RUNNING flag to protect the inode we
 * are currently operating on from being released while we write out pages.
 */
static int journal_submit_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode;
	int err, ret = 0;
	struct address_space *mapping;

	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		mapping = jinode->i_vfs_inode->i_mapping;
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
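		/*
		 * __JI_COMMIT_RUNNING is what keeps this jbd2_inode alive
		 * while j_list_lock is dropped for the (sleeping) writeout
		 * below; jbd2_journal_release_jbd_inode() waits on this bit
		 * before freeing the structure.
		 */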
		spin_unlock(&journal->j_list_lock);
		/*
		 * Submit the inode data buffers. We use writepage
		 * instead of writepages because writepages can do
		 * block allocation with delalloc; we need to write
		 * only already-allocated blocks here.
		 */
		trace_jbd2_submit_inode_data(jinode->i_vfs_inode);
		err = journal_submit_inode_data_buffers(mapping);
		if (!ret)
			ret = err;
		spin_lock(&journal->j_list_lock);
		J_ASSERT(jinode->i_transaction == commit_transaction);
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}
	spin_unlock(&journal->j_list_lock);
	return ret;
}

/*
 * Wait for data submitted for writeout, refile inodes to proper
 * transaction if needed.
 */
static int journal_finish_inode_data_buffers(journal_t *journal,
		transaction_t *commit_transaction)
{
	struct jbd2_inode *jinode, *next_i;
	int err, ret = 0;

	/* For locking, see the comment in journal_submit_data_buffers() */
	spin_lock(&journal->j_list_lock);
	list_for_each_entry(jinode, &commit_transaction->t_inode_list, i_list) {
		set_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		spin_unlock(&journal->j_list_lock);
		err = filemap_fdatawait(jinode->i_vfs_inode->i_mapping);
		if (err) {
			/*
			 * Because AS_EIO is cleared by
			 * filemap_fdatawait_range(), set it again so
			 * that user process can get -EIO from fsync().
			 */
			set_bit(AS_EIO,
				&jinode->i_vfs_inode->i_mapping->flags);

			if (!ret)
				ret = err;
		}
		spin_lock(&journal->j_list_lock);
		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
		smp_mb__after_clear_bit();
		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
	}

	/* Now refile inodes to the proper lists */
	list_for_each_entry_safe(jinode, next_i,
				 &commit_transaction->t_inode_list, i_list) {
		list_del(&jinode->i_list);
		if (jinode->i_next_transaction) {
			jinode->i_transaction = jinode->i_next_transaction;
			jinode->i_next_transaction = NULL;
			list_add(&jinode->i_list,
				&jinode->i_transaction->t_inode_list);
		} else {
			jinode->i_transaction = NULL;
		}
	}
	spin_unlock(&journal->j_list_lock);

	return ret;
}

static __u32 jbd2_checksum_data(__u32 crc32_sum, struct buffer_head *bh)
{
	struct page *page = bh->b_page;
	char *addr;
	__u32 checksum;

	addr = kmap_atomic(page);
	checksum = crc32_be(crc32_sum,
		(void *)(addr + offset_in_page(bh->b_data)), bh->b_size);
	kunmap_atomic(addr);

	return checksum;
}
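
/*
 * The commit-time checksum is chained block by block: the caller seeds
 * crc32_sum with ~0 and folds in each journal block as it is submitted.
 * In effect, for blocks b0..bn (bh_of() is just shorthand here):
 *
 *	csum = ~0;
 *	for (i = 0; i <= n; i++)
 *		csum = jbd2_checksum_data(csum, bh_of(b_i));
 *
 * and the final value is what journal_submit_commit_record() stores in
 * the commit header.
 */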

static void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
				   unsigned long long block)
{
	tag->t_blocknr = cpu_to_be32(block & (u32)~0);
	if (tag_bytes > JBD2_TAG_SIZE32)
		tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
}
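
/*
 * The split above stores the low 32 bits in t_blocknr and, for 64-bit
 * journals, the high bits in t_blocknr_high.  For example, block
 * 0x123456789 becomes t_blocknr = 0x23456789, t_blocknr_high = 0x1.
 * ((block >> 31) >> 1 is a shift by 32, presumably written this way so
 * it stays well defined even for a 32-bit block type.)
 */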

/*
 * jbd2_journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void jbd2_journal_commit_transaction(journal_t *journal)
{
	struct transaction_stats_s stats;
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long long blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	int tag_bytes = journal_tag_bytes(journal);
	struct buffer_head *cbh = NULL; /* For transactional checksums */
	__u32 crc32_sum = ~0;
	struct blk_plug plug;
	/* Tail of the journal */
	unsigned long first_block;
	tid_t first_tid;
	int update_tail;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
	if (journal->j_flags & JBD2_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		mutex_lock(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		jbd2_journal_update_sb_log_tail(journal,
						journal->j_tail_sequence,
						journal->j_tail,
						WRITE_SYNC);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	trace_jbd2_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd2_commit_locking(journal, commit_transaction);
	stats.run.rs_wait = commit_transaction->t_max_wait;
	stats.run.rs_locked = jiffies;
	stats.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
					      stats.run.rs_locked);

	spin_lock(&commit_transaction->t_handle_lock);
	while (atomic_read(&commit_transaction->t_updates)) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (atomic_read(&commit_transaction->t_updates)) {
			spin_unlock(&commit_transaction->t_handle_lock);
			write_unlock(&journal->j_state_lock);
			schedule();
			write_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);
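	/*
	 * The loop above is the standard prepare_to_wait() pattern:
	 * t_updates is rechecked after prepare_to_wait() and before
	 * schedule(), so a handle completing in that window cannot cause
	 * a lost wakeup (jbd2_journal_stop() wakes j_wait_updates when
	 * t_updates drops to zero).
	 */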

	J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a jbd2_journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple jbd2_journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		jbd2_journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory.
	 */
	spin_lock(&journal->j_list_lock);
	__jbd2_journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD2: commit phase 1\n");

	/*
	 * Clear the revoked flag to reflect that there are no revoked buffers
	 * in the next transaction which is going to be started.
	 */
	jbd2_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	jbd2_journal_switch_revoke_table(journal);
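	/*
	 * The journal keeps two revoke hash tables; switching here hands
	 * the filled table to this commit for writeout while the next
	 * running transaction records its revokes in the empty one.
	 */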

	trace_jbd2_commit_flushing(journal, commit_transaction);
	stats.run.rs_flushing = jiffies;
	stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
					     stats.run.rs_flushing);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	write_unlock(&journal->j_state_lock);

	jbd_debug(3, "JBD2: commit phase 2a\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = journal_submit_data_buffers(journal, commit_transaction);
	if (err)
		jbd2_journal_abort(journal, err);

	blk_start_plug(&plug);
	jbd2_journal_write_revoke_records(journal, commit_transaction,
					  WRITE_SYNC);
	blk_finish_plug(&plug);

	jbd_debug(3, "JBD2: commit phase 2b\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	write_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	write_unlock(&journal->j_state_lock);

	trace_jbd2_commit_logging(journal, commit_transaction);
	stats.run.rs_logging = jiffies;
	stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing,
					       stats.run.rs_logging);
	stats.run.rs_blocks =
		atomic_read(&commit_transaction->t_outstanding_credits);
	stats.run.rs_blocks_logged = 0;

	J_ASSERT(commit_transaction->t_nr_buffers <=
		 atomic_read(&commit_transaction->t_outstanding_credits));

	err = 0;
	descriptor = NULL;
	bufs = 0;
	blk_start_plug(&plug);
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			jbd2_buffer_abort_trigger(jh,
						  jh->b_frozen_data ?
						  jh->b_frozen_triggers :
						  jh->b_triggers);
			jbd2_journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD2: get descriptor\n");

			descriptor = jbd2_journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				jbd2_journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD2: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic     = cpu_to_be32(JBD2_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
                           completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			jbd2_journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}
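		/*
		 * The descriptor block just set up is, roughly:
		 *
		 *	journal_header_t  (magic, DESCRIPTOR_BLOCK, tid)
		 *	tag for data block 0  + 16-byte journal UUID
		 *	tag for data block 1
		 *	...
		 *	last tag, flagged JBD2_FLAG_LAST_TAG
		 *
		 * The rest of the loop fills in one tag per metadata
		 * block it submits.
		 */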

		/* Where is the buffer to be written? */

		err = jbd2_journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			jbd2_journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by jbd2_journal_next_log_block() also.
		 */
		atomic_dec(&commit_transaction->t_outstanding_credits);

		/* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO */

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: jbd2_journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = jbd2_journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		if (flags < 0) {
			jbd2_journal_abort(journal, flags);
			continue;
		}
		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
                   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JBD2_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JBD2_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += tag_bytes;
		space_left -= tag_bytes;

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < tag_bytes + 16) {

			jbd_debug(4, "JBD2: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				/*
				 * Compute checksum.
				 */
				if (JBD2_HAS_COMPAT_FEATURE(journal,
					JBD2_FEATURE_COMPAT_CHECKSUM)) {
					crc32_sum =
					    jbd2_checksum_data(crc32_sum, bh);
				}

				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE_SYNC, bh);
			}
			cond_resched();
			stats.run.rs_blocks_logged += bufs;

			/* Force a new descriptor to be generated next
                           time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	err = journal_finish_inode_data_buffers(journal, commit_transaction);
	if (err) {
		printk(KERN_WARNING
			"JBD2: Detected IO errors while flushing file data "
		       "on %s\n", journal->j_devname);
		if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR)
			jbd2_journal_abort(journal, err);
		err = 0;
	}

	/*
	 * Get current oldest transaction in the log before we issue flush
	 * to the filesystem device. After the flush we can be sure that
	 * blocks of all older transactions are checkpointed to persistent
	 * storage and we will be safe to update journal start in the
	 * superblock with the numbers we get here.
	 */
	update_tail =
		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);

	write_lock(&journal->j_state_lock);
	if (update_tail) {
		long freed = first_block - journal->j_tail;

		if (first_block < journal->j_tail)
			freed += journal->j_last - journal->j_first;
		/* Update tail only if we free significant amount of space */
		if (freed < journal->j_maxlen / 4)
			update_tail = 0;
	}
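	/*
	 * The adjustment above accounts for the log being circular.  For
	 * example, with j_first = 1, j_last = 8193, j_tail = 8000 and
	 * first_block = 100, freed = (100 - 8000) + (8193 - 1) = 292
	 * blocks reclaimed across the wrap.
	 */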
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_DFLUSH;
	write_unlock(&journal->j_state_lock);

	/*
	 * If the journal is not located on the file system device,
	 * then we must flush the file system device before we issue
	 * the commit record
	 */
	if (commit_transaction->t_need_data_flush &&
	    (journal->j_fs_dev != journal->j_dev) &&
	    (journal->j_flags & JBD2_BARRIER))
		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);

	/* Done it all: now write the commit record asynchronously. */
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						 &cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
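	/*
	 * With ASYNC_COMMIT the commit block can go out here, before the
	 * metadata writes are known to have completed: recovery relies on
	 * the transaction checksum rather than write ordering to detect a
	 * partially written transaction.
	 */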

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD2: commit phase 3\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		jbd2_journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by jbd2_journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
                   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/*
		 * Wake up any transactions which were waiting for this IO to
		 * complete. The barrier must be here so that changes by
		 * jbd2_journal_file_buffer() take effect before wake_up_bit()
		 * does the waitqueue check.
		 */
		smp_mb();
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD2: commit phase 4\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		jbd2_journal_unfile_buffer(journal, jh);
		jbd2_journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		jbd2_journal_abort(journal, err);

	jbd_debug(3, "JBD2: commit phase 5\n");
	write_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT_DFLUSH);
	commit_transaction->t_state = T_COMMIT_JFLUSH;
	write_unlock(&journal->j_state_lock);

	if (!JBD2_HAS_INCOMPAT_FEATURE(journal,
				       JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) {
		err = journal_submit_commit_record(journal, commit_transaction,
						&cbh, crc32_sum);
		if (err)
			__jbd2_journal_abort_hard(journal);
	}
	if (cbh)
		err = journal_wait_on_commit_record(journal, cbh);
	if (JBD2_HAS_INCOMPAT_FEATURE(journal,
				      JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT) &&
	    journal->j_flags & JBD2_BARRIER) {
		blkdev_issue_flush(journal->j_dev, GFP_NOFS, NULL);
	}

	if (err)
		jbd2_journal_abort(journal, err);

	/*
	 * Now disk caches for filesystem device are flushed so we are safe to
	 * erase checkpointed transactions from the log by updating journal
	 * superblock.
	 */
	if (update_tail)
		jbd2_update_log_tail(journal, first_tid, first_block);

	/* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

	jbd_debug(3, "JBD2: commit phase 6\n");

	J_ASSERT(list_empty(&commit_transaction->t_inode_list));
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 *
		 * We also know that the frozen data has already fired
		 * its triggers if they exist, so we can clear that too.
		 */
		if (jh->b_committed_data) {
			jbd2_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
				jh->b_frozen_triggers = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd2_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
			jh->b_frozen_triggers = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			cp_transaction->t_chp_stats.cs_dropped++;
			__jbd2_journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by jbd2_journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future after the "add to orphan"
		 * operation has been committed.  That's not only a performance
		 * gain, it also stops aliasing problems if the buffer is
		 * left behind for writeback and gets reallocated for another
		 * use in a different page. */
		if (buffer_freed(bh) && !jh->b_next_transaction) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__jbd2_journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * The buffer on BJ_Forget list and not jbddirty means
			 * it has been freed by this transaction and hence it
			 * could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on BJ_Forget
			 * list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile buffer");
		__jbd2_journal_refile_buffer(jh);
		jbd_unlock_bh_state(bh);
		if (try_to_free)
			release_buffer_page(bh);	/* Drops bh reference */
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __jbd2_journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		write_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD2: commit phase 7\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_JFLUSH);

	commit_transaction->t_start = jiffies;
	stats.run.rs_logging = jbd2_time_diff(stats.run.rs_logging,
					      commit_transaction->t_start);

	/*
	 * File the transaction statistics
	 */
	stats.ts_tid = commit_transaction->t_tid;
	stats.run.rs_handle_count =
		atomic_read(&commit_transaction->t_handle_count);
	trace_jbd2_run_stats(journal->j_fs_dev->bd_dev,
			     commit_transaction->t_tid, &stats.run);

	/*
	 * Calculate overall stats
	 */
	spin_lock(&journal->j_history_lock);
	journal->j_stats.ts_tid++;
	journal->j_stats.run.rs_wait += stats.run.rs_wait;
	journal->j_stats.run.rs_running += stats.run.rs_running;
	journal->j_stats.run.rs_locked += stats.run.rs_locked;
	journal->j_stats.run.rs_flushing += stats.run.rs_flushing;
	journal->j_stats.run.rs_logging += stats.run.rs_logging;
	journal->j_stats.run.rs_handle_count += stats.run.rs_handle_count;
	journal->j_stats.run.rs_blocks += stats.run.rs_blocks;
	journal->j_stats.run.rs_blocks_logged += stats.run.rs_blocks_logged;
	spin_unlock(&journal->j_history_lock);

	commit_transaction->t_state = T_COMMIT_CALLBACK;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in the commit time
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time +
				journal->j_average_commit_time*3) / 4;
	else
		journal->j_average_commit_time = commit_time;
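	/*
	 * I.e. a 3:1 exponential moving average: with an old average of
	 * 8 ms and a 20 ms commit, the new average is (20 + 3*8) / 4 =
	 * 11 ms, so a single slow commit only nudges the estimate.
	 */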

	write_unlock(&journal->j_state_lock);

	if (journal->j_checkpoint_transactions == NULL) {
		journal->j_checkpoint_transactions = commit_transaction;
		commit_transaction->t_cpnext = commit_transaction;
		commit_transaction->t_cpprev = commit_transaction;
	} else {
		commit_transaction->t_cpnext =
			journal->j_checkpoint_transactions;
		commit_transaction->t_cpprev =
			commit_transaction->t_cpnext->t_cpprev;
		commit_transaction->t_cpnext->t_cpprev =
			commit_transaction;
		commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
	}
	spin_unlock(&journal->j_list_lock);
	/* Drop all spin_locks because commit_callback may block.
	 * __journal_remove_checkpoint() cannot destroy the transaction
	 * under us because it is not marked as T_FINISHED yet */
	if (journal->j_commit_callback)
		journal->j_commit_callback(journal, commit_transaction);

	trace_jbd2_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	write_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	commit_transaction->t_state = T_FINISHED;
	/* Recheck checkpoint lists after j_list_lock was dropped */
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__jbd2_journal_drop_transaction(journal, commit_transaction);
		jbd2_journal_free_transaction(commit_transaction);
	}
	spin_unlock(&journal->j_list_lock);
	write_unlock(&journal->j_state_lock);
	wake_up(&journal->j_wait_done_commit);
}