/*
 * linux/fs/jbd/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <trace/events/jbd.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}

/*
 * When an ext3-ordered file is truncated, it is possible that many pages are
 * not successfully freed, because they are attached to a committing transaction.
 * After the transaction commits, these pages are left on the LRU, with no
 * ->mapping, and with attached buffers.  These pages are trivially reclaimable
 * by the VM, but their apparent absence upsets the VM accounting, and it makes
 * the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under journal->j_list_lock.  The caller provided us with a ref
 * against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}

/*
 * Decrement reference counter for data buffer. If it has been marked
 * 'BH_Freed', release it and the page to which it belongs if possible.
 */
static void release_data_buffer(struct buffer_head *bh)
{
	if (buffer_freed(bh)) {
		WARN_ON_ONCE(buffer_dirty(bh));
		clear_buffer_freed(bh);
		clear_buffer_mapped(bh);
		clear_buffer_new(bh);
		clear_buffer_req(bh);
		bh->b_bdev = NULL;
		release_buffer_page(bh);
	} else
		put_bh(bh);
}

/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 * return 0.  j_list_lock is dropped in this case.
 */
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
	if (!jbd_trylock_bh_state(bh)) {
		spin_unlock(&journal->j_list_lock);
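		/*
		 * Give whoever holds the bh_state lock a chance to run;
		 * the caller must retake j_list_lock itself before
		 * retrying.
		 */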
		schedule();
		return 0;
	}
	return 1;
}

/* Done it all: now write the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_write_commit_record(journal_t *journal,
					transaction_t *commit_transaction)
{
	struct journal_head *descriptor;
	struct buffer_head *bh;
	journal_header_t *header;
	int ret;

	if (is_journal_aborted(journal))
		return 0;

	descriptor = journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	header = (journal_header_t *)(bh->b_data);
	header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
	header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
	header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

	JBUFFER_TRACE(descriptor, "write commit block");
	set_buffer_dirty(bh);

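	/*
	 * With barriers enabled, write the commit block with an explicit
	 * flush/FUA so it reaches stable storage ordered behind the
	 * transaction's journal blocks; otherwise a plain synchronous
	 * write is all we can do.
	 */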
	if (journal->j_flags & JFS_BARRIER)
		ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
	else
		ret = sync_dirty_buffer(bh);

	put_bh(bh);		/* One for getblk() */
	journal_put_journal_head(descriptor);

	return (ret == -EIO);
}

static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
				   int write_op)
{
	int i;

	for (i = 0; i < bufs; i++) {
		wbuf[i]->b_end_io = end_buffer_write_sync;
		/* We use-up our safety reference in submit_bh() */
		submit_bh(write_op, wbuf[i]);
	}
}

/*
 *  Submit all the data buffers to disk
 */
static int journal_submit_data_buffers(journal_t *journal,
				       transaction_t *commit_transaction,
				       int write_op)
{
	struct journal_head *jh;
	struct buffer_head *bh;
	int locked;
	int bufs = 0;
	struct buffer_head **wbuf = journal->j_wbuf;
	int err = 0;

	/*
	 * Whenever we unlock the journal and sleep, things can get added
	 * onto ->t_sync_datalist, so we have to keep looping back to
	 * write_out_data until we *know* that the list is empty.
	 *
	 * Cleanup any flushed data buffers from the data list.  Even in
	 * abort mode, we want to flush this out as soon as possible.
	 */
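	/*
	 * Buffers submitted for IO here are refiled onto BJ_Locked; the
	 * caller later waits for them via t_locked_list.  The first -EIO
	 * seen on an already-written buffer is carried back in the
	 * return value.
	 */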
write_out_data:
	cond_resched();
	spin_lock(&journal->j_list_lock);

	while (commit_transaction->t_sync_datalist) {
		jh = commit_transaction->t_sync_datalist;
		bh = jh2bh(jh);
		locked = 0;

		/* Get reference just to make sure buffer does not disappear
		 * when we are forced to drop various locks */
		get_bh(bh);
		/* If the buffer is dirty, we need to submit IO and hence
		 * we need the buffer lock. We try to lock the buffer without
		 * blocking. If we fail, we need to drop j_list_lock and do
		 * blocking lock_buffer().
		 */
		if (buffer_dirty(bh)) {
			if (!trylock_buffer(bh)) {
				BUFFER_TRACE(bh, "needs blocking lock");
				spin_unlock(&journal->j_list_lock);
				trace_jbd_do_submit_data(journal,
						     commit_transaction);
				/* Write out all data to prevent deadlocks */
				journal_do_submit_data(wbuf, bufs, write_op);
				bufs = 0;
				lock_buffer(bh);
				spin_lock(&journal->j_list_lock);
			}
			locked = 1;
		}
		/* We have to get bh_state lock. Again out of order, sigh. */
		if (!inverted_lock(journal, bh)) {
			jbd_lock_bh_state(bh);
			spin_lock(&journal->j_list_lock);
		}
		/* Someone already cleaned up the buffer? */
		if (!buffer_jbd(bh) || bh2jh(bh) != jh
			|| jh->b_transaction != commit_transaction
			|| jh->b_jlist != BJ_SyncData) {
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			BUFFER_TRACE(bh, "already cleaned up");
			release_data_buffer(bh);
			continue;
		}
		if (locked && test_clear_buffer_dirty(bh)) {
			BUFFER_TRACE(bh, "needs writeout, adding to array");
			wbuf[bufs++] = bh;
			__journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			if (bufs == journal->j_wbufsize) {
				spin_unlock(&journal->j_list_lock);
				trace_jbd_do_submit_data(journal,
						     commit_transaction);
				journal_do_submit_data(wbuf, bufs, write_op);
				bufs = 0;
				goto write_out_data;
			}
		} else if (!locked && buffer_locked(bh)) {
			__journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			put_bh(bh);
		} else {
			BUFFER_TRACE(bh, "writeout complete: unfile");
			if (unlikely(!buffer_uptodate(bh)))
				err = -EIO;
			__journal_unfile_buffer(jh);
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			release_data_buffer(bh);
		}

		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
			spin_unlock(&journal->j_list_lock);
			goto write_out_data;
		}
	}
	spin_unlock(&journal->j_list_lock);
	trace_jbd_do_submit_data(journal, commit_transaction);
	journal_do_submit_data(wbuf, bufs, write_op);

	return err;
}

/*
 * journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void journal_commit_transaction(journal_t *journal)
{
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned int blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	struct blk_plug plug;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior journal_flush? */
	if (journal->j_flags & JFS_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		journal_update_superblock(journal, 1);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	trace_jbd_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd_commit_locking(journal, commit_transaction);
	spin_lock(&commit_transaction->t_handle_lock);
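	/*
	 * Open-coded wait: sleep until every handle against this
	 * transaction has been released, dropping both locks before
	 * scheduling so that the updates can actually complete.
	 */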
	while (commit_transaction->t_updates) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (commit_transaction->t_updates) {
			spin_unlock(&commit_transaction->t_handle_lock);
			spin_unlock(&journal->j_state_lock);
			schedule();
			spin_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT (commit_transaction->t_outstanding_credits <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A journal_get_undo_access()+journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug (3, "JBD: commit phase 1\n");

	/*
	 * Clear the revoked flags to reflect that there are no revoked
	 * buffers in the next transaction, which is about to be started.
	 */
	journal_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	journal_switch_revoke_table(journal);

	trace_jbd_commit_flushing(journal, commit_transaction);
	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	spin_unlock(&journal->j_state_lock);

	jbd_debug (3, "JBD: commit phase 2\n");

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	blk_start_plug(&plug);
	err = journal_submit_data_buffers(journal, commit_transaction,
					  WRITE_SYNC);
	blk_finish_plug(&plug);

	/*
	 * Wait for all previously submitted IO to complete.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_locked_list) {
		struct buffer_head *bh;

		jh = commit_transaction->t_locked_list->b_tprev;
		bh = jh2bh(jh);
		get_bh(bh);
		if (buffer_locked(bh)) {
			spin_unlock(&journal->j_list_lock);
			wait_on_buffer(bh);
			spin_lock(&journal->j_list_lock);
		}
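		/*
		 * On a write error, latch AS_EIO in the page mapping so
		 * that a later fsync()/msync() on the file reports the
		 * failure, and remember it locally in err as well.
		 */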
		if (unlikely(!buffer_uptodate(bh))) {
			if (!trylock_page(bh->b_page)) {
				spin_unlock(&journal->j_list_lock);
				lock_page(bh->b_page);
				spin_lock(&journal->j_list_lock);
			}
			if (bh->b_page->mapping)
				set_bit(AS_EIO, &bh->b_page->mapping->flags);

			unlock_page(bh->b_page);
			SetPageError(bh->b_page);
			err = -EIO;
		}
		if (!inverted_lock(journal, bh)) {
			put_bh(bh);
			spin_lock(&journal->j_list_lock);
			continue;
		}
		if (buffer_jbd(bh) && bh2jh(bh) == jh &&
		    jh->b_transaction == commit_transaction &&
		    jh->b_jlist == BJ_Locked)
			__journal_unfile_buffer(jh);
		jbd_unlock_bh_state(bh);
		release_data_buffer(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);

	if (err) {
		char b[BDEVNAME_SIZE];

		printk(KERN_WARNING
			"JBD: Detected IO errors while flushing file data "
			"on %s\n", bdevname(journal->j_fs_dev, b));
		if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
			journal_abort(journal, err);
		err = 0;
	}

	blk_start_plug(&plug);

	journal_write_revoke_records(journal, commit_transaction, WRITE_SYNC);

	/*
	 * If we found any dirty or locked buffers, then we should have
	 * looped back up to the write_out_data label.  If there weren't
	 * any then journal_clean_data_list should have wiped the list
	 * clean by now, so check that it is in fact empty.
	 */
	J_ASSERT (commit_transaction->t_sync_datalist == NULL);

	jbd_debug (3, "JBD: commit phase 3\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	spin_unlock(&journal->j_state_lock);

	trace_jbd_commit_logging(journal, commit_transaction);
	J_ASSERT(commit_transaction->t_nr_buffers <=
		 commit_transaction->t_outstanding_credits);

	descriptor = NULL;
	bufs = 0;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD: get descriptor\n");

			descriptor = journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic     = cpu_to_be32(JFS_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
			header->h_sequence  = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
                           completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}

		/* Where is the buffer to be written? */

		err = journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by journal_next_log_block() also.
		 */
		commit_transaction->t_outstanding_credits--;

		/* Bump b_count to prevent truncate from stumbling over
                   the shadowed buffer!  @@@ This can go if we ever get
                   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		get_bh(jh2bh(jh));

		/* Make a temporary IO buffer with which to write it out
                   (this will requeue both the metadata buffer and the
                   temporary IO buffer). new_bh goes on BJ_IO. */

		set_buffer_jwrite(jh2bh(jh));
		/*
		 * akpm: journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		set_buffer_jwrite(jh2bh(new_jh));
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
                   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JFS_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JFS_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += sizeof(journal_block_tag_t);
		space_left -= sizeof(journal_block_tag_t);

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}
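		/* Only the first tag in a descriptor block is followed by
		   the 16-byte journal UUID; every later tag carries
		   JFS_FLAG_SAME_UUID instead.  Note the space check below
		   leaves room for one more tag plus 16 bytes of UUID. */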

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < sizeof(journal_block_tag_t) + 16) {

			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
                           submitting the IOs.  "tag" still points to
                           the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE_SYNC, bh);
			}
			cond_resched();

			/* Force a new descriptor to be generated next
                           time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
           the log.  Before we can commit it, wait for the IO so far to
           complete.  Control buffers being written are on the
           transaction's t_log_list queue, and metadata buffers are on
           the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	*/

	jbd_debug(3, "JBD: commit phase 4\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
                   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_buffer_jwrite(bh);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
                   to remember it against this transaction so that when
                   we finally commit, we can do any checkpointing
                   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/*
		 * Wake up any transactions which were waiting for this
		 * IO to complete. The barrier must be here so that changes
		 * by journal_file_buffer() take effect before wake_up_bit()
		 * does the waitqueue check.
		 */
		smp_mb();
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 5\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		journal_unfile_buffer(journal, jh);
		journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		journal_abort(journal, err);

	jbd_debug(3, "JBD: commit phase 6\n");

	/* All metadata is written, now write commit record and do cleanup */
	spin_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_RECORD;
	spin_unlock(&journal->j_state_lock);

	if (journal_write_commit_record(journal, commit_transaction))
		err = -EIO;

	if (err)
		journal_abort(journal, err);

	/* End of a transaction!  Finally, we can do checkpoint
           processing: any buffers committed as a result of this
           transaction can be removed from any checkpoint list it was on
           before. */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh,	jh->b_transaction == commit_transaction ||
			jh->b_transaction == journal->j_running_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 */
		if (jh->b_committed_data) {
			jbd_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			__journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/*
		 * A buffer which has been freed while still being journaled by
		 * a previous transaction.
		 */
		if (buffer_freed(bh)) {
			/*
			 * If the running transaction is the one containing
			 * "add to orphan" operation (b_next_transaction !=
			 * NULL), we have to wait for that transaction to
			 * commit before we can really get rid of the buffer.
			 * So just clear b_modified to not confuse transaction
			 * credit accounting and refile the buffer to
			 * BJ_Forget of the running transaction. If the just
			 * committed transaction contains "add to orphan"
			 * operation, we can completely invalidate the buffer
			 * now. We are rather thorough in that since the
			 * buffer may be still accessible when blocksize <
			 * pagesize and it is attached to the last partial
			 * page.
			 */
			jh->b_modified = 0;
			if (!jh->b_next_transaction) {
				clear_buffer_freed(bh);
				clear_buffer_jbddirty(bh);
				clear_buffer_mapped(bh);
				clear_buffer_new(bh);
				clear_buffer_req(bh);
				bh->b_bdev = NULL;
			}
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * A buffer that is on the BJ_Forget list and not
			 * jbddirty has been freed by this transaction and
			 * hence could not have been reallocated until this
			 * transaction has committed. *BUT* it could be
			 * reallocated once we have written all the data to
			 * disk and before we process the buffer on the
			 * BJ_Forget list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile freed buffer");
		__journal_refile_buffer(jh);
		jbd_unlock_bh_state(bh);
		if (try_to_free)
			release_buffer_page(bh);
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	spin_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		spin_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 8\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in commit time
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time*3 +
				journal->j_average_commit_time) / 4;
	else
		journal->j_average_commit_time = commit_time;

	spin_unlock(&journal->j_state_lock);

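	/*
	 * If nothing is left to checkpoint, the transaction can be
	 * dropped right away; otherwise link it into the journal's
	 * circular list of checkpoint transactions so the checkpointing
	 * code can retire it later.
	 */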
	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__journal_drop_transaction(journal, commit_transaction);
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	trace_jbd_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	wake_up(&journal->j_wait_done_commit);
}