/*
 *  linux/fs/ext3/balloc.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  Enhanced block allocation by Stephen Tweedie (sct@redhat.com), 1993
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *        David S. Miller (davem@caip.rutgers.edu), 1995
 */

#include <linux/time.h>
#include <linux/capability.h>
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/jbd.h>
#include <linux/ext3_fs.h>
#include <linux/ext3_jbd.h>
#include <linux/quotaops.h>
#include <linux/buffer_head.h>
#include <linux/blkdev.h>

/*
 * balloc.c contains the blocks allocation and deallocation routines
 */

/*
 * The free blocks are managed by bitmaps.  A file system contains several
 * block groups.  Each group contains 1 bitmap block for blocks, 1 bitmap
 * block for inodes, N blocks for the inode table and data blocks.
 *
 * The file system contains group descriptors which are located after the
 * super block.  Each descriptor contains the number of the bitmap block and
 * the free blocks count in the block.  The descriptors are loaded in memory
 * when a file system is mounted (see ext3_fill_super).
 */


#define in_range(b, first, len)	((b) >= (first) && (b) <= (first) + (len) - 1)
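
/*
 * For example, in_range(105, 100, 10) is true (blocks 100..109 are in
 * the range) while in_range(110, 100, 10) is false.
 */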

/*
 * Calculate the block group number and offset, given a block number
 */
static void ext3_get_group_no_and_offset(struct super_block *sb,
	ext3_fsblk_t blocknr, unsigned long *blockgrpp, ext3_grpblk_t *offsetp)
{
	struct ext3_super_block *es = EXT3_SB(sb)->s_es;

	blocknr = blocknr - le32_to_cpu(es->s_first_data_block);
	if (offsetp)
		*offsetp = blocknr % EXT3_BLOCKS_PER_GROUP(sb);
	if (blockgrpp)
		*blockgrpp = blocknr / EXT3_BLOCKS_PER_GROUP(sb);
}
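
/*
 * For example, assuming s_first_data_block == 1 and
 * EXT3_BLOCKS_PER_GROUP(sb) == 8192 (a 1K-block filesystem), block
 * 10000 lands in group (10000 - 1) / 8192 == 1 at offset
 * (10000 - 1) % 8192 == 1807.
 */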

/**
 * ext3_get_group_desc() -- load group descriptor from disk
 * @sb:			super block
 * @block_group:	given block group
 * @bh:			pointer to the buffer head to store the block
 *			group descriptor
 */
struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
					     unsigned int block_group,
					     struct buffer_head ** bh)
{
	unsigned long group_desc;
	unsigned long offset;
	struct ext3_group_desc * desc;
	struct ext3_sb_info *sbi = EXT3_SB(sb);

	if (block_group >= sbi->s_groups_count) {
		ext3_error (sb, "ext3_get_group_desc",
			    "block_group >= groups_count - "
			    "block_group = %d, groups_count = %lu",
			    block_group, sbi->s_groups_count);

		return NULL;
	}
	smp_rmb();

	group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
	offset = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
	if (!sbi->s_group_desc[group_desc]) {
		ext3_error (sb, "ext3_get_group_desc",
			    "Group descriptor not loaded - "
			    "block_group = %d, group_desc = %lu, desc = %lu",
			     block_group, group_desc, offset);
		return NULL;
	}

	desc = (struct ext3_group_desc *) sbi->s_group_desc[group_desc]->b_data;
	if (bh)
		*bh = sbi->s_group_desc[group_desc];
	return desc + offset;
}
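
/*
 * For example, with 4K blocks and 32-byte group descriptors,
 * EXT3_DESC_PER_BLOCK(sb) == 128, so block_group 300 lives in
 * descriptor block 300 >> 7 == 2 at offset 300 & 127 == 44.
 */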

static int ext3_valid_block_bitmap(struct super_block *sb,
					struct ext3_group_desc *desc,
					unsigned int block_group,
					struct buffer_head *bh)
{
	ext3_grpblk_t offset;
	ext3_grpblk_t next_zero_bit;
	ext3_fsblk_t bitmap_blk;
	ext3_fsblk_t group_first_block;

	group_first_block = ext3_group_first_block_no(sb, block_group);

	/* check whether the block bitmap block number is set */
	bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
	offset = bitmap_blk - group_first_block;
	if (!ext3_test_bit(offset, bh->b_data))
		/* bad block bitmap */
		goto err_out;

	/* check whether the inode bitmap block number is set */
	bitmap_blk = le32_to_cpu(desc->bg_inode_bitmap);
	offset = bitmap_blk - group_first_block;
	if (!ext3_test_bit(offset, bh->b_data))
		/* bad block bitmap */
		goto err_out;

	/* check whether the inode table block numbers are set */
	bitmap_blk = le32_to_cpu(desc->bg_inode_table);
	offset = bitmap_blk - group_first_block;
	next_zero_bit = ext3_find_next_zero_bit(bh->b_data,
				offset + EXT3_SB(sb)->s_itb_per_group,
				offset);
	if (next_zero_bit >= offset + EXT3_SB(sb)->s_itb_per_group)
		/* good bitmap for inode tables */
		return 1;

err_out:
	ext3_error(sb, __func__,
			"Invalid block bitmap - "
			"block_group = %d, block = %lu",
			block_group, bitmap_blk);
	return 0;
}
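
/*
 * For example (illustrative values), for a group whose first block is
 * 8193, with bg_block_bitmap == 8193, bg_inode_bitmap == 8194,
 * bg_inode_table == 8195 and s_itb_per_group == 4, bits 0, 1 and 2..5
 * of the block bitmap must all be set for the bitmap to be valid.
 */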

/**
 * read_block_bitmap()
 * @sb:			super block
 * @block_group:	given block group
 *
 * Read the bitmap for a given block_group, and validate that the bits
 * for the block bitmap, inode bitmap and inode table are set in the
 * bitmap
 *
 * Return buffer_head on success or NULL in case of failure.
 */
static struct buffer_head *
read_block_bitmap(struct super_block *sb, unsigned int block_group)
{
	struct ext3_group_desc * desc;
	struct buffer_head * bh = NULL;
	ext3_fsblk_t bitmap_blk;

	desc = ext3_get_group_desc(sb, block_group, NULL);
	if (!desc)
		return NULL;
	bitmap_blk = le32_to_cpu(desc->bg_block_bitmap);
	bh = sb_getblk(sb, bitmap_blk);
	if (unlikely(!bh)) {
		ext3_error(sb, __func__,
			    "Cannot read block bitmap - "
			    "block_group = %d, block_bitmap = %u",
			    block_group, le32_to_cpu(desc->bg_block_bitmap));
		return NULL;
	}
	if (likely(bh_uptodate_or_lock(bh)))
		return bh;

	if (bh_submit_read(bh) < 0) {
		brelse(bh);
		ext3_error(sb, __func__,
			    "Cannot read block bitmap - "
			    "block_group = %d, block_bitmap = %u",
			    block_group, le32_to_cpu(desc->bg_block_bitmap));
		return NULL;
	}
	ext3_valid_block_bitmap(sb, desc, block_group, bh);
	/*
	 * The filesystem was mounted so as not to panic on errors; continue
	 * with the (possibly corrupt) bitmap.
	 */
	return bh;
}
/*
 * The reservation window structure operations
 * --------------------------------------------
 * Operations include:
 * dump, find, add, remove, is_empty, find_next_reservable_window, etc.
 *
 * We use a red-black tree to represent per-filesystem reservation
 * windows.
 *
 */
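
/*
 * A possible shape of the tree (illustrative block ranges): windows are
 * keyed by their start block and never overlap, e.g.
 *
 *                  [200, 215]
 *                 /          \
 *         [100, 107]        [300, 331]
 *
 * so locating the window containing (or preceding) a goal block is an
 * O(log n) descent.
 */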

/**
 * __rsv_window_dump() -- Dump the filesystem block allocation reservation map
 * @rb_root:		root of per-filesystem reservation rb tree
 * @verbose:		verbose mode
 * @fn:			function which wishes to dump the reservation map
 *
 * If verbose is turned on, it will print the whole block reservation
 * windows (start, end).  Otherwise, it will only print out the "bad" windows,
 * those windows that overlap with their immediate neighbors.
 */
#if 1
static void __rsv_window_dump(struct rb_root *root, int verbose,
			      const char *fn)
{
	struct rb_node *n;
	struct ext3_reserve_window_node *rsv, *prev;
	int bad;

restart:
	n = rb_first(root);
	bad = 0;
	prev = NULL;

	printk("Block Allocation Reservation Windows Map (%s):\n", fn);
	while (n) {
		rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
		if (verbose)
			printk("reservation window 0x%p "
			       "start:  %lu, end:  %lu\n",
			       rsv, rsv->rsv_start, rsv->rsv_end);
		if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) {
			printk("Bad reservation %p (start >= end)\n",
			       rsv);
			bad = 1;
		}
		if (prev && prev->rsv_end >= rsv->rsv_start) {
			printk("Bad reservation %p (prev->end >= start)\n",
			       rsv);
			bad = 1;
		}
		if (bad) {
			if (!verbose) {
				printk("Restarting reservation walk in verbose mode\n");
				verbose = 1;
				goto restart;
			}
		}
		n = rb_next(n);
		prev = rsv;
	}
	printk("Window map complete.\n");
	BUG_ON(bad);
}
#define rsv_window_dump(root, verbose) \
	__rsv_window_dump((root), (verbose), __func__)
#else
#define rsv_window_dump(root, verbose) do {} while (0)
#endif

/**
 * goal_in_my_reservation()
 * @rsv:		inode's reservation window
 * @grp_goal:		given goal block relative to the allocation block group
 * @group:		the current allocation block group
 * @sb:			filesystem super block
 *
 * Test if the given goal block (group relative) is within the file's
 * own block reservation window range.
 *
 * If the reservation window is outside the goal allocation group, return 0;
 * grp_goal (given goal block) could be -1, which means no specific
 * goal block.  In this case, always return 1.
 * If the goal block is within the reservation window, return 1;
 * otherwise, return 0.
 */
static int
goal_in_my_reservation(struct ext3_reserve_window *rsv, ext3_grpblk_t grp_goal,
			unsigned int group, struct super_block * sb)
{
	ext3_fsblk_t group_first_block, group_last_block;

	group_first_block = ext3_group_first_block_no(sb, group);
	group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);

	if ((rsv->_rsv_start > group_last_block) ||
	    (rsv->_rsv_end < group_first_block))
		return 0;
	if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start)
		|| (grp_goal + group_first_block > rsv->_rsv_end)))
		return 0;
	return 1;
}

/**
 * search_reserve_window()
 * @rb_root:		root of reservation tree
 * @goal:		target allocation block
 *
 * Find the reserved window which includes the goal, or the previous one
 * if the goal is not in any window.
 * Returns NULL if there are no windows or if all windows start after the goal.
 */
static struct ext3_reserve_window_node *
search_reserve_window(struct rb_root *root, ext3_fsblk_t goal)
{
	struct rb_node *n = root->rb_node;
	struct ext3_reserve_window_node *rsv;

	if (!n)
		return NULL;

	do {
		rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);

		if (goal < rsv->rsv_start)
			n = n->rb_left;
		else if (goal > rsv->rsv_end)
			n = n->rb_right;
		else
			return rsv;
	} while (n);
	/*
	 * We've fallen off the end of the tree: the goal wasn't inside
	 * any particular node.  OK, the previous node must be to one
	 * side of the interval containing the goal.  If it's the RHS,
	 * we need to back up one.
	 */
	if (rsv->rsv_start > goal) {
		n = rb_prev(&rsv->rsv_node);
		rsv = rb_entry(n, struct ext3_reserve_window_node, rsv_node);
	}
	return rsv;
}
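
/*
 * For example, with windows [100,107] and [300,331] in the tree, a goal
 * of 305 returns the [300,331] node, a goal of 150 returns [100,107]
 * (the nearest window before the goal), and a goal of 50 returns NULL.
 */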

/**
 * ext3_rsv_window_add() -- Insert a window to the block reservation rb tree.
 * @sb:			super block
 * @rsv:		reservation window to add
 *
 * Must be called with rsv_lock held.
 */
void ext3_rsv_window_add(struct super_block *sb,
		    struct ext3_reserve_window_node *rsv)
{
	struct rb_root *root = &EXT3_SB(sb)->s_rsv_window_root;
	struct rb_node *node = &rsv->rsv_node;
	ext3_fsblk_t start = rsv->rsv_start;

	struct rb_node ** p = &root->rb_node;
	struct rb_node * parent = NULL;
	struct ext3_reserve_window_node *this;

	while (*p)
	{
		parent = *p;
		this = rb_entry(parent, struct ext3_reserve_window_node, rsv_node);

		if (start < this->rsv_start)
			p = &(*p)->rb_left;
		else if (start > this->rsv_end)
			p = &(*p)->rb_right;
		else {
			rsv_window_dump(root, 1);
			BUG();
		}
	}

	rb_link_node(node, parent, p);
	rb_insert_color(node, root);
}

/**
 * ext3_rsv_window_remove() -- unlink a window from the reservation rb tree
 * @sb:			super block
 * @rsv:		reservation window to remove
 *
 * Mark the block reservation window as not allocated, and unlink it
 * from the filesystem reservation window rb tree.  Must be called with
 * rsv_lock held.
 */
static void rsv_window_remove(struct super_block *sb,
			      struct ext3_reserve_window_node *rsv)
{
	rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
	rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
	rsv->rsv_alloc_hit = 0;
	rb_erase(&rsv->rsv_node, &EXT3_SB(sb)->s_rsv_window_root);
}

/*
 * rsv_is_empty() -- Check if the reservation window is allocated.
 * @rsv:		given reservation window to check
 *
 * returns 1 if the end block is EXT3_RESERVE_WINDOW_NOT_ALLOCATED.
 */
static inline int rsv_is_empty(struct ext3_reserve_window *rsv)
{
	/* a valid reservation end block could not be 0 */
	return rsv->_rsv_end == EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
}

/**
 * ext3_init_block_alloc_info()
 * @inode:		file inode structure
 *
 * Allocate and initialize the reservation window structure, and
 * finally link the window to the ext3 inode structure.
 *
 * The reservation window structure is only dynamically allocated
 * and linked to the ext3 inode the first time the open file
 * needs a new block.  So, before every ext3_new_block(s) call, for
 * regular files, we should check whether the reservation window
 * structure exists or not.  In the latter case, this function is called.
 * Failure to do so will result in block reservation being turned off for
 * that open file.
 *
 * This function is called from ext3_get_blocks_handle(), and also
 * when setting the reservation window size through ioctl before the file
 * is opened for write (needs block allocation).
 *
 * Needs truncate_mutex held prior to calling this function.
 */
void ext3_init_block_alloc_info(struct inode *inode)
{
	struct ext3_inode_info *ei = EXT3_I(inode);
	struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
	struct super_block *sb = inode->i_sb;

	block_i = kmalloc(sizeof(*block_i), GFP_NOFS);
	if (block_i) {
		struct ext3_reserve_window_node *rsv = &block_i->rsv_window_node;

		rsv->rsv_start = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;
		rsv->rsv_end = EXT3_RESERVE_WINDOW_NOT_ALLOCATED;

		/*
		 * if filesystem is mounted with NORESERVATION, the goal
		 * reservation window size is set to zero to indicate
		 * block reservation is off
		 */
		if (!test_opt(sb, RESERVATION))
			rsv->rsv_goal_size = 0;
		else
			rsv->rsv_goal_size = EXT3_DEFAULT_RESERVE_BLOCKS;
		rsv->rsv_alloc_hit = 0;
		block_i->last_alloc_logical_block = 0;
		block_i->last_alloc_physical_block = 0;
	}
	ei->i_block_alloc_info = block_i;
}

/**
 * ext3_discard_reservation()
 * @inode:		inode
 *
 * Discard (free) the block reservation window on last file close, on
 * truncate, or at last iput().
 *
 * It is called in three cases:
 *	ext3_release_file(): last writer closes the file
 *	ext3_clear_inode(): last iput(), when nobody links to this file.
 *	ext3_truncate(): when the block indirect map is about to change.
 *
 */
void ext3_discard_reservation(struct inode *inode)
{
	struct ext3_inode_info *ei = EXT3_I(inode);
	struct ext3_block_alloc_info *block_i = ei->i_block_alloc_info;
	struct ext3_reserve_window_node *rsv;
	spinlock_t *rsv_lock = &EXT3_SB(inode->i_sb)->s_rsv_window_lock;

	if (!block_i)
		return;

	rsv = &block_i->rsv_window_node;
	if (!rsv_is_empty(&rsv->rsv_window)) {
		spin_lock(rsv_lock);
		if (!rsv_is_empty(&rsv->rsv_window))
			rsv_window_remove(inode->i_sb, rsv);
		spin_unlock(rsv_lock);
	}
}

/**
 * ext3_free_blocks_sb() -- Free given blocks and update quota
 * @handle:			handle to this transaction
 * @sb:				super block
 * @block:			start physical block to free
 * @count:			number of blocks to free
 * @pdquot_freed_blocks:	pointer to quota
 */
void ext3_free_blocks_sb(handle_t *handle, struct super_block *sb,
			 ext3_fsblk_t block, unsigned long count,
			 unsigned long *pdquot_freed_blocks)
{
	struct buffer_head *bitmap_bh = NULL;
	struct buffer_head *gd_bh;
	unsigned long block_group;
	ext3_grpblk_t bit;
	unsigned long i;
	unsigned long overflow;
	struct ext3_group_desc * desc;
	struct ext3_super_block * es;
	struct ext3_sb_info *sbi;
	int err = 0, ret;
	ext3_grpblk_t group_freed;

	*pdquot_freed_blocks = 0;
	sbi = EXT3_SB(sb);
	es = sbi->s_es;
	if (block < le32_to_cpu(es->s_first_data_block) ||
	    block + count < block ||
	    block + count > le32_to_cpu(es->s_blocks_count)) {
		ext3_error (sb, "ext3_free_blocks",
			    "Freeing blocks not in datazone - "
			    "block = "E3FSBLK", count = %lu", block, count);
		goto error_return;
	}

	ext3_debug ("freeing block(s) %lu-%lu\n", block, block + count - 1);

do_more:
	overflow = 0;
	block_group = (block - le32_to_cpu(es->s_first_data_block)) /
		      EXT3_BLOCKS_PER_GROUP(sb);
	bit = (block - le32_to_cpu(es->s_first_data_block)) %
		      EXT3_BLOCKS_PER_GROUP(sb);
	/*
	 * Check to see if we are freeing blocks across a group
	 * boundary.
	 */
	if (bit + count > EXT3_BLOCKS_PER_GROUP(sb)) {
		overflow = bit + count - EXT3_BLOCKS_PER_GROUP(sb);
		count -= overflow;
	}
	brelse(bitmap_bh);
	bitmap_bh = read_block_bitmap(sb, block_group);
	if (!bitmap_bh)
		goto error_return;
	desc = ext3_get_group_desc (sb, block_group, &gd_bh);
	if (!desc)
		goto error_return;

	if (in_range (le32_to_cpu(desc->bg_block_bitmap), block, count) ||
	    in_range (le32_to_cpu(desc->bg_inode_bitmap), block, count) ||
	    in_range (block, le32_to_cpu(desc->bg_inode_table),
		      sbi->s_itb_per_group) ||
	    in_range (block + count - 1, le32_to_cpu(desc->bg_inode_table),
		      sbi->s_itb_per_group)) {
		ext3_error (sb, "ext3_free_blocks",
			    "Freeing blocks in system zones - "
			    "Block = "E3FSBLK", count = %lu",
			    block, count);
		goto error_return;
	}

	/*
	 * We are about to start releasing blocks in the bitmap,
	 * so we need undo access.
	 */
	/* @@@ check errors */
	BUFFER_TRACE(bitmap_bh, "getting undo access");
	err = ext3_journal_get_undo_access(handle, bitmap_bh);
	if (err)
		goto error_return;

	/*
	 * We are about to modify some metadata.  Call the journal APIs
	 * to unshare ->b_data if a currently-committing transaction is
	 * using it
	 */
	BUFFER_TRACE(gd_bh, "get_write_access");
	err = ext3_journal_get_write_access(handle, gd_bh);
	if (err)
		goto error_return;

	jbd_lock_bh_state(bitmap_bh);

	for (i = 0, group_freed = 0; i < count; i++) {
		/*
		 * An HJ special.  This is expensive...
		 */
#ifdef CONFIG_JBD_DEBUG
		jbd_unlock_bh_state(bitmap_bh);
		{
			struct buffer_head *debug_bh;
			debug_bh = sb_find_get_block(sb, block + i);
			if (debug_bh) {
				BUFFER_TRACE(debug_bh, "Deleted!");
				if (!bh2jh(bitmap_bh)->b_committed_data)
					BUFFER_TRACE(debug_bh,
						"No committed data in bitmap");
				BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap");
				__brelse(debug_bh);
			}
		}
		jbd_lock_bh_state(bitmap_bh);
#endif
		if (need_resched()) {
			jbd_unlock_bh_state(bitmap_bh);
			cond_resched();
			jbd_lock_bh_state(bitmap_bh);
		}
		/* @@@ This prevents newly-allocated data from being
		 * freed and then reallocated within the same
		 * transaction.
		 *
		 * Ideally we would want to allow that to happen, but to
		 * do so requires making journal_forget() capable of
		 * revoking the queued write of a data block, which
		 * implies blocking on the journal lock.  *forget()
		 * cannot block due to truncate races.
		 *
		 * Eventually we can fix this by making journal_forget()
		 * return a status indicating whether or not it was able
		 * to revoke the buffer.  On successful revoke, it is
		 * safe not to set the allocation bit in the committed
		 * bitmap, because we know that there is no outstanding
		 * activity on the buffer any more and so it is safe to
		 * reallocate it.
		 */
		BUFFER_TRACE(bitmap_bh, "set in b_committed_data");
		J_ASSERT_BH(bitmap_bh,
				bh2jh(bitmap_bh)->b_committed_data != NULL);
		ext3_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i,
				bh2jh(bitmap_bh)->b_committed_data);

		/*
		 * We clear the bit in the bitmap after setting the committed
		 * data bit, because this is the reverse order to that which
		 * the allocator uses.
		 */
		BUFFER_TRACE(bitmap_bh, "clear bit");
		if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
						bit + i, bitmap_bh->b_data)) {
			jbd_unlock_bh_state(bitmap_bh);
			ext3_error(sb, __func__,
				"bit already cleared for block "E3FSBLK,
				 block + i);
			jbd_lock_bh_state(bitmap_bh);
			BUFFER_TRACE(bitmap_bh, "bit already cleared");
		} else {
			group_freed++;
		}
	}
	jbd_unlock_bh_state(bitmap_bh);

	spin_lock(sb_bgl_lock(sbi, block_group));
	le16_add_cpu(&desc->bg_free_blocks_count, group_freed);
	spin_unlock(sb_bgl_lock(sbi, block_group));
	percpu_counter_add(&sbi->s_freeblocks_counter, count);

	/* We dirtied the bitmap block */
	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
	err = ext3_journal_dirty_metadata(handle, bitmap_bh);

	/* And the group descriptor block */
	BUFFER_TRACE(gd_bh, "dirtied group descriptor block");
	ret = ext3_journal_dirty_metadata(handle, gd_bh);
	if (!err) err = ret;
	*pdquot_freed_blocks += group_freed;

	if (overflow && !err) {
		block += count;
		count = overflow;
		goto do_more;
	}

error_return:
	brelse(bitmap_bh);
	ext3_std_error(sb, err);
	return;
}

/**
 * ext3_free_blocks() -- Free given blocks and update quota
 * @handle:		handle for this transaction
 * @inode:		inode
 * @block:		start physical block to free
 * @count:		number of blocks to free
 */
void ext3_free_blocks(handle_t *handle, struct inode *inode,
			ext3_fsblk_t block, unsigned long count)
{
	struct super_block * sb;
	unsigned long dquot_freed_blocks;

	sb = inode->i_sb;
	if (!sb) {
		printk ("ext3_free_blocks: nonexistent device");
		return;
	}
	ext3_free_blocks_sb(handle, sb, block, count, &dquot_freed_blocks);
	if (dquot_freed_blocks)
		dquot_free_block(inode, dquot_freed_blocks);
	return;
}

/**
 * ext3_test_allocatable()
 * @nr:			given block number (group relative)
 * @bh:			bufferhead containing the bitmap of the given block group
 *
 * For ext3 allocations, we must not reuse any blocks which are
 * allocated in the bitmap buffer's "last committed data" copy.  This
 * prevents deletes from freeing up the page for reuse until we have
 * committed the delete transaction.
 *
 * If we didn't do this, then deleting something and reallocating it as
 * data would allow the old block to be overwritten before the
 * transaction committed (because we force data to disk before commit).
 * This would lead to corruption if we crashed between overwriting the
 * data and committing the delete.
 *
 * @@@ We may want to make this allocation behaviour conditional on
 * data-writes at some point, and disable it for metadata allocations or
 * sync-data inodes.
 */
static int ext3_test_allocatable(ext3_grpblk_t nr, struct buffer_head *bh)
{
	int ret;
	struct journal_head *jh = bh2jh(bh);

	if (ext3_test_bit(nr, bh->b_data))
		return 0;

	jbd_lock_bh_state(bh);
	if (!jh->b_committed_data)
		ret = 1;
	else
		ret = !ext3_test_bit(nr, jh->b_committed_data);
	jbd_unlock_bh_state(bh);
	return ret;
}
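
/*
 * For example, a block freed by a delete in the current transaction has
 * its bit already clear in bh->b_data but still set in
 * jh->b_committed_data, so it is reported as not allocatable until the
 * delete commits.
 */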

/**
 * bitmap_search_next_usable_block()
 * @start:		the starting block (group relative) of the search
 * @bh:			bufferhead contains the block group bitmap
 * @maxblocks:		the ending block (group relative) of the reservation
 *
 * The bitmap search --- search forward alternately through the actual
 * bitmap on disk and the last-committed copy in the journal, until we
 * find a bit free in both bitmaps.
 */
static ext3_grpblk_t
bitmap_search_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
					ext3_grpblk_t maxblocks)
{
	ext3_grpblk_t next;
	struct journal_head *jh = bh2jh(bh);

	while (start < maxblocks) {
		next = ext3_find_next_zero_bit(bh->b_data, maxblocks, start);
		if (next >= maxblocks)
			return -1;
		if (ext3_test_allocatable(next, bh))
			return next;
		jbd_lock_bh_state(bh);
		if (jh->b_committed_data)
			start = ext3_find_next_zero_bit(jh->b_committed_data,
							maxblocks, next);
		jbd_unlock_bh_state(bh);
	}
	return -1;
}
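
/*
 * Illustration (assumed bit patterns): if bit 10 is the first zero bit
 * on disk but is still set in the committed copy, the loop rejects it
 * and restarts the on-disk scan at the committed copy's next zero bit,
 * so only a bit that is free in both copies is ever returned.
 */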

/**
 * find_next_usable_block()
 * @start:		the starting block (group relative) to find next
 *			allocatable block in bitmap.
 * @bh:			bufferhead contains the block group bitmap
 * @maxblocks:		the ending block (group relative) for the search
 *
 * Find an allocatable block in a bitmap.  We honor both the bitmap and
 * its last-committed copy (if that exists), and perform the "most
 * appropriate allocation" algorithm of looking for a free block near
 * the initial goal; then for a free byte somewhere in the bitmap; then
 * for any free bit in the bitmap.
 */
static ext3_grpblk_t
find_next_usable_block(ext3_grpblk_t start, struct buffer_head *bh,
			ext3_grpblk_t maxblocks)
{
	ext3_grpblk_t here, next;
	char *p, *r;

	if (start > 0) {
		/*
		 * The goal was occupied; search forward for a free
		 * block within the next XX blocks.
		 *
		 * end_goal is more or less random, but it has to be
		 * less than EXT3_BLOCKS_PER_GROUP.  Aligning up to the
		 * next 64-bit boundary is simple.
		 */
		ext3_grpblk_t end_goal = (start + 63) & ~63;
		if (end_goal > maxblocks)
			end_goal = maxblocks;
		here = ext3_find_next_zero_bit(bh->b_data, end_goal, start);
		if (here < end_goal && ext3_test_allocatable(here, bh))
			return here;
		ext3_debug("Bit not found near goal\n");
	}

	here = start;
	if (here < 0)
		here = 0;

	p = bh->b_data + (here >> 3);
	r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3));
	next = (r - bh->b_data) << 3;

	if (next < maxblocks && next >= start && ext3_test_allocatable(next, bh))
		return next;

	/*
	 * The bitmap search --- search forward alternately through the actual
	 * bitmap and the last-committed copy until we find a bit free in
	 * both
	 */
	here = bitmap_search_next_usable_block(here, bh, maxblocks);
	return here;
}
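
/*
 * For example, a start of 5 gives end_goal = (5 + 63) & ~63 == 64, so
 * the near-goal scan covers bits 5..63; failing that, memscan() looks
 * for an all-zero byte (8 consecutive free blocks), and the alternating
 * on-disk/committed bitmap search is the last resort.
 */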

/**
 * claim_block()
 * @lock:		the spin lock for this block group
 * @block:		the free block (group relative) to allocate
 * @bh:			the buffer_head contains the block group bitmap
 *
 * We think we can allocate this block in this bitmap.  Try to set the bit.
 * If that succeeds then check that nobody has allocated and then freed the
 * block since we saw that it was not marked in b_committed_data.  If it _was_
 * allocated and freed then clear the bit in the bitmap again and return
 * zero (failure).
 */
static inline int
claim_block(spinlock_t *lock, ext3_grpblk_t block, struct buffer_head *bh)
{
	struct journal_head *jh = bh2jh(bh);
	int ret;

	if (ext3_set_bit_atomic(lock, block, bh->b_data))
		return 0;
	jbd_lock_bh_state(bh);
	if (jh->b_committed_data && ext3_test_bit(block, jh->b_committed_data)) {
		ext3_clear_bit_atomic(lock, block, bh->b_data);
		ret = 0;
	} else {
		ret = 1;
	}
	jbd_unlock_bh_state(bh);
	return ret;
}

/**
 * ext3_try_to_allocate()
 * @sb:			superblock
 * @handle:		handle to this transaction
 * @group:		given allocation block group
 * @bitmap_bh:		bufferhead holds the block bitmap
 * @grp_goal:		given target block within the group
 * @count:		target number of blocks to allocate
 * @my_rsv:		reservation window
 *
 * Attempt to allocate blocks within a given range.  Set the range of the
 * allocation first, then find the first free bit(s) in the bitmap (within
 * the range), and finally allocate the blocks by claiming each found free
 * bit as allocated.
 *
 * To set the range of this allocation:
 *	if there is a reservation window, only try to allocate block(s) from
 *	the file's own reservation window;
 *	otherwise, the allocation range starts from the given goal block and
 *	ends at the block group's last block.
 *
 * If we failed to allocate the desired block then we may end up crossing to a
 * new bitmap.  In that case we must release write access to the old one via
 * ext3_journal_release_buffer(), else we'll run out of credits.
 */
static ext3_grpblk_t
ext3_try_to_allocate(struct super_block *sb, handle_t *handle, int group,
			struct buffer_head *bitmap_bh, ext3_grpblk_t grp_goal,
			unsigned long *count, struct ext3_reserve_window *my_rsv)
{
	ext3_fsblk_t group_first_block;
	ext3_grpblk_t start, end;
	unsigned long num = 0;

	/* we do allocation within the reservation window if we have a window */
	if (my_rsv) {
		group_first_block = ext3_group_first_block_no(sb, group);
		if (my_rsv->_rsv_start >= group_first_block)
			start = my_rsv->_rsv_start - group_first_block;
		else
			/* reservation window crosses group boundary */
			start = 0;
		end = my_rsv->_rsv_end - group_first_block + 1;
		if (end > EXT3_BLOCKS_PER_GROUP(sb))
			/* reservation window crosses group boundary */
			end = EXT3_BLOCKS_PER_GROUP(sb);
		if ((start <= grp_goal) && (grp_goal < end))
			start = grp_goal;
		else
			grp_goal = -1;
	} else {
		if (grp_goal > 0)
			start = grp_goal;
		else
			start = 0;
		end = EXT3_BLOCKS_PER_GROUP(sb);
	}

	BUG_ON(start > EXT3_BLOCKS_PER_GROUP(sb));

repeat:
	if (grp_goal < 0 || !ext3_test_allocatable(grp_goal, bitmap_bh)) {
		grp_goal = find_next_usable_block(start, bitmap_bh, end);
		if (grp_goal < 0)
			goto fail_access;
		if (!my_rsv) {
			int i;

			for (i = 0; i < 7 && grp_goal > start &&
					ext3_test_allocatable(grp_goal - 1,
								bitmap_bh);
					i++, grp_goal--)
				;
		}
	}
	start = grp_goal;

	if (!claim_block(sb_bgl_lock(EXT3_SB(sb), group),
		grp_goal, bitmap_bh)) {
		/*
		 * The block was allocated by another thread, or it was
		 * allocated and then freed by another thread
		 */
		start++;
		grp_goal++;
		if (start >= end)
			goto fail_access;
		goto repeat;
	}
	num++;
	grp_goal++;
	while (num < *count && grp_goal < end
		&& ext3_test_allocatable(grp_goal, bitmap_bh)
		&& claim_block(sb_bgl_lock(EXT3_SB(sb), group),
				grp_goal, bitmap_bh)) {
		num++;
		grp_goal++;
	}
	*count = num;
	return grp_goal - num;
fail_access:
	*count = num;
	return -1;
}

/**
 *	find_next_reservable_window():
 *		find a reservable space within the given range.
 *		It does not allocate the reservation window for now:
 *		alloc_new_reservation() will do the work later.
 *
 *	@search_head: the head of the searching list;
 *		This is not necessarily the list head of the whole filesystem.
 *
 *		We have both head and start_block to assist the search
 *		for the reservable space.  The list starts from head,
 *		but we will shift to the place where start_block is,
 *		then start from there, when looking for a reservable space.
 *
 *	@my_rsv: the reservation window
 *
 *	@sb: the super block
 *
 *	@start_block: the first block we consider to start
 *			the real search from
 *
 *	@last_block:
 *		the maximum block number that our goal reservable space
 *		could start from.  This is normally the last block in this
 *		group.  The search ends when we find that the start of the
 *		next possible reservable space is beyond this boundary.
 *		This can handle cross-boundary reservation window
 *		requests.
 *
 *	Basically we search the given range (start_block, last_block),
 *	rather than the whole reservation double linked list,
 *	to find a free region that is of my size and has not
 *	been reserved.
 *
 */
static int find_next_reservable_window(
				struct ext3_reserve_window_node *search_head,
				struct ext3_reserve_window_node *my_rsv,
				struct super_block * sb,
				ext3_fsblk_t start_block,
				ext3_fsblk_t last_block)
{
	struct rb_node *next;
	struct ext3_reserve_window_node *rsv, *prev;
	ext3_fsblk_t cur;
	int size = my_rsv->rsv_goal_size;

	/* TODO: make the start of the reservation window byte-aligned */
	/* cur = *start_block & ~7;*/
	cur = start_block;
	rsv = search_head;
	if (!rsv)
		return -1;

	while (1) {
		if (cur <= rsv->rsv_end)
			cur = rsv->rsv_end + 1;

		/* TODO?
		 * in the case we could not find a reservable space
		 * of the expected size, during the re-search, we could
		 * remember the largest reservable space we could have
		 * had and return that one.
		 *
		 * For now it will fail if we could not find the reservable
		 * space with the expected size (or more)...
		 */
		if (cur > last_block)
			return -1;		/* fail */

		prev = rsv;
		next = rb_next(&rsv->rsv_node);
		rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node);

		/*
		 * Reached the last reservation, we can just append to the
		 * previous one.
		 */
		if (!next)
			break;

		if (cur + size <= rsv->rsv_start) {
			/*
			 * Found a reservable space big enough.  We could
			 * have a reservation across the group boundary here
			 */
			break;
		}
	}
	/*
	 * We come here either when we reach the end of the whole list and
	 * there is empty reservable space after the last entry, in which
	 * case we append to the end of the list; or when we find a
	 * reservable space in the middle of the list and return the
	 * reservation window that we could append to.  Either way we
	 * succeed.
	 */

	if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window)))
		rsv_window_remove(sb, my_rsv);

	/*
	 * Let's book the whole available window for now.  We will check the
	 * disk bitmap later and then, if there are free blocks then we adjust
	 * the window size if it's larger than requested.
	 * Otherwise, we will remove this node from the tree the next time
	 * find_next_reservable_window() is called.
	 */
	my_rsv->rsv_start = cur;
	my_rsv->rsv_end = cur + size - 1;
	my_rsv->rsv_alloc_hit = 0;

	if (prev != my_rsv)
		ext3_rsv_window_add(sb, my_rsv);

	return 0;
}

/**
 *	alloc_new_reservation()--allocate a new reservation window
 *
 *		To make a new reservation, we search part of the filesystem
 *		reservation list (the list inside the group).  We try to
 *		allocate a new reservation window near the allocation goal,
 *		or the beginning of the group, if there is no goal.
 *
 *		We first find a reservable space after the goal, then from
 *		there, we check the bitmap for the first free block after
 *		it.  If there is no free block until the end of the group,
 *		the whole group is full and we fail.  Otherwise, check if the
 *		free block is inside the expected reservable space; if so,
 *		we succeed.
 *		If the first free block is outside the reservable space, then
 *		starting from that free block, we search for the next
 *		available space, and go on.
 *
 *	On success, a new reservation will be found and inserted into the
 *	list.  It contains at least one free block, and it does not overlap
 *	with other reservation windows.
 *
 *	failed: we failed to find a reservation window in this group
 *
 *	@my_rsv: the reservation window
 *
 *	@grp_goal: The goal (group-relative).  It is where the search for a
 *		free reservable space should start from.
 *		If we have a grp_goal (grp_goal >= 0), then start from there;
 *		with no grp_goal (grp_goal = -1), we start from the first
 *		block of the group.
 *
 *	@sb: the super block
 *	@group: the group we are trying to allocate in
 *	@bitmap_bh: the block group block bitmap
 *
 */
static int alloc_new_reservation(struct ext3_reserve_window_node *my_rsv,
		ext3_grpblk_t grp_goal, struct super_block *sb,
		unsigned int group, struct buffer_head *bitmap_bh)
{
	struct ext3_reserve_window_node *search_head;
	ext3_fsblk_t group_first_block, group_end_block, start_block;
	ext3_grpblk_t first_free_block;
	struct rb_root *fs_rsv_root = &EXT3_SB(sb)->s_rsv_window_root;
	unsigned long size;
	int ret;
	spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;

	group_first_block = ext3_group_first_block_no(sb, group);
	group_end_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);

	if (grp_goal < 0)
		start_block = group_first_block;
	else
		start_block = grp_goal + group_first_block;

	size = my_rsv->rsv_goal_size;

	if (!rsv_is_empty(&my_rsv->rsv_window)) {
		/*
		 * if the old reservation crosses the group boundary
		 * and if the goal is inside the old reservation window,
		 * we will come here when we just failed to allocate from
		 * the first part of the window.  We still have another part
		 * that belongs to the next group.  In this case, there is no
		 * point in discarding our window and trying to allocate a
		 * new one in this group (which will fail); we should
		 * keep the reservation window and simply move on.
		 *
		 * Maybe we could shift the start block of the reservation
		 * window to the first block of the next group.
		 */

		if ((my_rsv->rsv_start <= group_end_block) &&
				(my_rsv->rsv_end > group_end_block) &&
				(start_block >= my_rsv->rsv_start))
			return -1;

		if ((my_rsv->rsv_alloc_hit >
		     (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) {
			/*
			 * if the previous allocation hit ratio is
			 * greater than 1/2, then we double the size of
			 * the reservation window the next time,
			 * otherwise we keep the same size window
			 */
			size = size * 2;
			if (size > EXT3_MAX_RESERVE_BLOCKS)
				size = EXT3_MAX_RESERVE_BLOCKS;
			my_rsv->rsv_goal_size = size;
		}
	}

	spin_lock(rsv_lock);
	/*
	 * shift the search start to the window near the goal block
	 */
	search_head = search_reserve_window(fs_rsv_root, start_block);

	/*
	 * find_next_reservable_window() simply finds a reservable window
	 * inside the given range (start_block, group_end_block).
	 *
	 * To make sure the reservation window has a free bit inside it, we
	 * need to check the bitmap after we found a reservable window.
	 */
retry:
	ret = find_next_reservable_window(search_head, my_rsv, sb,
						start_block, group_end_block);

	if (ret == -1) {
		if (!rsv_is_empty(&my_rsv->rsv_window))
			rsv_window_remove(sb, my_rsv);
		spin_unlock(rsv_lock);
		return -1;
	}

	/*
	 * On success, find_next_reservable_window() returns the
	 * reservation window where there is a reservable space after it.
	 * Before we reserve this reservable space, we need
	 * to make sure there is at least a free block inside this region.
	 *
	 * Search the block bitmap and the copy of the last committed
	 * bitmap alternately for the first free bit, until we find an
	 * allocatable block.  The search starts from the start block of
	 * the reservable space we just found.
	 */
	spin_unlock(rsv_lock);
	first_free_block = bitmap_search_next_usable_block(
			my_rsv->rsv_start - group_first_block,
			bitmap_bh, group_end_block - group_first_block + 1);

	if (first_free_block < 0) {
		/*
		 * no free block left on the bitmap, no point
		 * in reserving the space.  return failed.
		 */
		spin_lock(rsv_lock);
		if (!rsv_is_empty(&my_rsv->rsv_window))
			rsv_window_remove(sb, my_rsv);
		spin_unlock(rsv_lock);
		return -1;		/* failed */
	}

	start_block = first_free_block + group_first_block;
	/*
	 * check if the first free block is within the
	 * free space we just reserved
	 */
	if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end)
		return 0;		/* success */
	/*
	 * if the first free bit we found is out of the reservable space,
	 * continue the search for the next reservable space,
	 * starting from where the free block is;
	 * we also shift the list head to where we stopped last time
	 */
	search_head = my_rsv;
	spin_lock(rsv_lock);
	goto retry;
}

/**
 * try_to_extend_reservation()
 * @my_rsv:		given reservation window
 * @sb:			super block
 * @size:		the delta to extend
 *
 * Attempt to expand the reservation window large enough to have the
 * required number of free blocks
 *
 * Since ext3_try_to_allocate() will always allocate blocks within
 * the reservation window range, if the window size is too small,
 * multiple block allocation has to stop at the end of the reservation
 * window.  To make this more efficient, given the total number of
 * blocks needed and the current size of the window, we try to
 * expand the reservation window size if necessary on a best-effort
 * basis before ext3_new_blocks() tries to allocate blocks.
 */
static void try_to_extend_reservation(struct ext3_reserve_window_node *my_rsv,
			struct super_block *sb, int size)
{
	struct ext3_reserve_window_node *next_rsv;
	struct rb_node *next;
	spinlock_t *rsv_lock = &EXT3_SB(sb)->s_rsv_window_lock;

	if (!spin_trylock(rsv_lock))
		return;

	next = rb_next(&my_rsv->rsv_node);

	if (!next)
		my_rsv->rsv_end += size;
	else {
		next_rsv = rb_entry(next, struct ext3_reserve_window_node, rsv_node);

		if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size)
			my_rsv->rsv_end += size;
		else
			my_rsv->rsv_end = next_rsv->rsv_start - 1;
	}
	spin_unlock(rsv_lock);
}
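
/*
 * For example, if my_rsv covers [100, 110] and the next window starts
 * at block 130, extending by size == 15 finds a gap of
 * 130 - 110 - 1 == 19 blocks and moves rsv_end to 125; with size == 25
 * the gap is too small and rsv_end is capped at 129, just below the
 * neighbour.
 */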

/**
 * ext3_try_to_allocate_with_rsv()
 * @sb:			superblock
 * @handle:		handle to this transaction
 * @group:		given allocation block group
 * @bitmap_bh:		bufferhead holds the block bitmap
 * @grp_goal:		given target block within the group
 * @my_rsv:		reservation window
 * @count:		target number of blocks to allocate
 * @errp:		pointer to store the error code
 *
 * This is the main function used to allocate a new block and its reservation
 * window.
 *
 * Each time a new block allocation is needed, we first try to allocate from
 * the file's own reservation.  If the file does not have a reservation
 * window, then instead of looking for a free bit in the bitmap first and
 * then checking the reservation list to see whether that bit is inside
 * somebody else's reservation window, we try to allocate a reservation
 * window for the file starting from the goal, and then do the block
 * allocation within that reservation window.
 *
 * This avoids repeatedly searching the reservation list when somebody is
 * looking for a free block (without a reservation) and there are lots of
 * free blocks, but they are all being reserved.
 *
 * We use a red-black tree for the per-filesystem reservation list.
 *
 */
static ext3_grpblk_t
ext3_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle,
			unsigned int group, struct buffer_head *bitmap_bh,
			ext3_grpblk_t grp_goal,
			struct ext3_reserve_window_node * my_rsv,
			unsigned long *count, int *errp)
{
	ext3_fsblk_t group_first_block, group_last_block;
	ext3_grpblk_t ret = 0;
	int fatal;
	unsigned long num = *count;

	*errp = 0;

	/*
	 * Make sure we use undo access for the bitmap, because it is critical
	 * that we do the frozen_data COW on bitmap buffers in all cases even
	 * if the buffer is in BJ_Forget state in the committing transaction.
	 */
	BUFFER_TRACE(bitmap_bh, "get undo access for new block");
	fatal = ext3_journal_get_undo_access(handle, bitmap_bh);
	if (fatal) {
		*errp = fatal;
		return -1;
	}

	/*
	 * we don't deal with reservation when
	 * the filesystem is mounted without reservation,
	 * or the file is not a regular file,
	 * or the last attempt to allocate a block with reservation turned on
	 * failed
	 */
	if (my_rsv == NULL) {
		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
						grp_goal, count, NULL);
		goto out;
	}
	/*
	 * grp_goal is a group relative block number (if there is a goal)
	 * 0 <= grp_goal < EXT3_BLOCKS_PER_GROUP(sb)
	 * first block is a filesystem wide block number
	 * first block is the block number of the first block in this group
	 */
	group_first_block = ext3_group_first_block_no(sb, group);
	group_last_block = group_first_block + (EXT3_BLOCKS_PER_GROUP(sb) - 1);

	/*
	 * Basically we will allocate a new block from the inode's reservation
	 * window.
	 *
	 * We need to allocate a new reservation window, if:
	 * a) the inode does not have a reservation window; or
	 * b) the last attempt to allocate a block from the existing
	 *    reservation failed; or
	 * c) we come here with a goal that is outside the existing window.
	 *
	 * We do not need to allocate a new reservation window if we come here
	 * at the beginning with a goal and the goal is inside the window, or
	 * we don't have a goal but already have a reservation window;
	 * then we can allocate from the reservation window directly.
	 */
	while (1) {
		if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) ||
			!goal_in_my_reservation(&my_rsv->rsv_window,
						grp_goal, group, sb)) {
			if (my_rsv->rsv_goal_size < *count)
				my_rsv->rsv_goal_size = *count;
			ret = alloc_new_reservation(my_rsv, grp_goal, sb,
							group, bitmap_bh);
			if (ret < 0)
				break;			/* failed */

			if (!goal_in_my_reservation(&my_rsv->rsv_window,
							grp_goal, group, sb))
				grp_goal = -1;
		} else if (grp_goal >= 0) {
			int curr = my_rsv->rsv_end -
					(grp_goal + group_first_block) + 1;

			if (curr < *count)
				try_to_extend_reservation(my_rsv, sb,
							*count - curr);
		}

		if ((my_rsv->rsv_start > group_last_block) ||
				(my_rsv->rsv_end < group_first_block)) {
			rsv_window_dump(&EXT3_SB(sb)->s_rsv_window_root, 1);
			BUG();
		}
		ret = ext3_try_to_allocate(sb, handle, group, bitmap_bh,
					   grp_goal, &num, &my_rsv->rsv_window);
		if (ret >= 0) {
			my_rsv->rsv_alloc_hit += num;
			*count = num;
			break;				/* succeed */
		}
		num = *count;
	}
out:
	if (ret >= 0) {
		BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for "
					"bitmap block");
		fatal = ext3_journal_dirty_metadata(handle, bitmap_bh);
		if (fatal) {
			*errp = fatal;
			return -1;
		}
		return ret;
	}

	BUFFER_TRACE(bitmap_bh, "journal_release_buffer");
	ext3_journal_release_buffer(handle, bitmap_bh);
	return ret;
}

/**
 * ext3_has_free_blocks()
 * @sbi:		in-core super block structure.
 *
 * Check if the filesystem has at least 1 free block available for allocation.
 */
static int ext3_has_free_blocks(struct ext3_sb_info *sbi)
{
	ext3_fsblk_t free_blocks, root_blocks;

	free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter);
	root_blocks = le32_to_cpu(sbi->s_es->s_r_blocks_count);
	if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) &&
		sbi->s_resuid != current_fsuid() &&
		(sbi->s_resgid == 0 || !in_group_p (sbi->s_resgid))) {
		return 0;
	}
	return 1;
}
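
/*
 * For example, with s_r_blocks_count == 100 and only 100 free blocks
 * left, free_blocks < root_blocks + 1 holds, so an ordinary process
 * gets ENOSPC while the reserved uid/gid or a CAP_SYS_RESOURCE holder
 * can still allocate from the reserved pool.
 */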

/**
 * ext3_should_retry_alloc()
 * @sb:			super block
 * @retries:		number of attempts that have been made
 *
 * ext3_should_retry_alloc() is called when ENOSPC is returned, and if
 * it is profitable to retry the operation, this function will wait
 * for the current or committing transaction to complete, and then
 * return TRUE.
 *
 * If the total number of retries exceeds three, return FALSE.
 */
int ext3_should_retry_alloc(struct super_block *sb, int *retries)
{
	if (!ext3_has_free_blocks(EXT3_SB(sb)) || (*retries)++ > 3)
		return 0;

	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);

	return journal_force_commit_nested(EXT3_SB(sb)->s_journal);
}
1473 
1474 /**
1475  * ext3_new_blocks() -- core block(s) allocation function
1476  * @handle:		handle to this transaction
1477  * @inode:		file inode
1478  * @goal:		given target block(filesystem wide)
1479  * @count:		target number of blocks to allocate
1480  * @errp:		error code
1481  *
1482  * ext3_new_blocks uses a goal block to assist allocation.  It tries to
1483  * allocate block(s) from the block group contains the goal block first. If that
1484  * fails, it will try to allocate block(s) from other block groups without
1485  * any specific goal block.
1486  *
1487  */
ext3_new_blocks(handle_t * handle,struct inode * inode,ext3_fsblk_t goal,unsigned long * count,int * errp)1488 ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode,
1489 			ext3_fsblk_t goal, unsigned long *count, int *errp)
1490 {
1491 	struct buffer_head *bitmap_bh = NULL;
1492 	struct buffer_head *gdp_bh;
1493 	int group_no;
1494 	int goal_group;
1495 	ext3_grpblk_t grp_target_blk;	/* blockgroup relative goal block */
1496 	ext3_grpblk_t grp_alloc_blk;	/* blockgroup-relative allocated block*/
1497 	ext3_fsblk_t ret_block;		/* filesyetem-wide allocated block */
1498 	int bgi;			/* blockgroup iteration index */
1499 	int fatal = 0, err;
1500 	int performed_allocation = 0;
1501 	ext3_grpblk_t free_blocks;	/* number of free blocks in a group */
1502 	struct super_block *sb;
1503 	struct ext3_group_desc *gdp;
1504 	struct ext3_super_block *es;
1505 	struct ext3_sb_info *sbi;
1506 	struct ext3_reserve_window_node *my_rsv = NULL;
1507 	struct ext3_block_alloc_info *block_i;
1508 	unsigned short windowsz = 0;
1509 #ifdef EXT3FS_DEBUG
1510 	static int goal_hits, goal_attempts;
1511 #endif
1512 	unsigned long ngroups;
1513 	unsigned long num = *count;
1514 
1515 	*errp = -ENOSPC;
1516 	sb = inode->i_sb;
1517 	if (!sb) {
1518 		printk("ext3_new_block: nonexistent device");
1519 		return 0;
1520 	}
1521 
1522 	/*
1523 	 * Check quota for allocation of this block.
1524 	 */
1525 	err = dquot_alloc_block(inode, num);
1526 	if (err) {
1527 		*errp = err;
1528 		return 0;
1529 	}
1530 
1531 	sbi = EXT3_SB(sb);
1532 	es = EXT3_SB(sb)->s_es;
1533 	ext3_debug("goal=%lu.\n", goal);
1534 	/*
1535 	 * Allocate a block from reservation only when
1536 	 * filesystem is mounted with reservation(default,-o reservation), and
1537 	 * it's a regular file, and
1538 	 * the desired window size is greater than 0 (One could use ioctl
1539 	 * command EXT3_IOC_SETRSVSZ to set the window size to 0 to turn off
1540 	 * reservation on that particular file)
1541 	 */
1542 	block_i = EXT3_I(inode)->i_block_alloc_info;
1543 	if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0))
1544 		my_rsv = &block_i->rsv_window_node;
1545 
1546 	if (!ext3_has_free_blocks(sbi)) {
1547 		*errp = -ENOSPC;
1548 		goto out;
1549 	}

	/*
	 * First, test whether the goal block is free.
	 */
	if (goal < le32_to_cpu(es->s_first_data_block) ||
	    goal >= le32_to_cpu(es->s_blocks_count))
		goal = le32_to_cpu(es->s_first_data_block);
	group_no = (goal - le32_to_cpu(es->s_first_data_block)) /
			EXT3_BLOCKS_PER_GROUP(sb);
	goal_group = group_no;
retry_alloc:
	gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
	if (!gdp)
		goto io_error;

	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
	/*
	 * If there are not enough free blocks to make a new reservation,
	 * turn off reservation for this allocation.
	 */
	if (my_rsv && (free_blocks < windowsz)
		&& (free_blocks > 0)
		&& (rsv_is_empty(&my_rsv->rsv_window)))
		my_rsv = NULL;

	if (free_blocks > 0) {
		grp_target_blk = ((goal - le32_to_cpu(es->s_first_data_block)) %
				EXT3_BLOCKS_PER_GROUP(sb));
		bitmap_bh = read_block_bitmap(sb, group_no);
		if (!bitmap_bh)
			goto io_error;
		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
					group_no, bitmap_bh, grp_target_blk,
					my_rsv, &num, &fatal);
		if (fatal)
			goto out;
		if (grp_alloc_blk >= 0)
			goto allocated;
	}

	ngroups = EXT3_SB(sb)->s_groups_count;
	smp_rmb();

	/*
	 * Now search the rest of the groups.  We assume that
	 * group_no and gdp correctly point to the last group visited.
	 */
	for (bgi = 0; bgi < ngroups; bgi++) {
		group_no++;
		if (group_no >= ngroups)
			group_no = 0;
		gdp = ext3_get_group_desc(sb, group_no, &gdp_bh);
		if (!gdp)
			goto io_error;
		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
		/*
		 * skip this group (and avoid loading the bitmap) if there
		 * are no free blocks
		 */
		if (!free_blocks)
			continue;
		/*
		 * skip this group if the number of
		 * free blocks is less than half of the reservation
		 * window size.
		 */
		if (my_rsv && (free_blocks <= (windowsz/2)))
			continue;

		brelse(bitmap_bh);
		bitmap_bh = read_block_bitmap(sb, group_no);
		if (!bitmap_bh)
			goto io_error;
		/*
		 * try to allocate block(s) from this group, without a goal(-1).
		 */
		grp_alloc_blk = ext3_try_to_allocate_with_rsv(sb, handle,
					group_no, bitmap_bh, -1, my_rsv,
					&num, &fatal);
		if (fatal)
			goto out;
		if (grp_alloc_blk >= 0)
			goto allocated;
	}
	/*
	 * We may end up with a bogus ENOSPC error from earlier on: the
	 * filesystem may be "full" of reservations while free blocks are
	 * in fact still available on disk.  In that case, forget about the
	 * reservations and do the block allocation as if there were no
	 * reservations.
	 */
	if (my_rsv) {
		my_rsv = NULL;
		windowsz = 0;
		group_no = goal_group;
		goto retry_alloc;
	}
	/* No space left on the device */
	*errp = -ENOSPC;
	goto out;

allocated:

	ext3_debug("using block group %d(%d)\n",
			group_no, gdp->bg_free_blocks_count);

	BUFFER_TRACE(gdp_bh, "get_write_access");
	fatal = ext3_journal_get_write_access(handle, gdp_bh);
	if (fatal)
		goto out;

	ret_block = grp_alloc_blk + ext3_group_first_block_no(sb, group_no);

	if (in_range(le32_to_cpu(gdp->bg_block_bitmap), ret_block, num) ||
	    in_range(le32_to_cpu(gdp->bg_inode_bitmap), ret_block, num) ||
	    in_range(ret_block, le32_to_cpu(gdp->bg_inode_table),
		      EXT3_SB(sb)->s_itb_per_group) ||
	    in_range(ret_block + num - 1, le32_to_cpu(gdp->bg_inode_table),
		      EXT3_SB(sb)->s_itb_per_group)) {
		ext3_error(sb, "ext3_new_block",
			    "Allocating block in system zone - "
			    "blocks from "E3FSBLK", length %lu",
			     ret_block, num);
		/*
		 * claim_block() marked the blocks we allocated as in use.  So
		 * we may want to selectively mark some of the blocks as free.
		 */
		goto retry_alloc;
	}

	performed_allocation = 1;

#ifdef CONFIG_JBD_DEBUG
	{
		struct buffer_head *debug_bh;

		/* Record bitmap buffer state in the newly allocated block */
		debug_bh = sb_find_get_block(sb, ret_block);
		if (debug_bh) {
			BUFFER_TRACE(debug_bh, "state when allocated");
			BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state");
			brelse(debug_bh);
		}
	}
	jbd_lock_bh_state(bitmap_bh);
	spin_lock(sb_bgl_lock(sbi, group_no));
	if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) {
		int i;

		for (i = 0; i < num; i++) {
			if (ext3_test_bit(grp_alloc_blk+i,
					bh2jh(bitmap_bh)->b_committed_data)) {
				printk("%s: block was unexpectedly set in "
					"b_committed_data\n", __func__);
			}
		}
	}
	ext3_debug("found bit %d\n", grp_alloc_blk);
	spin_unlock(sb_bgl_lock(sbi, group_no));
	jbd_unlock_bh_state(bitmap_bh);
#endif

	if (ret_block + num - 1 >= le32_to_cpu(es->s_blocks_count)) {
		ext3_error(sb, "ext3_new_block",
			    "block("E3FSBLK") >= blocks count(%d) - "
			    "block_group = %d, es == %p ", ret_block,
			le32_to_cpu(es->s_blocks_count), group_no, es);
		goto out;
	}

	/*
	 * It is up to the caller to add the new buffer to a journal
	 * list of some description.  We don't know in advance whether
	 * the caller wants to use it as metadata or data.
	 */
	ext3_debug("allocating block %lu. Goal hits %d of %d.\n",
			ret_block, goal_hits, goal_attempts);

	spin_lock(sb_bgl_lock(sbi, group_no));
	le16_add_cpu(&gdp->bg_free_blocks_count, -num);
	spin_unlock(sb_bgl_lock(sbi, group_no));
	percpu_counter_sub(&sbi->s_freeblocks_counter, num);

	BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor");
	err = ext3_journal_dirty_metadata(handle, gdp_bh);
	if (!fatal)
		fatal = err;

	if (fatal)
		goto out;

	*errp = 0;
	brelse(bitmap_bh);
	dquot_free_block(inode, *count - num);
	*count = num;
	return ret_block;

io_error:
	*errp = -EIO;
out:
	if (fatal) {
		*errp = fatal;
		ext3_std_error(sb, fatal);
	}
	/*
	 * Undo the block allocation
	 */
	if (!performed_allocation)
		dquot_free_block(inode, *count);
	brelse(bitmap_bh);
	return 0;
}

ext3_fsblk_t ext3_new_block(handle_t *handle, struct inode *inode,
			ext3_fsblk_t goal, int *errp)
{
	unsigned long count = 1;

	return ext3_new_blocks(handle, inode, goal, &count, errp);
}
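
/*
 * Illustrative use (a sketch; the handle would come from a transaction
 * started elsewhere, e.g. with ext3_journal_start()):
 *
 *	int err;
 *	ext3_fsblk_t blk = ext3_new_block(handle, inode, goal, &err);
 *	if (!blk)
 *		return err;
 *
 * On failure 0 is returned and err holds the error (-ENOSPC, -EIO, ...);
 * on success err is 0 and blk is the filesystem-wide block number.
 */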

/**
 * ext3_count_free_blocks() -- count filesystem free blocks
 * @sb:		superblock
 *
 * Adds up the number of free blocks from each block group.
 */
ext3_fsblk_t ext3_count_free_blocks(struct super_block *sb)
{
	ext3_fsblk_t desc_count;
	struct ext3_group_desc *gdp;
	int i;
	unsigned long ngroups = EXT3_SB(sb)->s_groups_count;
#ifdef EXT3FS_DEBUG
	struct ext3_super_block *es;
	ext3_fsblk_t bitmap_count;
	unsigned long x;
	struct buffer_head *bitmap_bh = NULL;

	es = EXT3_SB(sb)->s_es;
	desc_count = 0;
	bitmap_count = 0;
	gdp = NULL;

	smp_rmb();
	for (i = 0; i < ngroups; i++) {
		gdp = ext3_get_group_desc(sb, i, NULL);
		if (!gdp)
			continue;
		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
		brelse(bitmap_bh);
		bitmap_bh = read_block_bitmap(sb, i);
		if (bitmap_bh == NULL)
			continue;

		x = ext3_count_free(bitmap_bh, sb->s_blocksize);
		printk("group %d: stored = %d, counted = %lu\n",
			i, le16_to_cpu(gdp->bg_free_blocks_count), x);
		bitmap_count += x;
	}
	brelse(bitmap_bh);
	printk("ext3_count_free_blocks: stored = "E3FSBLK
		", computed = "E3FSBLK", "E3FSBLK"\n",
	       le32_to_cpu(es->s_free_blocks_count),
		desc_count, bitmap_count);
	return bitmap_count;
#else
	desc_count = 0;
	smp_rmb();
	for (i = 0; i < ngroups; i++) {
		gdp = ext3_get_group_desc(sb, i, NULL);
		if (!gdp)
			continue;
		desc_count += le16_to_cpu(gdp->bg_free_blocks_count);
	}

	return desc_count;
#endif
}

static inline int test_root(int a, int b)
{
	int num = b;

	while (a > num)
		num *= b;
	return num == a;
}
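
/*
 * For example, test_root(25, 5) steps num through 5, 25 and returns 1;
 * test_root(10, 3) steps num through 3, 9, 27 and returns 0, since 10
 * is not a power of 3.
 */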

static int ext3_group_sparse(int group)
{
	if (group <= 1)
		return 1;
	if (!(group & 1))
		return 0;
	return (test_root(group, 7) || test_root(group, 5) ||
		test_root(group, 3));
}
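
/*
 * With the sparse_super feature, only groups 0, 1 and powers of 3, 5
 * and 7 (3, 5, 7, 9, 25, 27, 49, 81, ...) carry superblock backups, so
 * ext3_bg_has_super() below returns 1 only for those groups.
 */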

/**
 *	ext3_bg_has_super - number of blocks used by the superblock in group
 *	@sb: superblock for filesystem
 *	@group: group number to check
 *
 *	Return the number of blocks used by the superblock (primary or backup)
 *	in this group.  Currently this will be only 0 or 1.
 */
int ext3_bg_has_super(struct super_block *sb, int group)
{
	if (EXT3_HAS_RO_COMPAT_FEATURE(sb,
				EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER) &&
			!ext3_group_sparse(group))
		return 0;
	return 1;
}

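/*
 * With the META_BG layout, each metagroup (EXT3_DESC_PER_BLOCK groups)
 * stores its single descriptor block in its first group, with backups
 * in the second and last groups of the metagroup - hence the three
 * cases tested below.
 */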
static unsigned long ext3_bg_num_gdb_meta(struct super_block *sb, int group)
{
	unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb);
	unsigned long first = metagroup * EXT3_DESC_PER_BLOCK(sb);
	unsigned long last = first + EXT3_DESC_PER_BLOCK(sb) - 1;

	if (group == first || group == first + 1 || group == last)
		return 1;
	return 0;
}

static unsigned long ext3_bg_num_gdb_nometa(struct super_block *sb, int group)
{
	return ext3_bg_has_super(sb, group) ? EXT3_SB(sb)->s_gdb_count : 0;
}

/**
 *	ext3_bg_num_gdb - number of blocks used by the group table in group
 *	@sb: superblock for filesystem
 *	@group: group number to check
 *
 *	Return the number of blocks used by the group descriptor table
 *	(primary or backup) in this group.  In the future there may be a
 *	different number of descriptor blocks in each group.
 */
unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
{
	unsigned long first_meta_bg =
			le32_to_cpu(EXT3_SB(sb)->s_es->s_first_meta_bg);
	unsigned long metagroup = group / EXT3_DESC_PER_BLOCK(sb);

	if (!EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_META_BG) ||
			metagroup < first_meta_bg)
		return ext3_bg_num_gdb_nometa(sb, group);

	return ext3_bg_num_gdb_meta(sb, group);
}
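
/*
 * Example: on a filesystem without META_BG, ext3_bg_num_gdb() returns
 * s_gdb_count for every group that carries a superblock copy (group 0,
 * and with sparse_super groups 1, 3, 5, 7, 9, 25, ...) and 0 elsewhere.
 */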

/**
 * ext3_trim_all_free -- function to trim all free space in alloc. group
 * @sb:			super block for file system
 * @group:		allocation group to trim
 * @start:		first group block to examine
 * @max:		last group block to examine
 * @minblocks:		minimum extent block count
 *
 * ext3_trim_all_free walks through a group's block bitmap searching for free
 * blocks.  When a free block is found, it tries to claim this block and as
 * many consecutive free blocks as possible to form the biggest free extent,
 * stopping at the first used block.  It then issues a TRIM command on the
 * extent and frees the extent in the block bitmap.  This is repeated until
 * the whole group has been scanned.
 */
ext3_grpblk_t ext3_trim_all_free(struct super_block *sb, unsigned int group,
				ext3_grpblk_t start, ext3_grpblk_t max,
				ext3_grpblk_t minblocks)
{
	handle_t *handle;
	ext3_grpblk_t next, free_blocks, bit, freed, count = 0;
	ext3_fsblk_t discard_block;
	struct ext3_sb_info *sbi;
	struct buffer_head *gdp_bh, *bitmap_bh = NULL;
	struct ext3_group_desc *gdp;
	int err = 0, ret = 0;

	/*
	 * We will update one block bitmap, and one group descriptor
	 */
	handle = ext3_journal_start_sb(sb, 2);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	bitmap_bh = read_block_bitmap(sb, group);
	if (!bitmap_bh) {
		err = -EIO;
		goto err_out;
	}

	BUFFER_TRACE(bitmap_bh, "getting undo access");
	err = ext3_journal_get_undo_access(handle, bitmap_bh);
	if (err)
		goto err_out;

	gdp = ext3_get_group_desc(sb, group, &gdp_bh);
	if (!gdp) {
		err = -EIO;
		goto err_out;
	}

	BUFFER_TRACE(gdp_bh, "get_write_access");
	err = ext3_journal_get_write_access(handle, gdp_bh);
	if (err)
		goto err_out;

	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
	sbi = EXT3_SB(sb);

	/* Walk through the whole group */
	while (start < max) {
		start = bitmap_search_next_usable_block(start, bitmap_bh, max);
		if (start < 0)
			break;
		next = start;

		/*
		 * Allocate contiguous free extents by setting bits in the
		 * block bitmap
		 */
		while (next < max
			&& claim_block(sb_bgl_lock(sbi, group),
					next, bitmap_bh)) {
			next++;
		}

		/* We did not claim any blocks */
		if (next == start)
			continue;

		discard_block = (ext3_fsblk_t)start +
				ext3_group_first_block_no(sb, group);

		/* Update counters */
		spin_lock(sb_bgl_lock(sbi, group));
		le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
		spin_unlock(sb_bgl_lock(sbi, group));
		percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);

		free_blocks -= next - start;
		/* Do not issue a TRIM on extents smaller than minblocks */
		if ((next - start) < minblocks)
			goto free_extent;

		/* Send the TRIM command down to the device */
		err = sb_issue_discard(sb, discard_block, next - start,
				       GFP_NOFS, 0);
		count += (next - start);
free_extent:
		freed = 0;

		/*
		 * Clear bits in the bitmap
		 */
		for (bit = start; bit < next; bit++) {
			BUFFER_TRACE(bitmap_bh, "clear bit");
			if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
						bit, bitmap_bh->b_data)) {
				ext3_error(sb, __func__,
					"bit already cleared for block "E3FSBLK,
					 (unsigned long)bit);
				BUFFER_TRACE(bitmap_bh, "bit already cleared");
			} else {
				freed++;
			}
		}

		/* Update counters */
		spin_lock(sb_bgl_lock(sbi, group));
		le16_add_cpu(&gdp->bg_free_blocks_count, freed);
		spin_unlock(sb_bgl_lock(sbi, group));
		percpu_counter_add(&sbi->s_freeblocks_counter, freed);

		start = next;
		if (err < 0) {
			if (err != -EOPNOTSUPP)
				ext3_warning(sb, __func__, "Discard command "
					     "returned error %d\n", err);
			break;
		}

		if (fatal_signal_pending(current)) {
			err = -ERESTARTSYS;
			break;
		}

		cond_resched();

		/* No more suitable extents */
		if (free_blocks < minblocks)
			break;
	}

	/* We dirtied the bitmap block */
	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
	ret = ext3_journal_dirty_metadata(handle, bitmap_bh);
	if (!err)
		err = ret;

	/* And the group descriptor block */
	BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
	ret = ext3_journal_dirty_metadata(handle, gdp_bh);
	if (!err)
		err = ret;

	ext3_debug("trimmed %d blocks in the group %d\n",
		count, group);

err_out:
	if (err)
		count = err;
	ext3_journal_stop(handle);
	brelse(bitmap_bh);

	return count;
}

/**
 * ext3_trim_fs() -- trim ioctl handler
 * @sb:			superblock for filesystem
 * @range:		fstrim_range structure; its start, len and minlen
 *			fields are given in bytes
 *
 * ext3_trim_fs goes through all the allocation groups containing bytes from
 * range->start to range->start + range->len.  For each such group,
 * ext3_trim_all_free() is invoked to trim all free space.
 */
int ext3_trim_fs(struct super_block *sb, struct fstrim_range *range)
{
	ext3_grpblk_t last_block, first_block, free_blocks;
	unsigned long first_group, last_group;
	unsigned long group, ngroups;
	struct ext3_group_desc *gdp;
	struct ext3_super_block *es = EXT3_SB(sb)->s_es;
	uint64_t start, len, minlen, trimmed;
	ext3_fsblk_t max_blks = le32_to_cpu(es->s_blocks_count);
	int ret = 0;

	start = (range->start >> sb->s_blocksize_bits) +
		le32_to_cpu(es->s_first_data_block);
	len = range->len >> sb->s_blocksize_bits;
	minlen = range->minlen >> sb->s_blocksize_bits;
	trimmed = 0;

	if (unlikely(minlen > EXT3_BLOCKS_PER_GROUP(sb)))
		return -EINVAL;
	if (start >= max_blks)
		goto out;
	if (start + len > max_blks)
		len = max_blks - start;

	ngroups = EXT3_SB(sb)->s_groups_count;
	smp_rmb();

	/* Determine first and last group to examine based on start and len */
	ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) start,
				     &first_group, &first_block);
	ext3_get_group_no_and_offset(sb, (ext3_fsblk_t) (start + len),
				     &last_group, &last_block);
	last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
	last_block = EXT3_BLOCKS_PER_GROUP(sb);

	if (first_group > last_group)
		return -EINVAL;

	for (group = first_group; group <= last_group; group++) {
		gdp = ext3_get_group_desc(sb, group, NULL);
		if (!gdp)
			break;

		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
		if (free_blocks < minlen)
			continue;

		/*
		 * For all the groups except the last one, last block will
		 * always be EXT3_BLOCKS_PER_GROUP(sb), so we only need to
		 * change it for the last group in which case first_block +
		 * len < EXT3_BLOCKS_PER_GROUP(sb).
		 */
		if (first_block + len < EXT3_BLOCKS_PER_GROUP(sb))
			last_block = first_block + len;
		len -= last_block - first_block;

		ret = ext3_trim_all_free(sb, group, first_block,
					last_block, minlen);
		if (ret < 0)
			break;

		trimmed += ret;
		first_block = 0;
	}

	if (ret >= 0)
		ret = 0;

out:
	range->len = trimmed * sb->s_blocksize;

	return ret;
}
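
/*
 * Illustrative only: ext3_trim_fs() is reached through the FITRIM ioctl.
 * A userspace caller would fill a struct fstrim_range and do roughly:
 *
 *	struct fstrim_range range = {
 *		.start = 0,
 *		.len = ULLONG_MAX,
 *		.minlen = 0,
 *	};
 *	ioctl(fd, FITRIM, &range);
 *
 * On return, range.len holds the number of bytes actually trimmed, as
 * set at the end of this function.
 */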