1 /*
2  *   Copyright (C) International Business Machines Corp., 2000-2004
3  *
4  *   This program is free software;  you can redistribute it and/or modify
5  *   it under the terms of the GNU General Public License as published by
6  *   the Free Software Foundation; either version 2 of the License, or
7  *   (at your option) any later version.
8  *
9  *   This program is distributed in the hope that it will be useful,
10  *   but WITHOUT ANY WARRANTY;  without even the implied warranty of
11  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
12  *   the GNU General Public License for more details.
13  *
14  *   You should have received a copy of the GNU General Public License
15  *   along with this program;  if not, write to the Free Software
16  *   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17  */
18 
19 #include <linux/fs.h>
20 #include "jfs_incore.h"
21 #include "jfs_superblock.h"
22 #include "jfs_dmap.h"
23 #include "jfs_imap.h"
24 #include "jfs_lock.h"
25 #include "jfs_metapage.h"
26 #include "jfs_superblock.h"
27 #include "jfs_debug.h"
28 
29 /*
30  *	Debug code for double-checking block map
31  */
32 /* #define	_JFS_DEBUG_DMAP	1 */
33 
34 #ifdef	_JFS_DEBUG_DMAP
35 #define DBINITMAP(size,ipbmap,results) \
36 	DBinitmap(size,ipbmap,results)
37 #define DBALLOC(dbmap,mapsize,blkno,nblocks) \
38 	DBAlloc(dbmap,mapsize,blkno,nblocks)
39 #define DBFREE(dbmap,mapsize,blkno,nblocks) \
40 	DBFree(dbmap,mapsize,blkno,nblocks)
41 #define DBALLOCCK(dbmap,mapsize,blkno,nblocks) \
42 	DBAllocCK(dbmap,mapsize,blkno,nblocks)
43 #define DBFREECK(dbmap,mapsize,blkno,nblocks) \
44 	DBFreeCK(dbmap,mapsize,blkno,nblocks)
45 
46 static void DBinitmap(s64, struct inode *, u32 **);
47 static void DBAlloc(uint *, s64, s64, s64);
48 static void DBFree(uint *, s64, s64, s64);
49 static void DBAllocCK(uint *, s64, s64, s64);
50 static void DBFreeCK(uint *, s64, s64, s64);
51 #else
52 #define DBINITMAP(size,ipbmap,results)
53 #define DBALLOC(dbmap, mapsize, blkno, nblocks)
54 #define DBFREE(dbmap, mapsize, blkno, nblocks)
55 #define DBALLOCCK(dbmap, mapsize, blkno, nblocks)
56 #define DBFREECK(dbmap, mapsize, blkno, nblocks)
57 #endif				/* _JFS_DEBUG_DMAP */
58 
59 /*
60  *	SERIALIZATION of the Block Allocation Map.
61  *
62  *	the working state of the block allocation map is accessed in
63  *	two directions:
64  *
65  *	1) allocation and free requests that start at the dmap
66  *	   level and move up through the dmap control pages (i.e.
67  *	   the vast majority of requests).
68  *
69  * 	2) allocation requests that start at dmap control page
70  *	   level and work down towards the dmaps.
71  *
72  *	the serialization scheme used here is as follows.
73  *
74  *	requests which start at the bottom are serialized against each
75  *	other through buffers and each requests holds onto its buffers
76  *	as it works it way up from a single dmap to the required level
77  *	of dmap control page.
78  *	requests that start at the top are serialized against each other
79  *	and request that start from the bottom by the multiple read/single
80  *	write inode lock of the bmap inode. requests starting at the top
81  *	take this lock in write mode while request starting at the bottom
82  *	take the lock in read mode.  a single top-down request may proceed
83  *	exclusively while multiple bottoms-up requests may proceed
84  * 	simultaneously (under the protection of busy buffers).
85  *
86  *	in addition to information found in dmaps and dmap control pages,
87  *	the working state of the block allocation map also includes read/
88  *	write information maintained in the bmap descriptor (i.e. total
89  *	free block count, allocation group level free block counts).
90  *	a single exclusive lock (BMAP_LOCK) is used to guard this information
91  *	in the face of multiple-bottoms up requests.
92  *	(lock ordering: IREAD_LOCK, BMAP_LOCK);
93  *
94  *	accesses to the persistent state of the block allocation map (limited
95  *	to the persistent bitmaps in dmaps) is guarded by (busy) buffers.
96  */
97 
98 #define BMAP_LOCK_INIT(bmp)	init_MUTEX(&bmp->db_bmaplock)
99 #define BMAP_LOCK(bmp)		down(&bmp->db_bmaplock)
100 #define BMAP_UNLOCK(bmp)	up(&bmp->db_bmaplock)
101 
102 /*
103  * forward references
104  */
105 static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
106 			int nblocks);
107 static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval);
108 static void dbBackSplit(dmtree_t * tp, int leafno);
109 static void dbJoin(dmtree_t * tp, int leafno, int newval);
110 static void dbAdjTree(dmtree_t * tp, int leafno, int newval);
111 static int dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc,
112 		    int level);
113 static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results);
114 static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
115 		       int nblocks);
116 static int dbAllocNear(struct bmap * bmp, struct dmap * dp, s64 blkno,
117 		       int nblocks,
118 		       int l2nb, s64 * results);
119 static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
120 		       int nblocks);
121 static int dbAllocDmapLev(struct bmap * bmp, struct dmap * dp, int nblocks,
122 			  int l2nb,
123 			  s64 * results);
124 static int dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb,
125 		     s64 * results);
126 static int dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno,
127 		      s64 * results);
128 static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks);
129 static int dbFindBits(u32 word, int l2nb);
130 static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno);
131 static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx);
132 static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
133 		       int nblocks);
134 static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
135 		      int nblocks);
136 static int dbMaxBud(u8 * cp);
137 s64 dbMapFileSizeToMapSize(struct inode *ipbmap);
138 static int blkstol2(s64 nb);
139 
140 static int cntlz(u32 value);
141 static int cnttz(u32 word);
142 
143 static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
144 			 int nblocks);
145 static int dbInitDmap(struct dmap * dp, s64 blkno, int nblocks);
146 static int dbInitDmapTree(struct dmap * dp);
147 static int dbInitTree(struct dmaptree * dtp);
148 static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i);
149 static int dbGetL2AGSize(s64 nblocks);
150 
151 /*
152  *	buddy table
153  *
154  * table used for determining buddy sizes within characters of
155  * dmap bitmap words.  the characters themselves serve as indexes
156  * into the table, with the table elements yielding the maximum
157  * binary buddy of free bits within the character.
158  */
159 static s8 budtab[256] = {
160 	3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
161 	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
162 	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
163 	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
164 	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
165 	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
166 	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
167 	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
168 	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
169 	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
170 	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
171 	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
172 	2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
173 	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
174 	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0,
175 	2, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, -1
176 };
177 
178 
179 /*
180  * NAME:    	dbMount()
181  *
182  * FUNCTION:	initializate the block allocation map.
183  *
184  *		memory is allocated for the in-core bmap descriptor and
185  *		the in-core descriptor is initialized from disk.
186  *
187  * PARAMETERS:
188  *      ipbmap	-  pointer to in-core inode for the block map.
189  *
190  * RETURN VALUES:
191  *      0	- success
192  *      -ENOMEM	- insufficient memory
193  *      -EIO	- i/o error
194  */
dbMount(struct inode * ipbmap)195 int dbMount(struct inode *ipbmap)
196 {
197 	struct bmap *bmp;
198 	struct dbmap *dbmp_le;
199 	struct metapage *mp;
200 	int i;
201 
202 	/*
203 	 * allocate/initialize the in-memory bmap descriptor
204 	 */
205 	/* allocate memory for the in-memory bmap descriptor */
206 	bmp = kmalloc(sizeof(struct bmap), GFP_KERNEL);
207 	if (bmp == NULL)
208 		return -ENOMEM;
209 
210 	/* read the on-disk bmap descriptor. */
211 	mp = read_metapage(ipbmap,
212 			   BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
213 			   PSIZE, 0);
214 	if (mp == NULL) {
215 		kfree(bmp);
216 		return -EIO;
217 	}
218 
219 	/* copy the on-disk bmap descriptor to its in-memory version. */
220 	dbmp_le = (struct dbmap *) mp->data;
221 	bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize);
222 	bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
223 	bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
224 	bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
225 	bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
226 	bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
227 	bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
228 	bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel);
229 	bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth);
230 	bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth);
231 	bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart);
232 	bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size);
233 	for (i = 0; i < MAXAG; i++)
234 		bmp->db_agfree[i] = le64_to_cpu(dbmp_le->dn_agfree[i]);
235 	bmp->db_agsize = le64_to_cpu(dbmp_le->dn_agsize);
236 	bmp->db_maxfreebud = dbmp_le->dn_maxfreebud;
237 
238 	/* release the buffer. */
239 	release_metapage(mp);
240 
241 	/* bind the bmap inode and the bmap descriptor to each other. */
242 	bmp->db_ipbmap = ipbmap;
243 	JFS_SBI(ipbmap->i_sb)->bmap = bmp;
244 
245 	memset(bmp->db_active, 0, sizeof(bmp->db_active));
246 	DBINITMAP(bmp->db_mapsize, ipbmap, &bmp->db_DBmap);
247 
248 	/*
249 	 * allocate/initialize the bmap lock
250 	 */
251 	BMAP_LOCK_INIT(bmp);
252 
253 	return (0);
254 }
255 
256 
257 /*
258  * NAME:    	dbUnmount()
259  *
260  * FUNCTION:	terminate the block allocation map in preparation for
261  *		file system unmount.
262  *
263  * 		the in-core bmap descriptor is written to disk and
264  *		the memory for this descriptor is freed.
265  *
266  * PARAMETERS:
267  *      ipbmap	-  pointer to in-core inode for the block map.
268  *
269  * RETURN VALUES:
270  *      0	- success
271  *      -EIO	- i/o error
272  */
dbUnmount(struct inode * ipbmap,int mounterror)273 int dbUnmount(struct inode *ipbmap, int mounterror)
274 {
275 	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
276 	int i;
277 
278 	if (!(mounterror || isReadOnly(ipbmap)))
279 		dbSync(ipbmap);
280 
281 	/*
282 	 * Invalidate the page cache buffers
283 	 */
284 	truncate_inode_pages(ipbmap->i_mapping, 0);
285 
286 	/*
287 	 * Sanity Check
288 	 */
289 	for (i = 0; i < bmp->db_numag; i++)
290 		if (atomic_read(&bmp->db_active[i]))
291 			printk(KERN_ERR "dbUnmount: db_active[%d] = %d\n",
292 			       i, atomic_read(&bmp->db_active[i]));
293 
294 	/* free the memory for the in-memory bmap. */
295 	kfree(bmp);
296 
297 	return (0);
298 }
299 
300 /*
301  *	dbSync()
302  */
dbSync(struct inode * ipbmap)303 int dbSync(struct inode *ipbmap)
304 {
305 	struct dbmap *dbmp_le;
306 	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
307 	struct metapage *mp;
308 	int i;
309 
310 	/*
311 	 * write bmap global control page
312 	 */
313 	/* get the buffer for the on-disk bmap descriptor. */
314 	mp = read_metapage(ipbmap,
315 			   BMAPBLKNO << JFS_SBI(ipbmap->i_sb)->l2nbperpage,
316 			   PSIZE, 0);
317 	if (mp == NULL) {
318 		jfs_err("dbSync: read_metapage failed!");
319 		return -EIO;
320 	}
321 	/* copy the in-memory version of the bmap to the on-disk version */
322 	dbmp_le = (struct dbmap *) mp->data;
323 	dbmp_le->dn_mapsize = cpu_to_le64(bmp->db_mapsize);
324 	dbmp_le->dn_nfree = cpu_to_le64(bmp->db_nfree);
325 	dbmp_le->dn_l2nbperpage = cpu_to_le32(bmp->db_l2nbperpage);
326 	dbmp_le->dn_numag = cpu_to_le32(bmp->db_numag);
327 	dbmp_le->dn_maxlevel = cpu_to_le32(bmp->db_maxlevel);
328 	dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag);
329 	dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref);
330 	dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel);
331 	dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth);
332 	dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth);
333 	dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart);
334 	dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size);
335 	for (i = 0; i < MAXAG; i++)
336 		dbmp_le->dn_agfree[i] = cpu_to_le64(bmp->db_agfree[i]);
337 	dbmp_le->dn_agsize = cpu_to_le64(bmp->db_agsize);
338 	dbmp_le->dn_maxfreebud = bmp->db_maxfreebud;
339 
340 	/* write the buffer */
341 	write_metapage(mp);
342 
343 	/*
344 	 * write out dirty pages of bmap
345 	 */
346 	fsync_inode_data_buffers(ipbmap);
347 
348 	ipbmap->i_state |= I_DIRTY;
349 	diWriteSpecial(ipbmap, 0);
350 
351 	return (0);
352 }
353 
354 
355 /*
356  * NAME:    	dbFree()
357  *
358  * FUNCTION:	free the specified block range from the working block
359  *		allocation map.
360  *
361  *		the blocks will be free from the working map one dmap
362  *		at a time.
363  *
364  * PARAMETERS:
365  *      ip	-  pointer to in-core inode;
366  *      blkno	-  starting block number to be freed.
367  *      nblocks	-  number of blocks to be freed.
368  *
369  * RETURN VALUES:
370  *      0	- success
371  *      -EIO	- i/o error
372  */
dbFree(struct inode * ip,s64 blkno,s64 nblocks)373 int dbFree(struct inode *ip, s64 blkno, s64 nblocks)
374 {
375 	struct metapage *mp;
376 	struct dmap *dp;
377 	int nb, rc;
378 	s64 lblkno, rem;
379 	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
380 	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
381 
382 	IREAD_LOCK(ipbmap);
383 
384 	/* block to be freed better be within the mapsize. */
385 	if (unlikely((blkno == 0) || (blkno + nblocks > bmp->db_mapsize))) {
386 		IREAD_UNLOCK(ipbmap);
387 		printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
388 		       (unsigned long long) blkno,
389 		       (unsigned long long) nblocks);
390 		jfs_error(ip->i_sb,
391 			  "dbFree: block to be freed is outside the map");
392 		return -EIO;
393 	}
394 
395 	/*
396 	 * free the blocks a dmap at a time.
397 	 */
398 	mp = NULL;
399 	for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
400 		/* release previous dmap if any */
401 		if (mp) {
402 			write_metapage(mp);
403 		}
404 
405 		/* get the buffer for the current dmap. */
406 		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
407 		mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
408 		if (mp == NULL) {
409 			IREAD_UNLOCK(ipbmap);
410 			return -EIO;
411 		}
412 		dp = (struct dmap *) mp->data;
413 
414 		/* determine the number of blocks to be freed from
415 		 * this dmap.
416 		 */
417 		nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
418 
419 		DBALLOCCK(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
420 
421 		/* free the blocks. */
422 		if ((rc = dbFreeDmap(bmp, dp, blkno, nb))) {
423 			release_metapage(mp);
424 			IREAD_UNLOCK(ipbmap);
425 			return (rc);
426 		}
427 
428 		DBFREE(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
429 	}
430 
431 	/* write the last buffer. */
432 	write_metapage(mp);
433 
434 	IREAD_UNLOCK(ipbmap);
435 
436 	return (0);
437 }
438 
439 
440 /*
441  * NAME:	dbUpdatePMap()
442  *
443  * FUNCTION:    update the allocation state (free or allocate) of the
444  *		specified block range in the persistent block allocation map.
445  *
446  *		the blocks will be updated in the persistent map one
447  *		dmap at a time.
448  *
449  * PARAMETERS:
450  *      ipbmap	-  pointer to in-core inode for the block map.
451  *      free	- TRUE if block range is to be freed from the persistent
452  *		  map; FALSE if it is to   be allocated.
453  *      blkno	-  starting block number of the range.
454  *      nblocks	-  number of contiguous blocks in the range.
455  *      tblk	-  transaction block;
456  *
457  * RETURN VALUES:
458  *      0	- success
459  *      -EIO	- i/o error
460  */
461 int
dbUpdatePMap(struct inode * ipbmap,int free,s64 blkno,s64 nblocks,struct tblock * tblk)462 dbUpdatePMap(struct inode *ipbmap,
463 	     int free, s64 blkno, s64 nblocks, struct tblock * tblk)
464 {
465 	int nblks, dbitno, wbitno, rbits;
466 	int word, nbits, nwords;
467 	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
468 	s64 lblkno, rem, lastlblkno;
469 	u32 mask;
470 	struct dmap *dp;
471 	struct metapage *mp;
472 	struct jfs_log *log;
473 	int lsn, difft, diffp;
474 
475 	/* the blocks better be within the mapsize. */
476 	if (blkno + nblocks > bmp->db_mapsize) {
477 		printk(KERN_ERR "blkno = %Lx, nblocks = %Lx\n",
478 		       (unsigned long long) blkno,
479 		       (unsigned long long) nblocks);
480 		jfs_error(ipbmap->i_sb,
481 			  "dbUpdatePMap: blocks are outside the map");
482 		return -EIO;
483 	}
484 
485 	/* compute delta of transaction lsn from log syncpt */
486 	lsn = tblk->lsn;
487 	log = (struct jfs_log *) JFS_SBI(tblk->sb)->log;
488 	logdiff(difft, lsn, log);
489 
490 	/*
491 	 * update the block state a dmap at a time.
492 	 */
493 	mp = NULL;
494 	lastlblkno = 0;
495 	for (rem = nblocks; rem > 0; rem -= nblks, blkno += nblks) {
496 		/* get the buffer for the current dmap. */
497 		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
498 		if (lblkno != lastlblkno) {
499 			if (mp) {
500 				write_metapage(mp);
501 			}
502 
503 			mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE,
504 					   0);
505 			if (mp == NULL)
506 				return -EIO;
507 		}
508 		dp = (struct dmap *) mp->data;
509 
510 		/* determine the bit number and word within the dmap of
511 		 * the starting block.  also determine how many blocks
512 		 * are to be updated within this dmap.
513 		 */
514 		dbitno = blkno & (BPERDMAP - 1);
515 		word = dbitno >> L2DBWORD;
516 		nblks = min(rem, (s64)BPERDMAP - dbitno);
517 
518 		/* update the bits of the dmap words. the first and last
519 		 * words may only have a subset of their bits updated. if
520 		 * this is the case, we'll work against that word (i.e.
521 		 * partial first and/or last) only in a single pass.  a
522 		 * single pass will also be used to update all words that
523 		 * are to have all their bits updated.
524 		 */
525 		for (rbits = nblks; rbits > 0;
526 		     rbits -= nbits, dbitno += nbits) {
527 			/* determine the bit number within the word and
528 			 * the number of bits within the word.
529 			 */
530 			wbitno = dbitno & (DBWORD - 1);
531 			nbits = min(rbits, DBWORD - wbitno);
532 
533 			/* check if only part of the word is to be updated. */
534 			if (nbits < DBWORD) {
535 				/* update (free or allocate) the bits
536 				 * in this word.
537 				 */
538 				mask =
539 				    (ONES << (DBWORD - nbits) >> wbitno);
540 				if (free)
541 					dp->pmap[word] &=
542 					    cpu_to_le32(~mask);
543 				else
544 					dp->pmap[word] |=
545 					    cpu_to_le32(mask);
546 
547 				word += 1;
548 			} else {
549 				/* one or more words are to have all
550 				 * their bits updated.  determine how
551 				 * many words and how many bits.
552 				 */
553 				nwords = rbits >> L2DBWORD;
554 				nbits = nwords << L2DBWORD;
555 
556 				/* update (free or allocate) the bits
557 				 * in these words.
558 				 */
559 				if (free)
560 					memset(&dp->pmap[word], 0,
561 					       nwords * 4);
562 				else
563 					memset(&dp->pmap[word], (int) ONES,
564 					       nwords * 4);
565 
566 				word += nwords;
567 			}
568 		}
569 
570 		/*
571 		 * update dmap lsn
572 		 */
573 		if (lblkno == lastlblkno)
574 			continue;
575 
576 		lastlblkno = lblkno;
577 
578 		if (mp->lsn != 0) {
579 			/* inherit older/smaller lsn */
580 			logdiff(diffp, mp->lsn, log);
581 			if (difft < diffp) {
582 				mp->lsn = lsn;
583 
584 				/* move bp after tblock in logsync list */
585 				LOGSYNC_LOCK(log);
586 				list_del(&mp->synclist);
587 				list_add(&mp->synclist, &tblk->synclist);
588 				LOGSYNC_UNLOCK(log);
589 			}
590 
591 			/* inherit younger/larger clsn */
592 			LOGSYNC_LOCK(log);
593 			logdiff(difft, tblk->clsn, log);
594 			logdiff(diffp, mp->clsn, log);
595 			if (difft > diffp)
596 				mp->clsn = tblk->clsn;
597 			LOGSYNC_UNLOCK(log);
598 		} else {
599 			mp->log = log;
600 			mp->lsn = lsn;
601 
602 			/* insert bp after tblock in logsync list */
603 			LOGSYNC_LOCK(log);
604 
605 			log->count++;
606 			list_add(&mp->synclist, &tblk->synclist);
607 
608 			mp->clsn = tblk->clsn;
609 			LOGSYNC_UNLOCK(log);
610 		}
611 	}
612 
613 	/* write the last buffer. */
614 	if (mp) {
615 		write_metapage(mp);
616 	}
617 
618 	return (0);
619 }
620 
621 
622 /*
623  * NAME:	dbNextAG()
624  *
625  * FUNCTION:    find the preferred allocation group for new allocations.
626  *
627  *		Within the allocation groups, we maintain a preferred
628  *		allocation group which consists of a group with at least
629  *		average free space.  It is the preferred group that we target
630  *		new inode allocation towards.  The tie-in between inode
631  *		allocation and block allocation occurs as we allocate the
632  *		first (data) block of an inode and specify the inode (block)
633  *		as the allocation hint for this block.
634  *
635  *		We try to avoid having more than one open file growing in
636  *		an allocation group, as this will lead to fragmentation.
637  *		This differs from the old OS/2 method of trying to keep
638  *		empty ags around for large allocations.
639  *
640  * PARAMETERS:
641  *      ipbmap	-  pointer to in-core inode for the block map.
642  *
643  * RETURN VALUES:
644  *      the preferred allocation group number.
645  */
dbNextAG(struct inode * ipbmap)646 int dbNextAG(struct inode *ipbmap)
647 {
648 	s64 avgfree;
649 	int agpref;
650 	s64 hwm = 0;
651 	int i;
652 	int next_best = -1;
653 	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
654 
655 	BMAP_LOCK(bmp);
656 
657 	/* determine the average number of free blocks within the ags. */
658 	avgfree = (u32)bmp->db_nfree / bmp->db_numag;
659 
660 	/*
661 	 * if the current preferred ag does not have an active allocator
662 	 * and has at least average freespace, return it
663 	 */
664 	agpref = bmp->db_agpref;
665 	if ((atomic_read(&bmp->db_active[agpref]) == 0) &&
666 	    (bmp->db_agfree[agpref] >= avgfree))
667 		goto unlock;
668 
669 	/* From the last preferred ag, find the next one with at least
670 	 * average free space.
671 	 */
672 	for (i = 0 ; i < bmp->db_numag; i++, agpref++) {
673 		if (agpref == bmp->db_numag)
674 			agpref = 0;
675 
676 		if (atomic_read(&bmp->db_active[agpref]))
677 			/* open file is currently growing in this ag */
678 			continue;
679 		if (bmp->db_agfree[agpref] >= avgfree) {
680 			/* Return this one */
681 			bmp->db_agpref = agpref;
682 			goto unlock;
683 		} else if (bmp->db_agfree[agpref] > hwm) {
684 			/* Less than avg. freespace, but best so far */
685 			hwm = bmp->db_agfree[agpref];
686 			next_best = agpref;
687 		}
688 	}
689 
690 	/*
691 	 * If no inactive ag was found with average freespace, use the
692 	 * next best
693 	 */
694 	if (next_best != -1)
695 		bmp->db_agpref = next_best;
696 	/* else leave db_agpref unchanged */
697 unlock:
698 	BMAP_UNLOCK(bmp);
699 
700 	/* return the preferred group.
701 	 */
702 	return (bmp->db_agpref);
703 }
704 
705 /*
706  * NAME:	dbAlloc()
707  *
708  * FUNCTION:    attempt to allocate a specified number of contiguous free
709  *		blocks from the working allocation block map.
710  *
711  *		the block allocation policy uses hints and a multi-step
712  *		approach.
713  *
714  *	  	for allocation requests smaller than the number of blocks
715  *		per dmap, we first try to allocate the new blocks
716  *		immediately following the hint.  if these blocks are not
717  *		available, we try to allocate blocks near the hint.  if
718  *		no blocks near the hint are available, we next try to
719  *		allocate within the same dmap as contains the hint.
720  *
721  *		if no blocks are available in the dmap or the allocation
722  *		request is larger than the dmap size, we try to allocate
723  *		within the same allocation group as contains the hint. if
724  *		this does not succeed, we finally try to allocate anywhere
725  *		within the aggregate.
726  *
727  *		we also try to allocate anywhere within the aggregate for
728  *		for allocation requests larger than the allocation group
729  *		size or requests that specify no hint value.
730  *
731  * PARAMETERS:
732  *      ip	-  pointer to in-core inode;
733  *      hint	- allocation hint.
734  *      nblocks	- number of contiguous blocks in the range.
735  *      results	- on successful return, set to the starting block number
736  *		  of the newly allocated contiguous range.
737  *
738  * RETURN VALUES:
739  *      0	- success
740  *      -ENOSPC	- insufficient disk resources
741  *      -EIO	- i/o error
742  */
dbAlloc(struct inode * ip,s64 hint,s64 nblocks,s64 * results)743 int dbAlloc(struct inode *ip, s64 hint, s64 nblocks, s64 * results)
744 {
745 	int rc, agno;
746 	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
747 	struct bmap *bmp;
748 	struct metapage *mp;
749 	s64 lblkno, blkno;
750 	struct dmap *dp;
751 	int l2nb;
752 	s64 mapSize;
753 	int writers;
754 
755 	/* assert that nblocks is valid */
756 	assert(nblocks > 0);
757 
758 #ifdef _STILL_TO_PORT
759 	/* DASD limit check                                     F226941 */
760 	if (OVER_LIMIT(ip, nblocks))
761 		return -ENOSPC;
762 #endif				/* _STILL_TO_PORT */
763 
764 	/* get the log2 number of blocks to be allocated.
765 	 * if the number of blocks is not a log2 multiple,
766 	 * it will be rounded up to the next log2 multiple.
767 	 */
768 	l2nb = BLKSTOL2(nblocks);
769 
770 	bmp = JFS_SBI(ip->i_sb)->bmap;
771 
772 //retry:        /* serialize w.r.t.extendfs() */
773 	mapSize = bmp->db_mapsize;
774 
775 	/* the hint should be within the map */
776 	if (hint >= mapSize) {
777 		jfs_error(ip->i_sb, "dbAlloc: the hint is outside the map");
778 		return -EIO;
779 	}
780 
781 	/* if the number of blocks to be allocated is greater than the
782 	 * allocation group size, try to allocate anywhere.
783 	 */
784 	if (l2nb > bmp->db_agl2size) {
785 		IWRITE_LOCK(ipbmap);
786 
787 		rc = dbAllocAny(bmp, nblocks, l2nb, results);
788 		if (rc == 0) {
789 			DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results,
790 				nblocks);
791 		}
792 
793 		goto write_unlock;
794 	}
795 
796 	/*
797 	 * If no hint, let dbNextAG recommend an allocation group
798 	 */
799 	if (hint == 0)
800 		goto pref_ag;
801 
802 	/* we would like to allocate close to the hint.  adjust the
803 	 * hint to the block following the hint since the allocators
804 	 * will start looking for free space starting at this point.
805 	 */
806 	blkno = hint + 1;
807 
808 	if (blkno >= bmp->db_mapsize)
809 		goto pref_ag;
810 
811 	agno = blkno >> bmp->db_agl2size;
812 
813 	/* check if blkno crosses over into a new allocation group.
814 	 * if so, check if we should allow allocations within this
815 	 * allocation group.
816 	 */
817 	if ((blkno & (bmp->db_agsize - 1)) == 0)
818 		/* check if the AG is currenly being written to.
819 		 * if so, call dbNextAG() to find a non-busy
820 		 * AG with sufficient free space.
821 		 */
822 		if (atomic_read(&bmp->db_active[agno]))
823 			goto pref_ag;
824 
825 	/* check if the allocation request size can be satisfied from a
826 	 * single dmap.  if so, try to allocate from the dmap containing
827 	 * the hint using a tiered strategy.
828 	 */
829 	if (nblocks <= BPERDMAP) {
830 		IREAD_LOCK(ipbmap);
831 
832 		/* get the buffer for the dmap containing the hint.
833 		 */
834 		rc = -EIO;
835 		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
836 		mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
837 		if (mp == NULL)
838 			goto read_unlock;
839 
840 		dp = (struct dmap *) mp->data;
841 
842 		/* first, try to satisfy the allocation request with the
843 		 * blocks beginning at the hint.
844 		 */
845 		if ((rc = dbAllocNext(bmp, dp, blkno, (int) nblocks))
846 		    != -ENOSPC) {
847 			if (rc == 0) {
848 				*results = blkno;
849 				DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
850 					*results, nblocks);
851 				mark_metapage_dirty(mp);
852 			}
853 
854 			release_metapage(mp);
855 			goto read_unlock;
856 		}
857 
858 		writers = atomic_read(&bmp->db_active[agno]);
859 		if ((writers > 1) ||
860 		    ((writers == 1) && (JFS_IP(ip)->active_ag != agno))) {
861 			/*
862 			 * Someone else is writing in this allocation
863 			 * group.  To avoid fragmenting, try another ag
864 			 */
865 			release_metapage(mp);
866 			IREAD_UNLOCK(ipbmap);
867 			goto pref_ag;
868 		}
869 
870 		/* next, try to satisfy the allocation request with blocks
871 		 * near the hint.
872 		 */
873 		if ((rc =
874 		     dbAllocNear(bmp, dp, blkno, (int) nblocks, l2nb, results))
875 		    != -ENOSPC) {
876 			if (rc == 0) {
877 				DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
878 					*results, nblocks);
879 				mark_metapage_dirty(mp);
880 			}
881 
882 			release_metapage(mp);
883 			goto read_unlock;
884 		}
885 
886 		/* try to satisfy the allocation request with blocks within
887 		 * the same dmap as the hint.
888 		 */
889 		if ((rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results))
890 		    != -ENOSPC) {
891 			if (rc == 0) {
892 				DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
893 					*results, nblocks);
894 				mark_metapage_dirty(mp);
895 			}
896 
897 			release_metapage(mp);
898 			goto read_unlock;
899 		}
900 
901 		release_metapage(mp);
902 		IREAD_UNLOCK(ipbmap);
903 	}
904 
905 	/* try to satisfy the allocation request with blocks within
906 	 * the same allocation group as the hint.
907 	 */
908 	IWRITE_LOCK(ipbmap);
909 	if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results))
910 	    != -ENOSPC) {
911 		if (rc == 0)
912 			DBALLOC(bmp->db_DBmap, bmp->db_mapsize,
913 				*results, nblocks);
914 		goto write_unlock;
915 	}
916 	IWRITE_UNLOCK(ipbmap);
917 
918 
919       pref_ag:
920 	/*
921 	 * Let dbNextAG recommend a preferred allocation group
922 	 */
923 	agno = dbNextAG(ipbmap);
924 	IWRITE_LOCK(ipbmap);
925 
926 	/* Try to allocate within this allocation group.  if that fails, try to
927 	 * allocate anywhere in the map.
928 	 */
929 	if ((rc = dbAllocAG(bmp, agno, nblocks, l2nb, results)) == -ENOSPC)
930 		rc = dbAllocAny(bmp, nblocks, l2nb, results);
931 	if (rc == 0) {
932 		DBALLOC(bmp->db_DBmap, bmp->db_mapsize, *results, nblocks);
933 	}
934 
935       write_unlock:
936 	IWRITE_UNLOCK(ipbmap);
937 
938 	return (rc);
939 
940       read_unlock:
941 	IREAD_UNLOCK(ipbmap);
942 
943 	return (rc);
944 }
945 
946 #ifdef _NOTYET
947 /*
948  * NAME:	dbAllocExact()
949  *
950  * FUNCTION:    try to allocate the requested extent;
951  *
952  * PARAMETERS:
953  *      ip	- pointer to in-core inode;
954  *      blkno	- extent address;
955  *      nblocks	- extent length;
956  *
957  * RETURN VALUES:
958  *      0	- success
959  *      -ENOSPC	- insufficient disk resources
960  *      -EIO	- i/o error
961  */
dbAllocExact(struct inode * ip,s64 blkno,int nblocks)962 int dbAllocExact(struct inode *ip, s64 blkno, int nblocks)
963 {
964 	int rc;
965 	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
966 	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
967 	struct dmap *dp;
968 	s64 lblkno;
969 	struct metapage *mp;
970 
971 	IREAD_LOCK(ipbmap);
972 
973 	/*
974 	 * validate extent request:
975 	 *
976 	 * note: defragfs policy:
977 	 *  max 64 blocks will be moved.
978 	 *  allocation request size must be satisfied from a single dmap.
979 	 */
980 	if (nblocks <= 0 || nblocks > BPERDMAP || blkno >= bmp->db_mapsize) {
981 		IREAD_UNLOCK(ipbmap);
982 		return -EINVAL;
983 	}
984 
985 	if (nblocks > ((s64) 1 << bmp->db_maxfreebud)) {
986 		/* the free space is no longer available */
987 		IREAD_UNLOCK(ipbmap);
988 		return -ENOSPC;
989 	}
990 
991 	/* read in the dmap covering the extent */
992 	lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
993 	mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
994 	if (mp == NULL) {
995 		IREAD_UNLOCK(ipbmap);
996 		return -EIO;
997 	}
998 	dp = (struct dmap *) mp->data;
999 
1000 	/* try to allocate the requested extent */
1001 	rc = dbAllocNext(bmp, dp, blkno, nblocks);
1002 
1003 	IREAD_UNLOCK(ipbmap);
1004 
1005 	if (rc == 0) {
1006 		DBALLOC(bmp->db_DBmap, bmp->db_mapsize, blkno, nblocks);
1007 		mark_metapage_dirty(mp);
1008 	}
1009 	release_metapage(mp);
1010 
1011 	return (rc);
1012 }
1013 #endif /* _NOTYET */
1014 
1015 /*
1016  * NAME:	dbReAlloc()
1017  *
1018  * FUNCTION:    attempt to extend a current allocation by a specified
1019  *		number of blocks.
1020  *
1021  *		this routine attempts to satisfy the allocation request
1022  *		by first trying to extend the existing allocation in
1023  *		place by allocating the additional blocks as the blocks
1024  *		immediately following the current allocation.  if these
1025  *		blocks are not available, this routine will attempt to
1026  *		allocate a new set of contiguous blocks large enough
1027  *		to cover the existing allocation plus the additional
1028  *		number of blocks required.
1029  *
1030  * PARAMETERS:
1031  *      ip	    -  pointer to in-core inode requiring allocation.
1032  *      blkno	    -  starting block of the current allocation.
1033  *      nblocks	    -  number of contiguous blocks within the current
1034  *		       allocation.
1035  *      addnblocks  -  number of blocks to add to the allocation.
1036  *      results	-      on successful return, set to the starting block number
1037  *		       of the existing allocation if the existing allocation
1038  *		       was extended in place or to a newly allocated contiguous
1039  *		       range if the existing allocation could not be extended
1040  *		       in place.
1041  *
1042  * RETURN VALUES:
1043  *      0	- success
1044  *      -ENOSPC	- insufficient disk resources
1045  *      -EIO	- i/o error
1046  */
1047 int
dbReAlloc(struct inode * ip,s64 blkno,s64 nblocks,s64 addnblocks,s64 * results)1048 dbReAlloc(struct inode *ip,
1049 	  s64 blkno, s64 nblocks, s64 addnblocks, s64 * results)
1050 {
1051 	int rc;
1052 
1053 	/* try to extend the allocation in place.
1054 	 */
1055 	if ((rc = dbExtend(ip, blkno, nblocks, addnblocks)) == 0) {
1056 		*results = blkno;
1057 		return (0);
1058 	} else {
1059 		if (rc != -ENOSPC)
1060 			return (rc);
1061 	}
1062 
1063 	/* could not extend the allocation in place, so allocate a
1064 	 * new set of blocks for the entire request (i.e. try to get
1065 	 * a range of contiguous blocks large enough to cover the
1066 	 * existing allocation plus the additional blocks.)
1067 	 */
1068 	return (dbAlloc
1069 		(ip, blkno + nblocks - 1, addnblocks + nblocks, results));
1070 }
1071 
1072 
1073 /*
1074  * NAME:	dbExtend()
1075  *
1076  * FUNCTION:    attempt to extend a current allocation by a specified
1077  *		number of blocks.
1078  *
1079  *		this routine attempts to satisfy the allocation request
1080  *		by first trying to extend the existing allocation in
1081  *		place by allocating the additional blocks as the blocks
1082  *		immediately following the current allocation.
1083  *
1084  * PARAMETERS:
1085  *      ip	    -  pointer to in-core inode requiring allocation.
1086  *      blkno	    -  starting block of the current allocation.
1087  *      nblocks	    -  number of contiguous blocks within the current
1088  *		       allocation.
1089  *      addnblocks  -  number of blocks to add to the allocation.
1090  *
1091  * RETURN VALUES:
1092  *      0	- success
1093  *      -ENOSPC	- insufficient disk resources
1094  *      -EIO	- i/o error
1095  */
dbExtend(struct inode * ip,s64 blkno,s64 nblocks,s64 addnblocks)1096 static int dbExtend(struct inode *ip, s64 blkno, s64 nblocks, s64 addnblocks)
1097 {
1098 	struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb);
1099 	s64 lblkno, lastblkno, extblkno;
1100 	uint rel_block;
1101 	struct metapage *mp;
1102 	struct dmap *dp;
1103 	int rc;
1104 	struct inode *ipbmap = sbi->ipbmap;
1105 	struct bmap *bmp;
1106 
1107 	/*
1108 	 * We don't want a non-aligned extent to cross a page boundary
1109 	 */
1110 	if (((rel_block = blkno & (sbi->nbperpage - 1))) &&
1111 	    (rel_block + nblocks + addnblocks > sbi->nbperpage))
1112 		return -ENOSPC;
1113 
1114 	/* get the last block of the current allocation */
1115 	lastblkno = blkno + nblocks - 1;
1116 
1117 	/* determine the block number of the block following
1118 	 * the existing allocation.
1119 	 */
1120 	extblkno = lastblkno + 1;
1121 
1122 	IREAD_LOCK(ipbmap);
1123 
1124 	/* better be within the file system */
1125 	bmp = sbi->bmap;
1126 	if (lastblkno < 0 || lastblkno >= bmp->db_mapsize) {
1127 		IREAD_UNLOCK(ipbmap);
1128 		jfs_error(ip->i_sb,
1129 			  "dbExtend: the block is outside the filesystem");
1130 		return -EIO;
1131 	}
1132 
1133 	/* we'll attempt to extend the current allocation in place by
1134 	 * allocating the additional blocks as the blocks immediately
1135 	 * following the current allocation.  we only try to extend the
1136 	 * current allocation in place if the number of additional blocks
1137 	 * can fit into a dmap, the last block of the current allocation
1138 	 * is not the last block of the file system, and the start of the
1139 	 * inplace extension is not on an allocation group boundry.
1140 	 */
1141 	if (addnblocks > BPERDMAP || extblkno >= bmp->db_mapsize ||
1142 	    (extblkno & (bmp->db_agsize - 1)) == 0) {
1143 		IREAD_UNLOCK(ipbmap);
1144 		return -ENOSPC;
1145 	}
1146 
1147 	/* get the buffer for the dmap containing the first block
1148 	 * of the extension.
1149 	 */
1150 	lblkno = BLKTODMAP(extblkno, bmp->db_l2nbperpage);
1151 	mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
1152 	if (mp == NULL) {
1153 		IREAD_UNLOCK(ipbmap);
1154 		return -EIO;
1155 	}
1156 
1157 	DBALLOCCK(bmp->db_DBmap, bmp->db_mapsize, blkno, nblocks);
1158 	dp = (struct dmap *) mp->data;
1159 
1160 	/* try to allocate the blocks immediately following the
1161 	 * current allocation.
1162 	 */
1163 	rc = dbAllocNext(bmp, dp, extblkno, (int) addnblocks);
1164 
1165 	IREAD_UNLOCK(ipbmap);
1166 
1167 	/* were we successful ? */
1168 	if (rc == 0) {
1169 		DBALLOC(bmp->db_DBmap, bmp->db_mapsize, extblkno,
1170 			addnblocks);
1171 		write_metapage(mp);
1172 	} else
1173 		/* we were not successful */
1174 		release_metapage(mp);
1175 
1176 
1177 	return (rc);
1178 }
1179 
1180 
1181 /*
1182  * NAME:	dbAllocNext()
1183  *
1184  * FUNCTION:    attempt to allocate the blocks of the specified block
1185  *		range within a dmap.
1186  *
1187  * PARAMETERS:
1188  *      bmp	-  pointer to bmap descriptor
1189  *      dp	-  pointer to dmap.
1190  *      blkno	-  starting block number of the range.
1191  *      nblocks	-  number of contiguous free blocks of the range.
1192  *
1193  * RETURN VALUES:
1194  *      0	- success
1195  *      -ENOSPC	- insufficient disk resources
1196  *      -EIO	- i/o error
1197  *
1198  * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
1199  */
dbAllocNext(struct bmap * bmp,struct dmap * dp,s64 blkno,int nblocks)1200 static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
1201 		       int nblocks)
1202 {
1203 	int dbitno, word, rembits, nb, nwords, wbitno, nw;
1204 	int l2size;
1205 	s8 *leaf;
1206 	u32 mask;
1207 
1208 	if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
1209 		jfs_error(bmp->db_ipbmap->i_sb,
1210 			  "dbAllocNext: Corrupt dmap page");
1211 		return -EIO;
1212 	}
1213 
1214 	/* pick up a pointer to the leaves of the dmap tree.
1215 	 */
1216 	leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
1217 
1218 	/* determine the bit number and word within the dmap of the
1219 	 * starting block.
1220 	 */
1221 	dbitno = blkno & (BPERDMAP - 1);
1222 	word = dbitno >> L2DBWORD;
1223 
1224 	/* check if the specified block range is contained within
1225 	 * this dmap.
1226 	 */
1227 	if (dbitno + nblocks > BPERDMAP)
1228 		return -ENOSPC;
1229 
1230 	/* check if the starting leaf indicates that anything
1231 	 * is free.
1232 	 */
1233 	if (leaf[word] == NOFREE)
1234 		return -ENOSPC;
1235 
1236 	/* check the dmaps words corresponding to block range to see
1237 	 * if the block range is free.  not all bits of the first and
1238 	 * last words may be contained within the block range.  if this
1239 	 * is the case, we'll work against those words (i.e. partial first
1240 	 * and/or last) on an individual basis (a single pass) and examine
1241 	 * the actual bits to determine if they are free.  a single pass
1242 	 * will be used for all dmap words fully contained within the
1243 	 * specified range.  within this pass, the leaves of the dmap
1244 	 * tree will be examined to determine if the blocks are free. a
1245 	 * single leaf may describe the free space of multiple dmap
1246 	 * words, so we may visit only a subset of the actual leaves
1247 	 * corresponding to the dmap words of the block range.
1248 	 */
1249 	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
1250 		/* determine the bit number within the word and
1251 		 * the number of bits within the word.
1252 		 */
1253 		wbitno = dbitno & (DBWORD - 1);
1254 		nb = min(rembits, DBWORD - wbitno);
1255 
1256 		/* check if only part of the word is to be examined.
1257 		 */
1258 		if (nb < DBWORD) {
1259 			/* check if the bits are free.
1260 			 */
1261 			mask = (ONES << (DBWORD - nb) >> wbitno);
1262 			if ((mask & ~le32_to_cpu(dp->wmap[word])) != mask)
1263 				return -ENOSPC;
1264 
1265 			word += 1;
1266 		} else {
1267 			/* one or more dmap words are fully contained
1268 			 * within the block range.  determine how many
1269 			 * words and how many bits.
1270 			 */
1271 			nwords = rembits >> L2DBWORD;
1272 			nb = nwords << L2DBWORD;
1273 
1274 			/* now examine the appropriate leaves to determine
1275 			 * if the blocks are free.
1276 			 */
1277 			while (nwords > 0) {
1278 				/* does the leaf describe any free space ?
1279 				 */
1280 				if (leaf[word] < BUDMIN)
1281 					return -ENOSPC;
1282 
1283 				/* determine the l2 number of bits provided
1284 				 * by this leaf.
1285 				 */
1286 				l2size =
1287 				    min((int)leaf[word], NLSTOL2BSZ(nwords));
1288 
1289 				/* determine how many words were handled.
1290 				 */
1291 				nw = BUDSIZE(l2size, BUDMIN);
1292 
1293 				nwords -= nw;
1294 				word += nw;
1295 			}
1296 		}
1297 	}
1298 
1299 	/* allocate the blocks.
1300 	 */
1301 	return (dbAllocDmap(bmp, dp, blkno, nblocks));
1302 }
1303 
1304 
1305 /*
1306  * NAME:	dbAllocNear()
1307  *
1308  * FUNCTION:    attempt to allocate a number of contiguous free blocks near
1309  *		a specified block (hint) within a dmap.
1310  *
1311  *		starting with the dmap leaf that covers the hint, we'll
1312  *		check the next four contiguous leaves for sufficient free
1313  *		space.  if sufficient free space is found, we'll allocate
1314  *		the desired free space.
1315  *
1316  * PARAMETERS:
1317  *      bmp	-  pointer to bmap descriptor
1318  *      dp	-  pointer to dmap.
1319  *      blkno	-  block number to allocate near.
1320  *      nblocks	-  actual number of contiguous free blocks desired.
1321  *      l2nb	-  log2 number of contiguous free blocks desired.
1322  *      results	-  on successful return, set to the starting block number
1323  *		   of the newly allocated range.
1324  *
1325  * RETURN VALUES:
1326  *      0	- success
1327  *      -ENOSPC	- insufficient disk resources
1328  *      -EIO	- i/o error
1329  *
1330  * serialization: IREAD_LOCK(ipbmap) held on entry/exit;
1331  */
1332 static int
dbAllocNear(struct bmap * bmp,struct dmap * dp,s64 blkno,int nblocks,int l2nb,s64 * results)1333 dbAllocNear(struct bmap * bmp,
1334 	    struct dmap * dp, s64 blkno, int nblocks, int l2nb, s64 * results)
1335 {
1336 	int word, lword, rc;
1337 	s8 *leaf;
1338 
1339 	if (dp->tree.leafidx != cpu_to_le32(LEAFIND)) {
1340 		jfs_error(bmp->db_ipbmap->i_sb,
1341 			  "dbAllocNear: Corrupt dmap page");
1342 		return -EIO;
1343 	}
1344 
1345 	leaf = dp->tree.stree + le32_to_cpu(dp->tree.leafidx);
1346 
1347 	/* determine the word within the dmap that holds the hint
1348 	 * (i.e. blkno).  also, determine the last word in the dmap
1349 	 * that we'll include in our examination.
1350 	 */
1351 	word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
1352 	lword = min(word + 4, LPERDMAP);
1353 
1354 	/* examine the leaves for sufficient free space.
1355 	 */
1356 	for (; word < lword; word++) {
1357 		/* does the leaf describe sufficient free space ?
1358 		 */
1359 		if (leaf[word] < l2nb)
1360 			continue;
1361 
1362 		/* determine the block number within the file system
1363 		 * of the first block described by this dmap word.
1364 		 */
1365 		blkno = le64_to_cpu(dp->start) + (word << L2DBWORD);
1366 
1367 		/* if not all bits of the dmap word are free, get the
1368 		 * starting bit number within the dmap word of the required
1369 		 * string of free bits and adjust the block number with the
1370 		 * value.
1371 		 */
1372 		if (leaf[word] < BUDMIN)
1373 			blkno +=
1374 			    dbFindBits(le32_to_cpu(dp->wmap[word]), l2nb);
1375 
1376 		/* allocate the blocks.
1377 		 */
1378 		if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0)
1379 			*results = blkno;
1380 
1381 		return (rc);
1382 	}
1383 
1384 	return -ENOSPC;
1385 }
1386 
1387 
1388 /*
1389  * NAME:	dbAllocAG()
1390  *
1391  * FUNCTION:    attempt to allocate the specified number of contiguous
1392  *		free blocks within the specified allocation group.
1393  *
1394  *		unless the allocation group size is equal to the number
1395  *		of blocks per dmap, the dmap control pages will be used to
1396  *		find the required free space, if available.  we start the
1397  *		search at the highest dmap control page level which
1398  *		distinctly describes the allocation group's free space
1399  *		(i.e. the highest level at which the allocation group's
1400  *		free space is not mixed in with that of any other group).
1401  *		in addition, we start the search within this level at a
1402  *		height of the dmapctl dmtree at which the nodes distinctly
1403  *		describe the allocation group's free space.  at this height,
1404  *		the allocation group's free space may be represented by 1
1405  *		or two sub-trees, depending on the allocation group size.
1406  *		we search the top nodes of these subtrees left to right for
1407  *		sufficient free space.  if sufficient free space is found,
1408  *		the subtree is searched to find the leftmost leaf that
1409  *		has free space.  once we have made it to the leaf, we
1410  *		move the search to the next lower level dmap control page
1411  *		corresponding to this leaf.  we continue down the dmap control
1412  *		pages until we find the dmap that contains or starts the
1413  *		sufficient free space and we allocate at this dmap.
1414  *
1415  *		if the allocation group size is equal to the dmap size,
1416  *		we'll start at the dmap corresponding to the allocation
1417  *		group and attempt the allocation at this level.
1418  *
1419  *		the dmap control page search is also not performed if the
1420  *		allocation group is completely free and we go to the first
1421  *		dmap of the allocation group to do the allocation.  this is
1422  *		done because the allocation group may be part (not the first
1423  *		part) of a larger binary buddy system, causing the dmap
1424  *		control pages to indicate no free space (NOFREE) within
1425  *		the allocation group.
1426  *
1427  * PARAMETERS:
1428  *      bmp	-  pointer to bmap descriptor
1429  *	agno	- allocation group number.
1430  *      nblocks	-  actual number of contiguous free blocks desired.
1431  *      l2nb	-  log2 number of contiguous free blocks desired.
1432  *      results	-  on successful return, set to the starting block number
1433  *		   of the newly allocated range.
1434  *
1435  * RETURN VALUES:
1436  *      0	- success
1437  *      -ENOSPC	- insufficient disk resources
1438  *      -EIO	- i/o error
1439  *
1440  * note: IWRITE_LOCK(ipmap) held on entry/exit;
1441  */
1442 static int
dbAllocAG(struct bmap * bmp,int agno,s64 nblocks,int l2nb,s64 * results)1443 dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results)
1444 {
1445 	struct metapage *mp;
1446 	struct dmapctl *dcp;
1447 	int rc, ti, i, k, m, n, agperlev;
1448 	s64 blkno, lblkno;
1449 	int budmin;
1450 
1451 	/* allocation request should not be for more than the
1452 	 * allocation group size.
1453 	 */
1454 	if (l2nb > bmp->db_agl2size) {
1455 		jfs_error(bmp->db_ipbmap->i_sb,
1456 			  "dbAllocAG: allocation request is larger than the "
1457 			  "allocation group size");
1458 		return -EIO;
1459 	}
1460 
1461 	/* determine the starting block number of the allocation
1462 	 * group.
1463 	 */
1464 	blkno = (s64) agno << bmp->db_agl2size;
1465 
1466 	/* check if the allocation group size is the minimum allocation
1467 	 * group size or if the allocation group is completely free. if
1468 	 * the allocation group size is the minimum size of BPERDMAP (i.e.
1469 	 * 1 dmap), there is no need to search the dmap control page (below)
1470 	 * that fully describes the allocation group since the allocation
1471 	 * group is already fully described by a dmap.  in this case, we
1472 	 * just call dbAllocCtl() to search the dmap tree and allocate the
1473 	 * required space if available.
1474 	 *
1475 	 * if the allocation group is completely free, dbAllocCtl() is
1476 	 * also called to allocate the required space.  this is done for
1477 	 * two reasons.  first, it makes no sense searching the dmap control
1478 	 * pages for free space when we know that free space exists.  second,
1479 	 * the dmap control pages may indicate that the allocation group
1480 	 * has no free space if the allocation group is part (not the first
1481 	 * part) of a larger binary buddy system.
1482 	 */
1483 	if (bmp->db_agsize == BPERDMAP
1484 	    || bmp->db_agfree[agno] == bmp->db_agsize) {
1485 		rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1486 		if ((rc == -ENOSPC) &&
1487 		    (bmp->db_agfree[agno] == bmp->db_agsize)) {
1488 			printk(KERN_ERR "blkno = %Lx, blocks = %Lx\n",
1489 			       (unsigned long long) blkno,
1490 			       (unsigned long long) nblocks);
1491 			jfs_error(bmp->db_ipbmap->i_sb,
1492 				  "dbAllocAG: dbAllocCtl failed in free AG");
1493 		}
1494 		return (rc);
1495 	}
1496 
1497 	/* the buffer for the dmap control page that fully describes the
1498 	 * allocation group.
1499 	 */
1500 	lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, bmp->db_aglevel);
1501 	mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1502 	if (mp == NULL)
1503 		return -EIO;
1504 	dcp = (struct dmapctl *) mp->data;
1505 	budmin = dcp->budmin;
1506 
1507 	if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
1508 		jfs_error(bmp->db_ipbmap->i_sb,
1509 			  "dbAllocAG: Corrupt dmapctl page");
1510 		release_metapage(mp);
1511 		return -EIO;
1512 	}
1513 
1514 	/* search the subtree(s) of the dmap control page that describes
1515 	 * the allocation group, looking for sufficient free space.  to begin,
1516 	 * determine how many allocation groups are represented in a dmap
1517 	 * control page at the control page level (i.e. L0, L1, L2) that
1518 	 * fully describes an allocation group. next, determine the starting
1519 	 * tree index of this allocation group within the control page.
1520 	 */
1521 	agperlev =
1522 	    (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth;
1523 	ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1));
1524 
1525 	/* dmap control page trees fan-out by 4 and a single allocation
1526 	 * group may be described by 1 or 2 subtrees within the ag level
1527 	 * dmap control page, depending upon the ag size. examine the ag's
1528 	 * subtrees for sufficient free space, starting with the leftmost
1529 	 * subtree.
1530 	 */
1531 	for (i = 0; i < bmp->db_agwidth; i++, ti++) {
1532 		/* is there sufficient free space ?
1533 		 */
1534 		if (l2nb > dcp->stree[ti])
1535 			continue;
1536 
1537 		/* sufficient free space found in a subtree. now search down
1538 		 * the subtree to find the leftmost leaf that describes this
1539 		 * free space.
1540 		 */
1541 		for (k = bmp->db_agheigth; k > 0; k--) {
1542 			for (n = 0, m = (ti << 2) + 1; n < 4; n++) {
1543 				if (l2nb <= dcp->stree[m + n]) {
1544 					ti = m + n;
1545 					break;
1546 				}
1547 			}
1548 			if (n == 4) {
1549 				jfs_error(bmp->db_ipbmap->i_sb,
1550 					  "dbAllocAG: failed descending stree");
1551 				release_metapage(mp);
1552 				return -EIO;
1553 			}
1554 		}
1555 
1556 		/* determine the block number within the file system
1557 		 * that corresponds to this leaf.
1558 		 */
1559 		if (bmp->db_aglevel == 2)
1560 			blkno = 0;
1561 		else if (bmp->db_aglevel == 1)
1562 			blkno &= ~(MAXL1SIZE - 1);
1563 		else		/* bmp->db_aglevel == 0 */
1564 			blkno &= ~(MAXL0SIZE - 1);
1565 
1566 		blkno +=
1567 		    ((s64) (ti - le32_to_cpu(dcp->leafidx))) << budmin;
1568 
1569 		/* release the buffer in preparation for going down
1570 		 * the next level of dmap control pages.
1571 		 */
1572 		release_metapage(mp);
1573 
1574 		/* check if we need to continue to search down the lower
1575 		 * level dmap control pages.  we need to if the number of
1576 		 * blocks required is less than maximum number of blocks
1577 		 * described at the next lower level.
1578 		 */
1579 		if (l2nb < budmin) {
1580 
1581 			/* search the lower level dmap control pages to get
1582 			 * the starting block number of the the dmap that
1583 			 * contains or starts off the free space.
1584 			 */
1585 			if ((rc =
1586 			     dbFindCtl(bmp, l2nb, bmp->db_aglevel - 1,
1587 				       &blkno))) {
1588 				if (rc == -ENOSPC) {
1589 					jfs_error(bmp->db_ipbmap->i_sb,
1590 						  "dbAllocAG: control page "
1591 						  "inconsistent");
1592 					return -EIO;
1593 				}
1594 				return (rc);
1595 			}
1596 		}
1597 
1598 		/* allocate the blocks.
1599 		 */
1600 		rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1601 		if (rc == -ENOSPC) {
1602 			jfs_error(bmp->db_ipbmap->i_sb,
1603 				  "dbAllocAG: unable to allocate blocks");
1604 			rc = -EIO;
1605 		}
1606 		return (rc);
1607 	}
1608 
1609 	/* no space in the allocation group.  release the buffer and
1610 	 * return -ENOSPC.
1611 	 */
1612 	release_metapage(mp);
1613 
1614 	return -ENOSPC;
1615 }
1616 
1617 
1618 /*
1619  * NAME:	dbAllocAny()
1620  *
1621  * FUNCTION:    attempt to allocate the specified number of contiguous
1622  *		free blocks anywhere in the file system.
1623  *
1624  *		dbAllocAny() attempts to find the sufficient free space by
1625  *		searching down the dmap control pages, starting with the
1626  *		highest level (i.e. L0, L1, L2) control page.  if free space
1627  *		large enough to satisfy the desired free space is found, the
1628  *		desired free space is allocated.
1629  *
1630  * PARAMETERS:
1631  *      bmp	-  pointer to bmap descriptor
1632  *      nblocks	 -  actual number of contiguous free blocks desired.
1633  *      l2nb	 -  log2 number of contiguous free blocks desired.
1634  *      results	-  on successful return, set to the starting block number
1635  *		   of the newly allocated range.
1636  *
1637  * RETURN VALUES:
1638  *      0	- success
1639  *      -ENOSPC	- insufficient disk resources
1640  *      -EIO	- i/o error
1641  *
1642  * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1643  */
dbAllocAny(struct bmap * bmp,s64 nblocks,int l2nb,s64 * results)1644 static int dbAllocAny(struct bmap * bmp, s64 nblocks, int l2nb, s64 * results)
1645 {
1646 	int rc;
1647 	s64 blkno = 0;
1648 
1649 	/* starting with the top level dmap control page, search
1650 	 * down the dmap control levels for sufficient free space.
1651 	 * if free space is found, dbFindCtl() returns the starting
1652 	 * block number of the dmap that contains or starts off the
1653 	 * range of free space.
1654 	 */
1655 	if ((rc = dbFindCtl(bmp, l2nb, bmp->db_maxlevel, &blkno)))
1656 		return (rc);
1657 
1658 	/* allocate the blocks.
1659 	 */
1660 	rc = dbAllocCtl(bmp, nblocks, l2nb, blkno, results);
1661 	if (rc == -ENOSPC) {
1662 		jfs_error(bmp->db_ipbmap->i_sb,
1663 			  "dbAllocAny: unable to allocate blocks");
1664 		return -EIO;
1665 	}
1666 	return (rc);
1667 }
1668 
1669 
1670 /*
1671  * NAME:	dbFindCtl()
1672  *
1673  * FUNCTION:    starting at a specified dmap control page level and block
1674  *		number, search down the dmap control levels for a range of
1675  *	        contiguous free blocks large enough to satisfy an allocation
1676  *		request for the specified number of free blocks.
1677  *
1678  *		if sufficient contiguous free blocks are found, this routine
1679  *		returns the starting block number within a dmap page that
1680  *		contains or starts a range of contiqious free blocks that
1681  *		is sufficient in size.
1682  *
1683  * PARAMETERS:
1684  *      bmp	-  pointer to bmap descriptor
1685  *      level	-  starting dmap control page level.
1686  *      l2nb	-  log2 number of contiguous free blocks desired.
1687  *      *blkno	-  on entry, starting block number for conducting the search.
1688  *		   on successful return, the first block within a dmap page
1689  *		   that contains or starts a range of contiguous free blocks.
1690  *
1691  * RETURN VALUES:
1692  *      0	- success
1693  *      -ENOSPC	- insufficient disk resources
1694  *      -EIO	- i/o error
1695  *
1696  * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1697  */
dbFindCtl(struct bmap * bmp,int l2nb,int level,s64 * blkno)1698 static int dbFindCtl(struct bmap * bmp, int l2nb, int level, s64 * blkno)
1699 {
1700 	int rc, leafidx, lev;
1701 	s64 b, lblkno;
1702 	struct dmapctl *dcp;
1703 	int budmin;
1704 	struct metapage *mp;
1705 
1706 	/* starting at the specified dmap control page level and block
1707 	 * number, search down the dmap control levels for the starting
1708 	 * block number of a dmap page that contains or starts off
1709 	 * sufficient free blocks.
1710 	 */
1711 	for (lev = level, b = *blkno; lev >= 0; lev--) {
1712 		/* get the buffer of the dmap control page for the block
1713 		 * number and level (i.e. L0, L1, L2).
1714 		 */
1715 		lblkno = BLKTOCTL(b, bmp->db_l2nbperpage, lev);
1716 		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1717 		if (mp == NULL)
1718 			return -EIO;
1719 		dcp = (struct dmapctl *) mp->data;
1720 		budmin = dcp->budmin;
1721 
1722 		if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
1723 			jfs_error(bmp->db_ipbmap->i_sb,
1724 				  "dbFindCtl: Corrupt dmapctl page");
1725 			release_metapage(mp);
1726 			return -EIO;
1727 		}
1728 
1729 		/* search the tree within the dmap control page for
1730 		 * sufficent free space.  if sufficient free space is found,
1731 		 * dbFindLeaf() returns the index of the leaf at which
1732 		 * free space was found.
1733 		 */
1734 		rc = dbFindLeaf((dmtree_t *) dcp, l2nb, &leafidx);
1735 
1736 		/* release the buffer.
1737 		 */
1738 		release_metapage(mp);
1739 
1740 		/* space found ?
1741 		 */
1742 		if (rc) {
1743 			if (lev != level) {
1744 				jfs_error(bmp->db_ipbmap->i_sb,
1745 					  "dbFindCtl: dmap inconsistent");
1746 				return -EIO;
1747 			}
1748 			return -ENOSPC;
1749 		}
1750 
1751 		/* adjust the block number to reflect the location within
1752 		 * the dmap control page (i.e. the leaf) at which free
1753 		 * space was found.
1754 		 */
1755 		b += (((s64) leafidx) << budmin);
1756 
1757 		/* we stop the search at this dmap control page level if
1758 		 * the number of blocks required is greater than or equal
1759 		 * to the maximum number of blocks described at the next
1760 		 * (lower) level.
1761 		 */
1762 		if (l2nb >= budmin)
1763 			break;
1764 	}
1765 
1766 	*blkno = b;
1767 	return (0);
1768 }
1769 
1770 
1771 /*
1772  * NAME:	dbAllocCtl()
1773  *
1774  * FUNCTION:    attempt to allocate a specified number of contiguous
1775  *		blocks starting within a specific dmap.
1776  *
1777  *		this routine is called by higher level routines that search
1778  *		the dmap control pages above the actual dmaps for contiguous
1779  *		free space.  the result of successful searches by these
1780  * 		routines are the starting block numbers within dmaps, with
1781  *		the dmaps themselves containing the desired contiguous free
1782  *		space or starting a contiguous free space of desired size
1783  *		that is made up of the blocks of one or more dmaps. these
1784  *		calls should not fail due to insufficent resources.
1785  *
1786  *		this routine is called in some cases where it is not known
1787  *		whether it will fail due to insufficient resources.  more
1788  *		specifically, this occurs when allocating from an allocation
1789  *		group whose size is equal to the number of blocks per dmap.
1790  *		in this case, the dmap control pages are not examined prior
1791  *		to calling this routine (to save pathlength) and the call
1792  *		might fail.
1793  *
1794  *		for a request size that fits within a dmap, this routine relies
1795  *		upon the dmap's dmtree to find the requested contiguous free
1796  *		space.  for request sizes that are larger than a dmap, the
1797  *		requested free space will start at the first block of the
1798  *		first dmap (i.e. blkno).
1799  *
1800  * PARAMETERS:
1801  *      bmp	-  pointer to bmap descriptor
1802  *      nblocks	 -  actual number of contiguous free blocks to allocate.
1803  *      l2nb	 -  log2 number of contiguous free blocks to allocate.
1804  *      blkno	 -  starting block number of the dmap to start the allocation
1805  *		    from.
1806  *      results	-  on successful return, set to the starting block number
1807  *		   of the newly allocated range.
1808  *
1809  * RETURN VALUES:
1810  *      0	- success
1811  *      -ENOSPC	- insufficient disk resources
1812  *      -EIO	- i/o error
1813  *
1814  * serialization: IWRITE_LOCK(ipbmap) held on entry/exit;
1815  */
1816 static int
dbAllocCtl(struct bmap * bmp,s64 nblocks,int l2nb,s64 blkno,s64 * results)1817 dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
1818 {
1819 	int rc, nb;
1820 	s64 b, lblkno, n;
1821 	struct metapage *mp;
1822 	struct dmap *dp;
1823 
1824 	/* check if the allocation request is confined to a single dmap.
1825 	 */
1826 	if (l2nb <= L2BPERDMAP) {
1827 		/* get the buffer for the dmap.
1828 		 */
1829 		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
1830 		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1831 		if (mp == NULL)
1832 			return -EIO;
1833 		dp = (struct dmap *) mp->data;
1834 
1835 		/* try to allocate the blocks.
1836 		 */
1837 		rc = dbAllocDmapLev(bmp, dp, (int) nblocks, l2nb, results);
1838 		if (rc == 0)
1839 			mark_metapage_dirty(mp);
1840 
1841 		release_metapage(mp);
1842 
1843 		return (rc);
1844 	}
1845 
1846 	/* allocation request involving multiple dmaps. it must start on
1847 	 * a dmap boundary.
1848 	 */
1849 	assert((blkno & (BPERDMAP - 1)) == 0);
1850 
1851 	/* allocate the blocks dmap by dmap.
1852 	 */
1853 	for (n = nblocks, b = blkno; n > 0; n -= nb, b += nb) {
1854 		/* get the buffer for the dmap.
1855 		 */
1856 		lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
1857 		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1858 		if (mp == NULL) {
1859 			rc = -EIO;
1860 			goto backout;
1861 		}
1862 		dp = (struct dmap *) mp->data;
1863 
1864 		/* the dmap better be all free.
1865 		 */
1866 		if (dp->tree.stree[ROOT] != L2BPERDMAP) {
1867 			release_metapage(mp);
1868 			jfs_error(bmp->db_ipbmap->i_sb,
1869 				  "dbAllocCtl: the dmap is not all free");
1870 			rc = -EIO;
1871 			goto backout;
1872 		}
1873 
1874 		/* determine how many blocks to allocate from this dmap.
1875 		 */
1876 		nb = min(n, (s64)BPERDMAP);
1877 
1878 		/* allocate the blocks from the dmap.
1879 		 */
1880 		if ((rc = dbAllocDmap(bmp, dp, b, nb))) {
1881 			release_metapage(mp);
1882 			goto backout;
1883 		}
1884 
1885 		/* write the buffer.
1886 		 */
1887 		write_metapage(mp);
1888 	}
1889 
1890 	/* set the results (starting block number) and return.
1891 	 */
1892 	*results = blkno;
1893 	return (0);
1894 
1895 	/* something failed in handling an allocation request involving
1896 	 * multiple dmaps.  we'll try to clean up by backing out any
1897 	 * allocation that has already happened for this request.  if
1898 	 * we fail in backing out the allocation, we'll mark the file
1899 	 * system to indicate that blocks have been leaked.
1900 	 */
1901       backout:
1902 
1903 	/* try to backout the allocations dmap by dmap.
1904 	 */
1905 	for (n = nblocks - n, b = blkno; n > 0;
1906 	     n -= BPERDMAP, b += BPERDMAP) {
1907 		/* get the buffer for this dmap.
1908 		 */
1909 		lblkno = BLKTODMAP(b, bmp->db_l2nbperpage);
1910 		mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
1911 		if (mp == NULL) {
1912 			/* could not back out.  mark the file system
1913 			 * to indicate that we have leaked blocks.
1914 			 */
1915 			jfs_error(bmp->db_ipbmap->i_sb,
1916 				  "dbAllocCtl: I/O Error: Block Leakage.");
1917 			continue;
1918 		}
1919 		dp = (struct dmap *) mp->data;
1920 
1921 		/* free the blocks is this dmap.
1922 		 */
1923 		if (dbFreeDmap(bmp, dp, b, BPERDMAP)) {
1924 			/* could not back out.  mark the file system
1925 			 * to indicate that we have leaked blocks.
1926 			 */
1927 			release_metapage(mp);
1928 			jfs_error(bmp->db_ipbmap->i_sb,
1929 				  "dbAllocCtl: Block Leakage.");
1930 			continue;
1931 		}
1932 
1933 		/* write the buffer.
1934 		 */
1935 		write_metapage(mp);
1936 	}
1937 
1938 	return (rc);
1939 }
1940 
1941 
1942 /*
1943  * NAME:	dbAllocDmapLev()
1944  *
1945  * FUNCTION:    attempt to allocate a specified number of contiguous blocks
1946  *		from a specified dmap.
1947  *
1948  *		this routine checks if the contiguous blocks are available.
1949  *		if so, nblocks of blocks are allocated; otherwise, ENOSPC is
1950  *		returned.
1951  *
1952  * PARAMETERS:
1953  *      mp	-  pointer to bmap descriptor
1954  *      dp	-  pointer to dmap to attempt to allocate blocks from.
1955  *      l2nb	-  log2 number of contiguous block desired.
1956  *      nblocks	-  actual number of contiguous block desired.
1957  *      results	-  on successful return, set to the starting block number
1958  *		   of the newly allocated range.
1959  *
1960  * RETURN VALUES:
1961  *      0	- success
1962  *      -ENOSPC	- insufficient disk resources
1963  *      -EIO	- i/o error
1964  *
1965  * serialization: IREAD_LOCK(ipbmap), e.g., from dbAlloc(), or
1966  *	IWRITE_LOCK(ipbmap), e.g., dbAllocCtl(), held on entry/exit;
1967  */
1968 static int
dbAllocDmapLev(struct bmap * bmp,struct dmap * dp,int nblocks,int l2nb,s64 * results)1969 dbAllocDmapLev(struct bmap * bmp,
1970 	       struct dmap * dp, int nblocks, int l2nb, s64 * results)
1971 {
1972 	s64 blkno;
1973 	int leafidx, rc;
1974 
1975 	/* can't be more than a dmaps worth of blocks */
1976 	assert(l2nb <= L2BPERDMAP);
1977 
1978 	/* search the tree within the dmap page for sufficient
1979 	 * free space.  if sufficient free space is found, dbFindLeaf()
1980 	 * returns the index of the leaf at which free space was found.
1981 	 */
1982 	if (dbFindLeaf((dmtree_t *) & dp->tree, l2nb, &leafidx))
1983 		return -ENOSPC;
1984 
1985 	/* determine the block number within the file system corresponding
1986 	 * to the leaf at which free space was found.
1987 	 */
1988 	blkno = le64_to_cpu(dp->start) + (leafidx << L2DBWORD);
1989 
1990 	/* if not all bits of the dmap word are free, get the starting
1991 	 * bit number within the dmap word of the required string of free
1992 	 * bits and adjust the block number with this value.
1993 	 */
1994 	if (dp->tree.stree[leafidx + LEAFIND] < BUDMIN)
1995 		blkno += dbFindBits(le32_to_cpu(dp->wmap[leafidx]), l2nb);
1996 
1997 	/* allocate the blocks */
1998 	if ((rc = dbAllocDmap(bmp, dp, blkno, nblocks)) == 0)
1999 		*results = blkno;
2000 
2001 	return (rc);
2002 }
2003 
2004 
2005 /*
2006  * NAME:	dbAllocDmap()
2007  *
2008  * FUNCTION:    adjust the disk allocation map to reflect the allocation
2009  *		of a specified block range within a dmap.
2010  *
2011  *		this routine allocates the specified blocks from the dmap
2012  *		through a call to dbAllocBits(). if the allocation of the
2013  *		block range causes the maximum string of free blocks within
2014  *		the dmap to change (i.e. the value of the root of the dmap's
2015  *		dmtree), this routine will cause this change to be reflected
2016  *		up through the appropriate levels of the dmap control pages
2017  *		by a call to dbAdjCtl() for the L0 dmap control page that
2018  *		covers this dmap.
2019  *
2020  * PARAMETERS:
2021  *      bmp	-  pointer to bmap descriptor
2022  *      dp	-  pointer to dmap to allocate the block range from.
2023  *      blkno	-  starting block number of the block to be allocated.
2024  *      nblocks	-  number of blocks to be allocated.
2025  *
2026  * RETURN VALUES:
2027  *      0	- success
2028  *      -EIO	- i/o error
2029  *
2030  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2031  */
dbAllocDmap(struct bmap * bmp,struct dmap * dp,s64 blkno,int nblocks)2032 static int dbAllocDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
2033 		       int nblocks)
2034 {
2035 	s8 oldroot;
2036 	int rc;
2037 
2038 	/* save the current value of the root (i.e. maximum free string)
2039 	 * of the dmap tree.
2040 	 */
2041 	oldroot = dp->tree.stree[ROOT];
2042 
2043 	/* allocate the specified (blocks) bits */
2044 	dbAllocBits(bmp, dp, blkno, nblocks);
2045 
2046 	/* if the root has not changed, done. */
2047 	if (dp->tree.stree[ROOT] == oldroot)
2048 		return (0);
2049 
2050 	/* root changed. bubble the change up to the dmap control pages.
2051 	 * if the adjustment of the upper level control pages fails,
2052 	 * backout the bit allocation (thus making everything consistent).
2053 	 */
2054 	if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 1, 0)))
2055 		dbFreeBits(bmp, dp, blkno, nblocks);
2056 
2057 	return (rc);
2058 }
2059 
2060 
2061 /*
2062  * NAME:	dbFreeDmap()
2063  *
2064  * FUNCTION:    adjust the disk allocation map to reflect the allocation
2065  *		of a specified block range within a dmap.
2066  *
2067  *		this routine frees the specified blocks from the dmap through
2068  *		a call to dbFreeBits(). if the deallocation of the block range
2069  *		causes the maximum string of free blocks within the dmap to
2070  *		change (i.e. the value of the root of the dmap's dmtree), this
2071  *		routine will cause this change to be reflected up through the
2072  *	        appropriate levels of the dmap control pages by a call to
2073  *		dbAdjCtl() for the L0 dmap control page that covers this dmap.
2074  *
2075  * PARAMETERS:
2076  *      bmp	-  pointer to bmap descriptor
2077  *      dp	-  pointer to dmap to free the block range from.
2078  *      blkno	-  starting block number of the block to be freed.
2079  *      nblocks	-  number of blocks to be freed.
2080  *
2081  * RETURN VALUES:
2082  *      0	- success
2083  *      -EIO	- i/o error
2084  *
2085  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2086  */
dbFreeDmap(struct bmap * bmp,struct dmap * dp,s64 blkno,int nblocks)2087 static int dbFreeDmap(struct bmap * bmp, struct dmap * dp, s64 blkno,
2088 		      int nblocks)
2089 {
2090 	s8 oldroot;
2091 	int rc, word;
2092 
2093 	/* save the current value of the root (i.e. maximum free string)
2094 	 * of the dmap tree.
2095 	 */
2096 	oldroot = dp->tree.stree[ROOT];
2097 
2098 	/* free the specified (blocks) bits */
2099 	dbFreeBits(bmp, dp, blkno, nblocks);
2100 
2101 	/* if the root has not changed, done. */
2102 	if (dp->tree.stree[ROOT] == oldroot)
2103 		return (0);
2104 
2105 	/* root changed. bubble the change up to the dmap control pages.
2106 	 * if the adjustment of the upper level control pages fails,
2107 	 * backout the deallocation.
2108 	 */
2109 	if ((rc = dbAdjCtl(bmp, blkno, dp->tree.stree[ROOT], 0, 0))) {
2110 		word = (blkno & (BPERDMAP - 1)) >> L2DBWORD;
2111 
2112 		/* as part of backing out the deallocation, we will have
2113 		 * to back split the dmap tree if the deallocation caused
2114 		 * the freed blocks to become part of a larger binary buddy
2115 		 * system.
2116 		 */
2117 		if (dp->tree.stree[word] == NOFREE)
2118 			dbBackSplit((dmtree_t *) & dp->tree, word);
2119 
2120 		dbAllocBits(bmp, dp, blkno, nblocks);
2121 	}
2122 
2123 	return (rc);
2124 }
2125 
2126 
2127 /*
2128  * NAME:	dbAllocBits()
2129  *
2130  * FUNCTION:    allocate a specified block range from a dmap.
2131  *
2132  *		this routine updates the dmap to reflect the working
2133  *		state allocation of the specified block range. it directly
2134  *		updates the bits of the working map and causes the adjustment
2135  *		of the binary buddy system described by the dmap's dmtree
2136  *		leaves to reflect the bits allocated.  it also causes the
2137  *		dmap's dmtree, as a whole, to reflect the allocated range.
2138  *
2139  * PARAMETERS:
2140  *      bmp	-  pointer to bmap descriptor
2141  *      dp	-  pointer to dmap to allocate bits from.
2142  *      blkno	-  starting block number of the bits to be allocated.
2143  *      nblocks	-  number of bits to be allocated.
2144  *
2145  * RETURN VALUES: none
2146  *
2147  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2148  */
dbAllocBits(struct bmap * bmp,struct dmap * dp,s64 blkno,int nblocks)2149 static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2150 			int nblocks)
2151 {
2152 	int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
2153 	dmtree_t *tp = (dmtree_t *) & dp->tree;
2154 	int size;
2155 	s8 *leaf;
2156 
2157 	/* pick up a pointer to the leaves of the dmap tree */
2158 	leaf = dp->tree.stree + LEAFIND;
2159 
2160 	/* determine the bit number and word within the dmap of the
2161 	 * starting block.
2162 	 */
2163 	dbitno = blkno & (BPERDMAP - 1);
2164 	word = dbitno >> L2DBWORD;
2165 
2166 	/* block range better be within the dmap */
2167 	assert(dbitno + nblocks <= BPERDMAP);
2168 
2169 	/* allocate the bits of the dmap's words corresponding to the block
2170 	 * range. not all bits of the first and last words may be contained
2171 	 * within the block range.  if this is the case, we'll work against
2172 	 * those words (i.e. partial first and/or last) on an individual basis
2173 	 * (a single pass), allocating the bits of interest by hand and
2174 	 * updating the leaf corresponding to the dmap word. a single pass
2175 	 * will be used for all dmap words fully contained within the
2176 	 * specified range.  within this pass, the bits of all fully contained
2177 	 * dmap words will be marked as free in a single shot and the leaves
2178 	 * will be updated. a single leaf may describe the free space of
2179 	 * multiple dmap words, so we may update only a subset of the actual
2180 	 * leaves corresponding to the dmap words of the block range.
2181 	 */
2182 	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
2183 		/* determine the bit number within the word and
2184 		 * the number of bits within the word.
2185 		 */
2186 		wbitno = dbitno & (DBWORD - 1);
2187 		nb = min(rembits, DBWORD - wbitno);
2188 
2189 		/* check if only part of a word is to be allocated.
2190 		 */
2191 		if (nb < DBWORD) {
2192 			/* allocate (set to 1) the appropriate bits within
2193 			 * this dmap word.
2194 			 */
2195 			dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
2196 						      >> wbitno);
2197 
2198 			/* update the leaf for this dmap word. in addition
2199 			 * to setting the leaf value to the binary buddy max
2200 			 * of the updated dmap word, dbSplit() will split
2201 			 * the binary system of the leaves if need be.
2202 			 */
2203 			dbSplit(tp, word, BUDMIN,
2204 				dbMaxBud((u8 *) & dp->wmap[word]));
2205 
2206 			word += 1;
2207 		} else {
2208 			/* one or more dmap words are fully contained
2209 			 * within the block range.  determine how many
2210 			 * words and allocate (set to 1) the bits of these
2211 			 * words.
2212 			 */
2213 			nwords = rembits >> L2DBWORD;
2214 			memset(&dp->wmap[word], (int) ONES, nwords * 4);
2215 
2216 			/* determine how many bits.
2217 			 */
2218 			nb = nwords << L2DBWORD;
2219 
2220 			/* now update the appropriate leaves to reflect
2221 			 * the allocated words.
2222 			 */
2223 			for (; nwords > 0; nwords -= nw) {
2224 			        if (leaf[word] < BUDMIN) {
2225 					jfs_error(bmp->db_ipbmap->i_sb,
2226 						  "dbAllocBits: leaf page "
2227 						  "corrupt");
2228 					break;
2229 				}
2230 
2231 				/* determine what the leaf value should be
2232 				 * updated to as the minimum of the l2 number
2233 				 * of bits being allocated and the l2 number
2234 				 * of bits currently described by this leaf.
2235 				 */
2236 				size = min((int)leaf[word], NLSTOL2BSZ(nwords));
2237 
2238 				/* update the leaf to reflect the allocation.
2239 				 * in addition to setting the leaf value to
2240 				 * NOFREE, dbSplit() will split the binary
2241 				 * system of the leaves to reflect the current
2242 				 * allocation (size).
2243 				 */
2244 				dbSplit(tp, word, size, NOFREE);
2245 
2246 				/* get the number of dmap words handled */
2247 				nw = BUDSIZE(size, BUDMIN);
2248 				word += nw;
2249 			}
2250 		}
2251 	}
2252 
2253 	/* update the free count for this dmap */
2254 	dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks);
2255 
2256 	BMAP_LOCK(bmp);
2257 
2258 	/* if this allocation group is completely free,
2259 	 * update the maximum allocation group number if this allocation
2260 	 * group is the new max.
2261 	 */
2262 	agno = blkno >> bmp->db_agl2size;
2263 	if (agno > bmp->db_maxag)
2264 		bmp->db_maxag = agno;
2265 
2266 	/* update the free count for the allocation group and map */
2267 	bmp->db_agfree[agno] -= nblocks;
2268 	bmp->db_nfree -= nblocks;
2269 
2270 	BMAP_UNLOCK(bmp);
2271 }
2272 
2273 
2274 /*
2275  * NAME:	dbFreeBits()
2276  *
2277  * FUNCTION:    free a specified block range from a dmap.
2278  *
2279  *		this routine updates the dmap to reflect the working
2280  *		state allocation of the specified block range. it directly
2281  *		updates the bits of the working map and causes the adjustment
2282  *		of the binary buddy system described by the dmap's dmtree
2283  *		leaves to reflect the bits freed.  it also causes the dmap's
2284  *		dmtree, as a whole, to reflect the deallocated range.
2285  *
2286  * PARAMETERS:
2287  *      bmp	-  pointer to bmap descriptor
2288  *      dp	-  pointer to dmap to free bits from.
2289  *      blkno	-  starting block number of the bits to be freed.
2290  *      nblocks	-  number of bits to be freed.
2291  *
2292  * RETURN VALUES: none
2293  *
2294  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2295  */
dbFreeBits(struct bmap * bmp,struct dmap * dp,s64 blkno,int nblocks)2296 static void dbFreeBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
2297 		       int nblocks)
2298 {
2299 	int dbitno, word, rembits, nb, nwords, wbitno, nw, agno;
2300 	dmtree_t *tp = (dmtree_t *) & dp->tree;
2301 	int size;
2302 
2303 	/* determine the bit number and word within the dmap of the
2304 	 * starting block.
2305 	 */
2306 	dbitno = blkno & (BPERDMAP - 1);
2307 	word = dbitno >> L2DBWORD;
2308 
2309 	/* block range better be within the dmap.
2310 	 */
2311 	assert(dbitno + nblocks <= BPERDMAP);
2312 
2313 	/* free the bits of the dmaps words corresponding to the block range.
2314 	 * not all bits of the first and last words may be contained within
2315 	 * the block range.  if this is the case, we'll work against those
2316 	 * words (i.e. partial first and/or last) on an individual basis
2317 	 * (a single pass), freeing the bits of interest by hand and updating
2318 	 * the leaf corresponding to the dmap word. a single pass will be used
2319 	 * for all dmap words fully contained within the specified range.
2320 	 * within this pass, the bits of all fully contained dmap words will
2321 	 * be marked as free in a single shot and the leaves will be updated. a
2322 	 * single leaf may describe the free space of multiple dmap words,
2323 	 * so we may update only a subset of the actual leaves corresponding
2324 	 * to the dmap words of the block range.
2325 	 *
2326 	 * dbJoin() is used to update leaf values and will join the binary
2327 	 * buddy system of the leaves if the new leaf values indicate this
2328 	 * should be done.
2329 	 */
2330 	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
2331 		/* determine the bit number within the word and
2332 		 * the number of bits within the word.
2333 		 */
2334 		wbitno = dbitno & (DBWORD - 1);
2335 		nb = min(rembits, DBWORD - wbitno);
2336 
2337 		/* check if only part of a word is to be freed.
2338 		 */
2339 		if (nb < DBWORD) {
2340 			/* free (zero) the appropriate bits within this
2341 			 * dmap word.
2342 			 */
2343 			dp->wmap[word] &=
2344 			    cpu_to_le32(~(ONES << (DBWORD - nb)
2345 					  >> wbitno));
2346 
2347 			/* update the leaf for this dmap word.
2348 			 */
2349 			dbJoin(tp, word,
2350 			       dbMaxBud((u8 *) & dp->wmap[word]));
2351 
2352 			word += 1;
2353 		} else {
2354 			/* one or more dmap words are fully contained
2355 			 * within the block range.  determine how many
2356 			 * words and free (zero) the bits of these words.
2357 			 */
2358 			nwords = rembits >> L2DBWORD;
2359 			memset(&dp->wmap[word], 0, nwords * 4);
2360 
2361 			/* determine how many bits.
2362 			 */
2363 			nb = nwords << L2DBWORD;
2364 
2365 			/* now update the appropriate leaves to reflect
2366 			 * the freed words.
2367 			 */
2368 			for (; nwords > 0; nwords -= nw) {
2369 				/* determine what the leaf value should be
2370 				 * updated to as the minimum of the l2 number
2371 				 * of bits being freed and the l2 (max) number
2372 				 * of bits that can be described by this leaf.
2373 				 */
2374 				size =
2375 				    min(LITOL2BSZ
2376 					(word, L2LPERDMAP, BUDMIN),
2377 					NLSTOL2BSZ(nwords));
2378 
2379 				/* update the leaf.
2380 				 */
2381 				dbJoin(tp, word, size);
2382 
2383 				/* get the number of dmap words handled.
2384 				 */
2385 				nw = BUDSIZE(size, BUDMIN);
2386 				word += nw;
2387 			}
2388 		}
2389 	}
2390 
2391 	/* update the free count for this dmap.
2392 	 */
2393 	dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
2394 
2395 	BMAP_LOCK(bmp);
2396 
2397 	/* update the free count for the allocation group and
2398 	 * map.
2399 	 */
2400 	agno = blkno >> bmp->db_agl2size;
2401 	bmp->db_nfree += nblocks;
2402 	bmp->db_agfree[agno] += nblocks;
2403 
2404 	/* check if this allocation group is not completely free and
2405 	 * if it is currently the maximum (rightmost) allocation group.
2406 	 * if so, establish the new maximum allocation group number by
2407 	 * searching left for the first allocation group with allocation.
2408 	 */
2409 	if ((bmp->db_agfree[agno] == bmp->db_agsize && agno == bmp->db_maxag) ||
2410 	    (agno == bmp->db_numag - 1 &&
2411 	     bmp->db_agfree[agno] == (bmp-> db_mapsize & (BPERDMAP - 1)))) {
2412 		while (bmp->db_maxag > 0) {
2413 			bmp->db_maxag -= 1;
2414 			if (bmp->db_agfree[bmp->db_maxag] !=
2415 			    bmp->db_agsize)
2416 				break;
2417 		}
2418 
2419 		/* re-establish the allocation group preference if the
2420 		 * current preference is right of the maximum allocation
2421 		 * group.
2422 		 */
2423 		if (bmp->db_agpref > bmp->db_maxag)
2424 			bmp->db_agpref = bmp->db_maxag;
2425 	}
2426 
2427 	BMAP_UNLOCK(bmp);
2428 }
2429 
2430 
2431 /*
2432  * NAME:	dbAdjCtl()
2433  *
2434  * FUNCTION:	adjust a dmap control page at a specified level to reflect
2435  *		the change in a lower level dmap or dmap control page's
2436  *		maximum string of free blocks (i.e. a change in the root
2437  *		of the lower level object's dmtree) due to the allocation
2438  *		or deallocation of a range of blocks with a single dmap.
2439  *
2440  *		on entry, this routine is provided with the new value of
2441  *		the lower level dmap or dmap control page root and the
2442  *		starting block number of the block range whose allocation
2443  *		or deallocation resulted in the root change.  this range
2444  *		is respresented by a single leaf of the current dmapctl
2445  *		and the leaf will be updated with this value, possibly
2446  *		causing a binary buddy system within the leaves to be
2447  *		split or joined.  the update may also cause the dmapctl's
2448  *		dmtree to be updated.
2449  *
2450  *		if the adjustment of the dmap control page, itself, causes its
2451  *		root to change, this change will be bubbled up to the next dmap
2452  *		control level by a recursive call to this routine, specifying
2453  *		the new root value and the next dmap control page level to
2454  *		be adjusted.
2455  * PARAMETERS:
2456  *      bmp	-  pointer to bmap descriptor
2457  *      blkno	-  the first block of a block range within a dmap.  it is
2458  *		   the allocation or deallocation of this block range that
2459  *		   requires the dmap control page to be adjusted.
2460  *      newval	-  the new value of the lower level dmap or dmap control
2461  *		   page root.
2462  *      alloc	-  TRUE if adjustment is due to an allocation.
2463  *      level	-  current level of dmap control page (i.e. L0, L1, L2) to
2464  *		   be adjusted.
2465  *
2466  * RETURN VALUES:
2467  *      0	- success
2468  *      -EIO	- i/o error
2469  *
2470  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2471  */
2472 static int
dbAdjCtl(struct bmap * bmp,s64 blkno,int newval,int alloc,int level)2473 dbAdjCtl(struct bmap * bmp, s64 blkno, int newval, int alloc, int level)
2474 {
2475 	struct metapage *mp;
2476 	s8 oldroot;
2477 	int oldval;
2478 	s64 lblkno;
2479 	struct dmapctl *dcp;
2480 	int rc, leafno, ti;
2481 
2482 	/* get the buffer for the dmap control page for the specified
2483 	 * block number and control page level.
2484 	 */
2485 	lblkno = BLKTOCTL(blkno, bmp->db_l2nbperpage, level);
2486 	mp = read_metapage(bmp->db_ipbmap, lblkno, PSIZE, 0);
2487 	if (mp == NULL)
2488 		return -EIO;
2489 	dcp = (struct dmapctl *) mp->data;
2490 
2491 	if (dcp->leafidx != cpu_to_le32(CTLLEAFIND)) {
2492 		jfs_error(bmp->db_ipbmap->i_sb,
2493 			  "dbAdjCtl: Corrupt dmapctl page");
2494 		release_metapage(mp);
2495 		return -EIO;
2496 	}
2497 
2498 	/* determine the leaf number corresponding to the block and
2499 	 * the index within the dmap control tree.
2500 	 */
2501 	leafno = BLKTOCTLLEAF(blkno, dcp->budmin);
2502 	ti = leafno + le32_to_cpu(dcp->leafidx);
2503 
2504 	/* save the current leaf value and the current root level (i.e.
2505 	 * maximum l2 free string described by this dmapctl).
2506 	 */
2507 	oldval = dcp->stree[ti];
2508 	oldroot = dcp->stree[ROOT];
2509 
2510 	/* check if this is a control page update for an allocation.
2511 	 * if so, update the leaf to reflect the new leaf value using
2512 	 * dbSplit(); otherwise (deallocation), use dbJoin() to udpate
2513 	 * the leaf with the new value.  in addition to updating the
2514 	 * leaf, dbSplit() will also split the binary buddy system of
2515 	 * the leaves, if required, and bubble new values within the
2516 	 * dmapctl tree, if required.  similarly, dbJoin() will join
2517 	 * the binary buddy system of leaves and bubble new values up
2518 	 * the dmapctl tree as required by the new leaf value.
2519 	 */
2520 	if (alloc) {
2521 		/* check if we are in the middle of a binary buddy
2522 		 * system.  this happens when we are performing the
2523 		 * first allocation out of an allocation group that
2524 		 * is part (not the first part) of a larger binary
2525 		 * buddy system.  if we are in the middle, back split
2526 		 * the system prior to calling dbSplit() which assumes
2527 		 * that it is at the front of a binary buddy system.
2528 		 */
2529 		if (oldval == NOFREE) {
2530 			dbBackSplit((dmtree_t *) dcp, leafno);
2531 			oldval = dcp->stree[ti];
2532 		}
2533 		dbSplit((dmtree_t *) dcp, leafno, dcp->budmin, newval);
2534 	} else {
2535 		dbJoin((dmtree_t *) dcp, leafno, newval);
2536 	}
2537 
2538 	/* check if the root of the current dmap control page changed due
2539 	 * to the update and if the current dmap control page is not at
2540 	 * the current top level (i.e. L0, L1, L2) of the map.  if so (i.e.
2541 	 * root changed and this is not the top level), call this routine
2542 	 * again (recursion) for the next higher level of the mapping to
2543 	 * reflect the change in root for the current dmap control page.
2544 	 */
2545 	if (dcp->stree[ROOT] != oldroot) {
2546 		/* are we below the top level of the map.  if so,
2547 		 * bubble the root up to the next higher level.
2548 		 */
2549 		if (level < bmp->db_maxlevel) {
2550 			/* bubble up the new root of this dmap control page to
2551 			 * the next level.
2552 			 */
2553 			if ((rc =
2554 			     dbAdjCtl(bmp, blkno, dcp->stree[ROOT], alloc,
2555 				      level + 1))) {
2556 				/* something went wrong in bubbling up the new
2557 				 * root value, so backout the changes to the
2558 				 * current dmap control page.
2559 				 */
2560 				if (alloc) {
2561 					dbJoin((dmtree_t *) dcp, leafno,
2562 					       oldval);
2563 				} else {
2564 					/* the dbJoin() above might have
2565 					 * caused a larger binary buddy system
2566 					 * to form and we may now be in the
2567 					 * middle of it.  if this is the case,
2568 					 * back split the buddies.
2569 					 */
2570 					if (dcp->stree[ti] == NOFREE)
2571 						dbBackSplit((dmtree_t *)
2572 							    dcp, leafno);
2573 					dbSplit((dmtree_t *) dcp, leafno,
2574 						dcp->budmin, oldval);
2575 				}
2576 
2577 				/* release the buffer and return the error.
2578 				 */
2579 				release_metapage(mp);
2580 				return (rc);
2581 			}
2582 		} else {
2583 			/* we're at the top level of the map. update
2584 			 * the bmap control page to reflect the size
2585 			 * of the maximum free buddy system.
2586 			 */
2587 			assert(level == bmp->db_maxlevel);
2588 			if (bmp->db_maxfreebud != oldroot) {
2589 				jfs_error(bmp->db_ipbmap->i_sb,
2590 					  "dbAdjCtl: the maximum free buddy is "
2591 					  "not the old root");
2592 			}
2593 			bmp->db_maxfreebud = dcp->stree[ROOT];
2594 		}
2595 	}
2596 
2597 	/* write the buffer.
2598 	 */
2599 	write_metapage(mp);
2600 
2601 	return (0);
2602 }
2603 
2604 
2605 /*
2606  * NAME:	dbSplit()
2607  *
2608  * FUNCTION:    update the leaf of a dmtree with a new value, splitting
2609  *		the leaf from the binary buddy system of the dmtree's
2610  *		leaves, as required.
2611  *
2612  * PARAMETERS:
2613  *      tp	- pointer to the tree containing the leaf.
2614  *      leafno	- the number of the leaf to be updated.
2615  *      splitsz	- the size the binary buddy system starting at the leaf
2616  *		  must be split to, specified as the log2 number of blocks.
2617  *      newval	- the new value for the leaf.
2618  *
2619  * RETURN VALUES: none
2620  *
2621  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2622  */
dbSplit(dmtree_t * tp,int leafno,int splitsz,int newval)2623 static void dbSplit(dmtree_t * tp, int leafno, int splitsz, int newval)
2624 {
2625 	int budsz;
2626 	int cursz;
2627 	s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
2628 
2629 	/* check if the leaf needs to be split.
2630 	 */
2631 	if (leaf[leafno] > tp->dmt_budmin) {
2632 		/* the split occurs by cutting the buddy system in half
2633 		 * at the specified leaf until we reach the specified
2634 		 * size.  pick up the starting split size (current size
2635 		 * - 1 in l2) and the corresponding buddy size.
2636 		 */
2637 		cursz = leaf[leafno] - 1;
2638 		budsz = BUDSIZE(cursz, tp->dmt_budmin);
2639 
2640 		/* split until we reach the specified size.
2641 		 */
2642 		while (cursz >= splitsz) {
2643 			/* update the buddy's leaf with its new value.
2644 			 */
2645 			dbAdjTree(tp, leafno ^ budsz, cursz);
2646 
2647 			/* on to the next size and buddy.
2648 			 */
2649 			cursz -= 1;
2650 			budsz >>= 1;
2651 		}
2652 	}
2653 
2654 	/* adjust the dmap tree to reflect the specified leaf's new
2655 	 * value.
2656 	 */
2657 	dbAdjTree(tp, leafno, newval);
2658 }
2659 
2660 
2661 /*
2662  * NAME:	dbBackSplit()
2663  *
2664  * FUNCTION:    back split the binary buddy system of dmtree leaves
2665  *		that hold a specified leaf until the specified leaf
2666  *		starts its own binary buddy system.
2667  *
2668  *		the allocators typically perform allocations at the start
2669  *		of binary buddy systems and dbSplit() is used to accomplish
2670  *		any required splits.  in some cases, however, allocation
2671  *		may occur in the middle of a binary system and requires a
2672  *		back split, with the split proceeding out from the middle of
2673  *		the system (less efficient) rather than the start of the
2674  *		system (more efficient).  the cases in which a back split
2675  *		is required are rare and are limited to the first allocation
2676  *		within an allocation group which is a part (not first part)
2677  *		of a larger binary buddy system and a few exception cases
2678  *		in which a previous join operation must be backed out.
2679  *
2680  * PARAMETERS:
2681  *      tp	- pointer to the tree containing the leaf.
2682  *      leafno	- the number of the leaf to be updated.
2683  *
2684  * RETURN VALUES: none
2685  *
2686  * serialization: IREAD_LOCK(ipbmap) or IWRITE_LOCK(ipbmap) held on entry/exit;
2687  */
dbBackSplit(dmtree_t * tp,int leafno)2688 static void dbBackSplit(dmtree_t * tp, int leafno)
2689 {
2690 	int budsz, bud, w, bsz, size;
2691 	int cursz;
2692 	s8 *leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
2693 
2694 	/* leaf should be part (not first part) of a binary
2695 	 * buddy system.
2696 	 */
2697 	assert(leaf[leafno] == NOFREE);
2698 
2699 	/* the back split is accomplished by iteratively finding the leaf
2700 	 * that starts the buddy system that contains the specified leaf and
2701 	 * splitting that system in two.  this iteration continues until
2702 	 * the specified leaf becomes the start of a buddy system.
2703 	 *
2704 	 * determine maximum possible l2 size for the specified leaf.
2705 	 */
2706 	size =
2707 	    LITOL2BSZ(leafno, le32_to_cpu(tp->dmt_l2nleafs),
2708 		      tp->dmt_budmin);
2709 
2710 	/* determine the number of leaves covered by this size.  this
2711 	 * is the buddy size that we will start with as we search for
2712 	 * the buddy system that contains the specified leaf.
2713 	 */
2714 	budsz = BUDSIZE(size, tp->dmt_budmin);
2715 
2716 	/* back split.
2717 	 */
2718 	while (leaf[leafno] == NOFREE) {
2719 		/* find the leftmost buddy leaf.
2720 		 */
2721 		for (w = leafno, bsz = budsz;; bsz <<= 1,
2722 		     w = (w < bud) ? w : bud) {
2723 			assert(bsz < le32_to_cpu(tp->dmt_nleafs));
2724 
2725 			/* determine the buddy.
2726 			 */
2727 			bud = w ^ bsz;
2728 
2729 			/* check if this buddy is the start of the system.
2730 			 */
2731 			if (leaf[bud] != NOFREE) {
2732 				/* split the leaf at the start of the
2733 				 * system in two.
2734 				 */
2735 				cursz = leaf[bud] - 1;
2736 				dbSplit(tp, bud, cursz, cursz);
2737 				break;
2738 			}
2739 		}
2740 	}
2741 
2742 	assert(leaf[leafno] == size);
2743 }
2744 
2745 
2746 /*
2747  * NAME:	dbJoin()
2748  *
2749  * FUNCTION:    update the leaf of a dmtree with a new value, joining
2750  *		the leaf with other leaves of the dmtree into a multi-leaf
2751  *		binary buddy system, as required.
2752  *
2753  * PARAMETERS:
2754  *      tp	- pointer to the tree containing the leaf.
2755  *      leafno	- the number of the leaf to be updated.
2756  *      newval	- the new value for the leaf.
2757  *
2758  * RETURN VALUES: none
2759  */
dbJoin(dmtree_t * tp,int leafno,int newval)2760 static void dbJoin(dmtree_t * tp, int leafno, int newval)
2761 {
2762 	int budsz, buddy;
2763 	s8 *leaf;
2764 
2765 	/* can the new leaf value require a join with other leaves ?
2766 	 */
2767 	if (newval >= tp->dmt_budmin) {
2768 		/* pickup a pointer to the leaves of the tree.
2769 		 */
2770 		leaf = tp->dmt_stree + le32_to_cpu(tp->dmt_leafidx);
2771 
2772 		/* try to join the specified leaf into a large binary
2773 		 * buddy system.  the join proceeds by attempting to join
2774 		 * the specified leafno with its buddy (leaf) at new value.
2775 		 * if the join occurs, we attempt to join the left leaf
2776 		 * of the joined buddies with its buddy at new value + 1.
2777 		 * we continue to join until we find a buddy that cannot be
2778 		 * joined (does not have a value equal to the size of the
2779 		 * last join) or until all leaves have been joined into a
2780 		 * single system.
2781 		 *
2782 		 * get the buddy size (number of words covered) of
2783 		 * the new value.
2784 		 */
2785 		budsz = BUDSIZE(newval, tp->dmt_budmin);
2786 
2787 		/* try to join.
2788 		 */
2789 		while (budsz < le32_to_cpu(tp->dmt_nleafs)) {
2790 			/* get the buddy leaf.
2791 			 */
2792 			buddy = leafno ^ budsz;
2793 
2794 			/* if the leaf's new value is greater than its
2795 			 * buddy's value, we join no more.
2796 			 */
2797 			if (newval > leaf[buddy])
2798 				break;
2799 
2800 			assert(newval == leaf[buddy]);
2801 
2802 			/* check which (leafno or buddy) is the left buddy.
2803 			 * the left buddy gets to claim the blocks resulting
2804 			 * from the join while the right gets to claim none.
2805 			 * the left buddy is also eligable to participate in
2806 			 * a join at the next higher level while the right
2807 			 * is not.
2808 			 *
2809 			 */
2810 			if (leafno < buddy) {
2811 				/* leafno is the left buddy.
2812 				 */
2813 				dbAdjTree(tp, buddy, NOFREE);
2814 			} else {
2815 				/* buddy is the left buddy and becomes
2816 				 * leafno.
2817 				 */
2818 				dbAdjTree(tp, leafno, NOFREE);
2819 				leafno = buddy;
2820 			}
2821 
2822 			/* on to try the next join.
2823 			 */
2824 			newval += 1;
2825 			budsz <<= 1;
2826 		}
2827 	}
2828 
2829 	/* update the leaf value.
2830 	 */
2831 	dbAdjTree(tp, leafno, newval);
2832 }
2833 
2834 
2835 /*
2836  * NAME:	dbAdjTree()
2837  *
2838  * FUNCTION:    update a leaf of a dmtree with a new value, adjusting
2839  *		the dmtree, as required, to reflect the new leaf value.
2840  *		the combination of any buddies must already be done before
2841  *		this is called.
2842  *
2843  * PARAMETERS:
2844  *      tp	- pointer to the tree to be adjusted.
2845  *      leafno	- the number of the leaf to be updated.
2846  *      newval	- the new value for the leaf.
2847  *
2848  * RETURN VALUES: none
2849  */
dbAdjTree(dmtree_t * tp,int leafno,int newval)2850 static void dbAdjTree(dmtree_t * tp, int leafno, int newval)
2851 {
2852 	int lp, pp, k;
2853 	int max;
2854 
2855 	/* pick up the index of the leaf for this leafno.
2856 	 */
2857 	lp = leafno + le32_to_cpu(tp->dmt_leafidx);
2858 
2859 	/* is the current value the same as the old value ?  if so,
2860 	 * there is nothing to do.
2861 	 */
2862 	if (tp->dmt_stree[lp] == newval)
2863 		return;
2864 
2865 	/* set the new value.
2866 	 */
2867 	tp->dmt_stree[lp] = newval;
2868 
2869 	/* bubble the new value up the tree as required.
2870 	 */
2871 	for (k = 0; k < le32_to_cpu(tp->dmt_height); k++) {
2872 		/* get the index of the first leaf of the 4 leaf
2873 		 * group containing the specified leaf (leafno).
2874 		 */
2875 		lp = ((lp - 1) & ~0x03) + 1;
2876 
2877 		/* get the index of the parent of this 4 leaf group.
2878 		 */
2879 		pp = (lp - 1) >> 2;
2880 
2881 		/* determine the maximum of the 4 leaves.
2882 		 */
2883 		max = TREEMAX(&tp->dmt_stree[lp]);
2884 
2885 		/* if the maximum of the 4 is the same as the
2886 		 * parent's value, we're done.
2887 		 */
2888 		if (tp->dmt_stree[pp] == max)
2889 			break;
2890 
2891 		/* parent gets new value.
2892 		 */
2893 		tp->dmt_stree[pp] = max;
2894 
2895 		/* parent becomes leaf for next go-round.
2896 		 */
2897 		lp = pp;
2898 	}
2899 }
2900 
2901 
2902 /*
2903  * NAME:	dbFindLeaf()
2904  *
2905  * FUNCTION:    search a dmtree_t for sufficient free blocks, returning
2906  *		the index of a leaf describing the free blocks if
2907  *		sufficient free blocks are found.
2908  *
2909  *		the search starts at the top of the dmtree_t tree and
2910  *		proceeds down the tree to the leftmost leaf with sufficient
2911  *		free space.
2912  *
2913  * PARAMETERS:
2914  *      tp	- pointer to the tree to be searched.
2915  *      l2nb	- log2 number of free blocks to search for.
2916  *	leafidx	- return pointer to be set to the index of the leaf
2917  *		  describing at least l2nb free blocks if sufficient
2918  *		  free blocks are found.
2919  *
2920  * RETURN VALUES:
2921  *      0	- success
2922  *      -ENOSPC	- insufficient free blocks.
2923  */
dbFindLeaf(dmtree_t * tp,int l2nb,int * leafidx)2924 static int dbFindLeaf(dmtree_t * tp, int l2nb, int *leafidx)
2925 {
2926 	int ti, n = 0, k, x = 0;
2927 
2928 	/* first check the root of the tree to see if there is
2929 	 * sufficient free space.
2930 	 */
2931 	if (l2nb > tp->dmt_stree[ROOT])
2932 		return -ENOSPC;
2933 
2934 	/* sufficient free space available. now search down the tree
2935 	 * starting at the next level for the leftmost leaf that
2936 	 * describes sufficient free space.
2937 	 */
2938 	for (k = le32_to_cpu(tp->dmt_height), ti = 1;
2939 	     k > 0; k--, ti = ((ti + n) << 2) + 1) {
2940 		/* search the four nodes at this level, starting from
2941 		 * the left.
2942 		 */
2943 		for (x = ti, n = 0; n < 4; n++) {
2944 			/* sufficient free space found.  move to the next
2945 			 * level (or quit if this is the last level).
2946 			 */
2947 			if (l2nb <= tp->dmt_stree[x + n])
2948 				break;
2949 		}
2950 
2951 		/* better have found something since the higher
2952 		 * levels of the tree said it was here.
2953 		 */
2954 		assert(n < 4);
2955 	}
2956 
2957 	/* set the return to the leftmost leaf describing sufficient
2958 	 * free space.
2959 	 */
2960 	*leafidx = x + n - le32_to_cpu(tp->dmt_leafidx);
2961 
2962 	return (0);
2963 }
2964 
2965 
2966 /*
2967  * NAME:	dbFindBits()
2968  *
2969  * FUNCTION:    find a specified number of binary buddy free bits within a
2970  *		dmap bitmap word value.
2971  *
2972  *		this routine searches the bitmap value for (1 << l2nb) free
2973  *		bits at (1 << l2nb) alignments within the value.
2974  *
2975  * PARAMETERS:
2976  *      word	-  dmap bitmap word value.
2977  *      l2nb	-  number of free bits specified as a log2 number.
2978  *
2979  * RETURN VALUES:
2980  *      starting bit number of free bits.
2981  */
dbFindBits(u32 word,int l2nb)2982 static int dbFindBits(u32 word, int l2nb)
2983 {
2984 	int bitno, nb;
2985 	u32 mask;
2986 
2987 	/* get the number of bits.
2988 	 */
2989 	nb = 1 << l2nb;
2990 	assert(nb <= DBWORD);
2991 
2992 	/* complement the word so we can use a mask (i.e. 0s represent
2993 	 * free bits) and compute the mask.
2994 	 */
2995 	word = ~word;
2996 	mask = ONES << (DBWORD - nb);
2997 
2998 	/* scan the word for nb free bits at nb alignments.
2999 	 */
3000 	for (bitno = 0; mask != 0; bitno += nb, mask >>= nb) {
3001 		if ((mask & word) == mask)
3002 			break;
3003 	}
3004 
3005 	ASSERT(bitno < 32);
3006 
3007 	/* return the bit number.
3008 	 */
3009 	return (bitno);
3010 }
3011 
3012 
3013 /*
3014  * NAME:	dbMaxBud(u8 *cp)
3015  *
3016  * FUNCTION:    determine the largest binary buddy string of free
3017  *		bits within 32-bits of the map.
3018  *
3019  * PARAMETERS:
3020  *      cp	-  pointer to the 32-bit value.
3021  *
3022  * RETURN VALUES:
3023  *      largest binary buddy of free bits within a dmap word.
3024  */
dbMaxBud(u8 * cp)3025 static int dbMaxBud(u8 * cp)
3026 {
3027 	signed char tmp1, tmp2;
3028 
3029 	/* check if the wmap word is all free. if so, the
3030 	 * free buddy size is BUDMIN.
3031 	 */
3032 	if (*((uint *) cp) == 0)
3033 		return (BUDMIN);
3034 
3035 	/* check if the wmap word is half free. if so, the
3036 	 * free buddy size is BUDMIN-1.
3037 	 */
3038 	if (*((u16 *) cp) == 0 || *((u16 *) cp + 1) == 0)
3039 		return (BUDMIN - 1);
3040 
3041 	/* not all free or half free. determine the free buddy
3042 	 * size thru table lookup using quarters of the wmap word.
3043 	 */
3044 	tmp1 = max(budtab[cp[2]], budtab[cp[3]]);
3045 	tmp2 = max(budtab[cp[0]], budtab[cp[1]]);
3046 	return (max(tmp1, tmp2));
3047 }
3048 
3049 
3050 /*
3051  * NAME:	cnttz(uint word)
3052  *
3053  * FUNCTION:    determine the number of trailing zeros within a 32-bit
3054  *		value.
3055  *
3056  * PARAMETERS:
3057  *      value	-  32-bit value to be examined.
3058  *
3059  * RETURN VALUES:
3060  *      count of trailing zeros
3061  */
cnttz(u32 word)3062 static int cnttz(u32 word)
3063 {
3064 	int n;
3065 
3066 	for (n = 0; n < 32; n++, word >>= 1) {
3067 		if (word & 0x01)
3068 			break;
3069 	}
3070 
3071 	return (n);
3072 }
3073 
3074 
3075 /*
3076  * NAME:	cntlz(u32 value)
3077  *
3078  * FUNCTION:    determine the number of leading zeros within a 32-bit
3079  *		value.
3080  *
3081  * PARAMETERS:
3082  *      value	-  32-bit value to be examined.
3083  *
3084  * RETURN VALUES:
3085  *      count of leading zeros
3086  */
cntlz(u32 value)3087 static int cntlz(u32 value)
3088 {
3089 	int n;
3090 
3091 	for (n = 0; n < 32; n++, value <<= 1) {
3092 		if (value & HIGHORDER)
3093 			break;
3094 	}
3095 	return (n);
3096 }
3097 
3098 
3099 /*
3100  * NAME:	blkstol2(s64 nb)
3101  *
3102  * FUNCTION:	convert a block count to its log2 value. if the block
3103  *	        count is not a l2 multiple, it is rounded up to the next
3104  *		larger l2 multiple.
3105  *
3106  * PARAMETERS:
3107  *      nb	-  number of blocks
3108  *
3109  * RETURN VALUES:
3110  *      log2 number of blocks
3111  */
blkstol2(s64 nb)3112 int blkstol2(s64 nb)
3113 {
3114 	int l2nb;
3115 	s64 mask;		/* meant to be signed */
3116 
3117 	mask = (s64) 1 << (64 - 1);
3118 
3119 	/* count the leading bits.
3120 	 */
3121 	for (l2nb = 0; l2nb < 64; l2nb++, mask >>= 1) {
3122 		/* leading bit found.
3123 		 */
3124 		if (nb & mask) {
3125 			/* determine the l2 value.
3126 			 */
3127 			l2nb = (64 - 1) - l2nb;
3128 
3129 			/* check if we need to round up.
3130 			 */
3131 			if (~mask & nb)
3132 				l2nb++;
3133 
3134 			return (l2nb);
3135 		}
3136 	}
3137 	assert(0);
3138 	return 0;		/* fix compiler warning */
3139 }
3140 
3141 
3142 /*
3143  * NAME:    	dbAllocBottomUp()
3144  *
3145  * FUNCTION:	alloc the specified block range from the working block
3146  *		allocation map.
3147  *
3148  *		the blocks will be alloc from the working map one dmap
3149  *		at a time.
3150  *
3151  * PARAMETERS:
3152  *      ip	-  pointer to in-core inode;
3153  *      blkno	-  starting block number to be freed.
3154  *      nblocks	-  number of blocks to be freed.
3155  *
3156  * RETURN VALUES:
3157  *      0	- success
3158  *      -EIO	- i/o error
3159  */
dbAllocBottomUp(struct inode * ip,s64 blkno,s64 nblocks)3160 int dbAllocBottomUp(struct inode *ip, s64 blkno, s64 nblocks)
3161 {
3162 	struct metapage *mp;
3163 	struct dmap *dp;
3164 	int nb, rc;
3165 	s64 lblkno, rem;
3166 	struct inode *ipbmap = JFS_SBI(ip->i_sb)->ipbmap;
3167 	struct bmap *bmp = JFS_SBI(ip->i_sb)->bmap;
3168 
3169 	IREAD_LOCK(ipbmap);
3170 
3171 	/* block to be allocated better be within the mapsize. */
3172 	ASSERT(nblocks <= bmp->db_mapsize - blkno);
3173 
3174 	/*
3175 	 * allocate the blocks a dmap at a time.
3176 	 */
3177 	mp = NULL;
3178 	for (rem = nblocks; rem > 0; rem -= nb, blkno += nb) {
3179 		/* release previous dmap if any */
3180 		if (mp) {
3181 			write_metapage(mp);
3182 		}
3183 
3184 		/* get the buffer for the current dmap. */
3185 		lblkno = BLKTODMAP(blkno, bmp->db_l2nbperpage);
3186 		mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
3187 		if (mp == NULL) {
3188 			IREAD_UNLOCK(ipbmap);
3189 			return -EIO;
3190 		}
3191 		dp = (struct dmap *) mp->data;
3192 
3193 		/* determine the number of blocks to be allocated from
3194 		 * this dmap.
3195 		 */
3196 		nb = min(rem, BPERDMAP - (blkno & (BPERDMAP - 1)));
3197 
3198 		DBFREECK(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
3199 
3200 		/* allocate the blocks. */
3201 		if ((rc = dbAllocDmapBU(bmp, dp, blkno, nb))) {
3202 			release_metapage(mp);
3203 			IREAD_UNLOCK(ipbmap);
3204 			return (rc);
3205 		}
3206 
3207 		DBALLOC(bmp->db_DBmap, bmp->db_mapsize, blkno, nb);
3208 	}
3209 
3210 	/* write the last buffer. */
3211 	write_metapage(mp);
3212 
3213 	IREAD_UNLOCK(ipbmap);
3214 
3215 	return (0);
3216 }
3217 
3218 
dbAllocDmapBU(struct bmap * bmp,struct dmap * dp,s64 blkno,int nblocks)3219 static int dbAllocDmapBU(struct bmap * bmp, struct dmap * dp, s64 blkno,
3220 			 int nblocks)
3221 {
3222 	int rc;
3223 	int dbitno, word, rembits, nb, nwords, wbitno, agno;
3224 	s8 oldroot, *leaf;
3225 	struct dmaptree *tp = (struct dmaptree *) & dp->tree;
3226 
3227 	/* save the current value of the root (i.e. maximum free string)
3228 	 * of the dmap tree.
3229 	 */
3230 	oldroot = tp->stree[ROOT];
3231 
3232 	/* pick up a pointer to the leaves of the dmap tree */
3233 	leaf = tp->stree + LEAFIND;
3234 
3235 	/* determine the bit number and word within the dmap of the
3236 	 * starting block.
3237 	 */
3238 	dbitno = blkno & (BPERDMAP - 1);
3239 	word = dbitno >> L2DBWORD;
3240 
3241 	/* block range better be within the dmap */
3242 	assert(dbitno + nblocks <= BPERDMAP);
3243 
3244 	/* allocate the bits of the dmap's words corresponding to the block
3245 	 * range. not all bits of the first and last words may be contained
3246 	 * within the block range.  if this is the case, we'll work against
3247 	 * those words (i.e. partial first and/or last) on an individual basis
3248 	 * (a single pass), allocating the bits of interest by hand and
3249 	 * updating the leaf corresponding to the dmap word. a single pass
3250 	 * will be used for all dmap words fully contained within the
3251 	 * specified range.  within this pass, the bits of all fully contained
3252 	 * dmap words will be marked as free in a single shot and the leaves
3253 	 * will be updated. a single leaf may describe the free space of
3254 	 * multiple dmap words, so we may update only a subset of the actual
3255 	 * leaves corresponding to the dmap words of the block range.
3256 	 */
3257 	for (rembits = nblocks; rembits > 0; rembits -= nb, dbitno += nb) {
3258 		/* determine the bit number within the word and
3259 		 * the number of bits within the word.
3260 		 */
3261 		wbitno = dbitno & (DBWORD - 1);
3262 		nb = min(rembits, DBWORD - wbitno);
3263 
3264 		/* check if only part of a word is to be allocated.
3265 		 */
3266 		if (nb < DBWORD) {
3267 			/* allocate (set to 1) the appropriate bits within
3268 			 * this dmap word.
3269 			 */
3270 			dp->wmap[word] |= cpu_to_le32(ONES << (DBWORD - nb)
3271 						      >> wbitno);
3272 
3273 			word++;
3274 		} else {
3275 			/* one or more dmap words are fully contained
3276 			 * within the block range.  determine how many
3277 			 * words and allocate (set to 1) the bits of these
3278 			 * words.
3279 			 */
3280 			nwords = rembits >> L2DBWORD;
3281 			memset(&dp->wmap[word], (int) ONES, nwords * 4);
3282 
3283 			/* determine how many bits */
3284 			nb = nwords << L2DBWORD;
3285 			word += nwords;
3286 		}
3287 	}
3288 
3289 	/* update the free count for this dmap */
3290 	dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) - nblocks);
3291 
3292 	/* reconstruct summary tree */
3293 	dbInitDmapTree(dp);
3294 
3295 	BMAP_LOCK(bmp);
3296 
3297 	/* if this allocation group is completely free,
3298 	 * update the highest active allocation group number
3299 	 * if this allocation group is the new max.
3300 	 */
3301 	agno = blkno >> bmp->db_agl2size;
3302 	if (agno > bmp->db_maxag)
3303 		bmp->db_maxag = agno;
3304 
3305 	/* update the free count for the allocation group and map */
3306 	bmp->db_agfree[agno] -= nblocks;
3307 	bmp->db_nfree -= nblocks;
3308 
3309 	BMAP_UNLOCK(bmp);
3310 
3311 	/* if the root has not changed, done. */
3312 	if (tp->stree[ROOT] == oldroot)
3313 		return (0);
3314 
3315 	/* root changed. bubble the change up to the dmap control pages.
3316 	 * if the adjustment of the upper level control pages fails,
3317 	 * backout the bit allocation (thus making everything consistent).
3318 	 */
3319 	if ((rc = dbAdjCtl(bmp, blkno, tp->stree[ROOT], 1, 0)))
3320 		dbFreeBits(bmp, dp, blkno, nblocks);
3321 
3322 	return (rc);
3323 }
3324 
3325 
3326 /*
3327  * NAME:	dbExtendFS()
3328  *
3329  * FUNCTION:	extend bmap from blkno for nblocks;
3330  * 		dbExtendFS() updates bmap ready for dbAllocBottomUp();
3331  *
3332  * L2
3333  *  |
3334  *   L1---------------------------------L1
3335  *    |                                  |
3336  *     L0---------L0---------L0           L0---------L0---------L0
3337  *      |          |          |            |          |          |
3338  *       d0,...,dn  d0,...,dn  d0,...,dn    d0,...,dn  d0,...,dn  d0,.,dm;
3339  * L2L1L0d0,...,dnL0d0,...,dnL0d0,...,dnL1L0d0,...,dnL0d0,...,dnL0d0,..dm
3340  *
3341  * <---old---><----------------------------extend----------------------->
3342  */
dbExtendFS(struct inode * ipbmap,s64 blkno,s64 nblocks)3343 int dbExtendFS(struct inode *ipbmap, s64 blkno,	s64 nblocks)
3344 {
3345 	struct jfs_sb_info *sbi = JFS_SBI(ipbmap->i_sb);
3346 	int nbperpage = sbi->nbperpage;
3347 	int i, i0 = TRUE, j, j0 = TRUE, k, n;
3348 	s64 newsize;
3349 	s64 p;
3350 	struct metapage *mp, *l2mp, *l1mp = NULL, *l0mp = NULL;
3351 	struct dmapctl *l2dcp, *l1dcp, *l0dcp;
3352 	struct dmap *dp;
3353 	s8 *l0leaf, *l1leaf, *l2leaf;
3354 	struct bmap *bmp = sbi->bmap;
3355 	int agno, l2agsize, oldl2agsize;
3356 	s64 ag_rem;
3357 
3358 	newsize = blkno + nblocks;
3359 
3360 	jfs_info("dbExtendFS: blkno:%Ld nblocks:%Ld newsize:%Ld",
3361 		 (long long) blkno, (long long) nblocks, (long long) newsize);
3362 
3363 	/*
3364 	 *      initialize bmap control page.
3365 	 *
3366 	 * all the data in bmap control page should exclude
3367 	 * the mkfs hidden dmap page.
3368 	 */
3369 
3370 	/* update mapsize */
3371 	bmp->db_mapsize = newsize;
3372 	bmp->db_maxlevel = BMAPSZTOLEV(bmp->db_mapsize);
3373 
3374 	/* compute new AG size */
3375 	l2agsize = dbGetL2AGSize(newsize);
3376 	oldl2agsize = bmp->db_agl2size;
3377 
3378 	bmp->db_agl2size = l2agsize;
3379 	bmp->db_agsize = 1 << l2agsize;
3380 
3381 	/* compute new number of AG */
3382 	agno = bmp->db_numag;
3383 	bmp->db_numag = newsize >> l2agsize;
3384 	bmp->db_numag += ((u32) newsize % (u32) bmp->db_agsize) ? 1 : 0;
3385 
3386 	/*
3387 	 *      reconfigure db_agfree[]
3388 	 * from old AG configuration to new AG configuration;
3389 	 *
3390 	 * coalesce contiguous k (newAGSize/oldAGSize) AGs;
3391 	 * i.e., (AGi, ..., AGj) where i = k*n and j = k*(n+1) - 1 to AGn;
3392 	 * note: new AG size = old AG size * (2**x).
3393 	 */
3394 	if (l2agsize == oldl2agsize)
3395 		goto extend;
3396 	k = 1 << (l2agsize - oldl2agsize);
3397 	ag_rem = bmp->db_agfree[0];	/* save agfree[0] */
3398 	for (i = 0, n = 0; i < agno; n++) {
3399 		bmp->db_agfree[n] = 0;	/* init collection point */
3400 
3401 		/* coalesce cotiguous k AGs; */
3402 		for (j = 0; j < k && i < agno; j++, i++) {
3403 			/* merge AGi to AGn */
3404 			bmp->db_agfree[n] += bmp->db_agfree[i];
3405 		}
3406 	}
3407 	bmp->db_agfree[0] += ag_rem;	/* restore agfree[0] */
3408 
3409 	for (; n < MAXAG; n++)
3410 		bmp->db_agfree[n] = 0;
3411 
3412 	/*
3413 	 * update highest active ag number
3414 	 */
3415 
3416 	bmp->db_maxag = bmp->db_maxag / k;
3417 
3418 	/*
3419 	 *      extend bmap
3420 	 *
3421 	 * update bit maps and corresponding level control pages;
3422 	 * global control page db_nfree, db_agfree[agno], db_maxfreebud;
3423 	 */
3424       extend:
3425 	/* get L2 page */
3426 	p = BMAPBLKNO + nbperpage;	/* L2 page */
3427 	l2mp = read_metapage(ipbmap, p, PSIZE, 0);
3428 	if (!l2mp) {
3429 		jfs_error(ipbmap->i_sb, "dbExtendFS: L2 page could not be read");
3430 		return -EIO;
3431 	}
3432 	l2dcp = (struct dmapctl *) l2mp->data;
3433 
3434 	/* compute start L1 */
3435 	k = blkno >> L2MAXL1SIZE;
3436 	l2leaf = l2dcp->stree + CTLLEAFIND + k;
3437 	p = BLKTOL1(blkno, sbi->l2nbperpage);	/* L1 page */
3438 
3439 	/*
3440 	 * extend each L1 in L2
3441 	 */
3442 	for (; k < LPERCTL; k++, p += nbperpage) {
3443 		/* get L1 page */
3444 		if (j0) {
3445 			/* read in L1 page: (blkno & (MAXL1SIZE - 1)) */
3446 			l1mp = read_metapage(ipbmap, p, PSIZE, 0);
3447 			if (l1mp == NULL)
3448 				goto errout;
3449 			l1dcp = (struct dmapctl *) l1mp->data;
3450 
3451 			/* compute start L0 */
3452 			j = (blkno & (MAXL1SIZE - 1)) >> L2MAXL0SIZE;
3453 			l1leaf = l1dcp->stree + CTLLEAFIND + j;
3454 			p = BLKTOL0(blkno, sbi->l2nbperpage);
3455 			j0 = FALSE;
3456 		} else {
3457 			/* assign/init L1 page */
3458 			l1mp = get_metapage(ipbmap, p, PSIZE, 0);
3459 			if (l1mp == NULL)
3460 				goto errout;
3461 
3462 			l1dcp = (struct dmapctl *) l1mp->data;
3463 
3464 			/* compute start L0 */
3465 			j = 0;
3466 			l1leaf = l1dcp->stree + CTLLEAFIND;
3467 			p += nbperpage;	/* 1st L0 of L1.k  */
3468 		}
3469 
3470 		/*
3471 		 * extend each L0 in L1
3472 		 */
3473 		for (; j < LPERCTL; j++) {
3474 			/* get L0 page */
3475 			if (i0) {
3476 				/* read in L0 page: (blkno & (MAXL0SIZE - 1)) */
3477 
3478 				l0mp = read_metapage(ipbmap, p, PSIZE, 0);
3479 				if (l0mp == NULL)
3480 					goto errout;
3481 				l0dcp = (struct dmapctl *) l0mp->data;
3482 
3483 				/* compute start dmap */
3484 				i = (blkno & (MAXL0SIZE - 1)) >>
3485 				    L2BPERDMAP;
3486 				l0leaf = l0dcp->stree + CTLLEAFIND + i;
3487 				p = BLKTODMAP(blkno,
3488 					      sbi->l2nbperpage);
3489 				i0 = FALSE;
3490 			} else {
3491 				/* assign/init L0 page */
3492 				l0mp = get_metapage(ipbmap, p, PSIZE, 0);
3493 				if (l0mp == NULL)
3494 					goto errout;
3495 
3496 				l0dcp = (struct dmapctl *) l0mp->data;
3497 
3498 				/* compute start dmap */
3499 				i = 0;
3500 				l0leaf = l0dcp->stree + CTLLEAFIND;
3501 				p += nbperpage;	/* 1st dmap of L0.j */
3502 			}
3503 
3504 			/*
3505 			 * extend each dmap in L0
3506 			 */
3507 			for (; i < LPERCTL; i++) {
3508 				/*
3509 				 * reconstruct the dmap page, and
3510 				 * initialize corresponding parent L0 leaf
3511 				 */
3512 				if ((n = blkno & (BPERDMAP - 1))) {
3513 					/* read in dmap page: */
3514 					mp = read_metapage(ipbmap, p,
3515 							   PSIZE, 0);
3516 					if (mp == NULL)
3517 						goto errout;
3518 					n = min(nblocks, (s64)BPERDMAP - n);
3519 				} else {
3520 					/* assign/init dmap page */
3521 					mp = read_metapage(ipbmap, p,
3522 							   PSIZE, 0);
3523 					if (mp == NULL)
3524 						goto errout;
3525 
3526 					n = min(nblocks, (s64)BPERDMAP);
3527 				}
3528 
3529 				dp = (struct dmap *) mp->data;
3530 				*l0leaf = dbInitDmap(dp, blkno, n);
3531 
3532 				bmp->db_nfree += n;
3533 				agno = le64_to_cpu(dp->start) >> l2agsize;
3534 				bmp->db_agfree[agno] += n;
3535 
3536 				write_metapage(mp);
3537 
3538 				l0leaf++;
3539 				p += nbperpage;
3540 
3541 				blkno += n;
3542 				nblocks -= n;
3543 				if (nblocks == 0)
3544 					break;
3545 			}	/* for each dmap in a L0 */
3546 
3547 			/*
3548 			 * build current L0 page from its leaves, and
3549 			 * initialize corresponding parent L1 leaf
3550 			 */
3551 			*l1leaf = dbInitDmapCtl(l0dcp, 0, ++i);
3552 			write_metapage(l0mp);
3553 			l0mp = NULL;
3554 
3555 			if (nblocks)
3556 				l1leaf++;	/* continue for next L0 */
3557 			else {
3558 				/* more than 1 L0 ? */
3559 				if (j > 0)
3560 					break;	/* build L1 page */
3561 				else {
3562 					/* summarize in global bmap page */
3563 					bmp->db_maxfreebud = *l1leaf;
3564 					release_metapage(l1mp);
3565 					release_metapage(l2mp);
3566 					goto finalize;
3567 				}
3568 			}
3569 		}		/* for each L0 in a L1 */
3570 
3571 		/*
3572 		 * build current L1 page from its leaves, and
3573 		 * initialize corresponding parent L2 leaf
3574 		 */
3575 		*l2leaf = dbInitDmapCtl(l1dcp, 1, ++j);
3576 		write_metapage(l1mp);
3577 		l1mp = NULL;
3578 
3579 		if (nblocks)
3580 			l2leaf++;	/* continue for next L1 */
3581 		else {
3582 			/* more than 1 L1 ? */
3583 			if (k > 0)
3584 				break;	/* build L2 page */
3585 			else {
3586 				/* summarize in global bmap page */
3587 				bmp->db_maxfreebud = *l2leaf;
3588 				release_metapage(l2mp);
3589 				goto finalize;
3590 			}
3591 		}
3592 	}			/* for each L1 in a L2 */
3593 
3594 	jfs_error(ipbmap->i_sb,
3595 		  "dbExtendFS: function has not returned as expected");
3596 errout:
3597 	if (l0mp)
3598 		release_metapage(l0mp);
3599 	if (l1mp)
3600 		release_metapage(l1mp);
3601 	release_metapage(l2mp);
3602 	return -EIO;
3603 
3604 	/*
3605 	 *      finalize bmap control page
3606 	 */
3607 finalize:
3608 
3609 	return 0;
3610 }
3611 
3612 
3613 /*
3614  *	dbFinalizeBmap()
3615  */
dbFinalizeBmap(struct inode * ipbmap)3616 void dbFinalizeBmap(struct inode *ipbmap)
3617 {
3618 	struct bmap *bmp = JFS_SBI(ipbmap->i_sb)->bmap;
3619 	int actags, inactags, l2nl;
3620 	s64 ag_rem, actfree, inactfree, avgfree;
3621 	int i, n;
3622 
3623 	/*
3624 	 *      finalize bmap control page
3625 	 */
3626 //finalize:
3627 	/*
3628 	 * compute db_agpref: preferred ag to allocate from
3629 	 * (the leftmost ag with average free space in it);
3630 	 */
3631 //agpref:
3632 	/* get the number of active ags and inacitve ags */
3633 	actags = bmp->db_maxag + 1;
3634 	inactags = bmp->db_numag - actags;
3635 	ag_rem = bmp->db_mapsize & (bmp->db_agsize - 1);	/* ??? */
3636 
3637 	/* determine how many blocks are in the inactive allocation
3638 	 * groups. in doing this, we must account for the fact that
3639 	 * the rightmost group might be a partial group (i.e. file
3640 	 * system size is not a multiple of the group size).
3641 	 */
3642 	inactfree = (inactags && ag_rem) ?
3643 	    ((inactags - 1) << bmp->db_agl2size) + ag_rem
3644 	    : inactags << bmp->db_agl2size;
3645 
3646 	/* determine how many free blocks are in the active
3647 	 * allocation groups plus the average number of free blocks
3648 	 * within the active ags.
3649 	 */
3650 	actfree = bmp->db_nfree - inactfree;
3651 	avgfree = (u32) actfree / (u32) actags;
3652 
3653 	/* if the preferred allocation group has not average free space.
3654 	 * re-establish the preferred group as the leftmost
3655 	 * group with average free space.
3656 	 */
3657 	if (bmp->db_agfree[bmp->db_agpref] < avgfree) {
3658 		for (bmp->db_agpref = 0; bmp->db_agpref < actags;
3659 		     bmp->db_agpref++) {
3660 			if (bmp->db_agfree[bmp->db_agpref] >= avgfree)
3661 				break;
3662 		}
3663 		if (bmp->db_agpref >= bmp->db_numag) {
3664 			jfs_error(ipbmap->i_sb,
3665 				  "cannot find ag with average freespace");
3666 		}
3667 	}
3668 
3669 	/*
3670 	 * compute db_aglevel, db_agheigth, db_width, db_agstart:
3671 	 * an ag is covered in aglevel dmapctl summary tree,
3672 	 * at agheight level height (from leaf) with agwidth number of nodes
3673 	 * each, which starts at agstart index node of the smmary tree node
3674 	 * array;
3675 	 */
3676 	bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize);
3677 	l2nl =
3678 	    bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL);
3679 	bmp->db_agheigth = l2nl >> 1;
3680 	bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1));
3681 	for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0;
3682 	     i--) {
3683 		bmp->db_agstart += n;
3684 		n <<= 2;
3685 	}
3686 
3687 }
3688 
3689 
3690 /*
3691  * NAME:	dbInitDmap()/ujfs_idmap_page()
3692  *
3693  * FUNCTION:	initialize working/persistent bitmap of the dmap page
3694  *		for the specified number of blocks:
3695  *
3696  *		at entry, the bitmaps had been initialized as free (ZEROS);
3697  *		The number of blocks will only account for the actually
3698  *		existing blocks. Blocks which don't actually exist in
3699  *		the aggregate will be marked as allocated (ONES);
3700  *
3701  * PARAMETERS:
3702  *	dp	- pointer to page of map
3703  *	nblocks	- number of blocks this page
3704  *
3705  * RETURNS: NONE
3706  */
dbInitDmap(struct dmap * dp,s64 Blkno,int nblocks)3707 static int dbInitDmap(struct dmap * dp, s64 Blkno, int nblocks)
3708 {
3709 	int blkno, w, b, r, nw, nb, i;
3710 
3711 	/* starting block number within the dmap */
3712 	blkno = Blkno & (BPERDMAP - 1);
3713 
3714 	if (blkno == 0) {
3715 		dp->nblocks = dp->nfree = cpu_to_le32(nblocks);
3716 		dp->start = cpu_to_le64(Blkno);
3717 
3718 		if (nblocks == BPERDMAP) {
3719 			memset(&dp->wmap[0], 0, LPERDMAP * 4);
3720 			memset(&dp->pmap[0], 0, LPERDMAP * 4);
3721 			goto initTree;
3722 		}
3723 	} else {
3724 		dp->nblocks =
3725 		    cpu_to_le32(le32_to_cpu(dp->nblocks) + nblocks);
3726 		dp->nfree = cpu_to_le32(le32_to_cpu(dp->nfree) + nblocks);
3727 	}
3728 
3729 	/* word number containing start block number */
3730 	w = blkno >> L2DBWORD;
3731 
3732 	/*
3733 	 * free the bits corresponding to the block range (ZEROS):
3734 	 * note: not all bits of the first and last words may be contained
3735 	 * within the block range.
3736 	 */
3737 	for (r = nblocks; r > 0; r -= nb, blkno += nb) {
3738 		/* number of bits preceding range to be freed in the word */
3739 		b = blkno & (DBWORD - 1);
3740 		/* number of bits to free in the word */
3741 		nb = min(r, DBWORD - b);
3742 
3743 		/* is partial word to be freed ? */
3744 		if (nb < DBWORD) {
3745 			/* free (set to 0) from the bitmap word */
3746 			dp->wmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
3747 						     >> b));
3748 			dp->pmap[w] &= cpu_to_le32(~(ONES << (DBWORD - nb)
3749 						     >> b));
3750 
3751 			/* skip the word freed */
3752 			w++;
3753 		} else {
3754 			/* free (set to 0) contiguous bitmap words */
3755 			nw = r >> L2DBWORD;
3756 			memset(&dp->wmap[w], 0, nw * 4);
3757 			memset(&dp->pmap[w], 0, nw * 4);
3758 
3759 			/* skip the words freed */
3760 			nb = nw << L2DBWORD;
3761 			w += nw;
3762 		}
3763 	}
3764 
3765 	/*
3766 	 * mark bits following the range to be freed (non-existing
3767 	 * blocks) as allocated (ONES)
3768 	 */
3769 
3770 	if (blkno == BPERDMAP)
3771 		goto initTree;
3772 
3773 	/* the first word beyond the end of existing blocks */
3774 	w = blkno >> L2DBWORD;
3775 
3776 	/* does nblocks fall on a 32-bit boundary ? */
3777 	b = blkno & (DBWORD - 1);
3778 	if (b) {
3779 		/* mark a partial word allocated */
3780 		dp->wmap[w] = dp->pmap[w] = cpu_to_le32(ONES >> b);
3781 		w++;
3782 	}
3783 
3784 	/* set the rest of the words in the page to allocated (ONES) */
3785 	for (i = w; i < LPERDMAP; i++)
3786 		dp->pmap[i] = dp->wmap[i] = ONES;
3787 
3788 	/*
3789 	 * init tree
3790 	 */
3791       initTree:
3792 	return (dbInitDmapTree(dp));
3793 }
3794 
3795 
3796 /*
3797  * NAME:	dbInitDmapTree()/ujfs_complete_dmap()
3798  *
3799  * FUNCTION:	initialize summary tree of the specified dmap:
3800  *
3801  *		at entry, bitmap of the dmap has been initialized;
3802  *
3803  * PARAMETERS:
3804  *	dp	- dmap to complete
3805  *	blkno	- starting block number for this dmap
3806  *	treemax	- will be filled in with max free for this dmap
3807  *
3808  * RETURNS:	max free string at the root of the tree
3809  */
dbInitDmapTree(struct dmap * dp)3810 static int dbInitDmapTree(struct dmap * dp)
3811 {
3812 	struct dmaptree *tp;
3813 	s8 *cp;
3814 	int i;
3815 
3816 	/* init fixed info of tree */
3817 	tp = &dp->tree;
3818 	tp->nleafs = cpu_to_le32(LPERDMAP);
3819 	tp->l2nleafs = cpu_to_le32(L2LPERDMAP);
3820 	tp->leafidx = cpu_to_le32(LEAFIND);
3821 	tp->height = cpu_to_le32(4);
3822 	tp->budmin = BUDMIN;
3823 
3824 	/* init each leaf from corresponding wmap word:
3825 	 * note: leaf is set to NOFREE(-1) if all blocks of corresponding
3826 	 * bitmap word are allocated.
3827 	 */
3828 	cp = tp->stree + le32_to_cpu(tp->leafidx);
3829 	for (i = 0; i < LPERDMAP; i++)
3830 		*cp++ = dbMaxBud((u8 *) & dp->wmap[i]);
3831 
3832 	/* build the dmap's binary buddy summary tree */
3833 	return (dbInitTree(tp));
3834 }
3835 
3836 
3837 /*
3838  * NAME:	dbInitTree()/ujfs_adjtree()
3839  *
3840  * FUNCTION:	initialize binary buddy summary tree of a dmap or dmapctl.
3841  *
3842  *		at entry, the leaves of the tree has been initialized
3843  *		from corresponding bitmap word or root of summary tree
3844  *		of the child control page;
3845  *		configure binary buddy system at the leaf level, then
3846  *		bubble up the values of the leaf nodes up the tree.
3847  *
3848  * PARAMETERS:
3849  *	cp	- Pointer to the root of the tree
3850  *	l2leaves- Number of leaf nodes as a power of 2
3851  *	l2min	- Number of blocks that can be covered by a leaf
3852  *		  as a power of 2
3853  *
3854  * RETURNS: max free string at the root of the tree
3855  */
dbInitTree(struct dmaptree * dtp)3856 static int dbInitTree(struct dmaptree * dtp)
3857 {
3858 	int l2max, l2free, bsize, nextb, i;
3859 	int child, parent, nparent;
3860 	s8 *tp, *cp, *cp1;
3861 
3862 	tp = dtp->stree;
3863 
3864 	/* Determine the maximum free string possible for the leaves */
3865 	l2max = le32_to_cpu(dtp->l2nleafs) + dtp->budmin;
3866 
3867 	/*
3868 	 * configure the leaf levevl into binary buddy system
3869 	 *
3870 	 * Try to combine buddies starting with a buddy size of 1
3871 	 * (i.e. two leaves). At a buddy size of 1 two buddy leaves
3872 	 * can be combined if both buddies have a maximum free of l2min;
3873 	 * the combination will result in the left-most buddy leaf having
3874 	 * a maximum free of l2min+1.
3875 	 * After processing all buddies for a given size, process buddies
3876 	 * at the next higher buddy size (i.e. current size * 2) and
3877 	 * the next maximum free (current free + 1).
3878 	 * This continues until the maximum possible buddy combination
3879 	 * yields maximum free.
3880 	 */
3881 	for (l2free = dtp->budmin, bsize = 1; l2free < l2max;
3882 	     l2free++, bsize = nextb) {
3883 		/* get next buddy size == current buddy pair size */
3884 		nextb = bsize << 1;
3885 
3886 		/* scan each adjacent buddy pair at current buddy size */
3887 		for (i = 0, cp = tp + le32_to_cpu(dtp->leafidx);
3888 		     i < le32_to_cpu(dtp->nleafs);
3889 		     i += nextb, cp += nextb) {
3890 			/* coalesce if both adjacent buddies are max free */
3891 			if (*cp == l2free && *(cp + bsize) == l2free) {
3892 				*cp = l2free + 1;	/* left take right */
3893 				*(cp + bsize) = -1;	/* right give left */
3894 			}
3895 		}
3896 	}
3897 
3898 	/*
3899 	 * bubble summary information of leaves up the tree.
3900 	 *
3901 	 * Starting at the leaf node level, the four nodes described by
3902 	 * the higher level parent node are compared for a maximum free and
3903 	 * this maximum becomes the value of the parent node.
3904 	 * when all lower level nodes are processed in this fashion then
3905 	 * move up to the next level (parent becomes a lower level node) and
3906 	 * continue the process for that level.
3907 	 */
3908 	for (child = le32_to_cpu(dtp->leafidx),
3909 	     nparent = le32_to_cpu(dtp->nleafs) >> 2;
3910 	     nparent > 0; nparent >>= 2, child = parent) {
3911 		/* get index of 1st node of parent level */
3912 		parent = (child - 1) >> 2;
3913 
3914 		/* set the value of the parent node as the maximum
3915 		 * of the four nodes of the current level.
3916 		 */
3917 		for (i = 0, cp = tp + child, cp1 = tp + parent;
3918 		     i < nparent; i++, cp += 4, cp1++)
3919 			*cp1 = TREEMAX(cp);
3920 	}
3921 
3922 	return (*tp);
3923 }
3924 
3925 
3926 /*
3927  *	dbInitDmapCtl()
3928  *
3929  * function: initialize dmapctl page
3930  */
dbInitDmapCtl(struct dmapctl * dcp,int level,int i)3931 static int dbInitDmapCtl(struct dmapctl * dcp, int level, int i)
3932 {				/* start leaf index not covered by range */
3933 	s8 *cp;
3934 
3935 	dcp->nleafs = cpu_to_le32(LPERCTL);
3936 	dcp->l2nleafs = cpu_to_le32(L2LPERCTL);
3937 	dcp->leafidx = cpu_to_le32(CTLLEAFIND);
3938 	dcp->height = cpu_to_le32(5);
3939 	dcp->budmin = L2BPERDMAP + L2LPERCTL * level;
3940 
3941 	/*
3942 	 * initialize the leaves of current level that were not covered
3943 	 * by the specified input block range (i.e. the leaves have no
3944 	 * low level dmapctl or dmap).
3945 	 */
3946 	cp = &dcp->stree[CTLLEAFIND + i];
3947 	for (; i < LPERCTL; i++)
3948 		*cp++ = NOFREE;
3949 
3950 	/* build the dmap's binary buddy summary tree */
3951 	return (dbInitTree((struct dmaptree *) dcp));
3952 }
3953 
3954 
3955 /*
3956  * NAME:	dbGetL2AGSize()/ujfs_getagl2size()
3957  *
3958  * FUNCTION:	Determine log2(allocation group size) from aggregate size
3959  *
3960  * PARAMETERS:
3961  *	nblocks	- Number of blocks in aggregate
3962  *
3963  * RETURNS: log2(allocation group size) in aggregate blocks
3964  */
dbGetL2AGSize(s64 nblocks)3965 static int dbGetL2AGSize(s64 nblocks)
3966 {
3967 	s64 sz;
3968 	s64 m;
3969 	int l2sz;
3970 
3971 	if (nblocks < BPERDMAP * MAXAG)
3972 		return (L2BPERDMAP);
3973 
3974 	/* round up aggregate size to power of 2 */
3975 	m = ((u64) 1 << (64 - 1));
3976 	for (l2sz = 64; l2sz >= 0; l2sz--, m >>= 1) {
3977 		if (m & nblocks)
3978 			break;
3979 	}
3980 
3981 	sz = (s64) 1 << l2sz;
3982 	if (sz < nblocks)
3983 		l2sz += 1;
3984 
3985 	/* agsize = roundupSize/max_number_of_ag */
3986 	return (l2sz - L2MAXAG);
3987 }
3988 
3989 
3990 /*
3991  * NAME:	dbMapFileSizeToMapSize()
3992  *
3993  * FUNCTION:	compute number of blocks the block allocation map file
3994  *		can cover from the map file size;
3995  *
3996  * RETURNS:	Number of blocks which can be covered by this block map file;
3997  */
3998 
3999 /*
4000  * maximum number of map pages at each level including control pages
4001  */
4002 #define MAXL0PAGES	(1 + LPERCTL)
4003 #define MAXL1PAGES	(1 + LPERCTL * MAXL0PAGES)
4004 #define MAXL2PAGES	(1 + LPERCTL * MAXL1PAGES)
4005 
4006 /*
4007  * convert number of map pages to the zero origin top dmapctl level
4008  */
4009 #define BMAPPGTOLEV(npages)	\
4010 	(((npages) <= 3 + MAXL0PAGES) ? 0 \
4011        : ((npages) <= 2 + MAXL1PAGES) ? 1 : 2)
4012 
dbMapFileSizeToMapSize(struct inode * ipbmap)4013 s64 dbMapFileSizeToMapSize(struct inode * ipbmap)
4014 {
4015 	struct super_block *sb = ipbmap->i_sb;
4016 	s64 nblocks;
4017 	s64 npages, ndmaps;
4018 	int level, i;
4019 	int complete, factor;
4020 
4021 	nblocks = ipbmap->i_size >> JFS_SBI(sb)->l2bsize;
4022 	npages = nblocks >> JFS_SBI(sb)->l2nbperpage;
4023 	level = BMAPPGTOLEV(npages);
4024 
4025 	/* At each level, accumulate the number of dmap pages covered by
4026 	 * the number of full child levels below it;
4027 	 * repeat for the last incomplete child level.
4028 	 */
4029 	ndmaps = 0;
4030 	npages--;		/* skip the first global control page */
4031 	/* skip higher level control pages above top level covered by map */
4032 	npages -= (2 - level);
4033 	npages--;		/* skip top level's control page */
4034 	for (i = level; i >= 0; i--) {
4035 		factor =
4036 		    (i == 2) ? MAXL1PAGES : ((i == 1) ? MAXL0PAGES : 1);
4037 		complete = (u32) npages / factor;
4038 		ndmaps += complete * ((i == 2) ? LPERCTL * LPERCTL
4039 				      : ((i == 1) ? LPERCTL : 1));
4040 
4041 		/* pages in last/incomplete child */
4042 		npages = (u32) npages % factor;
4043 		/* skip incomplete child's level control page */
4044 		npages--;
4045 	}
4046 
4047 	/* convert the number of dmaps into the number of blocks
4048 	 * which can be covered by the dmaps;
4049 	 */
4050 	nblocks = ndmaps << L2BPERDMAP;
4051 
4052 	return (nblocks);
4053 }
4054 
4055 
4056 #ifdef	_JFS_DEBUG_DMAP
4057 /*
4058  *	DBinitmap()
4059  */
DBinitmap(s64 size,struct inode * ipbmap,u32 ** results)4060 static void DBinitmap(s64 size, struct inode *ipbmap, u32 ** results)
4061 {
4062 	int npages;
4063 	u32 *dbmap, *d;
4064 	int n;
4065 	s64 lblkno, cur_block;
4066 	struct dmap *dp;
4067 	struct metapage *mp;
4068 
4069 	npages = size / 32768;
4070 	npages += (size % 32768) ? 1 : 0;
4071 
4072 	dbmap = (u32 *) xmalloc(npages * 4096, L2PSIZE, kernel_heap);
4073 	if (dbmap == NULL)
4074 		BUG();	/* Not robust since this is only unused debug code */
4075 
4076 	for (n = 0, d = dbmap; n < npages; n++, d += 1024)
4077 		bzero(d, 4096);
4078 
4079 	/* Need to initialize from disk map pages
4080 	 */
4081 	for (d = dbmap, cur_block = 0; cur_block < size;
4082 	     cur_block += BPERDMAP, d += LPERDMAP) {
4083 		lblkno = BLKTODMAP(cur_block,
4084 				   JFS_SBI(ipbmap->i_sb)->bmap->
4085 				   db_l2nbperpage);
4086 		mp = read_metapage(ipbmap, lblkno, PSIZE, 0);
4087 		if (mp == NULL) {
4088 			jfs_error(ipbmap->i_sb,
4089 				  "DBinitmap: could not read disk map page");
4090 			continue;
4091 		}
4092 		dp = (struct dmap *) mp->data;
4093 
4094 		for (n = 0; n < LPERDMAP; n++)
4095 			d[n] = le32_to_cpu(dp->wmap[n]);
4096 
4097 		release_metapage(mp);
4098 	}
4099 
4100 	*results = dbmap;
4101 }
4102 
4103 
4104 /*
4105  *	DBAlloc()
4106  */
DBAlloc(uint * dbmap,s64 mapsize,s64 blkno,s64 nblocks)4107 void DBAlloc(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
4108 {
4109 	int word, nb, bitno;
4110 	u32 mask;
4111 
4112 	assert(blkno > 0 && blkno < mapsize);
4113 	assert(nblocks > 0 && nblocks <= mapsize);
4114 
4115 	assert(blkno + nblocks <= mapsize);
4116 
4117 	dbmap += (blkno / 32);
4118 	while (nblocks > 0) {
4119 		bitno = blkno & (32 - 1);
4120 		nb = min(nblocks, 32 - bitno);
4121 
4122 		mask = (0xffffffff << (32 - nb) >> bitno);
4123 		assert((mask & *dbmap) == 0);
4124 		*dbmap |= mask;
4125 
4126 		dbmap++;
4127 		blkno += nb;
4128 		nblocks -= nb;
4129 	}
4130 }
4131 
4132 
4133 /*
4134  *	DBFree()
4135  */
DBFree(uint * dbmap,s64 mapsize,s64 blkno,s64 nblocks)4136 static void DBFree(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
4137 {
4138 	int word, nb, bitno;
4139 	u32 mask;
4140 
4141 	assert(blkno > 0 && blkno < mapsize);
4142 	assert(nblocks > 0 && nblocks <= mapsize);
4143 
4144 	assert(blkno + nblocks <= mapsize);
4145 
4146 	dbmap += (blkno / 32);
4147 	while (nblocks > 0) {
4148 		bitno = blkno & (32 - 1);
4149 		nb = min(nblocks, 32 - bitno);
4150 
4151 		mask = (0xffffffff << (32 - nb) >> bitno);
4152 		assert((mask & *dbmap) == mask);
4153 		*dbmap &= ~mask;
4154 
4155 		dbmap++;
4156 		blkno += nb;
4157 		nblocks -= nb;
4158 	}
4159 }
4160 
4161 
4162 /*
4163  *	DBAllocCK()
4164  */
DBAllocCK(uint * dbmap,s64 mapsize,s64 blkno,s64 nblocks)4165 static void DBAllocCK(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
4166 {
4167 	int word, nb, bitno;
4168 	u32 mask;
4169 
4170 	assert(blkno > 0 && blkno < mapsize);
4171 	assert(nblocks > 0 && nblocks <= mapsize);
4172 
4173 	assert(blkno + nblocks <= mapsize);
4174 
4175 	dbmap += (blkno / 32);
4176 	while (nblocks > 0) {
4177 		bitno = blkno & (32 - 1);
4178 		nb = min(nblocks, 32 - bitno);
4179 
4180 		mask = (0xffffffff << (32 - nb) >> bitno);
4181 		assert((mask & *dbmap) == mask);
4182 
4183 		dbmap++;
4184 		blkno += nb;
4185 		nblocks -= nb;
4186 	}
4187 }
4188 
4189 
4190 /*
4191  *	DBFreeCK()
4192  */
DBFreeCK(uint * dbmap,s64 mapsize,s64 blkno,s64 nblocks)4193 static void DBFreeCK(uint * dbmap, s64 mapsize, s64 blkno, s64 nblocks)
4194 {
4195 	int word, nb, bitno;
4196 	u32 mask;
4197 
4198 	assert(blkno > 0 && blkno < mapsize);
4199 	assert(nblocks > 0 && nblocks <= mapsize);
4200 
4201 	assert(blkno + nblocks <= mapsize);
4202 
4203 	dbmap += (blkno / 32);
4204 	while (nblocks > 0) {
4205 		bitno = blkno & (32 - 1);
4206 		nb = min(nblocks, 32 - bitno);
4207 
4208 		mask = (0xffffffff << (32 - nb) >> bitno);
4209 		assert((mask & *dbmap) == 0);
4210 
4211 		dbmap++;
4212 		blkno += nb;
4213 		nblocks -= nb;
4214 	}
4215 }
4216 
4217 
4218 /*
4219  *	dbPrtMap()
4220  */
dbPrtMap(struct bmap * bmp)4221 static void dbPrtMap(struct bmap * bmp)
4222 {
4223 	printk("   mapsize:   %d%d\n", bmp->db_mapsize);
4224 	printk("   nfree:     %d%d\n", bmp->db_nfree);
4225 	printk("   numag:     %d\n", bmp->db_numag);
4226 	printk("   agsize:    %d%d\n", bmp->db_agsize);
4227 	printk("   agl2size:  %d\n", bmp->db_agl2size);
4228 	printk("   agwidth:   %d\n", bmp->db_agwidth);
4229 	printk("   agstart:   %d\n", bmp->db_agstart);
4230 	printk("   agheigth:  %d\n", bmp->db_agheigth);
4231 	printk("   aglevel:   %d\n", bmp->db_aglevel);
4232 	printk("   maxlevel:  %d\n", bmp->db_maxlevel);
4233 	printk("   maxag:     %d\n", bmp->db_maxag);
4234 	printk("   agpref:    %d\n", bmp->db_agpref);
4235 	printk("   l2nbppg:   %d\n", bmp->db_l2nbperpage);
4236 }
4237 
4238 
4239 /*
4240  *	dbPrtCtl()
4241  */
dbPrtCtl(struct dmapctl * dcp)4242 static void dbPrtCtl(struct dmapctl * dcp)
4243 {
4244 	int i, j, n;
4245 
4246 	printk("   height:    %08x\n", le32_to_cpu(dcp->height));
4247 	printk("   leafidx:   %08x\n", le32_to_cpu(dcp->leafidx));
4248 	printk("   budmin:    %08x\n", dcp->budmin);
4249 	printk("   nleafs:    %08x\n", le32_to_cpu(dcp->nleafs));
4250 	printk("   l2nleafs:  %08x\n", le32_to_cpu(dcp->l2nleafs));
4251 
4252 	printk("\n Tree:\n");
4253 	for (i = 0; i < CTLLEAFIND; i += 8) {
4254 		n = min(8, CTLLEAFIND - i);
4255 
4256 		for (j = 0; j < n; j++)
4257 			printf("  [%03x]: %02x", i + j,
4258 			       (char) dcp->stree[i + j]);
4259 		printf("\n");
4260 	}
4261 
4262 	printk("\n Tree Leaves:\n");
4263 	for (i = 0; i < LPERCTL; i += 8) {
4264 		n = min(8, LPERCTL - i);
4265 
4266 		for (j = 0; j < n; j++)
4267 			printf("  [%03x]: %02x",
4268 			       i + j,
4269 			       (char) dcp->stree[i + j + CTLLEAFIND]);
4270 		printf("\n");
4271 	}
4272 }
4273 #endif				/* _JFS_DEBUG_DMAP */
4274