1 /* 2 * Copyright (c) International Business Machines Corp., 2000-2003 3 * Portions Copyright (c) Christoph Hellwig, 2001-2002 4 * 5 * This program is free software; you can redistribute it and/or modify 6 * it under the terms of the GNU General Public License as published by 7 * the Free Software Foundation; either version 2 of the License, or 8 * (at your option) any later version. 9 * 10 * This program is distributed in the hope that it will be useful, 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See 13 * the GNU General Public License for more details. 14 * 15 * You should have received a copy of the GNU General Public License 16 * along with this program; if not, write to the Free Software 17 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 18 */ 19 #ifndef _H_JFS_LOGMGR 20 #define _H_JFS_LOGMGR 21 22 #include "jfs_filsys.h" 23 #include "jfs_lock.h" 24 25 /* 26 * log manager configuration parameters 27 */ 28 29 /* log page size */ 30 #define LOGPSIZE 4096 31 #define L2LOGPSIZE 12 32 33 #define LOGPAGES 16 /* Log pages per mounted file system */ 34 35 /* 36 * log logical volume 37 * 38 * a log is used to make the commit operation on journalled 39 * files within the same logical volume group atomic. 40 * a log is implemented with a logical volume. 41 * there is one log per logical volume group. 42 * 43 * block 0 of the log logical volume is not used (ipl etc). 44 * block 1 contains a log "superblock" and is used by logFormat(), 45 * lmLogInit(), lmLogShutdown(), and logRedo() to record status 46 * of the log but is not otherwise used during normal processing. 47 * blocks 2 - (N-1) are used to contain log records. 48 * 49 * when a volume group is varied-on-line, logRedo() must have 50 * been executed before the file systems (logical volumes) in 51 * the volume group can be mounted. 52 */ 53 /* 54 * log superblock (block 1 of logical volume) 55 */ 56 #define LOGSUPER_B 1 57 #define LOGSTART_B 2 58 59 #define LOGMAGIC 0x87654321 60 #define LOGVERSION 1 61 62 #define MAX_ACTIVE 128 /* Max active file systems sharing log */ 63 64 struct logsuper { 65 u32 magic; /* 4: log lv identifier */ 66 s32 version; /* 4: version number */ 67 s32 serial; /* 4: log open/mount counter */ 68 s32 size; /* 4: size in number of LOGPSIZE blocks */ 69 s32 bsize; /* 4: logical block size in byte */ 70 s32 l2bsize; /* 4: log2 of bsize */ 71 72 u32 flag; /* 4: option */ 73 u32 state; /* 4: state - see below */ 74 75 s32 end; /* 4: addr of last log record set by logredo */ 76 char uuid[16]; /* 16: 128-bit journal uuid */ 77 char label[16]; /* 16: journal label */ 78 struct { 79 char uuid[16]; 80 } active[MAX_ACTIVE]; /* 2048: active file systems list */ 81 }; 82 83 #define NULL_UUID "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" 84 85 /* log flag: commit option (see jfs_filsys.h) */ 86 87 /* log state */ 88 #define LOGMOUNT 0 /* log mounted by lmLogInit() */ 89 #define LOGREDONE 1 /* log shutdown by lmLogShutdown(). 90 * log redo completed by logredo(). 91 */ 92 #define LOGWRAP 2 /* log wrapped */ 93 #define LOGREADERR 3 /* log read error detected in logredo() */ 94 95 96 /* 97 * log logical page 98 * 99 * (this comment should be rewritten !) 100 * the header and trailer structures (h,t) will normally have 101 * the same page and eor value. 102 * An exception to this occurs when a complete page write is not 103 * accomplished on a power failure. Since the hardware may "split write" 104 * sectors in the page, any out of order sequence may occur during powerfail 105 * and needs to be recognized during log replay. The xor value is 106 * an "exclusive or" of all log words in the page up to eor. This 107 * 32 bit eor is stored with the top 16 bits in the header and the 108 * bottom 16 bits in the trailer. logredo can easily recognize pages 109 * that were not completed by reconstructing this eor and checking 110 * the log page. 111 * 112 * Previous versions of the operating system did not allow split 113 * writes and detected partially written records in logredo by 114 * ordering the updates to the header, trailer, and the move of data 115 * into the logdata area. The order: (1) data is moved (2) header 116 * is updated (3) trailer is updated. In logredo, when the header 117 * differed from the trailer, the header and trailer were reconciled 118 * as follows: if h.page != t.page they were set to the smaller of 119 * the two and h.eor and t.eor set to 8 (i.e. empty page). if (only) 120 * h.eor != t.eor they were set to the smaller of their two values. 121 */ 122 struct logpage { 123 struct { /* header */ 124 s32 page; /* 4: log sequence page number */ 125 s16 rsrvd; /* 2: */ 126 s16 eor; /* 2: end-of-log offset of lasrt record write */ 127 } h; 128 129 s32 data[LOGPSIZE / 4 - 4]; /* log record area */ 130 131 struct { /* trailer */ 132 s32 page; /* 4: normally the same as h.page */ 133 s16 rsrvd; /* 2: */ 134 s16 eor; /* 2: normally the same as h.eor */ 135 } t; 136 }; 137 138 #define LOGPHDRSIZE 8 /* log page header size */ 139 #define LOGPTLRSIZE 8 /* log page trailer size */ 140 141 142 /* 143 * log record 144 * 145 * (this comment should be rewritten !) 146 * jfs uses only "after" log records (only a single writer is allowed 147 * in a page, pages are written to temporary paging space if 148 * if they must be written to disk before commit, and i/o is 149 * scheduled for modified pages to their home location after 150 * the log records containing the after values and the commit 151 * record is written to the log on disk, undo discards the copy 152 * in main-memory.) 153 * 154 * a log record consists of a data area of variable length followed by 155 * a descriptor of fixed size LOGRDSIZE bytes. 156 * the data area is rounded up to an integral number of 4-bytes and 157 * must be no longer than LOGPSIZE. 158 * the descriptor is of size of multiple of 4-bytes and aligned on a 159 * 4-byte boundary. 160 * records are packed one after the other in the data area of log pages. 161 * (sometimes a DUMMY record is inserted so that at least one record ends 162 * on every page or the longest record is placed on at most two pages). 163 * the field eor in page header/trailer points to the byte following 164 * the last record on a page. 165 */ 166 167 /* log record types */ 168 #define LOG_COMMIT 0x8000 169 #define LOG_SYNCPT 0x4000 170 #define LOG_MOUNT 0x2000 171 #define LOG_REDOPAGE 0x0800 172 #define LOG_NOREDOPAGE 0x0080 173 #define LOG_NOREDOINOEXT 0x0040 174 #define LOG_UPDATEMAP 0x0008 175 #define LOG_NOREDOFILE 0x0001 176 177 /* REDOPAGE/NOREDOPAGE log record data type */ 178 #define LOG_INODE 0x0001 179 #define LOG_XTREE 0x0002 180 #define LOG_DTREE 0x0004 181 #define LOG_BTROOT 0x0010 182 #define LOG_EA 0x0020 183 #define LOG_ACL 0x0040 184 #define LOG_DATA 0x0080 185 #define LOG_NEW 0x0100 186 #define LOG_EXTEND 0x0200 187 #define LOG_RELOCATE 0x0400 188 #define LOG_DIR_XTREE 0x0800 /* Xtree is in directory inode */ 189 190 /* UPDATEMAP log record descriptor type */ 191 #define LOG_ALLOCXADLIST 0x0080 192 #define LOG_ALLOCPXDLIST 0x0040 193 #define LOG_ALLOCXAD 0x0020 194 #define LOG_ALLOCPXD 0x0010 195 #define LOG_FREEXADLIST 0x0008 196 #define LOG_FREEPXDLIST 0x0004 197 #define LOG_FREEXAD 0x0002 198 #define LOG_FREEPXD 0x0001 199 200 201 struct lrd { 202 /* 203 * type independent area 204 */ 205 s32 logtid; /* 4: log transaction identifier */ 206 s32 backchain; /* 4: ptr to prev record of same transaction */ 207 u16 type; /* 2: record type */ 208 s16 length; /* 2: length of data in record (in byte) */ 209 u32 aggregate; /* 4: file system lv/aggregate */ 210 /* (16) */ 211 212 /* 213 * type dependent area (20) 214 */ 215 union { 216 217 /* 218 * COMMIT: commit 219 * 220 * transaction commit: no type-dependent information; 221 */ 222 223 /* 224 * REDOPAGE: after-image 225 * 226 * apply after-image; 227 * 228 * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; 229 */ 230 struct { 231 u32 fileset; /* 4: fileset number */ 232 u32 inode; /* 4: inode number */ 233 u16 type; /* 2: REDOPAGE record type */ 234 s16 l2linesize; /* 2: log2 of line size */ 235 pxd_t pxd; /* 8: on-disk page pxd */ 236 } redopage; /* (20) */ 237 238 /* 239 * NOREDOPAGE: the page is freed 240 * 241 * do not apply after-image records which precede this record 242 * in the log with the same page block number to this page. 243 * 244 * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; 245 */ 246 struct { 247 s32 fileset; /* 4: fileset number */ 248 u32 inode; /* 4: inode number */ 249 u16 type; /* 2: NOREDOPAGE record type */ 250 s16 rsrvd; /* 2: reserved */ 251 pxd_t pxd; /* 8: on-disk page pxd */ 252 } noredopage; /* (20) */ 253 254 /* 255 * UPDATEMAP: update block allocation map 256 * 257 * either in-line PXD, 258 * or out-of-line XADLIST; 259 * 260 * N.B. REDOPAGE, NOREDOPAGE, and UPDATEMAP must be same format; 261 */ 262 struct { 263 u32 fileset; /* 4: fileset number */ 264 u32 inode; /* 4: inode number */ 265 u16 type; /* 2: UPDATEMAP record type */ 266 s16 nxd; /* 2: number of extents */ 267 pxd_t pxd; /* 8: pxd */ 268 } updatemap; /* (20) */ 269 270 /* 271 * NOREDOINOEXT: the inode extent is freed 272 * 273 * do not apply after-image records which precede this 274 * record in the log with the any of the 4 page block 275 * numbers in this inode extent. 276 * 277 * NOTE: The fileset and pxd fields MUST remain in 278 * the same fields in the REDOPAGE record format. 279 * 280 */ 281 struct { 282 s32 fileset; /* 4: fileset number */ 283 s32 iagnum; /* 4: IAG number */ 284 s32 inoext_idx; /* 4: inode extent index */ 285 pxd_t pxd; /* 8: on-disk page pxd */ 286 } noredoinoext; /* (20) */ 287 288 /* 289 * SYNCPT: log sync point 290 * 291 * replay log upto syncpt address specified; 292 */ 293 struct { 294 s32 sync; /* 4: syncpt address (0 = here) */ 295 } syncpt; 296 297 /* 298 * MOUNT: file system mount 299 * 300 * file system mount: no type-dependent information; 301 */ 302 303 /* 304 * ? FREEXTENT: free specified extent(s) 305 * 306 * free specified extent(s) from block allocation map 307 * N.B.: nextents should be length of data/sizeof(xad_t) 308 */ 309 struct { 310 s32 type; /* 4: FREEXTENT record type */ 311 s32 nextent; /* 4: number of extents */ 312 313 /* data: PXD or XAD list */ 314 } freextent; 315 316 /* 317 * ? NOREDOFILE: this file is freed 318 * 319 * do not apply records which precede this record in the log 320 * with the same inode number. 321 * 322 * NOREDILE must be the first to be written at commit 323 * (last to be read in logredo()) - it prevents 324 * replay of preceding updates of all preceding generations 325 * of the inumber esp. the on-disk inode itself, 326 * but does NOT prevent 327 * replay of the 328 */ 329 struct { 330 s32 fileset; /* 4: fileset number */ 331 u32 inode; /* 4: inode number */ 332 } noredofile; 333 334 /* 335 * ? NEWPAGE: 336 * 337 * metadata type dependent 338 */ 339 struct { 340 s32 fileset; /* 4: fileset number */ 341 u32 inode; /* 4: inode number */ 342 s32 type; /* 4: NEWPAGE record type */ 343 pxd_t pxd; /* 8: on-disk page pxd */ 344 } newpage; 345 346 /* 347 * ? DUMMY: filler 348 * 349 * no type-dependent information 350 */ 351 } log; 352 }; /* (36) */ 353 354 #define LOGRDSIZE (sizeof(struct lrd)) 355 356 /* 357 * line vector descriptor 358 */ 359 struct lvd { 360 s16 offset; 361 s16 length; 362 }; 363 364 365 /* 366 * log logical volume 367 */ 368 struct jfs_log { 369 370 struct super_block *sb; /* 4: This is used to sync metadata 371 * before writing syncpt. Will 372 * need to be a list if we share 373 * the log between fs's 374 */ 375 struct block_device *bdev; /* 4: log lv pointer */ 376 s32 serial; /* 4: log mount serial number */ 377 378 s64 base; /* @8: log extent address (inline log ) */ 379 int size; /* 4: log size in log page (in page) */ 380 int l2bsize; /* 4: log2 of bsize */ 381 382 long flag; /* 4: flag */ 383 384 struct lbuf *lbuf_free; /* 4: free lbufs */ 385 wait_queue_head_t free_wait; /* 4: */ 386 387 /* log write */ 388 int logtid; /* 4: log tid */ 389 int page; /* 4: page number of eol page */ 390 int eor; /* 4: eor of last record in eol page */ 391 struct lbuf *bp; /* 4: current log page buffer */ 392 393 struct semaphore loglock; /* 4: log write serialization lock */ 394 395 /* syncpt */ 396 int nextsync; /* 4: bytes to write before next syncpt */ 397 int active; /* 4: */ 398 wait_queue_head_t syncwait; /* 4: */ 399 400 /* commit */ 401 uint cflag; /* 4: */ 402 struct { /* 8: FIFO commit queue header */ 403 struct tblock *head; 404 struct tblock *tail; 405 } cqueue; 406 struct tblock *flush_tblk; /* tblk we're waiting on for flush */ 407 int gcrtc; /* 4: GC_READY transaction count */ 408 struct tblock *gclrt; /* 4: latest GC_READY transaction */ 409 spinlock_t gclock; /* 4: group commit lock */ 410 int logsize; /* 4: log data area size in byte */ 411 int lsn; /* 4: end-of-log */ 412 int clsn; /* 4: clsn */ 413 int syncpt; /* 4: addr of last syncpt record */ 414 int sync; /* 4: addr from last logsync() */ 415 struct list_head synclist; /* 8: logsynclist anchor */ 416 spinlock_t synclock; /* 4: synclist lock */ 417 struct lbuf *wqueue; /* 4: log pageout queue */ 418 int count; /* 4: count */ 419 char uuid[16]; /* 16: 128-bit uuid of log device */ 420 421 int no_integrity; /* flag to disable journaling to disk */ 422 int ni_page; /* backup of page for nointegrity option */ 423 int ni_eor; /* backup of eor for nointegrity option */ 424 }; 425 426 /* 427 * Log flag 428 */ 429 #define log_INLINELOG 1 430 #define log_SYNCBARRIER 2 431 #define log_QUIESCE 3 432 #define log_FLUSH 4 433 434 /* 435 * group commit flag 436 */ 437 /* jfs_log */ 438 #define logGC_PAGEOUT 0x00000001 439 440 /* tblock/lbuf */ 441 #define tblkGC_QUEUE 0x0001 442 #define tblkGC_READY 0x0002 443 #define tblkGC_COMMIT 0x0004 444 #define tblkGC_COMMITTED 0x0008 445 #define tblkGC_EOP 0x0010 446 #define tblkGC_FREE 0x0020 447 #define tblkGC_LEADER 0x0040 448 #define tblkGC_ERROR 0x0080 449 #define tblkGC_LAZY 0x0100 // D230860 450 #define tblkGC_UNLOCKED 0x0200 // D230860 451 452 /* 453 * log cache buffer header 454 */ 455 struct lbuf { 456 struct buffer_head l_bh; /* for doing I/O */ 457 struct jfs_log *l_log; /* 4: log associated with buffer */ 458 459 /* 460 * data buffer base area 461 */ 462 uint l_flag; /* 4: pageout control flags */ 463 464 struct lbuf *l_wqnext; /* 4: write queue link */ 465 struct lbuf *l_freelist; /* 4: freelistlink */ 466 467 int l_pn; /* 4: log page number */ 468 int l_eor; /* 4: log record eor */ 469 int l_ceor; /* 4: committed log record eor */ 470 471 s64 l_blkno; /* 8: log page block number */ 472 caddr_t l_ldata; /* 4: data page */ 473 474 wait_queue_head_t l_ioevent; /* 4: i/o done event */ 475 struct page *l_page; /* The page itself */ 476 }; 477 478 /* Reuse l_freelist for redrive list */ 479 #define l_redrive_next l_freelist 480 481 /* 482 * logsynclist block 483 * 484 * common logsyncblk prefix for jbuf_t and tblock 485 */ 486 struct logsyncblk { 487 u16 xflag; /* flags */ 488 u16 flag; /* only meaninful in tblock */ 489 lid_t lid; /* lock id */ 490 s32 lsn; /* log sequence number */ 491 struct list_head synclist; /* log sync list link */ 492 }; 493 494 /* 495 * logsynclist serialization (per log) 496 */ 497 498 #define LOGSYNC_LOCK_INIT(log) spin_lock_init(&(log)->synclock) 499 #define LOGSYNC_LOCK(log) spin_lock(&(log)->synclock) 500 #define LOGSYNC_UNLOCK(log) spin_unlock(&(log)->synclock) 501 502 /* compute the difference in bytes of lsn from sync point */ 503 #define logdiff(diff, lsn, log)\ 504 {\ 505 diff = (lsn) - (log)->syncpt;\ 506 if (diff < 0)\ 507 diff += (log)->logsize;\ 508 } 509 510 extern int lmLogOpen(struct super_block *sb, struct jfs_log ** log); 511 extern int lmLogClose(struct super_block *sb, struct jfs_log * log); 512 extern int lmLogShutdown(struct jfs_log * log); 513 extern int lmLogInit(struct jfs_log * log); 514 extern int lmLogFormat(struct jfs_log *log, s64 logAddress, int logSize); 515 extern void jfs_flush_journal(struct jfs_log * log, int wait); 516 517 #endif /* _H_JFS_LOGMGR */ 518