1 /*
2  * fs/logfs/logfs_abi.h
3  *
4  * As should be obvious for Linux kernel code, license is GPLv2
5  *
6  * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7  *
8  * Public header for logfs.
9  */
10 #ifndef FS_LOGFS_LOGFS_ABI_H
11 #define FS_LOGFS_LOGFS_ABI_H
12 
13 /* For out-of-kernel compiles */
14 #ifndef BUILD_BUG_ON
15 #define BUILD_BUG_ON(condition) /**/
16 #endif
17 
18 #define SIZE_CHECK(type, size)					\
19 static inline void check_##type(void)				\
20 {								\
21 	BUILD_BUG_ON(sizeof(struct type) != (size));		\
22 }
23 
24 /*
25  * Throughout the logfs code, we're constantly dealing with blocks at
26  * various positions or offsets.  To remove confusion, we stricly
27  * distinguish between a "position" - the logical position within a
28  * file and an "offset" - the physical location within the device.
29  *
30  * Any usage of the term offset for a logical location or position for
31  * a physical one is a bug and should get fixed.
32  */
33 
34 /*
35  * Block are allocated in one of several segments depending on their
36  * level.  The following levels are used:
37  *  0	- regular data block
38  *  1	- i1 indirect blocks
39  *  2	- i2 indirect blocks
40  *  3	- i3 indirect blocks
41  *  4	- i4 indirect blocks
42  *  5	- i5 indirect blocks
43  *  6	- ifile data blocks
44  *  7	- ifile i1 indirect blocks
45  *  8	- ifile i2 indirect blocks
46  *  9	- ifile i3 indirect blocks
47  * 10	- ifile i4 indirect blocks
48  * 11	- ifile i5 indirect blocks
49  * Potential levels to be used in the future:
50  * 12	- gc recycled blocks, long-lived data
51  * 13	- replacement blocks, short-lived data
52  *
53  * Levels 1-11 are necessary for robust gc operations and help separate
54  * short-lived metadata from longer-lived file data.  In the future,
55  * file data should get separated into several segments based on simple
56  * heuristics.  Old data recycled during gc operation is expected to be
57  * long-lived.  New data is of uncertain life expectancy.  New data
58  * used to replace older blocks in existing files is expected to be
59  * short-lived.
60  */
61 
62 
63 /* Magic numbers.  64bit for superblock, 32bit for statfs f_type */
64 #define LOGFS_MAGIC		0x7a3a8e5cb9d5bf67ull
65 #define LOGFS_MAGIC_U32		0xc97e8168u
66 
67 /*
68  * Various blocksize related macros.  Blocksize is currently fixed at 4KiB.
69  * Sooner or later that should become configurable and the macros replaced
70  * by something superblock-dependent.  Pointers in indirect blocks are and
71  * will remain 64bit.
72  *
73  * LOGFS_BLOCKSIZE	- self-explaining
74  * LOGFS_BLOCK_FACTOR	- number of pointers per indirect block
75  * LOGFS_BLOCK_BITS	- log2 of LOGFS_BLOCK_FACTOR, used for shifts
76  */
77 #define LOGFS_BLOCKSIZE		(4096ull)
78 #define LOGFS_BLOCK_FACTOR	(LOGFS_BLOCKSIZE / sizeof(u64))
79 #define LOGFS_BLOCK_BITS	(9)
80 
81 /*
82  * Number of blocks at various levels of indirection.  There are 16 direct
83  * block pointers plus a single indirect pointer.
84  */
85 #define I0_BLOCKS		(16)
86 #define I1_BLOCKS		LOGFS_BLOCK_FACTOR
87 #define I2_BLOCKS		(LOGFS_BLOCK_FACTOR * I1_BLOCKS)
88 #define I3_BLOCKS		(LOGFS_BLOCK_FACTOR * I2_BLOCKS)
89 #define I4_BLOCKS		(LOGFS_BLOCK_FACTOR * I3_BLOCKS)
90 #define I5_BLOCKS		(LOGFS_BLOCK_FACTOR * I4_BLOCKS)
91 
92 #define INDIRECT_INDEX		I0_BLOCKS
93 #define LOGFS_EMBEDDED_FIELDS	(I0_BLOCKS + 1)
94 
95 /*
96  * Sizes at which files require another level of indirection.  Files smaller
97  * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
98  * similar like ext2 fast symlinks.
99  *
100  * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
101  * direct pointers, else through the 1x indirect pointer and so forth.
102  */
103 #define LOGFS_EMBEDDED_SIZE	(LOGFS_EMBEDDED_FIELDS * sizeof(u64))
104 #define LOGFS_I0_SIZE		(I0_BLOCKS * LOGFS_BLOCKSIZE)
105 #define LOGFS_I1_SIZE		(I1_BLOCKS * LOGFS_BLOCKSIZE)
106 #define LOGFS_I2_SIZE		(I2_BLOCKS * LOGFS_BLOCKSIZE)
107 #define LOGFS_I3_SIZE		(I3_BLOCKS * LOGFS_BLOCKSIZE)
108 #define LOGFS_I4_SIZE		(I4_BLOCKS * LOGFS_BLOCKSIZE)
109 #define LOGFS_I5_SIZE		(I5_BLOCKS * LOGFS_BLOCKSIZE)
110 
111 /*
112  * Each indirect block pointer must have this flag set, if all block pointers
113  * behind it are set, i.e. there is no hole hidden in the shadow of this
114  * indirect block pointer.
115  */
116 #define LOGFS_FULLY_POPULATED (1ULL << 63)
117 #define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)
118 
119 /*
120  * LogFS needs to separate data into levels.  Each level is defined as the
121  * maximal possible distance from the master inode (inode of the inode file).
122  * Data blocks reside on level 0, 1x indirect block on level 1, etc.
123  * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
124  * This effort is necessary to guarantee garbage collection to always make
125  * progress.
126  *
127  * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
128  * LOGFS_MAX_LEVELS is one more for the actual data level of a file.  It is
129  * the maximal number of levels for one file.
130  * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
131  * effectively stacked on top of each other.
132  */
133 #define LOGFS_MAX_INDIRECT	(5)
134 #define LOGFS_MAX_LEVELS	(LOGFS_MAX_INDIRECT + 1)
135 #define LOGFS_NO_AREAS		(2 * LOGFS_MAX_LEVELS)
136 
137 /* Maximum size of filenames */
138 #define LOGFS_MAX_NAMELEN	(255)
139 
140 /* Number of segments in the primary journal. */
141 #define LOGFS_JOURNAL_SEGS	(16)
142 
143 /* Maximum number of free/erased/etc. segments in journal entries */
144 #define MAX_CACHED_SEGS		(64)
145 
146 
147 /*
148  * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store,
149  * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
150  * its header,
151  * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
152  * its segment header and the padded space at the end when no further objects
153  * fit.
154  */
155 #define LOGFS_OBJECT_HEADERSIZE	(0x1c)
156 #define LOGFS_SEGMENT_HEADERSIZE (0x18)
157 #define LOGFS_MAX_OBJECTSIZE	(LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
158 #define LOGFS_SEGMENT_RESERVE	\
159 	(LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
160 
161 /*
162  * Segment types:
163  * SEG_SUPER	- Data or indirect block
164  * SEG_JOURNAL	- Inode
165  * SEG_OSTORE	- Dentry
166  */
167 enum {
168 	SEG_SUPER	= 0x01,
169 	SEG_JOURNAL	= 0x02,
170 	SEG_OSTORE	= 0x03,
171 };
172 
173 /**
174  * struct logfs_segment_header - per-segment header in the ostore
175  *
176  * @crc:			crc32 of header (there is no data)
177  * @pad:			unused, must be 0
178  * @type:			segment type, see above
179  * @level:			GC level for all objects in this segment
180  * @segno:			segment number
181  * @ec:				erase count for this segment
182  * @gec:			global erase count at time of writing
183  */
184 struct logfs_segment_header {
185 	__be32	crc;
186 	__be16	pad;
187 	__u8	type;
188 	__u8	level;
189 	__be32	segno;
190 	__be32	ec;
191 	__be64	gec;
192 };
193 
194 SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
195 
196 #define LOGFS_FEATURES_INCOMPAT		(0ull)
197 #define LOGFS_FEATURES_RO_COMPAT	(0ull)
198 #define LOGFS_FEATURES_COMPAT		(0ull)
199 
200 /**
201  * struct logfs_disk_super - on-medium superblock
202  *
203  * @ds_magic:			magic number, must equal LOGFS_MAGIC
204  * @ds_crc:			crc32 of structure starting with the next field
205  * @ds_ifile_levels:		maximum number of levels for ifile
206  * @ds_iblock_levels:		maximum number of levels for regular files
207  * @ds_data_levels:		number of separate levels for data
208  * @pad0:			reserved, must be 0
209  * @ds_feature_incompat:	incompatible filesystem features
210  * @ds_feature_ro_compat:	read-only compatible filesystem features
211  * @ds_feature_compat:		compatible filesystem features
212  * @ds_flags:			flags
213  * @ds_segment_shift:		log2 of segment size
214  * @ds_block_shift:		log2 of block size
215  * @ds_write_shift:		log2 of write size
216  * @pad1:			reserved, must be 0
217  * @ds_journal_seg:		segments used by primary journal
218  * @ds_root_reserve:		bytes reserved for the superuser
219  * @ds_speed_reserve:		bytes reserved to speed up GC
220  * @ds_bad_seg_reserve:		number of segments reserved to handle bad blocks
221  * @pad2:			reserved, must be 0
222  * @pad3:			reserved, must be 0
223  *
224  * Contains only read-only fields.  Read-write fields like the amount of used
225  * space is tracked in the dynamic superblock, which is stored in the journal.
226  */
227 struct logfs_disk_super {
228 	struct logfs_segment_header ds_sh;
229 	__be64	ds_magic;
230 
231 	__be32	ds_crc;
232 	__u8	ds_ifile_levels;
233 	__u8	ds_iblock_levels;
234 	__u8	ds_data_levels;
235 	__u8	ds_segment_shift;
236 	__u8	ds_block_shift;
237 	__u8	ds_write_shift;
238 	__u8	pad0[6];
239 
240 	__be64	ds_filesystem_size;
241 	__be32	ds_segment_size;
242 	__be32  ds_bad_seg_reserve;
243 
244 	__be64	ds_feature_incompat;
245 	__be64	ds_feature_ro_compat;
246 
247 	__be64	ds_feature_compat;
248 	__be64	ds_feature_flags;
249 
250 	__be64	ds_root_reserve;
251 	__be64  ds_speed_reserve;
252 
253 	__be32	ds_journal_seg[LOGFS_JOURNAL_SEGS];
254 
255 	__be64	ds_super_ofs[2];
256 	__be64	pad3[8];
257 };
258 
259 SIZE_CHECK(logfs_disk_super, 256);
260 
261 /*
262  * Object types:
263  * OBJ_BLOCK	- Data or indirect block
264  * OBJ_INODE	- Inode
265  * OBJ_DENTRY	- Dentry
266  */
267 enum {
268 	OBJ_BLOCK	= 0x04,
269 	OBJ_INODE	= 0x05,
270 	OBJ_DENTRY	= 0x06,
271 };
272 
273 /**
274  * struct logfs_object_header - per-object header in the ostore
275  *
276  * @crc:			crc32 of header, excluding data_crc
277  * @len:			length of data
278  * @type:			object type, see above
279  * @compr:			compression type
280  * @ino:			inode number
281  * @bix:			block index
282  * @data_crc:			crc32 of payload
283  */
284 struct logfs_object_header {
285 	__be32	crc;
286 	__be16	len;
287 	__u8	type;
288 	__u8	compr;
289 	__be64	ino;
290 	__be64	bix;
291 	__be32	data_crc;
292 } __attribute__((packed));
293 
294 SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE);
295 
296 /*
297  * Reserved inode numbers:
298  * LOGFS_INO_MASTER	- master inode (for inode file)
299  * LOGFS_INO_ROOT	- root directory
300  * LOGFS_INO_SEGFILE	- per-segment used bytes and erase count
301  */
302 enum {
303 	LOGFS_INO_MAPPING	= 0x00,
304 	LOGFS_INO_MASTER	= 0x01,
305 	LOGFS_INO_ROOT		= 0x02,
306 	LOGFS_INO_SEGFILE	= 0x03,
307 	LOGFS_RESERVED_INOS	= 0x10,
308 };
309 
310 /*
311  * Inode flags.  High bits should never be written to the medium.  They are
312  * reserved for in-memory usage.
313  * Low bits should either remain in sync with the corresponding FS_*_FL or
314  * reuse slots that obviously don't make sense for logfs.
315  *
316  * LOGFS_IF_DIRTY	Inode must be written back
317  * LOGFS_IF_ZOMBIE	Inode has been deleted
318  * LOGFS_IF_STILLBORN	-ENOSPC happened when creating inode
319  */
320 #define LOGFS_IF_COMPRESSED	0x00000004 /* == FS_COMPR_FL */
321 #define LOGFS_IF_DIRTY		0x20000000
322 #define LOGFS_IF_ZOMBIE		0x40000000
323 #define LOGFS_IF_STILLBORN	0x80000000
324 
325 /* Flags available to chattr */
326 #define LOGFS_FL_USER_VISIBLE	(LOGFS_IF_COMPRESSED)
327 #define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED)
328 /* Flags inherited from parent directory on file/directory creation */
329 #define LOGFS_FL_INHERITED	(LOGFS_IF_COMPRESSED)
330 
331 /**
332  * struct logfs_disk_inode - on-medium inode
333  *
334  * @di_mode:			file mode
335  * @di_pad:			reserved, must be 0
336  * @di_flags:			inode flags, see above
337  * @di_uid:			user id
338  * @di_gid:			group id
339  * @di_ctime:			change time
340  * @di_mtime:			modify time
341  * @di_refcount:		reference count (aka nlink or link count)
342  * @di_generation:		inode generation, for nfs
343  * @di_used_bytes:		number of bytes used
344  * @di_size:			file size
345  * @di_data:			data pointers
346  */
347 struct logfs_disk_inode {
348 	__be16	di_mode;
349 	__u8	di_height;
350 	__u8	di_pad;
351 	__be32	di_flags;
352 	__be32	di_uid;
353 	__be32	di_gid;
354 
355 	__be64	di_ctime;
356 	__be64	di_mtime;
357 
358 	__be64	di_atime;
359 	__be32	di_refcount;
360 	__be32	di_generation;
361 
362 	__be64	di_used_bytes;
363 	__be64	di_size;
364 
365 	__be64	di_data[LOGFS_EMBEDDED_FIELDS];
366 };
367 
368 SIZE_CHECK(logfs_disk_inode, 200);
369 
370 #define INODE_POINTER_OFS \
371 	(offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
372 #define INODE_USED_OFS \
373 	(offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
374 #define INODE_SIZE_OFS \
375 	(offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
376 #define INODE_HEIGHT_OFS	(0)
377 
378 /**
379  * struct logfs_disk_dentry - on-medium dentry structure
380  *
381  * @ino:			inode number
382  * @namelen:			length of file name
383  * @type:			file type, identical to bits 12..15 of mode
384  * @name:			file name
385  */
386 /* FIXME: add 6 bytes of padding to remove the __packed */
387 struct logfs_disk_dentry {
388 	__be64	ino;
389 	__be16	namelen;
390 	__u8	type;
391 	__u8	name[LOGFS_MAX_NAMELEN];
392 } __attribute__((packed));
393 
394 SIZE_CHECK(logfs_disk_dentry, 266);
395 
396 #define RESERVED		0xffffffff
397 #define BADSEG			0xffffffff
398 /**
399  * struct logfs_segment_entry - segment file entry
400  *
401  * @ec_level:			erase count and level
402  * @valid:			number of valid bytes
403  *
404  * Segment file contains one entry for every segment.  ec_level contains the
405  * erasecount in the upper 28 bits and the level in the lower 4 bits.  An
406  * ec_level of BADSEG (-1) identifies bad segments.  valid contains the number
407  * of valid bytes or RESERVED (-1 again) if the segment is used for either the
408  * superblock or the journal, or when the segment is bad.
409  */
410 struct logfs_segment_entry {
411 	__be32	ec_level;
412 	__be32	valid;
413 };
414 
415 SIZE_CHECK(logfs_segment_entry, 8);
416 
417 /**
418  * struct logfs_journal_header - header for journal entries (JEs)
419  *
420  * @h_crc:			crc32 of journal entry
421  * @h_len:			length of compressed journal entry,
422  *				not including header
423  * @h_datalen:			length of uncompressed data
424  * @h_type:			JE type
425  * @h_compr:			compression type
426  * @h_pad:			reserved
427  */
428 struct logfs_journal_header {
429 	__be32	h_crc;
430 	__be16	h_len;
431 	__be16	h_datalen;
432 	__be16	h_type;
433 	__u8	h_compr;
434 	__u8	h_pad[5];
435 };
436 
437 SIZE_CHECK(logfs_journal_header, 16);
438 
439 /*
440  * Life expectency of data.
441  * VIM_DEFAULT		- default vim
442  * VIM_SEGFILE		- for segment file only - very short-living
443  * VIM_GC		- GC'd data - likely long-living
444  */
445 enum logfs_vim {
446 	VIM_DEFAULT	= 0,
447 	VIM_SEGFILE	= 1,
448 };
449 
450 /**
451  * struct logfs_je_area - wbuf header
452  *
453  * @segno:			segment number of area
454  * @used_bytes:			number of bytes already used
455  * @gc_level:			GC level
456  * @vim:			life expectancy of data
457  *
458  * "Areas" are segments currently being used for writing.  There is at least
459  * one area per GC level.  Several may be used to separate long-living from
460  * short-living data.  If an area with unknown vim is encountered, it can
461  * simply be closed.
462  * The write buffer immediately follow this header.
463  */
464 struct logfs_je_area {
465 	__be32	segno;
466 	__be32	used_bytes;
467 	__u8	gc_level;
468 	__u8	vim;
469 } __attribute__((packed));
470 
471 SIZE_CHECK(logfs_je_area, 10);
472 
473 #define MAX_JOURNAL_HEADER \
474 	(sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
475 
476 /**
477  * struct logfs_je_dynsb - dynamic superblock
478  *
479  * @ds_gec:			global erase count
480  * @ds_sweeper:			current position of GC "sweeper"
481  * @ds_rename_dir:		source directory ino (see dir.c documentation)
482  * @ds_rename_pos:		position of source dd (see dir.c documentation)
483  * @ds_victim_ino:		victims of incomplete dir operation (see dir.c)
484  * @ds_victim_ino:		parent inode of victim (see dir.c)
485  * @ds_used_bytes:		number of used bytes
486  */
487 struct logfs_je_dynsb {
488 	__be64	ds_gec;
489 	__be64	ds_sweeper;
490 
491 	__be64	ds_rename_dir;
492 	__be64	ds_rename_pos;
493 
494 	__be64	ds_victim_ino;
495 	__be64	ds_victim_parent; /* XXX */
496 
497 	__be64	ds_used_bytes;
498 	__be32	ds_generation;
499 	__be32	pad;
500 };
501 
502 SIZE_CHECK(logfs_je_dynsb, 64);
503 
504 /**
505  * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
506  *
507  * @da_size:			size of inode file
508  * @da_last_ino:		last created inode
509  * @da_used_bytes:		number of bytes used
510  * @da_data:			data pointers
511  */
512 struct logfs_je_anchor {
513 	__be64	da_size;
514 	__be64	da_last_ino;
515 
516 	__be64	da_used_bytes;
517 	u8	da_height;
518 	u8	pad[7];
519 
520 	__be64	da_data[LOGFS_EMBEDDED_FIELDS];
521 };
522 
523 SIZE_CHECK(logfs_je_anchor, 168);
524 
525 /**
526  * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
527  *
528  * @so_segment:			segments used for 2nd journal
529  *
530  * Length of the array is given by h_len field in the header.
531  */
532 struct logfs_je_spillout {
533 	__be64	so_segment[0];
534 };
535 
536 SIZE_CHECK(logfs_je_spillout, 0);
537 
538 /**
539  * struct logfs_je_journal_ec - erase counts for all journal segments
540  *
541  * @ec:				erase count
542  *
543  * Length of the array is given by h_len field in the header.
544  */
545 struct logfs_je_journal_ec {
546 	__be32	ec[0];
547 };
548 
549 SIZE_CHECK(logfs_je_journal_ec, 0);
550 
551 /**
552  * struct logfs_je_free_segments - list of free segmetns with erase count
553  */
554 struct logfs_je_free_segments {
555 	__be32	segno;
556 	__be32	ec;
557 };
558 
559 SIZE_CHECK(logfs_je_free_segments, 8);
560 
561 /**
562  * struct logfs_seg_alias - list of segment aliases
563  */
564 struct logfs_seg_alias {
565 	__be32	old_segno;
566 	__be32	new_segno;
567 };
568 
569 SIZE_CHECK(logfs_seg_alias, 8);
570 
571 /**
572  * struct logfs_obj_alias - list of object aliases
573  */
574 struct logfs_obj_alias {
575 	__be64	ino;
576 	__be64	bix;
577 	__be64	val;
578 	u8	level;
579 	u8	pad[5];
580 	__be16	child_no;
581 };
582 
583 SIZE_CHECK(logfs_obj_alias, 32);
584 
585 /**
586  * Compression types.
587  *
588  * COMPR_NONE	- uncompressed
589  * COMPR_ZLIB	- compressed with zlib
590  */
591 enum {
592 	COMPR_NONE	= 0,
593 	COMPR_ZLIB	= 1,
594 };
595 
596 /*
597  * Journal entries come in groups of 16.  First group contains unique
598  * entries, next groups contain one entry per level
599  *
600  * JE_FIRST	- smallest possible journal entry number
601  *
602  * JEG_BASE	- base group, containing unique entries
603  * JE_COMMIT	- commit entry, validates all previous entries
604  * JE_DYNSB	- dynamic superblock, anything that ought to be in the
605  *		  superblock but cannot because it is read-write data
606  * JE_ANCHOR	- anchor aka master inode aka inode file's inode
607  * JE_ERASECOUNT  erasecounts for all journal segments
608  * JE_SPILLOUT	- unused
609  * JE_SEG_ALIAS	- aliases segments
610  * JE_AREA	- area description
611  *
612  * JE_LAST	- largest possible journal entry number
613  */
614 enum {
615 	JE_FIRST	= 0x01,
616 
617 	JEG_BASE	= 0x00,
618 	JE_COMMIT	= 0x02,
619 	JE_DYNSB	= 0x03,
620 	JE_ANCHOR	= 0x04,
621 	JE_ERASECOUNT	= 0x05,
622 	JE_SPILLOUT	= 0x06,
623 	JE_OBJ_ALIAS	= 0x0d,
624 	JE_AREA		= 0x0e,
625 
626 	JE_LAST		= 0x0e,
627 };
628 
629 #endif
630