1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/fs.h>
6 #include <linux/magic.h>
7 #include <pthread.h>
8 #include <stddef.h>
9 #include <sys/mman.h>
10 #include <sys/statvfs.h>
11 #include <sys/uio.h>
12 #include <unistd.h>
13
14 #include "sd-event.h"
15
16 #include "alloc-util.h"
17 #include "chattr-util.h"
18 #include "compress.h"
19 #include "env-util.h"
20 #include "fd-util.h"
21 #include "format-util.h"
22 #include "fs-util.h"
23 #include "journal-authenticate.h"
24 #include "journal-def.h"
25 #include "journal-file.h"
26 #include "lookup3.h"
27 #include "memory-util.h"
28 #include "path-util.h"
29 #include "random-util.h"
30 #include "set.h"
31 #include "sort-util.h"
32 #include "stat-util.h"
33 #include "string-table.h"
34 #include "string-util.h"
35 #include "strv.h"
36 #include "sync-util.h"
37 #include "xattr-util.h"
38
/* Sizes in bytes of the hash tables set up for newly created files: the data hash table is never made
 * smaller than 2047 items, the field hash table always gets exactly 333 items. */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))

/* Default and smallest permitted compression threshold (presumably in bytes of payload; the users of
 * these two values are outside of this part of the file) */
#define DEFAULT_COMPRESS_THRESHOLD (512ULL)
#define MIN_COMPRESS_THRESHOLD (8ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (512 * 1024ULL) /* 512 KiB */

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define MAX_USE_LOWER (1 * 1024 * 1024ULL) /* 1 MiB */
#define MAX_USE_UPPER (4 * 1024 * 1024 * 1024ULL) /* 4 GiB */

/* These are the lower and upper bounds for the minimal use limit,
 * i.e. how much we'll use even if keep_free suggests otherwise. */
#define MIN_USE_LOW (1 * 1024 * 1024ULL) /* 1 MiB */
#define MIN_USE_HIGH (16 * 1024 * 1024ULL) /* 16 MiB */

/* This is the upper bound if we deduce max_size from max_use */
#define MAX_SIZE_UPPER (128 * 1024 * 1024ULL) /* 128 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define KEEP_FREE_UPPER (4 * 1024 * 1024 * 1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the file system
 * size */
#define DEFAULT_KEEP_FREE (1024 * 1024ULL) /* 1 MiB */

/* This is the default maximum number of journal files to keep around. */
#define DEFAULT_N_MAX_FILES 100

/* n_data was the first entry we added after the initial file format design, hence a valid header must
 * extend at least up to that field */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))

/* How many entries to keep in the entry array chain cache at max */
#define CHAIN_CACHE_MAX 20

/* How much to increase the journal file size at once each time we allocate something new. */
#define FILE_SIZE_INCREASE (8 * 1024 * 1024ULL) /* 8 MiB */

/* Reread fstat() of the file for detecting deletions at least this often */
#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)

/* The mmap context to use for the header: we pick the one right above the last defined object type */
#define CONTEXT_HEADER _OBJECT_TYPE_MAX

/* Longest hash chain to rotate after */
#define HASH_CHAIN_DEPTH_MAX 100

/* Only clang is told to stay quiet about taking the address of packed struct members */
#ifdef __clang__
#  pragma GCC diagnostic ignored "-Waddress-of-packed-member"
#endif
93
int journal_file_tail_end_by_pread(JournalFile *f, uint64_t *ret_offset) {
        uint64_t tail_offset, tail_size;
        Object tail;
        int r;

        assert(f);
        assert(f->header);
        assert(ret_offset);

        /* Determines the offset right behind the last object of the file and stores it in *ret_offset.
         * Counterpart of journal_file_tail_end_by_mmap(), but reads the tail object header via
         * journal_file_read_object_header() (i.e. pread()) instead of going through the mmap cache, and
         * thus is thread safe. Returns 0 on success, negative errno on failure, in which case
         * *ret_offset is left untouched. */

        tail_offset = le64toh(f->header->tail_object_offset);
        if (tail_offset == 0) {
                /* No tail object recorded: the end is whatever the header declares as its own size */
                *ret_offset = le64toh(f->header->header_size);
                return 0;
        }

        r = journal_file_read_object_header(f, OBJECT_UNUSED, tail_offset, &tail);
        if (r < 0)
                return r;

        /* Refuse sizes that would wrap around when rounded up to the next 64-bit boundary */
        tail_size = le64toh(tail.object.size);
        if (tail_size > UINT64_MAX - sizeof(uint64_t) + 1)
                return -EBADMSG;

        /* ... and offsets that would wrap around when the aligned size is added */
        tail_size = ALIGN64(tail_size);
        if (tail_offset > UINT64_MAX - tail_size)
                return -EBADMSG;

        *ret_offset = tail_offset + tail_size;
        return 0;
}
131
int journal_file_tail_end_by_mmap(JournalFile *f, uint64_t *ret_offset) {
        uint64_t tail_offset, tail_size;
        Object *tail;
        int r;

        assert(f);
        assert(f->header);
        assert(ret_offset);

        /* Determines the offset right behind the last object of the file and stores it in *ret_offset.
         * Counterpart of journal_file_tail_end_by_pread(), but looks at the tail object through the
         * usual mmap logic. Returns 0 on success, negative errno on failure, in which case *ret_offset
         * is left untouched. */

        tail_offset = le64toh(f->header->tail_object_offset);
        if (tail_offset == 0) {
                /* No tail object recorded: the end is whatever the header declares as its own size */
                *ret_offset = le64toh(f->header->header_size);
                return 0;
        }

        r = journal_file_move_to_object(f, OBJECT_UNUSED, tail_offset, &tail);
        if (r < 0)
                return r;

        /* Refuse sizes that would wrap around when rounded up to the next 64-bit boundary */
        tail_size = le64toh(READ_NOW(tail->object.size));
        if (tail_size > UINT64_MAX - sizeof(uint64_t) + 1)
                return -EBADMSG;

        /* ... and offsets that would wrap around when the aligned size is added */
        tail_size = ALIGN64(tail_size);
        if (tail_offset > UINT64_MAX - tail_size)
                return -EBADMSG;

        *ret_offset = tail_offset + tail_size;
        return 0;
}
168
/* Joins the offline thread of the file, unless the state already says it has been joined. Returns 0 on
 * success, the negated pthread_join() error on failure, or -EIO if the mmap cache reports a SIGBUS for
 * this file after the join. */
int journal_file_set_offline_thread_join(JournalFile *f) {
        int err;

        assert(f);

        if (f->offline_state != OFFLINE_JOINED) {
                /* pthread_join() reports failures as positive error numbers */
                err = pthread_join(f->offline_thread, NULL);
                if (err != 0)
                        return -err;

                f->offline_state = OFFLINE_JOINED;

                return mmap_cache_fd_got_sigbus(f->cache_fd) ? -EIO : 0;
        }

        return 0;
}
188
/* Marks the file as online in its header, first cancelling or waiting for any offlining that is still
 * in progress. Returns 0 on success, -EPERM if the file is not writable, -EINVAL if there is no fd or
 * header or the header state is neither online nor offline, -EIO after a SIGBUS on the mapping, or the
 * error of joining the offline thread. */
static int journal_file_set_online(JournalFile *f) {
        assert(f);

        if (!journal_file_writable(f))
                return -EPERM;

        if (f->fd < 0 || !f->header)
                return -EINVAL;

        /* The offline state may be changed concurrently, hence every transition is done via
         * compare-and-swap, and we simply look at the state again whenever a swap loses the race. */
        for (bool settled = false; !settled;) {
                switch (f->offline_state) {

                case OFFLINE_JOINED:
                        /* No offline thread, no need to wait. */
                        settled = true;
                        break;

                case OFFLINE_SYNCING:
                        /* If the swap succeeds we canceled syncing prior to offlining, no need to wait. */
                        settled = __sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL);
                        break;

                case OFFLINE_AGAIN_FROM_SYNCING:
                        /* If the swap succeeds we canceled the restart from syncing, no need to wait. */
                        settled = __sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL);
                        break;

                case OFFLINE_AGAIN_FROM_OFFLINING:
                        if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
                                break; /* lost the race, look at the state again */

                        /* Canceled restart from offlining, must wait for offlining to complete however. */
                        _fallthrough_;
                default: {
                        int r;

                        r = journal_file_set_offline_thread_join(f);
                        if (r < 0)
                                return r;

                        settled = true;
                        break;
                }
                }
        }

        if (mmap_cache_fd_got_sigbus(f->cache_fd))
                return -EIO;

        switch (f->header->state) {

        case STATE_OFFLINE:
                /* Flip the state and push it to disk; a failing fsync() is deliberately ignored */
                f->header->state = STATE_ONLINE;
                (void) fsync(f->fd);
                return 0;

        case STATE_ONLINE:
                /* Nothing to do */
                return 0;

        default:
                return -EINVAL;
        }
}
255
/* Releases all resources referenced by the JournalFile object and frees the object itself. Accepts NULL
 * and always returns NULL, so that callers can reset their pointer in the same statement. */
JournalFile* journal_file_close(JournalFile *f) {
        if (!f)
                return NULL;

        if (f->cache_fd)
                mmap_cache_fd_free(f->cache_fd);

        /* The fd is only closed if we were asked to do so */
        if (f->close_fd)
                safe_close(f->fd);

        free(f->path);
        ordered_hashmap_free_free(f->chain_cache);

#if HAVE_COMPRESSION
        free(f->compress_buffer);
#endif

#if HAVE_GCRYPT
        /* The FSPRG state is only freed on its own if there is no mapped FSS file to unmap instead */
        if (!f->fss_file)
                free(f->fsprg_state);
        else
                munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));

        free(f->fsprg_seed);

        if (f->hmac)
                gcry_md_close(f->hmac);
#endif

        free(f);
        return NULL;
}
287
/* Writes a freshly initialized header to the very beginning of f->fd. file_flags selects compression
 * and sealing; if template is given, the sequence number ID and the tail sequence number are taken over
 * from its header. Returns 0 on success, negative errno on failure (-EIO on a short write). */
static int journal_file_init_header(JournalFile *f, JournalFileFlags file_flags, JournalFile *template) {
        bool keyed_hash = true, seal = false;
        uint32_t incompatible = 0;
        Header h = {};
        ssize_t written;
        int r;

        assert(f);

        /* We turn on keyed hashes by default, but provide an environment variable to turn them off, if
         * people really want that */
        r = getenv_bool("SYSTEMD_JOURNAL_KEYED_HASH");
        if (r >= 0)
                keyed_hash = r;
        else if (r != -ENXIO)
                log_debug_errno(r, "Failed to parse $SYSTEMD_JOURNAL_KEYED_HASH environment variable, ignoring: %m");

#if HAVE_GCRYPT
        /* Try to load the FSPRG state, and if we can't, then just don't do sealing */
        seal = FLAGS_SET(file_flags, JOURNAL_SEAL) && journal_file_fss_load(f) >= 0;
#endif

        memcpy(h.signature, HEADER_SIGNATURE, 8);
        h.header_size = htole64(ALIGN64(sizeof(h)));

        if (FLAGS_SET(file_flags, JOURNAL_COMPRESS))
                incompatible |= COMPRESSION_TO_HEADER_INCOMPATIBLE_FLAG(DEFAULT_COMPRESSION);
        if (keyed_hash)
                incompatible |= HEADER_INCOMPATIBLE_KEYED_HASH;
        h.incompatible_flags |= htole32(incompatible);

        h.compatible_flags = htole32(seal ? HEADER_COMPATIBLE_SEALED : 0);

        r = sd_id128_randomize(&h.file_id);
        if (r < 0)
                return r;

        if (!template)
                /* Without a template the sequence number ID is the ID of the file itself */
                h.seqnum_id = h.file_id;
        else {
                h.seqnum_id = template->header->seqnum_id;
                h.tail_entry_seqnum = template->header->tail_entry_seqnum;
        }

        written = pwrite(f->fd, &h, sizeof(h), 0);
        if (written < 0)
                return -errno;

        return (size_t) written == sizeof(h) ? 0 : -EIO;
}
340
/* Stamps the current machine and boot ID into the header and switches the file online. Returns the
 * result of journal_file_set_online(), or an earlier negative errno if one of the IDs could not be
 * determined. */
static int journal_file_refresh_header(JournalFile *f) {
        int r, q;

        assert(f);
        assert(f->header);

        r = sd_id128_get_machine(&f->header->machine_id);
        if (r < 0) {
                if (!IN_SET(r, -ENOENT, -ENOMEDIUM))
                        return r;

                /* We don't have a machine-id, let's continue without */
                zero(f->header->machine_id);
        }

        r = sd_id128_get_boot(&f->header->boot_id);
        if (r < 0)
                return r;

        q = journal_file_set_online(f);

        /* Sync the online state to disk; likely just created a new file, also sync the directory this file
         * is located in. This happens even if going online failed, and its result is ignored. */
        (void) fsync_full(f->fd);

        return q;
}
366
warn_wrong_flags(const JournalFile * f,bool compatible)367 static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
368 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
369 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
370 const char *type = compatible ? "compatible" : "incompatible";
371 uint32_t flags;
372
373 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
374
375 if (flags & ~supported) {
376 if (flags & ~any)
377 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
378 f->path, type, flags & ~any);
379 flags = (flags & any) & ~supported;
380 if (flags) {
381 const char* strv[5];
382 size_t n = 0;
383 _cleanup_free_ char *t = NULL;
384
385 if (compatible) {
386 if (flags & HEADER_COMPATIBLE_SEALED)
387 strv[n++] = "sealed";
388 } else {
389 if (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ)
390 strv[n++] = "xz-compressed";
391 if (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4)
392 strv[n++] = "lz4-compressed";
393 if (flags & HEADER_INCOMPATIBLE_COMPRESSED_ZSTD)
394 strv[n++] = "zstd-compressed";
395 if (flags & HEADER_INCOMPATIBLE_KEYED_HASH)
396 strv[n++] = "keyed-hash";
397 }
398 strv[n] = NULL;
399 assert(n < ELEMENTSOF(strv));
400
401 t = strv_join((char**) strv, ", ");
402 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
403 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
404 }
405 return true;
406 }
407
408 return false;
409 }
410
/* Validates the mapped header of the file against the file size recorded in f->last_stat, and, if the
 * file is opened for writing, also checks that it may be appended to.
 *
 * Returns 0 if the header is acceptable, otherwise a negative errno:
 *   -EBADMSG          bad signature, state out of range, header too small, sealed file without
 *                     n_entry_arrays field, or (writable only) a zero-sized hash table
 *   -EPROTONOSUPPORT  unsupported incompatible flags, or (writable only) unsupported compatible flags
 *   -ENODATA          header/arena/tail offsets that do not fit the file or are not 64-bit aligned
 *   -EHOSTDOWN        (writable only) file belongs to a different machine ID
 *   -ESHUTDOWN        (writable only) file is already archived
 *   -EBUSY            (writable only) file is marked online or is in an unexpected state
 *   -ETXTBSY          (writable only) newest entry is newer than the current realtime clock
 * or whatever sd_id128_get_machine() fails with, except for an unset machine ID. */
static int journal_file_verify_header(JournalFile *f) {
        uint64_t arena_size, header_size;

        assert(f);
        assert(f->header);

        if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
                return -EBADMSG;

        /* In both read and write mode we refuse to open files with incompatible
         * flags we don't know. */
        if (warn_wrong_flags(f, false))
                return -EPROTONOSUPPORT;

        /* When open for writing we refuse to open files with compatible flags, too. */
        if (journal_file_writable(f) && warn_wrong_flags(f, true))
                return -EPROTONOSUPPORT;

        if (f->header->state >= _STATE_MAX)
                return -EBADMSG;

        header_size = le64toh(READ_NOW(f->header->header_size));

        /* The first addition was n_data, so check that we are at least this large */
        if (header_size < HEADER_SIZE_MIN)
                return -EBADMSG;

        if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
                return -EBADMSG;

        arena_size = le64toh(READ_NOW(f->header->arena_size));

        /* The first half of the check guards the addition in the second half against wrap-around */
        if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
                return -ENODATA;

        if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
                return -ENODATA;

        if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
            !VALID64(le64toh(f->header->field_hash_table_offset)) ||
            !VALID64(le64toh(f->header->tail_object_offset)) ||
            !VALID64(le64toh(f->header->entry_array_offset)))
                return -ENODATA;

        if (journal_file_writable(f)) {
                sd_id128_t machine_id;
                uint8_t state;
                int r;

                r = sd_id128_get_machine(&machine_id);
                if (IN_SET(r, -ENOENT, -ENOMEDIUM))
                        /* We don't have a machine-id. journal_file_refresh_header() stores an all-zero
                         * ID in that case, hence compare against the same value here, so that such a
                         * file can be opened for writing again. */
                        zero(machine_id);
                else if (r < 0)
                        return r;

                if (!sd_id128_equal(machine_id, f->header->machine_id))
                        return -EHOSTDOWN;

                state = f->header->state;

                if (state == STATE_ARCHIVED)
                        return -ESHUTDOWN; /* Already archived */
                else if (state == STATE_ONLINE)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
                                               "Journal file %s is already online. Assuming unclean closing.",
                                               f->path);
                else if (state != STATE_OFFLINE)
                        return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
                                               "Journal file %s has unknown state %i.",
                                               f->path, state);

                if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
                        return -EBADMSG;

                /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
                 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
                 * bisection. */
                if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME))
                        return log_debug_errno(SYNTHETIC_ERRNO(ETXTBSY),
                                               "Journal file %s is from the future, refusing to append new data to it that'd be older.",
                                               f->path);
        }

        return 0;
}
494
/* Refreshes f->last_stat and the timestamp of that refresh. Returns 0 on success, a negative errno if
 * fstat() fails or the file is not a regular file, and -EIDRM if the file has no links left. */
int journal_file_fstat(JournalFile *f) {
        int r;

        assert(f);
        assert(f->fd >= 0);

        r = fstat(f->fd, &f->last_stat);
        if (r < 0)
                return -errno;

        f->last_stat_usec = now(CLOCK_MONOTONIC);

        /* Refuse dealing with files that aren't regular */
        r = stat_verify_regular(&f->last_stat);
        if (r < 0)
                return r;

        /* Refuse appending to files that are already deleted */
        return f->last_stat.st_nlink <= 0 ? -EIDRM : 0;
}
517
/* Makes sure the file is large enough to hold `size` bytes at `offset`, growing it via
 * posix_fallocate_loop() if necessary and updating the arena size in the header accordingly.
 *
 * Returns 0 on success, otherwise a negative errno:
 *   -EINVAL   offset/size cannot be page-aligned without overflowing
 *   -EIO      the mmap cache saw a SIGBUS for this file
 *   -EBADMSG  header size plus arena size from the header overflow
 *   -E2BIG    growing would exceed metrics.max_size or eat into metrics.keep_free
 * or whatever journal_file_fstat() or posix_fallocate_loop() fail with. */
static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
        uint64_t old_size, new_size, old_header_size, old_arena_size;
        int r;

        assert(f);
        assert(f->header);

        /* We assume that this file is not sparse, and we know that for sure, since we always call
         * posix_fallocate() ourselves */

        /* Check the offset on its own first: if it were beyond the limit, the subtraction below would
         * wrap around and the size check would pass for ranges that overflow. */
        if (offset > PAGE_ALIGN_DOWN(UINT64_MAX) ||
            size > PAGE_ALIGN_DOWN(UINT64_MAX) - offset)
                return -EINVAL;

        if (mmap_cache_fd_got_sigbus(f->cache_fd))
                return -EIO;

        old_header_size = le64toh(READ_NOW(f->header->header_size));
        old_arena_size = le64toh(READ_NOW(f->header->arena_size));
        if (old_arena_size > PAGE_ALIGN_DOWN(UINT64_MAX) - old_header_size)
                return -EBADMSG;

        old_size = old_header_size + old_arena_size;

        new_size = MAX(PAGE_ALIGN(offset + size), old_header_size);

        if (new_size <= old_size) {

                /* We already pre-allocated enough space, but before we write to it, let's check with
                 * fstat() if the file got deleted, in order to make sure we don't throw away the data
                 * immediately. Don't check fstat() for all writes though, but only once every
                 * LAST_STAT_REFRESH_USEC. */

                if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
                        return 0;

                return journal_file_fstat(f);
        }

        /* Allocate more space. */

        if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
                return -E2BIG;

        if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
                struct statvfs svfs;

                /* If the file system cannot be queried the keep_free check is skipped */
                if (fstatvfs(f->fd, &svfs) >= 0) {
                        uint64_t available;

                        available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);

                        if (new_size - old_size > available)
                                return -E2BIG;
                }
        }

        /* Increase by larger blocks at once, but never beyond the configured maximum size */
        new_size = DIV_ROUND_UP(new_size, FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
        if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
                new_size = f->metrics.max_size;

        /* Note that the glibc fallocate() fallback is very
           inefficient, hence we try to minimize the allocation area
           as we can. */
        r = posix_fallocate_loop(f->fd, old_size, new_size - old_size);
        if (r < 0)
                return r;

        f->header->arena_size = htole64(new_size - old_header_size);

        /* Refresh the cached stat data, so that the new size is known to journal_file_move_to() */
        return journal_file_fstat(f);
}
591
/* Maps an object type to the mmap cache context to use for it: one context for each valid type, and
 * context 0 as catch-all for everything else. */
static unsigned type_to_context(ObjectType type) {
        assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
        assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);

        if (type <= OBJECT_UNUSED || type >= _OBJECT_TYPE_MAX)
                return 0;

        return (unsigned) type;
}
598
/* Obtains a pointer to `size` bytes at `offset` of the file from the mmap cache, using the cache
 * context that belongs to `type`, and stores it in *ret. Returns what mmap_cache_fd_get() returns, or
 * -EINVAL for a zero size, -EBADMSG if offset plus size overflows, -EADDRNOTAVAIL if the range lies
 * beyond the end of the file, or the error of refreshing the stat data. */
static int journal_file_move_to(
                JournalFile *f,
                ObjectType type,
                bool keep_always,
                uint64_t offset,
                uint64_t size,
                void **ret) {

        uint64_t end;
        int r;

        assert(f);
        assert(ret);

        if (size == 0)
                return -EINVAL;

        if (offset > UINT64_MAX - size)
                return -EBADMSG;

        end = offset + size;

        /* Avoid SIGBUS on invalid accesses */
        if (end > (uint64_t) f->last_stat.st_size) {
                /* Hmm, out of range? Our idea of the file size might be stale, hence refresh the
                 * fstat() data first, and only then trust the check. */
                r = journal_file_fstat(f);
                if (r < 0)
                        return r;

                if (end > (uint64_t) f->last_stat.st_size)
                        return -EADDRNOTAVAIL;
        }

        return mmap_cache_fd_get(f->cache_fd, type_to_context(type), keep_always, offset, size, &f->last_stat, ret);
}
633
/* Returns the smallest size an object of the type recorded in `o` may have: the size of the type's
 * fixed part for the types listed below, and the size of the generic object header for any other
 * type. */
static uint64_t minimum_header_size(Object *o) {

        static const uint64_t table[] = {
                [OBJECT_DATA]             = sizeof(DataObject),
                [OBJECT_FIELD]            = sizeof(FieldObject),
                [OBJECT_ENTRY]            = sizeof(EntryObject),
                [OBJECT_DATA_HASH_TABLE]  = sizeof(HashTableObject),
                [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
                [OBJECT_ENTRY_ARRAY]      = sizeof(EntryArrayObject),
                [OBJECT_TAG]              = sizeof(TagObject),
        };

        uint64_t sz = 0;

        if (o->object.type < ELEMENTSOF(table))
                sz = table[o->object.type];

        /* Slots not listed above are zero-initialized and get the generic size, too */
        return sz > 0 ? sz : sizeof(ObjectHeader);
}
651
652 /* Lightweight object checks. We want this to be fast, so that we won't
653 * slowdown every journal_file_move_to_object() call too much. */
journal_file_check_object(JournalFile * f,uint64_t offset,Object * o)654 static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
655 assert(f);
656 assert(o);
657
658 switch (o->object.type) {
659
660 case OBJECT_DATA:
661 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0))
662 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
663 "Bad n_entries: %" PRIu64 ": %" PRIu64,
664 le64toh(o->data.n_entries),
665 offset);
666
667 if (le64toh(o->object.size) <= offsetof(Object, data.payload))
668 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
669 "Bad object size (<= %zu): %" PRIu64 ": %" PRIu64,
670 offsetof(Object, data.payload),
671 le64toh(o->object.size),
672 offset);
673
674 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
675 !VALID64(le64toh(o->data.next_field_offset)) ||
676 !VALID64(le64toh(o->data.entry_offset)) ||
677 !VALID64(le64toh(o->data.entry_array_offset)))
678 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
679 "Invalid offset, next_hash_offset=" OFSfmt ", next_field_offset=" OFSfmt ", entry_offset=" OFSfmt ", entry_array_offset=" OFSfmt ": %" PRIu64,
680 le64toh(o->data.next_hash_offset),
681 le64toh(o->data.next_field_offset),
682 le64toh(o->data.entry_offset),
683 le64toh(o->data.entry_array_offset),
684 offset);
685
686 break;
687
688 case OBJECT_FIELD:
689 if (le64toh(o->object.size) <= offsetof(Object, field.payload))
690 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
691 "Bad field size (<= %zu): %" PRIu64 ": %" PRIu64,
692 offsetof(Object, field.payload),
693 le64toh(o->object.size),
694 offset);
695
696 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
697 !VALID64(le64toh(o->field.head_data_offset)))
698 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
699 "Invalid offset, next_hash_offset=" OFSfmt ", head_data_offset=" OFSfmt ": %" PRIu64,
700 le64toh(o->field.next_hash_offset),
701 le64toh(o->field.head_data_offset),
702 offset);
703 break;
704
705 case OBJECT_ENTRY: {
706 uint64_t sz;
707
708 sz = le64toh(READ_NOW(o->object.size));
709 if (sz < offsetof(Object, entry.items) ||
710 (sz - offsetof(Object, entry.items)) % sizeof(EntryItem) != 0)
711 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
712 "Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64,
713 offsetof(Object, entry.items),
714 sz,
715 offset);
716
717 if ((sz - offsetof(Object, entry.items)) / sizeof(EntryItem) <= 0)
718 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
719 "Invalid number items in entry: %" PRIu64 ": %" PRIu64,
720 (sz - offsetof(Object, entry.items)) / sizeof(EntryItem),
721 offset);
722
723 if (le64toh(o->entry.seqnum) <= 0)
724 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
725 "Invalid entry seqnum: %" PRIx64 ": %" PRIu64,
726 le64toh(o->entry.seqnum),
727 offset);
728
729 if (!VALID_REALTIME(le64toh(o->entry.realtime)))
730 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
731 "Invalid entry realtime timestamp: %" PRIu64 ": %" PRIu64,
732 le64toh(o->entry.realtime),
733 offset);
734
735 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic)))
736 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
737 "Invalid entry monotonic timestamp: %" PRIu64 ": %" PRIu64,
738 le64toh(o->entry.monotonic),
739 offset);
740
741 break;
742 }
743
744 case OBJECT_DATA_HASH_TABLE:
745 case OBJECT_FIELD_HASH_TABLE: {
746 uint64_t sz;
747
748 sz = le64toh(READ_NOW(o->object.size));
749 if (sz < offsetof(Object, hash_table.items) ||
750 (sz - offsetof(Object, hash_table.items)) % sizeof(HashItem) != 0 ||
751 (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem) <= 0)
752 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
753 "Invalid %s hash table size: %" PRIu64 ": %" PRIu64,
754 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
755 sz,
756 offset);
757
758 break;
759 }
760
761 case OBJECT_ENTRY_ARRAY: {
762 uint64_t sz;
763
764 sz = le64toh(READ_NOW(o->object.size));
765 if (sz < offsetof(Object, entry_array.items) ||
766 (sz - offsetof(Object, entry_array.items)) % sizeof(le64_t) != 0 ||
767 (sz - offsetof(Object, entry_array.items)) / sizeof(le64_t) <= 0)
768 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
769 "Invalid object entry array size: %" PRIu64 ": %" PRIu64,
770 sz,
771 offset);
772
773 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset)))
774 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
775 "Invalid object entry array next_entry_array_offset: " OFSfmt ": %" PRIu64,
776 le64toh(o->entry_array.next_entry_array_offset),
777 offset);
778
779 break;
780 }
781
782 case OBJECT_TAG:
783 if (le64toh(o->object.size) != sizeof(TagObject))
784 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
785 "Invalid object tag size: %" PRIu64 ": %" PRIu64,
786 le64toh(o->object.size),
787 offset);
788
789 if (!VALID_EPOCH(le64toh(o->tag.epoch)))
790 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
791 "Invalid object tag epoch: %" PRIu64 ": %" PRIu64,
792 le64toh(o->tag.epoch), offset);
793
794 break;
795 }
796
797 return 0;
798 }
799
/* Maps the object at `offset` through the mmap cache, validates it, and returns a pointer to it in
 * *ret (if ret is non-NULL). If `type` is a real object type, the object must be of that type.
 * Returns 0 on success, -EBADMSG if the offset or the object is invalid, or the error of mapping the
 * object. */
int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
        uint64_t size;
        Object *object;
        void *p;
        int r;

        assert(f);

        /* Objects may only be located at multiple of 64 bit */
        if (!VALID64(offset))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object at non-64bit boundary: %" PRIu64,
                                       offset);

        /* Object may not be located in the file header */
        if (offset < le64toh(f->header->header_size))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object located in file header: %" PRIu64,
                                       offset);

        /* First pass: map just the generic object header, to learn size and type */
        r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &p);
        if (r < 0)
                return r;

        object = (Object*) p;
        size = le64toh(READ_NOW(object->object.size));

        if (size == 0)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to uninitialized object: %" PRIu64,
                                       offset);

        if (size < sizeof(ObjectHeader))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to overly short object: %" PRIu64,
                                       offset);

        if (object->object.type <= OBJECT_UNUSED)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object with invalid type: %" PRIu64,
                                       offset);

        if (size < minimum_header_size(object))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to truncated object: %" PRIu64,
                                       offset);

        if (type > OBJECT_UNUSED && object->object.type != type)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to move to object of unexpected type: %" PRIu64,
                                       offset);

        /* Second pass: map the object in its full size, and use the pointer of that mapping from now on */
        r = journal_file_move_to(f, type, false, offset, size, &p);
        if (r < 0)
                return r;

        object = (Object*) p;

        r = journal_file_check_object(f, offset, object);
        if (r < 0)
                return r;

        if (ret)
                *ret = object;

        return 0;
}
866
/* Reads the object at `offset` with pread() into a local buffer, validates it, and copies it to *ret
 * (if ret is non-NULL). Only the fixed part of the object is guaranteed to have been read. If `type`
 * is a real object type, the object must be of that type. Returns 0 on success, -EBADMSG if the offset
 * or the object is invalid, -EIO on short reads, or the errno of pread(). */
int journal_file_read_object_header(JournalFile *f, ObjectType type, uint64_t offset, Object *ret) {
        uint64_t size;
        ssize_t n_read;
        Object o;
        int r;

        assert(f);

        /* Objects may only be located at multiple of 64 bit */
        if (!VALID64(offset))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read object at non-64bit boundary: %" PRIu64,
                                       offset);

        /* Object may not be located in the file header */
        if (offset < le64toh(f->header->header_size))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read object located in file header: %" PRIu64,
                                       offset);

        /* This will likely read too much data but it avoids having to call pread() twice. */
        n_read = pread(f->fd, &o, sizeof(o), offset);
        if (n_read < 0)
                return log_debug_errno(errno, "Failed to read journal file at offset: %" PRIu64,
                                       offset);

        /* We need at least the generic object header to look at size and type */
        if ((size_t) n_read < sizeof(o.object))
                return log_debug_errno(SYNTHETIC_ERRNO(EIO),
                                       "Failed to read short object at offset: %" PRIu64,
                                       offset);

        size = le64toh(o.object.size);

        if (size == 0)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read uninitialized object: %" PRIu64,
                                       offset);

        if (size < sizeof(o.object))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read overly short object: %" PRIu64,
                                       offset);

        if (o.object.type <= OBJECT_UNUSED)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read object with invalid type: %" PRIu64,
                                       offset);

        if (size < minimum_header_size(&o))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read truncated object: %" PRIu64,
                                       offset);

        /* The fixed part of this object type must have been read in full */
        if ((size_t) n_read < minimum_header_size(&o))
                return log_debug_errno(SYNTHETIC_ERRNO(EIO),
                                       "Short read while reading object: %" PRIu64,
                                       offset);

        if (type > OBJECT_UNUSED && o.object.type != type)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Attempt to read object of unexpected type: %" PRIu64,
                                       offset);

        r = journal_file_check_object(f, offset, &o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        return 0;
}
937
/* Picks the sequence number for the entry we are about to add, records it in the header as the new
 * tail (and as head, if none is set yet), and returns it. */
static uint64_t journal_file_entry_seqnum(
                JournalFile *f,
                uint64_t *seqnum) {

        uint64_t next;

        assert(f);
        assert(f->header);

        next = le64toh(f->header->tail_entry_seqnum) + 1;

        if (seqnum) {
                /* If an external seqnum counter was passed, we continue from whichever of the two
                 * counters is further ahead, and bring the external one up to date as well */
                next = MAX(next, *seqnum + 1);
                *seqnum = next;
        }

        f->header->tail_entry_seqnum = htole64(next);

        if (f->header->head_entry_seqnum == 0)
                f->header->head_entry_seqnum = htole64(next);

        return next;
}
968
/* Appends a new object of the given type and size at the tail end of the file: switches the file
 * online, allocates the space, writes the object header (everything but type and size zeroed), and
 * updates tail offset and object counter in the file header. The mapped object and its offset are
 * returned in *ret and *ret_offset, both of which are optional. Returns 0 on success, negative errno on
 * failure. */
int journal_file_append_object(
                JournalFile *f,
                ObjectType type,
                uint64_t size,
                Object **ret,
                uint64_t *ret_offset) {

        uint64_t where;
        Object *object;
        void *mapped;
        int r;

        assert(f);
        assert(f->header);
        assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
        assert(size >= sizeof(ObjectHeader));

        r = journal_file_set_online(f);
        if (r < 0)
                return r;

        /* The new object goes right behind the current tail object */
        r = journal_file_tail_end_by_mmap(f, &where);
        if (r < 0)
                return r;

        r = journal_file_allocate(f, where, size);
        if (r < 0)
                return r;

        r = journal_file_move_to(f, type, false, where, size, &mapped);
        if (r < 0)
                return r;

        object = (Object*) mapped;
        object->object = (ObjectHeader) {
                .type = type,
                .size = htole64(size),
        };

        f->header->tail_object_offset = htole64(where);
        f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);

        if (ret_offset)
                *ret_offset = where;

        if (ret)
                *ret = object;

        return 0;
}
1019
/* Appends a zeroed data hash table object to the file and records location and size of its item area
 * in the file header. Returns 0 on success, negative errno on failure. */
static int journal_file_setup_data_hash_table(JournalFile *f) {
        uint64_t table_size, object_offset;
        Object *table;
        int r;

        assert(f);
        assert(f->header);

        /* We estimate that we need 1 hash table entry per 768 bytes of journal file and we want to
         * make sure we never get beyond 75% fill level. Calculate the hash table size for the maximum
         * file size based on these metrics, but never go below the default size. */
        table_size = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
        if (table_size < DEFAULT_DATA_HASH_TABLE_SIZE)
                table_size = DEFAULT_DATA_HASH_TABLE_SIZE;

        log_debug("Reserving %"PRIu64" entries in data hash table.", table_size / sizeof(HashItem));

        r = journal_file_append_object(
                        f,
                        OBJECT_DATA_HASH_TABLE,
                        offsetof(Object, hash_table.items) + table_size,
                        &table,
                        &object_offset);
        if (r < 0)
                return r;

        memzero(table->hash_table.items, table_size);

        /* The header points at the item area, not at the beginning of the object */
        f->header->data_hash_table_offset = htole64(object_offset + offsetof(Object, hash_table.items));
        f->header->data_hash_table_size = htole64(table_size);

        return 0;
}
1053
/* Appends a zeroed field hash table object to the file and records location and size of its item area
 * in the file header. Returns 0 on success, negative errno on failure. */
static int journal_file_setup_field_hash_table(JournalFile *f) {
        /* We use a fixed size hash table for the fields as this number should grow very slowly only */
        uint64_t table_size = DEFAULT_FIELD_HASH_TABLE_SIZE, object_offset;
        Object *table;
        int r;

        assert(f);
        assert(f->header);

        log_debug("Reserving %"PRIu64" entries in field hash table.", table_size / sizeof(HashItem));

        r = journal_file_append_object(
                        f,
                        OBJECT_FIELD_HASH_TABLE,
                        offsetof(Object, hash_table.items) + table_size,
                        &table,
                        &object_offset);
        if (r < 0)
                return r;

        memzero(table->hash_table.items, table_size);

        /* The header points at the item area, not at the beginning of the object */
        f->header->field_hash_table_offset = htole64(object_offset + offsetof(Object, hash_table.items));
        f->header->field_hash_table_size = htole64(table_size);

        return 0;
}
1082
/* Makes sure f->data_hash_table points at the data hash table, using the offset and size stored in the
 * file header. Does nothing if the table pointer is already set. Returns 0 on success, or the negative
 * error returned by journal_file_move_to(). */
int journal_file_map_data_hash_table(JournalFile *f) {
        uint64_t table_offset, table_bytes;
        void *mapped;
        int r;

        assert(f);
        assert(f->header);

        if (!f->data_hash_table) {
                table_offset = le64toh(f->header->data_hash_table_offset);
                table_bytes = le64toh(f->header->data_hash_table_size);

                r = journal_file_move_to(
                                f,
                                OBJECT_DATA_HASH_TABLE,
                                true,
                                table_offset, table_bytes,
                                &mapped);
                if (r < 0)
                        return r;

                f->data_hash_table = mapped;
        }

        return 0;
}
1108
/* Makes sure f->field_hash_table points at the field hash table, using the offset and size stored in the
 * file header. Does nothing if the table pointer is already set. Returns 0 on success, or the negative
 * error returned by journal_file_move_to(). */
int journal_file_map_field_hash_table(JournalFile *f) {
        uint64_t table_offset, table_bytes;
        void *mapped;
        int r;

        assert(f);
        assert(f->header);

        if (!f->field_hash_table) {
                table_offset = le64toh(f->header->field_hash_table_offset);
                table_bytes = le64toh(f->header->field_hash_table_size);

                r = journal_file_move_to(
                                f,
                                OBJECT_FIELD_HASH_TABLE,
                                true,
                                table_offset, table_bytes,
                                &mapped);
                if (r < 0)
                        return r;

                f->field_hash_table = mapped;
        }

        return 0;
}
1134
/* Hooks the field object 'o', located at file offset 'offset', onto the end of the field hash table
 * bucket selected by 'hash', and bumps the header's n_fields counter if the header contains that field.
 *
 * Returns 0 on success, -EINVAL if 'o' is not a field object, -EBADMSG if the hash table has no buckets,
 * or the negative error returned by journal_file_move_to_object(). */
static int journal_file_link_field(
                JournalFile *f,
                Object *o,
                uint64_t offset,
                uint64_t hash) {

        uint64_t n_buckets, bucket, tail;
        int r;

        assert(f);
        assert(f->header);
        assert(f->field_hash_table);
        assert(o);
        assert(offset > 0);

        if (o->object.type != OBJECT_FIELD)
                return -EINVAL;

        n_buckets = le64toh(READ_NOW(f->header->field_hash_table_size)) / sizeof(HashItem);
        if (n_buckets == 0)
                return -EBADMSG;

        /* Reset the new object's link fields first: moving to the bucket's tail object below might
         * alter the window we are looking at. */
        o->field.head_data_offset = 0;
        o->field.next_hash_offset = o->field.head_data_offset;

        bucket = hash % n_buckets;
        tail = le64toh(f->field_hash_table[bucket].tail_hash_offset);
        if (tail != 0) {
                Object *previous;

                /* The bucket is populated already: chain the new object behind the current tail. */
                r = journal_file_move_to_object(f, OBJECT_FIELD, tail, &previous);
                if (r < 0)
                        return r;

                previous->field.next_hash_offset = htole64(offset);
        } else
                /* First object in this bucket, it becomes the head. */
                f->field_hash_table[bucket].head_hash_offset = htole64(offset);

        f->field_hash_table[bucket].tail_hash_offset = htole64(offset);

        if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
                f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);

        return 0;
}
1179
/* Hooks the data object 'o', located at file offset 'offset', onto the end of the data hash table bucket
 * selected by 'hash', and bumps the header's n_data counter if the header contains that field.
 *
 * Returns 0 on success, -EINVAL if 'o' is not a data object, -EBADMSG if the hash table has no buckets,
 * or the negative error returned by journal_file_move_to_object(). */
static int journal_file_link_data(
                JournalFile *f,
                Object *o,
                uint64_t offset,
                uint64_t hash) {

        uint64_t n_buckets, bucket, tail;
        int r;

        assert(f);
        assert(f->header);
        assert(f->data_hash_table);
        assert(o);
        assert(offset > 0);

        if (o->object.type != OBJECT_DATA)
                return -EINVAL;

        n_buckets = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem);
        if (n_buckets == 0)
                return -EBADMSG;

        /* Reset all link fields of the new object first: moving to the bucket's tail object below might
         * alter the window we are looking at. */
        o->data.next_hash_offset = 0;
        o->data.next_field_offset = 0;
        o->data.entry_offset = 0;
        o->data.entry_array_offset = 0;
        o->data.n_entries = 0;

        bucket = hash % n_buckets;
        tail = le64toh(f->data_hash_table[bucket].tail_hash_offset);
        if (tail != 0) {
                Object *previous;

                /* Move back to the previous data object of this bucket, to patch in the pointer to the
                 * new one. */
                r = journal_file_move_to_object(f, OBJECT_DATA, tail, &previous);
                if (r < 0)
                        return r;

                previous->data.next_hash_offset = htole64(offset);
        } else
                /* Being the only object in the bucket is the easy case. */
                f->data_hash_table[bucket].head_hash_offset = htole64(offset);

        f->data_hash_table[bucket].tail_hash_offset = htole64(offset);

        if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
                f->header->n_data = htole64(le64toh(f->header->n_data) + 1);

        return 0;
}
1230
/* Advances '*p' to the next object of a hash chain, as referenced by '*next_hash_offset'.
 *
 * '*p' is set to 0 when the end of the chain is reached. Otherwise '*depth' is incremented and, if
 * 'header_max_depth' is non-NULL and the file is writable, the maximum chain depth stored there is
 * raised to the new depth if that is larger.
 *
 * Returns 0 on success, -EBADMSG if the next offset is not strictly larger than '*p'. */
static int next_hash_offset(
                JournalFile *f,
                uint64_t *p,
                le64_t *next_hash_offset,
                uint64_t *depth,
                le64_t *header_max_depth) {

        uint64_t following = le64toh(READ_NOW(*next_hash_offset));

        if (following == 0) {
                /* End of the chain */
                *p = 0;
                return 0;
        }

        /* Chain offsets must strictly increase, otherwise we'd go in loops. */
        if (following <= *p)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "Detected hash item loop in %s, refusing.", f->path);

        (*depth)++;

        /* Record this chain's depth if it is larger than all others we have seen so far. */
        if (header_max_depth && journal_file_writable(f))
                *header_max_depth = htole64(MAX(*depth, le64toh(*header_max_depth)));

        *p = following;
        return 0;
}
1256
/* Looks up the field object whose payload equals the 'size' bytes at 'field', given its precomputed
 * 'hash'.
 *
 * Returns 1 if found (storing the object in '*ret' and its offset in '*ret_offset', where non-NULL),
 * 0 if not found or if the field hash table is empty, -EBADMSG if the table has no complete bucket, or
 * the negative error of one of the helpers called. */
int journal_file_find_field_object_with_hash(
                JournalFile *f,
                const void *field, uint64_t size, uint64_t hash,
                Object **ret, uint64_t *ret_offset) {

        uint64_t offset, wanted_size, n_buckets, chain_depth = 0;
        int r;

        assert(f);
        assert(f->header);
        assert(field && size > 0);

        /* Nothing can be found in an empty field hash table. */
        if (le64toh(f->header->field_hash_table_size) == 0)
                return 0;

        /* Make sure the field hash table is mapped. */
        r = journal_file_map_field_hash_table(f);
        if (r < 0)
                return r;

        wanted_size = offsetof(Object, field.payload) + size;

        n_buckets = le64toh(READ_NOW(f->header->field_hash_table_size)) / sizeof(HashItem);
        if (n_buckets == 0)
                return -EBADMSG;

        /* Walk the chain of the bucket the hash selects. */
        offset = le64toh(f->field_hash_table[hash % n_buckets].head_hash_offset);
        while (offset > 0) {
                Object *candidate;
                bool matches;

                r = journal_file_move_to_object(f, OBJECT_FIELD, offset, &candidate);
                if (r < 0)
                        return r;

                /* Cheap comparisons (hash, size) first, the payload is only compared if both agree. */
                matches = le64toh(candidate->field.hash) == hash &&
                          le64toh(candidate->object.size) == wanted_size &&
                          memcmp(candidate->field.payload, field, size) == 0;
                if (matches) {
                        if (ret)
                                *ret = candidate;
                        if (ret_offset)
                                *ret_offset = offset;

                        return 1;
                }

                r = next_hash_offset(
                                f,
                                &offset,
                                &candidate->field.next_hash_offset,
                                &chain_depth,
                                JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth) ? &f->header->field_hash_chain_depth : NULL);
                if (r < 0)
                        return r;
        }

        return 0;
}
1317
/* Returns the hash of the 'sz' bytes at 'data', using the hash function this journal file is set up for:
 * siphash keyed with the file ID if the header has the keyed-hash flag, the Jenkins hash otherwise. */
uint64_t journal_file_hash_data(
                JournalFile *f,
                const void *data,
                size_t sz) {

        assert(f);
        assert(data || sz == 0);

        /* We try to unify our codebase on siphash, hence new-style journal files (those with the keyed
         * hash flag) use siphash, while old journal files use the Jenkins hash. */
        return JOURNAL_HEADER_KEYED_HASH(f->header)
                ? siphash24(data, sz, f->header->file_id.bytes)
                : jenkins_hash64(data, sz);
}
1334
/* Convenience wrapper around journal_file_find_field_object_with_hash() that computes the hash of the
 * field name itself. Return value and output parameters are those of the wrapped function. */
int journal_file_find_field_object(
                JournalFile *f,
                const void *field, uint64_t size,
                Object **ret, uint64_t *ret_offset) {

        uint64_t hash;

        assert(f);
        assert(field && size > 0);

        hash = journal_file_hash_data(f, field, size);

        return journal_file_find_field_object_with_hash(f, field, size, hash, ret, ret_offset);
}
1349
/* Looks up the data object whose (uncompressed) payload equals the 'size' bytes at 'data', given its
 * precomputed 'hash'.
 *
 * Returns 1 if found (storing the object in '*ret' and its offset in '*ret_offset', where non-NULL),
 * 0 if not found or if the data hash table is empty, and a negative error otherwise: -EBADMSG for a hash
 * table without buckets or a compressed object without payload, -EPROTONOSUPPORT for a compression that
 * cannot be handled, or whatever the helpers called return. */
int journal_file_find_data_object_with_hash(
                JournalFile *f,
                const void *data, uint64_t size, uint64_t hash,
                Object **ret, uint64_t *ret_offset) {

        uint64_t p, osize, h, m, depth = 0;
        int r;

        assert(f);
        assert(f->header);
        assert(data || size == 0);

        /* If there's no data hash table, then there's no entry. */
        if (le64toh(f->header->data_hash_table_size) <= 0)
                return 0;

        /* Map the data hash table, if it isn't mapped yet. */
        r = journal_file_map_data_hash_table(f);
        if (r < 0)
                return r;

        /* Size an uncompressed data object carrying this payload would have */
        osize = offsetof(Object, data.payload) + size;

        /* Number of buckets in the hash table */
        m = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem);
        if (m <= 0)
                return -EBADMSG;

        /* Start at the head of the bucket selected by the hash */
        h = hash % m;
        p = le64toh(f->data_hash_table[h].head_hash_offset);

        while (p > 0) {
                Compression c;
                Object *o;

                r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
                if (r < 0)
                        return r;

                /* Different hash: cannot be our object, skip the payload comparison */
                if (le64toh(o->data.hash) != hash)
                        goto next;

                c = COMPRESSION_FROM_OBJECT(o);
                if (c < 0)
                        return -EPROTONOSUPPORT;
                if (c != COMPRESSION_NONE) {
#if HAVE_COMPRESSION
                        uint64_t l;
                        size_t rsize = 0;

                        /* Compressed object: decompress into the file's scratch buffer and compare
                         * the result with the data we are looking for. */
                        l = le64toh(READ_NOW(o->object.size));
                        if (l <= offsetof(Object, data.payload))
                                return -EBADMSG;

                        l -= offsetof(Object, data.payload);

                        r = decompress_blob(c, o->data.payload, l, &f->compress_buffer, &rsize, 0);
                        if (r < 0)
                                return r;

                        if (rsize == size &&
                            memcmp(f->compress_buffer, data, size) == 0) {

                                if (ret)
                                        *ret = o;

                                if (ret_offset)
                                        *ret_offset = p;

                                return 1;
                        }
#else
                        /* Built without compression support, we cannot look into this object */
                        return -EPROTONOSUPPORT;
#endif
                } else if (le64toh(o->object.size) == osize &&
                           memcmp(o->data.payload, data, size) == 0) {

                        /* Uncompressed object of matching size and content */

                        if (ret)
                                *ret = o;

                        if (ret_offset)
                                *ret_offset = p;

                        return 1;
                }

        next:
                /* No match: continue with the next object in this bucket's chain, keeping track of the
                 * chain depth on the way. */
                r = next_hash_offset(
                                f,
                                &p,
                                &o->data.next_hash_offset,
                                &depth,
                                JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth) ? &f->header->data_hash_chain_depth : NULL);
                if (r < 0)
                        return r;
        }

        return 0;
}
1448
/* Convenience wrapper around journal_file_find_data_object_with_hash() that computes the hash of the
 * data itself. Return value and output parameters are those of the wrapped function. */
int journal_file_find_data_object(
                JournalFile *f,
                const void *data, uint64_t size,
                Object **ret, uint64_t *ret_offset) {

        uint64_t hash;

        assert(f);
        assert(data || size == 0);

        hash = journal_file_hash_data(f, data, size);

        return journal_file_find_data_object_with_hash(f, data, size, hash, ret, ret_offset);
}
1463
/* Checks whether the first 'l' bytes of 'p' form a valid journal field name. If 'l' is SIZE_MAX the
 * length is determined with strlen(). Names starting with an underscore are only accepted when
 * 'allow_protected' is true.
 *
 * We kinda enforce the POSIX syntax recommendations for environment variables here, but make a couple of
 * additional requirements.
 *
 * http://pubs.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap08.html */
bool journal_field_valid(const char *p, size_t l, bool allow_protected) {
        size_t len = l == SIZE_MAX ? strlen(p) : l;

        /* Neither empty names nor names longer than 64 characters are allowed. */
        if (len == 0 || len > 64)
                return false;

        /* A leading underscore marks a protected variable. */
        if (p[0] == '_' && !allow_protected)
                return false;

        /* The first character must not be a digit. */
        if (p[0] >= '0' && p[0] <= '9')
                return false;

        /* Everything must be made up of A-Z, 0-9 and '_' only. */
        for (size_t i = 0; i < len; i++) {
                char c = p[i];
                bool is_upper = c >= 'A' && c <= 'Z';
                bool is_digit = c >= '0' && c <= '9';

                if (!is_upper && !is_digit && c != '_')
                        return false;
        }

        return true;
}
1499
/* Makes sure a field object for the field name given by the 'size' bytes at 'field' exists in the
 * journal file, appending and linking a new one if there is none yet.
 *
 * Returns 0 on success, storing the object in '*ret' and its offset in '*ret_offset' where non-NULL.
 * Returns -EBADMSG if the name is not a valid field name, or the negative error of one of the helpers
 * called. */
static int journal_file_append_field(
                JournalFile *f,
                const void *field, uint64_t size,
                Object **ret, uint64_t *ret_offset) {

        uint64_t field_hash, new_offset, object_size;
        Object *created;
        int r;

        assert(f);
        assert(field && size > 0);

        if (!journal_field_valid(field, size, true))
                return -EBADMSG;

        field_hash = journal_file_hash_data(f, field, size);

        /* If the field object exists already the lookup has filled in the return parameters and there
         * is nothing left to do. */
        r = journal_file_find_field_object_with_hash(f, field, size, field_hash, ret, ret_offset);
        if (r != 0)
                return r < 0 ? r : 0;

        object_size = offsetof(Object, field.payload) + size;
        r = journal_file_append_object(f, OBJECT_FIELD, object_size, &created, &new_offset);
        if (r < 0)
                return r;

        created->field.hash = htole64(field_hash);
        memcpy(created->field.payload, field, size);

        r = journal_file_link_field(f, created, new_offset, field_hash);
        if (r < 0)
                return r;

        /* The linking might have altered the window, hence from here on only the offset is used: the
         * hmac code and the final lookup move to the object again as needed. */

#if HAVE_GCRYPT
        r = journal_file_hmac_put_object(f, OBJECT_FIELD, NULL, new_offset);
        if (r < 0)
                return r;
#endif

        if (ret) {
                r = journal_file_move_to_object(f, OBJECT_FIELD, new_offset, ret);
                if (r < 0)
                        return r;
        }

        if (ret_offset)
                *ret_offset = new_offset;

        return 0;
}
1556
/* Makes sure a data object carrying the 'size' bytes at 'data' (a "FIELD=value" pair) exists in the
 * journal file. If there is none yet a new one is appended (compressed if that is enabled for the file
 * and the payload reaches the compression threshold), linked into the data hash table and put at the
 * head of the data object list of its field object, which is created too if necessary.
 *
 * Returns 0 on success, storing the object in '*ret' and its offset in '*ret_offset' where non-NULL.
 * Returns -EINVAL if there is no data or if it contains no '=', or the negative error of one of the
 * helpers called. */
static int journal_file_append_data(
                JournalFile *f,
                const void *data, uint64_t size,
                Object **ret, uint64_t *ret_offset) {

        uint64_t hash, p, osize;
        Object *o, *fo;
        int r, compression = COMPRESSION_NONE;
        const void *eq;

        assert(f);

        if (!data || size == 0)
                return -EINVAL;

        hash = journal_file_hash_data(f, data, size);

        /* If the data object exists already the lookup has filled in the return parameters and there
         * is nothing left to do. */
        r = journal_file_find_data_object_with_hash(f, data, size, hash, ret, ret_offset);
        if (r < 0)
                return r;
        if (r > 0)
                return 0;

        /* The field name is everything up to the first '=' */
        eq = memchr(data, '=', size);
        if (!eq)
                return -EINVAL;

        osize = offsetof(Object, data.payload) + size;
        r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
        if (r < 0)
                return r;

        o->data.hash = htole64(hash);

#if HAVE_COMPRESSION
        if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {
                size_t rsize = 0;

                /* The output space is limited to size - 1 bytes, so that compression only counts as
                 * successful if it actually makes the payload smaller. */
                compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
                if (compression > COMPRESSION_NONE) {
                        o->object.size = htole64(offsetof(Object, data.payload) + rsize);
                        o->object.flags |= COMPRESSION_TO_OBJECT_FLAG(compression);

                        log_debug("Compressed data object %"PRIu64" -> %zu using %s",
                                  size, rsize, compression_to_string(compression));
                } else
                        /* Compression didn't work, we don't really care why, let's continue without compression */
                        compression = COMPRESSION_NONE;
        }
#endif

        if (compression == COMPRESSION_NONE)
                memcpy_safe(o->data.payload, data, size);

        r = journal_file_link_data(f, o, p, hash);
        if (r < 0)
                return r;

        /* The linking might have altered the window, so let's refresh our pointer. */
        r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
        if (r < 0)
                return r;

#if HAVE_GCRYPT
        r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
        if (r < 0)
                return r;
#endif

        /* Create field object ... (its offset is of no interest here, only the object itself) */
        r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, NULL);
        if (r < 0)
                return r;

        /* ... and link it in: the new data object becomes the head of the field's list of data
         * objects. 'p' is in host byte order and the field is stored little-endian, hence it needs to
         * be converted with htole64(). */
        o->data.next_field_offset = fo->field.head_data_offset;
        fo->field.head_data_offset = htole64(p);

        if (ret)
                *ret = o;

        if (ret_offset)
                *ret_offset = p;

        return 0;
}
1643
/* Returns the number of items an entry object has room for, derived from its object size. Returns 0 if
 * the object is not an entry object, or if it is too small to carry any items. */
uint64_t journal_file_entry_n_items(Object *o) {
        uint64_t object_size;

        assert(o);

        if (o->object.type != OBJECT_ENTRY)
                return 0;

        object_size = le64toh(READ_NOW(o->object.size));

        return object_size < offsetof(Object, entry.items)
                ? 0
                : (object_size - offsetof(Object, entry.items)) / sizeof(EntryItem);
}
1657
/* Returns the number of items an entry array object has room for, derived from its object size. Returns
 * 0 if the object is not an entry array object, or if it is too small to carry any items. */
uint64_t journal_file_entry_array_n_items(Object *o) {
        uint64_t object_size;

        assert(o);

        if (o->object.type != OBJECT_ENTRY_ARRAY)
                return 0;

        object_size = le64toh(READ_NOW(o->object.size));

        return object_size < offsetof(Object, entry_array.items)
                ? 0
                : (object_size - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
}
1672
/* Returns the number of buckets a data or field hash table object has room for, derived from its object
 * size. Returns 0 if the object is not a hash table object, or if it is too small to carry any items. */
uint64_t journal_file_hash_table_n_items(Object *o) {
        uint64_t object_size;

        assert(o);

        if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
                return 0;

        object_size = le64toh(READ_NOW(o->object.size));

        return object_size < offsetof(Object, hash_table.items)
                ? 0
                : (object_size - offsetof(Object, hash_table.items)) / sizeof(HashItem);
}
1687
/* Stores the entry offset 'p' at index '*idx' of the chain of entry arrays starting at '*first', and
 * increments '*idx'. If the index lies beyond the arrays that exist so far a new entry array is appended
 * to the file and linked to the end of the chain (or installed as '*first' if the chain was empty).
 *
 * Returns 0 on success, or the negative error of one of the helpers called. */
static int link_entry_into_array(JournalFile *f,
                                 le64_t *first,
                                 le64_t *idx,
                                 uint64_t p) {

        uint64_t last_n_items = 0, previous = 0, current, slot, index, capacity, created;
        Object *o;
        int r;

        assert(f);
        assert(f->header);
        assert(first);
        assert(idx);
        assert(p > 0);

        current = le64toh(*first);
        slot = index = le64toh(READ_NOW(*idx));

        /* Walk the chain, translating the chain-wide index into a slot relative to the current array. */
        while (current > 0) {
                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, current, &o);
                if (r < 0)
                        return r;

                last_n_items = journal_file_entry_array_n_items(o);
                if (slot < last_n_items) {
                        /* There's room in an existing array */
                        o->entry_array.items[slot] = htole64(p);
                        *idx = htole64(index + 1);
                        return 0;
                }

                slot -= last_n_items;
                previous = current;
                current = le64toh(o->entry_array.next_entry_array_offset);
        }

        /* No room left: append a new array with twice the capacity of the last array of the chain, or
         * of twice the new number of entries if that is more, and at least four items. */
        capacity = index > last_n_items ? (index + 1) * 2 : last_n_items * 2;
        if (capacity < 4)
                capacity = 4;

        r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
                                       offsetof(Object, entry_array.items) + capacity * sizeof(uint64_t),
                                       &o, &created);
        if (r < 0)
                return r;

#if HAVE_GCRYPT
        r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, created);
        if (r < 0)
                return r;
#endif

        o->entry_array.items[slot] = htole64(p);

        if (previous != 0) {
                /* Hook the new array up behind the previous tail of the chain */
                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, previous, &o);
                if (r < 0)
                        return r;

                o->entry_array.next_entry_array_offset = htole64(created);
        } else
                /* The chain was empty so far, the new array becomes its start */
                *first = htole64(created);

        if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
                f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);

        *idx = htole64(index + 1);

        return 0;
}
1761
/* Like link_entry_into_array(), but for lists whose very first item is stored inline in '*extra' and
 * only the subsequent ones in the entry array chain starting at '*first'. '*idx' counts all items,
 * including the inline one, and is incremented on success.
 *
 * Returns 0 on success, -EBADMSG if the counter cannot be incremented anymore, or the negative error
 * returned by link_entry_into_array(). */
static int link_entry_into_array_plus_one(JournalFile *f,
                                          le64_t *extra,
                                          le64_t *first,
                                          le64_t *idx,
                                          uint64_t p) {

        uint64_t n_linked;
        int r;

        assert(f);
        assert(extra);
        assert(first);
        assert(idx);
        assert(p > 0);

        n_linked = le64toh(READ_NOW(*idx));
        if (n_linked == UINT64_MAX)
                return -EBADMSG;

        if (n_linked > 0) {
                /* The inline slot is taken already. The array chain's own index is one less than the
                 * overall counter, and is passed via a temporary so that the counter is only updated
                 * by us below. */
                le64_t array_index = htole64(n_linked - 1);

                r = link_entry_into_array(f, first, &array_index, p);
                if (r < 0)
                        return r;
        } else
                *extra = htole64(p);

        *idx = htole64(n_linked + 1);
        return 0;
}
1794
/* Registers the entry at file offset 'offset' with the data object referenced by item 'i' of the entry
 * object 'o'. Returns 0 on success, or the negative error of one of the helpers called. */
static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
        uint64_t data_offset;
        Object *data;
        int r;

        assert(f);
        assert(o);
        assert(offset > 0);

        data_offset = le64toh(o->entry.items[i].object_offset);

        r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &data);
        if (r < 0)
                return r;

        return link_entry_into_array_plus_one(
                        f,
                        &data->data.entry_offset,
                        &data->data.entry_array_offset,
                        &data->data.n_entries,
                        offset);
}
1814
/* Links the entry object 'o', located at file offset 'offset', into the file: into the global entry
 * array chain, into the header's timestamp bookkeeping, and into the entry list of every data object
 * the entry refers to.
 *
 * Returns the non-negative result of link_entry_into_array() on success, -EINVAL if 'o' is not an entry
 * object, -E2BIG if linking at least one item failed with that error, or the first other error hit. */
static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
        uint64_t n_items;
        int result;

        assert(f);
        assert(f->header);
        assert(o);
        assert(offset > 0);

        if (o->object.type != OBJECT_ENTRY)
                return -EINVAL;

        __sync_synchronize();

        /* First, link up the entry itself */
        result = link_entry_into_array(
                        f,
                        &f->header->entry_array_offset,
                        &f->header->n_entries,
                        offset);
        if (result < 0)
                return result;

        /* The first entry ever linked determines the head timestamp, every entry updates the tail
         * timestamps. */
        if (f->header->head_entry_realtime == 0)
                f->header->head_entry_realtime = o->entry.realtime;

        f->header->tail_entry_realtime = o->entry.realtime;
        f->header->tail_entry_monotonic = o->entry.monotonic;

        /* Then, link up the items */
        n_items = journal_file_entry_n_items(o);
        for (uint64_t i = 0; i < n_items; i++) {
                int k = journal_file_link_entry_item(f, o, offset, i);

                if (k >= 0)
                        continue;

                /* If we fail to link an entry item because we can't allocate a new entry array, don't
                 * fail immediately but try to link the other entry items, since that might still be
                 * possible if they don't require a new entry array to be allocated. */
                if (k != -E2BIG)
                        return k;

                result = k;
        }

        return result;
}
1863
/* Appends a new entry object referring to the 'n_items' data objects listed in 'items' and links it into
 * the file. If 'boot_id' is non-NULL it is stored in the file header first; the entry always takes over
 * the header's boot ID.
 *
 * Returns the non-negative result of journal_file_link_entry() on success, storing the object in '*ret'
 * and its offset in '*ret_offset' where non-NULL, or the negative error of one of the helpers called. */
static int journal_file_append_entry_internal(
                JournalFile *f,
                const dual_timestamp *ts,
                const sd_id128_t *boot_id,
                uint64_t xor_hash,
                const EntryItem items[], unsigned n_items,
                uint64_t *seqnum,
                Object **ret, uint64_t *ret_offset) {

        uint64_t entry_offset, entry_size;
        Object *eo;
        int r;

        assert(f);
        assert(f->header);
        assert(items || n_items == 0);
        assert(ts);

        entry_size = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));

        r = journal_file_append_object(f, OBJECT_ENTRY, entry_size, &eo, &entry_offset);
        if (r < 0)
                return r;

        /* Fill in the entry's metadata and item list */
        eo->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
        memcpy_safe(eo->entry.items, items, n_items * sizeof(EntryItem));
        eo->entry.realtime = htole64(ts->realtime);
        eo->entry.monotonic = htole64(ts->monotonic);
        eo->entry.xor_hash = htole64(xor_hash);

        if (boot_id)
                f->header->boot_id = *boot_id;
        eo->entry.boot_id = f->header->boot_id;

#if HAVE_GCRYPT
        r = journal_file_hmac_put_object(f, OBJECT_ENTRY, eo, entry_offset);
        if (r < 0)
                return r;
#endif

        r = journal_file_link_entry(f, eo, entry_offset);
        if (r < 0)
                return r;

        if (ret)
                *ret = eo;

        if (ret_offset)
                *ret_offset = entry_offset;

        return r;
}
1915
/* Notifies inotify watchers of the journal file about changes done to it. Does nothing if the file has
 * no valid file descriptor. A failure of the truncation is only logged at debug level. */
void journal_file_post_change(JournalFile *f) {
        assert(f);

        if (f->fd >= 0) {
                /* inotify() does not generate IN_MODIFY events for file accesses done via mmap().
                 * After each access we hence trigger IN_MODIFY ourselves, by truncating the journal
                 * file to its current size. */

                __sync_synchronize();

                if (ftruncate(f->fd, f->last_stat.st_size) < 0)
                        log_debug_errno(errno, "Failed to truncate file to its own size: %m");
        }
}
1932
/* Timer callback installed by journal_file_enable_post_change_timer(): posts the change for the journal
 * file passed as 'userdata'. Always returns 1. */
static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
        JournalFile *f = userdata;

        assert(userdata);

        journal_file_post_change(f);

        return 1;
}
1940
/* Arms the post-change timer of the journal file, unless it is armed already. If the event loop is
 * going down, or if anything goes wrong, the change is posted right away instead. */
static void schedule_post_change(JournalFile *f) {
        sd_event *event;
        int r;

        assert(f);
        assert(f->post_change_timer);

        assert_se(event = sd_event_source_get_event(f->post_change_timer));

        /* If we are already going down, post the change immediately. */
        if (IN_SET(sd_event_get_state(event), SD_EVENT_EXITING, SD_EVENT_FINISHED))
                goto post_now;

        r = sd_event_source_get_enabled(f->post_change_timer, NULL);
        if (r > 0)
                /* The timer is pending already, nothing to do. */
                return;
        if (r < 0) {
                log_debug_errno(r, "Failed to get ftruncate timer state: %m");
                goto post_now;
        }

        r = sd_event_source_set_time_relative(f->post_change_timer, f->post_change_timer_period);
        if (r < 0) {
                log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
                goto post_now;
        }

        r = sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_ONESHOT);
        if (r >= 0)
                return;

        log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");

post_now:
        /* On failure, let's simply post the change immediately. */
        journal_file_post_change(f);
}
1980
1981 /* Enable coalesced change posting in a timer on the provided sd_event instance */
journal_file_enable_post_change_timer(JournalFile * f,sd_event * e,usec_t t)1982 int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1983 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1984 int r;
1985
1986 assert(f);
1987 assert_return(!f->post_change_timer, -EINVAL);
1988 assert(e);
1989 assert(t);
1990
1991 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1992 if (r < 0)
1993 return r;
1994
1995 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1996 if (r < 0)
1997 return r;
1998
1999 f->post_change_timer = TAKE_PTR(timer);
2000 f->post_change_timer_period = t;
2001
2002 return r;
2003 }
2004
entry_item_cmp(const EntryItem * a,const EntryItem * b)2005 static int entry_item_cmp(const EntryItem *a, const EntryItem *b) {
2006 return CMP(le64toh(a->object_offset), le64toh(b->object_offset));
2007 }
2008
/* Drops, in place, every item that refers to the same object offset as the item kept before it, and
 * returns the number of items remaining. This function relies on the items array being sorted. */
static size_t remove_duplicate_entry_items(EntryItem items[], size_t n) {
        size_t n_kept;

        if (n <= 1)
                return n;

        /* The first item is always kept */
        n_kept = 1;

        for (size_t i = 1; i < n; i++) {
                if (items[i].object_offset == items[n_kept - 1].object_offset)
                        continue;

                items[n_kept++] = items[i];
        }

        return n_kept;
}
2023
/* Appends a new entry consisting of the 'n_iovec' fields in 'iovec' (each a "FIELD=value" pair) to the
 * journal file. A data object is appended for every field that is not in the file yet; then the entry
 * object referring to them is written and linked in, and finally the change is posted, either directly
 * or via the post-change timer if one is installed.
 *
 * If 'ts' is NULL the current time is used, otherwise the passed timestamps are validated first.
 * 'boot_id', 'seqnum', 'ret' and 'ret_offset' are passed on to journal_file_append_entry_internal().
 *
 * Returns the result of journal_file_append_entry_internal() on success, -EBADMSG for an invalid
 * timestamp, -EIO if the memory mapping triggered a SIGBUS, or the negative error of one of the helpers
 * called. */
int journal_file_append_entry(
                JournalFile *f,
                const dual_timestamp *ts,
                const sd_id128_t *boot_id,
                const struct iovec iovec[], unsigned n_iovec,
                uint64_t *seqnum,
                Object **ret, uint64_t *ret_offset) {

        EntryItem *items;
        int r;
        uint64_t xor_hash = 0;
        struct dual_timestamp _ts;

        assert(f);
        assert(f->header);
        assert(iovec && n_iovec > 0);

        if (ts) {
                if (!VALID_REALTIME(ts->realtime))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid realtime timestamp %" PRIu64 ", refusing entry.",
                                               ts->realtime);
                if (!VALID_MONOTONIC(ts->monotonic))
                        return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                               "Invalid monotonic timestamp %" PRIu64 ", refusing entry.",
                                               ts->monotonic);
        } else {
                dual_timestamp_get(&_ts);
                ts = &_ts;
        }

#if HAVE_GCRYPT
        r = journal_file_maybe_append_tag(f, ts->realtime);
        if (r < 0)
                return r;
#endif

        items = newa(EntryItem, n_iovec);

        for (size_t i = 0; i < n_iovec; i++) {
                uint64_t p;
                Object *o;

                r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
                if (r < 0)
                        return r;

                /* When calculating the XOR hash field, we need to take special care if the "keyed-hash"
                 * journal file flag is on. We use the XOR hash field to quickly determine the identity of a
                 * specific record, and give records with otherwise identical position (i.e. match in seqno,
                 * timestamp, …) a stable ordering. But for that we can't have it that the hash of the
                 * objects in each file is different since they are keyed. Hence let's calculate the Jenkins
                 * hash here for that. This also has the benefit that cursors for old and new journal files
                 * are completely identical (they include the XOR hash after all). For classic Jenkins-hash
                 * files things are easier, we can just take the value from the stored record directly. */

                if (JOURNAL_HEADER_KEYED_HASH(f->header))
                        xor_hash ^= jenkins_hash64(iovec[i].iov_base, iovec[i].iov_len);
                else
                        xor_hash ^= le64toh(o->data.hash);

                items[i] = (EntryItem) {
                        .object_offset = htole64(p),
                        .hash = o->data.hash,
                };
        }

        /* Order by the position on disk, in order to improve seek
         * times for rotating media. Sorting also places items referring to the same data object next
         * to each other, which the duplicate removal relies on. */
        typesafe_qsort(items, n_iovec, entry_item_cmp);
        n_iovec = remove_duplicate_entry_items(items, n_iovec);

        r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret, ret_offset);

        /* If the memory mapping triggered a SIGBUS then we return an
         * IO error and ignore the error code passed down to us, since
         * it is very likely just an effect of a nullified replacement
         * mapping page */

        if (mmap_cache_fd_got_sigbus(f->cache_fd))
                r = -EIO;

        /* Let watchers know about the change, coalesced via the timer if there is one */
        if (f->post_change_timer)
                schedule_post_change(f);
        else
                journal_file_post_change(f);

        return r;
}
2113
/* One cached position within a chain of entry arrays. chain_cache_put() keeps these in an ordered
 * hashmap, keyed by the 'first' member. */
typedef struct ChainCacheItem {
        uint64_t first; /* the array at the beginning of the chain */
        uint64_t array; /* the cached array */
        uint64_t begin; /* the first item in the cached array */
        uint64_t total; /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
2121
/* Stores a position within the entry array chain starting at 'first' in the chain cache 'h'. 'ci' is
 * the existing cache item of that chain, or NULL if there is none yet, in which case a new item is
 * allocated, or the first item of a full cache is recycled. Failures are silently ignored, in that case
 * simply nothing is cached. */
static void chain_cache_put(
                OrderedHashmap *h,
                ChainCacheItem *ci,
                uint64_t first,
                uint64_t array,
                uint64_t begin,
                uint64_t total,
                uint64_t last_index) {

        if (ci) {
                assert(ci->first == first);
        } else {
                /* If the array to cache is the first one of its chain it's not worth caching
                 * anything. */
                if (array == first)
                        return;

                if (ordered_hashmap_size(h) < CHAIN_CACHE_MAX) {
                        ci = new(ChainCacheItem, 1);
                        if (!ci)
                                return;
                } else {
                        /* The cache is full, recycle its first item */
                        ci = ordered_hashmap_steal_first(h);
                        assert(ci);
                }

                /* The key lives inside the item itself */
                ci->first = first;

                if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
                        free(ci);
                        return;
                }
        }

        ci->array = array;
        ci->begin = begin;
        ci->total = total;
        ci->last_index = last_index;
}
2160
/* Moves the index '*i' one step in the given direction within an array of 'n' items. Returns 1 if the
 * index was changed, 0 if it was left alone because it is at the respective end of the array already. */
static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
        assert(i);

        if (direction == DIRECTION_DOWN) {
                /* Going down means increasing the index, up to the last item */
                if (*i >= n - 1)
                        return 0;

                ++*i;
                return 1;
        }

        /* Going up means decreasing the index, down to the first item */
        if (*i == 0)
                return 0;

        --*i;
        return 1;
}
2180
/* Determines the entry array adjacent to the one at file offset 'offset', within the entry array chain
 * starting at 'first'.
 *
 * For DIRECTION_DOWN, 'o' must be the entry array object at 'offset', and the offset of its successor
 * is returned (0 if there is none). For any other direction 'o' is not used, and the offset of the
 * predecessor is returned (0 if 'offset' is the first array of the chain).
 *
 * The result is always delivered via '*ret'; the return value only carries the status: 0 on success,
 * -EBADMSG if 'offset' is not part of the chain, or the negative error returned by
 * journal_file_move_to_object(). */
static int bump_entry_array(JournalFile *f, Object *o, uint64_t offset, uint64_t first, direction_t direction, uint64_t *ret) {
        uint64_t p, q = 0;
        int r;

        assert(f);
        assert(offset);
        assert(ret);

        if (direction == DIRECTION_DOWN) {
                /* The successor is referenced directly by the array object we were handed. */
                assert(o);

                *ret = le64toh(o->entry_array.next_entry_array_offset);
                return 0;
        }

        /* Entry array chains are a singly linked list, so to find the previous array in the chain, we have
         * to start iterating from the top. */

        p = first;

        while (p > 0 && p != offset) {
                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, p, &o);
                if (r < 0)
                        return r;

                q = p;
                p = le64toh(o->entry_array.next_entry_array_offset);
        }

        /* If we can't find the previous entry array in the entry array chain, we're likely dealing with a
         * corrupted journal file. */
        if (p == 0)
                return -EBADMSG;

        *ret = q;

        return 0;
}
2215
/* Resolves item number 'i' of the entry array chain starting at offset 'first' to an entry object.
 *
 * If the entry at that position (or the entry array holding it) cannot be read because of -EBADMSG or
 * -EADDRNOTAVAIL, the function continues in the given direction until it finds an entry that can be mapped.
 *
 * Returns 1 if an entry was found (the object is passed to journal_file_move_to_object() via 'ret', its
 * offset is stored in *ret_offset if that is non-NULL), 0 if no usable entry exists in that direction, and
 * a negative errno-style value on other failures. */
static int generic_array_get(
                JournalFile *f,
                uint64_t first,
                uint64_t i,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        Object *o;
        /* p: offset of the candidate entry; a: offset of the current entry array; t: number of items in
         * the arrays already skipped; k: number of items in the current array */
        uint64_t p = 0, a, t = 0, k;
        int r;
        ChainCacheItem *ci;

        assert(f);

        a = first;

        /* Try the chain cache first */
        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && i > ci->total) {
                /* The wanted index lies beyond the items preceding the cached array, so start there and
                 * make i relative to it. */
                a = ci->array;
                i -= ci->total;
                t = ci->total;
        }

        /* Walk the chain until we reach the array that contains index i */
        while (a > 0) {
                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
                if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) {
                        /* If there's corruption and we're going downwards, let's pretend we reached the
                         * final entry in the entry array chain. */

                        if (direction == DIRECTION_DOWN)
                                return 0;

                        /* If there's corruption and we're going upwards, move back to the previous entry
                         * array and start iterating entries from there. */

                        r = bump_entry_array(f, NULL, a, first, DIRECTION_UP, &a);
                        if (r < 0)
                                return r;

                        /* UINT64_MAX tells the loop below to (re)load the array and pick a start index */
                        i = UINT64_MAX;

                        break;
                }
                if (r < 0)
                        return r;

                k = journal_file_entry_array_n_items(o);
                if (i < k)
                        break;

                i -= k;
                t += k;
                a = le64toh(o->entry_array.next_entry_array_offset);
        }

        /* If we've found the right location, now look for the first non-corrupt entry object (in the right
         * direction). */

        while (a > 0) {
                /* In the first iteration of the while loop, we reuse i, k and o from the previous while
                 * loop. */
                if (i == UINT64_MAX) {
                        r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
                        if (r < 0)
                                return r;

                        k = journal_file_entry_array_n_items(o);
                        if (k == 0)
                                break;

                        /* Start at the near end of the array, depending on the direction we walk */
                        i = direction == DIRECTION_DOWN ? 0 : k - 1;
                }

                do {
                        p = le64toh(o->entry_array.items[i]);

                        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret);
                        if (r >= 0) {
                                /* Let's cache this item for the next invocation */
                                chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);

                                if (ret_offset)
                                        *ret_offset = p;

                                return 1;
                        }
                        if (!IN_SET(r, -EADDRNOTAVAIL, -EBADMSG))
                                return r;

                        /* OK, so this entry is borked. Most likely some entry didn't get synced to
                         * disk properly, let's see if the next one might work for us instead. */
                        log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
                } while (bump_array_index(&i, direction, k) > 0);

                /* Nothing usable left in this array, continue with its neighbor in the chain */
                r = bump_entry_array(f, o, a, first, direction, &a);
                if (r < 0)
                        return r;

                t += k;
                i = UINT64_MAX;
        }

        return 0;
}
2321
/* Like generic_array_get(), but for lists that keep their first item outside of the entry array chain:
 * index 0 refers to the entry at offset 'extra', index i > 0 refers to item i-1 of the chain starting at
 * 'first'.
 *
 * If the extra entry cannot be mapped (-EADDRNOTAVAIL or -EBADMSG), the lookup falls back to item 0 of the
 * chain. Returns 1 if an entry was found, otherwise whatever generic_array_get() or
 * journal_file_move_to_object() reported. */
static int generic_array_get_plus_one(
                JournalFile *f,
                uint64_t extra,
                uint64_t first,
                uint64_t i,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        int rc;

        assert(f);

        /* Everything but the very first item lives in the array chain, shifted by one. */
        if (i > 0)
                return generic_array_get(f, first, i - 1, direction, ret, ret_offset);

        rc = journal_file_move_to_object(f, OBJECT_ENTRY, extra, ret);
        if (rc == -EADDRNOTAVAIL || rc == -EBADMSG)
                /* The inline entry is unusable, try our luck with the chain instead. */
                return generic_array_get(f, first, 0, direction, ret, ret_offset);
        if (rc < 0)
                return rc;

        if (ret_offset)
                *ret_offset = extra;

        return 1;
}
2349
/* Verdicts returned by the test_object() callbacks that drive the bisection helpers. They describe where
 * the tested object sits relative to the needle. */
enum {
        TEST_FOUND,     /* the tested object matches the needle */
        TEST_LEFT,      /* the tested object sorts before the needle, i.e. the match is further right */
        TEST_RIGHT      /* the tested object sorts after the needle, i.e. the match is further left */
};
2355
/* Bisects the first 'n' items of the entry array chain starting at offset 'first' for 'needle'.
 *
 * Returns 1 if an object was located (optionally mapping it into *ret, and storing its offset in
 * *ret_offset and its index within the chain in *ret_idx), 0 if there is no suitable object, and a negative
 * errno-style value on failure. */
static int generic_array_bisect(
                JournalFile *f,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *ret_offset,
                uint64_t *ret_idx) {

        /* Given an entry array chain, this function finds the object "closest" to the given needle in the
         * chain, taking into account the provided direction. A function can be provided to determine how
         * an object is matched against the given needle.
         *
         * Given a journal file, the offset of an object and the needle, the test_object() function should
         * return TEST_LEFT if the object is located before the needle (i.e. the needle is to be found
         * later in the entry array chain), TEST_RIGHT if the object is located after the needle (i.e. the
         * needle is to be found earlier in the entry array chain) and TEST_FOUND if the object matches
         * the needle.
         * If test_object() returns TEST_FOUND for a specific object, that object's information will be used
         * to populate the return values of this function. If test_object() never returns TEST_FOUND, the
         * return values are populated with the details of one of the objects closest to the needle. If the
         * direction is DIRECTION_UP, the earlier object is used. Otherwise, the later object is used.
         */

        /* a: offset of the current entry array; p: offset of the entry being tested; t: number of items in
         * the arrays preceding the current one; i: index within the current array; last_p: offset of the
         * last tested item of the previous array; last_index: index hint taken from the chain cache */
        uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = UINT64_MAX;
        bool subtract_one = false;
        Object *array = NULL;
        int r;
        ChainCacheItem *ci;

        assert(f);
        assert(test_object);

        /* Start with the first array in the chain */
        a = first;

        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && n > ci->total && ci->begin != 0) {
                /* Ah, we have iterated this bisection array chain
                 * previously! Let's see if we can skip ahead in the
                 * chain, as far as the last time. But we can't jump
                 * backwards in the chain, so let's check that
                 * first. */

                r = test_object(f, ci->begin, needle);
                if (r < 0)
                        return r;

                if (r == TEST_LEFT) {
                        /* OK, what we are looking for is right of the
                         * begin of this EntryArray, so let's jump
                         * straight to previously cached array in the
                         * chain */

                        a = ci->array;
                        n -= ci->total;
                        t = ci->total;
                        last_index = ci->last_index;
                }
        }

        while (a > 0) {
                uint64_t left, right, k, lp;

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
                if (r < 0)
                        return r;

                /* Never look at more items than the caller allowed (n counts the items still to cover) */
                k = journal_file_entry_array_n_items(array);
                right = MIN(k, n);
                if (right <= 0)
                        return 0;

                /* First test the last item of this array, to learn whether the needle can be in here at
                 * all or whether we have to move on to the next array. */
                i = right - 1;
                lp = p = le64toh(array->entry_array.items[i]);
                if (p <= 0)
                        r = -EBADMSG;
                else
                        r = test_object(f, p, needle);
                if (r == -EBADMSG) {
                        log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
                        /* Exclude the bad item and everything after it, then retry the same array */
                        n = i;
                        continue;
                }
                if (r < 0)
                        return r;

                /* On an exact match keep searching towards the first (DOWN) resp. last (UP) match */
                if (r == TEST_FOUND)
                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                if (r == TEST_RIGHT) {
                        /* The result is located within this array: bisect the range [left, right] */
                        left = 0;
                        right -= 1;

                        if (last_index != UINT64_MAX) {
                                assert(last_index <= right);

                                /* If we cached the last index we
                                 * looked at, let's try to not to jump
                                 * too wildly around and see if we can
                                 * limit the range to look at early to
                                 * the immediate neighbors of the last
                                 * index we looked at. */

                                if (last_index > 0) {
                                        uint64_t x = last_index - 1;

                                        p = le64toh(array->entry_array.items[x]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = x;
                                        else
                                                left = x + 1;
                                }

                                if (last_index < right) {
                                        uint64_t y = last_index + 1;

                                        p = le64toh(array->entry_array.items[y]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = y;
                                        else
                                                left = y + 1;
                                }
                        }

                        for (;;) {
                                if (left == right) {
                                        /* Range collapsed to one item. Going upwards, the item just
                                         * before it is the one wanted. */
                                        if (direction == DIRECTION_UP)
                                                subtract_one = true;

                                        i = left;
                                        goto found;
                                }

                                assert(left < right);
                                i = (left + right) / 2;

                                p = le64toh(array->entry_array.items[i]);
                                if (p <= 0)
                                        r = -EBADMSG;
                                else
                                        r = test_object(f, p, needle);
                                if (r == -EBADMSG) {
                                        log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
                                        /* Cut the range off right before the bad item */
                                        right = n = i;
                                        continue;
                                }
                                if (r < 0)
                                        return r;

                                if (r == TEST_FOUND)
                                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                if (r == TEST_RIGHT)
                                        right = i;
                                else
                                        left = i + 1;
                        }
                }

                /* Everything in this array sorts before the needle. If this array already covers all
                 * remaining items there is nothing further right to look at. */
                if (k >= n) {
                        if (direction == DIRECTION_UP) {
                                /* Going upwards, the last item covered is the closest one */
                                i = n;
                                subtract_one = true;
                                goto found;
                        }

                        return 0;
                }

                /* Remember the last item of this array, in case the result turns out to be the item right
                 * before the first item of the next array. */
                last_p = lp;

                n -= k;
                t += k;
                last_index = UINT64_MAX;
                a = le64toh(array->entry_array.next_entry_array_offset);
        }

        return 0;

found:
        /* There is no item before the very first item of the chain */
        if (subtract_one && t == 0 && i == 0)
                return 0;

        /* Let's cache this item for the next invocation */
        chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : UINT64_MAX) : i);

        if (subtract_one && i == 0)
                /* The wanted item is the last one of the previous array */
                p = last_p;
        else if (subtract_one)
                p = le64toh(array->entry_array.items[i-1]);
        else
                p = le64toh(array->entry_array.items[i]);

        if (ret) {
                r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret);
                if (r < 0)
                        return r;
        }

        if (ret_offset)
                *ret_offset = p;

        if (ret_idx)
                *ret_idx = t + i + (subtract_one ? -1 : 0);

        return 1;
}
2584
/* Like generic_array_bisect(), but for lists that keep their first item outside of the entry array chain:
 * the entry at offset 'extra' is item 0, followed by the n-1 items of the chain starting at 'first'.
 *
 * Returns 1 if an object was located (filling in *ret, *ret_offset and *ret_idx where non-NULL, with the
 * index counted including the extra item), 0 if there is none, and a negative errno-style value on
 * failure. */
static int generic_array_bisect_plus_one(
                JournalFile *f,
                uint64_t extra,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *ret_offset,
                uint64_t *ret_idx) {

        bool extra_is_fallback = false;
        int verdict, r;

        assert(f);
        assert(test_object);

        if (n == 0)
                return 0;

        /* Before bisecting the array chain, check the extra item that precedes it. */
        verdict = test_object(f, extra, needle);
        if (verdict < 0)
                return verdict;

        /* An exact match is treated like "keep looking" in the direction we are searching. */
        if (verdict == TEST_FOUND)
                verdict = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

        if (verdict == TEST_RIGHT) {
                /* The extra item already sorts at or after the needle: going down it is our result,
                 * going up there is nothing before it. */
                if (direction != DIRECTION_DOWN)
                        return 0;
        } else {
                /* When searching upwards, a matching item in the actual array takes precedence (we want
                 * the last one). Only if there is none do we come back to the extra item. */
                extra_is_fallback = verdict == TEST_LEFT && direction == DIRECTION_UP;

                r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, ret_offset, ret_idx);
                if (r != 0 || !extra_is_fallback) {
                        /* Account for the extra item sitting in front of the array. */
                        if (r > 0 && ret_idx)
                                (*ret_idx)++;

                        return r;
                }
        }

        /* The extra item is the result. */
        if (ret) {
                r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, ret);
                if (r < 0)
                        return r;
        }

        if (ret_offset)
                *ret_offset = extra;

        if (ret_idx)
                *ret_idx = 0;

        return 1;
}
2655
test_object_offset(JournalFile * f,uint64_t p,uint64_t needle)2656 _pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
2657 assert(f);
2658 assert(p > 0);
2659
2660 if (p == needle)
2661 return TEST_FOUND;
2662 else if (p < needle)
2663 return TEST_LEFT;
2664 else
2665 return TEST_RIGHT;
2666 }
2667
/* Bisects the file's main entry array chain (as referenced by the header) for the entry at offset 'p',
 * respectively the closest one in the given direction. The result of generic_array_bisect() is passed
 * through unchanged. */
int journal_file_move_to_entry_by_offset(
                JournalFile *f,
                uint64_t p,
                direction_t direction,
                Object **ret,
                uint64_t *ret_offset) {

        uint64_t chain, n_entries;

        assert(f);
        assert(f->header);

        chain = le64toh(f->header->entry_array_offset);
        n_entries = le64toh(f->header->n_entries);

        return generic_array_bisect(f, chain, n_entries, p, test_object_offset, direction, ret, ret_offset, NULL);
}
2687
/* test_object() callback comparing sequence numbers: maps the entry object at offset 'p' and reports where
 * its seqnum sits relative to 'needle'. Returns a negative errno-style value if the object cannot be
 * mapped. */
static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
        Object *entry;
        uint64_t seqnum;
        int r;

        assert(f);
        assert(p > 0);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &entry);
        if (r < 0)
                return r;

        seqnum = le64toh(READ_NOW(entry->entry.seqnum));

        if (seqnum < needle)
                return TEST_LEFT;

        if (seqnum > needle)
                return TEST_RIGHT;

        return TEST_FOUND;
}
2708
/* Bisects the file's main entry array chain for the entry with sequence number 'seqnum', respectively the
 * closest one in the given direction. The result of generic_array_bisect() is passed through unchanged. */
int journal_file_move_to_entry_by_seqnum(
                JournalFile *f,
                uint64_t seqnum,
                direction_t direction,
                Object **ret,
                uint64_t *ret_offset) {

        uint64_t chain, n_entries;

        assert(f);
        assert(f->header);

        chain = le64toh(f->header->entry_array_offset);
        n_entries = le64toh(f->header->n_entries);

        return generic_array_bisect(f, chain, n_entries, seqnum, test_object_seqnum, direction, ret, ret_offset, NULL);
}
2727
/* test_object() callback comparing realtime timestamps: maps the entry object at offset 'p' and reports
 * where its realtime field sits relative to 'needle'. Returns a negative errno-style value if the object
 * cannot be mapped. */
static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
        Object *entry;
        uint64_t realtime;
        int r;

        assert(f);
        assert(p > 0);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &entry);
        if (r < 0)
                return r;

        realtime = le64toh(READ_NOW(entry->entry.realtime));

        if (realtime < needle)
                return TEST_LEFT;

        if (realtime > needle)
                return TEST_RIGHT;

        return TEST_FOUND;
}
2748
/* Bisects the file's main entry array chain for the entry with realtime timestamp 'realtime', respectively
 * the closest one in the given direction. The result of generic_array_bisect() is passed through
 * unchanged. */
int journal_file_move_to_entry_by_realtime(
                JournalFile *f,
                uint64_t realtime,
                direction_t direction,
                Object **ret,
                uint64_t *ret_offset) {

        uint64_t chain, n_entries;

        assert(f);
        assert(f->header);

        chain = le64toh(f->header->entry_array_offset);
        n_entries = le64toh(f->header->n_entries);

        return generic_array_bisect(f, chain, n_entries, realtime, test_object_realtime, direction, ret, ret_offset, NULL);
}
2767
/* test_object() callback comparing monotonic timestamps: maps the entry object at offset 'p' and reports
 * where its monotonic field sits relative to 'needle'. Returns a negative errno-style value if the object
 * cannot be mapped. */
static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
        Object *entry;
        uint64_t monotonic;
        int r;

        assert(f);
        assert(p > 0);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &entry);
        if (r < 0)
                return r;

        monotonic = le64toh(READ_NOW(entry->entry.monotonic));

        if (monotonic < needle)
                return TEST_LEFT;

        if (monotonic > needle)
                return TEST_RIGHT;

        return TEST_FOUND;
}
2788
/* Looks up the data object for the field "_BOOT_ID=<boot_id>" via journal_file_find_data_object() and
 * passes its result, 'o' and 'b' through. */
static int find_data_object_by_boot_id(
                JournalFile *f,
                sd_id128_t boot_id,
                Object **o,
                uint64_t *b) {

        /* Field name, followed by room for the formatted ID (32 characters) and the trailing NUL. */
        char field[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";

        /* Append the formatted ID right behind the "_BOOT_ID=" prefix. */
        sd_id128_to_string(boot_id, field + STRLEN("_BOOT_ID="));

        /* The payload length excludes the trailing NUL. */
        return journal_file_find_data_object(f, field, sizeof(field) - 1, o, b);
}
2800
/* Seeks to the entry of boot 'boot_id' whose monotonic timestamp is 'monotonic', respectively the closest
 * one in the given direction, by bisecting the entry list of the matching _BOOT_ID= data object.
 *
 * Returns -ENOENT if the file has no data object for that boot ID, a negative errno-style value if the
 * lookup fails, otherwise the result of generic_array_bisect_plus_one(). */
int journal_file_move_to_entry_by_monotonic(
                JournalFile *f,
                sd_id128_t boot_id,
                uint64_t monotonic,
                direction_t direction,
                Object **ret,
                uint64_t *ret_offset) {

        Object *boot_data;
        int r;

        assert(f);

        r = find_data_object_by_boot_id(f, boot_id, &boot_data, NULL);
        if (r <= 0)
                return r < 0 ? r : -ENOENT;

        return generic_array_bisect_plus_one(
                        f,
                        le64toh(boot_data->data.entry_offset),
                        le64toh(boot_data->data.entry_array_offset),
                        le64toh(boot_data->data.n_entries),
                        monotonic,
                        test_object_monotonic,
                        direction,
                        ret, ret_offset, NULL);
}
2830
/* Resets the saved read position of the file: the location type becomes LOCATION_HEAD and all cached
 * "current entry" fields are cleared. */
void journal_file_reset_location(JournalFile *f) {
        /* Check the argument like the other entry points of this file do. */
        assert(f);

        f->location_type = LOCATION_HEAD;
        f->current_offset = 0;
        f->current_seqnum = 0;
        f->current_realtime = 0;
        f->current_monotonic = 0;
        zero(f->current_boot_id);
        f->current_xor_hash = 0;
}
2840
/* Remembers the entry object 'o' located at 'offset' as the current read position of the file: the location
 * type becomes LOCATION_SEEK and the entry's seqnum, timestamps, boot ID and xor hash are copied (converted
 * to host byte order where applicable) into the file object. */
void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
        /* Both pointers are dereferenced below; check them like the other entry points of this file do. */
        assert(f);
        assert(o);

        f->location_type = LOCATION_SEEK;
        f->current_offset = offset;
        f->current_seqnum = le64toh(o->entry.seqnum);
        f->current_realtime = le64toh(o->entry.realtime);
        f->current_monotonic = le64toh(o->entry.monotonic);
        f->current_boot_id = o->entry.boot_id;
        f->current_xor_hash = le64toh(o->entry.xor_hash);
}
2850
/* Orders the saved current locations of two journal files, both of which must be of type LOCATION_SEEK.
 *
 * Returns 0 if the two entries are considered identical, otherwise the CMP() result of the first criterion
 * that differs: seqnum (only if both files share the seqnum source), monotonic time (only if both entries
 * are from the same boot), realtime, and finally the xor hash of the contents. */
int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
        bool same_boot, same_seqnum_source;
        int r;

        assert(af);
        assert(af->header);
        assert(bf);
        assert(bf->header);
        assert(af->location_type == LOCATION_SEEK);
        assert(bf->location_type == LOCATION_SEEK);

        same_boot = sd_id128_equal(af->current_boot_id, bf->current_boot_id);
        same_seqnum_source = sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id);

        /* Same contents, same timestamps and same seqnum: this is one and the same entry. */
        if (same_boot &&
            af->current_monotonic == bf->current_monotonic &&
            af->current_realtime == bf->current_realtime &&
            af->current_xor_hash == bf->current_xor_hash &&
            same_seqnum_source &&
            af->current_seqnum == bf->current_seqnum)
                return 0;

        if (same_seqnum_source) {
                /* Sequence numbers from the same source are directly comparable. */
                r = CMP(af->current_seqnum, bf->current_seqnum);
                if (r != 0)
                        return r;

                /* Different entries carrying the same seqnum? That should not happen, but let's make
                 * the best of it and fall back to ordering by time. */
        }

        if (same_boot) {
                /* Monotonic timestamps are only comparable within the same boot. */
                r = CMP(af->current_monotonic, bf->current_monotonic);
                if (r != 0)
                        return r;
        }

        /* Next criterion: wallclock time. */
        r = CMP(af->current_realtime, bf->current_realtime);
        if (r != 0)
                return r;

        /* Last resort: order by contents. */
        return CMP(af->current_xor_hash, bf->current_xor_hash);
}
2900
/* Checks that stepping from 'old_offset' to 'new_offset' moved strictly in the requested direction: to a
 * larger offset for DIRECTION_DOWN, to a smaller one otherwise. An offset of 0 (uninitialized) on either
 * side never counts as properly ordered. */
static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {

        if (new_offset == 0 || old_offset == 0)
                return false;

        if (direction == DIRECTION_DOWN)
                return old_offset < new_offset;

        return old_offset > new_offset;
}
2912
/* Steps to the entry following (DIRECTION_DOWN) or preceding (otherwise) the entry at offset 'p' in the
 * file's main entry array chain. With p == 0 the first respectively last entry of the file is selected.
 *
 * Returns 1 if an entry was found (its offset is stored in *ret_offset if non-NULL), 0 if there is none,
 * -EBADMSG if the entry found is not ordered properly relative to 'p', and other negative errno-style
 * values on failure. */
int journal_file_next_entry(
                JournalFile *f,
                uint64_t p,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        uint64_t idx, n_entries, found;
        int r;

        assert(f);
        assert(f->header);

        n_entries = le64toh(READ_NOW(f->header->n_entries));
        if (n_entries == 0)
                return 0;

        if (p != 0) {
                /* Figure out the index of the entry we are currently at ... */
                r = generic_array_bisect(f,
                                         le64toh(f->header->entry_array_offset),
                                         le64toh(f->header->n_entries),
                                         p,
                                         test_object_offset,
                                         DIRECTION_DOWN,
                                         NULL, NULL,
                                         &idx);
                if (r <= 0)
                        return r;

                /* ... and move one step on from there. */
                r = bump_array_index(&idx, direction, n_entries);
                if (r <= 0)
                        return r;
        } else if (direction == DIRECTION_DOWN)
                idx = 0;
        else
                idx = n_entries - 1;

        /* Now resolve the index to an actual entry. */
        r = generic_array_get(f, le64toh(f->header->entry_array_offset), idx, direction, ret, &found);
        if (r <= 0)
                return r;

        /* If we came from a known position, verify that we really moved in the right direction. */
        if (p != 0 && !check_properly_ordered(found, p, direction))
                return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
                                       "%s: entry array not properly ordered at entry %" PRIu64,
                                       f->path, idx);

        if (ret_offset)
                *ret_offset = found;

        return 1;
}
2964
/* Selects the first (DIRECTION_DOWN) or last (otherwise) entry in the entry list of the data object 'd'.
 *
 * Returns 1 if an entry was found (its offset is stored in *ret_offset if non-NULL), 0 if the data object
 * has no entries or none could be used, and a negative errno-style value on failure. */
int journal_file_next_entry_for_data(
                JournalFile *f,
                Object *d,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        uint64_t idx, n_entries, found;
        int r;

        assert(f);
        assert(d);
        assert(d->object.type == OBJECT_DATA);

        n_entries = le64toh(READ_NOW(d->data.n_entries));
        if (n_entries == 0)
                return 0;

        if (direction == DIRECTION_DOWN)
                idx = 0;
        else
                idx = n_entries - 1;

        r = generic_array_get_plus_one(f,
                                       le64toh(d->data.entry_offset),
                                       le64toh(d->data.entry_array_offset),
                                       idx,
                                       direction,
                                       ret, &found);
        if (r <= 0)
                return r;

        if (ret_offset)
                *ret_offset = found;

        return 1;
}
2998
/* Bisects the entry list of the data object 'd' for the entry at offset 'p', respectively the closest one
 * in the given direction. The result of generic_array_bisect_plus_one() is passed through unchanged. */
int journal_file_move_to_entry_by_offset_for_data(
                JournalFile *f,
                Object *d,
                uint64_t p,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        uint64_t extra, chain, n_entries;

        assert(f);
        assert(d);
        assert(d->object.type == OBJECT_DATA);

        extra = le64toh(d->data.entry_offset);
        chain = le64toh(d->data.entry_array_offset);
        n_entries = le64toh(d->data.n_entries);

        return generic_array_bisect_plus_one(f, extra, chain, n_entries, p, test_object_offset, direction, ret, ret_offset, NULL);
}
3020
/* Seeks to an entry that is listed both by the data object 'd' and by the _BOOT_ID= data object of
 * 'boot_id', starting the search at the monotonic timestamp 'monotonic' and moving in the given direction.
 *
 * Returns 1 if such an entry was found (optionally mapping it into *ret and storing its offset in
 * *ret_offset), 0 if there is none, -ENOENT if the file has no data object for the boot ID, and other
 * negative errno-style values on failure. */
int journal_file_move_to_entry_by_monotonic_for_data(
                JournalFile *f,
                Object *d,
                sd_id128_t boot_id,
                uint64_t monotonic,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        Object *o;
        int r;
        /* b: offset of the boot ID data object; z: entry offset the next search round starts from */
        uint64_t b, z, entry_offset, entry_array_offset, n_entries;

        assert(f);
        assert(d);
        assert(d->object.type == OBJECT_DATA);

        /* Save all the required data before the data object gets invalidated. */
        entry_offset = le64toh(READ_NOW(d->data.entry_offset));
        entry_array_offset = le64toh(READ_NOW(d->data.entry_array_offset));
        n_entries = le64toh(READ_NOW(d->data.n_entries));

        /* First, seek by time */
        r = find_data_object_by_boot_id(f, boot_id, &o, &b);
        if (r < 0)
                return r;
        if (r == 0)
                return -ENOENT;

        /* Find the entry of that boot closest to the requested monotonic timestamp; only its offset (z)
         * is needed here. */
        r = generic_array_bisect_plus_one(f,
                                          le64toh(o->data.entry_offset),
                                          le64toh(o->data.entry_array_offset),
                                          le64toh(o->data.n_entries),
                                          monotonic,
                                          test_object_monotonic,
                                          direction,
                                          NULL, &z, NULL);
        if (r <= 0)
                return r;

        /* And now, continue seeking until we find an entry that
         * exists in both bisection arrays */

        /* Map the boot ID data object again, it is dereferenced in every round of the loop below */
        r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
        if (r < 0)
                return r;

        for (;;) {
                uint64_t p, q;

                /* Closest entry to z in the entry list of 'd' (using the values saved above) ... */
                r = generic_array_bisect_plus_one(f,
                                                  entry_offset,
                                                  entry_array_offset,
                                                  n_entries,
                                                  z,
                                                  test_object_offset,
                                                  direction,
                                                  NULL, &p, NULL);
                if (r <= 0)
                        return r;

                /* ... and the closest entry to that one in the entry list of the boot ID data object */
                r = generic_array_bisect_plus_one(f,
                                                  le64toh(o->data.entry_offset),
                                                  le64toh(o->data.entry_array_offset),
                                                  le64toh(o->data.n_entries),
                                                  p,
                                                  test_object_offset,
                                                  direction,
                                                  NULL, &q, NULL);

                if (r <= 0)
                        return r;

                /* Both lists agree on the same entry: that's our result */
                if (p == q) {
                        if (ret) {
                                r = journal_file_move_to_object(f, OBJECT_ENTRY, q, ret);
                                if (r < 0)
                                        return r;
                        }

                        if (ret_offset)
                                *ret_offset = q;

                        return 1;
                }

                /* Otherwise, retry from the position the second list brought us to */
                z = q;
        }
}
3109
/* Bisects the entry list of the data object 'd' for the entry with sequence number 'seqnum', respectively
 * the closest one in the given direction. The result of generic_array_bisect_plus_one() is passed through
 * unchanged. */
int journal_file_move_to_entry_by_seqnum_for_data(
                JournalFile *f,
                Object *d,
                uint64_t seqnum,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        uint64_t extra, chain, n_entries;

        assert(f);
        assert(d);
        assert(d->object.type == OBJECT_DATA);

        extra = le64toh(d->data.entry_offset);
        chain = le64toh(d->data.entry_array_offset);
        n_entries = le64toh(d->data.n_entries);

        return generic_array_bisect_plus_one(f, extra, chain, n_entries, seqnum, test_object_seqnum, direction, ret, ret_offset, NULL);
}
3131
/* Bisects the entry list of the data object 'd' for the entry with realtime timestamp 'realtime',
 * respectively the closest one in the given direction. The result of generic_array_bisect_plus_one() is
 * passed through unchanged. */
int journal_file_move_to_entry_by_realtime_for_data(
                JournalFile *f,
                Object *d,
                uint64_t realtime,
                direction_t direction,
                Object **ret, uint64_t *ret_offset) {

        uint64_t extra, chain, n_entries;

        assert(f);
        assert(d);
        assert(d->object.type == OBJECT_DATA);

        extra = le64toh(d->data.entry_offset);
        chain = le64toh(d->data.entry_array_offset);
        n_entries = le64toh(d->data.n_entries);

        return generic_array_bisect_plus_one(f, extra, chain, n_entries, realtime, test_object_realtime, direction, ret, ret_offset, NULL);
}
3153
/* Prints the file header, followed by a short summary of every object in the file, to stdout. The objects
 * are visited in file order, starting right after the header and ending with the object the header
 * records as tail object. If an object cannot be mapped, "File corrupt" is logged and the dump stops. */
void journal_file_dump(JournalFile *f) {
        Object *o;
        int r;
        uint64_t p;

        assert(f);
        assert(f->header);

        journal_file_print_header(f);

        /* The first object is located right after the header */
        p = le64toh(READ_NOW(f->header->header_size));
        while (p != 0) {
                const char *s;
                Compression c;

                r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
                if (r < 0)
                        goto fail;

                s = journal_object_type_to_string(o->object.type);

                switch (o->object.type) {

                case OBJECT_ENTRY:
                        assert(s);

                        printf("Type: %s seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
                               s,
                               le64toh(o->entry.seqnum),
                               le64toh(o->entry.monotonic),
                               le64toh(o->entry.realtime));
                        break;

                case OBJECT_TAG:
                        assert(s);

                        printf("Type: %s seqnum=%"PRIu64" epoch=%"PRIu64"\n",
                               s,
                               le64toh(o->tag.seqnum),
                               le64toh(o->tag.epoch));
                        break;

                default:
                        if (s)
                                printf("Type: %s \n", s);
                        else
                                /* Terminate the line like all other branches do, so that the output
                                 * that follows does not end up glued to this one. */
                                printf("Type: unknown (%i)\n", o->object.type);

                        break;
                }

                c = COMPRESSION_FROM_OBJECT(o);
                if (c > COMPRESSION_NONE)
                        printf("Flags: %s\n",
                               compression_to_string(c));

                /* Stop after the tail object, otherwise advance by the 64-bit aligned object size */
                if (p == le64toh(f->header->tail_object_offset))
                        p = 0;
                else
                        p += ALIGN64(le64toh(o->object.size));
        }

        return;
fail:
        log_error("File corrupt");
}
3220
/* Formats the timestamp t with FORMAT_TIMESTAMP(), substituting the placeholder " --- " when that evaluates
 * to NULL (via the GNU "?:" operator, which yields its right-hand side if the left-hand side is zero).
 * Note: the lifetime of the compound literal is the immediately surrounding block. */
#define FORMAT_TIMESTAMP_SAFE(t) (FORMAT_TIMESTAMP(t) ?: " --- ")
3223
/* Prints a human-readable summary of the journal file header to stdout: IDs, state, feature flags, sizes,
 * head/tail sequence numbers and timestamps, and object counts. Statistics that only exist in newer header
 * versions are printed only if the header is large enough to contain them (JOURNAL_HEADER_CONTAINS()). The
 * disk usage line is printed only if fstat() on the file succeeds. */
void journal_file_print_header(JournalFile *f) {
        struct stat st;

        assert(f);
        assert(f->header);

        printf("File path: %s\n"
               "File ID: %s\n"
               "Machine ID: %s\n"
               "Boot ID: %s\n"
               "Sequential number ID: %s\n"
               "State: %s\n"
               "Compatible flags:%s%s\n"
               "Incompatible flags:%s%s%s%s%s\n"
               "Header size: %"PRIu64"\n"
               "Arena size: %"PRIu64"\n"
               "Data hash table size: %"PRIu64"\n"
               "Field hash table size: %"PRIu64"\n"
               "Rotate suggested: %s\n"
               "Head sequential number: %"PRIu64" (%"PRIx64")\n"
               "Tail sequential number: %"PRIu64" (%"PRIx64")\n"
               "Head realtime timestamp: %s (%"PRIx64")\n"
               "Tail realtime timestamp: %s (%"PRIx64")\n"
               "Tail monotonic timestamp: %s (%"PRIx64")\n"
               "Objects: %"PRIu64"\n"
               "Entry objects: %"PRIu64"\n",
               f->path,
               SD_ID128_TO_STRING(f->header->file_id),
               SD_ID128_TO_STRING(f->header->machine_id),
               SD_ID128_TO_STRING(f->header->boot_id),
               SD_ID128_TO_STRING(f->header->seqnum_id),
               f->header->state == STATE_OFFLINE ? "OFFLINE" :
               f->header->state == STATE_ONLINE ? "ONLINE" :
               f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
               JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
               /* " ???" flags the presence of flag bits this code does not know about */
               (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
               JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
               JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
               JOURNAL_HEADER_COMPRESSED_ZSTD(f->header) ? " COMPRESSED-ZSTD" : "",
               JOURNAL_HEADER_KEYED_HASH(f->header) ? " KEYED-HASH" : "",
               (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
               le64toh(f->header->header_size),
               le64toh(f->header->arena_size),
               /* Hash table sizes are stored in bytes, show them as number of buckets */
               le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
               le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
               yes_no(journal_file_rotate_suggested(f, 0, LOG_DEBUG)),
               le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
               le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
               FORMAT_TIMESTAMP_SAFE(le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
               FORMAT_TIMESTAMP_SAFE(le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
               FORMAT_TIMESPAN(le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
               le64toh(f->header->n_objects),
               le64toh(f->header->n_entries));

        if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
                printf("Data objects: %"PRIu64"\n"
                       "Data hash table fill: %.1f%%\n",
                       le64toh(f->header->n_data),
                       100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));

        if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
                printf("Field objects: %"PRIu64"\n"
                       "Field hash table fill: %.1f%%\n",
                       le64toh(f->header->n_fields),
                       100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));

        if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
                printf("Tag objects: %"PRIu64"\n",
                       le64toh(f->header->n_tags));
        if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
                printf("Entry array objects: %"PRIu64"\n",
                       le64toh(f->header->n_entry_arrays));

        /* Like all other header fields the chain depths are stored little-endian, hence convert them
         * before printing, or the values come out byte-swapped on big-endian machines. */
        if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth))
                printf("Deepest field hash chain: %" PRIu64"\n",
                       le64toh(f->header->field_hash_chain_depth));

        if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth))
                printf("Deepest data hash chain: %" PRIu64"\n",
                       le64toh(f->header->data_hash_chain_depth));

        /* st_blocks is counted in units of 512 bytes */
        if (fstat(f->fd, &st) >= 0)
                printf("Disk usage: %s\n", FORMAT_BYTES((uint64_t) st.st_blocks * 512ULL));
}
3308
/* Logs a notice when a journal file sits on btrfs and still has copy-on-write enabled.
 *
 * Returns 0 if there is nothing to warn about (the file is not on btrfs, or FS_NOCOW_FL is set on
 * it), 1 if the notice was logged, and the result of log_warning_errno() if either the file system
 * type or the file attributes could not be queried. */
static int journal_file_warn_btrfs(JournalFile *f) {
        unsigned file_attrs;
        int on_btrfs, k;

        assert(f);

        /* Our write pattern is rather hostile to COW file systems such as btrfs, hence check
         * before anything is written whether COW is switched off for this file. Running without
         * COW should help performance considerably; the data integrity features we give up that
         * way shouldn't hurt much, since we do our own checksumming. */

        on_btrfs = fd_is_fs_type(f->fd, BTRFS_SUPER_MAGIC);
        if (on_btrfs < 0)
                return log_warning_errno(on_btrfs, "Failed to determine if journal is on btrfs: %m");
        if (on_btrfs == 0)
                return 0;

        k = read_attr_fd(f->fd, &file_attrs);
        if (k < 0)
                return log_warning_errno(k, "Failed to read file attributes: %m");

        if ((file_attrs & FS_NOCOW_FL) == 0) {
                log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
                           "This is likely to slow down journal access substantially, please consider turning "
                           "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
                return 1;
        }

        log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
        return 0;
}
3343
/* Completes a JournalMetrics structure in place: every field still set to UINT64_MAX ("pick
 * automatically") gets a value derived from the size of the file system 'fd' lives on (or a fixed
 * fallback if that size cannot be determined), and explicitly configured fields are page-aligned
 * and forced into a range that is consistent with the other fields. */
static void journal_default_metrics(JournalMetrics *m, int fd) {
        struct statvfs vfs;
        uint64_t total_bytes = 0;

        assert(m);
        assert(fd >= 0);

        if (fstatvfs(fd, &vfs) < 0)
                log_debug_errno(errno, "Failed to determine disk size: %m");
        else
                total_bytes = vfs.f_frsize * vfs.f_blocks;

        /* Upper limit of the total disk space used */
        if (m->max_use != UINT64_MAX) {
                m->max_use = PAGE_ALIGN(m->max_use);

                /* A configured, non-zero limit must leave room for at least two minimal files */
                if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
                        m->max_use = JOURNAL_FILE_SIZE_MIN*2;
        } else if (total_bytes > 0)
                m->max_use = CLAMP(PAGE_ALIGN(total_bytes / 10), /* 10% of file system size */
                                   MAX_USE_LOWER, MAX_USE_UPPER);
        else
                m->max_use = MAX_USE_LOWER;

        /* Amount we use even if keep_free suggests otherwise */
        if (m->min_use == UINT64_MAX) {
                if (total_bytes > 0)
                        m->min_use = CLAMP(PAGE_ALIGN(total_bytes / 50), /* 2% of file system size */
                                           MIN_USE_LOW, MIN_USE_HIGH);
                else
                        m->min_use = MIN_USE_LOW;
        }

        if (m->min_use > m->max_use)
                m->min_use = m->max_use;

        /* Upper limit of the size of an individual file */
        if (m->max_size != UINT64_MAX)
                m->max_size = PAGE_ALIGN(m->max_size);
        else
                m->max_size = MIN(PAGE_ALIGN(m->max_use / 8), /* 8 chunks */
                                  MAX_SIZE_UPPER);

        if (m->max_size != 0) {
                if (m->max_size < JOURNAL_FILE_SIZE_MIN)
                        m->max_size = JOURNAL_FILE_SIZE_MIN;

                /* A non-zero total limit is raised so that it covers two files of maximum size */
                if (m->max_use != 0 && m->max_size*2 > m->max_use)
                        m->max_use = m->max_size*2;
        }

        /* Lower limit of the size of an individual file */
        if (m->min_size != UINT64_MAX)
                m->min_size = CLAMP(PAGE_ALIGN(m->min_size),
                                    JOURNAL_FILE_SIZE_MIN,
                                    m->max_size ?: UINT64_MAX);
        else
                m->min_size = JOURNAL_FILE_SIZE_MIN;

        /* Space to leave free on the file system */
        if (m->keep_free == UINT64_MAX) {
                if (total_bytes > 0)
                        m->keep_free = MIN(PAGE_ALIGN(total_bytes / 20), /* 5% of file system size */
                                           KEEP_FREE_UPPER);
                else
                        m->keep_free = DEFAULT_KEEP_FREE;
        }

        if (m->n_max_files == UINT64_MAX)
                m->n_max_files = DEFAULT_N_MAX_FILES;

        log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
                  FORMAT_BYTES(m->min_use),
                  FORMAT_BYTES(m->max_use),
                  FORMAT_BYTES(m->max_size),
                  FORMAT_BYTES(m->min_size),
                  FORMAT_BYTES(m->keep_free),
                  m->n_max_files);
}
3421
/* Opens an existing journal file, or creates a new one, and returns it in *ret.
 *
 * fd                       - already opened file descriptor, or negative if the file shall be opened
 *                            via 'fname'. A passed-in fd is only taken possession of on success.
 * fname                    - path of the file; may be NULL if 'fd' is valid. If O_CREAT is requested
 *                            the name must end in ".journal".
 * open_flags               - open(2) flags; the access mode must be O_RDONLY or O_RDWR, and O_CREAT
 *                            is refused together with O_RDONLY.
 * file_flags               - handed to journal_file_init_header() when the file is newly created.
 * mode                     - access mode used if the file is opened by name.
 * compress_threshold_bytes - UINT64_MAX selects DEFAULT_COMPRESS_THRESHOLD, any other value is
 *                            raised to at least MIN_COMPRESS_THRESHOLD.
 * metrics                  - optional; for writable files it is completed in place via
 *                            journal_default_metrics() and then copied into the file object.
 * mmap_cache               - the mmap cache the file's fd is registered with (mandatory).
 * template                 - optional file to inherit from: it is handed to
 *                            journal_file_init_header(), supplies the metrics if 'metrics' is NULL,
 *                            and its post-change timer settings are applied to the new object.
 *
 * Returns 0 on success and a negative errno-style value on failure, among them -EINVAL for
 * unsupported flag/name combinations, -ENOMEM, -ENODATA if the file is smaller than
 * HEADER_SIZE_MIN, -EAFNOSUPPORT if mapping the header fails with -EINVAL, and -EIO if a SIGBUS was
 * recorded for the file's mapping. On failure a file that was newly created by name is unlinked
 * again. */
int journal_file_open(
                int fd,
                const char *fname,
                int open_flags,
                JournalFileFlags file_flags,
                mode_t mode,
                uint64_t compress_threshold_bytes,
                JournalMetrics *metrics,
                MMapCache *mmap_cache,
                JournalFile *template,
                JournalFile **ret) {

        bool newly_created = false;
        JournalFile *f;
        void *h;
        int r;

        assert(ret);
        assert(fd >= 0 || fname);
        assert(mmap_cache);

        if (!IN_SET((open_flags & O_ACCMODE), O_RDONLY, O_RDWR))
                return -EINVAL;

        if ((open_flags & O_ACCMODE) == O_RDONLY && FLAGS_SET(open_flags, O_CREAT))
                return -EINVAL;

        if (fname && (open_flags & O_CREAT) && !endswith(fname, ".journal"))
                return -EINVAL;

        f = new(JournalFile, 1);
        if (!f)
                return -ENOMEM;

        *f = (JournalFile) {
                .fd = fd,
                .mode = mode,
                .open_flags = open_flags,
                .compress_threshold_bytes = compress_threshold_bytes == UINT64_MAX ?
                                            DEFAULT_COMPRESS_THRESHOLD :
                                            MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes),
        };

        if (fname) {
                f->path = strdup(fname);
                if (!f->path) {
                        r = -ENOMEM;
                        goto fail;
                }
        } else {
                assert(fd >= 0);

                /* If we don't know the path, fill in something explanatory and vaguely useful. We
                 * use the /proc/self/fd/<n> form, since that is the actual procfs path of the
                 * descriptor, and it is the prefix journal_file_archive() looks for in order to
                 * recognize files that were passed in as fd only. */
                if (asprintf(&f->path, "/proc/self/fd/%i", fd) < 0) {
                        r = -ENOMEM;
                        goto fail;
                }
        }

        f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
        if (!f->chain_cache) {
                r = -ENOMEM;
                goto fail;
        }

        if (f->fd < 0) {
                /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
                 * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
                 * it doesn't hurt in that case. */

                f->fd = openat_report_new(AT_FDCWD, f->path, f->open_flags|O_CLOEXEC|O_NONBLOCK, f->mode, &newly_created);
                if (f->fd < 0) {
                        r = f->fd;
                        goto fail;
                }

                /* fds we opened here by us should also be closed by us. */
                f->close_fd = true;

                r = fd_nonblock(f->fd, false);
                if (r < 0)
                        goto fail;

                if (!newly_created) {
                        r = journal_file_fstat(f);
                        if (r < 0)
                                goto fail;
                }
        } else {
                r = journal_file_fstat(f);
                if (r < 0)
                        goto fail;

                /* If we just got the fd passed in, we don't really know if we created the file anew */
                newly_created = f->last_stat.st_size == 0 && journal_file_writable(f);
        }

        f->cache_fd = mmap_cache_add_fd(mmap_cache, f->fd, prot_from_flags(open_flags));
        if (!f->cache_fd) {
                r = -ENOMEM;
                goto fail;
        }

        if (newly_created) {
                (void) journal_file_warn_btrfs(f);

                /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
                 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
                 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
                 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
                 * solely on mtime/atime/ctime of the file. */
                (void) fd_setcrtime(f->fd, 0);

                r = journal_file_init_header(f, file_flags, template);
                if (r < 0)
                        goto fail;

                /* Refresh the cached stat data, the size check below relies on it */
                r = journal_file_fstat(f);
                if (r < 0)
                        goto fail;
        }

        if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
                r = -ENODATA;
                goto fail;
        }

        r = mmap_cache_fd_get(f->cache_fd, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
        if (r == -EINVAL) {
                /* Some file systems (jffs2 or p9fs) don't support mmap() properly (or only read-only
                 * mmap()), and return EINVAL in that case. Let's propagate that as a more recognizable error
                 * code. */
                r = -EAFNOSUPPORT;
                goto fail;
        }
        if (r < 0)
                goto fail;

        f->header = h;

        if (!newly_created) {
                r = journal_file_verify_header(f);
                if (r < 0)
                        goto fail;
        }

#if HAVE_GCRYPT
        if (!newly_created && journal_file_writable(f) && JOURNAL_HEADER_SEALED(f->header)) {
                r = journal_file_fss_load(f);
                if (r < 0)
                        goto fail;
        }
#endif

        if (journal_file_writable(f)) {
                if (metrics) {
                        journal_default_metrics(metrics, f->fd);
                        f->metrics = *metrics;
                } else if (template)
                        f->metrics = template->metrics;

                r = journal_file_refresh_header(f);
                if (r < 0)
                        goto fail;
        }

#if HAVE_GCRYPT
        r = journal_file_hmac_setup(f);
        if (r < 0)
                goto fail;
#endif

        if (newly_created) {
                r = journal_file_setup_field_hash_table(f);
                if (r < 0)
                        goto fail;

                r = journal_file_setup_data_hash_table(f);
                if (r < 0)
                        goto fail;

#if HAVE_GCRYPT
                r = journal_file_append_first_tag(f);
                if (r < 0)
                        goto fail;
#endif
        }

        if (mmap_cache_fd_got_sigbus(f->cache_fd)) {
                r = -EIO;
                goto fail;
        }

        if (template && template->post_change_timer) {
                r = journal_file_enable_post_change_timer(
                                f,
                                sd_event_source_get_event(template->post_change_timer),
                                template->post_change_timer_period);

                if (r < 0)
                        goto fail;
        }

        /* The file is opened now successfully, thus we take possession of any passed in fd. */
        f->close_fd = true;

        if (DEBUG_LOGGING) {
                /* Remember what we logged last, so that the effective settings are only logged
                 * again when they differ from the previously opened file. */
                static int last_seal = -1, last_compress = -1, last_keyed_hash = -1;
                static uint64_t last_bytes = UINT64_MAX;

                if (last_seal != JOURNAL_HEADER_SEALED(f->header) ||
                    last_keyed_hash != JOURNAL_HEADER_KEYED_HASH(f->header) ||
                    last_compress != JOURNAL_FILE_COMPRESS(f) ||
                    last_bytes != f->compress_threshold_bytes) {

                        log_debug("Journal effective settings seal=%s keyed_hash=%s compress=%s compress_threshold_bytes=%s",
                                  yes_no(JOURNAL_HEADER_SEALED(f->header)), yes_no(JOURNAL_HEADER_KEYED_HASH(f->header)),
                                  yes_no(JOURNAL_FILE_COMPRESS(f)), FORMAT_BYTES(f->compress_threshold_bytes));
                        last_seal = JOURNAL_HEADER_SEALED(f->header);
                        last_keyed_hash = JOURNAL_HEADER_KEYED_HASH(f->header);
                        last_compress = JOURNAL_FILE_COMPRESS(f);
                        last_bytes = f->compress_threshold_bytes;
                }
        }

        *ret = f;
        return 0;

fail:
        /* A recorded SIGBUS takes precedence over whatever error we were about to report */
        if (f->cache_fd && mmap_cache_fd_got_sigbus(f->cache_fd))
                r = -EIO;

        (void) journal_file_close(f);

        /* Only remove files we created by name ourselves; fname is non-NULL whenever fd < 0 */
        if (newly_created && fd < 0)
                (void) unlink(fname);

        return r;
}
3661
/* Renames a writable "*.journal" file to its archived name, i.e. the original name with
 * "@<seqnum id>-<head seqnum>-<head realtime>" inserted in front of the ".journal" suffix, and marks
 * the file object so that it gets archived when it is taken offline.
 *
 * If ret_previous_path is non-NULL it receives the old path string (which the caller then owns),
 * otherwise the old path is freed. Returns 0 on success, -EINVAL if the file is not writable, was
 * passed in as fd only, or is not named "*.journal", -ENOMEM, or the negated errno of rename(). */
int journal_file_archive(JournalFile *f, char **ret_previous_path) {
        _cleanup_free_ char *archived = NULL;
        int k;

        assert(f);

        /* Refuse in three cases: the file isn't writable; the file was handed to us as fd, in
         * which case the path is a synthesized one and we could not rename the file anyway since
         * the real path is unknown; or the name lacks the suffix we are about to cut off. */
        if (!journal_file_writable(f) ||
            path_startswith(f->path, "/proc/self/fd") ||
            !endswith(f->path, ".journal"))
                return -EINVAL;

        /* The precision argument drops the trailing ".journal" (8 characters) from the old name */
        k = asprintf(&archived, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
                     (int) strlen(f->path) - 8, f->path,
                     SD_ID128_FORMAT_VAL(f->header->seqnum_id),
                     le64toh(f->header->head_entry_seqnum),
                     le64toh(f->header->head_entry_realtime));
        if (k < 0)
                return -ENOMEM;

        /* Move the file to its archived name. ENOENT means the file was deleted already, which
         * is fine with us. */
        if (rename(f->path, archived) < 0) {
                if (errno != ENOENT)
                        return -errno;
        }

        /* Make sure the rename hits the disk */
        (void) fsync_directory_of_file(f->fd);

        if (!ret_previous_path)
                free(f->path);
        else
                *ret_previous_path = f->path;

        f->path = TAKE_PTR(archived);

        /* We do not flip the header state to STATE_ARCHIVED right here: journal_file_set_offline()
         * short-circuits whenever the state is not STATE_ONLINE, so the rotated file would never
         * get fsync()ed before being closed. Instead we only queue the archived state via this
         * bit and leave the header at STATE_ONLINE, so that regular offlining takes place and
         * commits STATE_ARCHIVED. */
        f->archive = true;

        return 0;
}
3709
journal_file_dispose(int dir_fd,const char * fname)3710 int journal_file_dispose(int dir_fd, const char *fname) {
3711 _cleanup_free_ char *p = NULL;
3712
3713 assert(fname);
3714
3715 /* Renames a journal file to *.journal~, i.e. to mark it as corrupted or otherwise uncleanly shutdown. Note that
3716 * this is done without looking into the file or changing any of its contents. The idea is that this is called
3717 * whenever something is suspicious and we want to move the file away and make clear that it is not accessed
3718 * for writing anymore. */
3719
3720 if (!endswith(fname, ".journal"))
3721 return -EINVAL;
3722
3723 if (asprintf(&p, "%.*s@%016" PRIx64 "-%016" PRIx64 ".journal~",
3724 (int) strlen(fname) - 8, fname,
3725 now(CLOCK_REALTIME),
3726 random_u64()) < 0)
3727 return -ENOMEM;
3728
3729 if (renameat(dir_fd, fname, dir_fd, p) < 0)
3730 return -errno;
3731
3732 return 0;
3733 }
3734
/* Copies the entry object 'o', located at offset 'p' in 'from', into the journal file 'to'.
 *
 * Every data object referenced by the entry is looked up in 'from', decompressed if necessary, and
 * appended to 'to'; finally a new entry carrying the original timestamps and boot ID and
 * referencing the appended data objects is appended to 'to'.
 *
 * Returns the result of journal_file_append_entry_internal() on completion, or a negative
 * errno-style value: -EPERM if 'to' is not writable, -EBADMSG for data objects with an impossible
 * or empty payload, -E2BIG if a payload size does not fit into size_t, -EPROTONOSUPPORT for
 * unsupported compression, -ENOMEM, -EIO if a SIGBUS was recorded for the mapping of 'to', or
 * whatever the called helpers return. */
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) {
        _cleanup_free_ EntryItem *items_alloc = NULL;
        uint64_t q, n, xor_hash = 0;
        sd_id128_t boot_id;
        dual_timestamp ts;
        EntryItem *items;
        int r;

        assert(from);
        assert(to);
        assert(o);
        assert(p);

        if (!journal_file_writable(to))
                return -EPERM;

        /* Copy everything we need from the entry by value right away: 'o' points into the mapping
         * of 'from' and is re-resolved over and over in the loop below, hence a pointer into the
         * entry taken here must not be relied upon later. */
        ts = (dual_timestamp) {
                .monotonic = le64toh(o->entry.monotonic),
                .realtime = le64toh(o->entry.realtime),
        };
        boot_id = o->entry.boot_id;

        n = journal_file_entry_n_items(o);

        /* The number of items is read from the file, hence only place the array on the stack if
         * it is comfortably small, and use the heap otherwise. */
        if (n < ALLOCA_MAX / sizeof(EntryItem) / 2)
                items = newa(EntryItem, n);
        else {
                items_alloc = new(EntryItem, n);
                if (!items_alloc)
                        return -ENOMEM;

                items = items_alloc;
        }

        for (uint64_t i = 0; i < n; i++) {
                Compression c;
                uint64_t l, h;
                size_t t;
                void *data;
                Object *u;

                q = le64toh(o->entry.items[i].object_offset);

                /* From here on 'o' refers to the data object, not to the entry anymore */
                r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
                if (r < 0)
                        return r;

                l = le64toh(READ_NOW(o->object.size));
                if (l < offsetof(Object, data.payload))
                        return -EBADMSG;

                l -= offsetof(Object, data.payload);
                t = (size_t) l;

                /* We hit the limit on 32bit machines */
                if ((uint64_t) t != l)
                        return -E2BIG;

                c = COMPRESSION_FROM_OBJECT(o);
                if (c < 0)
                        return -EPROTONOSUPPORT;
                if (c != COMPRESSION_NONE) {
#if HAVE_COMPRESSION
                        size_t rsize = 0;

                        r = decompress_blob(
                                        c,
                                        o->data.payload, l,
                                        &from->compress_buffer, &rsize,
                                        0);
                        if (r < 0)
                                return r;

                        data = from->compress_buffer;
                        l = rsize;
#else
                        return -EPROTONOSUPPORT;
#endif
                } else
                        data = o->data.payload;

                if (l == 0)
                        return -EBADMSG;

                r = journal_file_append_data(to, data, l, &u, &h);
                if (r < 0)
                        return r;

                /* With keyed hashes in the destination the hash stored in the data object is
                 * not used for the entry's XOR hash; the jenkins hash of the payload is used
                 * instead. */
                if (JOURNAL_HEADER_KEYED_HASH(to->header))
                        xor_hash ^= jenkins_hash64(data, l);
                else
                        xor_hash ^= le64toh(u->data.hash);

                items[i] = (EntryItem) {
                        .object_offset = htole64(h),
                        .hash = u->data.hash,
                };

                /* Go back to the entry object, so that the next item can be read from it */
                r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
                if (r < 0)
                        return r;
        }

        r = journal_file_append_entry_internal(to, &ts, &boot_id, xor_hash, items, n, NULL, NULL, NULL);

        if (mmap_cache_fd_got_sigbus(to->cache_fd))
                return -EIO;

        return r;
}
3835
/* Resets all limits in *m to UINT64_MAX, the marker for "pick automatic values". */
void journal_reset_metrics(JournalMetrics *m) {
        static const JournalMetrics automatic = {
                .min_use = UINT64_MAX,
                .max_use = UINT64_MAX,
                .min_size = UINT64_MAX,
                .max_size = UINT64_MAX,
                .keep_free = UINT64_MAX,
                .n_max_files = UINT64_MAX,
        };

        assert(m);

        *m = automatic;
}
3850
/* Reports the realtime timestamps of the first and/or last entry, as recorded in the file header.
 *
 * At least one of 'from' and 'to' must be non-NULL. Returns 1 if all requested values were stored,
 * and -ENOENT if a requested timestamp is recorded as zero in the header. Note that 'from' is
 * filled in before 'to' is checked, i.e. *from may have been written even if -ENOENT is returned. */
int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
        assert(f);
        assert(f->header);
        assert(from || to);

        if (from) {
                uint64_t head = le64toh(f->header->head_entry_realtime);

                if (head == 0)
                        return -ENOENT;

                *from = head;
        }

        if (to) {
                uint64_t tail = le64toh(f->header->tail_entry_realtime);

                if (tail == 0)
                        return -ENOENT;

                *to = tail;
        }

        return 1;
}
3872
/* Reports the monotonic timestamps of the first and/or last entry that reference the data object
 * found for the given boot ID.
 *
 * At least one of 'from' and 'to' must be non-NULL. Returns 1 if the requested values were stored,
 * 0 if no data object for the boot ID was found, it has no entries, or the last entry could not be
 * located, and a negative errno-style value on failure. */
int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
        uint64_t data_offset;
        Object *obj;
        int k;

        assert(f);
        assert(from || to);

        k = find_data_object_by_boot_id(f, boot_id, &obj, &data_offset);
        if (k <= 0)
                return k;

        if (le64toh(obj->data.n_entries) == 0)
                return 0;

        if (from) {
                /* The data object references its first entry directly */
                k = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(obj->data.entry_offset), &obj);
                if (k < 0)
                        return k;

                *from = le64toh(obj->entry.monotonic);
        }

        if (to) {
                /* 'obj' may refer to an entry by now, hence look up the data object again */
                k = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &obj);
                if (k < 0)
                        return k;

                /* Index n_entries - 1 selects the last entry referencing the data object */
                k = generic_array_get_plus_one(f,
                                               le64toh(obj->data.entry_offset),
                                               le64toh(obj->data.entry_array_offset),
                                               le64toh(obj->data.n_entries) - 1,
                                               DIRECTION_UP,
                                               &obj, NULL);
                if (k <= 0)
                        return k;

                *to = le64toh(obj->entry.monotonic);
        }

        return 1;
}
3915
/* Checks whether the journal file should be rotated, logging the reason at 'log_level'.
 *
 * Rotation is suggested if the header is smaller than the current Header structure, if the data or
 * field hash table is filled to more than 75%, if a hash chain is deeper than HASH_CHAIN_DEPTH_MAX,
 * if there are data objects but no field objects, or - provided max_file_usec is non-zero - if the
 * first entry is older than max_file_usec. Returns true if rotation is suggested. */
bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec, int log_level) {
        assert(f);
        assert(f->header);

        /* New header fields mean new features, hence suggest rotating files that lack them */
        if (le64toh(f->header->header_size) < sizeof(Header)) {
                log_full(log_level, "%s uses an outdated header, suggesting rotation.", f->path);
                return true;
        }

        /* Suggest a rotation once a hash table grew beyond a certain fill level (75%, a value
         * borrowed from Java's hash table implementation). The fill level is derived from the
         * n_data/n_fields header fields, which only exist in newer versions. */

        if (JOURNAL_HEADER_CONTAINS(f->header, n_data)) {
                uint64_t n_data = le64toh(f->header->n_data),
                        n_slots = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);

                if (n_data * 4ULL > n_slots * 3ULL) {
                        log_full(log_level,
                                 "Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
                                 f->path,
                                 100.0 * (double) n_data / ((double) n_slots),
                                 n_data,
                                 n_slots,
                                 (unsigned long long) f->last_stat.st_size,
                                 f->last_stat.st_size / n_data);
                        return true;
                }
        }

        if (JOURNAL_HEADER_CONTAINS(f->header, n_fields)) {
                uint64_t n_fields = le64toh(f->header->n_fields),
                        n_slots = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);

                if (n_fields * 4ULL > n_slots * 3ULL) {
                        log_full(log_level,
                                 "Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
                                 f->path,
                                 100.0 * (double) n_fields / ((double) n_slots),
                                 n_fields,
                                 n_slots);
                        return true;
                }
        }

        /* An excessive number of hash collisions most likely means somebody is playing games with
         * us, hence suggest rotation if the longest chain exceeds the threshold. */
        if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth)) {
                uint64_t depth = le64toh(f->header->data_hash_chain_depth);

                if (depth > HASH_CHAIN_DEPTH_MAX) {
                        log_full(log_level,
                                 "Data hash table of %s has deepest hash chain of length %" PRIu64 ", suggesting rotation.",
                                 f->path, depth);
                        return true;
                }
        }

        if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth)) {
                uint64_t depth = le64toh(f->header->field_hash_chain_depth);

                if (depth > HASH_CHAIN_DEPTH_MAX) {
                        log_full(log_level,
                                 "Field hash table of %s has deepest hash chain of length at %" PRIu64 ", suggesting rotation.",
                                 f->path, depth);
                        return true;
                }
        }

        /* Data objects without any field object means the data is not indexed by fields */
        if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
            JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
            le64toh(f->header->n_data) > 0 &&
            le64toh(f->header->n_fields) == 0) {
                log_full(log_level,
                         "Data objects of %s are not indexed by field objects, suggesting rotation.",
                         f->path);
                return true;
        }

        if (max_file_usec > 0) {
                usec_t oldest, current;

                oldest = le64toh(f->header->head_entry_realtime);
                current = now(CLOCK_REALTIME);

                /* A zero timestamp means that no first entry is recorded in the header */
                if (oldest > 0 && current > oldest + max_file_usec) {
                        log_full(log_level,
                                 "Oldest entry in %s is older than the configured file retention duration (%s), suggesting rotation.",
                                 f->path, FORMAT_TIMESPAN(max_file_usec, USEC_PER_SEC));
                        return true;
                }
        }

        return false;
}
4000
4001 static const char * const journal_object_type_table[] = {
4002 [OBJECT_UNUSED] = "unused",
4003 [OBJECT_DATA] = "data",
4004 [OBJECT_FIELD] = "field",
4005 [OBJECT_ENTRY] = "entry",
4006 [OBJECT_DATA_HASH_TABLE] = "data hash table",
4007 [OBJECT_FIELD_HASH_TABLE] = "field hash table",
4008 [OBJECT_ENTRY_ARRAY] = "entry array",
4009 [OBJECT_TAG] = "tag",
4010 };
4011
4012 DEFINE_STRING_TABLE_LOOKUP_TO_STRING(journal_object_type, ObjectType);
4013